update

Files changed:

- README.md +3 -148
- character_util.py +1 -1
- compression_app.py +2 -1
- utils/lang_util.py +3 -0
- utils/oov.md +0 -202
- vocab.py +1 -1
README.md
CHANGED
@@ -43,79 +43,6 @@ python utils/compress_rate_util.py
 | aya_101 | 250100 | 3.3 | 0.3 | 3.22 | 0.31 | 3.53 |
 | baichuan | 64000 | 3.74 | 0.27 | 3.65 | 0.27 | 4 |
 | baichuan2 | 125696 | 3.89 | 0.26 | 3.8 | 0.26 | 4.17 |
-| bert_base_cased | 28996 | 3.64 | 0.27 | 3.55 | 0.28 | 3.89 |
-| bert_base_chinese | 21128 | 2.78 | 0.36 | 2.71 | 0.37 | 2.97 |
-| bert_base_uncased | 30522 | 3.73 | 0.27 | 3.65 | 0.27 | 4 |
-| bloom | 250680 | 4.07 | 0.25 | 3.97 | 0.25 | 4.36 |
-| byt5_small | 256 | 0.92 | 1.08 | 0.9 | 1.11 | 0.99 |
-| character_glm_6b | 64794 | 3.62 | 0.28 | 3.54 | 0.28 | 3.88 |
-| chatglm2_6b | 64794 | 3.62 | 0.28 | 3.54 | 0.28 | 3.88 |
-| chatglm3_6b | 64798 | 3.62 | 0.28 | 3.54 | 0.28 | 3.88 |
-| chatglm_6b | 150344 | 3.68 | 0.27 | 3.59 | 0.28 | 3.94 |
-| chatyuan_large_v2 | 32128 | 1.95 | 0.51 | 1.91 | 0.52 | 2.09 |
-| chinese_llama | 49953 | 3.59 | 0.28 | 3.51 | 0.28 | 3.85 |
-| chinese_llama2 | 55296 | 3.56 | 0.28 | 3.47 | 0.29 | 3.81 |
-| code_davinci_002 | 50281 | 4.05 | 0.25 | 3.96 | 0.25 | 4.34 |
-| crystal_coder | 32000 | 3.68 | 0.27 | 3.59 | 0.28 | 3.94 |
-| dbrx_instruct | 100277 | 4.11 | 0.24 | 4.01 | 0.25 | 4.4 |
-| deepseek_coder_33b_instruct | 32000 | 3.64 | 0.27 | 3.56 | 0.28 | 3.9 |
-| deepseek_llm_7b_base | 100000 | 3.85 | 0.26 | 3.76 | 0.27 | 4.12 |
-| falcon_180b | 65024 | 3.99 | 0.25 | 3.9 | 0.26 | 4.27 |
-| falcon_7b | 65024 | 3.99 | 0.25 | 3.9 | 0.26 | 4.27 |
-| fastchat_t5_3b | 32000 | 2.16 | 0.46 | 2.11 | 0.47 | 2.31 |
-| flan_t5_base | 32100 | 3.61 | 0.28 | 3.53 | 0.28 | 3.87 |
-| gemma_7b | 256000 | 3.91 | 0.26 | 3.82 | 0.26 | 4.18 |
-| gpt2 | 50257 | 4.05 | 0.25 | 3.96 | 0.25 | 4.34 |
-| gpt2_chinese | 21128 | 2.67 | 0.37 | 2.61 | 0.38 | 2.86 |
-| gpt_35_turbo | 100277 | 4.11 | 0.24 | 4.01 | 0.25 | 4.4 |
-| gpt_4 | 100277 | 4.11 | 0.24 | 4.01 | 0.25 | 4.4 |
-| gpt_nexo_20b | 50254 | 4.04 | 0.25 | 3.94 | 0.25 | 4.32 |
-| grok_1 | 131072 | 4.06 | 0.25 | 3.96 | 0.25 | 4.35 |
-| internlm2_chat_7b | 92544 | 3.86 | 0.26 | 3.77 | 0.27 | 4.13 |
-| internlm2_math_7b | 92544 | 3.86 | 0.26 | 3.77 | 0.27 | 4.13 |
-| internlm_chat_7b | 103168 | 3.86 | 0.26 | 3.77 | 0.27 | 4.13 |
-| internlm_xcomposer_7b | 103168 | 3.86 | 0.26 | 3.77 | 0.27 | 4.13 |
-| jamba_v0_1 | 65536 | 3.82 | 0.26 | 3.73 | 0.27 | 4.09 |
-| kplug | 10261 | 2.66 | 0.38 | 2.6 | 0.38 | 2.85 |
-| llama | 32000 | 3.56 | 0.28 | 3.47 | 0.29 | 3.81 |
-| llama2 | 32000 | 3.56 | 0.28 | 3.47 | 0.29 | 3.81 |
-| llama3 | 128000 | 4.11 | 0.24 | 4.01 | 0.25 | 4.4 |
-| mistral_7b | 32000 | 3.67 | 0.27 | 3.58 | 0.28 | 3.92 |
-| mixtral_8_7b | 32000 | 3.67 | 0.27 | 3.58 | 0.28 | 3.92 |
-| mobilebert_uncased | 30522 | 3.73 | 0.27 | 3.65 | 0.27 | 4 |
-| moss | 106029 | 4.08 | 0.25 | 3.98 | 0.25 | 4.36 |
-| mt5_large | 250100 | 3.3 | 0.3 | 3.22 | 0.31 | 3.53 |
-| olmo_7b | 50280 | 4.04 | 0.25 | 3.94 | 0.25 | 4.32 |
-| orion_14b_chat | 84608 | 3.94 | 0.25 | 3.85 | 0.26 | 4.22 |
-| phi_1 | 50257 | 4.05 | 0.25 | 3.96 | 0.25 | 4.34 |
-| phi_2 | 50257 | 4.05 | 0.25 | 3.96 | 0.25 | 4.34 |
-| pko_t5_large | 50258 | 1.59 | 0.63 | 1.55 | 0.64 | 1.7 |
-| prompt_clue | 32128 | 1.95 | 0.51 | 1.91 | 0.52 | 2.09 |
-| qwen1_5_14b_chat | 151643 | 4.06 | 0.25 | 3.97 | 0.25 | 4.35 |
-| qwen_1_8b_chat | 151851 | 4.06 | 0.25 | 3.97 | 0.25 | 4.35 |
-| qwen_72b_chat | 151851 | 4.06 | 0.25 | 3.97 | 0.25 | 4.35 |
-| qwen_7b_chat | 151851 | 4.06 | 0.25 | 3.97 | 0.25 | 4.35 |
-| roberta_chinese_clue | 8021 | 1.8 | 0.56 | 1.75 | 0.57 | 1.92 |
-| skywork_13b_base | 65519 | 3.56 | 0.28 | 3.47 | 0.29 | 3.81 |
-| skywork_13b_math | 65519 | 3.56 | 0.28 | 3.47 | 0.29 | 3.81 |
-| solar_10_7b | 32000 | 3.67 | 0.27 | 3.58 | 0.28 | 3.92 |
-| starchat_alpha | 49152 | 3.63 | 0.28 | 3.54 | 0.28 | 3.88 |
-| switch_c_2048 | 32100 | 3.61 | 0.28 | 3.53 | 0.28 | 3.87 |
-| t5_base | 32100 | 3.61 | 0.28 | 3.53 | 0.28 | 3.87 |
-| t5_large | 32100 | 3.61 | 0.28 | 3.53 | 0.28 | 3.87 |
-| t5_small | 32100 | 3.61 | 0.28 | 3.53 | 0.28 | 3.87 |
-| text_davinci_003 | 50281 | 4.05 | 0.25 | 3.96 | 0.25 | 4.34 |
-| tigerbot_13b_chat_v2 | 60512 | 3.67 | 0.27 | 3.58 | 0.28 | 3.93 |
-| tigerbot_70b_chat_v4_4k | 65107 | 3.65 | 0.27 | 3.57 | 0.28 | 3.91 |
-| wizardcoder_15b_v1 | 49152 | 3.63 | 0.28 | 3.54 | 0.28 | 3.88 |
-| wizardcoder_python_7b_v1 | 32000 | 3.56 | 0.28 | 3.47 | 0.29 | 3.81 |
-| wizardlm_7b_v1 | 32000 | 3.56 | 0.28 | 3.47 | 0.29 | 3.81 |
-| wizardmath_70b_v1 | 32000 | 3.56 | 0.28 | 3.47 | 0.29 | 3.81 |
-| xlm_roberta | 250002 | 3.49 | 0.29 | 3.41 | 0.29 | 3.74 |
-| yi_34b | 64000 | 3.87 | 0.26 | 3.78 | 0.26 | 4.15 |
-| yi_6b | 64000 | 3.87 | 0.26 | 3.78 | 0.26 | 4.15 |
-| yi_vl34b | 64000 | 3.88 | 0.26 | 3.79 | 0.26 | 4.16 |
-| zephyr_7b_beta | 32000 | 3.67 | 0.27 | 3.58 | 0.28 | 3.92 |
 
 </details>
 
@@ -128,80 +55,6 @@ python utils/compress_rate_util.py
 | amber | 32000 | 1.84 | 0.54 | 1.8 | 0.56 | 0.7 |
 | aya_101 | 250100 | 3.89 | 0.26 | 3.79 | 0.26 | 1.47 |
 | baichuan | 64000 | 3.92 | 0.26 | 3.82 | 0.26 | 1.48 |
-| baichuan2 | 125696 | 4.53 | 0.22 | 4.42 | 0.23 | 1.71 |
-| bert_base_cased | 28996 | 2.73 | 0.37 | 2.66 | 0.38 | 1.03 |
-| bert_base_chinese | 21128 | 2.74 | 0.37 | 2.67 | 0.37 | 1.03 |
-| bert_base_uncased | 30522 | 2.73 | 0.37 | 2.67 | 0.38 | 1.03 |
-| bloom | 250680 | 4.28 | 0.23 | 4.18 | 0.24 | 1.62 |
-| byt5_small | 256 | 0.93 | 1.08 | 0.91 | 1.1 | 0.35 |
-| character_glm_6b | 64794 | 4.2 | 0.24 | 4.1 | 0.24 | 1.59 |
-| chatglm2_6b | 64794 | 4.2 | 0.24 | 4.1 | 0.24 | 1.59 |
-| chatglm3_6b | 64798 | 4.2 | 0.24 | 4.1 | 0.24 | 1.59 |
-| chatglm_6b | 150344 | 4.65 | 0.22 | 4.54 | 0.22 | 1.76 |
-| chatyuan_large_v2 | 32128 | 4.34 | 0.23 | 4.24 | 0.24 | 1.64 |
-| chinese_llama | 49953 | 3.93 | 0.25 | 3.84 | 0.26 | 1.49 |
-| chinese_llama2 | 55296 | 3.92 | 0.26 | 3.83 | 0.26 | 1.48 |
-| code_davinci_002 | 50281 | 1.31 | 0.77 | 1.28 | 0.78 | 0.49 |
-| crystal_coder | 32000 | 1.86 | 0.54 | 1.81 | 0.55 | 0.7 |
-| dbrx_instruct | 100277 | 2.26 | 0.44 | 2.21 | 0.45 | 0.85 |
-| deepseek_coder_33b_instruct | 32000 | 3.4 | 0.29 | 3.32 | 0.3 | 1.29 |
-| deepseek_llm_7b_base | 100000 | 4.05 | 0.25 | 3.96 | 0.25 | 1.53 |
-| falcon_180b | 65024 | 2.18 | 0.46 | 2.13 | 0.47 | 0.82 |
-| falcon_7b | 65024 | 2.18 | 0.46 | 2.13 | 0.47 | 0.82 |
-| fastchat_t5_3b | 32000 | 13.7 | 0.07 | 13.38 | 0.07 | 5.18 |
-| flan_t5_base | 32100 | 14.13 | 0.07 | 13.8 | 0.07 | 5.34 |
-| gemma_7b | 256000 | 3.82 | 0.26 | 3.73 | 0.27 | 1.44 |
-| gpt2 | 50257 | 1.31 | 0.77 | 1.28 | 0.78 | 0.49 |
-| gpt2_chinese | 21128 | 2.73 | 0.37 | 2.66 | 0.38 | 1.03 |
-| gpt_35_turbo | 100277 | 2.26 | 0.44 | 2.21 | 0.45 | 0.85 |
-| gpt_4 | 100277 | 2.26 | 0.44 | 2.21 | 0.45 | 0.85 |
-| gpt_nexo_20b | 50254 | 2.01 | 0.5 | 1.96 | 0.51 | 0.76 |
-| grok_1 | 131072 | 1.73 | 0.58 | 1.69 | 0.59 | 0.66 |
-| internlm2_chat_7b | 92544 | 4.23 | 0.24 | 4.13 | 0.24 | 1.6 |
-| internlm2_math_7b | 92544 | 4.23 | 0.24 | 4.13 | 0.24 | 1.6 |
-| internlm_chat_7b | 103168 | 4.23 | 0.24 | 4.14 | 0.24 | 1.6 |
-| internlm_xcomposer_7b | 103168 | 4.23 | 0.24 | 4.14 | 0.24 | 1.6 |
-| jamba_v0_1 | 65536 | 2.3 | 0.44 | 2.24 | 0.45 | 0.87 |
-| kplug | 10261 | 2.72 | 0.37 | 2.65 | 0.38 | 1.03 |
-| llama | 32000 | 1.84 | 0.54 | 1.8 | 0.56 | 0.7 |
-| llama2 | 32000 | 1.84 | 0.54 | 1.8 | 0.56 | 0.7 |
-| llama3 | 128000 | 3.28 | 0.3 | 3.2 | 0.31 | 1.24 |
-| mistral_7b | 32000 | 2.36 | 0.42 | 2.3 | 0.43 | 0.89 |
-| mixtral_8_7b | 32000 | 2.36 | 0.42 | 2.3 | 0.43 | 0.89 |
-| mobilebert_uncased | 30522 | 2.73 | 0.37 | 2.67 | 0.38 | 1.03 |
-| moss | 106029 | 4.4 | 0.23 | 4.3 | 0.23 | 1.66 |
-| mt5_large | 250100 | 3.89 | 0.26 | 3.79 | 0.26 | 1.47 |
-| olmo_7b | 50280 | 2.01 | 0.5 | 1.96 | 0.51 | 0.76 |
-| orion_14b_chat | 84608 | 4.63 | 0.22 | 4.52 | 0.22 | 1.75 |
-| phi_1 | 50257 | 1.31 | 0.77 | 1.28 | 0.78 | 0.49 |
-| phi_2 | 50257 | 1.31 | 0.77 | 1.28 | 0.78 | 0.49 |
-| pko_t5_large | 50258 | 0.97 | 1.03 | 0.95 | 1.06 | 0.37 |
-| prompt_clue | 32128 | 4.34 | 0.23 | 4.24 | 0.24 | 1.64 |
-| qwen1_5_14b_chat | 151643 | 4.16 | 0.24 | 4.06 | 0.25 | 1.57 |
-| qwen_1_8b_chat | 151851 | 4.16 | 0.24 | 4.06 | 0.25 | 1.57 |
-| qwen_72b_chat | 151851 | 4.16 | 0.24 | 4.06 | 0.25 | 1.57 |
-| qwen_7b_chat | 151851 | 4.16 | 0.24 | 4.06 | 0.25 | 1.57 |
-| roberta_chinese_clue | 8021 | 2.7 | 0.37 | 2.64 | 0.38 | 1.02 |
-| skywork_13b_base | 65519 | 3.69 | 0.27 | 3.61 | 0.28 | 1.4 |
-| skywork_13b_math | 65519 | 3.69 | 0.27 | 3.61 | 0.28 | 1.4 |
-| solar_10_7b | 32000 | 2.36 | 0.42 | 2.3 | 0.43 | 0.89 |
-| starchat_alpha | 49152 | 2.78 | 0.36 | 2.72 | 0.37 | 1.05 |
-| switch_c_2048 | 32100 | 14.13 | 0.07 | 13.8 | 0.07 | 5.34 |
-| t5_base | 32100 | 14.13 | 0.07 | 13.8 | 0.07 | 5.34 |
-| t5_large | 32100 | 14.13 | 0.07 | 13.8 | 0.07 | 5.34 |
-| t5_small | 32100 | 14.13 | 0.07 | 13.8 | 0.07 | 5.34 |
-| text_davinci_003 | 50281 | 1.31 | 0.77 | 1.28 | 0.78 | 0.49 |
-| tigerbot_13b_chat_v2 | 60512 | 4.25 | 0.24 | 4.15 | 0.24 | 1.61 |
-| tigerbot_70b_chat_v4_4k | 65107 | 4.25 | 0.24 | 4.15 | 0.24 | 1.61 |
-| wizardcoder_15b_v1 | 49152 | 2.78 | 0.36 | 2.72 | 0.37 | 1.05 |
-| wizardcoder_python_7b_v1 | 32000 | 1.84 | 0.54 | 1.8 | 0.56 | 0.7 |
-| wizardlm_7b_v1 | 32000 | 1.84 | 0.54 | 1.8 | 0.56 | 0.7 |
-| wizardmath_70b_v1 | 32000 | 1.84 | 0.54 | 1.8 | 0.56 | 0.7 |
-| xlm_roberta | 250002 | 3.96 | 0.25 | 3.86 | 0.26 | 1.5 |
-| yi_34b | 64000 | 4.17 | 0.24 | 4.07 | 0.25 | 1.58 |
-| yi_6b | 64000 | 4.17 | 0.24 | 4.07 | 0.25 | 1.58 |
-| yi_vl34b | 64000 | 4.11 | 0.24 | 4.02 | 0.25 | 1.56 |
-| zephyr_7b_beta | 32000 | 2.36 | 0.42 | 2.3 | 0.43 | 0.89 |
 
 </details>
 
@@ -212,12 +65,14 @@ python utils/compress_rate_util.py
 
 - Getting the most out of your tokenizer for pre-training and domain adaptation
 - Efficient and Effective Text Encoding for Chinese LLaMA and Alpaca
 - blog
 - https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them
 - https://huggingface.co/docs/transformers/tokenizer_summary#sentencepiece
 - https://www.huaxiaozhuan.com/%E5%B7%A5%E5%85%B7/huggingface_transformer/chapters/1_tokenizer.html
 - https://zhuanlan.zhihu.com/p/652520262
 - https://github.com/QwenLM/Qwen/blob/main/tokenization_note_zh.md
+- https://tonybaloney.github.io/posts/cjk-chinese-japanese-korean-llm-ai-best-practices.html
+-
 - demo
 - https://huggingface.co/spaces/Xenova/the-tokenizer-playground
 - https://github.com/dqbd/tiktokenizer
character_util.py
CHANGED
@@ -83,7 +83,7 @@ def iter_vocab(
         with open(cache_path, "r", encoding="utf-8") as f_tmp:
             cache.update(json.load(f_tmp))
     if from_cache and tokenizer_name in cache:
-        logger.info(f"load {tokenizer_config.name_or_path} from cache")
+        # logger.info(f"load {tokenizer_config.name_or_path} from cache")
         return cache[tokenizer_name]
 
     tokenizer = tokenizer_factory.get_tokenizer(tokenizer_name)
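The hunk above only silences a log line inside a JSON-file cache lookup. For context, a minimal sketch of that caching pattern; the wrapper name `load_or_compute` and the `compute_fn` parameter are hypothetical, only the fragment shown in the diff is from the source:

```python
import json
import os

def load_or_compute(tokenizer_name: str, cache_path: str, compute_fn, from_cache: bool = True):
    """Return a cached per-tokenizer result, recomputing and persisting it on a miss."""
    cache = {}
    if os.path.exists(cache_path):
        with open(cache_path, "r", encoding="utf-8") as f_tmp:
            cache.update(json.load(f_tmp))
    if from_cache and tokenizer_name in cache:
        # Cache hit; the commit comments out the info log that used to fire here.
        return cache[tokenizer_name]
    result = compute_fn(tokenizer_name)
    cache[tokenizer_name] = result
    with open(cache_path, "w", encoding="utf-8") as f_out:
        json.dump(cache, f_out, ensure_ascii=False, indent=2)
    return result
```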
compression_app.py
CHANGED
@@ -75,13 +75,14 @@ with gr.Blocks() as demo:
     )
 
     gr.Markdown(
+        # "Note:\n\n"
         "- `corpus`: tokenization is performed on the selected subsets of [cc100](https://huggingface.co/datasets/cc100) corpus.\n"
         "- `b_tokens/g_bytes` measures how many billion tokens per gigabytes corpus.\n"
         "- `t_tokens/t_bytes` measures how many trillion tokens per terabytes corpus.\n"
         # "- `g_bytes/b_tokens` measures how many gigabytes corpus per billion tokens.\n"
         # "- `t_bytes/t_tokens` measures how many terabytes corpus per trillion tokens.\n"
         "- `char/token` measures how many chars per token on the tokenized corpus.\n"
-        "- `oov_ratio`: out-of-vocabulary ratio on the selected corpus. 👉 get [oov charset](https://huggingface.co/spaces/eson/tokenizer-arena/
+        "- `oov_ratio`: out-of-vocabulary ratio on the selected corpus. 👉 get [oov charset](https://huggingface.co/spaces/eson/tokenizer-arena/raw/main/stats/compression_rate.json)\n\n"
         "You can reproduce this procedure with [compression_util.py](https://huggingface.co/spaces/eson/tokenizer-arena/blob/main/compression_util.py)."
     )
 
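The Markdown strings above define the app's metrics. As a rough illustration of those definitions (not the actual compression_util.py implementation; the helper below is hypothetical, and the app's exact oov_ratio definition may differ), the ratios reduce to simple token, byte, and character counts:

```python
from transformers import AutoTokenizer

def compression_stats(tokenizer, texts):
    """Illustrates the metric definitions listed in the Markdown above."""
    n_tokens = n_bytes = n_chars = n_unk = 0
    unk_id = tokenizer.unk_token_id
    for text in texts:
        ids = tokenizer.encode(text, add_special_tokens=False)
        n_tokens += len(ids)
        n_bytes += len(text.encode("utf-8"))
        n_chars += len(text)
        if unk_id is not None:
            n_unk += sum(1 for i in ids if i == unk_id)
    return {
        # billion tokens per gigabyte reduces to plain tokens/bytes
        "b_tokens/g_bytes": n_tokens / n_bytes,
        "char/token": n_chars / n_tokens,
        "oov_ratio": n_unk / n_tokens if unk_id is not None else 0.0,
    }

tok = AutoTokenizer.from_pretrained("gpt2")
print(compression_stats(tok, ["Hello world", "你好,世界"]))
```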
utils/lang_util.py
CHANGED
@@ -10,6 +10,9 @@
 However, it should be stressed that the accuracy of this approach is limited by how comprehensive and distinctive the selected language features are.
 For example, detection of English is limited to the basic letters A-Z, which may cause it to overlap with other languages that use the same alphabet.
 Moreover, some languages (such as French and Spanish) may share certain accent marks, which can cause a string to be wrongly identified as multiple languages.
+
+## Common languages
+English | 简体中文 | 繁體中文 | 한국어 | Español | 日本語 | हिन्दी | Русский | Português | తెలుగు | Français | Deutsch | Tiếng Việt |
 """
 
 import re
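The docstring above describes detection from per-language character features. A minimal sketch of that idea, assuming illustrative (deliberately incomplete) character ranges rather than the module's real tables:

```python
import re

# Illustrative feature ranges; real tables would need to be far more complete.
LANG_PATTERNS = {
    "zh": re.compile(r"[\u4e00-\u9fff]"),  # CJK Unified Ideographs
    "ko": re.compile(r"[\uac00-\ud7af]"),  # Hangul syllables
    "ja": re.compile(r"[\u3040-\u30ff]"),  # Hiragana and Katakana
    "ru": re.compile(r"[\u0400-\u04ff]"),  # Cyrillic
    "en": re.compile(r"[A-Za-z]"),         # basic A-Z only, hence the overlap caveat
}

def detect_languages(text: str) -> list[str]:
    """Return every language whose feature characters occur in the text."""
    return [lang for lang, pattern in LANG_PATTERNS.items() if pattern.search(text)]

print(detect_languages("Hello 世界"))  # ['zh', 'en']
```

As the docstring warns, a single string can match several languages at once (here both zh and en), so the result is a candidate set rather than a single label.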
utils/oov.md
DELETED
@@ -1,202 +0,0 @@
-
-```sh
-###################################
-ClueAI/ChatYuan-large-v2, <class 'tokenizers.models.Unigram'>
-reversible: false; unk_token: <unk>, 2, unk_ratio: 0.2000; oov: []
-text[7] = "Замглавы управления развития; 특히 주소 15~17번 홀에선 3연속; 確実に春が近づいてること; a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
-decoding[7] = "<unk>амглав<unk> у<unk>равления развития; <unk> <unk> 15~17<unk> <unk> 3<unk>; 確実に春が近づいてること; a közoktatással? _ Belföld; pum<unk>, i vjet<unk>r, vjeç; <unk>ا<unk> <unk> <unk>ا<unk> ; <unk> <unk> <unk> <unk> <unk> <unk>; <unk> <unk> ; <unk>зейн<unk>я асо<unk>:; <unk> <unk> <unk> <unk>; <unk>;<unk>"
-
-
-###################################
-ClueAI/PromptCLUE-base, <class 'tokenizers.models.Unigram'>
-reversible: false; unk_token: <unk>, 2, unk_ratio: 0.2000; oov: []
-text[7] = "Замглавы управления развития; 특히 주소 15~17번 홀에선 3연속; 確実に春が近づいてること; a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
-decoding[7] = "<unk>амглав<unk> у<unk>равления развития; <unk> <unk> 15~17<unk> <unk> 3<unk>; 確実に春が近づいてること; a közoktatással? _ Belföld; pum<unk>, i vjet<unk>r, vjeç; <unk>ا<unk> <unk> <unk>ا<unk> ; <unk> <unk> <unk> <unk> <unk> <unk>; <unk> <unk> ; <unk>зейн<unk>я асо<unk>:; <unk> <unk> <unk> <unk>; <unk>;<unk>"
-###################################
-CohereForAI/aya-101, <class 'tokenizers.models.Unigram'>
-reversible: false; unk_token: <unk>, 2, unk_ratio: 0.0079; oov: []
-text[73] = " a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
-decoding[73] = "a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; <unk>❤❥웃유♋☮✊;װיקיװערטערבוך"
-###################################
-FacebookAI/xlm-roberta-base, <class 'tokenizers.models.Unigram'>
-reversible: false; unk_token: <unk>, 3, unk_ratio: 0.0096; oov: []
-text[73] = " a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
-decoding[73] = "a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; <unk>❤❥웃유♋☮✊;װיקיװערטערבוך"
-###################################
-OrionStarAI/Orion-14B-Chat, sp_model, byte_num: 0
-reversible: false; unk_token: <unk>, 0, unk_ratio: 0.0495; oov: []
-text[71] = "; a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
-decoding[71] = "; a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئ<unk> ⁇ ردوغان <unk> ⁇ قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለ<unk> ⁇ ጭ የግድግ<unk> ⁇ ; Дзейныя асобы:; « <unk> ⁇ <unk> ⁇ <unk> ⁇ ; \t\n <unk> ⁇ ❤❥웃유♋☮✊; <unk> ⁇ יקי<unk> ⁇ ערטערבוך "
-###################################
-THUDM/chatglm-6b, byte_num: 256
-reversible: false; unk_token: <unk>, 0, unk_ratio: 0.0000; oov: []
-text[237] = "\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
-decoding[237] = " 🦙❤❥웃유♋☮✊;װיקיװערטערבוך"
-###################################
-abeja/gpt-neox-japanese-2.7b, japanese-bpe: https://github.com/tanreinama/Japanese-BPEEncoder_V2
-reversible: false; unk_token: <|endoftext|>, 31999, unk_ratio: 0.0000; oov: []
-text[7] = "Замглавы управления развития; 특히 주소 15~17번 홀에선 3연속; 確実に春が近づいてること; a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
-decoding[7] = "���������������� �������������������� ����������������; ������ ������ 15~17��� ��������� 3������; 確実に春が近づいてること; a k��zoktat��ssal? _ Belf��ld; pum��, i vjet��r, vje��; ���������������� ���� ���������������������� ; ��������������� ��������� ������ ��������� ������ ������������������������; ��������������� ��������������� ; �������������� ����������:; ǀ ��������������������������� ��������������� ���������������; \t\n\n🐯❤‖������🟥🟥🤚;��������������������������"
-
-
-###################################
-baichuan-inc/Baichuan-7B, sp_model, byte_num: 256
-reversible: false; unk_token: <unk>, 0, unk_ratio: 0.0000; oov: []
-text[237] = "\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
-decoding[237] = " 🦙❤❥웃유♋☮✊;װיקיװערטערבוך "
-###################################
-ckiplab/gpt2-base-chinese, <class 'tokenizers.models.WordPiece'>
-reversible: false; unk_token: [UNK], 100, unk_ratio: 0.1185; oov: []
-text[5] = "; Замглавы управления развития; 특히 주소 15~17번 홀에선 3연속; 確実に春が近づいてること; a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
-decoding[5] = " ; замглавы управления развития ; 특히 주소 15 ~ 17번 홀에선 3연속 ; 確 実 に 春 か 近 ついてること ; a kozoktatassal? _ belfold ; pume, i vjeter, vjec ; [UNK] [UNK] [UNK] ; [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] ; [UNK] [UNK] ; дзеиныя асобы : ; « [UNK] [UNK] [UNK] ; [UNK] ; [UNK]"
-
-
-###################################
-cl-tohoku/bert-base-japanese, wordpiece.MecabTokenizer, byte-level supported via https://github.com/polm/fugashi
-reversible: false; unk_token: [UNK], 1, unk_ratio: 0.3951; oov: []
-text[5] = "; Замглавы управления развития; 특히 주소 15~17번 홀에선 3연속; 確実に春が近づいてること; a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
-decoding[5] = " ; [UNK] [UNK] [UNK] ; [UNK] [UNK] 15 ~ 17 [UNK] [UNK] 3 [UNK] ; 確実 に 春 が 近づい てる こと ; a közoktatással? _ Belföld ; [UNK], i [UNK], vjeç ; [UNK] [UNK] [UNK] ; [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] ; [UNK] [UNK] ; [UNK] [UNK] :; [UNK] [UNK] [UNK] [UNK] ; [UNK] [UNK] [UNK] [UNK] [UNK]"
-
-
-###################################
-clue/roberta_chinese_clue_tiny, <class 'tokenizers.models.WordPiece'>
-reversible: false; unk_token: [UNK], 100, unk_ratio: 0.3580; oov: []
-text[5] = "; Замглавы управления развития; 특히 주소 15~17번 홀에선 3연속; 確実に春が近づいてること; a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
-decoding[5] = " ; [UNK] [UNK] [UNK] ; [UNK] [UNK] 15 ~ [UNK] [UNK] [UNK] ; [UNK] 実 [UNK] 春 [UNK] 近 [UNK] ; a kozoktatassal? _ belfold ; pume, i vjeter, vjec ; [UNK] [UNK] [UNK] ; [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] ; [UNK] [UNK] ; [UNK] асобы : ; « [UNK] [UNK] [UNK] ; [UNK] ; [UNK]"
-
-
-###################################
-dbmdz/bert-base-german-uncased, <class 'tokenizers.models.WordPiece'>
-reversible: false; unk_token: [UNK], 101, unk_ratio: 0.4459; oov: []
-text[5] = "; Замглавы управления развития; 특히 주소 15~17번 홀에선 3연속; 確実に春が近づいてること; a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
-decoding[5] = " ; [UNK] [UNK] [UNK] ; [UNK] [UNK] 15 ~ [UNK] [UNK] [UNK] ; [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] ; a kozoktatassal? _ belfold ; pume, i vjeter, vjec ; [UNK] [UNK] [UNK] ; [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] ; [UNK] [UNK] ; [UNK] [UNK] : ; « [UNK] [UNK] [UNK] ; [UNK] ; [UNK]"
-###################################
-deepseek-ai/deepseek-coder-33b-instruct, <class 'tokenizers.models.BPE'>
-reversible: false; unk_token: None, None, unk_ratio: 0.0000; oov: []
-text[77] = "özoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
-decoding[77] = "�zoktatással? _ Belf�ld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך "
-Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
-[2024-05-12 00:30:36] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer eson/kplug-base-encoder
-###################################
-deepseek-ai/deepseek-llm-7b-base, <class 'tokenizers.models.BPE'>
-reversible: false; unk_token: None, None, unk_ratio: 0.0000; oov: []
-text[77] = "özoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
-decoding[77] = "�zoktatással? _ Belf�ld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך "
-[2024-05-12 00:30:56] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer fnlp/moss-moon-003-sft
-###################################
-eson/kplug-base-encoder, <class 'tokenizers.models.WordPiece'>
-reversible: false; unk_token: [UNK], 100, unk_ratio: 0.3625; oov: []
-text[5] = "; Замглавы управления развития; 특히 주소 15~17번 홀에선 3연속; 確実に春が近づいてること; a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
-decoding[5] = " ; [UNK] [UNK] [UNK] ; [UNK] [UNK] 15 ~ [UNK] [UNK] [UNK] ; [UNK] 実 [UNK] 春 [UNK] 近 [UNK] ; a kozoktatassal? _ belfold ; pume, i vjeter, vjec ; [UNK] [UNK] [UNK] ; [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] ; [UNK] [UNK] ; [UNK] асобы : ; « [UNK] [UNK] [UNK] ; [UNK] ; [UNK]"
-Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
-[2024-05-12 00:31:36] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer google-bert/bert-base-cased
-###################################
-fnlp/moss-moon-003-sft, presumably sentencepiece.byte_bpe, to be confirmed
-reversible: false; unk_token: <|endoftext|>, 106028, unk_ratio: 0.0000; oov: []
-text[74] = "a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
-decoding[74] = " a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך "
-###################################
-google-bert/bert-base-cased, <class 'tokenizers.models.WordPiece'>
-reversible: false; unk_token: [UNK], 100, unk_ratio: 0.1732; oov: []
-text[5] = "; Замглавы управления развития; 특히 주소 15~17번 홀에선 3연속; 確実に春が近づいてること; a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
-decoding[5] = " ; Замглавы управления развития ; [UNK] [UNK] 15 ~ [UNK] [UNK] [UNK] ; [UNK] [UNK] に [UNK] [UNK] [UNK] [UNK] ; a közoktatással? _ Belföld ; pumë, i vjetër, vjeç ; [UNK] [UNK] قىرغىزىستان ; निम्न में से [UNK] सा [UNK] ; [UNK] [UNK] ; Дзейныя асобы : ; « [UNK] [UNK] [UNK] ; [UNK] ; [UNK]"
-[2024-05-12 00:31:56] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer google-bert/bert-base-chinese
-[2024-05-12 00:32:16] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer google-bert/bert-base-german-cased
-###################################
-google-bert/bert-base-chinese, <class 'tokenizers.models.WordPiece'>
-reversible: false; unk_token: [UNK], 100, unk_ratio: 0.3704; oov: []
-text[5] = "; Замглавы управления развития; 특히 주소 15~17번 홀에선 3연속; 確実に春が近づいてること; a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
-decoding[5] = " ; [UNK] управления развития ; [UNK] [UNK] 15 ~ [UNK] [UNK] [UNK] ; 確 実 に 春 [UNK] 近 [UNK] ; a [UNK]? _ [UNK] ; [UNK], i [UNK], [UNK] ; [UNK] [UNK] [UNK] ; [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] ; [UNK] [UNK] ; [UNK] асобы : ; « [UNK] [UNK] [UNK] ; [UNK] ; [UNK]"
-###################################
-google-bert/bert-base-german-cased, <class 'tokenizers.models.WordPiece'>
-reversible: false; unk_token: [UNK], 2, unk_ratio: 0.5938; oov: []
-text[5] = "; Замглавы управления развития; 특히 주소 15~17번 홀에선 3연속; 確実に春が近づいてること; a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
-decoding[5] = " ; [UNK] [UNK] [UNK] ; [UNK] [UNK] 15 ~ [UNK] [UNK] [UNK] ; [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] ; a [UNK]? _ Belföld ; [UNK], i [UNK], [UNK] ; [UNK] [UNK] [UNK] ; [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] ; [UNK] [UNK] ; [UNK] [UNK] : ; [UNK] [UNK] [UNK] [UNK] ; [UNK] ; [UNK]"
-[2024-05-12 00:32:36] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer google-bert/bert-base-multilingual-cased
-[2024-05-12 00:32:57] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer google-bert/bert-base-multilingual-uncased
-###################################
-google-bert/bert-base-multilingual-cased, <class 'tokenizers.models.WordPiece'>
-reversible: false; unk_token: [UNK], 100, unk_ratio: 0.0531; oov: []
-text[5] = "; Замглавы управления развития; 특히 주소 15~17번 홀에선 3연속; 確実に春が近づいてること; a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
-decoding[5] = " ; Замглавы управления развития ; 특히 주소 15 ~ 17번 홀에선 3연속 ; 確 実 に 春 が 近 づいてること ; a közoktatással? _ Belföld ; pumë, i vjetër, vjeç ; [UNK] [UNK] قىرغىزىستان ; निम्न में से कौन सा हारडवेयर ; [UNK] [UNK] ; Дзейныя асобы : ; « અમરેલીનાં મહિલા વિકાસ ; [UNK] ; [UNK]"
-[2024-05-12 00:33:17] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer google-bert/bert-base-uncased
-###################################
-google-bert/bert-base-multilingual-uncased, <class 'tokenizers.models.WordPiece'>
-reversible: false; unk_token: [UNK], 100, unk_ratio: 0.0360; oov: []
-text[5] = "; Замглавы управления развития; 특히 주소 15~17번 홀에선 3연속; 確実に春が近づいてること; a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
-decoding[5] = " ; замглавы управления развития ; 특히 주소 15 ~ 17번 홀에선 3연속 ; 確 実 に 春 か 近 ついてること ; a kozoktatassal? _ belfold ; pume, i vjeter, vjec ; يەردوغان ۋە قىرغىزىستان ; निमन म स कौन सा हारडवयर ; [UNK] [UNK] ; дзеиныя асобы : ; « અમરલીના મહિલા વિકાસ ; [UNK] ; [UNK]"
-[2024-05-12 00:33:37] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer google-t5/t5-large
-###################################
-google-bert/bert-base-uncased, <class 'tokenizers.models.WordPiece'>
-reversible: false; unk_token: [UNK], 100, unk_ratio: 0.0867; oov: []
-text[5] = "; Замглавы управления развития; 특히 주소 15~17번 홀에선 3연속; 確実に春が近づいてること; a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
-decoding[5] = " ; замглавы управления развития ; 특히 주소 15 ~ 17번 홀에선 3연속 ; [UNK] [UNK] に 春 か [UNK] ついてること ; a kozoktatassal? _ belfold ; pume, i vjeter, vjec ; [UNK] [UNK] قىرغىزىستان ; निमन म स [UNK] सा हारडवयर ; [UNK] [UNK] ; дзеиныя асобы : ; « [UNK] [UNK] [UNK] ; [UNK] ; [UNK]"
-###################################
-google-t5/t5-large, <class 'tokenizers.models.Unigram'>
-reversible: false; unk_token: <unk>, 2, unk_ratio: 0.2769; oov: []
-text[7] = "Замглавы управления развития; 특히 주소 15~17번 홀에선 3연속; 確実に春が近づいてること; a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
-decoding[7] = "<unk>ам<unk>лав<unk> у<unk>равлени<unk> ра<unk>вити<unk>; <unk> <unk> 15<unk>17<unk> <unk> 3<unk>; <unk>; a közoktatással? _ Belföld; pum<unk>, i vjet<unk>r, vjeç; <unk> <unk> <unk> ; <unk> <unk> <unk> <unk> <unk> <unk>; <unk> <unk> ; <unk>е<unk>н<unk> асо<unk>:; « <unk> <unk> <unk>; <unk>;<unk>"
-[2024-05-12 00:34:57] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer google/byt5-small
-[2024-05-12 00:35:18] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer google/gemma-7b
-[2024-05-12 00:35:39] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer google/mobilebert-uncased
-[2024-05-12 00:36:59] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer google/mt5-large
-###################################
-google/mobilebert-uncased, <class 'tokenizers.models.WordPiece'>
-reversible: false; unk_token: [UNK], 100, unk_ratio: 0.0867; oov: []
-text[5] = "; Замглавы управления развития; 특히 주소 15~17번 홀에선 3연속; 確実に春が近づいてること; a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
-decoding[5] = " ; замглавы управления развития ; 특히 주소 15 ~ 17번 홀에선 3연속 ; [UNK] [UNK] に 春 か [UNK] ついてること ; a kozoktatassal? _ belfold ; pume, i vjeter, vjec ; [UNK] [UNK] قىرغىزىستان ; निमन म स [UNK] सा हारडवयर ; [UNK] [UNK] ; дзеиныя асобы : ; « [UNK] [UNK] [UNK] ; [UNK] ; [UNK]"
-C:\Users\xusong28\Miniconda3\envs\py3.10-torch1.13-hf.latest\lib\site-packages\transformers\convert_slow_tokenizer.py:560: UserWarning: The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option which is not implemented in the fast tokenizers. In practice this means that the fast version of the tokenizer can produce unknown tokens whereas the sentencepiece version would have converted these unknown tokens into a sequence of byte tokens matching the original piece of text.
-warnings.warn(
-[2024-05-12 00:37:23] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer google/switch-c-2048
-###################################
-google/mt5-large, <class 'tokenizers.models.Unigram'>
-reversible: false; unk_token: <unk>, 2, unk_ratio: 0.0079; oov: []
-text[73] = " a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
-decoding[73] = "a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; <unk>❤❥웃유♋☮✊;װיקיװערטערבוך"
-[2024-05-12 00:37:43] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer hfl/chinese-alpaca-lora-7b
-###################################
-google/switch-c-2048, <class 'tokenizers.models.Unigram'>
-reversible: false; unk_token: <unk>, 2, unk_ratio: 0.2769; oov: []
-text[7] = "Замглавы управления развития; 특히 주소 15~17번 홀에선 3연속; 確実に春が近づいてること; a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
-decoding[7] = "<unk>ам<unk>лав<unk> у<unk>равлени<unk> ра<unk>вити<unk>; <unk> <unk> 15<unk>17<unk> <unk> 3<unk>; <unk>; a közoktatással? _ Belföld; pum<unk>, i vjet<unk>r, vjeç; <unk> <unk> <unk> ; <unk> <unk> <unk> <unk> <unk> <unk>; <unk> <unk> ; <unk>е<unk>н<unk> асо<unk>:; « <unk> <unk> <unk>; <unk>;<unk>"
-You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
-[2024-05-12 00:38:04] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer hfl/chinese-llama-2-7b
-[2024-05-12 00:38:25] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer hfl/chinese-llama-lora-7b
-[2024-05-12 00:38:46] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer hfl/llama-3-chinese-8b
-Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
-[2024-05-12 00:39:07] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer hpcai-tech/grok-1
-[2024-05-12 00:39:28] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer internlm/internlm-chat-7b
-[2024-05-12 00:40:09] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer internlm/internlm-xcomposer-7b
-[2024-05-12 00:40:31] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer internlm/internlm2-chat-7b
-[2024-05-12 00:41:13] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer internlm/internlm2-math-7b
-[2024-05-12 00:41:35] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer lmsys/fastchat-t5-3b-v1.0
-Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
-###################################
-[2024-05-12 00:41:55] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer meta-llama/Llama-2-7b-chat
-lmsys/fastchat-t5-3b-v1.0, sp_model, byte_num: 0
-reversible: false; unk_token: <unk>, 2, unk_ratio: 0.2105; oov: []
-text[7] = "Замглавы управления развития; 특히 주소 15~17번 홀에선 3연속; 確実に春が近づいてること; a közoktatással? _ Belföld; pumë, i vjetër, vjeç; ئەردوغان ۋە قىرغىزىستان ; निम्न में से कौन सा हारडवेयर; ተለዋዋጭ የግድግዳ ; Дзейныя асобы:; « અમરેલીનાં મહિલા વિકાસ; \t\n\r🦙❤❥웃유♋☮✊;װיקיװערטערבוך ",
-decoding[7] = " <unk> ам<unk> лав<unk> у<unk> равлени<unk> ра<unk> вити<unk>; <unk> <unk> 15<unk> 17<unk> <unk> 3<unk>; <unk>; a közoktatással? _ Belföld; pum<unk>, i vjet<unk>r, vjeç; <unk> <unk> <unk> ; <unk> <unk> <unk> <unk> <unk> <unk>; <unk> <unk> ; <unk> е<unk> н<unk> асо<unk>:; « <unk> <unk> <unk>; \t \n <unk> ;<unk> "
-[2024-05-12 00:41:55] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer meta-llama/Meta-Llama-3-8B
-[2024-05-12 00:41:55] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer microsoft/Phi-3-mini-4k-instruct
-Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
-[2024-05-12 00:42:16] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer microsoft/phi-1
-[2024-05-12 00:42:36] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer microsoft/phi-2
-Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
-[2024-05-12 00:42:56] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer mistralai/Mistral-7B-v0.1
-[2024-05-12 00:43:16] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer mistralai/Mixtral-8x7B-v0.1
-[2024-05-12 00:43:37] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer openai-community/gpt2
-[2024-05-12 00:43:57] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer openai/code-davinci-002
-[2024-05-12 00:43:57] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer openai/gpt-3.5-turbo
-[2024-05-12 00:43:57] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer openai/gpt-4
-[2024-05-12 00:43:57] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer openai/text-davinci-003
-[2024-05-12 00:43:57] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer paust/pko-t5-large
-Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
-[2024-05-12 00:44:18] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer thu-coai/CharacterGLM-6B
-[2024-05-12 00:44:58] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer tiiuae/falcon-180b
-[2024-05-12 00:45:19] [INFO] [34044:7360] [__init__.py:343:load_tokenizer] loading tokenizer tiiuae/falcon-7b
-
-Process finished with exit code 0
-
-
-```
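The deleted log was produced by round-tripping a multilingual probe string through each tokenizer and recording reversibility and the share of unknown tokens. A hedged sketch of such a check (names here are illustrative, not the repo's exact implementation):

```python
def oov_check(tokenizer, text: str) -> dict:
    """Encode then decode, reporting reversibility and the unk-token ratio."""
    ids = tokenizer.encode(text, add_special_tokens=False)
    decoding = tokenizer.decode(ids)
    unk_id = tokenizer.unk_token_id
    n_unk = sum(1 for i in ids if i == unk_id) if unk_id is not None else 0
    return {
        "reversible": decoding == text,
        "unk_ratio": n_unk / len(ids) if ids else 0.0,
    }
```

A `reversible: false` entry with `unk_ratio: 0.0000` (for example THUDM/chatglm-6b above) indicates the tokenizer loses only whitespace or normalization details rather than mapping characters to `<unk>`.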
vocab.py
CHANGED
@@ -206,7 +206,7 @@ _all_tokenizer_config = [
                      meta="去掉了繁体字, https://github.com/CLUEbenchmark/CLUEPretrainedModels/blob/master/README.md"),
     TokenizerConfig("eson/kplug-base-encoder", name_display="eson/kplug", impl=TokenizerImpl.BertTokenizer, org="JD"),
     TokenizerConfig("ckiplab/gpt2-base-chinese", impl=TokenizerImpl.BertTokenizer, org="SINICA"),  # Academia Sinica, Taiwan
-    # WoBERT
+    # WoBERT https://kexue.fm/archives/7758
     # WoBERT Plus https://github.com/ZhuiyiTechnology/WoBERT
 
 