update
Files changed:
- README.md +15 -0
- app.py +9 -1
- compression_app.py +11 -11
- compression_util.py +1 -1
- requirements.txt +12 -12
- stats/character_stats.json +21 -0
- stats/compression_rate.json +48 -0
- utils/lang_util.py +2 -2
- vocab.py +1 -1
README.md CHANGED
@@ -9,9 +9,24 @@ app_file: app.py
 pinned: false
 datasets:
 - cc100
+tags:
+- tokenizer
+short_description: Compare different tokenizers in char-level and byte-level.
 ---
 
 
 
 
 Please visit our GitHub repo for more information: https://github.com/xu-song/tokenizer-arena
+
+
+## Run gradio demo
+
+```
+python app.py
+```
+
+
+
+## ss
+
app.py CHANGED
@@ -12,13 +12,21 @@ if auth_token:
     login(token=auth_token)
 
 
-title = '<div align="center">Tokenizer Arena ⚔️</div>'
+# title = '<div align="center">Tokenizer Arena ⚔️</div>'
+title = """
+<div align="center">
+<span style="background-color: rgb(254, 226, 226);">Token</span><span style="background-color: rgb(220, 252, 231);">ization</span>
+<span style="background-color: rgb(219, 234, 254);"> Arena</span>
+<span style="background-color: rgb(254, 249, 195);"> ⚔️</span>
+</div>
+"""
 interface_list = [playground_tab, compression_tab, character_tab]
 tab_names = [" ⚔️ Playground", "🏆 Compression Leaderboard", "📊 Character Statistics"]
 
 # interface_list = [compression_tab, character_tab]
 # tab_names = ["🏆 Compression Leaderboard", "📊 Character Statistics"]
 
+
 with gr.Blocks(css="css/style.css", js="js/onload.js") as demo:
     gr.HTML(
         f"<h1 style='text-align: center; margin-bottom: 1rem'>{title}</h1>"
compression_app.py CHANGED
@@ -35,21 +35,21 @@ The encoding and decoding process can be formulated as
 decoded_text = tokenizer.decode(token_ids)  # reconstructed text
 ```
 
-
+**Lossless**<br>
 Lossless tokenization preserves the exact original text, i.e. `decoded_text = input_text`. There are mainly two causes of compression loss.
 
-
-
-
-
-
-
-
-
+1. `OOV`: Most lossy tokenizers produce many out-of-vocabulary (OOV) words. 👉 Check the OOV and
+tokenization loss of [bert](https://huggingface.co/spaces/eson/tokenizer-arena/blob/main/stats/compression_rate/google-bert.bert-base-cased%20%40%20cc100.zh-Hans.diff.json) and
+[t5](https://huggingface.co/spaces/eson/tokenizer-arena/blob/main/stats/compression_rate/google-t5.t5-large%20%40%20cc100.es.diff.json).
+2. `Normalization`: Even if a tokenizer has no OOV, it can still be lossy due to text normalization. For example, qwen performs [unicode normalization](https://github.com/huggingface/transformers/blob/v4.42.3/src/transformers/models/qwen2/tokenization_qwen2.py#L338) in the encoding process and
+llama performs [clean_up_tokenization_spaces](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B/blob/main/tokenizer_config.json#L2053) in the decoding process,
+which may introduce slight differences into the reconstructed text. 👉 Check the tokenization loss of
+[qwen](https://huggingface.co/spaces/eson/tokenizer-arena/raw/main/stats/compression_rate/Qwen.Qwen1.5-1.8B%20@%20cc100.ja.diff.json) and
+[llama](https://huggingface.co/spaces/eson/tokenizer-arena/raw/main/stats/compression_rate/meta-llama.Meta-Llama-3.1-405B%20@%20cc100.en.diff.json).
 
-
 
-
+
+**Compression Rate**<br>
 There are mainly two types of metric to represent the `input_text`:
 - `char-level`: the number of characters in the given text
 - `byte-level`: the number of bytes in the given text.
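To make the loss check and the two metrics concrete, here is a minimal sketch of the idea described in the added text. It is not the space's own implementation; `gpt2` is only a stand-in tokenizer, and the leaderboard's exact rate definitions may differ.

```python
# Minimal sketch: losslessness check plus char-level and byte-level compression rates.
# Assumes only the `transformers` library; "gpt2" is a stand-in for any tokenizer on the Hub.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")

input_text = "Hello, world! 你好,世界!"
token_ids = tokenizer.encode(input_text, add_special_tokens=False)
decoded_text = tokenizer.decode(token_ids)  # reconstructed text

lossless = decoded_text == input_text  # False if OOV or normalization changed the text
chars_per_token = len(input_text) / len(token_ids)                  # char-level view
bytes_per_token = len(input_text.encode("utf-8")) / len(token_ids)  # byte-level view

print(f"lossless={lossless}  chars/token={chars_per_token:.2f}  bytes/token={bytes_per_token:.2f}")
```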
compression_util.py CHANGED
@@ -318,4 +318,4 @@ def main():
 
 
 if __name__ == "__main__":
-    main()
+    main()
requirements.txt CHANGED
@@ -1,12 +1,12 @@
-gradio>=4.38.1
-transformers
-sentencepiece
-tiktoken
-icetk
-torch
-nltk
-boto3
-protobuf==4.25.3
-ai2-olmo
-ipadic
-fugashi
+gradio>=4.38.1
+transformers>4.40.0
+sentencepiece
+tiktoken
+icetk
+torch
+nltk
+boto3
+protobuf==4.25.3
+ai2-olmo
+ipadic
+fugashi
stats/character_stats.json CHANGED
@@ -1936,5 +1936,26 @@
         "len(ja-kana)": "1,2,11",
         "num(ko)": 4492,
         "len(ko)": "1,3,6"
+    },
+    "allenai/OLMo-7B-hf": {
+        "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/allenai/OLMo-7B-hf\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">OLMo-7B-hf</a>",
+        "organization": "Allen AI",
+        "vocab_size": 50280,
+        "num(digit)": 2036,
+        "len(digit)": "1,3,35",
+        "num(space)": 29019,
+        "len(space)": "1,7,512",
+        "num(ar)": 94,
+        "len(ar)": "1,2,4",
+        "num(zh)": 313,
+        "len(zh)": "1,1,2",
+        "num(ja)": 480,
+        "len(ja)": "1,1,4",
+        "num(ja-kana)": 167,
+        "len(ja-kana)": "1,1,4",
+        "num(ko)": 25,
+        "len(ko)": "1,1,2",
+        "num(la)": 48651,
+        "len(la)": "1,6,512"
     }
 }
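For readers wondering where a count like `num(zh)` comes from: the exact rules live in the space's statistics code, but the idea is to scan the vocabulary for tokens whose decoded text contains characters of a given script. A rough, illustrative sketch follows; the Unicode range and the per-token decoding below are assumptions, so the result will only approximate the 313 recorded above.

```python
# Illustrative approximation of a per-script vocab count such as num(zh).
# Not the space's actual statistics logic; the range and decoding are assumptions.
import re
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("allenai/OLMo-7B-hf")
zh_char = re.compile(r"[\u4e00-\u9fff]")  # CJK Unified Ideographs

zh_token_count = sum(
    1
    for token in tokenizer.get_vocab()
    if zh_char.search(tokenizer.convert_tokens_to_string([token]))
)
print(zh_token_count)  # the table above records num(zh) = 313 for this tokenizer
```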
stats/compression_rate.json CHANGED
@@ -10306,5 +10306,53 @@
         "oov_ratio": 0.0,
         "_oov_charset": "[]",
         "lossless": true
+    },
+    "allenai/OLMo-7B-hf @ cc100/en": {
+        "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/allenai/OLMo-7B-hf\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">OLMo-7B-hf</a>",
+        "organization": "Allen AI",
+        "vocab_size": 50280,
+        "_n_bytes": 1124813,
+        "_n_tokens": 259357,
+        "_n_chars": 1121360,
+        "_n_oov_chars": 0,
+        "oov_ratio": 0.0,
+        "_oov_charset": "[]",
+        "lossless": false
+    },
+    "allenai/OLMo-7B-hf @ cc100/zh-Hans": {
+        "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/allenai/OLMo-7B-hf\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">OLMo-7B-hf</a>",
+        "organization": "Allen AI",
+        "vocab_size": 50280,
+        "_n_bytes": 2633047,
+        "_n_tokens": 1220529,
+        "_n_chars": 927311,
+        "_n_oov_chars": 0,
+        "oov_ratio": 0.0,
+        "_oov_charset": "[]",
+        "lossless": false
+    },
+    "allenai/OLMo-7B-hf @ cc100/fr": {
+        "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/allenai/OLMo-7B-hf\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">OLMo-7B-hf</a>",
+        "organization": "Allen AI",
+        "vocab_size": 50280,
+        "_n_bytes": 1540504,
+        "_n_tokens": 458961,
+        "_n_chars": 1484970,
+        "_n_oov_chars": 0,
+        "oov_ratio": 0.0,
+        "_oov_charset": "[]",
+        "lossless": false
+    },
+    "allenai/OLMo-7B-hf @ cc100/es": {
+        "tokenizer": "<a target=\"_blank\" href=\"https://huggingface.co/allenai/OLMo-7B-hf\" style=\"color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;\">OLMo-7B-hf</a>",
+        "organization": "Allen AI",
+        "vocab_size": 50280,
+        "_n_bytes": 1664455,
+        "_n_tokens": 494577,
+        "_n_chars": 1630297,
+        "_n_oov_chars": 0,
+        "oov_ratio": 0.0,
+        "_oov_charset": "[]",
+        "lossless": false
+    }
 }
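The new counters translate directly into compression rates. A small sketch, assuming the field names mean what they suggest (`_n_bytes`, `_n_chars`, and `_n_tokens` are corpus totals for each cc100 sample):

```python
# Back-of-the-envelope compression rates from the raw counters above.
import json

with open("stats/compression_rate.json") as f:
    stats = json.load(f)

en = stats["allenai/OLMo-7B-hf @ cc100/en"]
print(en["_n_bytes"] / en["_n_tokens"])  # ≈ 4.34 bytes per token on cc100/en
print(en["_n_chars"] / en["_n_tokens"])  # ≈ 4.32 chars per token on cc100/en

zh = stats["allenai/OLMo-7B-hf @ cc100/zh-Hans"]
print(zh["_n_chars"] / zh["_n_tokens"])  # ≈ 0.76 chars per token on cc100/zh-Hans
```

In other words, the OLMo vocabulary compresses English far better than Simplified Chinese, which is consistent with the small `num(zh)` count recorded in character_stats.json.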
utils/lang_util.py CHANGED
@@ -12,7 +12,7 @@
 此外,有些语言(如法语和西班牙语)在某些情况下可能共享特定的重音符号,这可能导致一个字符串被错误地识别为多种语言。
 
 ## common language
-English | 简体中文 | 繁體中文 | 한국어 | Español | 日本語 | हिन्दी | Русский | Рortuguês | తెలుగు | Français | Deutsch | Tiếng Việt
+English | 简体中文 | 繁體中文 | 한국어 | Español | 日本語 | हिन्दी | Русский | Рortuguês | తెలుగు | Français | Deutsch | Tiếng Việt
 """
 
 import re
@@ -85,4 +85,4 @@ if __name__ == "__main__":
 
     for s, expected in test_strings.items():
         # print(f"'{s}' === Detected lang: {detect_language(s)} === Expected: {expected}")
-        print(f"'{s}'\nDetected lang: {detect_language_by_unicode(s)}\nExpected lang: {expected}")
+        print(f"'{s}'\nDetected lang: {detect_language_by_unicode(s)}\nExpected lang: {expected}")
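The Chinese docstring line in this hunk notes that some languages (for example French and Spanish) can share particular accent marks, so a string may be misidentified as several languages; range-based detection of CJK text has the same ambiguity. A toy sketch of the Unicode-range idea, not the repo's actual `detect_language_by_unicode`:

```python
# Toy Unicode-range detector; the ranges below are illustrative assumptions, not the repo's tables.
import re

UNICODE_RANGES = {
    "zh": r"[\u4e00-\u9fff]",       # CJK Unified Ideographs
    "ja-kana": r"[\u3040-\u30ff]",  # Hiragana and Katakana
    "ko": r"[\uac00-\ud7af]",       # Hangul syllables
    "ar": r"[\u0600-\u06ff]",       # Arabic
    "ru": r"[\u0400-\u04ff]",       # Cyrillic
}

def detect_scripts(text):
    """Return every script whose characters appear in the text."""
    return [lang for lang, pattern in UNICODE_RANGES.items() if re.search(pattern, text)]

print(detect_scripts("Tokenizer Arena 分词"))  # ['zh']
print(detect_scripts("こんにちは世界"))         # ['zh', 'ja-kana'], since kanji fall in the zh range
```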
vocab.py CHANGED
@@ -367,7 +367,7 @@ _all_tokenizer_config = [
     TokenizerConfig("deepseek-ai/DeepSeek-V2", org="DeepSeek"),
     TokenizerConfig("google/gemma-7b", org="Google"),
     TokenizerConfig("google/gemma-2-9b", org="Google"),
-    TokenizerConfig("allenai/OLMo-7B", org="Allen AI"),
+    TokenizerConfig("allenai/OLMo-7B-hf", org="Allen AI"),
     TokenizerConfig("HuggingFaceH4/zephyr-7b-beta", org="HuggingFace"),
     TokenizerConfig("ai21labs/Jamba-v0.1", org="AI21"),
     TokenizerConfig("databricks/dbrx-instruct", org="Databricks"),
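The only change here swaps `allenai/OLMo-7B` for `allenai/OLMo-7B-hf`, the Transformers-native repo. A quick, hedged sanity check that the renamed entry loads and lines up with the stats recorded above; note that `len(tokenizer)` and `tokenizer.vocab_size` can differ depending on added special tokens.

```python
# Sanity check for the renamed config entry; assumes transformers > 4.40.0, as pinned in requirements.txt.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("allenai/OLMo-7B-hf")
print(len(tokenizer), tokenizer.vocab_size)  # character_stats.json above records vocab_size = 50280
```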