update
- compression_app.py +4 -3
- compression_util.py +0 -2
- stats/compression_rate.json +0 -0
compression_app.py
CHANGED
@@ -27,7 +27,8 @@ from compression_util import get_compression_leaderboard, common_corpuses
 # exactly reconstructed from compressed tokens
 docs = """## 📖 What is a good tokenizer?
 
-From a compression perspective, a good tokenizer should be lossless,
+From a compression perspective, a good tokenizer should be lossless,
+and keep high compression rate (less tokens for a given text).
 The encoding and decoding process can be formulated as
 ```python
 token_ids = tokenizer.encode(input_text)  # compressed tokens
@@ -142,9 +143,9 @@ with gr.Blocks(theme=theme) as demo:
     )
 
     gr.Markdown("## 🏆 Compression Rate Leaderboard\n"
-                "
+                "This leaderboard aims to evaluate tokenizer performance on different languages.\n"
                 "Lower `oov_ratio` refers to less out-of-vocabulary tokens.\n"
-                "
+                "Lower `char/token` means more words might be segmented into subwords."
                 )
     search_bar = gr.Textbox(
         placeholder="🔍 Search by tokenizer or organization (e.g., 'llama', 'openai') and press ENTER...",
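The updated docs text describes the two properties the leaderboard measures: losslessness (the input text is exactly reconstructed from the compressed tokens) and compression rate (fewer tokens per character). A minimal sketch of that encode/decode round-trip, not part of this commit, assuming a Hugging Face `transformers` tokenizer; "gpt2" is only an illustrative choice:

```python
# Illustrative sketch only (not part of this commit): the lossless round-trip
# and char/token compression rate described in the docs string.
# Assumes the `transformers` package; "gpt2" is an arbitrary example tokenizer.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")

input_text = "What is a good tokenizer?"
token_ids = tokenizer.encode(input_text)    # compressed tokens
decoded_text = tokenizer.decode(token_ids)  # reconstructed text

print(decoded_text == input_text)           # lossless tokenizer => True
print(len(input_text) / len(token_ids))     # higher char/token => better compression
```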
compression_util.py
CHANGED
@@ -297,9 +297,7 @@ def get_compression_leaderboard(
     reverse_unit = f"{file_size_unit}/{token_number_unit}"
     stats = to_dataframe(stats, ["char/token", unit, reverse_unit])
     stats = stats.sort_values(["oov_ratio", "char/token"], ascending=[True, False])
-
     # stats = stats.sort_values(["oov_ratio", unit], ascending=[True, True])
-
     stats = stats.rename(columns={"oov_ratio": f' ⬆️oov_ratio'}).rename(columns={"char/token": ' ⬇️char/token'})  #
     return stats
 
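For reference, the ranking kept here sorts rows by `oov_ratio` ascending and `char/token` descending before adding the ⬆️/⬇️ hints to the column names. A minimal pandas sketch of that pattern; the tokenizer names and numbers below are made up for illustration:

```python
# Illustrative sketch only (not part of this commit): the sort/rename pattern
# used in get_compression_leaderboard, applied to hypothetical data.
import pandas as pd

stats = pd.DataFrame({
    "tokenizer": ["llama", "gpt-4", "bert-base"],  # hypothetical rows
    "oov_ratio": [0.00, 0.00, 0.02],
    "char/token": [4.2, 3.9, 3.1],
})

# Lower oov_ratio ranks first; ties are broken by higher char/token.
stats = stats.sort_values(["oov_ratio", "char/token"], ascending=[True, False])
stats = stats.rename(columns={"oov_ratio": " ⬆️oov_ratio",
                              "char/token": " ⬇️char/token"})
print(stats)
```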
stats/compression_rate.json
CHANGED
The diff for this file is too large to render. See raw diff.