""" | |
TODO: | |
- 统计 tokenizer_impl | |
- 统计 OOV | |
- 统计 reversal | |
- 增加 math,code | |
""" | |
import gradio as gr
from compression_util import get_compression_leaderboard, common_corpuses
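
# Gradio page: a settings panel (corpus subsets + compression measure) feeding a
# searchable compression-rate leaderboard table.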
with gr.Blocks() as demo:
    # gr.Markdown("## Convertor")
    # with gr.Accordion("Convertor", open=False):
    #     gr.Markdown("Tokenize {} corpus")
    #     with gr.Row(elem_classes="no-border"):
    #         gr.Button("File Size", min_width=50)
    #         file_size = gr.Textbox(
    #             show_label=False,
    #             min_width=50,
    #             # elem_classes="textbox-as-text"
    #         )
    #         gr.Dropdown(
    #             choices=['MB', 'GB', 'TB'],
    #             show_label=False,
    #             min_width=15,
    #             # elem_classes="textbox-as-text"
    #         )
    #         # gr.Markdown('<h2 align="center">≈</h2>')
    #         # gr.HTML('<h2 style="margin: auto;">≈</h2>')
    #         gr.Button(
    #             "≈",
    #             min_width=10,
    #             elem_classes="button-white h2-font"
    #         )
    #         gr.Button(
    #             "Tokens",
    #             min_width=50
    #         )
    #         gr.Textbox(
    #             show_label=False,
    #             min_width=50
    #         )
    #         gr.Dropdown(
    #             ['million', 'billion', 'trillion'],
    #             show_label=False,
    #             min_width=15,
    #             elem_classes="button-white"
    #         )
gr.Markdown("## 🛠️ Setting") # ⚙ | |
with gr.Accordion("Please select the corpus and measure of compression rate.", open=True): | |
# file size 💽 🖴, tokens 🧮 | |
# Total amount of disk used | |
with gr.Row(): | |
with gr.Column(): | |
compress_rate_corpus = gr.Dropdown( | |
common_corpuses, # , "code" | |
value=["cc100/en", "cc100/zh-Hans", "cc100/fr", "cc100/es"], | |
label="corpus", | |
multiselect=True | |
# info="" | |
) | |
# unit of file_size: gigabyte terabyte | |
# unit of token_num: million billion trillion | |
# The most common units of measurement include length (meter, inch, foot), weight (gram, kilogram, pound), volume (liter, gallon, milliliter), time (second, minute, hour) | |
compress_rate_unit = gr.Radio( | |
["b_tokens/g_bytes", "t_tokens/t_bytes"], | |
value="b_tokens/g_bytes", | |
label="measure", # evaluation metric | |
) | |
            gr.Markdown(
                "- `corpus`: tokenization is performed on the selected subsets of the [cc100](https://huggingface.co/datasets/cc100) corpus.\n"
                "- `b_tokens/g_bytes` measures how many billion tokens are produced per gigabyte of corpus.\n"
                "- `t_tokens/t_bytes` measures how many trillion tokens are produced per terabyte of corpus.\n"
                # "- `g_bytes/b_tokens` measures how many gigabytes of corpus per billion tokens.\n"
                # "- `t_bytes/t_tokens` measures how many terabytes of corpus per trillion tokens.\n"
                "- `char/token` measures how many characters per token on the tokenized corpus.\n"
                "- `oov_ratio`: out-of-vocabulary ratio on the selected corpus. 👉 see the [oov charset](https://huggingface.co/spaces/eson/tokenizer-arena/blob/main/stats/compression_rate.json)\n\n"
                "You can reproduce these results with [compression_util.py](https://huggingface.co/spaces/eson/tokenizer-arena/blob/main/compression_util.py)."
            )
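            # Illustration with assumed numbers (not measured): tokenizing 1 GB of raw
            # text into 0.28 billion tokens gives b_tokens/g_bytes = 0.28 / 1 = 0.28;
            # a lower value means the tokenizer needs fewer tokens to encode the corpus.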
gr.Markdown("## 🏆 Compression Rate Leaderboard") | |
search_bar = gr.Textbox( | |
placeholder="🔍 Search by tokenizer or organization (e.g., 'llama', 'openai') and press ENTER...", | |
show_label=False, | |
elem_id="search-bar", | |
) | |
compress_rate_table = gr.Dataframe(datatype="html") | |
    # Recompute the leaderboard whenever the corpus selection, measure, or search
    # query changes, and populate it once on page load.
    compress_rate_corpus.change(
        get_compression_leaderboard,
        inputs=[compress_rate_corpus, compress_rate_unit, search_bar],
        outputs=compress_rate_table
    )
    compress_rate_unit.change(
        get_compression_leaderboard,
        inputs=[compress_rate_corpus, compress_rate_unit, search_bar],
        outputs=compress_rate_table
    )
    # file_size.change(
    #     get_all_compress_rate,
    #     outputs=compress_rate_table
    # )
    search_bar.submit(
        get_compression_leaderboard,
        inputs=[
            compress_rate_corpus,
            compress_rate_unit,
            search_bar,
        ],
        outputs=compress_rate_table
    )
    demo.load(
        get_compression_leaderboard,
        inputs=[compress_rate_corpus, compress_rate_unit],
        outputs=compress_rate_table
    )

if __name__ == "__main__":
    demo.launch()