Spaces:
Running
Running
import gradio as gr | |
from character_util import get_character_table | |
# Selectable character categories as (checkbox label, abbreviation) pairs.
# The abbreviation (second element) is the key used by `default_columns`
# and `abbr2name` below.
all_columns = [
    ("digit", "digit"),
    ("space", "space"),
    ("lang-chinese", "zh"),
    ("lang-korea", "ko"),
    ("lang-japanese", "ja"),
    # Not enabled:
    # ("byte", "byte"),
    # ("oov", "oov")
]

# Abbreviations that are ticked when the page first renders.
default_columns = ["digit", "zh"]

# columns = ["lang-zh", "lang-korea", "lang-japanese", "number", "space", "bytes", "oov"]

# Abbreviation -> readable name, i.e. the label with any "lang-" style prefix
# dropped (everything after the last "-").
abbr2name = {abbr: label.rpartition("-")[2] for label, abbr in all_columns}
def get_column_info(columns, names=None):
    """Build the markdown legend explaining the statistics columns.

    For every selected abbreviation two bullet points are produced: one
    describing `num(<abbr>)` and one describing `len(<abbr>)`.

    Args:
        columns: Iterable of selected column abbreviations (e.g. ``"zh"``).
            ``None`` or an empty selection yields an empty string.
        names: Optional mapping from abbreviation to readable name. Defaults
            to the module-level ``abbr2name`` table.

    Returns:
        The markdown text, one line per bullet, each terminated by a newline.
    """
    if names is None:
        names = abbr2name

    bullets = []
    for column in columns or ():
        # Fall back to the abbreviation itself so an unknown column still
        # renders a legend instead of raising inside the UI callback.
        readable = names.get(column, column)
        bullets.append(
            f"- `num({column})`: num of tokens containing {readable} characters\n"
        )
        bullets.append(
            f"- `len({column})`: `min,median,max` length of tokens containing {readable} characters\n"
        )
    return "".join(bullets)
# Page layout: a settings panel for picking character types, followed by a
# searchable statistics table.
# NOTE(review): the original indentation was not available; the Row/Column
# nesting below is a best-effort reconstruction — confirm against the
# rendered layout.
with gr.Blocks() as demo:
    gr.Markdown("## 🛠️ Setting")
    with gr.Accordion("Please select the type of character you want to count.", open=True):
        with gr.Row():
            with gr.Column():
                columns = gr.Checkboxgroup(
                    all_columns,
                    value=default_columns,
                    label="character type",
                )
                gr.Markdown(
                    "To count other types of characters, you can modify [character_util.py]"
                    "(https://huggingface.co/spaces/eson/tokenizer-arena/blob/main/character_util.py). "
                )
            # Legend for the currently selected columns; pre-rendered for the defaults.
            column_info = gr.Markdown(get_column_info(default_columns))

    gr.Markdown("## 📊 Character Statistics")
    search_bar = gr.Textbox(
        placeholder="🔍 Search by tokenizer or organization (e.g., 'llama', 'openai') and press ENTER...",
        show_label=False,
        elem_id="search-bar",
    )
    compress_rate_table = gr.Dataframe(datatype="html", wrap=True)

    # The table is recomputed from the same inputs on search submit, on a
    # change of the selected columns, and on page load.
    _table_io = {"inputs": [search_bar, columns], "outputs": compress_rate_table}
    search_bar.submit(get_character_table, **_table_io)
    columns.change(get_character_table, **_table_io)
    # Changing the selection also refreshes the legend.
    columns.change(get_column_info, inputs=[columns], outputs=column_info)
    demo.load(get_character_table, **_table_io)

if __name__ == "__main__":
    demo.launch()