File size: 2,648 Bytes
7c73423
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
"""

##

"""

import gradio as gr
from character_util import get_character_table, default_columns

# (display name, abbreviation) pairs for every character type the table can count.
# The abbreviation is what `character_util.get_character_table` receives.
all_columns = [
    ("digit", "digit"),
    ("space", "space"),
    ("lang-chinese", "zh"),
    ("lang-korea", "ko"),
    ("lang-japanese", "ja"),
    # ("byte", "byte"),
    # ("oov", "oov")
]


# columns = ["lang-zh", "lang-korea", "lang-japanese", "number", "space", "bytes", "oov"]

# Map each abbreviation back to a human-readable name, e.g. "zh" -> "chinese"
# (the part of the display name after the last "-").
abbr2name = {}
for display_name, abbr in all_columns:
    abbr2name[abbr] = display_name.split('-')[-1]


def get_column_info(columns):
    """Return a markdown bullet list explaining the `num(..)`/`len(..)` table columns.

    :param columns: abbreviations selected in the UI (keys of ``abbr2name``).
    :return: markdown string, two bullets per selected character type.
    """
    bullets = []
    for abbr in columns:
        char_name = abbr2name[abbr]
        bullets.append(f"- `num({abbr})`: num of tokens containing {char_name} characters\n")
        bullets.append(f"- `len({abbr})`: `min,median,max` length of tokens containing {char_name} characters\n")
    return "".join(bullets)


with gr.Blocks() as demo:
    gr.Markdown("## 🛠️ Setting")  # ⚙
    with gr.Accordion("Please select the type of character you want to count.", open=True):
        # file size 💽 🖴, tokens 🧮
        with gr.Row():
            with gr.Column():
                # Fix: the canonical component class is `gr.CheckboxGroup`;
                # the lowercase `gr.Checkboxgroup` alias was removed in Gradio 4.x.
                columns = gr.CheckboxGroup(
                    all_columns,
                    value=default_columns,
                    label="character type",
                    # info=""
                )
                gr.Markdown(
                    "To count other types of characters, you can modify [lang_util.py]"
                    "(https://huggingface.co/spaces/eson/tokenizer-arena/blob/main/utils/lang_util.py). "
                )
            # Explanation of the selected columns, kept in sync via columns.change below.
            column_info = gr.Markdown(
                get_column_info(default_columns)
            )

        # NOTE(review): the statistics heading, search bar and table below are
        # still nested inside the settings Accordion, so collapsing it hides
        # the whole table — confirm this layout is intended.
        gr.Markdown("## 📊 Character Statistics")
        search_bar = gr.Textbox(
            placeholder="🔍 Search by tokenizer or organization (e.g., 'llama', 'openai') and press ENTER...",
            show_label=False,
            elem_id="search-bar",
        )
        compress_rate_table = gr.Dataframe(datatype="html", wrap=True)

        # Recompute the table on search submit and on column selection change.
        search_bar.submit(
            get_character_table,
            inputs=[search_bar, columns],
            outputs=compress_rate_table
        )
        columns.change(
            get_character_table,
            inputs=[search_bar, columns],
            outputs=compress_rate_table
        )
        # Keep the column explanation in sync with the selected character types.
        columns.change(
            get_column_info,
            inputs=[columns],
            outputs=column_info
        )

        # Populate the table once when the page first loads.
        demo.load(
            get_character_table,
            inputs=[search_bar, columns],
            outputs=compress_rate_table
        )

# Fix: launch at module level, after the Blocks context has been closed,
# instead of inside the `with` block (standard Gradio entry-point pattern).
if __name__ == "__main__":
    demo.launch()