MohamedRashad committed · Commit 62d4a12 · 1 Parent(s): 203ba9b
Update tokenization and visualization logic

app.py CHANGED
@@ -6,6 +6,10 @@ from datasets import load_dataset
 import random
 from pathlib import Path
 
+# tokenizer = AutoTokenizer.from_pretrained("Xenova/gpt-4o")
+# token_ids = tokenizer.encode("السلام عليكم ورحمة الله")
+# exit()
+
 initial_list_of_models = [
     "Xenova/gpt-4o",
     "NousResearch/Meta-Llama-3-8B",
@@ -18,12 +22,12 @@ dataset = load_dataset("MohamedRashad/rasaif-translations", split="train")["arab
 
 dataframe_path = Path(__file__).parent / "arabic_tokenizers_leaderboard.jsonl"
 if dataframe_path.exists():
-    df = pd.read_json(dataframe_path, lines=True)
+    df = pd.read_json(dataframe_path, lines=True)
 else:
-    df = pd.DataFrame(columns=["Models", "Total Number of Tokens", "Vocab Size", "Tokenizer Class"])
+    df = pd.DataFrame(columns=["📛 Models", "➕ Total Number of Tokens", "📘 Vocab Size", "Tokenizer Class"])
 
 for model_name in tqdm(initial_list_of_models):
-    if model_name in df["Models"].values:
+    if model_name in df["📛 Models"].values:
         continue
     tokenizer = AutoTokenizer.from_pretrained(
         model_name, use_fast=True, trust_remote_code=True
@@ -32,16 +36,16 @@ for model_name in tqdm(initial_list_of_models):
     number_of_tokens = sum([len(x) for x in tokenizer(dataset).input_ids])
     df = df._append(
         {
-            "Models": model_name,
-            "Vocab Size": vocab_size,
-            "Total Number of Tokens": number_of_tokens,
+            "📛 Models": model_name,
+            "📘 Vocab Size": vocab_size,
+            "➕ Total Number of Tokens": number_of_tokens,
             "Tokenizer Class": tokenizer.__class__.__name__,
         },
         ignore_index=True,
     )
 
 # Sort the dataframe by the number of tokens
-df = df.sort_values(by="Total Number of Tokens", ascending=True)
+df = df.sort_values(by="➕ Total Number of Tokens", ascending=True)
 
 # Save the dataframe to a csv file
 df.to_json(dataframe_path, lines=True, orient="records")
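A side note on the `df._append` calls in this hunk and in `submit` below: `_append` is a private pandas method (the public `DataFrame.append` was removed in pandas 2.0), so this pattern can break on a pandas upgrade. A minimal sketch of the same row insertion with the public `pd.concat`, reusing the column names from this commit (the numeric values are placeholders, not measured results):

import pandas as pd

def append_row(df: pd.DataFrame, row: dict) -> pd.DataFrame:
    # pd.concat with a one-row frame is the public replacement for the
    # removed DataFrame.append / private DataFrame._append.
    return pd.concat([df, pd.DataFrame([row])], ignore_index=True)

df = pd.DataFrame(columns=["📛 Models", "➕ Total Number of Tokens", "📘 Vocab Size", "Tokenizer Class"])
df = append_row(df, {
    "📛 Models": "Xenova/gpt-4o",
    "➕ Total Number of Tokens": 100000,  # placeholder value
    "📘 Vocab Size": 200000,              # placeholder value
    "Tokenizer Class": "GPT2TokenizerFast",
})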
@@ -49,8 +53,8 @@ df.to_json(dataframe_path, lines=True, orient="records")
 # Gradio Functions
 def refresh():
     global df
-    df = df.sort_values(by="Total Number of Tokens", ascending=True)
-    return gr.Dataframe(df), gr.BarPlot(df)
+    df = df.sort_values(by="➕ Total Number of Tokens", ascending=True)
+    return gr.Dataframe(df), gr.BarPlot(df), gr.Dropdown(choices=df["📛 Models"].tolist())
 
 def submit(model_name):
     global df
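The extra `gr.Dropdown(...)` return value relies on Gradio's update mechanism: in recent Gradio versions a handler can return fresh component instances, and each one replaces the component at the same position in `outputs`. A self-contained sketch of that pattern, under that version assumption (the toy dataframe is illustrative, not from this repo):

import gradio as gr
import pandas as pd

df = pd.DataFrame({"📛 Models": ["model-a", "model-b"], "➕ Total Number of Tokens": [2, 1]})

def refresh():
    sorted_df = df.sort_values(by="➕ Total Number of Tokens")
    # Each returned component replaces the output wired at the same index.
    return gr.Dataframe(sorted_df), gr.Dropdown(choices=sorted_df["📛 Models"].tolist())

with gr.Blocks() as demo:
    table = gr.Dataframe(df)
    dropdown = gr.Dropdown(choices=df["📛 Models"].tolist())
    gr.Button("Refresh").click(refresh, outputs=[table, dropdown])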
@@ -61,9 +65,9 @@ def submit(model_name):
     number_of_tokens = sum([len(x) for x in tokenizer(dataset).input_ids])
     df = df._append(
         {
-            "Models": model_name,
-            "Total Number of Tokens": number_of_tokens,
-            "Vocab Size": vocab_size,
+            "📛 Models": model_name,
+            "➕ Total Number of Tokens": number_of_tokens,
+            "📘 Vocab Size": vocab_size,
             "Tokenizer Class": tokenizer.__class__.__name__,
         },
         ignore_index=True,
@@ -144,13 +148,23 @@ def decode_arabic_tokens(tokens):
 def tokenize_text(text, chosen_model):
     tokenizer = AutoTokenizer.from_pretrained(chosen_model)
     tokenized_text = decode_bpe_tokens(tokenizer.tokenize(text))
-    # tokenized_text = decode_arabic_tokens(tokenizer.tokenize(text))
     random_colors = generate_distinct_colors(len(tokenized_text))
-
+
+    final_tokenized_text = []
+    for token in tokenized_text:
+        correct_tokenized_text = ""
+        for char in text:
+            correct_tokenized_text += char
+            current_token = decode_bpe_tokens(tokenizer.tokenize(correct_tokenized_text))
+            if current_token[0] == token:
+                final_tokenized_text.append(correct_tokenized_text)
+                text = text[len(correct_tokenized_text):]
+                break
+    print(final_tokenized_text)
 
     output = []
     color_map = {}
-    for idx, token in enumerate(tokenized_text):
+    for idx, token in enumerate(final_tokenized_text):
         output.append((token, str(idx)))
         color_map[str(idx+1)] = random_colors[idx % len(random_colors)]
 
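The new prefix-matching loop re-tokenizes a growing prefix of `text` once per token, which is quadratic in the text length, and a token is silently skipped whenever no prefix reproduces it exactly. Fast tokenizers can return character offsets in a single pass, which sidesteps both the re-tokenization and the byte-level decoding of Arabic BPE pieces; a minimal sketch of that alternative, assuming a fast tokenizer is available for the chosen model:

from transformers import AutoTokenizer

def token_spans(text: str, model_name: str) -> list[str]:
    # Fast (Rust-backed) tokenizers can map every token to its
    # (start, end) character span in the original string.
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
    encoding = tokenizer(text, add_special_tokens=False, return_offsets_mapping=True)
    # Slicing the original string keeps multi-byte Arabic characters intact,
    # so no byte-level BPE decoding step is needed.
    return [text[start:end] for start, end in encoding["offset_mapping"]]

print(token_spans("السلام عليكم ورحمة الله", "Xenova/gpt-4o"))

Because the spans index the original string, the highlighted segments always concatenate back to the exact input text, even when a displayed token spans a space or diacritic.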
@@ -169,16 +183,15 @@ with gr.Blocks() as demo:
         with gr.Accordion("Barplot", open=False):
             barplot = gr.BarPlot(
                 df,
-                x="Models",
-                y="Total Number of Tokens",
+                x="📛 Models",
+                y="➕ Total Number of Tokens",
                 x_title=" ",
                 y_title=" ",
                 width=1000,
                 height=400,
-                tooltip=["Vocab Size", "Total Number of Tokens"],
+                tooltip=["📘 Vocab Size", "➕ Total Number of Tokens"],
                 vertical=False,
                 x_label_angle=30,
-                caption="Total Number of Tokens",
             )
         model_name = gr.Textbox(
             label="Model Name from Hugging Face (e.g. Xenova/gpt-4o)"
@@ -190,14 +203,14 @@ with gr.Blocks() as demo:
         text = gr.Textbox(label="Enter a text", lines=5, value="السلام عليكم ورحمة الله", rtl=True, text_align="right")
         dropdown = gr.Dropdown(
             label="Select a model",
-            choices=df["Models"].tolist(),
-            value=df["Models"].tolist()[0],
+            choices=df["📛 Models"].tolist(),
+            value=df["📛 Models"].tolist()[0],
         )
         submit_text_btn = gr.Button(value="Submit", variant="primary")
         tokenized_textbox = gr.HighlightedText(label="Tokenized text")
 
     submit_new_model_btn.click(submit, model_name)
-    refresh_btn.click(refresh, outputs=[dataframe, barplot])
+    refresh_btn.click(refresh, outputs=[dataframe, barplot, dropdown])
     submit_text_btn.click(tokenize_text, inputs=[text, dropdown], outputs=[tokenized_textbox])