MohamedRashad committed on
Commit 62d4a12 · 1 Parent(s): 203ba9b

Update tokenization and visualization logic

Files changed (1):
  1. app.py +35 -22
app.py CHANGED
@@ -6,6 +6,10 @@ from datasets import load_dataset
 import random
 from pathlib import Path
 
+# tokenizer = AutoTokenizer.from_pretrained("Xenova/gpt-4o")
+# token_ids = tokenizer.encode("السلام عليكم ورحمة الله")
+# exit()
+
 initial_list_of_models = [
     "Xenova/gpt-4o",
     "NousResearch/Meta-Llama-3-8B",
@@ -18,12 +22,12 @@ dataset = load_dataset("MohamedRashad/rasaif-translations", split="train")["arab
 
 dataframe_path = Path(__file__).parent / "arabic_tokenizers_leaderboard.jsonl"
 if dataframe_path.exists():
-    df = pd.read_jsonl(dataframe_path, lines=True)
+    df = pd.read_json(dataframe_path, lines=True)
 else:
-    df = pd.DataFrame(columns=["Models", "Total Number of Tokens", "Vocab Size", "Tokenizer Class"])
+    df = pd.DataFrame(columns=["📛 Models", "Total Number of Tokens", "📘 Vocab Size", "Tokenizer Class"])
 
 for model_name in tqdm(initial_list_of_models):
-    if model_name in df["Models"].values:
+    if model_name in df["📛 Models"].values:
         continue
     tokenizer = AutoTokenizer.from_pretrained(
         model_name, use_fast=True, trust_remote_code=True
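
A note on the read fix in this hunk: pandas has no read_jsonl; JSON Lines files are read with read_json(..., lines=True), which round-trips with the to_json(..., lines=True, orient="records") call later in the file. A minimal sketch (leaderboard.jsonl is a placeholder path, not the app's real file):

import pandas as pd

# Write one JSON object per line, then read the frame back.
df = pd.DataFrame({"📛 Models": ["Xenova/gpt-4o"], "Total Number of Tokens": [42]})
df.to_json("leaderboard.jsonl", lines=True, orient="records")
df_back = pd.read_json("leaderboard.jsonl", lines=True)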
@@ -32,16 +36,16 @@ for model_name in tqdm(initial_list_of_models):
     number_of_tokens = sum([len(x) for x in tokenizer(dataset).input_ids])
     df = df._append(
         {
-            "Models": model_name,
-            "Vocab Size": vocab_size,
-            "Total Number of Tokens": number_of_tokens,
+            "📛 Models": model_name,
+            "📘 Vocab Size": vocab_size,
+            "Total Number of Tokens": number_of_tokens,
             "Tokenizer Class": tokenizer.__class__.__name__,
         },
         ignore_index=True,
     )
 
 # Sort the dataframe by the number of tokens
-df = df.sort_values(by="Total Number of Tokens", ascending=True)
+df = df.sort_values(by="Total Number of Tokens", ascending=True)
 
 # Save the dataframe to a JSON Lines file
 df.to_json(dataframe_path, lines=True, orient="records")
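
Worth flagging: DataFrame._append (kept by this hunk) is a private pandas method, the remnant of DataFrame.append that was removed in pandas 2.0, so it may break on upgrade. A sketch of the same row insertion through the public pd.concat, reusing the loop's variables:

import pandas as pd

row = {
    "📛 Models": model_name,
    "📘 Vocab Size": vocab_size,
    "Total Number of Tokens": number_of_tokens,
    "Tokenizer Class": tokenizer.__class__.__name__,
}
# pd.concat is the public replacement for the removed DataFrame.append.
df = pd.concat([df, pd.DataFrame([row])], ignore_index=True)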
@@ -49,8 +53,8 @@ df.to_json(dataframe_path, lines=True, orient="records")
 # Gradio Functions
 def refresh():
     global df
-    df = df.sort_values(by="Total Number of Tokens", ascending=True)
-    return gr.Dataframe(df), gr.BarPlot(df)
+    df = df.sort_values(by="Total Number of Tokens", ascending=True)
+    return gr.Dataframe(df), gr.BarPlot(df), gr.Dropdown(choices=df["📛 Models"].tolist())
 
 def submit(model_name):
     global df
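
The three values refresh now returns are matched positionally to outputs=[dataframe, barplot, dropdown] wired up at the bottom of the file. An equivalent sketch using gr.update, assuming a Gradio version where update patches existing components instead of rebuilding them:

import gradio as gr

def refresh():
    global df
    df = df.sort_values(by="Total Number of Tokens", ascending=True)
    # One update per output component, in the same order as outputs=[...].
    return (
        gr.update(value=df),                          # dataframe
        gr.update(value=df),                          # barplot
        gr.update(choices=df["📛 Models"].tolist()),  # dropdown
    )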
@@ -61,9 +65,9 @@ def submit(model_name):
     number_of_tokens = sum([len(x) for x in tokenizer(dataset).input_ids])
     df = df._append(
         {
-            "Models": model_name,
-            "Vocab Size": vocab_size,
-            "Total Number of Tokens": number_of_tokens,
+            "📛 Models": model_name,
+            "Total Number of Tokens": number_of_tokens,
+            "📘 Vocab Size": vocab_size,
             "Tokenizer Class": tokenizer.__class__.__name__,
         },
         ignore_index=True,
@@ -144,13 +148,23 @@ def decode_arabic_tokens(tokens):
 def tokenize_text(text, chosen_model):
     tokenizer = AutoTokenizer.from_pretrained(chosen_model)
     tokenized_text = decode_bpe_tokens(tokenizer.tokenize(text))
-    # tokenized_text = decode_arabic_tokens(tokenizer.tokenize(text))
     random_colors = generate_distinct_colors(len(tokenized_text))
-    print(tokenized_text)
+
+    final_tokenized_text = []
+    for token in tokenized_text:
+        correct_tokenized_text = ""
+        for char in text:
+            correct_tokenized_text += char
+            current_token = decode_bpe_tokens(tokenizer.tokenize(correct_tokenized_text))
+            if current_token[0] == token:
+                final_tokenized_text.append(correct_tokenized_text)
+                text = text[len(correct_tokenized_text):]
+                break
+    print(final_tokenized_text)
 
     output = []
     color_map = {}
-    for idx, token in enumerate(tokenized_text):
+    for idx, token in enumerate(final_tokenized_text):
         output.append((token, str(idx)))
         color_map[str(idx+1)] = random_colors[idx % len(random_colors)]
 
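The alignment loop added here re-tokenizes a growing prefix of the text for every token, which is quadratic in the text length and silently skips a token that never appears as the first token of any prefix. A minimal alternative sketch, assuming a fast tokenizer (character offsets are only available on fast tokenizers):

def token_spans(text, tokenizer):
    # Fast tokenizers report (start, end) character offsets per token,
    # mapping each token directly back to the substring it covers.
    enc = tokenizer(text, return_offsets_mapping=True, add_special_tokens=False)
    return [text[start:end] for start, end in enc["offset_mapping"]]
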
@@ -169,16 +183,15 @@ with gr.Blocks() as demo:
     with gr.Accordion("Barplot", open=False):
         barplot = gr.BarPlot(
             df,
-            x="Models",
-            y="Total Number of Tokens",
+            x="📛 Models",
+            y="Total Number of Tokens",
             x_title=" ",
             y_title=" ",
             width=1000,
             height=400,
-            tooltip=["Vocab Size", "Total Number of Tokens"],
+            tooltip=["📘 Vocab Size", "Total Number of Tokens"],
             vertical=False,
             x_label_angle=30,
-            caption="Total Number of Tokens",
         )
     model_name = gr.Textbox(
         label="Model Name from Hugging Face (e.g. Xenova/gpt-4o)"
@@ -190,14 +203,14 @@
     text = gr.Textbox(label="Enter a text", lines=5, value="السلام عليكم ورحمة الله", rtl=True, text_align="right")
     dropdown = gr.Dropdown(
         label="Select a model",
-        choices=df["Models"].tolist(),
-        value=df["Models"].tolist()[0],
+        choices=df["📛 Models"].tolist(),
+        value=df["📛 Models"].tolist()[0],
     )
     submit_text_btn = gr.Button(value="Submit", variant="primary")
     tokenized_textbox = gr.HighlightedText(label="Tokenized text")
 
     submit_new_model_btn.click(submit, model_name)
-    refresh_btn.click(refresh, outputs=[dataframe, barplot])
+    refresh_btn.click(refresh, outputs=[dataframe, barplot, dropdown])
     submit_text_btn.click(tokenize_text, inputs=[text, dropdown], outputs=[tokenized_textbox])
 
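For reference, gr.HighlightedText colors each (text, label) pair by looking its label up in color_map, so the labels built in tokenize_text must match the color_map keys exactly; the loop above labels tokens str(idx) but keys the map with str(idx+1), leaving label "0" uncolored. A self-contained sketch of the matching form:

import gradio as gr

tokens = ["السلام", " عليكم"]    # placeholder tokens
colors = ["#e6194b", "#3cb44b"]  # placeholder colors
output = [(tok, str(i)) for i, tok in enumerate(tokens)]
color_map = {str(i): colors[i] for i in range(len(tokens))}  # keys match the labels
highlighted = gr.HighlightedText(value=output, color_map=color_map)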