José Ángel González committed on
Commit
ed1f9e1
·
1 Parent(s): c563d70

improve distinction between Spanish and Spanish Mixed

Browse files
Files changed (2) hide show
  1. app.py +337 -223
  2. etc/languages_settings.yml +2 -2
app.py CHANGED
@@ -8,6 +8,7 @@ from pathlib import Path
8
  import pandas as pd
9
  import streamlit as st
10
  import plotly.express as px
 
11
 
12
  from datasets import load_dataset
13
  from huggingface_hub import CommitScheduler, hf_hub_download
@@ -20,7 +21,12 @@ from src.task_mappings import professional_mapping, semantic_categories
20
  # -----------------------------------------------------------------------------
21
  # Page configuration and global CSS styles for modern look and improved UX
22
  # -----------------------------------------------------------------------------
23
- st.set_page_config(page_title="IberBench", layout="wide", initial_sidebar_state="expanded", page_icon="🌍")
 
 
 
 
 
24
 
25
  st.markdown(
26
  """
@@ -68,8 +74,16 @@ request_folder = request_file.parent
68
  LANGUAGES_SETTINGS = Path("etc/languages_settings.yml")
69
 
70
  dataset_columns = [
71
- "workshop", "shared_task", "year", "task_type", "language",
72
- "url", "language_variety", "problem_type", "num_labels", "labels",
 
 
 
 
 
 
 
 
73
  ]
74
  model_columns = ["model_name", "model_type", "num_parameters"]
75
 
@@ -83,30 +97,42 @@ scheduler = CommitScheduler(
83
  every=10,
84
  )
85
 
 
86
  def log_submission(input_dict: dict) -> None:
87
  with scheduler.lock:
88
  with request_file.open("a") as f:
89
  f.write(json.dumps(input_dict))
90
  f.write("\n")
91
 
 
92
  def get_lang_columns(columns: list, lang: str):
93
- lang_norm = lang.lower().replace(" ", "_")
94
- return [col for col in columns if lang_norm in col]
 
 
 
 
 
 
 
95
 
96
  @st.cache_data
97
  def load_data(lang) -> pd.DataFrame:
98
  try:
99
- data = load_dataset("iberbench/lm-eval-results", token=st.secrets["HF_TOKEN"])["train"].to_pandas()
 
 
100
  task_columns = [col for col in data.columns if col not in model_columns]
101
  task_lang_columns = get_lang_columns(task_columns, lang)
102
- data[task_columns] = data[task_columns]*100
103
  data = data[model_columns + task_lang_columns]
104
- #data["Active"] = False
105
  return data
106
  except FileNotFoundError:
107
  st.error("iberbench/lm-eval-results was not found in the hub 😕")
108
  return pd.DataFrame()
109
 
 
110
  def load_dataset_card(task) -> list:
111
  name_repo = "iberbench/" + task
112
  try:
@@ -130,16 +156,24 @@ def load_dataset_card(task) -> list:
130
 
131
 
132
  def active_data(lang) -> pd.DataFrame:
133
- return st.session_state[f"leaderboard_data_{lang}"][st.session_state[f"leaderboard_data_{lang}"]["Active"] == True].copy()
 
 
 
134
 
135
  def get_index(lang, row) -> pd.Series:
136
  return active_data(lang).iloc[row].name
137
 
 
138
  def commit(lang) -> None:
139
  for row in st.session_state[f"edited_data_{lang}"]["edited_rows"]:
140
  row_index = get_index(lang, row)
141
- for key, value in st.session_state[f"edited_data_{lang}"]["edited_rows"][row].items():
142
- st.session_state[f"leaderboard_data_{lang}"].at[row_index, key] = value
 
 
 
 
143
 
144
 
145
  # -----------------------------------------------------------------------------
@@ -172,10 +206,14 @@ def create_table_results(df_mean: pd.DataFrame):
172
 
173
  def create_table_all_results(aggregated_df: pd.DataFrame):
174
  combined_df = create_data_results_per_language()
175
- df_lang= combined_df.pivot(index='model_name', columns='language', values='Mean')
176
- aggregated_df[df_lang.columns]=df_lang[df_lang.columns].values
 
 
177
  rank_value = []
178
- for i in aggregated_df["Mean"].rank(method="dense", ascending=False).astype(int):
 
 
179
  if i == 1:
180
  rank_value.append(f"{i} 🥇")
181
  elif i == 2:
@@ -195,7 +233,7 @@ def create_table_all_results(aggregated_df: pd.DataFrame):
195
  "model_type": st.column_config.TextColumn("Type 📌"),
196
  "num_parameters": st.column_config.NumberColumn("Model Size 🔢"),
197
  },
198
- )
199
 
200
 
201
  def create_scatter_chart(df: pd.DataFrame, id_: str):
@@ -206,40 +244,57 @@ def create_scatter_chart(df: pd.DataFrame, id_: str):
206
  color="model_name",
207
  size="num_parameters",
208
  hover_data=["model_type"],
209
- labels={"num_parameters": "Num parameters"}
210
  )
211
  fig.update_layout(template="plotly_white")
212
- st.plotly_chart(fig, use_container_width=True, key=id_ + str(random.random()))
 
 
 
213
 
214
  def create_radar_chart(df: pd.DataFrame, id_: str):
215
  df = df.sort_values(by="Mean", ascending=False)
216
- radar_df = pd.DataFrame({
217
- "r": df["Mean"][:10],
218
- "theta": df["model_name"][:10]
219
- })
220
  fig = px.line_polar(
221
- radar_df, r="r", theta="theta", line_close=True, markers=True,
 
 
 
 
222
  )
223
  fig.update_traces(fill="toself")
224
- st.plotly_chart(fig, use_container_width=True, key=id_ + str(random.random()))
 
 
225
 
226
 
227
  def create_pie_chart(df: pd.DataFrame, id_: str):
228
  df_pie = df["model_type"].value_counts().reset_index()
229
  df_pie.columns = ["model_type", "count"]
230
  fig = px.pie(
231
- df_pie, values="count", names="model_type",
232
- labels={"model_type": "Model type"}
 
 
 
 
 
233
  )
234
- st.plotly_chart(fig, use_container_width=True, key=id_ + str(random.random()))
235
 
236
 
237
  def create_box_plot(df: pd.DataFrame, id_: str):
238
  fig = px.box(
239
- df, x="model_type", y="Mean", points="all",
240
- labels={"model_type": "Model type"}
 
 
 
 
 
 
241
  )
242
- st.plotly_chart(fig, use_container_width=True, key=id_ + str(random.random()))
243
 
244
 
245
  def get_summary_df(lang: str, task_types: list) -> pd.DataFrame:
@@ -247,8 +302,11 @@ def get_summary_df(lang: str, task_types: list) -> pd.DataFrame:
247
  if not st.session_state[f"leaderboard_data_{lang}"].empty:
248
  for t in task_types:
249
  task_list = semantic_categories[t]
250
- cols = [col for col in st.session_state[f"leaderboard_data_{lang}"].columns
251
- if "iberbench/" + col in task_list]
 
 
 
252
  if cols:
253
  tmp = st.session_state[f"leaderboard_data_{lang}"][cols]
254
  df[t] = tmp.mean(axis=1).round(2)
@@ -259,7 +317,6 @@ def get_summary_df(lang: str, task_types: list) -> pd.DataFrame:
259
  return df
260
 
261
 
262
-
263
  def get_all_languages_summary_df() -> pd.DataFrame:
264
  """Combine leaderboard summary data from all languages using get_summary_df."""
265
  combined_df = pd.DataFrame()
@@ -269,7 +326,9 @@ def get_all_languages_summary_df() -> pd.DataFrame:
269
  task_types = select_task_per_language(lang)
270
  summary_df = get_summary_df(lang, task_types)
271
  summary_df["language"] = lang
272
- combined_df = pd.concat([combined_df, summary_df], ignore_index=True)
 
 
273
  return combined_df
274
 
275
 
@@ -283,14 +342,16 @@ def create_results_visualization_lang(lang: str):
283
  create_table_results(summary_df)
284
  st.markdown("### Language plots 📊")
285
  # Display the results table for the selected language
286
-
287
- in_lang_tabs = st.tabs([
288
- "Top 10 performance 🥇",
289
- "Performance vs. size 📏",
290
- "Performance per type 💡",
291
- "Fundamental vs industry ⚖️",
292
- "Performance per task category 📈",
293
- ])
 
 
294
  with in_lang_tabs[0]:
295
  create_radar_chart(summary_df, lang + "in_radar")
296
  with in_lang_tabs[1]:
@@ -301,29 +362,38 @@ def create_results_visualization_lang(lang: str):
301
  create_box_plot_per_task_category(tasks_df, lang + "in_box_task_cat")
302
  with in_lang_tabs[4]:
303
  create_box_plot_per_semantic_category(tasks_df, lang + "in_box_sem_cat")
304
-
 
305
  # -----------------------------------------------------------------------------
306
  # Functions for other visualization sections
307
  # -----------------------------------------------------------------------------
308
 
 
309
  def select_task_per_language(lang: str):
310
  types = []
311
  for k, v in semantic_categories.items():
312
  for vv in v:
313
  task_name = vv.split("iberbench/")[1]
314
- if task_name in list(st.session_state[f"leaderboard_data_{lang}"].columns):
 
 
315
  if k not in types:
316
  types.append(k)
317
  return types
318
 
 
319
  def create_dataset_info_per_language(lang: str):
320
  all_values = []
321
  if not st.session_state[f"leaderboard_data_{lang}"].empty:
322
- cols = [col for col in st.session_state[f"leaderboard_data_{lang}"].columns if col not in model_columns]
 
 
 
 
323
  if len(cols) > 1:
324
- for task in cols[:-1]:
325
- values = load_dataset_card(task)
326
- all_values.append(values)
327
  else:
328
  values = load_dataset_card(cols[0])
329
  all_values.append(values)
@@ -331,27 +401,50 @@ def create_dataset_info_per_language(lang: str):
331
  st.dataframe(
332
  df,
333
  column_config={
334
- "workshop": st.column_config.TextColumn("Workshop 🏫", help="Workshop to belong to the shared task"),
335
- "shared_task": st.column_config.TextColumn("Shared Task 📋", help="Shared Task name"),
336
- "year": st.column_config.TextColumn("Year 📅", help="Year of the shared task"),
337
- "task_type": st.column_config.TextColumn("Task Type 🔖", help="Shared Task type"),
338
- "language": st.column_config.TextColumn("Language 🌐", help="Shared Task language"),
339
- "url": st.column_config.ListColumn("Task URL 🔗", help="Shared Task url"),
340
- "language_variety": st.column_config.TextColumn("Language Variety 🗣️", help="Shared Task language variety"),
341
- "problem_type": st.column_config.TextColumn("Problem Type ❓", help="Shared Task problem type"),
342
- "num_labels": st.column_config.NumberColumn("Number of Labels 🔢", help="Shared Task number of labels"),
343
- "labels": st.column_config.ListColumn("Labels 🏷️", help="Shared Task labels"),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
344
  },
345
  hide_index=True,
346
  )
347
  else:
348
  st.write("No data found to display on leaderboard 😔.")
349
 
 
350
  def create_box_plot_per_task_category(df: pd.DataFrame, id_: str):
351
  # Compute average performance for each professional category (using professional_mapping).
352
  melt_vars = []
353
  for category, tasks in professional_mapping.items():
354
- relevant_cols = [col for col in df.columns if "iberbench/" + col in tasks]
 
 
355
  if relevant_cols:
356
  df[category] = df[relevant_cols].mean(axis=1).round(2)
357
  melt_vars.append(category)
@@ -359,18 +452,31 @@ def create_box_plot_per_task_category(df: pd.DataFrame, id_: str):
359
  id_vars = model_columns.copy()
360
  if "language" in df.columns:
361
  id_vars.append("language")
362
- df_melt = df.melt(id_vars=id_vars, value_vars=melt_vars, var_name="Task Category", value_name="Performance")
 
 
 
 
 
363
  fig = px.box(
364
- df_melt, x="Task Category", y="Performance", points="all",
365
- labels={"Performance": "Performance (%)"}
 
 
 
 
 
 
366
  )
367
- st.plotly_chart(fig, use_container_width=True, key=id_ + str(random.random()))
368
 
369
  def create_box_plot_per_semantic_category(df: pd.DataFrame, id_: str):
370
  # Compute average performance for each semantic category defined in semantic_categories.
371
  melt_vars = []
372
  for category, tasks in semantic_categories.items():
373
- relevant_cols = [col for col in df.columns if "iberbench/" + col in tasks]
 
 
374
  if relevant_cols:
375
  df[category] = df[relevant_cols].mean(axis=1).round(2)
376
  melt_vars.append(category)
@@ -378,19 +484,35 @@ def create_box_plot_per_semantic_category(df: pd.DataFrame, id_: str):
378
  id_vars = model_columns.copy()
379
  if "language" in df.columns:
380
  id_vars.append("language")
381
- df_melt = df.melt(id_vars=id_vars, value_vars=melt_vars, var_name="Task Category", value_name="Performance")
 
 
 
 
 
382
  fig = px.box(
383
- df_melt, x="Task Category", y="Performance", points="all",
384
- labels={"Performance": "Performance (%)"}
 
 
 
385
  )
386
- st.plotly_chart(fig, use_container_width=True, key=id_ + str(random.random()))
 
 
 
387
 
388
  def create_histogram(df: pd.DataFrame, id_: str):
389
  fig = px.histogram(
390
- df, x="num_parameters", nbins=20, labels={"num_parameters": "Num parameters", "count": "Count"},
 
 
 
391
  )
392
  fig.update_layout(template="plotly_white")
393
- st.plotly_chart(fig, use_container_width=True, key=id_ + str(random.random()))
 
 
394
 
395
 
396
  def create_data_results_per_language() -> pd.DataFrame:
@@ -404,7 +526,7 @@ def create_data_results_per_language() -> pd.DataFrame:
404
  lang = key.split("leaderboard_data_")[1]
405
  temp_df["language"] = lang
406
  combined_df = pd.concat([combined_df, temp_df], ignore_index=True)
407
-
408
  if combined_df.empty:
409
  st.warning("No data available for any language ⚠️.")
410
  return
@@ -415,28 +537,36 @@ def create_data_results_per_language() -> pd.DataFrame:
415
  model_columns = ["model_name", "model_type", "num_parameters"]
416
  # Exclude metadata, language, and any non-numeric columns.
417
  performance_cols = [
418
- col for col in combined_df.columns
419
- if col not in model_columns + ["language", "Active"]
 
420
  and pd.api.types.is_numeric_dtype(combined_df[col])
421
  ]
422
  if performance_cols:
423
- combined_df["Mean"] = combined_df[performance_cols].mean(axis=1).round(2)
 
 
424
  else:
425
- st.warning("No numeric task performance columns available to compute 'Mean' ⚠️.")
 
 
426
  return
427
  return combined_df
428
-
429
- def create_box_plot_per_language(id_: str):
 
430
  # Create a boxplot with performance (Mean) per language.
431
  combined_df = create_data_results_per_language()
432
  fig = px.box(
433
- combined_df,
434
- x="language",
435
- y="Mean",
436
  points="all",
437
  labels={"language": "Language", "Mean": "Performance (%)"},
438
  )
439
- st.plotly_chart(fig, use_container_width=True, key=id_ + str(random.random()))
 
 
440
 
441
 
442
  def get_all_languages_summary_df() -> pd.DataFrame:
@@ -448,7 +578,9 @@ def get_all_languages_summary_df() -> pd.DataFrame:
448
  task_types = select_task_per_language(lang)
449
  summary_df = get_summary_df(lang, task_types)
450
  summary_df["language"] = lang
451
- combined_df = pd.concat([combined_df, summary_df], ignore_index=True)
 
 
452
  return combined_df
453
 
454
 
@@ -458,14 +590,17 @@ def get_all_languages_aggregated_summary_df() -> pd.DataFrame:
458
  across languages. Use this aggregated data for radar, scatter, pie, box, and histogram plots.
459
  """
460
  df = get_all_languages_summary_df()
461
- agg_df = df.groupby("model_name", as_index=False).agg({
462
- "model_type": "first", # choose an aggregation that makes sense
463
- "num_parameters": "mean", # average model size across languages
464
- "Mean": "mean", # average performance
465
- })
466
- agg_df['Mean']=agg_df['Mean'].round(2)
 
 
467
  return agg_df
468
 
 
469
  def get_all_languages_raw_df() -> pd.DataFrame:
470
  """
471
  Combine the raw leaderboard data from all languages.
@@ -484,8 +619,12 @@ def get_all_languages_raw_df() -> pd.DataFrame:
484
  # -----------------------------------------------------------------------------
485
  # Sidebar for Navigation and Global Settings
486
  # -----------------------------------------------------------------------------
487
- st.sidebar.markdown("<h2 style='text-align: center;'>IberBench 🌍</h2>", unsafe_allow_html=True)
488
- menu = st.sidebar.radio("", ["Leaderboard 📊", "Submit Model 🚀", "Datasets 📚", "About ℹ️"])
 
 
 
 
489
  st.sidebar.markdown("---")
490
  st.sidebar.markdown(
491
  """
@@ -496,17 +635,16 @@ st.sidebar.markdown(
496
  unsafe_allow_html=True,
497
  )
498
 
 
499
  def load_languages_set():
500
  with open(LANGUAGES_SETTINGS, "r") as f:
501
  return yaml_load(f)
502
 
 
503
  lang_set = load_languages_set()
504
 
505
  for lang in lang_set.keys():
506
- if lang == "Mixed":
507
- data = load_data("Spanish")
508
- else:
509
- data = load_data(lang)
510
  if f"leaderboard_data_{lang}" not in st.session_state:
511
  st.session_state[f"leaderboard_data_{lang}"] = data
512
 
@@ -514,29 +652,38 @@ for lang in lang_set.keys():
514
  # Main Content based on Navigation
515
  # -----------------------------------------------------------------------------
516
  if menu == "Leaderboard 📊":
517
- st.markdown("<div class='main-header'><h1>Leaderboard 📊</h1></div>", unsafe_allow_html=True)
518
- lang_iber = [k for k, v in lang_set.items() if v["category"] == "Iberian Peninsula languages"]
 
 
 
 
 
 
 
519
  st.markdown("### General ranking 🏆")
520
-
521
  # ---------------------------
522
  # All-language plots section
523
  # ---------------------------
524
- # Use aggregated data for plots where each model must appear once with averaged values.
525
  aggregated_df = get_all_languages_aggregated_summary_df()
526
  create_table_all_results(aggregated_df)
527
  st.markdown("### General plots 📊")
528
  # Use raw data for Fundamental vs Professional and Task Category plots.
529
  raw_all_df = get_all_languages_raw_df()
530
- all_lang_tabs = st.tabs([
531
- "Top 10 performance 🥇",
532
- "Performance vs. size 📏",
533
- "Type distribution 🎨",
534
- "Performance per type 💡",
535
- "Distribution of sizes 📊",
536
- "Fundamental vs industry ⚖️",
537
- "Performance per task category 📈",
538
- "Performance per language 🌐",
539
- ])
 
 
540
  with all_lang_tabs[0]:
541
  create_radar_chart(aggregated_df, "all_radar")
542
  with all_lang_tabs[1]:
@@ -554,13 +701,19 @@ if menu == "Leaderboard 📊":
554
  create_box_plot_per_semantic_category(raw_all_df, "all_box_sem_cat")
555
  with all_lang_tabs[7]:
556
  create_box_plot_per_language("all_box_language")
557
-
558
- # Results per language
559
  st.markdown("---")
560
  st.markdown("### Language ranking 🏆")
561
- lang_choice = st.selectbox("Select a language 🌐:", list(lang_iber), key="lang_leaderboard")
 
 
562
  if lang_choice == "Spanish":
563
- variations = [k for k, v in lang_set.items() if v["category"] in ["Spanish Variations languages", "Mixed languages"]]
 
 
 
 
564
  tabs_var = st.tabs(variations)
565
  for var, tab in zip(variations, tabs_var):
566
  with tab:
@@ -569,11 +722,15 @@ if menu == "Leaderboard 📊":
569
  create_results_visualization_lang(lang_choice)
570
 
571
  elif menu == "Submit Model 🚀":
572
- st.markdown("<div class='main-header'><h1>Submit Your Model 🚀</h1></div>", unsafe_allow_html=True)
 
 
 
573
  st.markdown("## How to submit a model 📤")
574
 
575
  # CSS
576
- st.markdown("""
 
577
  <style>
578
  .card-container {
579
  max-width: 300px;
@@ -611,7 +768,9 @@ elif menu == "Submit Model 🚀":
611
  margin-left: 8px;
612
  }
613
  </style>
614
- """, unsafe_allow_html=True)
 
 
615
 
616
  def render_card(content):
617
  html = f"""
@@ -643,7 +802,10 @@ elif menu == "Submit Model 🚀":
643
  index = row * num_columns + col
644
  if index < len(guide_info_list):
645
  with cols[col]:
646
- st.markdown(render_card(guide_info_list[index]), unsafe_allow_html=True)
 
 
 
647
 
648
  st.markdown("## Submission form 📝")
649
  with st.form("submit_model_form", clear_on_submit=True):
@@ -655,7 +817,10 @@ elif menu == "Submit Model 🚀":
655
  "Description ✍️",
656
  help="Add a description of the proposed model for the evaluation to help prioritize its evaluation.",
657
  )
658
- user_contact = st.text_input("Your Contact Email 📧", help="User e-mail to contact when there are updates.")
 
 
 
659
  precision_option = st.selectbox(
660
  "Choose precision format 🔢:",
661
  help="Size limits vary by precision. Choose carefully as incorrect precision can cause evaluation errors.",
@@ -668,7 +833,11 @@ elif menu == "Submit Model 🚀":
668
  options=["Original", "Adapter", "Delta"],
669
  index=0,
670
  )
671
- base_model_name = st.text_input("Base model (if applicable) 🏗️", help="Required for delta weights or adapters. This helps calculate total parameter count.", value="")
 
 
 
 
672
  model_type = st.selectbox(
673
  "Choose model type 🔍:",
674
  help="🟢 Pretrained: Base models, 🔶 Fine-tuned: Domain-specific, 💬 Chat: Conversational, 🤝 Merge: Combined weights.",
@@ -678,7 +847,11 @@ elif menu == "Submit Model 🚀":
678
  if submit_button:
679
  use_chat_template = True if model_type == "💬 Chat" else False
680
  validation_error = validate_model(
681
- model_name, precision_option, base_model_name, weight_type_option, use_chat_template
 
 
 
 
682
  )
683
  if validation_error is not None:
684
  st.error(validation_error)
@@ -698,121 +871,62 @@ elif menu == "Submit Model 🚀":
698
  log_submission(input_dict)
699
  st.success("Your request has been sent successfully 🎉.")
700
  except Exception as e:
701
- st.error(f"Failed to send your request: {e}. Please try again later.")
 
 
702
 
703
  elif menu == "Datasets 📚":
704
- st.markdown("<div class='main-header'><h1>Dataset Information 📚</h1></div>", unsafe_allow_html=True)
 
 
 
705
  st.markdown("### Check the datasets 🔍")
706
- lang_iber = [k for k, v in lang_set.items() if v["category"] == "Iberian Peninsula languages"]
707
- lang_choice = st.selectbox("Select a language 🌐:", list(lang_iber), key="lang_dataset")
708
- if lang_choice == "Spanish":
709
- variations = [k for k, v in lang_set.items() if v["category"] in ["Spanish Variations languages", "Mixed languages"]]
 
 
 
 
 
 
 
 
 
 
710
  tabs_var = st.tabs(variations)
711
  for var, tab in zip(variations, tabs_var):
712
  with tab:
713
- if var == "Mixed":
714
- create_dataset_info_per_language("Spanish")
715
- else:
716
- create_dataset_info_per_language(var)
717
  else:
718
  create_dataset_info_per_language(lang_choice)
719
  st.markdown("### Task mappings 🔄")
720
- st.markdown("For the sake of completeness, here we show the mappings we use in the leaderboard to aggregate tasks.")
721
- tab1, tab2 = st.tabs(["Semantic categories 🗂️", "Fundamental vs. Industry ⚖️"])
 
 
 
 
722
  with tab1:
723
- st.json({category: [task.removeprefix("iberbench/") for task in tasks] for category, tasks in semantic_categories.items()})
 
 
 
 
 
724
  with tab2:
725
- st.json({category: [task.removeprefix("iberbench/") for task in tasks] for category, tasks in professional_mapping.items()})
 
 
 
 
 
726
 
727
  elif menu == "About ℹ️":
728
- st.markdown("<div class='main-header'><h1>About ℹ️</h1></div>", unsafe_allow_html=True)
729
- st.markdown("""### 📖 What is IberBench?
730
- IberBench is a hub comprised of datasets for languages across Iberian and Latin American regions, aimed to be used as a benchmark to evaluate causal language models. This initiative aims to enrich the Natural Language Processing (NLP) community in the Iberian Peninsula and Latin America. The benchmark enables the evaluation of NLP models in multiple Spanish variants and other languages such as Catalan, Galician, Basque, Portuguese, and Latin American Spanish, fostering assessments and developments that reflect the linguistic diversity of these regions.
731
-
732
- We hope to drive multilingual research that considers the cultural and linguistic richness and complexity of the Spanish-speaking world, encouraging the creation of models that are truly representative of these realities.
733
-
734
- ### 📂 What are the data sources?
735
-
736
- IberBench contains datasets from prominent workshops in the field such as [IberLEF@SEPLN](https://sepln2024.infor.uva.es/eventos/iberlef-es/) or [PAN@CLEF](https://pan.webis.de/clef24/pan24-web/index.html), as well as stablished existing benchmarks as those from HiTZ, UPF, BSC, CiTIUS-USC, among others, with the aim to incorporate standardized and consistent evaluation within this context, enhancing the value of the data and models derived from this effort.
737
-
738
- We strictly adhere to all established guidelines and regulations concerning the use and publication of this data. Specifically:
739
-
740
- - The collected datasets are published on 🤗HuggingFace private repositories, with appropriate credit given to the authors in the model card.
741
- - Under no circumstances we claim ownership of the datasets.
742
- - The test splits of the datasets are kept private to avoid leakage from IberBench side.
743
-
744
- In any publication or presentation resulting from work with this data, we recognize the importance of citing and crediting to the organizing teams that crafted the datasets used at IberBench.
745
-
746
- ### 🙋 How can I join to IberBench?
747
-
748
- IberBench comprises a committee composed of specialists in NLP, language ethics, and gender discrimination, drawn from both academia and industry, which will oversee the development of the project, ensuring its quality and relevance.
749
-
750
- To be part of this committee, you can ask to join the [IberBench organization at 🤗HuggingFace](https://huggingface.co/iberbench). Your request will be validated by experts already belonging to the organization.
751
-
752
- ### 🤝 How can I contribute to IberBench?
753
-
754
- First, the initial committee will gather all the datasets from prominent workshops. From this, you can contribute with new datasets to the IberBench organization. The process is as follows:
755
-
756
- 1. Open a new discussion in the [IberBench discussions space](https://huggingface.co/spaces/iberbench/README/discussions), linking to an existing dataset in the 🤗HuggingFace hub and explaining why the inclusion is relevant.
757
- 2. Discuss with the committee for the approval or rejection of the dataset.
758
- 3. If approval: your dataset will be included into the IberBench datasets, and will be used to evaluate LLMs in the IberBench leaderboard.
759
-
760
- IberBench will never claim ownership over the dataset, the original author will receive all credits.
761
-
762
- ### 💬 Social networks
763
-
764
- You can reach us at:
765
-
766
- - **X**: [https://x.com/IberBench](https://x.com/IberBench)
767
- - **🤗 Discussions**: [https://huggingface.co/spaces/iberbench/README/discussions](https://huggingface.co/spaces/iberbench/README/discussions)
768
-
769
- ### 🫶 Acknowledgements
770
-
771
- We are incredibly grateful to the amazing teams behind the datasets from workshops like IberLEF, IberEval, and TASS under the umbrella of the [SEPLN](http://www.sepln.org/sepln), as well as the established benchmarks from HiTZ, UPF, BSC, CiTIUS-USC, among others. Their hard work and dedication to advancing NLP have made this benchmark possible. Huge thanks for sharing your invaluable resources with the community! 🚀👏
772
-
773
- IberBench has been funded by the Valencian Institute for Business Competitiveness (IVACE). </br>
774
-
775
- <style>
776
- body {
777
- margin: 0;
778
- display: flex;
779
- flex-direction: column;
780
- min-height: 100vh;
781
- }
782
- .footer {
783
- margin-top: auto;
784
- display: flex;
785
- flex-direction: column;
786
- align-items: center;
787
- text-align: center;
788
- width: 100%;
789
- background: white;
790
- padding: 5px 0;
791
- }
792
- .footer p {
793
- margin: 0;
794
- font-size: 16px;
795
- }
796
- .logos {
797
- display: flex;
798
- justify-content: center;
799
- align-items: center; /* Align images properly */
800
- gap: 20px;
801
- }
802
- .logos img {
803
- display: block;
804
- margin: 0;
805
- padding: 0;
806
- max-height: 100px; /* Ensures both images have the same height */
807
- width: auto; /* Keeps aspect ratio */
808
- }
809
- </style>
810
- </br>
811
- <div class="footer">
812
- <p>Developed by Symanto with ❤️</p>
813
- <div class="logos">
814
- <img src="https://www.ivace.es/images/logo2-ivace.PNG">
815
- <img src="https://www.symanto.com/wp-content/uploads/Logos/symanto.svg">
816
- </div>
817
- </div>
818
- """, unsafe_allow_html=True)
 
8
  import pandas as pd
9
  import streamlit as st
10
  import plotly.express as px
11
+ import plotly.graph_objects as go
12
 
13
  from datasets import load_dataset
14
  from huggingface_hub import CommitScheduler, hf_hub_download
 
21
  # -----------------------------------------------------------------------------
22
  # Page configuration and global CSS styles for modern look and improved UX
23
  # -----------------------------------------------------------------------------
24
+ st.set_page_config(
25
+ page_title="IberBench",
26
+ layout="wide",
27
+ initial_sidebar_state="expanded",
28
+ page_icon="🌍",
29
+ )
30
 
31
  st.markdown(
32
  """
 
74
  LANGUAGES_SETTINGS = Path("etc/languages_settings.yml")
75
 
76
  dataset_columns = [
77
+ "workshop",
78
+ "shared_task",
79
+ "year",
80
+ "task_type",
81
+ "language",
82
+ "url",
83
+ "language_variety",
84
+ "problem_type",
85
+ "num_labels",
86
+ "labels",
87
  ]
88
  model_columns = ["model_name", "model_type", "num_parameters"]
89
 
 
97
  every=10,
98
  )
99
 
100
+
101
  def log_submission(input_dict: dict) -> None:
102
  with scheduler.lock:
103
  with request_file.open("a") as f:
104
  f.write(json.dumps(input_dict))
105
  f.write("\n")
106
 
107
+
108
  def get_lang_columns(columns: list, lang: str):
109
+ # Mixed needs to return all the columns that end
110
+ # with the language, i.e. that don't have a variation at the end
111
+ if "Mixed" in lang:
112
+ lang = lang.lower().split(" ")[0]
113
+ return [col for col in columns if col.endswith(lang)]
114
+ else:
115
+ lang_norm = lang.lower().replace(" ", "_")
116
+ return [col for col in columns if lang_norm in col]
117
+
118
 
119
  @st.cache_data
120
  def load_data(lang) -> pd.DataFrame:
121
  try:
122
+ data = load_dataset(
123
+ "iberbench/lm-eval-results", token=st.secrets["HF_TOKEN"]
124
+ )["train"].to_pandas()
125
  task_columns = [col for col in data.columns if col not in model_columns]
126
  task_lang_columns = get_lang_columns(task_columns, lang)
127
+ data[task_columns] = data[task_columns] * 100
128
  data = data[model_columns + task_lang_columns]
129
+ # data["Active"] = False
130
  return data
131
  except FileNotFoundError:
132
  st.error("iberbench/lm-eval-results was not found in the hub 😕")
133
  return pd.DataFrame()
134
 
135
+
136
  def load_dataset_card(task) -> list:
137
  name_repo = "iberbench/" + task
138
  try:
 
156
 
157
 
158
  def active_data(lang) -> pd.DataFrame:
159
+ return st.session_state[f"leaderboard_data_{lang}"][
160
+ st.session_state[f"leaderboard_data_{lang}"]["Active"] == True
161
+ ].copy()
162
+
163
 
164
  def get_index(lang, row) -> pd.Series:
165
  return active_data(lang).iloc[row].name
166
 
167
+
168
  def commit(lang) -> None:
169
  for row in st.session_state[f"edited_data_{lang}"]["edited_rows"]:
170
  row_index = get_index(lang, row)
171
+ for key, value in st.session_state[f"edited_data_{lang}"][
172
+ "edited_rows"
173
+ ][row].items():
174
+ st.session_state[f"leaderboard_data_{lang}"].at[
175
+ row_index, key
176
+ ] = value
177
 
178
 
179
  # -----------------------------------------------------------------------------
 
206
 
207
  def create_table_all_results(aggregated_df: pd.DataFrame):
208
  combined_df = create_data_results_per_language()
209
+ df_lang = combined_df.pivot(
210
+ index="model_name", columns="language", values="Mean"
211
+ )
212
+ aggregated_df[df_lang.columns] = df_lang[df_lang.columns].values
213
  rank_value = []
214
+ for i in (
215
+ aggregated_df["Mean"].rank(method="dense", ascending=False).astype(int)
216
+ ):
217
  if i == 1:
218
  rank_value.append(f"{i} 🥇")
219
  elif i == 2:
 
233
  "model_type": st.column_config.TextColumn("Type 📌"),
234
  "num_parameters": st.column_config.NumberColumn("Model Size 🔢"),
235
  },
236
+ )
237
 
238
 
239
  def create_scatter_chart(df: pd.DataFrame, id_: str):
 
244
  color="model_name",
245
  size="num_parameters",
246
  hover_data=["model_type"],
247
+ labels={"num_parameters": "Num parameters"},
248
  )
249
  fig.update_layout(template="plotly_white")
250
+ st.plotly_chart(
251
+ fig, use_container_width=True, key=id_ + str(random.random())
252
+ )
253
+
254
 
255
def create_radar_chart(df: pd.DataFrame, id_: str):
    """Draw a closed polar line chart of the ten best models by ``Mean``.

    Args:
        df: Table with at least the ``Mean`` and ``model_name`` columns.
        id_: Prefix of the Streamlit element key; a random suffix is appended
            on every call (presumably to keep chart keys distinct).
    """
    ranked = df.sort_values(by="Mean", ascending=False)
    top_scores = ranked["Mean"][:10]
    top_models = ranked["model_name"][:10]
    polar_data = pd.DataFrame({"r": top_scores, "theta": top_models})

    figure = px.line_polar(
        polar_data, r="r", theta="theta", line_close=True, markers=True
    )
    figure.update_traces(fill="toself")

    chart_key = id_ + str(random.random())
    st.plotly_chart(figure, use_container_width=True, key=chart_key)
271
 
272
 
273
def create_pie_chart(df: pd.DataFrame, id_: str):
    """Draw a pie chart of how many rows of ``df`` fall in each ``model_type``.

    Args:
        df: Table with a ``model_type`` column.
        id_: Prefix of the Streamlit element key; a random suffix is appended
            on every call.
    """
    type_counts = df["model_type"].value_counts().reset_index()
    # Force stable column names regardless of what reset_index() produced.
    type_counts.columns = ["model_type", "count"]

    figure = px.pie(
        type_counts,
        values="count",
        names="model_type",
        labels={"model_type": "Model type"},
    )
    chart_key = id_ + str(random.random())
    st.plotly_chart(figure, use_container_width=True, key=chart_key)
 
285
 
286
 
287
def create_box_plot(df: pd.DataFrame, id_: str):
    """Draw a box plot of ``Mean`` grouped by ``model_type``, showing all points.

    Args:
        df: Table with ``model_type`` and ``Mean`` columns.
        id_: Prefix of the Streamlit element key; a random suffix is appended
            on every call.
    """
    axis_labels = {"model_type": "Model type"}
    figure = px.box(
        df, x="model_type", y="Mean", points="all", labels=axis_labels
    )
    chart_key = id_ + str(random.random())
    st.plotly_chart(figure, use_container_width=True, key=chart_key)
 
298
 
299
 
300
  def get_summary_df(lang: str, task_types: list) -> pd.DataFrame:
 
302
  if not st.session_state[f"leaderboard_data_{lang}"].empty:
303
  for t in task_types:
304
  task_list = semantic_categories[t]
305
+ cols = [
306
+ col
307
+ for col in st.session_state[f"leaderboard_data_{lang}"].columns
308
+ if "iberbench/" + col in task_list
309
+ ]
310
  if cols:
311
  tmp = st.session_state[f"leaderboard_data_{lang}"][cols]
312
  df[t] = tmp.mean(axis=1).round(2)
 
317
  return df
318
 
319
 
 
320
  def get_all_languages_summary_df() -> pd.DataFrame:
321
  """Combine leaderboard summary data from all languages using get_summary_df."""
322
  combined_df = pd.DataFrame()
 
326
  task_types = select_task_per_language(lang)
327
  summary_df = get_summary_df(lang, task_types)
328
  summary_df["language"] = lang
329
+ combined_df = pd.concat(
330
+ [combined_df, summary_df], ignore_index=True
331
+ )
332
  return combined_df
333
 
334
 
 
342
  create_table_results(summary_df)
343
  st.markdown("### Language plots 📊")
344
  # Display the results table for the selected language
345
+
346
+ in_lang_tabs = st.tabs(
347
+ [
348
+ "Top 10 performance 🥇",
349
+ "Performance vs. size 📏",
350
+ "Performance per type 💡",
351
+ "Fundamental vs industry ⚖️",
352
+ "Performance per task category 📈",
353
+ ]
354
+ )
355
  with in_lang_tabs[0]:
356
  create_radar_chart(summary_df, lang + "in_radar")
357
  with in_lang_tabs[1]:
 
362
  create_box_plot_per_task_category(tasks_df, lang + "in_box_task_cat")
363
  with in_lang_tabs[4]:
364
  create_box_plot_per_semantic_category(tasks_df, lang + "in_box_sem_cat")
365
+
366
+
367
  # -----------------------------------------------------------------------------
368
  # Functions for other visualization sections
369
  # -----------------------------------------------------------------------------
370
 
371
+
372
def select_task_per_language(lang: str):
    """Return the semantic categories that have data for ``lang``.

    A category from ``semantic_categories`` is selected when at least one of
    its tasks (with the ``"iberbench/"`` prefix stripped) is a column of
    ``st.session_state[f"leaderboard_data_{lang}"]``.

    Args:
        lang: Language name used to build the session-state key.

    Returns:
        Category names, in the iteration order of ``semantic_categories``.
    """
    # Build the lookup once; the column list does not change while scanning.
    available = set(st.session_state[f"leaderboard_data_{lang}"].columns)
    return [
        category
        for category, tasks in semantic_categories.items()
        # removeprefix() tolerates a task id without the prefix, where
        # split(...)[1] would raise IndexError.
        if any(task.removeprefix("iberbench/") in available for task in tasks)
    ]
383
 
384
+
385
  def create_dataset_info_per_language(lang: str):
386
  all_values = []
387
  if not st.session_state[f"leaderboard_data_{lang}"].empty:
388
+ cols = [
389
+ col
390
+ for col in st.session_state[f"leaderboard_data_{lang}"].columns
391
+ if col not in model_columns
392
+ ]
393
  if len(cols) > 1:
394
+ for task in cols[:-1]:
395
+ values = load_dataset_card(task)
396
+ all_values.append(values)
397
  else:
398
  values = load_dataset_card(cols[0])
399
  all_values.append(values)
 
401
  st.dataframe(
402
  df,
403
  column_config={
404
+ "workshop": st.column_config.TextColumn(
405
+ "Workshop 🏫", help="Workshop to belong to the shared task"
406
+ ),
407
+ "shared_task": st.column_config.TextColumn(
408
+ "Shared Task 📋", help="Shared Task name"
409
+ ),
410
+ "year": st.column_config.TextColumn(
411
+ "Year 📅", help="Year of the shared task"
412
+ ),
413
+ "task_type": st.column_config.TextColumn(
414
+ "Task Type 🔖", help="Shared Task type"
415
+ ),
416
+ "language": st.column_config.TextColumn(
417
+ "Language 🌐", help="Shared Task language"
418
+ ),
419
+ "url": st.column_config.ListColumn(
420
+ "Task URL 🔗", help="Shared Task url"
421
+ ),
422
+ "language_variety": st.column_config.TextColumn(
423
+ "Language Variety 🗣️", help="Shared Task language variety"
424
+ ),
425
+ "problem_type": st.column_config.TextColumn(
426
+ "Problem Type ❓", help="Shared Task problem type"
427
+ ),
428
+ "num_labels": st.column_config.NumberColumn(
429
+ "Number of Labels 🔢", help="Shared Task number of labels"
430
+ ),
431
+ "labels": st.column_config.ListColumn(
432
+ "Labels 🏷️", help="Shared Task labels"
433
+ ),
434
  },
435
  hide_index=True,
436
  )
437
  else:
438
  st.write("No data found to display on leaderboard 😔.")
439
 
440
+
441
  def create_box_plot_per_task_category(df: pd.DataFrame, id_: str):
442
  # Compute average performance for each professional category (using professional_mapping).
443
  melt_vars = []
444
  for category, tasks in professional_mapping.items():
445
+ relevant_cols = [
446
+ col for col in df.columns if "iberbench/" + col in tasks
447
+ ]
448
  if relevant_cols:
449
  df[category] = df[relevant_cols].mean(axis=1).round(2)
450
  melt_vars.append(category)
 
452
  id_vars = model_columns.copy()
453
  if "language" in df.columns:
454
  id_vars.append("language")
455
+ df_melt = df.melt(
456
+ id_vars=id_vars,
457
+ value_vars=melt_vars,
458
+ var_name="Task Category",
459
+ value_name="Performance",
460
+ )
461
  fig = px.box(
462
+ df_melt,
463
+ x="Task Category",
464
+ y="Performance",
465
+ points="all",
466
+ labels={"Performance": "Performance (%)"},
467
+ )
468
+ st.plotly_chart(
469
+ fig, use_container_width=True, key=id_ + str(random.random())
470
  )
471
+
472
 
473
  def create_box_plot_per_semantic_category(df: pd.DataFrame, id_: str):
474
  # Compute average performance for each semantic category defined in semantic_categories.
475
  melt_vars = []
476
  for category, tasks in semantic_categories.items():
477
+ relevant_cols = [
478
+ col for col in df.columns if "iberbench/" + col in tasks
479
+ ]
480
  if relevant_cols:
481
  df[category] = df[relevant_cols].mean(axis=1).round(2)
482
  melt_vars.append(category)
 
484
  id_vars = model_columns.copy()
485
  if "language" in df.columns:
486
  id_vars.append("language")
487
+ df_melt = df.melt(
488
+ id_vars=id_vars,
489
+ value_vars=melt_vars,
490
+ var_name="Task Category",
491
+ value_name="Performance",
492
+ )
493
  fig = px.box(
494
+ df_melt,
495
+ x="Task Category",
496
+ y="Performance",
497
+ points="all",
498
+ labels={"Performance": "Performance (%)"},
499
  )
500
+ st.plotly_chart(
501
+ fig, use_container_width=True, key=id_ + str(random.random())
502
+ )
503
+
504
 
505
def create_histogram(df: pd.DataFrame, id_: str):
    """Draw a 20-bin histogram of the ``num_parameters`` column.

    Args:
        df: Table with a ``num_parameters`` column.
        id_: Prefix of the Streamlit element key; a random suffix is appended
            on every call.
    """
    axis_labels = {"num_parameters": "Num parameters", "count": "Count"}
    figure = px.histogram(df, x="num_parameters", nbins=20, labels=axis_labels)
    figure.update_layout(template="plotly_white")

    chart_key = id_ + str(random.random())
    st.plotly_chart(figure, use_container_width=True, key=chart_key)
516
 
517
 
518
  def create_data_results_per_language() -> pd.DataFrame:
 
526
  lang = key.split("leaderboard_data_")[1]
527
  temp_df["language"] = lang
528
  combined_df = pd.concat([combined_df, temp_df], ignore_index=True)
529
+
530
  if combined_df.empty:
531
  st.warning("No data available for any language ⚠️.")
532
  return
 
537
  model_columns = ["model_name", "model_type", "num_parameters"]
538
  # Exclude metadata, language, and any non-numeric columns.
539
  performance_cols = [
540
+ col
541
+ for col in combined_df.columns
542
+ if col not in model_columns + ["language", "Active"]
543
  and pd.api.types.is_numeric_dtype(combined_df[col])
544
  ]
545
  if performance_cols:
546
+ combined_df["Mean"] = (
547
+ combined_df[performance_cols].mean(axis=1).round(2)
548
+ )
549
  else:
550
+ st.warning(
551
+ "No numeric task performance columns available to compute 'Mean' ⚠️."
552
+ )
553
  return
554
  return combined_df
555
+
556
+
557
def create_box_plot_per_language(id_: str):
    """Draw a box plot of ``Mean`` performance per language.

    The data comes from ``create_data_results_per_language()``. That helper
    shows a warning and returns ``None`` when there is nothing to plot, so
    this function returns without drawing in that case.

    Args:
        id_: Prefix of the Streamlit element key; a random suffix is appended
            on every call.
    """
    combined_df = create_data_results_per_language()
    if combined_df is None:
        # The helper has already warned the user; nothing to draw.
        return

    fig = px.box(
        combined_df,
        x="language",
        y="Mean",
        points="all",
        labels={"language": "Language", "Mean": "Performance (%)"},
    )
    st.plotly_chart(
        fig, use_container_width=True, key=id_ + str(random.random())
    )
570
 
571
 
572
  def get_all_languages_summary_df() -> pd.DataFrame:
 
578
  task_types = select_task_per_language(lang)
579
  summary_df = get_summary_df(lang, task_types)
580
  summary_df["language"] = lang
581
+ combined_df = pd.concat(
582
+ [combined_df, summary_df], ignore_index=True
583
+ )
584
  return combined_df
585
 
586
 
 
590
  across languages. Use this aggregated data for radar, scatter, pie, box, and histogram plots.
591
  """
592
  df = get_all_languages_summary_df()
593
+ agg_df = df.groupby("model_name", as_index=False).agg(
594
+ {
595
+ "model_type": "first", # choose an aggregation that makes sense
596
+ "num_parameters": "mean", # average model size across languages
597
+ "Mean": "mean", # average performance
598
+ }
599
+ )
600
+ agg_df["Mean"] = agg_df["Mean"].round(2)
601
  return agg_df
602
 
603
+
604
  def get_all_languages_raw_df() -> pd.DataFrame:
605
  """
606
  Combine the raw leaderboard data from all languages.
 
619
  # -----------------------------------------------------------------------------
620
  # Sidebar for Navigation and Global Settings
621
  # -----------------------------------------------------------------------------
622
+ st.sidebar.markdown(
623
+ "<h2 style='text-align: center;'>IberBench 🌍</h2>", unsafe_allow_html=True
624
+ )
625
+ menu = st.sidebar.radio(
626
+ "", ["Leaderboard 📊", "Submit Model 🚀", "Datasets 📚", "About ℹ️"]
627
+ )
628
  st.sidebar.markdown("---")
629
  st.sidebar.markdown(
630
  """
 
635
  unsafe_allow_html=True,
636
  )
637
 
638
+
639
def load_languages_set():
    """Parse the language settings file at ``LANGUAGES_SETTINGS``.

    Returns:
        Whatever ``yaml_load`` produces for the file; callers in this module
        treat it as a mapping from language name to a settings mapping with
        a ``"category"`` entry.
    """
    # Decode explicitly as UTF-8 so parsing does not depend on the
    # platform's default locale encoding.
    with open(LANGUAGES_SETTINGS, "r", encoding="utf-8") as f:
        return yaml_load(f)
642
 
643
+
644
lang_set = load_languages_set()

# Seed the per-language leaderboard table in session state. The data is
# loaded for every language on each run, but an entry already present in
# session state is never overwritten.
for lang in lang_set:
    data = load_data(lang)
    state_key = f"leaderboard_data_{lang}"
    if state_key not in st.session_state:
        st.session_state[state_key] = data
650
 
 
652
  # Main Content based on Navigation
653
  # -----------------------------------------------------------------------------
654
  if menu == "Leaderboard 📊":
655
+ st.markdown(
656
+ "<div class='main-header'><h1>Leaderboard 📊</h1></div>",
657
+ unsafe_allow_html=True,
658
+ )
659
+ lang_iber = [
660
+ k
661
+ for k, v in lang_set.items()
662
+ if v["category"] == "Iberian Peninsula languages"
663
+ ]
664
  st.markdown("### General ranking 🏆")
665
+
666
  # ---------------------------
667
  # All-language plots section
668
  # ---------------------------
669
+ # Use aggregated data for plots where each model must appear once with averaged values.
670
  aggregated_df = get_all_languages_aggregated_summary_df()
671
  create_table_all_results(aggregated_df)
672
  st.markdown("### General plots 📊")
673
  # Use raw data for Fundamental vs Professional and Task Category plots.
674
  raw_all_df = get_all_languages_raw_df()
675
+ all_lang_tabs = st.tabs(
676
+ [
677
+ "Top 10 performance 🥇",
678
+ "Performance vs. size 📏",
679
+ "Type distribution 🎨",
680
+ "Performance per type 💡",
681
+ "Distribution of sizes 📊",
682
+ "Fundamental vs industry ⚖️",
683
+ "Performance per task category 📈",
684
+ "Performance per language 🌐",
685
+ ]
686
+ )
687
  with all_lang_tabs[0]:
688
  create_radar_chart(aggregated_df, "all_radar")
689
  with all_lang_tabs[1]:
 
701
  create_box_plot_per_semantic_category(raw_all_df, "all_box_sem_cat")
702
  with all_lang_tabs[7]:
703
  create_box_plot_per_language("all_box_language")
704
+
705
+ # Results per language
706
  st.markdown("---")
707
  st.markdown("### Language ranking 🏆")
708
+ lang_choice = st.selectbox(
709
+ "Select a language 🌐:", list(lang_iber), key="lang_leaderboard"
710
+ )
711
  if lang_choice == "Spanish":
712
+ variations = [
713
+ k
714
+ for k, v in lang_set.items()
715
+ if v["category"] in ["Spanish Variations languages"]
716
+ ]
717
  tabs_var = st.tabs(variations)
718
  for var, tab in zip(variations, tabs_var):
719
  with tab:
 
722
  create_results_visualization_lang(lang_choice)
723
 
724
  elif menu == "Submit Model 🚀":
725
+ st.markdown(
726
+ "<div class='main-header'><h1>Submit Your Model 🚀</h1></div>",
727
+ unsafe_allow_html=True,
728
+ )
729
  st.markdown("## How to submit a model 📤")
730
 
731
  # CSS
732
+ st.markdown(
733
+ """
734
  <style>
735
  .card-container {
736
  max-width: 300px;
 
768
  margin-left: 8px;
769
  }
770
  </style>
771
+ """,
772
+ unsafe_allow_html=True,
773
+ )
774
 
775
  def render_card(content):
776
  html = f"""
 
802
  index = row * num_columns + col
803
  if index < len(guide_info_list):
804
  with cols[col]:
805
+ st.markdown(
806
+ render_card(guide_info_list[index]),
807
+ unsafe_allow_html=True,
808
+ )
809
 
810
  st.markdown("## Submission form 📝")
811
  with st.form("submit_model_form", clear_on_submit=True):
 
817
  "Description ✍️",
818
  help="Add a description of the proposed model for the evaluation to help prioritize its evaluation.",
819
  )
820
+ user_contact = st.text_input(
821
+ "Your Contact Email 📧",
822
+ help="User e-mail to contact when there are updates.",
823
+ )
824
  precision_option = st.selectbox(
825
  "Choose precision format 🔢:",
826
  help="Size limits vary by precision. Choose carefully as incorrect precision can cause evaluation errors.",
 
833
  options=["Original", "Adapter", "Delta"],
834
  index=0,
835
  )
836
+ base_model_name = st.text_input(
837
+ "Base model (if applicable) 🏗️",
838
+ help="Required for delta weights or adapters. This helps calculate total parameter count.",
839
+ value="",
840
+ )
841
  model_type = st.selectbox(
842
  "Choose model type 🔍:",
843
  help="🟢 Pretrained: Base models, 🔶 Fine-tuned: Domain-specific, 💬 Chat: Conversational, 🤝 Merge: Combined weights.",
 
847
  if submit_button:
848
  use_chat_template = True if model_type == "💬 Chat" else False
849
  validation_error = validate_model(
850
+ model_name,
851
+ precision_option,
852
+ base_model_name,
853
+ weight_type_option,
854
+ use_chat_template,
855
  )
856
  if validation_error is not None:
857
  st.error(validation_error)
 
871
  log_submission(input_dict)
872
  st.success("Your request has been sent successfully 🎉.")
873
  except Exception as e:
874
+ st.error(
875
+ f"Failed to send your request: {e}. Please try again later."
876
+ )
877
 
878
  elif menu == "Datasets 📚":
879
+ st.markdown(
880
+ "<div class='main-header'><h1>Dataset Information 📚</h1></div>",
881
+ unsafe_allow_html=True,
882
+ )
883
  st.markdown("### Check the datasets 🔍")
884
+ lang_iber = [
885
+ k
886
+ for k, v in lang_set.items()
887
+ if v["category"] == "Iberian Peninsula languages"
888
+ ]
889
+ lang_choice = st.selectbox(
890
+ "Select a language 🌐:", list(lang_iber), key="lang_dataset"
891
+ )
892
+ if lang_choice in ["Spanish"]:
893
+ variations = [
894
+ k
895
+ for k, v in lang_set.items()
896
+ if v["category"] in ["Spanish Variations languages"]
897
+ ]
898
  tabs_var = st.tabs(variations)
899
  for var, tab in zip(variations, tabs_var):
900
  with tab:
901
+ create_dataset_info_per_language(var)
 
 
 
902
  else:
903
  create_dataset_info_per_language(lang_choice)
904
  st.markdown("### Task mappings 🔄")
905
+ st.markdown(
906
+ "For the sake of completeness, here we show the mappings we use in the leaderboard to aggregate tasks."
907
+ )
908
+ tab1, tab2 = st.tabs(
909
+ ["Semantic categories 🗂️", "Fundamental vs. Industry ⚖️"]
910
+ )
911
  with tab1:
912
+ st.json(
913
+ {
914
+ category: [task.removeprefix("iberbench/") for task in tasks]
915
+ for category, tasks in semantic_categories.items()
916
+ }
917
+ )
918
  with tab2:
919
+ st.json(
920
+ {
921
+ category: [task.removeprefix("iberbench/") for task in tasks]
922
+ for category, tasks in professional_mapping.items()
923
+ }
924
+ )
925
 
926
  elif menu == "About ℹ️":
927
+ st.markdown(
928
+ "<div class='main-header'><h1>About ℹ️</h1></div>",
929
+ unsafe_allow_html=True,
930
+ )
931
+ with open("./assets/md/about.md", "r") as fr:
932
+ st.markdown(fr.read(), unsafe_allow_html=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
etc/languages_settings.yml CHANGED
@@ -10,8 +10,8 @@ Galician:
10
  category: 'Iberian Peninsula languages'
11
  English:
12
  category: 'Iberian Peninsula languages'
13
- Mixed:
14
- category: 'Mixed languages'
15
  Costa Rica:
16
  category: 'Spanish Variations languages'
17
  Mexico:
 
10
  category: 'Iberian Peninsula languages'
11
  English:
12
  category: 'Iberian Peninsula languages'
13
+ Spanish Mixed:
14
+ category: 'Spanish Variations languages'
15
  Costa Rica:
16
  category: 'Spanish Variations languages'
17
  Mexico: