import json
import os
import re
import uuid
import random
from pathlib import Path

import pandas as pd
import streamlit as st
import plotly.express as px
import plotly.graph_objects as go
from datasets import load_dataset
from huggingface_hub import CommitScheduler, hf_hub_download
from huggingface_hub.utils import RepositoryNotFoundError
from yaml import safe_load as yaml_load

from src.check_validity import validate_model
from src.task_mappings import professional_mapping, semantic_categories

# -----------------------------------------------------------------------------
# Page configuration and global CSS styles for a modern look and improved UX
# -----------------------------------------------------------------------------
st.set_page_config(
    page_title="IberBench",
    layout="wide",
    initial_sidebar_state="expanded",
    page_icon="đ",
)

st.markdown(
    """
    """,
    unsafe_allow_html=True,
)

# -----------------------------------------------------------------------------
# Global variables and helper functions
# -----------------------------------------------------------------------------
request_file = Path("user_request/") / f"data_{uuid.uuid4()}.json"
request_folder = request_file.parent
LANGUAGES_SETTINGS = Path("etc/languages_settings.yml")

dataset_columns = [
    "workshop",
    "shared_task",
    "year",
    "task_type",
    "language",
    "url",
    "language_variety",
    "problem_type",
    "num_labels",
    "labels",
]
model_columns = ["model_name", "model_type", "num_parameters"]

scheduler = CommitScheduler(
    repo_id="iberbench/user-requests",
    repo_type="dataset",
    private=True,
    folder_path=request_folder,
    token=st.secrets["HF_TOKEN"],
    path_in_repo="data",
    every=10,
)


def log_submission(input_dict: dict) -> None:
    with scheduler.lock:
        with request_file.open("a") as f:
            f.write(json.dumps(input_dict))
            f.write("\n")


def get_lang_columns(columns: list, lang: str):
    # "Mixed" languages must return all the columns that end with the language
    # name but do not carry a variety suffix at the end.
    if "Mixed" in lang:
        lang = lang.lower().split(" ")[0]
        return [col for col in columns if col.endswith(lang)]
    else:
        lang_norm = lang.lower().replace(" ", "_")
        return [col for col in columns if lang_norm in col]


@st.cache_data
def load_data(lang) -> pd.DataFrame:
    try:
        data = load_dataset(
            "iberbench/lm-eval-results", token=st.secrets["HF_TOKEN"]
        )["train"].to_pandas()
        task_columns = [col for col in data.columns if col not in model_columns]
        task_lang_columns = get_lang_columns(task_columns, lang)
        data[task_columns] = data[task_columns] * 100
        data = data[model_columns + task_lang_columns]
        # data["Active"] = False
        return data
    except FileNotFoundError:
        st.error("iberbench/lm-eval-results was not found in the hub đ")
        return pd.DataFrame()


def load_dataset_card(task) -> list:
    name_repo = "iberbench/" + task
    try:
        info_path = hf_hub_download(
            repo_id=name_repo,
            filename="task_metadata.json",
            repo_type="dataset",
        )
        with open(info_path, "r") as f:
            info = json.load(f)
        values_ = []
        for i in dataset_columns:
            if i in info:
                values_.append(info[i])
            else:
                values_.append([] if i == "labels" else "-")
        return values_
    except RepositoryNotFoundError:
        st.error(task + ": dataset was not found in the hub 🚫")
        return ["-"] * len(dataset_columns)


def active_data(lang) -> pd.DataFrame:
    return st.session_state[f"leaderboard_data_{lang}"][
        st.session_state[f"leaderboard_data_{lang}"]["Active"] == True
    ].copy()


def get_index(lang, row) -> pd.Series:
    return active_data(lang).iloc[row].name
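

# Illustrative note on the structure assumed by `commit` below: the session-state
# entry f"edited_data_{lang}" is expected to hold the state of a `st.data_editor`
# widget (presumably defined in the navigation sections further down), whose
# "edited_rows" payload looks roughly like
#     {"edited_rows": {0: {"Active": True}, 3: {"Active": False}}}
# The outer keys are positional indices within the currently displayed (Active)
# rows, which is why `get_index` maps them back to the underlying dataframe index.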
def commit(lang) -> None:
    for row in st.session_state[f"edited_data_{lang}"]["edited_rows"]:
        row_index = get_index(lang, row)
        for key, value in st.session_state[f"edited_data_{lang}"][
            "edited_rows"
        ][row].items():
            st.session_state[f"leaderboard_data_{lang}"].at[
                row_index, key
            ] = value


# -----------------------------------------------------------------------------
# Visualization helper functions
# -----------------------------------------------------------------------------
def create_table_results(df_mean: pd.DataFrame):
    rank_value = []
    for i in df_mean["Mean"].rank(method="dense", ascending=False).astype(int):
        if i == 1:
            rank_value.append(f"{i} 🥇")
        elif i == 2:
            rank_value.append(f"{i} 🥈")
        elif i == 3:
            rank_value.append(f"{i} 🥉")
        else:
            rank_value.append(str(i))
    df_mean.insert(0, "Rank", rank_value)
    df_final = df_mean.sort_values("Mean", ascending=False)
    st.dataframe(
        df_final,
        hide_index=True,
        use_container_width=True,
        column_config={
            "model_name": st.column_config.TextColumn("Model 🧠"),
            "model_type": st.column_config.TextColumn("Type đ"),
            "num_parameters": st.column_config.NumberColumn("Model Size 🔢"),
        },
    )


def create_table_all_results(aggregated_df: pd.DataFrame):
    combined_df = create_data_results_per_language()
    df_lang = combined_df.pivot(
        index="model_name", columns="language", values="Mean"
    )
    aggregated_df[df_lang.columns] = df_lang[df_lang.columns].values
    rank_value = []
    for i in (
        aggregated_df["Mean"].rank(method="dense", ascending=False).astype(int)
    ):
        if i == 1:
            rank_value.append(f"{i} 🥇")
        elif i == 2:
            rank_value.append(f"{i} 🥈")
        elif i == 3:
            rank_value.append(f"{i} 🥉")
        else:
            rank_value.append(str(i))
    aggregated_df.insert(0, "Rank", rank_value)
    df_final = aggregated_df.sort_values("Mean", ascending=False)
    st.dataframe(
        df_final,
        hide_index=True,
        use_container_width=True,
        column_config={
            "model_name": st.column_config.TextColumn("Model 🧠"),
            "model_type": st.column_config.TextColumn("Type đ"),
            "num_parameters": st.column_config.NumberColumn("Model Size 🔢"),
        },
    )


def create_scatter_chart(df: pd.DataFrame, id_: str):
    fig = px.scatter(
        df,
        x="num_parameters",
        y="Mean",
        color="model_name",
        size="num_parameters",
        hover_data=["model_type"],
        labels={"num_parameters": "Num parameters"},
    )
    fig.update_layout(template="plotly_white")
    st.plotly_chart(
        fig, use_container_width=True, key=id_ + str(random.random())
    )


def create_radar_chart(df: pd.DataFrame, id_: str):
    df = df.sort_values(by="Mean", ascending=False)
    radar_df = pd.DataFrame(
        {"r": df["Mean"][:10], "theta": df["model_name"][:10]}
    )
    fig = px.line_polar(
        radar_df,
        r="r",
        theta="theta",
        line_close=True,
        markers=True,
    )
    fig.update_traces(fill="toself")
    st.plotly_chart(
        fig, use_container_width=True, key=id_ + str(random.random())
    )


def create_pie_chart(df: pd.DataFrame, id_: str):
    df_pie = df["model_type"].value_counts().reset_index()
    df_pie.columns = ["model_type", "count"]
    fig = px.pie(
        df_pie,
        values="count",
        names="model_type",
        labels={"model_type": "Model type"},
    )
    st.plotly_chart(
        fig, use_container_width=True, key=id_ + str(random.random())
    )


def create_box_plot(df: pd.DataFrame, id_: str):
    fig = px.box(
        df,
        x="model_type",
        y="Mean",
        points="all",
        labels={"model_type": "Model type"},
    )
    st.plotly_chart(
        fig, use_container_width=True, key=id_ + str(random.random())
    )
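

# Assumed structure of `semantic_categories` (imported from src.task_mappings):
# a mapping from a task category name to the dataset repo ids of its tasks,
# e.g. (hypothetical values)
#     {"Sentiment analysis": ["iberbench/some_task_es", "iberbench/other_task_gl"]}
# Leaderboard columns carry the task name without the "iberbench/" prefix, which
# is why the helpers below check membership with `"iberbench/" + col`.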
def get_summary_df(lang: str, task_types: list) -> pd.DataFrame:
    df = st.session_state[f"leaderboard_data_{lang}"][model_columns].copy()
    if not st.session_state[f"leaderboard_data_{lang}"].empty:
        for t in task_types:
            task_list = semantic_categories[t]
            cols = [
                col
                for col in st.session_state[f"leaderboard_data_{lang}"].columns
                if "iberbench/" + col in task_list
            ]
            if cols:
                tmp = st.session_state[f"leaderboard_data_{lang}"][cols]
                df[t] = tmp.mean(axis=1).round(2)
        if df.shape[1] > 4:
            df.insert(3, "Mean", df.iloc[:, 3:-1].mean(axis=1).round(2))
        else:
            df.insert(3, "Mean", df.iloc[:, 3].round(2))
    return df


def get_all_languages_summary_df() -> pd.DataFrame:
    """Combine leaderboard summary data from all languages using get_summary_df."""
    combined_df = pd.DataFrame()
    for key in st.session_state:
        if key.startswith("leaderboard_data_"):
            lang = key.split("leaderboard_data_")[1]
            task_types = select_task_per_language(lang)
            summary_df = get_summary_df(lang, task_types)
            summary_df["language"] = lang
            combined_df = pd.concat(
                [combined_df, summary_df], ignore_index=True
            )
    return combined_df


def create_results_visualization_lang(lang: str):
    # ---------------------------
    # In-language plots section
    # ---------------------------
    task_types = select_task_per_language(lang)
    summary_df = get_summary_df(lang, task_types)
    tasks_df = st.session_state[f"leaderboard_data_{lang}"].copy()

    # Display the results table for the selected language
    create_table_results(summary_df)

    st.markdown("### Language plots đ")
    in_lang_tabs = st.tabs(
        [
            "Top 10 performance đĨ",
            "Performance vs. size đ",
            "Performance per type 💡",
            "Fundamental vs industry ⚖️",
            "Performance per task category đ",
        ]
    )
    with in_lang_tabs[0]:
        create_radar_chart(summary_df, lang + "in_radar")
    with in_lang_tabs[1]:
        create_scatter_chart(summary_df, lang + "in_scatter")
    with in_lang_tabs[2]:
        create_box_plot(summary_df, lang + "in_box")
    with in_lang_tabs[3]:
        create_box_plot_per_task_category(tasks_df, lang + "in_box_task_cat")
    with in_lang_tabs[4]:
        create_box_plot_per_semantic_category(tasks_df, lang + "in_box_sem_cat")


# -----------------------------------------------------------------------------
# Functions for other visualization sections
# -----------------------------------------------------------------------------
def select_task_per_language(lang: str):
    types = []
    for k, v in semantic_categories.items():
        for vv in v:
            task_name = vv.split("iberbench/")[1]
            if task_name in list(
                st.session_state[f"leaderboard_data_{lang}"].columns
            ):
                if k not in types:
                    types.append(k)
    return types
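

# Illustrative shape of the `task_metadata.json` card that `load_dataset_card`
# reads for each task and that the dataset-info table below renders (field names
# follow `dataset_columns`; the values here are invented for the example):
#     {"workshop": "SomeWorkshop", "shared_task": "SomeTask", "year": "2023",
#      "task_type": "classification", "language": "Spanish", "url": ["https://..."],
#      "language_variety": "es-ES", "problem_type": "multi_class",
#      "num_labels": 3, "labels": ["negative", "neutral", "positive"]}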
def create_dataset_info_per_language(lang: str):
    all_values = []
    if not st.session_state[f"leaderboard_data_{lang}"].empty:
        cols = [
            col
            for col in st.session_state[f"leaderboard_data_{lang}"].columns
            if col not in model_columns
        ]
        if len(cols) > 1:
            for task in cols[:-1]:
                values = load_dataset_card(task)
                all_values.append(values)
        else:
            values = load_dataset_card(cols[0])
            all_values.append(values)
        df = pd.DataFrame(all_values, columns=dataset_columns)
        st.dataframe(
            df,
            column_config={
                "workshop": st.column_config.TextColumn(
                    "Workshop 🏫", help="Workshop the shared task belongs to"
                ),
                "shared_task": st.column_config.TextColumn(
                    "Shared Task đ", help="Shared Task name"
                ),
                "year": st.column_config.TextColumn(
                    "Year đ ", help="Year of the shared task"
                ),
                "task_type": st.column_config.TextColumn(
                    "Task Type đ", help="Shared Task type"
                ),
                "language": st.column_config.TextColumn(
                    "Language đ", help="Shared Task language"
                ),
                "url": st.column_config.ListColumn(
                    "Task URL đ", help="Shared Task URL"
                ),
                "language_variety": st.column_config.TextColumn(
                    "Language Variety 🗣️", help="Shared Task language variety"
                ),
                "problem_type": st.column_config.TextColumn(
                    "Problem Type â", help="Shared Task problem type"
                ),
                "num_labels": st.column_config.NumberColumn(
                    "Number of Labels 🔢", help="Shared Task number of labels"
                ),
                "labels": st.column_config.ListColumn(
                    "Labels 🏷️", help="Shared Task labels"
                ),
            },
            hide_index=True,
        )
    else:
        st.write("No data found to display on leaderboard đ.")


def create_box_plot_per_task_category(df: pd.DataFrame, id_: str):
    # Compute average performance for each professional category (using
    # professional_mapping).
    melt_vars = []
    for category, tasks in professional_mapping.items():
        relevant_cols = [
            col for col in df.columns if "iberbench/" + col in tasks
        ]
        if relevant_cols:
            df[category] = df[relevant_cols].mean(axis=1).round(2)
            melt_vars.append(category)
    melt_vars = list(set(melt_vars))
    id_vars = model_columns.copy()
    if "language" in df.columns:
        id_vars.append("language")
    df_melt = df.melt(
        id_vars=id_vars,
        value_vars=melt_vars,
        var_name="Task Category",
        value_name="Performance",
    )
    fig = px.box(
        df_melt,
        x="Task Category",
        y="Performance",
        points="all",
        labels={"Performance": "Performance (%)"},
    )
    st.plotly_chart(
        fig, use_container_width=True, key=id_ + str(random.random())
    )


def create_box_plot_per_semantic_category(df: pd.DataFrame, id_: str):
    # Compute average performance for each semantic category defined in
    # semantic_categories.
    melt_vars = []
    for category, tasks in semantic_categories.items():
        relevant_cols = [
            col for col in df.columns if "iberbench/" + col in tasks
        ]
        if relevant_cols:
            df[category] = df[relevant_cols].mean(axis=1).round(2)
            melt_vars.append(category)
    melt_vars = list(set(melt_vars))
    id_vars = model_columns.copy()
    if "language" in df.columns:
        id_vars.append("language")
    df_melt = df.melt(
        id_vars=id_vars,
        value_vars=melt_vars,
        var_name="Task Category",
        value_name="Performance",
    )
    fig = px.box(
        df_melt,
        x="Task Category",
        y="Performance",
        points="all",
        labels={"Performance": "Performance (%)"},
    )
    st.plotly_chart(
        fig, use_container_width=True, key=id_ + str(random.random())
    )


def create_histogram(df: pd.DataFrame, id_: str):
    fig = px.histogram(
        df,
        x="num_parameters",
        nbins=20,
        labels={"num_parameters": "Num parameters", "count": "Count"},
    )
    fig.update_layout(template="plotly_white")
    st.plotly_chart(
        fig, use_container_width=True, key=id_ + str(random.random())
    )


def create_data_results_per_language() -> pd.DataFrame:
    # Create a combined dataframe from all leaderboard data in session_state.
    combined_df = pd.DataFrame()
    for key in st.session_state.keys():
        if key.startswith("leaderboard_data_"):
            temp_df = st.session_state[key].copy()
            # If the "language" column is missing, use the key to assign a
            # language name.
            if "language" not in temp_df.columns:
                lang = key.split("leaderboard_data_")[1]
                temp_df["language"] = lang
            combined_df = pd.concat([combined_df, temp_df], ignore_index=True)
    if combined_df.empty:
        st.warning("No data available for any language ⚠️.")
        return
    # Check if the "Mean" column exists. If not, compute it.
    if "Mean" not in combined_df.columns:
        # Define model metadata columns that should be excluded from the
        # performance calculation.
        model_columns = ["model_name", "model_type", "num_parameters"]
        # Exclude metadata, language, and any non-numeric columns.
        performance_cols = [
            col
            for col in combined_df.columns
            if col not in model_columns + ["language", "Active"]
            and pd.api.types.is_numeric_dtype(combined_df[col])
        ]
        if performance_cols:
            combined_df["Mean"] = (
                combined_df[performance_cols].mean(axis=1).round(2)
            )
        else:
            st.warning(
                "No numeric task performance columns available to compute "
                "'Mean' ⚠️."
            )
            return
    return combined_df
def create_box_plot_per_language(id_: str):
    # Create a boxplot with performance (Mean) per language.
    combined_df = create_data_results_per_language()
    fig = px.box(
        combined_df,
        x="language",
        y="Mean",
        points="all",
        labels={"language": "Language", "Mean": "Performance (%)"},
    )
    st.plotly_chart(
        fig, use_container_width=True, key=id_ + str(random.random())
    )


def get_all_languages_aggregated_summary_df() -> pd.DataFrame:
    """
    Aggregate the combined summary data by model_name to compute mean
    performance across languages. Use this aggregated data for radar, scatter,
    pie, box, and histogram plots.
    """
    df = get_all_languages_summary_df()
    agg_df = df.groupby("model_name", as_index=False).agg(
        {
            "model_type": "first",  # choose an aggregation that makes sense
            "num_parameters": "mean",  # average model size across languages
            "Mean": "mean",  # average performance
        }
    )
    agg_df["Mean"] = agg_df["Mean"].round(2)
    return agg_df


def get_all_languages_raw_df() -> pd.DataFrame:
    """
    Combine the raw leaderboard data from all languages. This is used for plots
    (e.g., Fundamental vs Professional) that rely on the original task columns.
    """
    combined_df = pd.DataFrame()
    for key in st.session_state:
        if key.startswith("leaderboard_data_"):
            lang = key.split("leaderboard_data_")[1]
            temp_df = st.session_state[key].copy()
            temp_df["language"] = lang
            combined_df = pd.concat([combined_df, temp_df], ignore_index=True)
    return combined_df


# -----------------------------------------------------------------------------
# Sidebar for Navigation and Global Settings
# -----------------------------------------------------------------------------
st.sidebar.markdown(
    """
    A leaderboard of LLMs on languages from the Iberian Peninsula and Ibero-America
    """,
    unsafe_allow_html=True,
)


def load_languages_set():
    with open(LANGUAGES_SETTINGS, "r") as f:
        return yaml_load(f)


lang_set = load_languages_set()

for lang in lang_set.keys():
    data = load_data(lang)
    if f"leaderboard_data_{lang}" not in st.session_state:
        st.session_state[f"leaderboard_data_{lang}"] = data

# -----------------------------------------------------------------------------
# Main Content based on Navigation
# -----------------------------------------------------------------------------
if menu == "Leaderboard đ":
    st.markdown(
        "