"""Streamlit front-end for the IberBench leaderboard.

The script renders four pages selected from the sidebar: the leaderboard
(global and per-language rankings and plots), a model submission form, a
dataset catalogue and an "About" page. Leaderboard results are loaded from
the ``iberbench/lm-eval-results`` dataset and kept in ``st.session_state``
under ``leaderboard_data_<language>`` keys.
"""

import json
import os
import random
import re
import uuid
from pathlib import Path

import pandas as pd
import plotly.express as px
import plotly.graph_objects as go  # noqa: F401  (kept: may be used elsewhere)
import streamlit as st
from datasets import load_dataset
from huggingface_hub import CommitScheduler, hf_hub_download
from huggingface_hub.utils import RepositoryNotFoundError
from yaml import safe_load as yaml_load

from src.check_validity import validate_model
from src.task_mappings import professional_mapping, semantic_categories

# -----------------------------------------------------------------------------
# Page configuration and global CSS styles for modern look and improved UX
# -----------------------------------------------------------------------------
st.set_page_config(
    page_title="IberBench",
    layout="wide",
    initial_sidebar_state="expanded",
    page_icon="🌍",
)

# NOTE(review): the CSS payload of this call is empty in this revision of the
# file; it is kept as found.
st.markdown(
    """ """,
    unsafe_allow_html=True,
)

# -----------------------------------------------------------------------------
# Global variables and helper functions
# -----------------------------------------------------------------------------
request_file = Path("user_request/") / f"data_{uuid.uuid4()}.json"
request_folder = request_file.parent
LANGUAGES_SETTINGS = Path("etc/languages_settings.yml")

dataset_columns = [
    "workshop",
    "shared_task",
    "year",
    "task_type",
    "language",
    "url",
    "language_variety",
    "problem_type",
    "num_labels",
    "labels",
]
model_columns = ["model_name", "model_type", "num_parameters"]

# Prefix that task names carry in the task mappings but not in the result
# columns (see the ``"iberbench/" + column`` lookups below).
TASK_PREFIX = "iberbench/"
# Prefix of the session-state keys that hold one leaderboard frame per language.
LEADERBOARD_KEY_PREFIX = "leaderboard_data_"

# Sidebar entries; the same constants drive the page dispatch at the bottom.
MENU_LEADERBOARD = "Leaderboard 📊"
MENU_SUBMIT = "Submit Model 🚀"
MENU_DATASETS = "Datasets 📚"
MENU_ABOUT = "About ℹ️"

# Model-type option that switches chat-template validation on.
MODEL_TYPE_CHAT = "💬 Chat"

_MEDALS = {1: "🥇", 2: "🥈", 3: "🥉"}


@st.cache_resource
def _get_commit_scheduler() -> CommitScheduler:
    """Create the background uploader for submission requests exactly once.

    Streamlit re-executes this script on every interaction; without caching,
    each rerun would construct a new ``CommitScheduler`` (and with it a new
    background thread and a new, unshared lock).
    """
    return CommitScheduler(
        repo_id="iberbench/user-requests",
        repo_type="dataset",
        private=True,
        folder_path=request_folder,
        token=st.secrets["HF_TOKEN"],
        path_in_repo="data",
        every=10,  # minutes between commits, per the huggingface_hub docs
    )


scheduler = _get_commit_scheduler()


def log_submission(input_dict: dict) -> None:
    """Append one submission as a JSON line to the local request file.

    The scheduler lock is held while writing so the file is not modified
    while the scheduler is working on the folder.
    """
    with scheduler.lock:
        with request_file.open("a", encoding="utf-8") as f:
            f.write(json.dumps(input_dict))
            f.write("\n")


def get_lang_columns(columns: list, lang: str):
    """Return the entries of ``columns`` that belong to language ``lang``.

    For a "Mixed" language the first word of ``lang`` (lower-cased) must be
    the suffix of the column, which leaves out columns that carry a variation
    at the end. Otherwise the lower-cased, underscore-joined language name
    only has to appear somewhere in the column name.
    """
    if "Mixed" in lang:
        suffix = lang.lower().split(" ")[0]
        return [col for col in columns if col.endswith(suffix)]
    lang_norm = lang.lower().replace(" ", "_")
    return [col for col in columns if lang_norm in col]


@st.cache_data
def load_data(lang) -> pd.DataFrame:
    """Load the results for ``lang`` from the hub, scaled to percentages.

    Returns the model columns followed by the task columns selected by
    :func:`get_lang_columns`. If the dataset cannot be found an error is shown
    and an empty frame that still has the model columns is returned, so that
    downstream column selections keep working.
    """
    try:
        data = load_dataset(
            "iberbench/lm-eval-results", token=st.secrets["HF_TOKEN"]
        )["train"].to_pandas()
    except FileNotFoundError:
        st.error("iberbench/lm-eval-results was not found in the hub 😕")
        return pd.DataFrame(columns=model_columns)

    task_columns = [col for col in data.columns if col not in model_columns]
    task_lang_columns = get_lang_columns(task_columns, lang)
    data[task_columns] = data[task_columns] * 100
    return data[model_columns + task_lang_columns]


def load_dataset_card(task) -> list:
    """Return the metadata values of ``task`` in ``dataset_columns`` order.

    The values come from ``task_metadata.json`` in the ``iberbench/<task>``
    dataset repository. Missing fields become ``"-"`` (``[]`` for "labels").
    If the repository does not exist, an error is shown and a row of ``"-"``
    is returned.
    """
    name_repo = TASK_PREFIX + task
    try:
        info_path = hf_hub_download(
            repo_id=name_repo,
            filename="task_metadata.json",
            repo_type="dataset",
        )
    except RepositoryNotFoundError:
        st.error(task + ": dataset was not found in the hub 🚫")
        return ["-"] * len(dataset_columns)

    with open(info_path, "r", encoding="utf-8") as f:
        info = json.load(f)
    return [
        info.get(column, [] if column == "labels" else "-")
        for column in dataset_columns
    ]


def active_data(lang) -> pd.DataFrame:
    """Return a copy of the rows of ``lang`` whose "Active" flag is set.

    NOTE(review): ``load_data`` no longer creates the "Active" column (the
    assignment is commented out there), so this presumably only works if some
    other code adds it - confirm before relying on it.
    """
    leaderboard = st.session_state[f"{LEADERBOARD_KEY_PREFIX}{lang}"]
    return leaderboard[leaderboard["Active"] == True].copy()  # noqa: E712


def get_index(lang, row):
    """Return the index label of the ``row``-th active row of ``lang``."""
    return active_data(lang).iloc[row].name


def commit(lang) -> None:
    """Write the edits stored in ``edited_data_<lang>`` back to the leaderboard.

    Edited rows are addressed by their position among the active rows, so
    each position is translated to an index label first.
    """
    edited_rows = st.session_state[f"edited_data_{lang}"]["edited_rows"]
    leaderboard = st.session_state[f"{LEADERBOARD_KEY_PREFIX}{lang}"]
    for row, changes in edited_rows.items():
        row_index = get_index(lang, row)
        for column, value in changes.items():
            leaderboard.at[row_index, column] = value


def load_languages_set():
    """Load the language settings from ``LANGUAGES_SETTINGS`` (YAML).

    An empty file yields an empty mapping instead of ``None``.
    """
    with open(LANGUAGES_SETTINGS, "r", encoding="utf-8") as f:
        return yaml_load(f) or {}


def _iter_leaderboards():
    """Yield ``(language, frame)`` for every leaderboard in the session state."""
    for key in list(st.session_state.keys()):
        if isinstance(key, str) and key.startswith(LEADERBOARD_KEY_PREFIX):
            yield key.removeprefix(LEADERBOARD_KEY_PREFIX), st.session_state[key]


def _languages_in_category(lang_set: dict, category: str) -> list:
    """Return the languages of ``lang_set`` whose "category" is ``category``."""
    return [
        name
        for name, settings in lang_set.items()
        if settings["category"] == category
    ]


# -----------------------------------------------------------------------------
# Visualization helper functions
# -----------------------------------------------------------------------------
def _show_chart(fig, id_: str) -> None:
    """Render a Plotly figure at container width.

    A random suffix is appended to the element key, as the original code did
    for every chart - presumably to avoid duplicate-key errors when the same
    chart id is rendered more than once.
    """
    st.plotly_chart(
        fig, use_container_width=True, key=id_ + str(random.random())
    )


def _rank_labels(scores: pd.Series) -> list:
    """Return dense-rank labels (best score first) with medals for the top 3.

    Rows without a score get ``"-"`` instead of failing the integer cast.
    """
    labels = []
    for rank in scores.rank(method="dense", ascending=False):
        if pd.isna(rank):
            labels.append("-")
            continue
        position = int(rank)
        medal = _MEDALS.get(position)
        labels.append(f"{position} {medal}" if medal else str(position))
    return labels


def _show_ranked_table(df: pd.DataFrame) -> None:
    """Show ``df`` sorted by "Mean" with a leading "Rank" column.

    Works on a copy so the caller's frame is left untouched.
    """
    table = df.copy()
    table.insert(0, "Rank", _rank_labels(table["Mean"]))
    st.dataframe(
        table.sort_values("Mean", ascending=False),
        hide_index=True,
        use_container_width=True,
        column_config={
            "model_name": st.column_config.TextColumn("Model 🧠"),
            "model_type": st.column_config.TextColumn("Type 📌"),
            "num_parameters": st.column_config.NumberColumn("Model Size 🔢"),
        },
    )


def create_table_results(df_mean: pd.DataFrame):
    """Show the ranked results table of a single language."""
    _show_ranked_table(df_mean)


def create_table_all_results(aggregated_df: pd.DataFrame):
    """Show the global ranking with one extra column of means per language.

    The per-language means are matched to the aggregated rows by
    ``model_name`` rather than by row position.
    """
    table = aggregated_df.copy()
    combined_df = create_data_results_per_language()
    if not combined_df.empty:
        df_lang = combined_df.pivot(
            index="model_name", columns="language", values="Mean"
        ).reindex(table["model_name"])
        for language in df_lang.columns:
            table[language] = df_lang[language].to_numpy()
    _show_ranked_table(table)


def create_scatter_chart(df: pd.DataFrame, id_: str):
    """Plot mean performance against model size, one point per model."""
    fig = px.scatter(
        df,
        x="num_parameters",
        y="Mean",
        color="model_name",
        size="num_parameters",
        hover_data=["model_type"],
        labels={"num_parameters": "Num parameters"},
    )
    fig.update_layout(template="plotly_white")
    _show_chart(fig, id_)


def create_radar_chart(df: pd.DataFrame, id_: str):
    """Plot the ten best models (by "Mean") on a filled polar chart."""
    top = df.sort_values(by="Mean", ascending=False).head(10)
    radar_df = pd.DataFrame({"r": top["Mean"], "theta": top["model_name"]})
    fig = px.line_polar(
        radar_df,
        r="r",
        theta="theta",
        line_close=True,
        markers=True,
    )
    fig.update_traces(fill="toself")
    _show_chart(fig, id_)


def create_pie_chart(df: pd.DataFrame, id_: str):
    """Plot how many models there are of each model type."""
    df_pie = df["model_type"].value_counts().reset_index()
    df_pie.columns = ["model_type", "count"]
    fig = px.pie(
        df_pie,
        values="count",
        names="model_type",
        labels={"model_type": "Model type"},
    )
    _show_chart(fig, id_)


def create_box_plot(df: pd.DataFrame, id_: str):
    """Plot the distribution of "Mean" per model type."""
    fig = px.box(
        df,
        x="model_type",
        y="Mean",
        points="all",
        labels={"model_type": "Model type"},
    )
    _show_chart(fig, id_)


def create_histogram(df: pd.DataFrame, id_: str):
    """Plot the distribution of model sizes."""
    fig = px.histogram(
        df,
        x="num_parameters",
        nbins=20,
        labels={"num_parameters": "Num parameters", "count": "Count"},
    )
    fig.update_layout(template="plotly_white")
    _show_chart(fig, id_)


def _create_category_box_plot(df: pd.DataFrame, mapping: dict, id_: str):
    """Plot per-category performance, averaging the tasks of each category.

    ``mapping`` maps a category name to ``iberbench/``-prefixed task names.
    Categories keep the order of ``mapping`` and the input frame is not
    modified.
    """
    frame = df.copy()
    categories = []
    for category, tasks in mapping.items():
        relevant_cols = [
            col for col in df.columns if TASK_PREFIX + str(col) in tasks
        ]
        if relevant_cols:
            frame[category] = df[relevant_cols].mean(axis=1).round(2)
            categories.append(category)

    if not categories:
        st.warning("No task columns available for this plot ⚠️.")
        return

    id_vars = model_columns.copy()
    if "language" in df.columns:
        id_vars.append("language")
    df_melt = frame.melt(
        id_vars=id_vars,
        value_vars=categories,
        var_name="Task Category",
        value_name="Performance",
    )
    fig = px.box(
        df_melt,
        x="Task Category",
        y="Performance",
        points="all",
        labels={"Performance": "Performance (%)"},
    )
    _show_chart(fig, id_)


def create_box_plot_per_task_category(df: pd.DataFrame, id_: str):
    """Plot performance per category of ``professional_mapping``."""
    _create_category_box_plot(df, professional_mapping, id_)


def create_box_plot_per_semantic_category(df: pd.DataFrame, id_: str):
    """Plot performance per category of ``semantic_categories``."""
    _create_category_box_plot(df, semantic_categories, id_)


# -----------------------------------------------------------------------------
# Data preparation
# -----------------------------------------------------------------------------
def select_task_per_language(lang: str):
    """Return the semantic categories that have at least one task in ``lang``.

    Categories are returned in the order of ``semantic_categories``.
    """
    columns = set(st.session_state[f"{LEADERBOARD_KEY_PREFIX}{lang}"].columns)
    return [
        category
        for category, tasks in semantic_categories.items()
        if any(task.removeprefix(TASK_PREFIX) in columns for task in tasks)
    ]


def get_summary_df(lang: str, task_types: list) -> pd.DataFrame:
    """Return one row per model with a mean per semantic category.

    The result has the model columns, then "Mean" (the average over *all*
    category columns, NaN when there is none), then one column per category
    of ``task_types`` that has results in ``lang``.
    """
    leaderboard = st.session_state[f"{LEADERBOARD_KEY_PREFIX}{lang}"]
    summary = leaderboard[model_columns].copy()

    category_columns = []
    if not leaderboard.empty:
        for task_type in task_types:
            task_list = semantic_categories[task_type]
            cols = [
                col
                for col in leaderboard.columns
                if TASK_PREFIX + str(col) in task_list
            ]
            if cols:
                summary[task_type] = leaderboard[cols].mean(axis=1).round(2)
                category_columns.append(task_type)

    if category_columns:
        mean = summary[category_columns].mean(axis=1).round(2)
    else:
        mean = pd.Series(float("nan"), index=summary.index, dtype="float64")
    summary.insert(len(model_columns), "Mean", mean)
    return summary


def get_all_languages_summary_df() -> pd.DataFrame:
    """Combine leaderboard summary data from all languages using get_summary_df."""
    frames = []
    for lang, _ in _iter_leaderboards():
        summary_df = get_summary_df(lang, select_task_per_language(lang))
        summary_df["language"] = lang
        frames.append(summary_df)
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)


def get_all_languages_aggregated_summary_df() -> pd.DataFrame:
    """
    Aggregate the combined summary data by model_name to compute mean performance across languages.
    Use this aggregated data for radar, scatter, pie, box, and histogram plots.
    """
    df = get_all_languages_summary_df()
    if "model_name" not in df.columns:
        # No language has been loaded: return an empty frame of the same shape.
        return pd.DataFrame(columns=model_columns + ["Mean"])
    agg_df = df.groupby("model_name", as_index=False).agg(
        {
            "model_type": "first",  # choose an aggregation that makes sense
            "num_parameters": "mean",  # average model size across languages
            "Mean": "mean",  # average performance
        }
    )
    agg_df["Mean"] = agg_df["Mean"].round(2)
    return agg_df


def get_all_languages_raw_df() -> pd.DataFrame:
    """
    Combine the raw leaderboard data from all languages.
    This is used for plots (e.g., Fundamental vs Professional) that rely on the original task columns.
    """
    frames = []
    for lang, leaderboard in _iter_leaderboards():
        temp_df = leaderboard.copy()
        temp_df["language"] = lang
        frames.append(temp_df)
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)


def create_data_results_per_language() -> pd.DataFrame:
    """Combine all leaderboards and make sure a "Mean" column is present.

    A "language" column is filled in from the session key when a frame lacks
    one. If "Mean" is missing it is computed over every numeric column other
    than the model columns, "language" and "Active". When there is nothing to
    show a warning is displayed and an empty frame is returned.
    """
    frames = []
    for lang, leaderboard in _iter_leaderboards():
        temp_df = leaderboard.copy()
        if "language" not in temp_df.columns:
            temp_df["language"] = lang
        frames.append(temp_df)

    combined_df = (
        pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
    )
    if combined_df.empty:
        st.warning("No data available for any language ⚠️.")
        return pd.DataFrame()

    if "Mean" not in combined_df.columns:
        excluded = set(model_columns) | {"language", "Active"}
        performance_cols = [
            col
            for col in combined_df.columns
            if col not in excluded
            and pd.api.types.is_numeric_dtype(combined_df[col])
        ]
        if not performance_cols:
            st.warning(
                "No numeric task performance columns available to compute 'Mean' ⚠️."
            )
            return pd.DataFrame()
        combined_df["Mean"] = (
            combined_df[performance_cols].mean(axis=1).round(2)
        )
    return combined_df


def create_box_plot_per_language(id_: str):
    """Plot the distribution of "Mean" per language."""
    combined_df = create_data_results_per_language()
    if combined_df.empty:
        # The warning has already been shown by the loader.
        return
    fig = px.box(
        combined_df,
        x="language",
        y="Mean",
        points="all",
        labels={"language": "Language", "Mean": "Performance (%)"},
    )
    _show_chart(fig, id_)


# -----------------------------------------------------------------------------
# Page sections
# -----------------------------------------------------------------------------
def create_results_visualization_lang(lang: str):
    """Render the ranking table and the plots of a single language."""
    task_types = select_task_per_language(lang)
    summary_df = get_summary_df(lang, task_types)
    tasks_df = st.session_state[f"{LEADERBOARD_KEY_PREFIX}{lang}"].copy()

    # Results table for the selected language.
    create_table_results(summary_df)

    st.markdown("### Language plots 📊")
    in_lang_tabs = st.tabs(
        [
            "Top 10 performance 🥇",
            "Performance vs. size 📏",
            "Performance per type 💡",
            "Fundamental vs industry ⚖️",
            "Performance per task category 📈",
        ]
    )
    with in_lang_tabs[0]:
        create_radar_chart(summary_df, lang + "in_radar")
    with in_lang_tabs[1]:
        create_scatter_chart(summary_df, lang + "in_scatter")
    with in_lang_tabs[2]:
        create_box_plot(summary_df, lang + "in_box")
    with in_lang_tabs[3]:
        create_box_plot_per_task_category(tasks_df, lang + "in_box_task_cat")
    with in_lang_tabs[4]:
        create_box_plot_per_semantic_category(
            tasks_df, lang + "in_box_sem_cat"
        )


def create_dataset_info_per_language(lang: str):
    """Show the metadata of every task that has results for ``lang``.

    Every task column is listed; only the model columns and the "Active"
    bookkeeping column (if present) are skipped.
    """
    leaderboard = st.session_state[f"{LEADERBOARD_KEY_PREFIX}{lang}"]
    excluded = set(model_columns) | {"Active"}
    task_columns = (
        []
        if leaderboard.empty
        else [col for col in leaderboard.columns if col not in excluded]
    )
    if not task_columns:
        st.write("No data found to display on leaderboard 😔.")
        return

    all_values = [load_dataset_card(task) for task in task_columns]
    df = pd.DataFrame(all_values, columns=dataset_columns)
    st.dataframe(
        df,
        column_config={
            "workshop": st.column_config.TextColumn(
                "Workshop 🏫", help="Workshop to belong to the shared task"
            ),
            "shared_task": st.column_config.TextColumn(
                "Shared Task 📋", help="Shared Task name"
            ),
            "year": st.column_config.TextColumn(
                "Year 📅", help="Year of the shared task"
            ),
            "task_type": st.column_config.TextColumn(
                "Task Type 🔖", help="Shared Task type"
            ),
            "language": st.column_config.TextColumn(
                "Language 🌐", help="Shared Task language"
            ),
            "url": st.column_config.ListColumn(
                "Task URL 🔗", help="Shared Task url"
            ),
            "language_variety": st.column_config.TextColumn(
                "Language Variety 🗣️", help="Shared Task language variety"
            ),
            "problem_type": st.column_config.TextColumn(
                "Problem Type ❓", help="Shared Task problem type"
            ),
            "num_labels": st.column_config.NumberColumn(
                "Number of Labels 🔢", help="Shared Task number of labels"
            ),
            "labels": st.column_config.ListColumn(
                "Labels 🏷️", help="Shared Task labels"
            ),
        },
        hide_index=True,
    )


def _page_heading(title: str) -> None:
    """Render a page heading.

    NOTE(review): the headings in this file contain only text surrounded by
    blank lines; any HTML wrapper they once had is not present here.
    """
    st.markdown(f"\n\n{title}\n\n", unsafe_allow_html=True)


def _render_for_language(lang_choice: str, lang_set: dict, renderer) -> None:
    """Call ``renderer`` for ``lang_choice``, or per variation for Spanish.

    Spanish is split into one tab per language of the "Spanish Variations
    languages" category; if there is none, Spanish itself is rendered.
    """
    variations = (
        _languages_in_category(lang_set, "Spanish Variations languages")
        if lang_choice == "Spanish"
        else []
    )
    if not variations:
        renderer(lang_choice)
        return
    for variation, tab in zip(variations, st.tabs(variations)):
        with tab:
            renderer(variation)


def _render_leaderboard_page(lang_set: dict) -> None:
    """Render the global ranking, the global plots and the per-language view."""
    _page_heading("Leaderboard 📊")
    lang_iber = _languages_in_category(lang_set, "Iberian Peninsula languages")

    st.markdown("### General ranking 🏆")
    # Aggregated data: each model appears once with values averaged over
    # languages.
    aggregated_df = get_all_languages_aggregated_summary_df()
    create_table_all_results(aggregated_df)

    st.markdown("### General plots 📊")
    # Raw data keeps the original task columns needed by the category plots.
    raw_all_df = get_all_languages_raw_df()
    all_lang_tabs = st.tabs(
        [
            "Top 10 performance 🥇",
            "Performance vs. size 📏",
            "Type distribution 🎨",
            "Performance per type 💡",
            "Distribution of sizes 📊",
            "Fundamental vs industry ⚖️",
            "Performance per task category 📈",
            "Performance per language 🌐",
        ]
    )
    with all_lang_tabs[0]:
        create_radar_chart(aggregated_df, "all_radar")
    with all_lang_tabs[1]:
        create_scatter_chart(aggregated_df, "all_scatter")
    with all_lang_tabs[2]:
        create_pie_chart(aggregated_df, "all_pie")
    with all_lang_tabs[3]:
        create_box_plot(aggregated_df, "all_box")
    with all_lang_tabs[4]:
        create_histogram(aggregated_df, "all_hist")
    with all_lang_tabs[5]:
        create_box_plot_per_task_category(raw_all_df, "all_box_task_cat")
    with all_lang_tabs[6]:
        create_box_plot_per_semantic_category(raw_all_df, "all_box_sem_cat")
    with all_lang_tabs[7]:
        create_box_plot_per_language("all_box_language")

    # Results per language
    st.markdown("---")
    st.markdown("### Language ranking 🏆")
    lang_choice = st.selectbox(
        "Select a language 🌐:", list(lang_iber), key="lang_leaderboard"
    )
    _render_for_language(
        lang_choice, lang_set, create_results_visualization_lang
    )


def render_card(content):
    """Wrap one guide snippet for display.

    NOTE(review): only the content surrounded by newlines is emitted; any
    card markup is not present in this revision of the file.
    """
    return f"\n{content}\n"


def _render_submit_page() -> None:
    """Render the submission guide and the model submission form."""
    _page_heading("Submit Your Model 🚀")
    st.markdown("## How to submit a model 📤")
    # CSS (empty in this revision of the file, kept as found).
    st.markdown(
        """ """,
        unsafe_allow_html=True,
    )

    # Load the guide snippets, in file-name order.
    html_path = "assets/html"
    guide_info_list = []
    for filename in sorted(os.listdir(html_path)):
        file_path = os.path.join(html_path, filename)
        with open(file_path, "r", encoding="utf-8") as file:
            guide_info_list.append(file.read())

    # Lay the snippets out on a fixed 2 x 3 grid.
    num_columns = 3
    num_rows = 2
    for row in range(num_rows):
        cols = st.columns(num_columns)
        for col in range(num_columns):
            index = row * num_columns + col
            if index < len(guide_info_list):
                with cols[col]:
                    st.markdown(
                        render_card(guide_info_list[index]),
                        unsafe_allow_html=True,
                    )

    st.markdown("## Submission form 📝")
    with st.form("submit_model_form", clear_on_submit=True):
        model_name = st.text_input(
            "Model Name (format: user_name/model_name) 🧩",
            help="Your model should be public on the Hub and follow the username/model-id format (e.g. mistralai/Mistral-7B-v0.1).",
        )
        description = st.text_area(
            "Description ✍️",
            help="Add a description of the proposed model for the evaluation to help prioritize its evaluation.",
        )
        user_contact = st.text_input(
            "Your Contact Email 📧",
            help="User e-mail to contact when there are updates.",
        )
        precision_option = st.selectbox(
            "Choose precision format 🔢:",
            help="Size limits vary by precision. Choose carefully as incorrect precision can cause evaluation errors.",
            options=["float16", "bfloat16", "8bit", "4bit", "GPTQ"],
            index=0,
        )
        weight_type_option = st.selectbox(
            "Select weight type ⚖️:",
            help="Original: Complete model weights. Delta: Differences from base model. Adapter: Lightweight fine-tuning layers.",
            options=["Original", "Adapter", "Delta"],
            index=0,
        )
        base_model_name = st.text_input(
            "Base model (if applicable) 🏗️",
            help="Required for delta weights or adapters. This helps calculate total parameter count.",
            value="",
        )
        model_type = st.selectbox(
            "Choose model type 🔍:",
            help="🟢 Pretrained: Base models, 🔶 Fine-tuned: Domain-specific, 💬 Chat: Conversational, 🤝 Merge: Combined weights.",
            options=[
                "🟢 Pretrained",
                "🔶 Fine-tuned",
                MODEL_TYPE_CHAT,
                "🤝 Merge",
            ],
        )
        submit_button = st.form_submit_button("Submit Request 🚀")

        if submit_button:
            use_chat_template = model_type == MODEL_TYPE_CHAT
            validation_error = validate_model(
                model_name,
                precision_option,
                base_model_name,
                weight_type_option,
                use_chat_template,
            )
            if validation_error is not None:
                st.error(validation_error)
            # fullmatch: the whole input must look like an address, not just
            # a prefix of it (e.g. "a@b.c@d" is rejected).
            elif not re.fullmatch(r"[^@]+@[^@]+\.[^@]+", user_contact):
                st.error("Invalid email address ⚠️.")
            else:
                input_dict = {
                    "model_name": model_name,
                    "description": description,
                    "user_contact": user_contact,
                    "precision_option": precision_option,
                    "weight_type_option": weight_type_option,
                    "base_model_name": base_model_name,
                    "model_type": model_type,
                }
                try:
                    log_submission(input_dict)
                except Exception as e:  # UI boundary: report, do not crash
                    st.error(
                        f"Failed to send your request: {e}. Please try again later."
                    )
                else:
                    st.success("Your request has been sent successfully 🎉.")


def _render_datasets_page(lang_set: dict) -> None:
    """Render the per-language dataset catalogue and the task mappings."""
    _page_heading("Dataset Information 📚")
    st.markdown("### Check the datasets 🔍")
    lang_iber = _languages_in_category(lang_set, "Iberian Peninsula languages")
    lang_choice = st.selectbox(
        "Select a language 🌐:", list(lang_iber), key="lang_dataset"
    )
    _render_for_language(
        lang_choice, lang_set, create_dataset_info_per_language
    )

    st.markdown("### Task mappings 🔄")
    st.markdown(
        "For the sake of completeness, here we show the mappings we use in the leaderboard to aggregate tasks."
    )
    tab1, tab2 = st.tabs(
        ["Semantic categories 🗂️", "Fundamental vs. Industry ⚖️"]
    )
    with tab1:
        st.json(
            {
                category: [task.removeprefix(TASK_PREFIX) for task in tasks]
                for category, tasks in semantic_categories.items()
            }
        )
    with tab2:
        st.json(
            {
                category: [task.removeprefix(TASK_PREFIX) for task in tasks]
                for category, tasks in professional_mapping.items()
            }
        )


def _render_about_page() -> None:
    """Render the "About" page from ``assets/md/about.md``."""
    _page_heading("About ℹ️")
    with open("./assets/md/about.md", "r", encoding="utf-8") as fr:
        st.markdown(fr.read(), unsafe_allow_html=True)


# -----------------------------------------------------------------------------
# Sidebar for Navigation and Global Settings
# -----------------------------------------------------------------------------
st.sidebar.markdown("\n\nIberBench 🌍\n\n", unsafe_allow_html=True)
menu = st.sidebar.radio(
    "Navigation",  # hidden; Streamlit discourages empty widget labels
    [MENU_LEADERBOARD, MENU_SUBMIT, MENU_DATASETS, MENU_ABOUT],
    label_visibility="collapsed",
)
st.sidebar.markdown("---")
st.sidebar.markdown(
    "\n\nA leaderboard of LLMs on languages from the Iberian Peninsula and Ibero-America\n\n",
    unsafe_allow_html=True,
)

lang_set = load_languages_set()
for lang in lang_set:
    state_key = f"{LEADERBOARD_KEY_PREFIX}{lang}"
    # Only fetch when the session does not hold the frame yet.
    if state_key not in st.session_state:
        st.session_state[state_key] = load_data(lang)

# -----------------------------------------------------------------------------
# Main Content based on Navigation
# -----------------------------------------------------------------------------
if menu == MENU_LEADERBOARD:
    _render_leaderboard_page(lang_set)
elif menu == MENU_SUBMIT:
    _render_submit_page()
elif menu == MENU_DATASETS:
    _render_datasets_page(lang_set)
elif menu == MENU_ABOUT:
    _render_about_page()