import os
import subprocess
from inspect import getmembers, isfunction

import datasets
import numpy as np
import pandas as pd
import streamlit as st
from huggingface_hub import HfApi
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

import eval_utils
import utils
from constants import DIALECTS_WITH_LABELS

api = HfApi()

st.set_page_config(layout="wide")
st.title("MLADI Leaderboard")
st.write(
    "The Multi-label Arabic Dialect Identification (MLADI) leaderboard serves as a public interface for benchmarking ADI "
    "models using an 'extended version' of the NADI 2024 test set, "
    "the first multi-label country-level ADI dataset.\n\n"
    "🔜 More information about the dataset extension will be coming soon, stay tuned!"
)

# Shared-task teams' system description papers (used as the URL of their leaderboard entries)
SHARED_TASK_TEAMS = {
    "Elyadata": "https://aclanthology.org/2024.arabicnlp-1.85/",
    "NLP_DI": "https://aclanthology.org/2024.arabicnlp-1.82/",
    "dzNlp": "https://aclanthology.org/2024.arabicnlp-1.84/",
}

tab1, tab2 = st.tabs(["Leaderboard", "Submit a Model"])

with tab1:
    # Load the gold labels of the test set
    dataset_name = os.environ["DATASET_NAME"]
    dataset = datasets.load_dataset(dataset_name)["test"]
    labels = {dialect: dataset[dialect] for dialect in DIALECTS_WITH_LABELS}
    print("Loaded the labels, no. of samples:", len(dataset))

    # Load the models' predictions
    try:
        model_predictions_rows = datasets.load_dataset(
            os.environ["PREDICTIONS_DATASET_NAME"]
        )["train"]
    except Exception:
        st.info("Error loading the results!")
        model_predictions_rows = []

    if model_predictions_rows:
        # TODO: Store these metrics in a separate dataset!
        evaluation_metrics = []
        for row in model_predictions_rows:
            # Evaluate the model's predictions
            accuracy_scores = {}
            f1_scores = {}
            recall_scores = {}
            precision_scores = {}
            predictions = row["predictions"]

            if row["status"] != "completed":
                continue

            # Compute binary classification metrics for each dialect separately
            for dialect in DIALECTS_WITH_LABELS:
                y_true = labels[dialect]
                # A sample is positive for a dialect if the dialect is in the predicted label set
                y_pred = [dialect in prediction for prediction in predictions]
                accuracy = accuracy_score(y_true, y_pred)
                f1 = f1_score(y_true, y_pred)
                recall = recall_score(y_true, y_pred)
                precision = precision_score(y_true, y_pred)

                accuracy_scores[dialect] = accuracy
                f1_scores[dialect] = f1
                recall_scores[dialect] = recall
                precision_scores[dialect] = precision

            # Macro-average the per-dialect metrics
            macro_avg_accuracy = np.mean(list(accuracy_scores.values()))
            macro_avg_f1 = np.mean(list(f1_scores.values()))
            macro_avg_recall = np.mean(list(recall_scores.values()))
            macro_avg_precision = np.mean(list(precision_scores.values()))

            evaluation_metrics.append(
                {
                    "Model Name": f"{row['model_name']}\n({row['inference_function']})",
                    "Accuracy": macro_avg_accuracy,
                    "Recall": macro_avg_recall,
                    "Precision": macro_avg_precision,
                    "F1 score": macro_avg_f1,
                    # Shared-task team rows link to their papers instead of a HF model repo
                    "URL": (
                        f"https://huggingface.co/{row['model_name']}"
                        if "shared task team" not in row["model_name"]
                        else SHARED_TASK_TEAMS[row["model_name"].split(" (")[0]]
                    ),
                    "Commit ID": (
                        row["commit_id"][:5]
                        if "shared task team" not in row["model_name"]
                        else "N/A"
                    ),
                }
            )

        if evaluation_metrics:
            results_df = pd.DataFrame(evaluation_metrics).sort_values(
                "F1 score", ascending=False
            )
            results_df["Rank"] = range(1, len(results_df) + 1)
            results_df = results_df[
                [
                    "Rank",
                    "Model Name",
                    "F1 score",
                    "Precision",
                    "Recall",
                    "Accuracy",
                    "URL",
                    "Commit ID",
                ]
            ]
            st.data_editor(
                results_df,
                column_config={
                    "URL": st.column_config.LinkColumn("URL", required=False),
                },
                hide_index=True,
            )
            st.write("Note: The metrics are macro-averaged across all 11 dialects.")

    with st.expander("Click for more information."):
        inference_functions_names = [
            func_name for func_name, _ in getmembers(eval_utils, isfunction)
        ]

        # Show the docstrings of the inference functions
        inference_functions_docstring = [
            getattr(eval_utils, func).__doc__ for func in inference_functions_names
        ]
        inference_functions_df = pd.DataFrame(
            {
                "Method": inference_functions_names,
                "Description": inference_functions_docstring,
            }
        )

        st.markdown("## Inference Methods' Descriptions", unsafe_allow_html=True)
        st.markdown(
            inference_functions_df.to_markdown(index=False), unsafe_allow_html=True
        )

    with open("leaderboard_info.md", "r") as f:
        MARKDOWN_TEXT = f.read()

    st.markdown(MARKDOWN_TEXT)

    st.markdown(
        "For any inquiries, please do not hesitate to contact me: https://amr-keleg.github.io/"
    )

    with st.expander("Cite this leaderboard!"):
        st.write(
            """
Please cite the following paper in which we introduced the NADI 2024 evaluation sets:
```
@inproceedings{abdul-mageed-etal-2024-nadi,
    title = "{NADI} 2024: The Fifth Nuanced {A}rabic Dialect Identification Shared Task",
    author = "Abdul-Mageed, Muhammad  and
      Keleg, Amr  and
      Elmadany, AbdelRahim  and
      Zhang, Chiyu  and
      Hamed, Injy  and
      Magdy, Walid  and
      Bouamor, Houda  and
      Habash, Nizar",
    editor = "Habash, Nizar  and
      Bouamor, Houda  and
      Eskander, Ramy  and
      Tomeh, Nadi  and
      Abu Farha, Ibrahim  and
      Abdelali, Ahmed  and
      Touileb, Samia  and
      Hamed, Injy  and
      Onaizan, Yaser  and
      Alhafni, Bashar  and
      Antoun, Wissam  and
      Khalifa, Salam  and
      Haddad, Hatem  and
      Zitouni, Imed  and
      AlKhamissi, Badr  and
      Almatham, Rawan  and
      Mrini, Khalil",
    booktitle = "Proceedings of The Second Arabic Natural Language Processing Conference",
    month = aug,
    year = "2024",
    address = "Bangkok, Thailand",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2024.arabicnlp-1.79",
    doi = "10.18653/v1/2024.arabicnlp-1.79",
    pages = "709--728",
}
```
"""
        )

    # Evaluate the queued models
    if model_predictions_rows:
        models_to_be_evaluated = []
        models_in_progress = []

        for row in model_predictions_rows:
            if row["status"] == "queued":
                models_to_be_evaluated.append(row)
            elif row["status"] == "in_progress":
                models_in_progress.append(row)

        # Iterate over a copy, since stale evaluations are removed from the list below
        for model in list(models_in_progress):
            # Check if the evaluation has been stale for more than a day
            timestamp = model["last_updated_timestamp"]
            if utils.current_seconds_time() - timestamp > 86400:
                # Requeue the stale model
                utils.update_model_queue(
                    repo_id=os.environ["PREDICTIONS_DATASET_NAME"],
                    model_name=model["model_name"],
                    commit_id=model["commit_id"],
                    inference_function=model["inference_function"],
                    status="queued",
                )
                print(
                    f"Model {model['model_name']} has been stale for more than a day."
                )
                models_to_be_evaluated.append(model)
                models_in_progress.remove(model)

        # Only start new evaluations when no evaluation is currently in progress
        if not models_in_progress:
            for row in models_to_be_evaluated:
                # Evaluate the model in a background process
                subprocess.Popen(
                    [
                        "python",
                        "background_inference.py",
                        row["model_name"],
                        row["commit_id"],
                        row["inference_function"],
                    ]
                )
                print(f"Started the evaluation of {row['model_name']}.")

with tab2:
    model_name = st.text_input("Enter a model's name on HF")
    model_revision = st.text_input(
        "Enter a model's revision on HF (commit id, or branch name)",
        placeholder="main",
        value="main",
    )

    inference_functions_names = [
        func_name for func_name, _ in getmembers(eval_utils, isfunction)
    ]
    inference_function = st.selectbox(
        "Inference Method",
        inference_functions_names,
    )
    # TODO: Allow modifying the ad-hoc threshold values of the different inference methods

    # Show the docstrings of the inference functions
    inference_functions_docstring = [
        getattr(eval_utils, func).__doc__ for func in inference_functions_names
    ]
    inference_functions_df = pd.DataFrame(
        {
            "Method": inference_functions_names,
            "Description": inference_functions_docstring,
        }
    )
    with st.expander("Check the inference methods' short descriptions"):
        st.markdown(
            inference_functions_df.to_markdown(index=False), unsafe_allow_html=True
        )
        st.write(
            "Note: We are happy to discuss adding new custom inference methods for your models."
        )

    if model_name and model_revision and inference_function:
        # Resolve the revision (branch name or commit id) to the latest commit id
        commit_id = api.list_repo_commits(model_name, revision=model_revision)[
            0
        ].commit_id

        model_predictions_rows = datasets.load_dataset(
            os.environ["PREDICTIONS_DATASET_NAME"]
        )["train"]

        # Check if the model is already in the leaderboard
        model_exists = any(
            row["model_name"] == model_name
            and row["commit_id"] == commit_id
            and row["inference_function"] == inference_function
            for row in model_predictions_rows
        )

        if not model_exists:
            # Add the model to the evaluation queue
            utils.update_model_queue(
                repo_id=os.environ["PREDICTIONS_DATASET_NAME"],
                model_name=model_name,
                commit_id=commit_id,
                inference_function=inference_function,
                status="queued",
            )
            st.info(
                f"The evaluation of the model {model_name} is queued for processing."
            )
        else:
            st.info(
                f"The model {model_name} has already been submitted to the leaderboard."
            )