# model-evaluator / app.py
import os
import time
from pathlib import Path
import pandas as pd
import streamlit as st
import yaml
from datasets import get_dataset_config_names
from dotenv import load_dotenv
from huggingface_hub import list_datasets
from evaluation import filter_evaluated_models
from utils import (
    AUTOTRAIN_TASK_TO_HUB_TASK,
    commit_evaluation_log,
    create_autotrain_project_name,
    format_col_mapping,
    get_compatible_models,
    get_config_metadata,
    get_dataset_card_url,
    get_key,
    get_metadata,
    http_get,
    http_post,
)
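# Credentials and API endpoints are read from the environment (or a local .env file).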
if Path(".env").is_file():
    load_dotenv(".env")
HF_TOKEN = os.getenv("HF_TOKEN")
AUTOTRAIN_USERNAME = os.getenv("AUTOTRAIN_USERNAME")
AUTOTRAIN_BACKEND_API = os.getenv("AUTOTRAIN_BACKEND_API")
DATASETS_PREVIEW_API = os.getenv("DATASETS_PREVIEW_API")
# Put image tasks on top
TASK_TO_ID = {
    "image_binary_classification": 17,
    "image_multi_class_classification": 18,
    "binary_classification": 1,
    "multi_class_classification": 2,
    "natural_language_inference": 22,
    "entity_extraction": 4,
    "extractive_question_answering": 5,
    "translation": 6,
    "summarization": 8,
    "text_zero_shot_classification": 23,
}
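# Metrics that are always computed for each task; users can add extras from SUPPORTED_METRICS below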
TASK_TO_DEFAULT_METRICS = {
    "binary_classification": ["f1", "precision", "recall", "auc", "accuracy"],
    "multi_class_classification": [
        "f1",
        "precision",
        "recall",
        "accuracy",
    ],
    "natural_language_inference": ["f1", "precision", "recall", "auc", "accuracy"],
    "entity_extraction": ["precision", "recall", "f1", "accuracy"],
    "extractive_question_answering": ["f1", "exact_match"],
    "translation": ["sacrebleu"],
    "summarization": ["rouge1", "rouge2", "rougeL", "rougeLsum"],
    "image_binary_classification": ["f1", "precision", "recall", "auc", "accuracy"],
    "image_multi_class_classification": [
        "f1",
        "precision",
        "recall",
        "accuracy",
    ],
    "text_zero_shot_classification": ["accuracy", "loss"],
}
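# AutoTrain-specific defaults: language hints for tasks without a single source language,
# and larger machines for tasks that need them (otherwise "en" and "p3" are used)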
AUTOTRAIN_TASK_TO_LANG = {
    "translation": "en2de",
    "image_binary_classification": "unk",
    "image_multi_class_classification": "unk",
}
AUTOTRAIN_MACHINE = {"text_zero_shot_classification": "r5.16x"}
SUPPORTED_TASKS = list(TASK_TO_ID.keys())
# Extracted from utils.get_supported_metrics
# Hardcoded for now due to speed / caching constraints
SUPPORTED_METRICS = [
    "accuracy",
    "bertscore",
    "bleu",
    "cer",
    "chrf",
    "code_eval",
    "comet",
    "competition_math",
    "coval",
    "cuad",
    "exact_match",
    "f1",
    "frugalscore",
    "google_bleu",
    "mae",
    "mahalanobis",
    "matthews_correlation",
    "mean_iou",
    "meteor",
    "mse",
    "pearsonr",
    "perplexity",
    "precision",
    "recall",
    "roc_auc",
    "rouge",
    "sacrebleu",
    "sari",
    "seqeval",
    "spearmanr",
    "squad",
    "squad_v2",
    "ter",
    "trec_eval",
    "wer",
    "wiki_split",
    "xnli",
    "angelina-wang/directional_bias_amplification",
    "jordyvl/ece",
    "lvwerra/ai4code",
    "lvwerra/amex",
]
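# A rough sketch of how this hardcoded list could be regenerated offline, assuming
# the `evaluate` library is installed (whether utils.get_supported_metrics works
# exactly this way is an assumption; the call is too slow to run on every app load):
#
#     import evaluate
#     print(sorted(evaluate.list_evaluation_modules(module_type="metric")))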
#######
# APP #
#######
st.title("Evaluation on the Hub")
st.warning(
    "**⚠️ This project has been archived. If you want to evaluate LLMs, check out [this collection](https://huggingface.co/collections/clefourrier/llm-leaderboards-and-benchmarks-✨-64f99d2e11e92ca5568a7cce) of leaderboards.**"
)
st.markdown(
    """
    Welcome to Hugging Face's automatic model evaluator 👋!
    This application allows you to evaluate 🤗 Transformers
    [models](https://huggingface.co/models?library=transformers&sort=downloads)
    across a wide variety of [datasets](https://huggingface.co/datasets) on the
    Hub. Please select the dataset and configuration below. The results of your
    evaluation will be displayed on the [public
    leaderboards](https://huggingface.co/spaces/autoevaluate/leaderboards). For
    more details, check out our [blog
    post](https://huggingface.co/blog/eval-on-the-hub).
    """
)
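# ---------------------------------------------------------------------------
# Everything below is the original evaluation workflow, commented out when the
# project was archived and kept for reference. It selected a dataset, config,
# split and column mapping, let the user pick models and extra metrics, and
# then submitted an evaluation job to the AutoTrain backend.
# ---------------------------------------------------------------------------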
# all_datasets = [d.id for d in list_datasets()]
# query_params = st.experimental_get_query_params()
# if "first_query_params" not in st.session_state:
# st.session_state.first_query_params = query_params
# first_query_params = st.session_state.first_query_params
# default_dataset = all_datasets[0]
# if "dataset" in first_query_params:
# if len(first_query_params["dataset"]) > 0 and first_query_params["dataset"][0] in all_datasets:
# default_dataset = first_query_params["dataset"][0]
# selected_dataset = st.selectbox(
# "Select a dataset",
# all_datasets,
# index=all_datasets.index(default_dataset),
# help="""Datasets with metadata can be evaluated with 1-click. Configure an evaluation job to add \
# new metadata to a dataset card.""",
# )
# st.experimental_set_query_params(**{"dataset": [selected_dataset]})
# # Check if selected dataset can be streamed
# is_valid_dataset = http_get(
# path="/is-valid",
# domain=DATASETS_PREVIEW_API,
# params={"dataset": selected_dataset},
# ).json()
# if is_valid_dataset["viewer"] is False and is_valid_dataset["preview"] is False:
# st.error(
# """The dataset you selected is not currently supported. Open a \
# [discussion](https://huggingface.co/spaces/autoevaluate/model-evaluator/discussions) for support."""
# )
# metadata = get_metadata(selected_dataset, token=HF_TOKEN)
# print(f"INFO -- Dataset metadata: {metadata}")
# if metadata is None:
# st.warning("No evaluation metadata found. Please configure the evaluation job below.")
# with st.expander("Advanced configuration"):
# # Select task
# selected_task = st.selectbox(
# "Select a task",
# SUPPORTED_TASKS,
# index=SUPPORTED_TASKS.index(metadata[0]["task_id"]) if metadata is not None else 0,
# help="""Don't see your favourite task here? Open a \
# [discussion](https://huggingface.co/spaces/autoevaluate/model-evaluator/discussions) to request it!""",
# )
# # Select config
# configs = get_dataset_config_names(selected_dataset)
# selected_config = st.selectbox(
# "Select a config",
# configs,
# help="""Some datasets contain several sub-datasets, known as _configurations_. \
# Select one to evaluate your models on. \
# See the [docs](https://huggingface.co/docs/datasets/master/en/load_hub#configurations) for more details.
# """,
# )
# # Some datasets have multiple metadata (one per config), so we grab the one associated with the selected config
# config_metadata = get_config_metadata(selected_config, metadata)
# print(f"INFO -- Config metadata: {config_metadata}")
# # Select splits
# splits_resp = http_get(
# path="/splits",
# domain=DATASETS_PREVIEW_API,
# params={"dataset": selected_dataset},
# )
# if splits_resp.status_code == 200:
# split_names = []
# all_splits = splits_resp.json()
# for split in all_splits["splits"]:
# if split["config"] == selected_config:
# split_names.append(split["split"])
# if config_metadata is not None:
# eval_split = config_metadata["splits"].get("eval_split", None)
# else:
# eval_split = None
# selected_split = st.selectbox(
# "Select a split",
# split_names,
# index=split_names.index(eval_split) if eval_split is not None else 0,
# help="Be wary when evaluating models on the `train` split.",
# )
# # Select columns
# rows_resp = http_get(
# path="/first-rows",
# domain=DATASETS_PREVIEW_API,
# params={
# "dataset": selected_dataset,
# "config": selected_config,
# "split": selected_split,
# },
# ).json()
# col_names = list(pd.json_normalize(rows_resp["rows"][0]["row"]).columns)
# st.markdown("**Map your dataset columns**")
# st.markdown(
# """The model evaluator uses a standardised set of column names for the input examples and labels. \
# Please define the mapping between your dataset columns (right) and the standardised column names (left)."""
# )
# col1, col2 = st.columns(2)
# # TODO: find a better way to layout these items
# # TODO: need graceful way of handling dataset <--> task mismatch for datasets with metadata
# col_mapping = {}
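# # The branches below fill col_mapping keyed by the user's dataset column name,
# # with the standardised column name expected downstream as the value
# # (e.g. col_mapping[text_col] = "text").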
# if selected_task in ["binary_classification", "multi_class_classification"]:
# with col1:
# st.markdown("`text` column")
# st.text("")
# st.text("")
# st.text("")
# st.text("")
# st.markdown("`target` column")
# with col2:
# text_col = st.selectbox(
# "This column should contain the text to be classified",
# col_names,
# index=col_names.index(get_key(config_metadata["col_mapping"], "text"))
# if config_metadata is not None
# else 0,
# )
# target_col = st.selectbox(
# "This column should contain the labels associated with the text",
# col_names,
# index=col_names.index(get_key(config_metadata["col_mapping"], "target"))
# if config_metadata is not None
# else 0,
# )
# col_mapping[text_col] = "text"
# col_mapping[target_col] = "target"
# elif selected_task == "text_zero_shot_classification":
# with col1:
# st.markdown("`text` column")
# st.text("")
# st.text("")
# st.text("")
# st.text("")
# st.markdown("`classes` column")
# st.text("")
# st.text("")
# st.text("")
# st.text("")
# st.markdown("`target` column")
# with col2:
# text_col = st.selectbox(
# "This column should contain the text to be classified",
# col_names,
# index=col_names.index(get_key(config_metadata["col_mapping"], "text"))
# if config_metadata is not None
# else 0,
# )
# classes_col = st.selectbox(
# "This column should contain the classes associated with the text",
# col_names,
# index=col_names.index(get_key(config_metadata["col_mapping"], "classes"))
# if config_metadata is not None
# else 0,
# )
# target_col = st.selectbox(
# "This column should contain the index of the correct class",
# col_names,
# index=col_names.index(get_key(config_metadata["col_mapping"], "target"))
# if config_metadata is not None
# else 0,
# )
# col_mapping[text_col] = "text"
# col_mapping[classes_col] = "classes"
# col_mapping[target_col] = "target"
# if selected_task in ["natural_language_inference"]:
# config_metadata = get_config_metadata(selected_config, metadata)
# with col1:
# st.markdown("`text1` column")
# st.text("")
# st.text("")
# st.text("")
# st.text("")
# st.text("")
# st.markdown("`text2` column")
# st.text("")
# st.text("")
# st.text("")
# st.text("")
# st.text("")
# st.markdown("`target` column")
# with col2:
# text1_col = st.selectbox(
# "This column should contain the first text passage to be classified",
# col_names,
# index=col_names.index(get_key(config_metadata["col_mapping"], "text1"))
# if config_metadata is not None
# else 0,
# )
# text2_col = st.selectbox(
# "This column should contain the second text passage to be classified",
# col_names,
# index=col_names.index(get_key(config_metadata["col_mapping"], "text2"))
# if config_metadata is not None
# else 0,
# )
# target_col = st.selectbox(
# "This column should contain the labels associated with the text",
# col_names,
# index=col_names.index(get_key(config_metadata["col_mapping"], "target"))
# if config_metadata is not None
# else 0,
# )
# col_mapping[text1_col] = "text1"
# col_mapping[text2_col] = "text2"
# col_mapping[target_col] = "target"
# elif selected_task == "entity_extraction":
# with col1:
# st.markdown("`tokens` column")
# st.text("")
# st.text("")
# st.text("")
# st.text("")
# st.markdown("`tags` column")
# with col2:
# tokens_col = st.selectbox(
# "This column should contain the array of tokens to be classified",
# col_names,
# index=col_names.index(get_key(config_metadata["col_mapping"], "tokens"))
# if config_metadata is not None
# else 0,
# )
# tags_col = st.selectbox(
# "This column should contain the labels associated with each part of the text",
# col_names,
# index=col_names.index(get_key(config_metadata["col_mapping"], "tags"))
# if config_metadata is not None
# else 0,
# )
# col_mapping[tokens_col] = "tokens"
# col_mapping[tags_col] = "tags"
# elif selected_task == "translation":
# with col1:
# st.markdown("`source` column")
# st.text("")
# st.text("")
# st.text("")
# st.text("")
# st.markdown("`target` column")
# with col2:
# text_col = st.selectbox(
# "This column should contain the text to be translated",
# col_names,
# index=col_names.index(get_key(config_metadata["col_mapping"], "source"))
# if config_metadata is not None
# else 0,
# )
# target_col = st.selectbox(
# "This column should contain the target translation",
# col_names,
# index=col_names.index(get_key(config_metadata["col_mapping"], "target"))
# if config_metadata is not None
# else 0,
# )
# col_mapping[text_col] = "source"
# col_mapping[target_col] = "target"
# elif selected_task == "summarization":
# with col1:
# st.markdown("`text` column")
# st.text("")
# st.text("")
# st.text("")
# st.text("")
# st.markdown("`target` column")
# with col2:
# text_col = st.selectbox(
# "This column should contain the text to be summarized",
# col_names,
# index=col_names.index(get_key(config_metadata["col_mapping"], "text"))
# if config_metadata is not None
# else 0,
# )
# target_col = st.selectbox(
# "This column should contain the target summary",
# col_names,
# index=col_names.index(get_key(config_metadata["col_mapping"], "target"))
# if config_metadata is not None
# else 0,
# )
# col_mapping[text_col] = "text"
# col_mapping[target_col] = "target"
# elif selected_task == "extractive_question_answering":
# if config_metadata is not None:
# col_mapping = config_metadata["col_mapping"]
# # Hub YAML parser converts periods to hyphens, so we remap them here
# col_mapping = format_col_mapping(col_mapping)
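# # (Presumably this turns keys such as "answers-text" back into "answers.text";
# # the exact behaviour of format_col_mapping lives in utils.py.)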
# with col1:
# st.markdown("`context` column")
# st.text("")
# st.text("")
# st.text("")
# st.text("")
# st.markdown("`question` column")
# st.text("")
# st.text("")
# st.text("")
# st.text("")
# st.markdown("`answers.text` column")
# st.text("")
# st.text("")
# st.text("")
# st.text("")
# st.markdown("`answers.answer_start` column")
# with col2:
# context_col = st.selectbox(
# "This column should contain the question's context",
# col_names,
# index=col_names.index(get_key(col_mapping, "context")) if config_metadata is not None else 0,
# )
# question_col = st.selectbox(
# "This column should contain the question to be answered, given the context",
# col_names,
# index=col_names.index(get_key(col_mapping, "question")) if config_metadata is not None else 0,
# )
# answers_text_col = st.selectbox(
# "This column should contain example answers to the question, extracted from the context",
# col_names,
# index=col_names.index(get_key(col_mapping, "answers.text")) if config_metadata is not None else 0,
# )
# answers_start_col = st.selectbox(
# "This column should contain the indices in the context of the first character of each `answers.text`",
# col_names,
# index=col_names.index(get_key(col_mapping, "answers.answer_start"))
# if config_metadata is not None
# else 0,
# )
# col_mapping[context_col] = "context"
# col_mapping[question_col] = "question"
# col_mapping[answers_text_col] = "answers.text"
# col_mapping[answers_start_col] = "answers.answer_start"
# elif selected_task in ["image_binary_classification", "image_multi_class_classification"]:
# with col1:
# st.markdown("`image` column")
# st.text("")
# st.text("")
# st.text("")
# st.text("")
# st.markdown("`target` column")
# with col2:
# image_col = st.selectbox(
# "This column should contain the images to be classified",
# col_names,
# index=col_names.index(get_key(config_metadata["col_mapping"], "image"))
# if config_metadata is not None
# else 0,
# )
# target_col = st.selectbox(
# "This column should contain the labels associated with the images",
# col_names,
# index=col_names.index(get_key(config_metadata["col_mapping"], "target"))
# if config_metadata is not None
# else 0,
# )
# col_mapping[image_col] = "image"
# col_mapping[target_col] = "target"
# # Select metrics
# st.markdown("**Select metrics**")
# st.markdown("The following metrics will be computed")
# html_string = " ".join(
# [
# '<div style="padding-right:5px;padding-left:5px;padding-top:5px;padding-bottom:5px;float:left">'
# + '<div style="background-color:#D3D3D3;border-radius:5px;display:inline-block;padding-right:5px;'
# + 'padding-left:5px;color:white">'
# + metric
# + "</div></div>"
# for metric in TASK_TO_DEFAULT_METRICS[selected_task]
# ]
# )
# st.markdown(html_string, unsafe_allow_html=True)
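# # The HTML above simply renders the task's default metrics as small grey badges.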
# selected_metrics = st.multiselect(
# "(Optional) Select additional metrics",
# sorted(list(set(SUPPORTED_METRICS) - set(TASK_TO_DEFAULT_METRICS[selected_task]))),
# help="""User-selected metrics will be computed with their default arguments. \
# For example, `f1` will report results for binary labels. \
# Check out the [available metrics](https://huggingface.co/metrics) for more details.""",
# )
# with st.form(key="form"):
# compatible_models = get_compatible_models(selected_task, [selected_dataset])
# selected_models = st.multiselect(
# "Select the models you wish to evaluate",
# compatible_models,
# help="""Don't see your favourite model in this list? Add the dataset and task it was trained on to the \
# [model card metadata.](https://huggingface.co/docs/hub/models-cards#model-card-metadata)""",
# )
# print("INFO -- Selected models before filter:", selected_models)
# hf_username = st.text_input("Enter your 🤗 Hub username to be notified when the evaluation is finished")
# submit_button = st.form_submit_button("Evaluate models 🚀")
# if submit_button:
# if len(hf_username) == 0:
# st.warning("No 🤗 Hub username provided! Please enter your username and try again.")
# elif len(selected_models) == 0:
# st.warning("⚠️ No models were selected for evaluation! Please select at least one model and try again.")
# elif len(selected_models) > 10:
# st.warning("Only 10 models can be evaluated at once. Please select fewer models and try again.")
# else:
# # Filter out previously evaluated models
# selected_models = filter_evaluated_models(
# selected_models,
# selected_task,
# selected_dataset,
# selected_config,
# selected_split,
# selected_metrics,
# )
# print("INFO -- Selected models after filter:", selected_models)
# if len(selected_models) > 0:
# project_payload = {
# "username": AUTOTRAIN_USERNAME,
# "proj_name": create_autotrain_project_name(selected_dataset, selected_config),
# "task": TASK_TO_ID[selected_task],
# "config": {
# "language": AUTOTRAIN_TASK_TO_LANG[selected_task]
# if selected_task in AUTOTRAIN_TASK_TO_LANG
# else "en",
# "max_models": 5,
# "instance": {
# "provider": "sagemaker" if selected_task in AUTOTRAIN_MACHINE.keys() else "ovh",
# "instance_type": AUTOTRAIN_MACHINE[selected_task]
# if selected_task in AUTOTRAIN_MACHINE.keys()
# else "p3",
# "max_runtime_seconds": 172800,
# "num_instances": 1,
# "disk_size_gb": 200,
# },
# "evaluation": {
# "metrics": selected_metrics,
# "models": selected_models,
# "hf_username": hf_username,
# },
# },
# }
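# # The payload maps the task to its AutoTrain numeric ID, defaults the language
# # to "en" unless the task appears in AUTOTRAIN_TASK_TO_LANG, and requests an
# # OVH "p3" instance unless the task needs a dedicated machine (AUTOTRAIN_MACHINE).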
# print(f"INFO -- Payload: {project_payload}")
# project_json_resp = http_post(
# path="/projects/create",
# payload=project_payload,
# token=HF_TOKEN,
# domain=AUTOTRAIN_BACKEND_API,
# ).json()
# print(f"INFO -- Project creation response: {project_json_resp}")
# if project_json_resp["created"]:
# data_payload = {
# "split": 4, # use "auto" split choice in AutoTrain
# "col_mapping": col_mapping,
# "load_config": {"max_size_bytes": 0, "shuffle": False},
# "dataset_id": selected_dataset,
# "dataset_config": selected_config,
# "dataset_split": selected_split,
# }
# data_json_resp = http_post(
# path=f"/projects/{project_json_resp['id']}/data/dataset",
# payload=data_payload,
# token=HF_TOKEN,
# domain=AUTOTRAIN_BACKEND_API,
# ).json()
# print(f"INFO -- Dataset creation response: {data_json_resp}")
# if data_json_resp["download_status"] == 1:
# train_json_resp = http_post(
# path=f"/projects/{project_json_resp['id']}/data/start_processing",
# token=HF_TOKEN,
# domain=AUTOTRAIN_BACKEND_API,
# ).json()
# # For local development we process and approve projects on-the-fly
# if "localhost" in AUTOTRAIN_BACKEND_API:
# with st.spinner("⏳ Waiting for data processing to complete ..."):
# is_data_processing_success = False
# while is_data_processing_success is not True:
# project_status = http_get(
# path=f"/projects/{project_json_resp['id']}",
# token=HF_TOKEN,
# domain=AUTOTRAIN_BACKEND_API,
# ).json()
# if project_status["status"] == 3:
# is_data_processing_success = True
# time.sleep(10)
# # Approve training job
# train_job_resp = http_post(
# path=f"/projects/{project_json_resp['id']}/start_training",
# token=HF_TOKEN,
# domain=AUTOTRAIN_BACKEND_API,
# ).json()
# st.success("✅ Data processing and project approval complete - go forth and evaluate!")
# else:
# # Prod/staging submissions are evaluated in a cron job via run_evaluation_jobs.py
# print(f"INFO -- AutoTrain job response: {train_json_resp}")
# if train_json_resp["success"]:
# train_eval_index = {
# "train-eval-index": [
# {
# "config": selected_config,
# "task": AUTOTRAIN_TASK_TO_HUB_TASK[selected_task],
# "task_id": selected_task,
# "splits": {"eval_split": selected_split},
# "col_mapping": col_mapping,
# }
# ]
# }
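# # This `train-eval-index` block is the same metadata the app suggests adding to
# # the dataset card below, which is what enables 1-click evaluations.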
# selected_metadata = yaml.dump(train_eval_index, sort_keys=False)
# dataset_card_url = get_dataset_card_url(selected_dataset)
# st.success("✅ Successfully submitted evaluation job!")
# st.markdown(
# f"""
# Evaluation can take up to 1 hour to complete, so grab a ☕️ or 🍵 while you wait:
# * 🔔 A [Hub pull request](https://huggingface.co/docs/hub/repositories-pull-requests-discussions) with the evaluation results will be opened for each model you selected. Check your email for notifications.
# * 📊 Click [here](https://hf.co/spaces/autoevaluate/leaderboards?dataset={selected_dataset}) to view the results from your submission once the Hub pull request is merged.
# * 🥱 Tired of configuring evaluations? Add the following metadata to the [dataset card]({dataset_card_url}) to enable 1-click evaluations:
# """ # noqa
# )
# st.markdown(
# f"""
# ```yaml
# {selected_metadata}
# """
# )
# print("INFO -- Pushing evaluation job logs to the Hub")
# evaluation_log = {}
# evaluation_log["project_id"] = project_json_resp["id"]
# evaluation_log["autotrain_env"] = (
# "staging" if "staging" in AUTOTRAIN_BACKEND_API else "prod"
# )
# evaluation_log["payload"] = project_payload
# evaluation_log["project_creation_response"] = project_json_resp
# evaluation_log["dataset_creation_response"] = data_json_resp
# evaluation_log["autotrain_job_response"] = train_json_resp
# commit_evaluation_log(evaluation_log, hf_access_token=HF_TOKEN)
# else:
# st.error("🙈 Oh no, there was an error submitting your evaluation job!")
# else:
# st.warning("⚠️ No models left to evaluate! Please select other models and try again.")