|
import os |
|
import uuid |
|
from pathlib import Path |
|
|
|
import pandas as pd |
|
import streamlit as st |
|
from datasets import get_dataset_config_names |
|
from dotenv import load_dotenv |
|
from huggingface_hub import list_datasets |
|
|
|
from evaluation import filter_evaluated_models |
|
from utils import ( |
|
commit_evaluation_log, |
|
format_col_mapping, |
|
get_compatible_models, |
|
get_key, |
|
get_metadata, |
|
http_get, |
|
http_post, |
|
) |
|
|
|
# Pick up local settings from a `.env` file when one exists in the working directory.
_dotenv_file = Path(".env")
if _dotenv_file.is_file():
    load_dotenv(".env")

# Credentials and service endpoints, read from the environment (None when unset).
HF_TOKEN = os.environ.get("HF_TOKEN")
AUTOTRAIN_USERNAME = os.environ.get("AUTOTRAIN_USERNAME")
AUTOTRAIN_BACKEND_API = os.environ.get("AUTOTRAIN_BACKEND_API")
DATASETS_PREVIEW_API = os.environ.get("DATASETS_PREVIEW_API")
|
|
|
|
|
# Numeric id sent as the "task" field when an evaluation project is created.
# Insertion order matters: SUPPORTED_TASKS is derived from these keys, so the
# image tasks listed first appear at the top of the task selector.
TASK_TO_ID = dict(
    image_binary_classification=17,
    image_multi_class_classification=18,
    binary_classification=1,
    multi_class_classification=2,
    entity_extraction=4,
    extractive_question_answering=5,
    translation=6,
    summarization=8,
)
|
|
|
# Metrics listed as "will be computed" for each task. They are rendered as chips
# in the metrics section and removed from the optional extra-metrics picker.
# Extractive question answering has an empty default list.
TASK_TO_DEFAULT_METRICS = {
    "binary_classification": ["f1", "precision", "recall", "auc", "accuracy"],
    "multi_class_classification": ["f1", "precision", "recall", "accuracy"],
    "entity_extraction": ["precision", "recall", "f1", "accuracy"],
    "extractive_question_answering": [],
    "translation": ["sacrebleu"],
    "summarization": ["rouge1", "rouge2", "rougeL", "rougeLsum"],
    "image_binary_classification": ["f1", "precision", "recall", "auc", "accuracy"],
    "image_multi_class_classification": ["f1", "precision", "recall", "accuracy"],
}
|
|
|
# Value for the "language" field of the project config, for the tasks that
# override it. Tasks missing from this table are submitted with "en".
AUTOTRAIN_TASK_TO_LANG = dict(
    translation="en2de",
    image_binary_classification="unk",
    image_multi_class_classification="unk",
)
|
|
|
|
|
# Task names offered in the task selector, in TASK_TO_ID insertion order.
SUPPORTED_TASKS = list(TASK_TO_ID)
|
|
|
|
|
|
|
# Metric identifiers the user may add on top of a task's default metrics.
SUPPORTED_METRICS = [
    # Identifiers without a namespace prefix.
    "accuracy",
    "bertscore",
    "bleu",
    "cer",
    "chrf",
    "code_eval",
    "comet",
    "competition_math",
    "coval",
    "cuad",
    "exact_match",
    "f1",
    "frugalscore",
    "google_bleu",
    "mae",
    "mahalanobis",
    "matthews_correlation",
    "mean_iou",
    "meteor",
    "mse",
    "pearsonr",
    "perplexity",
    "precision",
    "recall",
    "roc_auc",
    "rouge",
    "sacrebleu",
    "sari",
    "seqeval",
    "spearmanr",
    "squad",
    "squad_v2",
    "ter",
    "trec_eval",
    "wer",
    "wiki_split",
    "xnli",
] + [
    # Identifiers of the form `<namespace>/<name>`.
    "angelina-wang/directional_bias_amplification",
    "jordyvl/ece",
    "lvwerra/ai4code",
    "lvwerra/amex",
    "lvwerra/test",
    "lvwerra/test_metric",
]
|
|
|
|
|
|
|
|
|
|
|
# Page title and introduction rendered at the top of the app.
# The emoji in the intro text were mis-encoded in this file and are restored here.
st.title("Evaluation on the Hub")
st.markdown(
    """
Welcome to Hugging Face's automatic model evaluator 👋!

This application allows you to evaluate 🤗 Transformers
[models](https://huggingface.co/models?library=transformers&sort=downloads)
across a wide variety of [datasets](https://huggingface.co/datasets) on the
Hub. Please select the dataset and configuration below. The results of your
evaluation will be displayed on the [public
leaderboard](https://huggingface.co/spaces/autoevaluate/leaderboards).
"""
)
|
|
|
# Offer every dataset id returned by the Hub listing.
all_datasets = [dataset_info.id for dataset_info in list_datasets()]

# A `dataset` query parameter pre-selects that dataset when its first value is a
# known id; otherwise the first listed dataset is the default.
query_params = st.experimental_get_query_params()
requested = query_params.get("dataset", [])
if requested and requested[0] in all_datasets:
    default_dataset = requested[0]
else:
    default_dataset = all_datasets[0]

selected_dataset = st.selectbox(
    "Select a dataset",
    all_datasets,
    index=all_datasets.index(default_dataset),
    help=(
        "Datasets with metadata can be evaluated with 1-click. Check out the "
        "[documentation](https://huggingface.co/docs/hub/datasets-cards) to add "
        "evaluation metadata to a dataset."
    ),
)
# Write the current choice back into the query parameters.
st.experimental_set_query_params(dataset=[selected_dataset])
|
|
|
|
|
# Ask the datasets preview service whether the selected dataset is valid.
validity_resp = http_get(
    path="/is-valid",
    domain=DATASETS_PREVIEW_API,
    params={"dataset": selected_dataset},
)
is_valid_dataset = validity_resp.json()

# Only an explicit `False` triggers the message; the script carries on afterwards.
if is_valid_dataset["valid"] is False:
    st.error(
        "The dataset you selected is not currently supported. Open a "
        "[discussion](https://huggingface.co/spaces/autoevaluate/model-evaluator/discussions) for support."
    )
|
|
|
# Evaluation metadata for the dataset; None means the user has to configure the
# job by hand in the advanced section.
metadata = get_metadata(selected_dataset)
print("INFO -- Dataset metadata: {}".format(metadata))
if metadata is None:
    st.warning("No evaluation metadata found. Please configure the evaluation job below.")
|
|
|
def _index_or_default(options, value, default=0):
    """Return the position of ``value`` in ``options``, or ``default`` when absent.

    Used to pre-select selectbox entries from dataset metadata without raising
    when the metadata names something that is not among the offered options.

    Args:
        options: List of selectbox options to search.
        value: Entry to locate.
        default: Index returned when ``value`` is not in ``options``.

    Returns:
        The index to pre-select.
    """
    try:
        return options.index(value)
    except ValueError:
        return default


# For every task: the standardised column names the evaluator expects, each paired
# with the prompt of the selectbox that maps a dataset column onto it, in display order.
_TEXT_CLASSIFICATION_COLUMNS = (
    ("text", "This column should contain the text to be classified"),
    ("target", "This column should contain the labels associated with the text"),
)
_IMAGE_CLASSIFICATION_COLUMNS = (
    ("image", "This column should contain the images to be classified"),
    ("target", "This column should contain the labels associated with the images"),
)
TASK_TO_COLUMN_PROMPTS = {
    "binary_classification": _TEXT_CLASSIFICATION_COLUMNS,
    "multi_class_classification": _TEXT_CLASSIFICATION_COLUMNS,
    "entity_extraction": (
        ("tokens", "This column should contain the array of tokens to be classified"),
        ("tags", "This column should contain the labels associated with each part of the text"),
    ),
    "translation": (
        ("source", "This column should contain the text to be translated"),
        ("target", "This column should contain the target translation"),
    ),
    "summarization": (
        ("text", "This column should contain the text to be summarized"),
        ("target", "This column should contain the target summary"),
    ),
    "extractive_question_answering": (
        ("context", "This column should contain the question's context"),
        ("question", "This column should contain the question to be answered, given the context"),
        (
            "answers.text",
            "This column should contain example answers to the question, extracted from the context",
        ),
        (
            "answers.answer_start",
            "This column should contain the indices in the context of the first character of each `answers.text`",
        ),
    ),
    "image_binary_classification": _IMAGE_CLASSIFICATION_COLUMNS,
    "image_multi_class_classification": _IMAGE_CLASSIFICATION_COLUMNS,
}


with st.expander("Advanced configuration"):
    # --- Task: pre-select the metadata's task when it is one of the supported tasks.
    selected_task = st.selectbox(
        "Select a task",
        SUPPORTED_TASKS,
        index=_index_or_default(SUPPORTED_TASKS, metadata[0]["task_id"]) if metadata is not None else 0,
        help=(
            "Don't see your favourite task here? Open a "
            "[discussion](https://huggingface.co/spaces/autoevaluate/model-evaluator/discussions) to request it!"
        ),
    )

    # --- Config
    configs = get_dataset_config_names(selected_dataset)
    selected_config = st.selectbox(
        "Select a config",
        configs,
        help=(
            "Some datasets contain several sub-datasets, known as _configurations_. "
            "Select one to evaluate your models on. "
            "See the [docs](https://huggingface.co/docs/datasets/master/en/load_hub#configurations) for more details.\n"
        ),
    )

    # --- Split: everything below needs a split, so halt this run with a visible
    # error instead of hitting a NameError when the splits request fails.
    splits_resp = http_get(
        path="/splits",
        domain=DATASETS_PREVIEW_API,
        params={"dataset": selected_dataset},
    )
    if splits_resp.status_code != 200:
        st.error(
            f"Could not retrieve the splits of `{selected_dataset}` "
            f"(HTTP status {splits_resp.status_code})."
        )
        st.stop()

    all_splits = splits_resp.json()
    split_names = [split["split"] for split in all_splits["splits"] if split["config"] == selected_config]
    eval_split = metadata[0]["splits"].get("eval_split", None) if metadata is not None else None
    selected_split = st.selectbox(
        "Select a split",
        split_names,
        # Falls back to the first split when the metadata names no eval split or
        # names one that the selected config does not have.
        index=_index_or_default(split_names, eval_split),
        help="Be wary when evaluating models on the `train` split.",
    )

    # --- Columns: derive the selectable column names from the first returned row.
    rows_resp = http_get(
        path="/rows",
        domain=DATASETS_PREVIEW_API,
        params={
            "dataset": selected_dataset,
            "config": selected_config,
            "split": selected_split,
        },
    ).json()
    col_names = list(pd.json_normalize(rows_resp["rows"][0]["row"]).columns)

    st.markdown("**Map your dataset columns**")
    st.markdown(
        "The model evaluator uses a standardised set of column names for the input examples and labels. "
        "Please define the mapping between your dataset columns (right) and the standardised column names (left)."
    )
    col1, col2 = st.columns(2)

    # Selectbox defaults come from the metadata's column mapping when metadata exists.
    # For extractive question answering the mapping is first passed through
    # format_col_mapping and also seeds the resulting col_mapping; every other task
    # starts from an empty mapping.
    col_mapping = {}
    metadata_col_mapping = None
    if metadata is not None:
        metadata_col_mapping = metadata[0]["col_mapping"]
        if selected_task == "extractive_question_answering":
            metadata_col_mapping = format_col_mapping(metadata_col_mapping)
            col_mapping = metadata_col_mapping

    column_prompts = TASK_TO_COLUMN_PROMPTS.get(selected_task, ())

    # Left column: the standardised names, separated by blank text elements that
    # act as vertical spacers between consecutive labels.
    with col1:
        for position, (standard_name, _prompt) in enumerate(column_prompts):
            if position > 0:
                for _spacer in range(4):
                    st.text("")
            st.markdown(f"`{standard_name}` column")

    # Right column: one selectbox per standardised name. A metadata column that is
    # not among col_names (for instance after switching to a task the metadata was
    # not written for) falls back to the first column instead of raising.
    with col2:
        chosen_columns = []
        for standard_name, prompt in column_prompts:
            if metadata is not None:
                default_index = _index_or_default(col_names, get_key(metadata_col_mapping, standard_name))
            else:
                default_index = 0
            chosen_columns.append((st.selectbox(prompt, col_names, index=default_index), standard_name))

    # Record the choices only after every selectbox has run, so the default lookups
    # above never see entries added during this run.
    for dataset_column, standard_name in chosen_columns:
        col_mapping[dataset_column] = standard_name

    # --- Metrics: show the task defaults as chips, then offer the remaining metrics.
    st.markdown("**Select metrics**")
    st.markdown("The following metrics will be computed")
    metric_chips = [
        '<div style="padding-right:5px;padding-left:5px;padding-top:5px;padding-bottom:5px;float:left">'
        '<div style="background-color:#D3D3D3;border-radius:5px;display:inline-block;padding-right:5px;'
        f'padding-left:5px;color:white">{metric}</div></div>'
        for metric in TASK_TO_DEFAULT_METRICS[selected_task]
    ]
    html_string = " ".join(metric_chips)
    st.markdown(html_string, unsafe_allow_html=True)
    selected_metrics = st.multiselect(
        "(Optional) Select additional metrics",
        sorted(set(SUPPORTED_METRICS) - set(TASK_TO_DEFAULT_METRICS[selected_task])),
        help=(
            "User-selected metrics will be computed with their default arguments. "
            "For example, `f1` will report results for binary labels. "
            "Check out the [available metrics](https://huggingface.co/metrics) for more details."
        ),
    )
|
|
|
# Submission form: pick models, then create the project, attach the dataset and
# start the job on the AutoTrain backend when the form is submitted.
# The emoji in the user-facing strings were mis-encoded in this file (one of them
# split a string literal across two lines) and are restored here.
with st.form(key="form"):
    compatible_models = get_compatible_models(selected_task, [selected_dataset])
    selected_models = st.multiselect(
        "Select the models you wish to evaluate",
        compatible_models,
        help=(
            "Don't see your favourite model in this list? Add the dataset and task it was trained on to the "
            "[model card metadata.](https://huggingface.co/docs/hub/models-cards#model-card-metadata)"
        ),
    )
    print("INFO -- Selected models before filter:", selected_models)

    # Narrow the selection with filter_evaluated_models for this exact
    # task / dataset / config / split combination.
    if selected_models:
        selected_models = filter_evaluated_models(
            selected_models,
            selected_task,
            selected_dataset,
            selected_config,
            selected_split,
        )
    print("INFO -- Selected models after filter:", selected_models)

    hf_username = st.text_input("Enter your 🤗 Hub username to be notified when the evaluation is finished")

    submit_button = st.form_submit_button("Evaluate models 🚀")

    if submit_button:
        if selected_models:
            # Short random suffix used in the project name and shown to the user.
            project_id = str(uuid.uuid4())[:8]
            project_payload = {
                "username": AUTOTRAIN_USERNAME,
                "proj_name": f"eval-project-{project_id}",
                "task": TASK_TO_ID[selected_task],
                "config": {
                    # Tasks without an explicit entry are submitted as English.
                    "language": AUTOTRAIN_TASK_TO_LANG.get(selected_task, "en"),
                    "max_models": 5,
                    "instance": {
                        "provider": "aws",
                        "instance_type": "ml.g4dn.4xlarge",
                        "max_runtime_seconds": 172800,
                        "num_instances": 1,
                        "disk_size_gb": 150,
                    },
                    "evaluation": {
                        "metrics": selected_metrics,
                        "models": selected_models,
                        "hf_username": hf_username,
                    },
                },
            }
            print(f"INFO -- Payload: {project_payload}")

            # Step 1: create the project.
            project_json_resp = http_post(
                path="/projects/create",
                payload=project_payload,
                token=HF_TOKEN,
                domain=AUTOTRAIN_BACKEND_API,
            ).json()
            print(f"INFO -- Project creation response: {project_json_resp}")

            # The job counts as submitted only when all three backend steps succeed;
            # a failure at any step falls through to the error message below.
            job_submitted = False
            if project_json_resp["created"]:
                # Step 2: attach the selected dataset config/split to the project.
                data_payload = {
                    # NOTE(review): the meaning of split=4 is defined by the backend — confirm.
                    "split": 4,
                    "col_mapping": col_mapping,
                    "load_config": {"max_size_bytes": 0, "shuffle": False},
                }
                data_json_resp = http_post(
                    path=f"/projects/{project_json_resp['id']}/data/{selected_dataset}",
                    payload=data_payload,
                    token=HF_TOKEN,
                    domain=AUTOTRAIN_BACKEND_API,
                    params={
                        "type": "dataset",
                        "config_name": selected_config,
                        "split_name": selected_split,
                    },
                ).json()
                print(f"INFO -- Dataset creation response: {data_json_resp}")

                if data_json_resp["download_status"] == 1:
                    # Step 3: start processing.
                    train_json_resp = http_get(
                        path=f"/projects/{project_json_resp['id']}/data/start_process",
                        token=HF_TOKEN,
                        domain=AUTOTRAIN_BACKEND_API,
                    ).json()
                    print(f"INFO -- AutoTrain job response: {train_json_resp}")
                    job_submitted = bool(train_json_resp["success"])

            if job_submitted:
                st.success(f"✅ Successfully submitted evaluation job with project ID {project_id}")
                st.markdown(
                    f"""
Evaluation can take up to 1 hour to complete, so grab a ☕ or 🍵 while you wait:

📊 Click [here](https://hf.co/spaces/autoevaluate/leaderboards?dataset={selected_dataset}) \
to view the results from your submission
"""
                )
                print("INFO -- Pushing evaluation job logs to the Hub")
                evaluation_log = {
                    "payload": project_payload,
                    "project_creation_response": project_json_resp,
                    "dataset_creation_response": data_json_resp,
                    "autotrain_job_response": train_json_resp,
                }
                commit_evaluation_log(evaluation_log, hf_access_token=HF_TOKEN)
            else:
                st.error("🙈 Oh no, there was an error submitting your evaluation job!")
        else:
            st.warning("⚠️ No models were selected for evaluation!")
|
|