Spaces:
AIR-Bench
/
Running on CPU Upgrade

leaderboard / app.py
nan's picture
feat-add-versions-to-benchmarks (#28)
3478401 verified
raw
history blame
23.7 kB
import os
import gradio as gr
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import snapshot_download
from src.about import BENCHMARKS_TEXT, EVALUATION_QUEUE_TEXT, INTRODUCTION_TEXT, TITLE
from src.benchmarks import LongDocBenchmarks, QABenchmarks
from src.columns import COL_NAME_RERANKING_MODEL, COL_NAME_RETRIEVAL_MODEL
from src.components import (
get_anonymous_checkbox,
get_domain_dropdown,
get_language_dropdown,
get_leaderboard_table,
get_metric_dropdown,
get_noreranking_dropdown,
get_reranking_dropdown,
get_revision_and_ts_checkbox,
get_search_bar,
get_version_dropdown,
)
from src.css_html_js import custom_css
from src.envs import (
API,
BENCHMARK_VERSION_LIST,
DEFAULT_METRIC_LONG_DOC,
DEFAULT_METRIC_QA,
EVAL_RESULTS_PATH,
LATEST_BENCHMARK_VERSION,
METRIC_LIST,
REPO_ID,
RESULTS_REPO,
TOKEN,
)
from src.loaders import load_eval_results
from src.models import TaskType, model_hyperlink
from src.utils import remove_html, reset_rank, set_listeners, submit_results, update_metric, upload_file
def restart_space():
    """Call ``API.restart_space`` for the repository named by ``REPO_ID``.

    Used both as the fallback when the initial results download fails and as
    the job run periodically by the background scheduler.
    """
    target_repo = REPO_ID
    API.restart_space(repo_id=target_repo)
# Populate EVAL_RESULTS_PATH with the results dataset, then load every
# benchmark version into memory.
#
# LOCAL_MODE: any non-empty value of this environment variable means "use the
# results already on disk" and skips the Hub download. The original code
# printed "Running in local mode" on the *opposite* branch (when LOCAL_MODE was
# unset); the message now matches the branch that is actually taken.
# NOTE(review): the download is treated as belonging to the non-local branch;
# behaviour with LOCAL_MODE unset (the deployed Space) is unchanged either way.
try:
    if os.environ.get("LOCAL_MODE", False):
        print("Running in local mode")
    else:
        snapshot_download(
            repo_id=RESULTS_REPO,
            local_dir=EVAL_RESULTS_PATH,
            repo_type="dataset",
            tqdm_class=None,
            etag_timeout=30,
            token=TOKEN,
        )
except Exception as exc:
    # Best effort: report why the download failed, then ask for a Space
    # restart so the download is attempted again on the next boot.
    print(f"failed to download: {exc!r}")
    restart_space()

# Mapping of benchmark version -> loaded results, as returned by
# load_eval_results. (Module-level names are already global, so no ``global``
# statement is needed here.)
ds_dict = load_eval_results(EVAL_RESULTS_PATH)

# Datastore currently being displayed; rebound by update_qa_version /
# update_doc_version when the user picks another benchmark version.
datastore = ds_dict[LATEST_BENCHMARK_VERSION]
def update_qa_metric(
    metric: str,
    domains: list,
    langs: list,
    reranking_model: list,
    query: str,
    show_anonymous: bool,
    show_revision_and_timestamp: bool,
):
    """Callback for the QA metric dropdown.

    Forwards the current UI selections to ``update_metric`` together with the
    module-level ``datastore`` and ``TaskType.qa``, and returns whatever
    ``update_metric`` returns (wired to a QA leaderboard table by the caller).
    """
    # Positional order must match the original call into update_metric.
    selections = (
        metric,
        domains,
        langs,
        reranking_model,
        query,
        show_anonymous,
        show_revision_and_timestamp,
    )
    return update_metric(datastore, TaskType.qa, *selections)
def update_doc_metric(
    metric: str,
    domains: list,
    langs: list,
    reranking_model: list,
    query: str,
    show_anonymous: bool,
    show_revision_and_timestamp: bool,
):
    """Callback for the Long Doc metric dropdown.

    Forwards the current UI selections to ``update_metric`` together with the
    module-level ``datastore`` and ``TaskType.long_doc``, and returns whatever
    ``update_metric`` returns (wired to a Long Doc leaderboard table by the
    caller).

    ``show_revision_and_timestamp`` is annotated ``bool`` to match the
    parallel ``update_qa_metric`` signature. ``datastore`` is only read here,
    so no ``global`` declaration is required.
    """
    return update_metric(
        datastore,
        TaskType.long_doc,
        metric,
        domains,
        langs,
        reranking_model,
        query,
        show_anonymous,
        show_revision_and_timestamp,
    )
def update_qa_version(version):
    """Switch the active datastore to ``version`` and rebuild the QA widgets.

    Rebinds the module-level ``datastore`` to ``ds_dict[version]`` (a
    ``KeyError`` propagates if the version is not loaded) and returns, in
    order: domain dropdown, language dropdown, reranking-model dropdown,
    the formatted QA leaderboard table, and a hidden table built from the raw
    QA dataframe.

    NOTE(review): ``datastore`` is a module-level global, so this selection
    presumably affects every session served by this process - confirm.
    """
    global datastore
    datastore = ds_dict[version]

    qa_benchmarks = QABenchmarks[datastore.slug]
    return (
        get_domain_dropdown(qa_benchmarks),
        get_language_dropdown(qa_benchmarks),
        get_reranking_dropdown(datastore.reranking_models),
        get_leaderboard_table(datastore.qa_fmt_df, datastore.qa_types),
        get_leaderboard_table(datastore.qa_raw_df, datastore.qa_types, visible=False),
    )
def update_doc_version(version):
    """Switch the active datastore to ``version`` and rebuild the Long Doc widgets.

    Rebinds the module-level ``datastore`` to ``ds_dict[version]`` (a
    ``KeyError`` propagates if the version is not loaded) and returns, in
    order: domain dropdown, language dropdown, reranking-model dropdown,
    the formatted Long Doc leaderboard table, and a hidden table built from
    the raw Long Doc dataframe.

    NOTE(review): ``datastore`` is a module-level global, so this selection
    presumably affects every session served by this process - confirm.
    """
    global datastore
    datastore = ds_dict[version]

    doc_benchmarks = LongDocBenchmarks[datastore.slug]
    return (
        get_domain_dropdown(doc_benchmarks),
        get_language_dropdown(doc_benchmarks),
        get_reranking_dropdown(datastore.reranking_models),
        get_leaderboard_table(datastore.doc_fmt_df, datastore.doc_types),
        get_leaderboard_table(datastore.doc_raw_df, datastore.doc_types, visible=False),
    )
# ---------------------------------------------------------------------------
# Gradio UI definition.
#
# Everything below builds the component tree at import time, using the
# module-level ``datastore`` loaded above for the initial table contents.
#
# NOTE(review): the nesting of the ``with`` blocks is reconstructed from the
# statement order; in particular the "QA" and "Long Doc" tab items are assumed
# to live inside the "Results" tab item, next to the version dropdown - confirm.
#
# NOTE(review): the names ``domains``, ``langs``, ``metric``, ``show_anonymous``,
# ``show_rev_ts``, ``search_bar``, ``models``, ``search_bar_ret`` and
# ``models_ret`` are bound once in the QA tab and rebound in the Long Doc tab.
# Each listener is registered before the rebinding, so statement order matters.
# ---------------------------------------------------------------------------
demo = gr.Blocks(css=custom_css)

# Hyperlink markup for BM25; compared against the retrieval-model column to
# select the rows shown in the "Reranking Only" tabs.
BM25_LINK = model_hyperlink("https://github.com/castorini/pyserini", "BM25")

with demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("Results", elem_id="results-tab-table"):
            # Benchmark version selector; every results sub-tab below
            # registers its own ``version.change`` handler on it.
            with gr.Row():
                version = get_version_dropdown()

            # ------------------------------ QA ------------------------------
            with gr.TabItem("QA", elem_id="qa-benchmark-tab-table", id=0):
                with gr.Row():
                    with gr.Column(min_width=320):
                        # select domain
                        with gr.Row():
                            domains = get_domain_dropdown(QABenchmarks[datastore.slug])
                        # select language
                        with gr.Row():
                            langs = get_language_dropdown(QABenchmarks[datastore.slug])
                    with gr.Column():
                        # select the metric
                        metric = get_metric_dropdown(METRIC_LIST, DEFAULT_METRIC_QA)
                        with gr.Row():
                            show_anonymous = get_anonymous_checkbox()
                        with gr.Row():
                            show_rev_ts = get_revision_and_ts_checkbox()
                with gr.Tabs(elem_classes="tab-buttons") as sub_tabs:
                    with gr.TabItem("Retrieval + Reranking", id=10):
                        with gr.Row():
                            # search retrieval models
                            with gr.Column():
                                search_bar = get_search_bar()
                            # select reranking models
                            with gr.Column():
                                models = get_reranking_dropdown(datastore.reranking_models)
                        # shown_table
                        qa_df_elem_ret_rerank = get_leaderboard_table(datastore.qa_fmt_df, datastore.qa_types)
                        # Dummy leaderboard for handling the case when the user uses backspace key
                        qa_df_elem_ret_rerank_hidden = get_leaderboard_table(
                            datastore.qa_raw_df, datastore.qa_types, visible=False
                        )

                        # Rebuild filters and both tables when the version changes.
                        version.change(
                            update_qa_version,
                            version,
                            [domains, langs, models, qa_df_elem_ret_rerank, qa_df_elem_ret_rerank_hidden],
                        )

                        set_listeners(
                            TaskType.qa,
                            qa_df_elem_ret_rerank,
                            qa_df_elem_ret_rerank_hidden,
                            search_bar,
                            version,
                            domains,
                            langs,
                            models,
                            show_anonymous,
                            show_rev_ts,
                        )

                        # set metric listener
                        metric.change(
                            update_qa_metric,
                            [metric, domains, langs, models, search_bar, show_anonymous, show_rev_ts],
                            qa_df_elem_ret_rerank,
                            queue=True,
                        )

                    with gr.TabItem("Retrieval Only", id=11):
                        with gr.Row():
                            with gr.Column(scale=1):
                                search_bar_ret = get_search_bar()
                            with gr.Column(scale=1):
                                models_ret = get_noreranking_dropdown()

                        # Keep only rows whose reranking model is "NoReranker",
                        # then recompute the rank column via reset_rank.
                        _qa_df_ret = datastore.qa_fmt_df[datastore.qa_fmt_df[COL_NAME_RERANKING_MODEL] == "NoReranker"]
                        _qa_df_ret = reset_rank(_qa_df_ret)
                        qa_df_elem_ret = get_leaderboard_table(_qa_df_ret, datastore.qa_types)

                        # Dummy leaderboard for handling the case when the user uses backspace key
                        _qa_df_ret_hidden = datastore.qa_raw_df[
                            datastore.qa_raw_df[COL_NAME_RERANKING_MODEL] == "NoReranker"
                        ]
                        _qa_df_ret_hidden = reset_rank(_qa_df_ret_hidden)
                        qa_df_elem_ret_hidden = get_leaderboard_table(
                            _qa_df_ret_hidden, datastore.qa_types, visible=False
                        )

                        # NOTE(review): update_qa_version returns tables built from the
                        # unfiltered qa_fmt_df / qa_raw_df and a dropdown of all
                        # reranking models, so the "NoReranker" filter applied above
                        # appears to be lost after a version change - confirm.
                        version.change(
                            update_qa_version,
                            version,
                            [
                                domains,
                                langs,
                                models_ret,
                                qa_df_elem_ret,
                                qa_df_elem_ret_hidden,
                            ],
                        )

                        set_listeners(
                            TaskType.qa,
                            qa_df_elem_ret,
                            qa_df_elem_ret_hidden,
                            search_bar_ret,
                            version,
                            domains,
                            langs,
                            models_ret,
                            show_anonymous,
                            show_rev_ts,
                        )

                        metric.change(
                            update_qa_metric,
                            [
                                metric,
                                domains,
                                langs,
                                models_ret,
                                search_bar_ret,
                                show_anonymous,
                                show_rev_ts,
                            ],
                            qa_df_elem_ret,
                            queue=True,
                        )

                    with gr.TabItem("Reranking Only", id=12):
                        # Keep only rows whose retrieval model equals the BM25 link,
                        # then recompute the rank column via reset_rank.
                        _qa_df_rerank = datastore.qa_fmt_df[datastore.qa_fmt_df[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK]
                        _qa_df_rerank = reset_rank(_qa_df_rerank)
                        # Dropdown choices: distinct reranking-model values after remove_html.
                        qa_rerank_models = _qa_df_rerank[COL_NAME_RERANKING_MODEL].apply(remove_html).unique().tolist()
                        with gr.Row():
                            with gr.Column(scale=1):
                                qa_models_rerank = get_reranking_dropdown(qa_rerank_models)
                            with gr.Column(scale=1):
                                # Invisible textbox passed where the other tabs pass a search bar.
                                qa_search_bar_rerank = gr.Textbox(show_label=False, visible=False)
                        qa_df_elem_rerank = get_leaderboard_table(_qa_df_rerank, datastore.qa_types)

                        _qa_df_rerank_hidden = datastore.qa_raw_df[
                            datastore.qa_raw_df[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK
                        ]
                        _qa_df_rerank_hidden = reset_rank(_qa_df_rerank_hidden)
                        qa_df_elem_rerank_hidden = get_leaderboard_table(
                            _qa_df_rerank_hidden, datastore.qa_types, visible=False
                        )

                        # NOTE(review): as in "Retrieval Only", update_qa_version returns
                        # unfiltered tables, so the BM25 filter appears to be lost after a
                        # version change - confirm.
                        version.change(
                            update_qa_version,
                            version,
                            [domains, langs, qa_models_rerank, qa_df_elem_rerank, qa_df_elem_rerank_hidden],
                        )

                        set_listeners(
                            TaskType.qa,
                            qa_df_elem_rerank,
                            qa_df_elem_rerank_hidden,
                            qa_search_bar_rerank,
                            version,
                            domains,
                            langs,
                            qa_models_rerank,
                            show_anonymous,
                            show_rev_ts,
                        )

                        metric.change(
                            update_qa_metric,
                            [
                                metric,
                                domains,
                                langs,
                                qa_models_rerank,
                                qa_search_bar_rerank,
                                show_anonymous,
                                show_rev_ts,
                            ],
                            qa_df_elem_rerank,
                            queue=True,
                        )

            # ---------------------------- Long Doc ----------------------------
            # From here on the shared names (domains, langs, metric, ...) refer
            # to the Long Doc components, not the QA ones created above.
            with gr.TabItem("Long Doc", elem_id="long-doc-benchmark-tab-table", id=1):
                with gr.Row():
                    with gr.Column(min_width=320):
                        # select domain
                        with gr.Row():
                            domains = get_domain_dropdown(LongDocBenchmarks[datastore.slug])
                        # select language
                        with gr.Row():
                            langs = get_language_dropdown(LongDocBenchmarks[datastore.slug])
                    with gr.Column():
                        # select the metric
                        with gr.Row():
                            metric = get_metric_dropdown(METRIC_LIST, DEFAULT_METRIC_LONG_DOC)
                        with gr.Row():
                            show_anonymous = get_anonymous_checkbox()
                        with gr.Row():
                            show_rev_ts = get_revision_and_ts_checkbox()
                with gr.Tabs(elem_classes="tab-buttons"):
                    with gr.TabItem("Retrieval + Reranking", id=20):
                        with gr.Row():
                            with gr.Column():
                                search_bar = get_search_bar()
                            with gr.Column():
                                models = get_reranking_dropdown(datastore.reranking_models)

                        doc_df_elem_ret_rerank = get_leaderboard_table(datastore.doc_fmt_df, datastore.doc_types)

                        # Dummy leaderboard for handling the case when the user uses backspace key
                        doc_df_elem_ret_rerank_hidden = get_leaderboard_table(
                            datastore.doc_raw_df, datastore.doc_types, visible=False
                        )

                        # Rebuild filters and both tables when the version changes.
                        version.change(
                            update_doc_version,
                            version,
                            [domains, langs, models, doc_df_elem_ret_rerank, doc_df_elem_ret_rerank_hidden],
                        )

                        set_listeners(
                            TaskType.long_doc,
                            doc_df_elem_ret_rerank,
                            doc_df_elem_ret_rerank_hidden,
                            search_bar,
                            version,
                            domains,
                            langs,
                            models,
                            show_anonymous,
                            show_rev_ts,
                        )

                        # set metric listener
                        metric.change(
                            update_doc_metric,
                            [
                                metric,
                                domains,
                                langs,
                                models,
                                search_bar,
                                show_anonymous,
                                show_rev_ts,
                            ],
                            doc_df_elem_ret_rerank,
                            queue=True,
                        )

                    with gr.TabItem("Retrieval Only", id=21):
                        with gr.Row():
                            with gr.Column(scale=1):
                                search_bar_ret = get_search_bar()
                            with gr.Column(scale=1):
                                models_ret = get_noreranking_dropdown()

                        # Keep only rows whose reranking model is "NoReranker",
                        # then recompute the rank column via reset_rank.
                        _doc_df_ret = datastore.doc_fmt_df[
                            datastore.doc_fmt_df[COL_NAME_RERANKING_MODEL] == "NoReranker"
                        ]
                        _doc_df_ret = reset_rank(_doc_df_ret)
                        doc_df_elem_ret = get_leaderboard_table(_doc_df_ret, datastore.doc_types)

                        # Hidden counterpart built from the raw dataframe with the same filter.
                        _doc_df_ret_hidden = datastore.doc_raw_df[
                            datastore.doc_raw_df[COL_NAME_RERANKING_MODEL] == "NoReranker"
                        ]
                        _doc_df_ret_hidden = reset_rank(_doc_df_ret_hidden)
                        doc_df_elem_ret_hidden = get_leaderboard_table(
                            _doc_df_ret_hidden, datastore.doc_types, visible=False
                        )

                        # NOTE(review): update_doc_version returns unfiltered tables, so the
                        # "NoReranker" filter appears to be lost after a version change - confirm.
                        version.change(
                            update_doc_version,
                            version,
                            [domains, langs, models_ret, doc_df_elem_ret, doc_df_elem_ret_hidden],
                        )

                        set_listeners(
                            TaskType.long_doc,
                            doc_df_elem_ret,
                            doc_df_elem_ret_hidden,
                            search_bar_ret,
                            version,
                            domains,
                            langs,
                            models_ret,
                            show_anonymous,
                            show_rev_ts,
                        )

                        metric.change(
                            update_doc_metric,
                            [
                                metric,
                                domains,
                                langs,
                                models_ret,
                                search_bar_ret,
                                show_anonymous,
                                show_rev_ts,
                            ],
                            doc_df_elem_ret,
                            queue=True,
                        )

                    with gr.TabItem("Reranking Only", id=22):
                        # Keep only rows whose retrieval model equals the BM25 link,
                        # then recompute the rank column via reset_rank.
                        _doc_df_rerank = datastore.doc_fmt_df[
                            datastore.doc_fmt_df[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK
                        ]
                        _doc_df_rerank = reset_rank(_doc_df_rerank)
                        # Dropdown choices: distinct reranking-model values after remove_html.
                        doc_rerank_models = (
                            _doc_df_rerank[COL_NAME_RERANKING_MODEL].apply(remove_html).unique().tolist()
                        )
                        with gr.Row():
                            with gr.Column(scale=1):
                                doc_models_rerank = get_reranking_dropdown(doc_rerank_models)
                            with gr.Column(scale=1):
                                # Invisible textbox passed where the other tabs pass a search bar.
                                doc_search_bar_rerank = gr.Textbox(show_label=False, visible=False)
                        doc_df_elem_rerank = get_leaderboard_table(_doc_df_rerank, datastore.doc_types)
                        _doc_df_rerank_hidden = datastore.doc_raw_df[
                            datastore.doc_raw_df[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK
                        ]
                        _doc_df_rerank_hidden = reset_rank(_doc_df_rerank_hidden)
                        doc_df_elem_rerank_hidden = get_leaderboard_table(
                            _doc_df_rerank_hidden, datastore.doc_types, visible=False
                        )

                        # NOTE(review): update_doc_version returns unfiltered tables, so the
                        # BM25 filter appears to be lost after a version change - confirm.
                        version.change(
                            update_doc_version,
                            version,
                            [domains, langs, doc_models_rerank, doc_df_elem_rerank, doc_df_elem_rerank_hidden],
                        )

                        set_listeners(
                            TaskType.long_doc,
                            doc_df_elem_rerank,
                            doc_df_elem_rerank_hidden,
                            doc_search_bar_rerank,
                            version,
                            domains,
                            langs,
                            doc_models_rerank,
                            show_anonymous,
                            show_rev_ts,
                        )

                        metric.change(
                            update_doc_metric,
                            [
                                metric,
                                domains,
                                langs,
                                doc_models_rerank,
                                doc_search_bar_rerank,
                                show_anonymous,
                                show_rev_ts,
                            ],
                            doc_df_elem_rerank,
                            queue=True,
                        )

        # ------------------------------ Submit ------------------------------
        with gr.TabItem("🚀Submit here!", elem_id="submit-tab-table", id=2):
            with gr.Column():
                with gr.Row():
                    gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
                with gr.Row():
                    gr.Markdown("## ✉️Submit your model here!", elem_classes="markdown-text")
                with gr.Row():
                    with gr.Column():
                        model_name = gr.Textbox(label="Retrieval Method name")
                    with gr.Column():
                        model_url = gr.Textbox(label="Retrieval Method URL")
                with gr.Row():
                    with gr.Column():
                        # Pre-filled with "NoReranker", the same value the
                        # "Retrieval Only" tabs filter on.
                        reranking_model_name = gr.Textbox(
                            label="Reranking Model name", info="Optional", value="NoReranker"
                        )
                    with gr.Column():
                        reranking_model_url = gr.Textbox(label="Reranking Model URL", info="Optional", value="")
                with gr.Row():
                    with gr.Column():
                        benchmark_version = gr.Dropdown(
                            BENCHMARK_VERSION_LIST,
                            value=LATEST_BENCHMARK_VERSION,
                            interactive=True,
                            label="AIR-Bench Version",
                        )
                with gr.Row():
                    upload_button = gr.UploadButton("Click to upload search results", file_count="single")
                with gr.Row():
                    file_output = gr.File()
                with gr.Row():
                    is_anonymous = gr.Checkbox(
                        label="Nope. I want to submit anonymously 🥷",
                        value=False,
                        info="Do you want to shown on the leaderboard by default?",
                    )
                with gr.Row():
                    submit_button = gr.Button("Submit")
                with gr.Row():
                    submission_result = gr.Markdown()
                # upload_file receives the uploaded file; its result fills file_output.
                upload_button.upload(
                    upload_file,
                    [
                        upload_button,
                    ],
                    file_output,
                )
                # submit_results receives the form fields in this order; its
                # result is rendered in the submission_result Markdown.
                submit_button.click(
                    submit_results,
                    [
                        file_output,
                        model_name,
                        model_url,
                        reranking_model_name,
                        reranking_model_url,
                        benchmark_version,
                        is_anonymous,
                    ],
                    submission_result,
                    show_progress="hidden",
                )

        # ------------------------------ About -------------------------------
        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=3):
            gr.Markdown(BENCHMARKS_TEXT, elem_classes="markdown-text")
if __name__ == "__main__":
    # Schedule restart_space to run every 1800 seconds (30 minutes) in a
    # background scheduler for as long as the app is up.
    restart_interval_seconds = 1800
    restart_scheduler = BackgroundScheduler()
    restart_scheduler.add_job(restart_space, "interval", seconds=restart_interval_seconds)
    restart_scheduler.start()

    # Enable the request queue with the configured default concurrency
    # limit, then start serving.
    demo.queue(default_concurrency_limit=40)
    demo.launch()