Spaces:
AIR-Bench
/
Running on CPU Upgrade

leaderboard / app.py
hanhainebula's picture
update citation info
ae85833
import os
import gradio as gr
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import snapshot_download
from src.about import (
BENCHMARKS_TEXT, EVALUATION_QUEUE_TEXT, INTRODUCTION_TEXT, TITLE,
CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT
)
from src.benchmarks import LongDocBenchmarks, QABenchmarks
from src.columns import COL_NAME_RERANKING_MODEL, COL_NAME_RETRIEVAL_MODEL
from src.components import (
get_anonymous_checkbox,
get_domain_dropdown,
get_language_dropdown,
get_leaderboard_table,
get_metric_dropdown,
get_noreranking_dropdown,
get_reranking_dropdown,
get_revision_and_ts_checkbox,
get_search_bar,
get_version_dropdown,
)
from src.css_html_js import custom_css
from src.envs import (
API,
BENCHMARK_VERSION_LIST,
DEFAULT_METRIC_LONG_DOC,
DEFAULT_METRIC_QA,
EVAL_RESULTS_PATH,
LATEST_BENCHMARK_VERSION,
METRIC_LIST,
REPO_ID,
RESULTS_REPO,
TOKEN,
)
from src.loaders import load_eval_results
from src.models import TaskType, model_hyperlink
from src.utils import remove_html, reset_rank, set_listeners, submit_results, update_metric, upload_file
def restart_space():
    """Restart the hosting HF Space (used as a recovery hook when data loading fails)."""
    API.restart_space(repo_id=REPO_ID)
try:
    # Bug fix: the original condition was inverted — a hosted Space (no
    # LOCAL_MODE in the environment) skipped the snapshot download and printed
    # "Running in local mode", while LOCAL_MODE triggered a download. Download
    # the evaluation results only when NOT running locally.
    if not os.environ.get("LOCAL_MODE", False):
        print("Loading the data")
        snapshot_download(
            repo_id=RESULTS_REPO,
            local_dir=EVAL_RESULTS_PATH,
            repo_type="dataset",
            tqdm_class=None,
            etag_timeout=30,
            token=TOKEN,
        )
    else:
        print("Running in local mode")
except Exception:
    # Best-effort recovery: restart the Space so it retries from a clean state.
    print("failed to download")
    restart_space()

# Load every benchmark version's results; the latest version is the initially
# active datastore for the UI. (`global` is a no-op at module scope; kept for
# symmetry with the updater functions below.)
global ds_dict
ds_dict = load_eval_results(EVAL_RESULTS_PATH)
global datastore
datastore = ds_dict[LATEST_BENCHMARK_VERSION]
def update_qa_metric(
    metric: str,
    domains: list,
    langs: list,
    reranking_model: list,
    query: str,
    show_anonymous: bool,
    show_revision_and_timestamp: bool,
):
    """Recompute the QA leaderboard table for the current datastore and filters."""
    global datastore
    filters = (
        metric,
        domains,
        langs,
        reranking_model,
        query,
        show_anonymous,
        show_revision_and_timestamp,
    )
    return update_metric(datastore, TaskType.qa, *filters)
def update_doc_metric(
    metric: str,
    domains: list,
    langs: list,
    reranking_model: list,
    query: str,
    show_anonymous: bool,
    show_revision_and_timestamp: bool,  # annotation added for consistency with update_qa_metric
):
    """Recompute the Long-Doc leaderboard table for the current datastore and filters.

    Mirrors `update_qa_metric` but targets `TaskType.long_doc`.
    """
    global datastore
    return update_metric(
        datastore,
        TaskType.long_doc,
        metric,
        domains,
        langs,
        reranking_model,
        query,
        show_anonymous,
        show_revision_and_timestamp,
    )
def update_datastore(version):
    """Switch the module-global datastore to `version` (no-op if already active)."""
    global datastore
    global ds_dict
    if datastore.version == version:
        print(f"current data version: {datastore.version}")
    else:
        # Log the transition before rebinding so the old version is shown.
        print(f"updated data version: {datastore.version} -> {version}")
        datastore = ds_dict[version]
    return datastore
def update_qa_domains(version):
    """Rebuild the QA domain dropdown for the selected benchmark version."""
    store = update_datastore(version)
    return get_domain_dropdown(QABenchmarks[store.slug])
def update_doc_domains(version):
    """Rebuild the Long-Doc domain dropdown for the selected benchmark version."""
    store = update_datastore(version)
    return get_domain_dropdown(LongDocBenchmarks[store.slug])
def update_qa_langs(version):
    """Rebuild the QA language dropdown for the selected benchmark version."""
    store = update_datastore(version)
    return get_language_dropdown(QABenchmarks[store.slug])
def update_doc_langs(version):
    """Rebuild the Long-Doc language dropdown for the selected benchmark version."""
    store = update_datastore(version)
    return get_language_dropdown(LongDocBenchmarks[store.slug])
def update_qa_models(version):
    """Rebuild the reranking-model dropdown for the selected benchmark version."""
    store = update_datastore(version)
    return get_reranking_dropdown(store.reranking_models)
def update_qa_df_ret_rerank(version):
    """Refresh the visible QA retrieval+reranking leaderboard table."""
    store = update_datastore(version)
    return get_leaderboard_table(store.qa_fmt_df, store.qa_types)
def update_qa_hidden_df_ret_rerank(version):
    """Refresh the hidden (raw) QA retrieval+reranking table backing the search bar."""
    store = update_datastore(version)
    return get_leaderboard_table(store.qa_raw_df, store.qa_types, visible=False)
def update_doc_df_ret_rerank(version):
    """Refresh the visible Long-Doc retrieval+reranking leaderboard table."""
    store = update_datastore(version)
    return get_leaderboard_table(store.doc_fmt_df, store.doc_types)
def update_doc_hidden_df_ret_rerank(version):
    """Refresh the hidden (raw) Long-Doc retrieval+reranking table backing the search bar."""
    store = update_datastore(version)
    return get_leaderboard_table(store.doc_raw_df, store.doc_types, visible=False)
def filter_df_ret(df):
    """Keep only rows without a reranker ("NoReranker") and recompute ranks."""
    retrieval_only = df[df[COL_NAME_RERANKING_MODEL] == "NoReranker"]
    return reset_rank(retrieval_only)
def update_qa_df_ret(version):
    """Refresh the visible QA retrieval-only leaderboard table."""
    store = update_datastore(version)
    return get_leaderboard_table(filter_df_ret(store.qa_fmt_df), store.qa_types)
def update_qa_hidden_df_ret(version):
    """Refresh the hidden (raw) QA retrieval-only table backing the search bar."""
    store = update_datastore(version)
    hidden = filter_df_ret(store.qa_raw_df)
    return get_leaderboard_table(hidden, store.qa_types, visible=False)
def update_doc_df_ret(version):
    """Refresh the visible Long-Doc retrieval-only leaderboard table."""
    store = update_datastore(version)
    return get_leaderboard_table(filter_df_ret(store.doc_fmt_df), store.doc_types)
def update_doc_hidden_df_ret(version):
    """Refresh the hidden (raw) Long-Doc retrieval-only table backing the search bar."""
    store = update_datastore(version)
    hidden = filter_df_ret(store.doc_raw_df)
    return get_leaderboard_table(hidden, store.doc_types, visible=False)
def filter_df_rerank(df):
    """Keep only rows whose retriever is BM25 and recompute ranks.

    NOTE: `BM25_LINK` is a module-level constant assigned after this function's
    definition; it is looked up at call time, after the module body has run.
    """
    bm25_only = df[df[COL_NAME_RETRIEVAL_MODEL] == BM25_LINK]
    return reset_rank(bm25_only)
def update_qa_df_rerank(version):
    """Refresh the visible QA reranking-only leaderboard table."""
    store = update_datastore(version)
    return get_leaderboard_table(filter_df_rerank(store.qa_fmt_df), store.qa_types)
def update_qa_hidden_df_rerank(version):
    """Refresh the hidden (raw) QA reranking-only table backing the search bar."""
    store = update_datastore(version)
    hidden = filter_df_rerank(store.qa_raw_df)
    return get_leaderboard_table(hidden, store.qa_types, visible=False)
def update_doc_df_rerank(version):
    """Refresh the visible Long-Doc reranking-only leaderboard table."""
    store = update_datastore(version)
    return get_leaderboard_table(filter_df_rerank(store.doc_fmt_df), store.doc_types)
def update_doc_hidden_df_rerank(version):
    """Refresh the hidden (raw) Long-Doc reranking-only table backing the search bar."""
    store = update_datastore(version)
    hidden = filter_df_rerank(store.doc_raw_df)
    return get_leaderboard_table(hidden, store.doc_types, visible=False)
# Root Blocks container; BM25_LINK must be assigned before the UI below calls
# filter_df_rerank (which reads it) during layout construction.
demo = gr.Blocks(css=custom_css)
BM25_LINK = model_hyperlink("https://github.com/castorini/pyserini", "BM25")
with demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        # ---------------- Results tab: QA and Long-Doc leaderboards ----------------
        with gr.TabItem("Results", elem_id="results-tab-table"):
            with gr.Row():
                # Benchmark-version selector shared by all sub-tabs; every
                # version.change below re-renders one control or table from it.
                version = get_version_dropdown()
            with gr.TabItem("QA", elem_id="qa-benchmark-tab-table", id=0):
                with gr.Row():
                    with gr.Column(min_width=320):
                        # select domain
                        with gr.Row():
                            domains = get_domain_dropdown(QABenchmarks[datastore.slug])
                            version.change(update_qa_domains, version, domains)
                        # select language
                        with gr.Row():
                            langs = get_language_dropdown(QABenchmarks[datastore.slug])
                            version.change(update_qa_langs, version, langs)
                    with gr.Column():
                        # select the metric
                        metric = get_metric_dropdown(METRIC_LIST, DEFAULT_METRIC_QA)
                        with gr.Row():
                            show_anonymous = get_anonymous_checkbox()
                        with gr.Row():
                            show_rev_ts = get_revision_and_ts_checkbox()
                with gr.Tabs(elem_classes="tab-buttons") as sub_tabs:
                    with gr.TabItem("Retrieval + Reranking", id=10):
                        with gr.Row():
                            # search retrieval models
                            with gr.Column():
                                search_bar = get_search_bar()
                            # select reranking models
                            with gr.Column():
                                models = get_reranking_dropdown(datastore.reranking_models)
                                version.change(update_qa_models, version, models)
                        # shown table
                        qa_df_elem_ret_rerank = get_leaderboard_table(datastore.qa_fmt_df, datastore.qa_types)
                        version.change(update_qa_df_ret_rerank, version, qa_df_elem_ret_rerank)
                        # Dummy leaderboard for handling the case when the user uses backspace key
                        qa_df_elem_ret_rerank_hidden = get_leaderboard_table(
                            datastore.qa_raw_df, datastore.qa_types, visible=False
                        )
                        version.change(update_qa_hidden_df_ret_rerank, version, qa_df_elem_ret_rerank_hidden)
                        # Wire search/domain/language/model/anonymous filters to the tables.
                        set_listeners(
                            TaskType.qa,
                            qa_df_elem_ret_rerank,
                            qa_df_elem_ret_rerank_hidden,
                            search_bar,
                            version,
                            domains,
                            langs,
                            models,
                            show_anonymous,
                            show_rev_ts,
                        )
                        # set metric listener
                        metric.change(
                            update_qa_metric,
                            [metric, domains, langs, models, search_bar, show_anonymous, show_rev_ts],
                            qa_df_elem_ret_rerank,
                            queue=True,
                        )
                    with gr.TabItem("Retrieval Only", id=11):
                        with gr.Row():
                            with gr.Column(scale=1):
                                search_bar_ret = get_search_bar()
                            with gr.Column(scale=1):
                                models_ret = get_noreranking_dropdown()
                                # NOTE(review): on version change this replaces the
                                # NoReranker-only dropdown with the full reranking-model
                                # list via update_qa_models — confirm this is intended.
                                version.change(update_qa_models, version, models_ret)
                        _qa_df_ret = filter_df_ret(datastore.qa_fmt_df)
                        qa_df_elem_ret = get_leaderboard_table(_qa_df_ret, datastore.qa_types)
                        version.change(update_qa_df_ret, version, qa_df_elem_ret)
                        # Dummy leaderboard for handling the case when the user uses backspace key
                        _qa_df_ret_hidden = filter_df_ret(datastore.qa_raw_df)
                        qa_df_elem_ret_hidden = get_leaderboard_table(
                            _qa_df_ret_hidden, datastore.qa_types, visible=False
                        )
                        version.change(update_qa_hidden_df_ret, version, qa_df_elem_ret_hidden)
                        set_listeners(
                            TaskType.qa,
                            qa_df_elem_ret,
                            qa_df_elem_ret_hidden,
                            search_bar_ret,
                            version,
                            domains,
                            langs,
                            models_ret,
                            show_anonymous,
                            show_rev_ts,
                        )
                        metric.change(
                            update_qa_metric,
                            [
                                metric,
                                domains,
                                langs,
                                models_ret,
                                search_bar_ret,
                                show_anonymous,
                                show_rev_ts,
                            ],
                            qa_df_elem_ret,
                            queue=True,
                        )
                    with gr.TabItem("Reranking Only", id=12):
                        # Rows whose retriever is BM25; dropdown lists their rerankers.
                        _qa_df_rerank = filter_df_rerank(datastore.qa_fmt_df)
                        qa_rerank_models = _qa_df_rerank[COL_NAME_RERANKING_MODEL].apply(remove_html).unique().tolist()
                        with gr.Row():
                            with gr.Column(scale=1):
                                qa_models_rerank = get_reranking_dropdown(qa_rerank_models)
                                version.change(update_qa_models, version, qa_models_rerank)
                            with gr.Column(scale=1):
                                # Hidden search box: set_listeners expects one even here.
                                qa_search_bar_rerank = gr.Textbox(show_label=False, visible=False)
                        qa_df_elem_rerank = get_leaderboard_table(_qa_df_rerank, datastore.qa_types)
                        version.change(update_qa_df_rerank, version, qa_df_elem_rerank)
                        _qa_df_rerank_hidden = filter_df_rerank(datastore.qa_raw_df)
                        qa_df_elem_rerank_hidden = get_leaderboard_table(
                            _qa_df_rerank_hidden, datastore.qa_types, visible=False
                        )
                        version.change(update_qa_hidden_df_rerank, version, qa_df_elem_rerank_hidden)
                        set_listeners(
                            TaskType.qa,
                            qa_df_elem_rerank,
                            qa_df_elem_rerank_hidden,
                            qa_search_bar_rerank,
                            version,
                            domains,
                            langs,
                            qa_models_rerank,
                            show_anonymous,
                            show_rev_ts,
                        )
                        metric.change(
                            update_qa_metric,
                            [
                                metric,
                                domains,
                                langs,
                                qa_models_rerank,
                                qa_search_bar_rerank,
                                show_anonymous,
                                show_rev_ts,
                            ],
                            qa_df_elem_rerank,
                            queue=True,
                        )
            with gr.TabItem("Long Doc", elem_id="long-doc-benchmark-tab-table", id=1):
                with gr.Row():
                    with gr.Column(min_width=320):
                        # select domain
                        with gr.Row():
                            domains = get_domain_dropdown(LongDocBenchmarks[datastore.slug])
                            version.change(update_doc_domains, version, domains)
                        # select language
                        with gr.Row():
                            langs = get_language_dropdown(LongDocBenchmarks[datastore.slug])
                            version.change(update_doc_langs, version, langs)
                    with gr.Column():
                        # select the metric
                        with gr.Row():
                            metric = get_metric_dropdown(METRIC_LIST, DEFAULT_METRIC_LONG_DOC)
                        with gr.Row():
                            show_anonymous = get_anonymous_checkbox()
                        with gr.Row():
                            show_rev_ts = get_revision_and_ts_checkbox()
                with gr.Tabs(elem_classes="tab-buttons"):
                    with gr.TabItem("Retrieval + Reranking", id=20):
                        with gr.Row():
                            with gr.Column():
                                search_bar = get_search_bar()
                            with gr.Column():
                                models = get_reranking_dropdown(datastore.reranking_models)
                                # NOTE(review): reuses the QA updater here; it rebuilds the
                                # same reranking dropdown, so output matches — confirm intended.
                                version.change(update_qa_models, version, models)
                        doc_df_elem_ret_rerank = get_leaderboard_table(datastore.doc_fmt_df, datastore.doc_types)
                        version.change(update_doc_df_ret_rerank, version, doc_df_elem_ret_rerank)
                        # Dummy leaderboard for handling the case when the user uses backspace key
                        doc_df_elem_ret_rerank_hidden = get_leaderboard_table(
                            datastore.doc_raw_df, datastore.doc_types, visible=False
                        )
                        version.change(update_doc_hidden_df_ret_rerank, version, doc_df_elem_ret_rerank_hidden)
                        set_listeners(
                            TaskType.long_doc,
                            doc_df_elem_ret_rerank,
                            doc_df_elem_ret_rerank_hidden,
                            search_bar,
                            version,
                            domains,
                            langs,
                            models,
                            show_anonymous,
                            show_rev_ts,
                        )
                        # set metric listener
                        metric.change(
                            update_doc_metric,
                            [
                                metric,
                                domains,
                                langs,
                                models,
                                search_bar,
                                show_anonymous,
                                show_rev_ts,
                            ],
                            doc_df_elem_ret_rerank,
                            queue=True,
                        )
                    with gr.TabItem("Retrieval Only", id=21):
                        with gr.Row():
                            with gr.Column(scale=1):
                                search_bar_ret = get_search_bar()
                            with gr.Column(scale=1):
                                models_ret = get_noreranking_dropdown()
                        _doc_df_ret = filter_df_ret(datastore.doc_fmt_df)
                        doc_df_elem_ret = get_leaderboard_table(_doc_df_ret, datastore.doc_types)
                        version.change(update_doc_df_ret, version, doc_df_elem_ret)
                        # Dummy leaderboard for handling the case when the user uses backspace key
                        _doc_df_ret_hidden = filter_df_ret(datastore.doc_raw_df)
                        doc_df_elem_ret_hidden = get_leaderboard_table(
                            _doc_df_ret_hidden, datastore.doc_types, visible=False
                        )
                        version.change(update_doc_hidden_df_ret, version, doc_df_elem_ret_hidden)
                        set_listeners(
                            TaskType.long_doc,
                            doc_df_elem_ret,
                            doc_df_elem_ret_hidden,
                            search_bar_ret,
                            version,
                            domains,
                            langs,
                            models_ret,
                            show_anonymous,
                            show_rev_ts,
                        )
                        metric.change(
                            update_doc_metric,
                            [
                                metric,
                                domains,
                                langs,
                                models_ret,
                                search_bar_ret,
                                show_anonymous,
                                show_rev_ts,
                            ],
                            doc_df_elem_ret,
                            queue=True,
                        )
                    with gr.TabItem("Reranking Only", id=22):
                        _doc_df_rerank = filter_df_rerank(datastore.doc_fmt_df)
                        doc_rerank_models = (
                            _doc_df_rerank[COL_NAME_RERANKING_MODEL].apply(remove_html).unique().tolist()
                        )
                        with gr.Row():
                            with gr.Column(scale=1):
                                doc_models_rerank = get_reranking_dropdown(doc_rerank_models)
                            with gr.Column(scale=1):
                                # Hidden search box: set_listeners expects one even here.
                                doc_search_bar_rerank = gr.Textbox(show_label=False, visible=False)
                        doc_df_elem_rerank = get_leaderboard_table(_doc_df_rerank, datastore.doc_types)
                        version.change(update_doc_df_rerank, version, doc_df_elem_rerank)
                        _doc_df_rerank_hidden = filter_df_rerank(datastore.doc_raw_df)
                        doc_df_elem_rerank_hidden = get_leaderboard_table(
                            _doc_df_rerank_hidden, datastore.doc_types, visible=False
                        )
                        version.change(update_doc_hidden_df_rerank, version, doc_df_elem_rerank_hidden)
                        set_listeners(
                            TaskType.long_doc,
                            doc_df_elem_rerank,
                            doc_df_elem_rerank_hidden,
                            doc_search_bar_rerank,
                            version,
                            domains,
                            langs,
                            doc_models_rerank,
                            show_anonymous,
                            show_rev_ts,
                        )
                        metric.change(
                            update_doc_metric,
                            [
                                metric,
                                domains,
                                langs,
                                doc_models_rerank,
                                doc_search_bar_rerank,
                                show_anonymous,
                                show_rev_ts,
                            ],
                            doc_df_elem_rerank,
                            queue=True,
                        )
        # ---------------- Submission tab ----------------
        with gr.TabItem("🚀Submit here!", elem_id="submit-tab-table", id=2):
            with gr.Column():
                with gr.Row():
                    gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
                with gr.Row():
                    gr.Markdown("## ✉️Submit your model here!", elem_classes="markdown-text")
                with gr.Row():
                    with gr.Column():
                        model_name = gr.Textbox(label="Retrieval Method name")
                    with gr.Column():
                        model_url = gr.Textbox(label="Retrieval Method URL")
                with gr.Row():
                    with gr.Column():
                        reranking_model_name = gr.Textbox(
                            label="Reranking Model name", info="Optional", value="NoReranker"
                        )
                    with gr.Column():
                        reranking_model_url = gr.Textbox(label="Reranking Model URL", info="Optional", value="")
                with gr.Row():
                    with gr.Column():
                        benchmark_version = gr.Dropdown(
                            BENCHMARK_VERSION_LIST,
                            value=LATEST_BENCHMARK_VERSION,
                            interactive=True,
                            label="AIR-Bench Version (🟠NOTE: Select the version you want to submit to)",
                        )
                with gr.Row():
                    upload_button = gr.UploadButton("Click to upload search results", file_count="single")
                with gr.Row():
                    file_output = gr.File()
                with gr.Row():
                    is_anonymous = gr.Checkbox(
                        label="Nope. I want to submit anonymously 🥷",
                        value=False,
                        # NOTE(review): user-facing typo — "to shown" should read "to be shown".
                        info="Do you want to shown on the leaderboard by default?",
                    )
                with gr.Row():
                    submit_button = gr.Button("Submit")
                with gr.Row():
                    submission_result = gr.Markdown()
                # Uploading a file stores it and echoes it back into file_output.
                upload_button.upload(
                    upload_file,
                    [
                        upload_button,
                    ],
                    file_output,
                )
                # Submit wires all form fields into submit_results; the Markdown
                # below shows the returned status message.
                submit_button.click(
                    submit_results,
                    [
                        file_output,
                        model_name,
                        model_url,
                        reranking_model_name,
                        reranking_model_url,
                        benchmark_version,
                        is_anonymous,
                    ],
                    submission_result,
                    show_progress="hidden",
                )
        # ---------------- About tab ----------------
        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=3):
            gr.Markdown(BENCHMARKS_TEXT, elem_classes="markdown-text")
            gr.Markdown(f"{CITATION_BUTTON_LABEL}\n\n{CITATION_BUTTON_TEXT}", elem_classes="markdown-text")
if __name__ == "__main__":
    # Restart the Space every 30 minutes so freshly submitted results are
    # re-downloaded and displayed.
    scheduler = BackgroundScheduler()
    scheduler.add_job(restart_space, "interval", seconds=1800)
    scheduler.start()
    # Allow up to 40 concurrent event handlers before queueing.
    demo.queue(default_concurrency_limit=40)
    demo.launch()