reward-bench / app.py
natolambert's picture
add dataset viewer
8e499f4
raw
history blame
5.04 kB
import gradio as gr
import os
from huggingface_hub import HfApi, snapshot_download
from datasets import load_dataset
from src.utils import load_all_data
from src.md import ABOUT_TEXT
import numpy as np
# Hub client and access token; the token is None when COLLAB_TOKEN is not set
# in the environment.
api = HfApi()
COLLAB_TOKEN = os.getenv("COLLAB_TOKEN")

# Dataset repositories on the Hub.
evals_repo = "ai2-rlhf-collab/rm-benchmark-results"
prefs_repo = "ai2-rlhf-collab/rm-testset-results"
eval_set_repo = "ai2-rlhf-collab/rm-benchmark-dev"

# Local directories the result snapshots are downloaded into.
repo_dir_herm = "./evals/herm/"
repo_dir_prefs = "./evals/prefs/"

# def restart_space():
# api.restart_space(repo_id="ai2-rlhf-collab/rm-benchmark-viewer", token=COLLAB_TOKEN)

print("Pulling evaluation results")

# Options common to both snapshot downloads below.
_snapshot_options = {
    "use_auth_token": COLLAB_TOKEN,
    "tqdm_class": None,
    "etag_timeout": 30,
    "repo_type": "dataset",
}

# Benchmark results snapshot.
repo = snapshot_download(
    repo_id=evals_repo,
    local_dir=repo_dir_herm,
    **_snapshot_options,
)

# Preference test-set results snapshot.
repo_pref_sets = snapshot_download(
    repo_id=prefs_repo,
    local_dir=repo_dir_prefs,
    **_snapshot_options,
)
def avg_over_herm(dataframe):
    """
    Average the per-subset score columns and return a summary dataframe.

    For each of the subsets alpacaeval, mt-bench, llmbar, refusals and hep,
    every column whose name contains the subset name is averaged row-wise
    (NaNs ignored) and rounded to 2 decimals. The returned frame holds only
    "model", "average" and one column per subset, where "average" is
    recomputed as the rounded NaN-ignoring mean of the five subset columns.

    The input dataframe is not modified.

    Args:
        dataframe: frame with a "model" column, an "average" column and the
            per-subset score columns.

    Returns:
        A new dataframe with columns ["model", "average"] + subsets.

    Raises:
        KeyError: if "model" or "average" is missing from the input.
    """
    subsets = ["alpacaeval", "mt-bench", "llmbar", "refusals", "hep"]

    # Work on a copy so the caller's frame does not silently gain the
    # per-subset columns added below.
    working = dataframe.copy()

    # for each subset, avg the columns that have the subset in the column name,
    # then add a new column with subset name and avg
    for subset in subsets:
        subset_cols = [col for col in working.columns if subset in col]
        working[subset] = np.round(np.nanmean(working[subset_cols].values, axis=1), 2)

    keep_columns = ["model", "average"] + subsets
    # Explicit copy: assigning into a column-sliced frame is the pandas
    # chained-assignment pattern and may warn or write to a temporary.
    summary = working[keep_columns].copy()

    # replace average column with new average
    summary["average"] = np.round(np.nanmean(summary[subsets].values, axis=1), 2)
    return summary
def expand_subsets(dataframe):
    """Placeholder: not implemented yet; ignores ``dataframe`` and returns None."""
    # TODO need to modify data/ script to do this
    return None
# Load the benchmark results and order rows by the "average" column, highest first.
herm_data = load_all_data(repo_dir_herm).sort_values(by='average', ascending=False)
# Per-subset summary of the same results; sorted again because avg_over_herm
# recomputes the "average" column.
herm_data_avg = avg_over_herm(herm_data).sort_values(by='average', ascending=False)
# Preference-set results, ordered the same way.
prefs_data = load_all_data(repo_dir_prefs).sort_values(by='average', ascending=False)
# prefs_data_sub = expand_subsets(prefs_data).sort_values(by='average', ascending=False)
# Column datatypes for the tables: "markdown" for the first column and "number"
# for every remaining column.
# NOTE(review): these are computed after avg_over_herm(herm_data) has run, so any
# columns that call adds to herm_data are counted here — keep this ordering.
col_types_herm = ["markdown"] + ["number"] * (len(herm_data.columns) - 1)
col_types_herm_avg = ["markdown"] + ["number"] * (len(herm_data_avg.columns) - 1)
col_types_prefs = ["markdown"] + ["number"] * (len(prefs_data.columns) - 1)
# col_types_prefs_sub = ["markdown"] + ["number"] * (len(prefs_data_sub.columns) - 1)
# for showing random samples
# The "filtered" split of the evaluation set, loaded once at import time and
# read by random_sample.
eval_set = load_dataset(eval_set_repo, use_auth_token=COLLAB_TOKEN, split="filtered")
def random_sample(r: gr.Request):
    """
    Pick one random row of ``eval_set`` and render it as markdown.

    Args:
        r: the Gradio request object; not used by the function body.

    Returns:
        A markdown string with one "**key**: value" paragraph per field of
        the sampled row, separated by blank lines.

    Raises:
        ValueError: if ``eval_set`` is empty (there is no valid index to draw).
    """
    # np.random.randint's upper bound is exclusive: passing len(eval_set)
    # makes every row (including the last one) selectable and also works
    # when the dataset holds a single row.
    sample_index = np.random.randint(0, len(eval_set))
    sample = eval_set[sample_index]
    markdown_text = '\n\n'.join(f"**{key}**: {value}" for key, value in sample.items())
    return markdown_text
# UI layout: a title row followed by one tab per view.
with gr.Blocks() as app:
    # create tabs for the app, moving the current table to one titled "HERM" and the benchmark_text to a tab called "About"
    with gr.Row():
        gr.Markdown("# HERM Results Viewer")

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("HERM - Overview"):
            with gr.Row():
                # Bound to its own name so the Detailed table below does not
                # overwrite the handle to this component.
                herm_table_avg = gr.Dataframe(
                    herm_data_avg.values,
                    datatype=col_types_herm_avg,
                    headers=herm_data_avg.columns.tolist(),
                    elem_id="herm_dataframe_avg",
                )

        with gr.TabItem("HERM - Detailed"):
            with gr.Row():
                herm_table = gr.Dataframe(
                    herm_data.values,
                    datatype=col_types_herm,
                    headers=herm_data.columns.tolist(),
                    elem_id="herm_dataframe",
                )

        with gr.TabItem("Pref Sets - Overview"):
            pref_sets_table = gr.Dataframe(
                prefs_data.values,
                datatype=col_types_prefs,
                headers=prefs_data.columns.tolist(),
                elem_id="prefs_dataframe",
            )

        with gr.TabItem("About"):
            with gr.Row():
                gr.Markdown(ABOUT_TEXT)

        with gr.TabItem("Dataset Viewer"):
            with gr.Row():
                # loads one sample
                gr.Markdown("## Random Dataset Sample Viewer")
                button = gr.Button("Show Random Sample")

            with gr.Row():
                sample_display = gr.Markdown("{sampled data loads here}")

            # Each click replaces the placeholder text with a freshly sampled row.
            button.click(fn=random_sample, outputs=sample_display)


# Load data when app starts, TODO make this used somewhere...
def load_data_on_start():
    """
    Reload the result files from disk and push them to the three tables.

    Not called anywhere yet (see the TODO above).
    NOTE(review): depending on the Gradio version, ``Component.update`` may
    return an update payload instead of changing the component in place —
    confirm before wiring this function up.
    """
    data_herm = load_all_data(repo_dir_herm)
    herm_table.update(data_herm)

    # avg_over_herm needs the loaded dataframe, not the directory path; a copy
    # is passed so the detailed frame stays untouched whatever avg_over_herm
    # does to its argument.
    data_herm_avg = avg_over_herm(data_herm.copy())
    herm_table_avg.update(data_herm_avg)

    data_prefs = load_all_data(repo_dir_prefs)
    pref_sets_table.update(data_prefs)


app.launch()