import os

import gradio as gr
import numpy as np
from datasets import load_dataset
from huggingface_hub import HfApi, snapshot_download

from src.md import ABOUT_TEXT
from src.utils import load_all_data

api = HfApi()

COLLAB_TOKEN = os.environ.get("COLLAB_TOKEN")
evals_repo = "ai2-rlhf-collab/rm-benchmark-results"
prefs_repo = "ai2-rlhf-collab/rm-testset-results"
eval_set_repo = "ai2-rlhf-collab/rm-benchmark-dev"
repo_dir_herm = "./evals/herm/"
repo_dir_prefs = "./evals/prefs/"

# def restart_space():
#     api.restart_space(repo_id="ai2-rlhf-collab/rm-benchmark-viewer", token=COLLAB_TOKEN)

# Download both result datasets into local directories at import time so the
# tables below can be built from files on disk.
print("Pulling evaluation results")
repo = snapshot_download(
    local_dir=repo_dir_herm,
    repo_id=evals_repo,
    use_auth_token=COLLAB_TOKEN,
    tqdm_class=None,
    etag_timeout=30,
    repo_type="dataset",
)

repo_pref_sets = snapshot_download(
    local_dir=repo_dir_prefs,
    repo_id=prefs_repo,
    use_auth_token=COLLAB_TOKEN,
    tqdm_class=None,
    etag_timeout=30,
    repo_type="dataset",
)


def avg_over_herm(dataframe):
    """
    Averages over the subsets alpacaeval, mt-bench, llmbar, refusals, hep and
    returns dataframe with only these columns.

    For every subset, all columns whose name contains the subset name are
    averaged row-wise (NaNs ignored) and rounded to two decimals. The
    "average" column of the result is then recomputed as the rounded NaN-mean
    of the five subset columns.

    The input dataframe is left untouched: the result is built in a fresh
    copy, so callers that keep using the detailed dataframe do not silently
    gain the aggregated columns.

    Args:
        dataframe: dataframe with a "model" column, an "average" column and
            per-task score columns whose names contain the subset names.

    Returns:
        A new dataframe with the columns "model", "average" and one column
        per subset, sharing the index of the input.
    """
    subsets = ["alpacaeval", "mt-bench", "llmbar", "refusals", "hep"]

    # Copy the identifying columns so later assignments never write into a
    # view of the caller's dataframe.
    averaged = dataframe[["model", "average"]].copy()

    # for each subset, avg the columns that have the subset in the column
    # name, then add a new column with subset name and avg
    for subset in subsets:
        subset_cols = [col for col in dataframe.columns if subset in col]
        averaged[subset] = np.round(
            np.nanmean(dataframe[subset_cols].values, axis=1), 2
        )

    # replace average column with new average
    averaged["average"] = np.round(
        np.nanmean(averaged[subsets].values, axis=1), 2
    )
    return averaged


def expand_subsets(dataframe):
    """Placeholder for expanding preference-set subsets; currently a no-op."""
    # TODO need to modify data/ script to do this
    pass


herm_data = load_all_data(repo_dir_herm).sort_values(by='average', ascending=False)
herm_data_avg = avg_over_herm(herm_data).sort_values(by='average', ascending=False)
prefs_data = load_all_data(repo_dir_prefs).sort_values(by='average', ascending=False)
# prefs_data_sub = expand_subsets(prefs_data).sort_values(by='average', ascending=False)

# First column is rendered as markdown, every remaining column as a number.
col_types_herm = ["markdown"] + ["number"] * (len(herm_data.columns) - 1)
col_types_herm_avg = ["markdown"] + ["number"] * (len(herm_data_avg.columns) - 1)
col_types_prefs = ["markdown"] + ["number"] * (len(prefs_data.columns) - 1)
# col_types_prefs_sub = ["markdown"] + ["number"] * (len(prefs_data_sub.columns) - 1)

# for showing random samples
eval_set = load_dataset(eval_set_repo, use_auth_token=COLLAB_TOKEN, split="filtered")


def random_sample(r: gr.Request):
    """
    Pick one row of the evaluation set uniformly at random and render it.

    Args:
        r: the request object supplied by gradio; not used here.

    Returns:
        A markdown string with one "**key**: value" paragraph per field of
        the sampled row.
    """
    # The upper bound of np.random.randint is exclusive, so passing
    # len(eval_set) makes every index, including the last one, reachable.
    sample_index = np.random.randint(0, len(eval_set))
    sample = eval_set[sample_index]

    markdown_text = '\n\n'.join([f"**{key}**: {value}" for key, value in sample.items()])
    return markdown_text


with gr.Blocks() as app:
    # create tabs for the app, moving the current table to one titled "HERM" and the benchmark_text to a tab called "About"
    with gr.Row():
        gr.Markdown("# HERM Results Viewer")

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("HERM - Overview"):
            with gr.Row():
                # Kept under its own name so it is not shadowed by the
                # detailed table defined in the next tab.
                herm_table_avg = gr.Dataframe(
                    herm_data_avg.values,
                    datatype=col_types_herm_avg,
                    headers=herm_data_avg.columns.tolist(),
                    elem_id="herm_dataframe_avg",
                )
        with gr.TabItem("HERM - Detailed"):
            with gr.Row():
                herm_table = gr.Dataframe(
                    herm_data.values,
                    datatype=col_types_herm,
                    headers=herm_data.columns.tolist(),
                    elem_id="herm_dataframe",
                )
        with gr.TabItem("Pref Sets - Overview"):
            pref_sets_table = gr.Dataframe(
                prefs_data.values,
                datatype=col_types_prefs,
                headers=prefs_data.columns.tolist(),
                elem_id="prefs_dataframe",
            )
        with gr.TabItem("About"):
            with gr.Row():
                gr.Markdown(ABOUT_TEXT)
        with gr.TabItem("Dataset Viewer"):
            with gr.Row():
                # loads one sample
                gr.Markdown("## Random Dataset Sample Viewer")
                button = gr.Button("Show Random Sample")

            with gr.Row():
                sample_display = gr.Markdown("{sampled data loads here}")

            button.click(fn=random_sample, outputs=sample_display)


# Load data when app starts, TODO make this used somewhere...
def load_data_on_start():
    """
    Reload the result tables from the local result directories.

    Not called anywhere yet (see TODO above).
    NOTE(review): the return values of the ``.update()`` calls are discarded
    here; confirm against the installed gradio version whether that is enough
    to refresh the components before wiring this function up.
    """
    data_herm = load_all_data(repo_dir_herm)
    herm_table.update(data_herm)

    # Aggregate the dataframe that was just loaded (not the directory path)
    # and send it to the overview table rather than the detailed one.
    data_herm_avg = avg_over_herm(data_herm)
    herm_table_avg.update(data_herm_avg)

    data_prefs = load_all_data(repo_dir_prefs)
    pref_sets_table.update(data_prefs)


app.launch()