Spaces:
Sleeping
Sleeping
"""A gradio app that renders a static leaderboard. This is used for Hugging Face Space.""" | |
import ast | |
import argparse | |
import glob | |
import pickle | |
import plotly | |
import gradio as gr | |
import numpy as np | |
import pandas as pd | |
import gradio as gr | |
import pandas as pd | |
from pathlib import Path | |
from difflib import Differ | |
import json | |
from constants import BANNER, CITATION_TEXT, WINRATE_HEATMAP, css, js_code, all_task_types, DEFAULT_LP, TASK_TYPE_STR, js_light | |
from datetime import datetime, timezone | |
from data_utils import load_eval_results, sample_an_eval_result, apply_length_penalty, post_processing, add_winrates, add_winrates_tasks | |
# from gradio.themes.utils import colors, fonts, sizes | |
from themes import Seafoam | |
import datasets | |
from huggingface_hub import HfApi | |
# from datasets import Dataset, load_dataset, concatenate_datasets | |
import os, uuid | |
from utils_display import model_info | |
from tqdm import tqdm | |
# get the last updated time from the elo_ranks.all.jsonl file | |
LAST_UPDATED = None | |
with open("_intro.md", "r") as f: | |
INTRO_MD = f.read() | |
with open("_about_us.md", "r") as f: | |
ABOUT_MD = f.read() | |
with open("_header.md", "r") as f: | |
HEADER_MD = f.read() | |
original_df, ablation_df = None, None | |
eval_results = load_eval_results() | |
available_models = [] # to be filled in later | |
import random | |
random.seed(42) | |
np.random.seed(42) | |
def sample_an_feedback(task_category, task_difficulty, task_quality, feedback_score): | |
def filter_examples(item): | |
if task_category and item['category'] not in task_category: | |
return False | |
if task_difficulty and item['difficulty'] not in task_difficulty: | |
return False | |
if task_quality and item['quality'] not in task_quality: | |
return False | |
if feedback_score and item['feedback']['processed']['score'] not in feedback_score: | |
return False | |
return True | |
valid_examples = dataset.filter(filter_examples, num_proc=4) | |
if len(valid_examples) == 0: | |
raise ValueError("No examples found for the selected filters. Please try again with different filters.") | |
print(f"Found {len(valid_examples)} examples for the selected filters.") | |
example = random.choice(valid_examples) | |
plan_history = { | |
"user": [ | |
example['query'], | |
], | |
"assistant": [ | |
example['response'] | |
] | |
} | |
ground_history = { | |
"user": [ | |
example['query'], | |
], | |
"assistant": [ | |
example['revision']['processed'] | |
] | |
} | |
result_dict = { | |
"session_id": example['id'], | |
"category": example['category'], | |
"difficulty": example['difficulty'], | |
"quality": example['quality'], | |
"intent": example['intent'], | |
"plan_history": plan_history, | |
"ground_history": ground_history, | |
# "pred": str(model_response_1['feedback']['processed']['score']) if model_response_1['feedback']['processed'] else "A", | |
# "answer": str(model_response_2['feedback']['processed']['score']) if model_response_2['feedback']['processed'] else "A", | |
"pred": example['model'], # model that generates the original response | |
"answer": example['revision']['model'], # model that generates the revised response | |
"correctness": example['feedback']['model'], # model that generates the feedback for the original response | |
"image": "file/data_dir/test_images/000000341196.jpg" | |
} | |
return result_dict | |
def diff_texts(text1, text2): | |
d = Differ() | |
return [ | |
(token[2:], token[0] if token[0] != " " else None) | |
for token in d.compare(text1, text2) | |
] | |
def display_chat_history(task_category, task_difficulty, task_quality, feedback_score): | |
eval_item = sample_an_feedback(task_category, task_difficulty, task_quality, feedback_score) | |
print("---" * 10) | |
for key, value in eval_item.items(): | |
print(f"{key}: {value}") | |
print("---" * 10) | |
# eval_item = sample_an_feedback() | |
session_id = eval_item["session_id"] | |
category = eval_item["category"] | |
prediction = eval_item["pred"] | |
gold_answer = eval_item["answer"] | |
correctness = eval_item["correctness"] | |
difficulty = eval_item["difficulty"] | |
quality = eval_item["quality"] | |
intent = eval_item["intent"] | |
if eval_item["image"]: | |
image_path = eval_item["image"] | |
else: | |
image_path = "" | |
chats_plan = [] | |
for item_user, item_asst in zip(eval_item["plan_history"]["user"], eval_item["plan_history"]["assistant"]): | |
chats_plan += [item_user, item_asst] | |
chats_ground = [] | |
for item_user, item_asst in zip(eval_item["ground_history"]["user"], eval_item["ground_history"]["assistant"]): | |
chats_ground += [item_user, item_asst] | |
chats_plan = [(chats_plan[i], chats_plan[i+1]) for i in range(0, len(chats_plan), 2)] | |
chats_ground = [(chats_ground[i], chats_ground[i+1]) for i in range(0, len(chats_ground), 2)] | |
task_metadata = f"- ๐: `{session_id}` \n- **Category**: {category} \n- **Difficulty**: {difficulty} \n- **Quality**: {quality} \n- **Intent**: {intent}" | |
diff_text = diff_texts(chats_plan[-1][1], chats_ground[-1][1]) | |
print(f"Category: {category}") | |
print(f"Difficulty: {difficulty}") | |
print(f"Quality: {quality}") | |
print(f"Intent: {intent}") | |
print(f"Session ID: {session_id}") | |
print(f"Original Response: {chats_plan}") | |
print(f"Revised Response: {chats_ground}") | |
if image_path != "": | |
image = f'<div style="text-align: center;"> <img src="{image_path}" style="height: 250px;"> </div>' | |
return category, chats_plan, chats_ground, task_metadata, prediction, gold_answer, correctness, image, diff_text | |
else: | |
return category, chats_plan, chats_ground, task_metadata, prediction, gold_answer, correctness, f'<div style="text-align: center;"> </div>', diff_text | |
def slider_change_main(length_penalty): | |
global original_df, ablation_df | |
adjusted_df = apply_length_penalty(original_df, ablation_df, length_penalty) | |
adjusted_df = adjusted_df[["Model", "Overall Elo", "Task-Avg Elo", "# battles", "Length"]] | |
adjusted_df = adjusted_df.sort_values(by="Overall Elo", ascending=False) | |
adjusted_df = add_winrates(adjusted_df) | |
adjusted_df = adjusted_df.drop(columns=["Length"]) | |
return adjusted_df | |
def slider_change_full(length_penalty, show_winrate): | |
global original_df, ablation_df | |
adjusted_df = apply_length_penalty(original_df, ablation_df, length_penalty) | |
# sort the model by the "Task-Avg Elo" column | |
adjusted_df = adjusted_df.sort_values(by="Task-Avg Elo", ascending=False) | |
adjusted_df.drop(columns=["Overall Elo", "Task-Avg Elo", "# battles", "Length"], inplace=True) | |
if show_winrate == "none": | |
return adjusted_df | |
elif show_winrate == "gpt-3.5": | |
adjusted_df = add_winrates_tasks(adjusted_df, ref="gpt-3.5") | |
elif show_winrate == "gpt-4": | |
adjusted_df = add_winrates_tasks(adjusted_df, ref="gpt-4") | |
return adjusted_df | |
seafoam = Seafoam() | |
def build_demo(TYPES): | |
global available_categories, avaliable_difficulty, avaliable_quality, available_feedback_scores | |
with gr.Blocks(theme=gr.themes.Soft(), css=css, js=js_light) as demo: | |
gr.Markdown(HEADER_MD, elem_classes="markdown-text") | |
with gr.Tabs(elem_classes="tab-buttons") as tabs: | |
with gr.TabItem("๐ Explore", elem_id="od-benchmark-tab-table", id=2): | |
with gr.Row(): | |
btn_show_history = gr.Button("๐ฒ Click here to sample an example of Feedbacks ", elem_classes="sample_button") | |
with gr.Row(): | |
with gr.Column(): | |
with gr.Accordion("Choose task difficulty", open=False, elem_classes="accordion-label"): | |
task_difficulty = gr.CheckboxGroup(avaliable_difficulty, info="", value=avaliable_difficulty, show_label=False, elem_id="select-difficulty") | |
clear_button = gr.Button("Clear", elem_classes="btn_boderline_gray", scale=1) | |
# clear the selected_models | |
clear_button.click(lambda: {task_difficulty: {"value": [], "__type__": "update"}}, inputs=[], outputs=[task_difficulty]) | |
with gr.Accordion("Choose task quality", open=False, elem_classes="accordion-label"): | |
task_quality = gr.CheckboxGroup(avaliable_quality, info="", value=avaliable_quality, show_label=False, elem_id="select-quality") | |
clear_button = gr.Button("Clear", elem_classes="btn_boderline_gray", scale=1) | |
# clear the selected_models | |
clear_button.click(lambda: {task_quality: {"value": [], "__type__": "update"}}, inputs=[], outputs=[task_quality]) | |
with gr.Accordion("Choose feedback score", open=False, elem_classes="accordion-label"): | |
feedback_score = gr.CheckboxGroup(available_feedback_scores, info="", value=available_feedback_scores, show_label=False, elem_id="select-feedback") | |
clear_button = gr.Button("Clear", elem_classes="btn_boderline_gray", scale=1) | |
# clear the selected_models | |
clear_button.click(lambda: {feedback_score: {"value": [], "__type__": "update"}}, inputs=[], outputs=[feedback_score]) | |
with gr.Accordion("Choose task category", open=False, elem_classes="accordion-label"): | |
task_category = gr.CheckboxGroup(available_categories, info="", value=available_categories, show_label=False, elem_id="select-category") | |
clear_button = gr.Button("Clear", elem_classes="btn_boderline_gray", scale=1) | |
# clear the selected_models | |
clear_button.click(lambda: {task_category: {"value": [], "__type__": "update"}}, inputs=[], outputs=[task_category]) | |
with gr.Row(visible=False): | |
with gr.Column(scale=1.5): | |
with gr.Accordion("๐ Task Description", open=True, elem_classes="accordion-label"): | |
task = gr.Markdown("", elem_classes="markdown-text-tiny") | |
task.change(lambda x: x, inputs=[], outputs=[], scroll_to_output=False, js=js_code) | |
with gr.Column(scale=1): | |
with gr.Accordion("Input Image (optional)", open=True, elem_classes="accordion-label"): | |
image = gr.HTML("", elem_id="markdown-text-tiny") | |
image.change(lambda x: x, inputs=[], outputs=[], scroll_to_output=False, js=js_code) | |
with gr.Row(): | |
with gr.Column(): | |
with gr.Accordion("๐ Task Metadata", open=True, elem_classes="accordion-label"): | |
task_metadata = gr.Markdown("", elem_classes="markdown-text-tiny") | |
task_metadata.change(lambda x: x, inputs=[], outputs=[], scroll_to_output=False, js=js_code) | |
with gr.Row(): | |
with gr.Column(scale=1.1): | |
# gr.Markdown("## ๐ข Plan Module Process History w/ <span style='background-color: #FDFDBA;'>Execution Module Results</span>", elem_classes="accordion-label") | |
gr.Markdown("## ๐ข Model Original Response", elem_classes="accordion-label") | |
Chatbot_Common_Plan = gr.Chatbot(avatar_images=["human_icon.jpeg", "ai_icon.png"], height=1000, container=False, label="Common Plan History", likeable=False, show_share_button=False, show_label=True, elem_classes="chat-common", layout="bubble") | |
Chatbot_Common_Plan.change(lambda x: x, inputs=[], outputs=[], scroll_to_output=False, js=js_code) | |
with gr.Column(scale=1): | |
# gr.Markdown("## ๐ข Ground Module Process History", elem_classes="accordion-label") | |
gr.Markdown("## ๐ข Model Revised Response", elem_classes="accordion-label") | |
Chatbot_Common_Ground = gr.Chatbot(avatar_images=["human_icon.jpeg", "ai_icon.png"], height=1000, container=False, label="Common Ground History", likeable=False, show_share_button=False, show_label=True, elem_classes="chat-common", layout="bubble") | |
Chatbot_Common_Ground.change(lambda x: x, inputs=[], outputs=[], scroll_to_output=False, js=js_code) | |
with gr.Row(): | |
with gr.Column(): | |
with gr.Accordion("Highlighted differences", open=True, elem_classes="accordion-label"): | |
highlighted_diff = gr.HighlightedText(label="Highlighted differences", | |
combine_adjacent=False, | |
show_legend=True, | |
color_map={"+": "green", "-": "red"}) | |
with gr.Row(): | |
with gr.Column(): | |
# with gr.Accordion("๐ Prediction", open=True, elem_classes="accordion-label"): | |
with gr.Accordion("Policy Model", open=True, elem_classes="accordion-label"): | |
prediction = gr.Markdown("", elem_classes="markdown-text-tiny") | |
prediction.change(lambda x: x, inputs=[], outputs=[], scroll_to_output=False, js=js_code) | |
with gr.Column(): | |
# with gr.Accordion("๐ Ground-Truth Answer", open=True, elem_classes="accordion-label"): | |
with gr.Accordion("Revision Model", open=True, elem_classes="accordion-label"): | |
gold_answer = gr.HTML("", elem_id="markdown-text-tiny") | |
gold_answer.change(lambda x: x, inputs=[], outputs=[], scroll_to_output=False, js=js_code) | |
with gr.Column(visible=True): | |
with gr.Accordion("Feedback Model", open=True, elem_classes="accordion-label"): | |
correctness = gr.HTML("", elem_id="markdown-text-tiny") | |
correctness.change(lambda x: x, inputs=[], outputs=[], scroll_to_output=False, js=js_code) | |
# Display chat history when button is clicked | |
btn_show_history.click(fn=display_chat_history, | |
inputs=[task_category, task_difficulty, task_quality, feedback_score], | |
outputs=[task, Chatbot_Common_Plan, Chatbot_Common_Ground, task_metadata, prediction, gold_answer, correctness, image, highlighted_diff]) | |
with gr.TabItem("๐ฎ About Us", elem_id="od-benchmark-tab-table", id=3, visible=False): | |
gr.Markdown(ABOUT_MD, elem_classes="markdown-text") | |
gr.Markdown(f"Last updated on **{LAST_UPDATED}**", elem_classes="markdown-text-small") | |
with gr.Row(): | |
with gr.Accordion("๐ Citation", open=False, elem_classes="accordion-label", visible=False): | |
gr.Textbox( | |
value=CITATION_TEXT, | |
lines=7, | |
label="Copy the BibTeX snippet to cite this source", | |
elem_id="citation-button", | |
show_copy_button=True) | |
# ).style(show_copy_button=True) | |
return demo | |
if __name__ == "__main__": | |
parser = argparse.ArgumentParser() | |
parser.add_argument("--share", action="store_true") | |
parser.add_argument("--result_file", help="Path to results table", default="data_dir/pair_feedbacks_1.jsonl") | |
parser.add_argument("--length_balation_file", help="Path to results table", default="data_dir/elo_ranks.length_ablation.all.jsonl") | |
parser.add_argument("--skip_empty_result_file", help="Path to results table", default="data_dir/elo_ranks.skip_empty.all.jsonl") | |
parser.add_argument("--skip_empty_length_balation_file", help="Path to results table", default="data_dir/elo_ranks.skip_empty.length_ablation.all.jsonl") | |
args = parser.parse_args() | |
LAST_UPDATED = datetime.fromtimestamp(Path(args.result_file).stat().st_mtime, tz=timezone.utc).strftime("%Y-%m-%d %H:%M:%S") | |
# available_models = sorted(list(set(list(original_df["model name "])))) | |
# available_models = list(model_info.keys()) | |
# dataset = datasets.Dataset.from_json(args.result_file) | |
dataset = datasets.load_dataset("DongfuJiang/VAPO", "pair_feedback_iter_1", split='train') | |
avaliable_difficulty = sorted(list(set(dataset['difficulty']))) | |
avaliable_quality = sorted(list(set(dataset['quality']))) | |
available_feedback_scores = sorted(list(set([item['feedback']['processed']['score'] for item in dataset]))) | |
available_categories = sorted(list(set(dataset['category']))) | |
TYPES = ["markdown", "number"] | |
demo = build_demo(TYPES) | |
demo.launch(share=args.share, allowed_paths=["."], height=1000) | |