# Spaces: Running  (page-header residue from extraction; not part of the program)
import gradio as gr
import pandas as pd
from gradio_leaderboard import Leaderboard, SelectColumns, ColumnFilter

from data_reviewer import create_data_viewer
# Define constants and enums
# TITLE is rendered as raw HTML and INTRODUCTION_TEXT as Markdown at the top of
# the page; ABOUT_TEXT is the Markdown body of the About tab.
TITLE = "<h1>VL-RewardBench Leaderboard</h1>"
INTRODUCTION_TEXT = "https://vl-rewardbench.github.io/"
# CSV export URL of the Google Sheet that get_result_data() reads.
GOOGLE_SHEET_URL = (
    "https://docs.google.com/spreadsheets/d/1fPqZLF1FQFyy4n9I6GNk7MeDSGlJDVVes9yEBqN8RwU/export?gid=0&format=csv"
)
ABOUT_TEXT = """Welcome to VLRewardBench!
We introduce a novel benchmark VL-RewardBench, designed to expose limitations of vision-language reward models across visual perception, hallucination detection, and reasoning tasks.
Our evaluation reveals that models primarily fail at basic visual perception rather than reasoning, and that performance on our benchmark strongly correlates (r>0.9) with downstream vision-language tasks.
The splits are:
- General (VLFeedback + WildVision)
- Hallucination (POVID, RLAIF, RLHF-V)
- Reasoning (MMMU-Pro, MathVerse)"""
class AutoEvalColumn:
    """Declarative column specifications for the leaderboard table.

    Every class attribute is a dict with four keys:

    - ``name``: the column label used for selection in the leaderboard.
    - ``type``: the datatype string handed to the leaderboard component.
    - ``displayed_by_default``: whether the column starts out selected.
    - ``never_hidden``: whether the user is prevented from deselecting it.

    ``init_leaderboard`` iterates over these attributes in definition order,
    so the order below determines the order of the derived lists.
    """

    model = {"name": "Model", "type": "str", "displayed_by_default": True, "never_hidden": True}
    license = {"name": "License", "type": "str", "displayed_by_default": False, "never_hidden": False}
    general = {"name": "General", "type": "float", "displayed_by_default": True, "never_hidden": False}
    hallucination = {"name": "Hallucination", "type": "float", "displayed_by_default": True, "never_hidden": False}
    reasoning = {"name": "Reasoning", "type": "float", "displayed_by_default": True, "never_hidden": False}
    overall = {"name": "Overall Consistency", "type": "float", "displayed_by_default": True, "never_hidden": False}
    macro = {"name": "Macro Average", "type": "float", "displayed_by_default": True, "never_hidden": False}
# Create sample data
def get_sample_data() -> pd.DataFrame:
    """Return a small hard-coded DataFrame of three placeholder models.

    The frame has the columns ``Model``, ``License``, ``Model Type``,
    ``Precision``, ``Parameters (B)`` and ``Available``. It is a stand-in
    for real results and performs no I/O.
    """
    return pd.DataFrame(
        {
            "Model": ["model1", "model2", "model3"],
            "License": ["MIT", "Apache", "MIT"],
            "Model Type": ["base", "instruct", "chat"],
            "Precision": ["float16", "float32", "float16"],
            "Parameters (B)": [7, 13, 70],
            "Available": [True, True, False],
        }
    )
def get_result_data() -> pd.DataFrame:
    """Load the leaderboard results from the published Google Sheet.

    Reads the CSV export at ``GOOGLE_SHEET_URL`` with ``pandas.read_csv`` and
    returns the resulting DataFrame unchanged. Any error raised by
    ``read_csv`` (e.g. when the URL is unreachable) propagates to the caller.
    """
    return pd.read_csv(GOOGLE_SHEET_URL)
def init_leaderboard(dataframe):
    """Build the read-only ``Leaderboard`` component for a results table.

    Args:
        dataframe: The results table to display. Must be non-empty.

    Returns:
        A ``Leaderboard`` component configured from the column
        specifications declared on ``AutoEvalColumn``.

    Raises:
        ValueError: If ``dataframe`` is ``None`` or empty.
    """
    if dataframe is None or dataframe.empty:
        raise ValueError("Leaderboard DataFrame is empty or None.")

    # Gather the spec dicts once instead of re-scanning the class namespace
    # for every argument; non-dict entries (dunders etc.) are skipped and the
    # class namespace keeps definition order.
    column_specs = [spec for spec in vars(AutoEvalColumn).values() if isinstance(spec, dict)]

    return Leaderboard(
        value=dataframe,
        # NOTE(review): this list follows AutoEvalColumn's order, not the
        # DataFrame's columns - confirm the sheet's columns line up with it.
        datatype=[spec["type"] for spec in column_specs],
        select_columns=SelectColumns(
            default_selection=[spec["name"] for spec in column_specs if spec["displayed_by_default"]],
            cant_deselect=[spec["name"] for spec in column_specs if spec.get("never_hidden", False)],
            label="Select Columns to Display:",
        ),
        search_columns=["Model", "License"],
        filter_columns=[
            ColumnFilter("License", type="checkboxgroup", label="License"),
            # NOTE(review): "Model Size" is not declared in AutoEvalColumn;
            # presumably it is a column of the loaded sheet - verify.
            ColumnFilter("Model Size", type="checkboxgroup", label="Model Size"),
        ],
        interactive=False,
    )
# Initialize the Gradio interface
# Layout: title and project link on top, then three tabs (leaderboard table,
# dataset viewer, about text). Components created inside a `with` block are
# attached to that container.
demo = gr.Blocks()
with demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT)
    with gr.Tabs() as tabs:
        # NOTE(review): the tab labels were mojibake in the source. The About
        # icon decodes to the information sign; the other two emoji are
        # assumed - confirm against the deployed app.
        with gr.TabItem("🏆 Leaderboard"):
            # Results are fetched once, while the UI is being built.
            df = get_result_data()
            leaderboard = init_leaderboard(df)
        with gr.TabItem("🔍 Data Viewer"):
            dataset_name, dataset_split, sample_idx = create_data_viewer()
        with gr.TabItem("ℹ️ About"):
            gr.Markdown(ABOUT_TEXT)
demo.launch()