import gradio as gr import pandas as pd from datasets import load_dataset df_final = pd.read_pickle("./df_final.pkl") dataset = load_dataset("XAI/vlmsareblind") def show_row(row_index, selected_task): task_df = df_final[df_final["task"] == selected_task] row = task_df.iloc[int(row_index)] custom_id = int(row["custom_id"]) image = dataset["valid"][custom_id]["image"] prompt = dataset["valid"][custom_id]["prompt"] model_output = row["content_raw"] ground_truth = row["gt"] task = row["task"] is_correct = row["is_correct"] return image, prompt, model_output, ground_truth, task, is_correct def update_slider(selected_task): task_df = df_final[df_final["task"] == selected_task] return gr.Slider( minimum=0, maximum=len(task_df) - 1, step=1, label=f"Select Row Index (0-{len(task_df) - 1})", value=0, ) # Create accuracy breakdown dataframe accuracy_breakdown = ( df_final.groupby("task")["is_correct"] .mean() .sort_values(ascending=False) .mul(100) .apply(lambda x: f"{x:.2f}") .reset_index() ) accuracy_breakdown.columns = ["Task", "Accuracy (%)"] # Create the Gradio interface with gr.Blocks() as app: gr.Markdown("# VLMs Are Blind Results Review (GPT-4o-mini)") gr.HTML( """
This is a review of results from the GPT-4 model on the VLMs Are Blind dataset.
Project Website |
arXiv Paper