# Captured from a Hugging Face Space page (page status when captured: Sleeping).
import gradio as gr
import pandas as pd
from datasets import load_dataset
from PIL import Image, ImageOps
# Inputs loaded once at import time and shared by every callback below.
RESULTS_PICKLE_PATH = "./df_final.pkl"
DATASET_REPO_ID = "XAI/vlmsareblind"

# Per-example results table; the rest of this file reads its "task",
# "custom_id", "content_raw", "gt" and "is_correct" columns.
# NOTE(review): unpickling executes arbitrary code, so this file must only
# ever come from a trusted source (it is expected to ship with the app).
df_final = pd.read_pickle(RESULTS_PICKLE_PATH)

# Benchmark dataset; the callbacks read images and prompts from its "valid" split.
dataset = load_dataset(DATASET_REPO_ID)
def show_row(row_index, selected_task):
    """Return everything needed to display one result row of a task.

    Args:
        row_index: Zero-based position of the row within the rows of
            ``selected_task`` (anything accepted by ``int()``, e.g. the
            numeric value produced by a slider).
        selected_task: Value of the ``"task"`` column to filter ``df_final`` by.

    Returns:
        A 6-tuple ``(image_with_padding, prompt, model_output, ground_truth,
        task, is_correct)`` where the image and prompt come from the
        ``"valid"`` split of ``dataset`` and the remaining values from the
        selected ``df_final`` row.

    Raises:
        IndexError: If ``row_index`` is outside the rows of ``selected_task``
            (raised by ``.iloc``).
    """
    task_df = df_final[df_final["task"] == selected_task]
    row = task_df.iloc[int(row_index)]

    # Look the dataset record up once and reuse it for both the image and
    # the prompt, instead of indexing (and materializing) the same record twice.
    sample = dataset["valid"][int(row["custom_id"])]
    image = sample["image"]

    # Add white padding to the image: half the width on the left and right,
    # half the height on the top and bottom.
    width, height = image.size
    image_with_padding = ImageOps.expand(
        image,
        border=(width // 2, height // 2),
        fill="white",
    )

    return (
        image_with_padding,
        sample["prompt"],
        row["content_raw"],
        row["gt"],
        row["task"],
        row["is_correct"],
    )
def update_slider(selected_task):
    """Build a row-index slider sized to the rows of ``selected_task``.

    The slider runs from 0 to the last row position of the task (row count
    minus one) in steps of 1 and is reset to 0.
    """
    matching_rows = df_final[df_final["task"] == selected_task]
    last_index = len(matching_rows) - 1
    return gr.Slider(
        label=f"Select Row Index (0-{last_index})",
        minimum=0,
        maximum=last_index,
        step=1,
        value=0,
    )
# Create accuracy breakdown dataframe: one row per task, best task first.
_mean_correct_by_task = df_final.groupby("task")["is_correct"].mean()
# Sort while the values are still numeric; they become strings below.
_accuracy_percent = _mean_correct_by_task.sort_values(ascending=False).mul(100)
accuracy_breakdown = _accuracy_percent.apply("{:.2f}".format).reset_index()
accuracy_breakdown.columns = ["Task", "Accuracy (%)"]
# Create the Gradio interface

# Task list and the size of the initially selected task, computed once
# instead of re-filtering df_final for every widget argument.
_task_choices = df_final["task"].unique().tolist()
_initial_task = _task_choices[0]
_initial_last_index = len(df_final[df_final["task"] == _initial_task]) - 1


def _show_first_row(selected_task):
    """Display row 0 of ``selected_task`` (used when the task changes)."""
    return show_row(0, selected_task)


with gr.Blocks() as app:
    gr.Markdown("# BlindTest Results Review (GPT-4o mini)")
    gr.HTML(
        """
        <p style="text-align: center;">
        This is a review of results from the GPT-4o mini model on the VLMs Are Blind dataset.
        <br>
        <a href="https://vlmsareblind.github.io/" target="_blank">Project Website</a>
        <a href="https://arxiv.org/abs/2407.06581" target="_blank">arXiv Paper</a>
        </p>
        """
    )

    # Controls: which task to inspect and which row within that task.
    with gr.Row():
        task_dropdown = gr.Dropdown(
            choices=_task_choices,
            label="Select Task",
            value=_initial_task,
        )
        row_selector = gr.Slider(
            minimum=0,
            maximum=_initial_last_index,
            step=1,
            label=f"Select Row Index (0-{_initial_last_index})",
            value=0,
        )

    # Details of the selected row: image on the left, text fields on the right.
    with gr.Row():
        with gr.Column(scale=2):
            image_output = gr.Image(label="Image", type="pil")
        with gr.Column(scale=3):
            prompt_output = gr.Textbox(label="Prompt", lines=3)
            model_output = gr.Textbox(label="Model Output", lines=2)
            ground_truth = gr.Textbox(label="Ground Truth", lines=2)
            task = gr.Textbox(label="Task")
            is_correct = gr.Checkbox(label="Is Correct")

    gr.Markdown("## Accuracy Breakdown by Task")
    gr.DataFrame(accuracy_breakdown)

    # Components filled by show_row, in the order of its return tuple.
    row_outputs = [
        image_output,
        prompt_output,
        model_output,
        ground_truth,
        task,
        is_correct,
    ]

    # Changing the task resizes/resets the slider and shows the task's first row.
    task_dropdown.change(update_slider, inputs=task_dropdown, outputs=row_selector)
    task_dropdown.change(_show_first_row, inputs=task_dropdown, outputs=row_outputs)

    # Moving the slider shows the chosen row of the current task.
    row_selector.change(
        show_row,
        inputs=[row_selector, task_dropdown],
        outputs=row_outputs,
    )

# Launch the app
app.launch()