import json
import logging
from typing import Any
import gradio as gr
import numpy as np
import pandas as pd
from datasets import Dataset
from components.model_pipeline.model_pipeline import PipelineInterface, PipelineState
from submission import submit
from workflows.qb.multi_step_agent import MultiStepTossupAgent
from workflows.qb.simple_agent import SimpleTossupAgent
from workflows.structs import ModelStep, Workflow
from .commons import get_qid_selector
from .plotting import (
create_scatter_pyplot,
create_tossup_confidence_pyplot,
create_tossup_html,
update_tossup_plot,
)
from .utils import evaluate_prediction
# TODO: Error handling on run tossup and evaluate tossup and show correct messages
# TODO: ^^ Same for Bonus
def add_model_scores(model_outputs: list[dict], clean_answers: list[str], run_indices: list[int]) -> list[dict]:
"""Add model scores to the model outputs."""
for output, run_idx in zip(model_outputs, run_indices):
output["score"] = evaluate_prediction(output["answer"], clean_answers)
output["token_position"] = run_idx + 1
return model_outputs
def prepare_buzz_evals(
    run_indices: list[int], model_outputs: list[dict]
) -> list[tuple[int, dict]]:
    """Pair each run index with the model output produced at that position."""
    if not run_indices:
        logging.warning("No run indices provided, returning empty results")
        return []
    eval_points = []
    for run_idx, output in zip(run_indices, model_outputs):
        eval_points.append((int(run_idx), output))
    return eval_points
def initialize_eval_interface(example, model_outputs: list[dict]):
"""Initialize the interface with example text."""
try:
tokens = example["question"].split()
run_indices = example["run_indices"]
answer = example["answer_primary"]
clean_answers = example["clean_answers"]
eval_points = prepare_buzz_evals(run_indices, model_outputs)
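        # Each eval point pairs a run index with that run's output dict, e.g.
        # (42, {"answer": "...", "confidence": 0.91, "buzz": 1, ...}); the values
        # shown here are illustrative only.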
        if not tokens:
            return "No tokens found in the provided text.", pd.DataFrame(), "{}"
highlighted_index = next((int(i) for i, v in eval_points if v["buzz"] == 1), -1)
html_content = create_tossup_html(tokens, answer, clean_answers, run_indices, eval_points)
plot_data = create_tossup_confidence_pyplot(tokens, eval_points, highlighted_index)
# Store tokens, values, and buzzes as JSON for later use
state = json.dumps({"tokens": tokens, "values": eval_points})
return html_content, plot_data, state
except Exception as e:
logging.error(f"Error initializing interface: {e}", exc_info=True)
return f"Error initializing interface: {str(e)}
", pd.DataFrame(), "{}"
def process_tossup_results(results: list[dict], top_k_mode: bool = False) -> pd.DataFrame:
"""Process results from tossup mode and prepare visualization data."""
# Create DataFrame for detailed results
if top_k_mode:
raise ValueError("Top-k mode not supported for tossup mode")
return pd.DataFrame(
[
{
"Token Position": r["token_position"],
"Correct?": "✅" if r["score"] == 1 else "❌",
"Confidence": r["confidence"],
"Prediction": r["answer"],
}
for r in results
]
)
def validate_workflow(workflow: Workflow):
"""
Validate that a workflow is properly configured for the tossup task.
Args:
workflow (Workflow): The workflow to validate
Raises:
ValueError: If the workflow is not properly configured
"""
if not workflow.steps:
raise ValueError("Workflow must have at least one step")
# Ensure all steps are properly configured
for step_id, step in workflow.steps.items():
validate_model_step(step)
# Check that the workflow has the correct structure
input_vars = set(workflow.inputs)
if "question" not in input_vars:
raise ValueError("Workflow must have 'question' as an input")
output_vars = set(workflow.outputs)
if not any("answer" in out_var for out_var in output_vars):
raise ValueError("Workflow must produce an 'answer' as output")
if not any("confidence" in out_var for out_var in output_vars):
raise ValueError("Workflow must produce a 'confidence' score as output")
def validate_model_step(model_step: ModelStep):
"""
Validate that a model step is properly configured for the tossup task.
Args:
model_step (ModelStep): The model step to validate
Raises:
ValueError: If the model step is not properly configured
"""
# Check required fields
if not model_step.model or not model_step.provider:
raise ValueError("Model step must have both model and provider specified")
if model_step.call_type != "llm":
raise ValueError("Model step must have call_type 'llm'")
# Validate temperature for LLM steps
if model_step.temperature is None:
raise ValueError("Temperature must be specified for LLM model steps")
if not (0.0 <= model_step.temperature <= 1.0):
raise ValueError(f"Temperature must be between 0.0 and 1.0, got {model_step.temperature}")
# Validate input fields
input_field_names = {field.name for field in model_step.input_fields}
if "question" not in input_field_names:
raise ValueError("Model step must have a 'question' input field")
# Validate output fields
output_field_names = {field.name for field in model_step.output_fields}
if "answer" not in output_field_names:
raise ValueError("Model step must have an 'answer' output field")
if "confidence" not in output_field_names:
raise ValueError("Model step must have a 'confidence' output field")
# Validate confidence output field is of type float
for field in model_step.output_fields:
if field.name == "confidence" and field.type != "float":
raise ValueError("The 'confidence' output field must be of type 'float'")
class TossupInterface:
"""Gradio interface for the Tossup mode."""
def __init__(self, app: gr.Blocks, dataset: Dataset, model_options: dict, defaults: dict):
"""Initialize the Tossup interface."""
logging.info(f"Initializing Tossup interface with dataset size: {len(dataset)}")
self.ds = dataset
self.model_options = model_options
self.app = app
self.defaults = defaults
self.output_state = gr.State(value="{}")
self.render()
def _render_model_interface(self, workflow: Workflow, simple: bool = True):
"""Render the model interface."""
self.pipeline_interface = PipelineInterface(
workflow,
simple=simple,
model_options=list(self.model_options.keys()),
)
with gr.Row():
self.buzz_t_slider = gr.Slider(
minimum=0.5,
maximum=1.0,
value=self.defaults["buzz_threshold"],
step=0.01,
label="Buzz Threshold",
)
self.early_stop_checkbox = gr.Checkbox(
value=self.defaults["early_stop"],
label="Early Stop",
info="Stop early if already buzzed",
)
def _render_qb_interface(self):
"""Render the quizbowl interface."""
with gr.Row(elem_classes="bonus-header-row form-inline"):
self.qid_selector = get_qid_selector(len(self.ds))
self.run_btn = gr.Button("Run on Tossup Question", variant="secondary")
self.question_display = gr.HTML(label="Question", elem_id="tossup-question-display")
with gr.Row():
self.confidence_plot = gr.Plot(
label="Buzz Confidence",
format="webp",
)
self.results_table = gr.DataFrame(
label="Model Outputs",
value=pd.DataFrame(columns=["Token Position", "Correct?", "Confidence", "Prediction"]),
)
with gr.Row():
self.eval_btn = gr.Button("Evaluate", variant="primary")
with gr.Accordion("Model Submission", elem_classes="model-submission-accordion", open=True):
with gr.Row():
self.model_name_input = gr.Textbox(label="Model Name")
self.description_input = gr.Textbox(label="Description")
with gr.Row():
gr.LoginButton()
self.submit_btn = gr.Button("Submit", variant="primary")
self.submit_status = gr.HTML(label="Submission Status")
def render(self):
"""Create the Gradio interface."""
self.hidden_input = gr.Textbox(value="", visible=False, elem_id="hidden-index")
workflow = self.defaults["init_workflow"]
with gr.Row():
# Model Panel
with gr.Column(scale=1):
self._render_model_interface(workflow, simple=self.defaults["simple_workflow"])
with gr.Column(scale=1):
self._render_qb_interface()
self._setup_event_listeners()
def validate_workflow(self, pipeline_state: PipelineState):
"""Validate the workflow."""
try:
validate_workflow(pipeline_state.workflow)
except Exception as e:
raise gr.Error(f"Error validating workflow: {str(e)}")
def get_new_question_html(self, question_id: int) -> str:
"""Get the HTML for a new question."""
if question_id is None:
logging.error("Question ID is None. Setting to 1")
question_id = 1
try:
example = self.ds[question_id - 1]
question_tokens = example["question"].split()
return create_tossup_html(
question_tokens, example["answer_primary"], example["clean_answers"], example["run_indices"]
)
except Exception as e:
return f"Error loading question: {str(e)}"
def get_model_outputs(self, example: dict, pipeline_state: PipelineState, buzz_threshold: float, early_stop: bool):
"""Get the model outputs for a given question ID."""
question_runs = []
tokens = example["question"].split()
for run_idx in example["run_indices"]:
question_runs.append(" ".join(tokens[: run_idx + 1]))
workflow = pipeline_state.workflow
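        # Multi-step workflows are handled by the multi-step agent; single-step
        # workflows use the simple agent.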
if len(workflow.steps) > 1:
agent = MultiStepTossupAgent(workflow, buzz_threshold)
else:
agent = SimpleTossupAgent(workflow, buzz_threshold)
outputs = list(agent.run(question_runs, early_stop=early_stop))
outputs = add_model_scores(outputs, example["clean_answers"], example["run_indices"])
return outputs
def single_run(
self,
question_id: int,
pipeline_state: PipelineState,
buzz_threshold: float,
early_stop: bool = True,
    ) -> tuple[str, Any, Any, Any]:
"""Run the agent in tossup mode with a system prompt."""
try:
# Validate inputs
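            # The question selector is 1-indexed; convert to a 0-indexed dataset position.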
question_id = int(question_id - 1)
            if not self.ds or question_id < 0 or question_id >= len(self.ds):
                return "Invalid question ID or dataset not loaded", None, None, None
example = self.ds[question_id]
outputs = self.get_model_outputs(example, pipeline_state, buzz_threshold, early_stop)
# Process results and prepare visualization data
tokens_html, plot_data, output_state = initialize_eval_interface(example, outputs)
df = process_tossup_results(outputs)
return (
tokens_html,
gr.update(value=plot_data, label=f"Buzz Confidence on Question {question_id + 1}"),
gr.update(value=output_state),
gr.update(value=df, label=f"Model Outputs for Question {question_id + 1}"),
)
except Exception as e:
import traceback
error_msg = f"Error: {str(e)}\n{traceback.format_exc()}"
return error_msg, None, None
def evaluate(self, pipeline_state: PipelineState, buzz_threshold: float, progress: gr.Progress = gr.Progress()):
"""Evaluate the tossup questions."""
try:
# Validate inputs
            if not self.ds or not self.ds.num_rows:
                return "No dataset loaded", None
buzz_counts = 0
correct_buzzes = 0
token_positions = []
correctness = []
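            # For each question, inspect the final output of the (early-stopped) run:
            # it records whether the model buzzed and whether that buzz was correct.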
for example in progress.tqdm(self.ds, desc="Evaluating tossup questions"):
model_outputs = self.get_model_outputs(example, pipeline_state, buzz_threshold, early_stop=True)
if model_outputs[-1]["buzz"]:
buzz_counts += 1
if model_outputs[-1]["score"] == 1:
correct_buzzes += 1
token_positions.append(model_outputs[-1]["token_position"])
correctness.append(model_outputs[-1]["score"])
            # Avoid a division-by-zero error when the model never buzzes.
            buzz_accuracy = correct_buzzes / buzz_counts if buzz_counts else 0.0
df = pd.DataFrame(
[
{
"Avg Buzz Position": f"{np.mean(token_positions):.2f}",
"Buzz Accuracy": f"{buzz_accuracy:.2%}",
"Total Score": f"{correct_buzzes}/{len(self.ds)}",
}
]
)
plot_data = create_scatter_pyplot(token_positions, correctness)
return (
gr.update(value=df, label="Scores on Sample Set"),
gr.update(value=plot_data, label="Buzz Positions on Sample Set"),
)
except Exception:
import traceback
logging.error(f"Error evaluating tossups: {traceback.format_exc()}")
return "Error evaluating tossups", None, None
def submit_model(
        self, model_name: str, description: str, pipeline_state: PipelineState, profile: gr.OAuthProfile | None = None
):
"""Submit the model output."""
return submit.submit_model(model_name, description, pipeline_state.workflow, "tossup", profile)
def _setup_event_listeners(self):
gr.on(
triggers=[self.app.load, self.qid_selector.change],
fn=self.get_new_question_html,
inputs=[self.qid_selector],
outputs=[self.question_display],
)
self.run_btn.click(
self.pipeline_interface.validate_workflow,
inputs=[self.pipeline_interface.pipeline_state],
outputs=[self.pipeline_interface.pipeline_state],
).success(
self.single_run,
inputs=[
self.qid_selector,
self.pipeline_interface.pipeline_state,
self.buzz_t_slider,
self.early_stop_checkbox,
],
outputs=[
self.question_display,
self.confidence_plot,
self.output_state,
self.results_table,
],
)
self.eval_btn.click(
fn=self.evaluate,
inputs=[self.pipeline_interface.pipeline_state, self.buzz_t_slider],
outputs=[self.results_table, self.confidence_plot],
)
self.submit_btn.click(
fn=self.submit_model,
inputs=[
self.model_name_input,
self.description_input,
self.pipeline_interface.pipeline_state,
],
outputs=[self.submit_status],
)
self.hidden_input.change(
fn=update_tossup_plot,
inputs=[self.hidden_input, self.output_state],
outputs=[self.confidence_plot],
)
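
# Illustrative wiring of this interface inside a Blocks app (names and values
# other than the keys actually read from `defaults` and `model_options` above
# are assumptions):
#
#   with gr.Blocks() as demo:
#       TossupInterface(
#           app=demo,
#           dataset=tossup_dataset,  # a `datasets.Dataset` of tossup examples
#           model_options=model_options,
#           defaults={
#               "buzz_threshold": 0.85,
#               "early_stop": True,
#               "simple_workflow": True,
#               "init_workflow": init_workflow,  # a `Workflow` instance
#           },
#       )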