import json
import logging
from typing import Any

import gradio as gr
import numpy as np
import pandas as pd
from datasets import Dataset

from components.model_pipeline.model_pipeline import PipelineInterface, PipelineState
from submission import submit
from workflows.qb.multi_step_agent import MultiStepTossupAgent
from workflows.qb.simple_agent import SimpleTossupAgent
from workflows.structs import ModelStep, Workflow

from .commons import get_qid_selector
from .plotting import (
    create_scatter_pyplot,
    create_tossup_confidence_pyplot,
    create_tossup_html,
    update_tossup_plot,
)
from .utils import evaluate_prediction

# TODO: Error handling on run tossup and evaluate tossup and show correct messages
# TODO: ^^ Same for Bonus


def add_model_scores(model_outputs: list[dict], clean_answers: list[str], run_indices: list[int]) -> list[dict]:
    """Add evaluation scores and token positions to the model outputs."""
    for output, run_idx in zip(model_outputs, run_indices):
        output["score"] = evaluate_prediction(output["answer"], clean_answers)
        output["token_position"] = run_idx + 1
    return model_outputs


def prepare_buzz_evals(run_indices: list[int], model_outputs: list[dict]) -> list[tuple[int, dict]]:
    """Pair each run index with its corresponding model output for evaluation."""
    if not run_indices:
        logging.warning("No run indices provided, returning empty results")
        return []

    eval_points = []
    for i, v in zip(run_indices, model_outputs):
        eval_points.append((int(i), v))

    return eval_points


def initialize_eval_interface(example, model_outputs: list[dict]):
    """Initialize the interface with example text."""
    try:
        tokens = example["question"].split()
        run_indices = example["run_indices"]
        answer = example["answer_primary"]
        clean_answers = example["clean_answers"]
        eval_points = prepare_buzz_evals(run_indices, model_outputs)

        if not tokens:
            return "<div>No tokens found in the provided text.</div>", pd.DataFrame(), "{}"
        highlighted_index = next((int(i) for i, v in eval_points if v["buzz"] == 1), -1)
        html_content = create_tossup_html(tokens, answer, clean_answers, run_indices, eval_points)
        plot_data = create_tossup_confidence_pyplot(tokens, eval_points, highlighted_index)

        # Store tokens, values, and buzzes as JSON for later use
        state = json.dumps({"tokens": tokens, "values": eval_points})

        return html_content, plot_data, state
    except Exception as e:
        logging.error(f"Error initializing interface: {e}", exc_info=True)
        return f"<div>Error initializing interface: {str(e)}</div>", pd.DataFrame(), "{}"


def process_tossup_results(results: list[dict], top_k_mode: bool = False) -> pd.DataFrame:
    """Process results from tossup mode and prepare visualization data."""
    # Create a DataFrame for detailed results
    if top_k_mode:
        raise ValueError("Top-k mode is not supported for tossup mode")
    return pd.DataFrame(
        [
            {
                "Token Position": r["token_position"],
                "Correct?": "✅" if r["score"] == 1 else "❌",
                "Confidence": r["confidence"],
                "Prediction": r["answer"],
            }
            for r in results
        ]
    )


def validate_workflow(workflow: Workflow):
    """
    Validate that a workflow is properly configured for the tossup task.

    Args:
        workflow (Workflow): The workflow to validate

    Raises:
        ValueError: If the workflow is not properly configured
    """
    if not workflow.steps:
        raise ValueError("Workflow must have at least one step")

    # Ensure all steps are properly configured
    for step_id, step in workflow.steps.items():
        validate_model_step(step)

    # Check that the workflow has the correct structure
    input_vars = set(workflow.inputs)
    if "question" not in input_vars:
        raise ValueError("Workflow must have 'question' as an input")

    output_vars = set(workflow.outputs)
    if not any("answer" in out_var for out_var in output_vars):
        raise ValueError("Workflow must produce an 'answer' as output")
    if not any("confidence" in out_var for out_var in output_vars):
        raise ValueError("Workflow must produce a 'confidence' score as output")


def validate_model_step(model_step: ModelStep):
    """
    Validate that a model step is properly configured for the tossup task.

    Args:
        model_step (ModelStep): The model step to validate

    Raises:
        ValueError: If the model step is not properly configured
    """
    # Check required fields
    if not model_step.model or not model_step.provider:
        raise ValueError("Model step must have both model and provider specified")

    if model_step.call_type != "llm":
        raise ValueError("Model step must have call_type 'llm'")

    # Validate temperature for LLM steps
    if model_step.temperature is None:
        raise ValueError("Temperature must be specified for LLM model steps")
    if not (0.0 <= model_step.temperature <= 1.0):
        raise ValueError(f"Temperature must be between 0.0 and 1.0, got {model_step.temperature}")

    # Validate input fields
    input_field_names = {field.name for field in model_step.input_fields}
    if "question" not in input_field_names:
        raise ValueError("Model step must have a 'question' input field")

    # Validate output fields
    output_field_names = {field.name for field in model_step.output_fields}
    if "answer" not in output_field_names:
        raise ValueError("Model step must have an 'answer' output field")
    if "confidence" not in output_field_names:
        raise ValueError("Model step must have a 'confidence' output field")

    # Validate that the confidence output field is of type float
    for field in model_step.output_fields:
        if field.name == "confidence" and field.type != "float":
            raise ValueError("The 'confidence' output field must be of type 'float'")


class TossupInterface:
    """Gradio interface for the Tossup mode."""

    def __init__(self, app: gr.Blocks, dataset: Dataset, model_options: dict, defaults: dict):
        """Initialize the Tossup interface."""
        logging.info(f"Initializing Tossup interface with dataset size: {len(dataset)}")
        self.ds = dataset
        self.model_options = model_options
        self.app = app
        self.defaults = defaults
        self.output_state = gr.State(value="{}")
        self.render()

    def _render_model_interface(self, workflow: Workflow, simple: bool = True):
        """Render the model pipeline interface."""
        self.pipeline_interface = PipelineInterface(
            workflow,
            simple=simple,
            model_options=list(self.model_options.keys()),
        )
        with gr.Row():
            self.buzz_t_slider = gr.Slider(
                minimum=0.5,
                maximum=1.0,
                value=self.defaults["buzz_threshold"],
                step=0.01,
                label="Buzz Threshold",
            )
            self.early_stop_checkbox = gr.Checkbox(
                value=self.defaults["early_stop"],
                label="Early Stop",
                info="Stop early if already buzzed",
            )

    def _render_qb_interface(self):
        """Render the quizbowl interface."""
        with gr.Row(elem_classes="bonus-header-row form-inline"):
            self.qid_selector = get_qid_selector(len(self.ds))
            self.run_btn = gr.Button("Run on Tossup Question", variant="secondary")
        self.question_display = gr.HTML(label="Question", elem_id="tossup-question-display")
        with gr.Row():
            self.confidence_plot = gr.Plot(
                label="Buzz Confidence",
                format="webp",
            )
            self.results_table = gr.DataFrame(
                label="Model Outputs",
                value=pd.DataFrame(columns=["Token Position", "Correct?", "Confidence", "Prediction"]),
            )
        with gr.Row():
            self.eval_btn = gr.Button("Evaluate", variant="primary")

        with gr.Accordion("Model Submission", elem_classes="model-submission-accordion", open=True):
            with gr.Row():
                self.model_name_input = gr.Textbox(label="Model Name")
                self.description_input = gr.Textbox(label="Description")
            with gr.Row():
                gr.LoginButton()
                self.submit_btn = gr.Button("Submit", variant="primary")
                self.submit_status = gr.HTML(label="Submission Status")

    def render(self):
        """Create the Gradio interface."""
        self.hidden_input = gr.Textbox(value="", visible=False, elem_id="hidden-index")
        workflow = self.defaults["init_workflow"]

        with gr.Row():
            # Model Panel
            with gr.Column(scale=1):
                self._render_model_interface(workflow, simple=self.defaults["simple_workflow"])

            with gr.Column(scale=1):
                self._render_qb_interface()

        self._setup_event_listeners()

    def validate_workflow(self, pipeline_state: PipelineState):
        """Validate the workflow."""
        try:
            validate_workflow(pipeline_state.workflow)
        except Exception as e:
            raise gr.Error(f"Error validating workflow: {str(e)}")

    def get_new_question_html(self, question_id: int) -> str:
        """Get the HTML for a new question."""
        if question_id is None:
            logging.error("Question ID is None. Setting to 1")
            question_id = 1
        try:
            example = self.ds[question_id - 1]
            question_tokens = example["question"].split()
            return create_tossup_html(
                question_tokens, example["answer_primary"], example["clean_answers"], example["run_indices"]
            )
        except Exception as e:
            return f"Error loading question: {str(e)}"

    def get_model_outputs(self, example: dict, pipeline_state: PipelineState, buzz_threshold: float, early_stop: bool):
        """Get the model outputs for a given question ID."""
        question_runs = []
        tokens = example["question"].split()
        for run_idx in example["run_indices"]:
            question_runs.append(" ".join(tokens[: run_idx + 1]))

        # Use the multi-step agent only when the workflow has more than one step
        workflow = pipeline_state.workflow
        if len(workflow.steps) > 1:
            agent = MultiStepTossupAgent(workflow, buzz_threshold)
        else:
            agent = SimpleTossupAgent(workflow, buzz_threshold)
        outputs = list(agent.run(question_runs, early_stop=early_stop))
        outputs = add_model_scores(outputs, example["clean_answers"], example["run_indices"])
        return outputs

    def single_run(
        self,
        question_id: int,
        pipeline_state: PipelineState,
        buzz_threshold: float,
        early_stop: bool = True,
    ) -> tuple[str, Any, Any, Any]:
        """Run the agent in tossup mode with a system prompt."""
        try:
            # Validate inputs
            question_id = int(question_id - 1)
            if not self.ds or question_id < 0 or question_id >= len(self.ds):
                return "Invalid question ID or dataset not loaded", None, None, None
            example = self.ds[question_id]
            outputs = self.get_model_outputs(example, pipeline_state, buzz_threshold, early_stop)

            # Process results and prepare visualization data
            tokens_html, plot_data, output_state = initialize_eval_interface(example, outputs)
            df = process_tossup_results(outputs)
            return (
                tokens_html,
                gr.update(value=plot_data, label=f"Buzz Confidence on Question {question_id + 1}"),
                gr.update(value=output_state),
                gr.update(value=df, label=f"Model Outputs for Question {question_id + 1}"),
            )
        except Exception as e:
            import traceback

            error_msg = f"Error: {str(e)}\n{traceback.format_exc()}"
            return error_msg, None, None, None

    def evaluate(self, pipeline_state: PipelineState, buzz_threshold: float, progress: gr.Progress = gr.Progress()):
        """Evaluate the tossup questions."""
        try:
            # Validate inputs
            if not self.ds or not self.ds.num_rows:
                return "No dataset loaded", None
            buzz_counts = 0
            correct_buzzes = 0
            token_positions = []
            correctness = []
            for example in progress.tqdm(self.ds, desc="Evaluating tossup questions"):
                model_outputs = self.get_model_outputs(example, pipeline_state, buzz_threshold, early_stop=True)
                if model_outputs[-1]["buzz"]:
                    buzz_counts += 1
                    if model_outputs[-1]["score"] == 1:
                        correct_buzzes += 1
                token_positions.append(model_outputs[-1]["token_position"])
                correctness.append(model_outputs[-1]["score"])
            # Guard against division by zero when the model never buzzes
            buzz_accuracy = correct_buzzes / buzz_counts if buzz_counts else 0.0
            df = pd.DataFrame(
                [
                    {
                        "Avg Buzz Position": f"{np.mean(token_positions):.2f}",
                        "Buzz Accuracy": f"{buzz_accuracy:.2%}",
                        "Total Score": f"{correct_buzzes}/{len(self.ds)}",
                    }
                ]
            )
            plot_data = create_scatter_pyplot(token_positions, correctness)
            return (
                gr.update(value=df, label="Scores on Sample Set"),
                gr.update(value=plot_data, label="Buzz Positions on Sample Set"),
            )
        except Exception:
            import traceback

            logging.error(f"Error evaluating tossups: {traceback.format_exc()}")
            return "Error evaluating tossups", None

    def submit_model(
        self, model_name: str, description: str, pipeline_state: PipelineState, profile: gr.OAuthProfile = None
    ):
        """Submit the model output."""
        return submit.submit_model(model_name, description, pipeline_state.workflow, "tossup", profile)

    def _setup_event_listeners(self):
        """Wire up event handlers for question loading, runs, evaluation, and submission."""
        gr.on(
            triggers=[self.app.load, self.qid_selector.change],
            fn=self.get_new_question_html,
            inputs=[self.qid_selector],
            outputs=[self.question_display],
        )
        self.run_btn.click(
            self.pipeline_interface.validate_workflow,
            inputs=[self.pipeline_interface.pipeline_state],
            outputs=[self.pipeline_interface.pipeline_state],
        ).success(
            self.single_run,
            inputs=[
                self.qid_selector,
                self.pipeline_interface.pipeline_state,
                self.buzz_t_slider,
                self.early_stop_checkbox,
            ],
            outputs=[
                self.question_display,
                self.confidence_plot,
                self.output_state,
                self.results_table,
            ],
        )

        self.eval_btn.click(
            fn=self.evaluate,
            inputs=[self.pipeline_interface.pipeline_state, self.buzz_t_slider],
            outputs=[self.results_table, self.confidence_plot],
        )

        self.submit_btn.click(
            fn=self.submit_model,
            inputs=[
                self.model_name_input,
                self.description_input,
                self.pipeline_interface.pipeline_state,
            ],
            outputs=[self.submit_status],
        )

        self.hidden_input.change(
            fn=update_tossup_plot,
            inputs=[self.hidden_input, self.output_state],
            outputs=[self.confidence_plot],
        )
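
# Usage sketch (not part of the original module): a minimal example of how this
# interface is expected to be mounted inside a Gradio Blocks app, based on the
# constructor signature and the `defaults` keys read above. The dataset id, the
# model registry contents, and `build_default_workflow` are hypothetical
# placeholders, not names defined in this repository.
#
#   import gradio as gr
#   from datasets import load_dataset
#
#   ds = load_dataset("your-org/tossup-questions", split="eval")  # hypothetical dataset id
#   model_options = {"OpenAI/gpt-4o-mini": {...}}                 # hypothetical model registry
#   defaults = {
#       "buzz_threshold": 0.85,
#       "early_stop": True,
#       "simple_workflow": True,
#       "init_workflow": build_default_workflow(),                # a Workflow instance
#   }
#   with gr.Blocks() as demo:
#       TossupInterface(demo, ds, model_options, defaults)
#   demo.launch()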