import json
import logging
from typing import Any

import gradio as gr
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from datasets import Dataset

from components.model_pipeline.model_pipeline import PipelineInterface, PipelineState
from submission import submit
from workflows.qb.multi_step_agent import MultiStepBonusAgent
from workflows.qb.simple_agent import SimpleBonusAgent
from workflows.structs import ModelStep, Workflow

from .plotting import (
    create_pyplot,
    create_scatter_pyplot,
    evaluate_buzz,
    update_plot,
)


def evaluate_bonus_part(prediction: str, clean_answers: list[str]) -> float:
    """Evaluate a single bonus part."""
    return evaluate_buzz(prediction, clean_answers)


def process_bonus_results(results: list[dict]) -> pd.DataFrame:
    """Process results from bonus mode and prepare visualization data."""
    return pd.DataFrame(
        [
            {
                "Part": f"Part {r['part_number']}",
                "Correct?": "✅" if r["score"] == 1 else "❌",
                "Confidence": r["confidence"],
                "Prediction": r["answer"],
                "Explanation": r["explanation"],
            }
            for r in results
        ]
    )


def initialize_eval_interface(example: dict, model_outputs: list[dict]):
    """Initialize the interface with example text."""
    try:
        # Create HTML for leadin and parts (plain <div> blocks)
        leadin_html = f"<div>{example['leadin']}</div>"
        parts_html = []
        for i, part in enumerate(example["parts"]):
            parts_html.append(f"<div>Part {i + 1}: {part['part']}</div>")
        html_content = f"{leadin_html}<div>{''.join(parts_html)}</div>"

        # Create confidence plot data
        plot_data = create_bonus_confidence_plot(example["parts"], model_outputs)

        # Store state
        state = json.dumps({"parts": example["parts"], "outputs": model_outputs})

        return html_content, plot_data, state
    except Exception as e:
        logging.error(f"Error initializing interface: {e}", exc_info=True)
        return f"<div>Error initializing interface: {str(e)}</div>", pd.DataFrame(), "{}"


def create_bonus_confidence_plot(parts: list[dict], model_outputs: list[dict]):
    """Create confidence plot for bonus parts."""
    plt.style.use("ggplot")
    fig = plt.figure(figsize=(10, 6))
    ax = fig.add_subplot(111)

    # Plot confidence for each part
    x = range(1, len(parts) + 1)
    confidences = [output["confidence"] for output in model_outputs]
    scores = [output["score"] for output in model_outputs]

    # Plot confidence bars
    bars = ax.bar(x, confidences, color="#4698cf")

    # Color bars based on correctness
    for i, score in enumerate(scores):
        bars[i].set_color("green" if score == 1 else "red")

    ax.set_title("Part Confidence")
    ax.set_xlabel("Part Number")
    ax.set_ylabel("Confidence")
    ax.set_xticks(x)
    ax.set_xticklabels([f"Part {i}" for i in x])

    return fig


def validate_workflow(workflow: Workflow):
    """Validate that a workflow is properly configured for the bonus task."""
    if not workflow.steps:
        raise ValueError("Workflow must have at least one step")

    # Ensure all steps are properly configured
    for step in workflow.steps.values():
        validate_model_step(step)

    # Check that the workflow has the correct structure
    input_vars = set(workflow.inputs)
    if "leadin" not in input_vars or "part" not in input_vars:
        raise ValueError("Workflow must have 'leadin' and 'part' as inputs")

    output_vars = set(workflow.outputs)
    if not all(var in output_vars for var in ["answer", "confidence", "explanation"]):
        raise ValueError("Workflow must produce 'answer', 'confidence', and 'explanation' as outputs")


def validate_model_step(model_step: ModelStep):
    """Validate that a model step is properly configured for the bonus task."""
    # Check required fields
    if not model_step.model or not model_step.provider:
        raise ValueError("Model step must have both model and provider specified")

    if model_step.call_type != "llm":
        raise ValueError("Model step must have call_type 'llm'")

    # Validate temperature for LLM steps
    if model_step.temperature is None:
        raise ValueError("Temperature must be specified for LLM model steps")
    if not (0.0 <= model_step.temperature <= 1.0):
        raise ValueError(f"Temperature must be between 0.0 and 1.0, got {model_step.temperature}")

    # Validate input fields
    input_field_names = {field.name for field in model_step.input_fields}
    if "leadin" not in input_field_names or "part" not in input_field_names:
        raise ValueError("Model step must have 'leadin' and 'part' input fields")

    # Validate output fields
    output_field_names = {field.name for field in model_step.output_fields}
    required_outputs = {"answer", "confidence", "explanation"}
    if not all(out in output_field_names for out in required_outputs):
        raise ValueError("Model step must have all required output fields: answer, confidence, explanation")

    # Validate that the confidence output field is of type float
    for field in model_step.output_fields:
        if field.name == "confidence" and field.type != "float":
            raise ValueError("The 'confidence' output field must be of type 'float'")


class BonusInterface:
    """Gradio interface for the Bonus mode."""

    def __init__(self, app: gr.Blocks, dataset: Dataset, model_options: dict, defaults: dict):
        """Initialize the Bonus interface."""
        logging.info(f"Initializing Bonus interface with dataset size: {len(dataset)}")
        self.ds = dataset
        self.model_options = model_options
        self.app = app
        self.defaults = defaults
        self.output_state = gr.State(value="{}")
        self.render()

    def _render_model_interface(self, workflow: Workflow, simple: bool = True):
        """Render the model interface."""
        self.pipeline_interface = PipelineInterface(
            workflow,
            simple=simple,
            model_options=list(self.model_options.keys()),
        )
        with gr.Row():
            self.run_btn = gr.Button("Run Bonus", variant="primary")

    def _render_qb_interface(self):
        """Render the quizbowl interface."""
        with gr.Row():
            self.qid_selector = gr.Number(
                label="Question ID", value=1, precision=0, minimum=1, maximum=len(self.ds), show_label=True, scale=0
            )
            self.answer_display = gr.Textbox(
                label="Answers", elem_id="answer-display", elem_classes="answer-box", interactive=False, scale=1
            )
            self.clean_answer_display = gr.Textbox(
                label="Acceptable Answers",
                elem_id="answer-display-2",
                elem_classes="answer-box",
                interactive=False,
                scale=2,
            )
        self.question_display = gr.HTML(label="Question", elem_id="question-display")
        with gr.Row():
            self.confidence_plot = gr.Plot(
                label="Part Confidence",
                format="webp",
            )
            self.results_table = gr.DataFrame(
                label="Model Outputs",
                value=pd.DataFrame(columns=["Part", "Correct?", "Confidence", "Prediction", "Explanation"]),
            )
        with gr.Row():
            self.eval_btn = gr.Button("Evaluate")

        with gr.Accordion("Model Submission", elem_classes="model-submission-accordion", open=True):
            with gr.Row():
                self.model_name_input = gr.Textbox(label="Model Name")
                self.description_input = gr.Textbox(label="Description")
            with gr.Row():
                gr.LoginButton()
                self.submit_btn = gr.Button("Submit")
            self.submit_status = gr.HTML(label="Submission Status")

    def render(self):
        """Create the Gradio interface."""
        self.hidden_input = gr.Textbox(value="", visible=False, elem_id="hidden-index")

        workflow = self.defaults["init_workflow"]

        with gr.Row():
            # Model Panel
            with gr.Column(scale=1):
                self._render_model_interface(workflow, simple=self.defaults["simple_workflow"])

            with gr.Column(scale=1):
                self._render_qb_interface()

        self._setup_event_listeners()

    def get_new_question_html(self, question_id: int):
        """Get the HTML for a new question."""
        example = self.ds[question_id - 1]
        leadin = example["leadin"]
        parts = example["parts"]

        # Create HTML for leadin and parts (plain <div> blocks)
        leadin_html = f"<div>{leadin}</div>"
        parts_html = []
        for i, part in enumerate(parts):
            parts_html.append(f"<div>{part['part']}</div>")
        parts_html_str = "".join(parts_html)
        html_content = f"<div>{leadin_html}{parts_html_str}</div>"

        # Format answers
        primary_answers = [f"{i + 1}. {part['answer_primary']}" for i, part in enumerate(parts)]
        clean_answers = []
        for i, part in enumerate(parts):
            part_answers = [a for a in part["clean_answers"] if len(a.split()) <= 6]
            clean_answers.append(f"{i + 1}. {', '.join(part_answers)}")

        return html_content, "\n".join(primary_answers), "\n".join(clean_answers)

    def get_model_outputs(self, example: dict, pipeline_state: PipelineState):
        """Get the model outputs for each part of a bonus example."""
        outputs = []
        leadin = example["leadin"]
        workflow = pipeline_state.workflow
        if len(workflow.steps) > 1:
            agent = MultiStepBonusAgent(workflow)
        else:
            agent = SimpleBonusAgent(workflow)

        for i, part in enumerate(example["parts"]):
            # Run model for each part
            part_output = agent.run(leadin, part["part"])

            # Add part number and evaluate score
            part_output["part_number"] = i + 1
            part_output["score"] = evaluate_bonus_part(part_output["answer"], part["clean_answers"])

            outputs.append(part_output)

        return outputs

    def run_bonus(
        self,
        question_id: int,
        pipeline_state: PipelineState,
    ) -> tuple[str, Any, Any, Any]:
        """Run the agent in bonus mode."""
        try:
            # Validate inputs
            question_id = int(question_id - 1)
            if not self.ds or question_id < 0 or question_id >= len(self.ds):
                return "Invalid question ID or dataset not loaded", None, None, None

            example = self.ds[question_id]
            outputs = self.get_model_outputs(example, pipeline_state)

            # Process results and prepare visualization data
            html_content, plot_data, output_state = initialize_eval_interface(example, outputs)
            df = process_bonus_results(outputs)

            return (
                html_content,
                gr.update(value=plot_data, label=f"Part Confidence on Question {question_id + 1}"),
                gr.update(value=output_state),
                gr.update(value=df, label=f"Model Outputs for Question {question_id + 1}"),
            )
        except Exception as e:
            import traceback

            error_msg = f"Error: {str(e)}\n{traceback.format_exc()}"
            return error_msg, None, None, None

    def evaluate_bonus(self, pipeline_state: PipelineState, progress: gr.Progress = gr.Progress()):
        """Evaluate the bonus questions."""
        try:
            # Validate inputs
            if not self.ds or not self.ds.num_rows:
                return "No dataset loaded", None

            total_correct = 0
            total_parts = 0
            part_scores = []
            part_numbers = []

            for example in progress.tqdm(self.ds, desc="Evaluating bonus questions"):
                model_outputs = self.get_model_outputs(example, pipeline_state)

                for output in model_outputs:
                    total_parts += 1
                    if output["score"] == 1:
                        total_correct += 1
                    part_scores.append(output["score"])
                    part_numbers.append(output["part_number"])

            accuracy = total_correct / total_parts
            df = pd.DataFrame(
                [
                    {
                        "Part Accuracy": f"{accuracy:.2%}",
                        "Total Score": f"{total_correct}/{total_parts}",
                        "Questions Evaluated": len(self.ds),
                    }
                ]
            )

            plot_data = create_scatter_pyplot(part_numbers, part_scores)
            return (
                gr.update(value=df, label="Scores on Sample Set"),
                gr.update(value=plot_data, label="Part Scores on Sample Set"),
            )
        except Exception:
            import traceback

            logging.error(f"Error evaluating bonus: {traceback.format_exc()}")
            return "Error evaluating bonus", None

    def submit_model(
        self, model_name: str, description: str, pipeline_state: PipelineState, profile: gr.OAuthProfile = None
    ):
        """Submit the model output."""
        return submit.submit_model(model_name, description, pipeline_state.workflow, "bonus", profile)

    def _setup_event_listeners(self):
        # Initialize with the default question (ID 1) and refresh on question ID change
        gr.on(
            triggers=[self.app.load, self.qid_selector.change],
            fn=self.get_new_question_html,
            inputs=[self.qid_selector],
            outputs=[self.question_display, self.answer_display, self.clean_answer_display],
        )

        self.run_btn.click(
            self.pipeline_interface.validate_workflow,
            inputs=[self.pipeline_interface.pipeline_state],
            outputs=[self.pipeline_interface.pipeline_state],
        ).success(
            self.run_bonus,
            inputs=[
                self.qid_selector,
                self.pipeline_interface.pipeline_state,
            ],
            outputs=[
                self.question_display,
                self.confidence_plot,
                self.output_state,
                self.results_table,
            ],
        )

        self.eval_btn.click(
            fn=self.evaluate_bonus,
            inputs=[self.pipeline_interface.pipeline_state],
            outputs=[self.results_table, self.confidence_plot],
        )

        self.submit_btn.click(
            fn=self.submit_model,
            inputs=[
                self.model_name_input,
                self.description_input,
                self.pipeline_interface.pipeline_state,
            ],
            outputs=[self.submit_status],
        )

        self.hidden_input.change(
            fn=update_plot,
            inputs=[self.hidden_input, self.output_state],
            outputs=[self.confidence_plot],
        )