import json
import logging
from typing import Any

import gradio as gr
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from datasets import Dataset

from components.model_pipeline.model_pipeline import PipelineInterface, PipelineState
from submission import submit
from workflows.qb.multi_step_agent import MultiStepBonusAgent
from workflows.qb.simple_agent import SimpleBonusAgent
from workflows.structs import ModelStep, Workflow

from .plotting import (
    create_pyplot,
    create_scatter_pyplot,
    evaluate_buzz,
    update_plot,
)
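# Gradio interface and helpers for the bonus round: run a configured workflow on each
# bonus part, score the predictions, and visualize per-part confidence.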
def evaluate_bonus_part(prediction: str, clean_answers: list[str]) -> float:
    """Evaluate a single bonus part."""
    return evaluate_buzz(prediction, clean_answers)


def process_bonus_results(results: list[dict]) -> pd.DataFrame:
    """Process results from bonus mode and prepare visualization data."""
    return pd.DataFrame(
        [
            {
                "Part": f"Part {r['part_number']}",
                "Correct?": "✅" if r["score"] == 1 else "❌",
                "Confidence": r["confidence"],
                "Prediction": r["answer"],
                "Explanation": r["explanation"],
            }
            for r in results
        ]
    )
def initialize_eval_interface(example: dict, model_outputs: list[dict]):
    """Build the question HTML, confidence plot, and serialized state for an example and its model outputs."""
    try:
        # Create HTML for leadin and parts
        leadin_html = f"<div class='leadin'>{example['leadin']}</div>"
        parts_html = []
        for i, part in enumerate(example["parts"]):
            parts_html.append(f"<div class='part'><b>Part {i + 1}:</b> {part['part']}</div>")
        html_content = f"{leadin_html}<div class='parts-container'>{''.join(parts_html)}</div>"

        # Create confidence plot data
        plot_data = create_bonus_confidence_plot(example["parts"], model_outputs)

        # Store state
        state = json.dumps({"parts": example["parts"], "outputs": model_outputs})

        return html_content, plot_data, state
    except Exception as e:
        logging.error(f"Error initializing interface: {e}", exc_info=True)
        return f"<div>Error initializing interface: {str(e)}</div>", pd.DataFrame(), "{}"
def create_bonus_confidence_plot(parts: list[dict], model_outputs: list[dict]):
    """Create confidence plot for bonus parts."""
    plt.style.use("ggplot")
    fig = plt.figure(figsize=(10, 6))
    ax = fig.add_subplot(111)

    # Plot confidence for each part
    x = range(1, len(parts) + 1)
    confidences = [output["confidence"] for output in model_outputs]
    scores = [output["score"] for output in model_outputs]

    # Plot confidence bars
    bars = ax.bar(x, confidences, color="#4698cf")

    # Color bars based on correctness
    for i, score in enumerate(scores):
        bars[i].set_color("green" if score == 1 else "red")

    ax.set_title("Part Confidence")
    ax.set_xlabel("Part Number")
    ax.set_ylabel("Confidence")
    ax.set_xticks(x)
    ax.set_xticklabels([f"Part {i}" for i in x])
    return fig
def validate_workflow(workflow: Workflow):
    """Validate that a workflow is properly configured for the bonus task."""
    if not workflow.steps:
        raise ValueError("Workflow must have at least one step")

    # Ensure all steps are properly configured
    for step_id, step in workflow.steps.items():
        validate_model_step(step)

    # Check that the workflow has the correct structure
    input_vars = set(workflow.inputs)
    if "leadin" not in input_vars or "part" not in input_vars:
        raise ValueError("Workflow must have 'leadin' and 'part' as inputs")

    output_vars = set(workflow.outputs)
    if not all(var in output_vars for var in ["answer", "confidence", "explanation"]):
        raise ValueError("Workflow must produce 'answer', 'confidence', and 'explanation' as outputs")
def validate_model_step(model_step: ModelStep):
    """Validate that a model step is properly configured for the bonus task."""
    # Check required fields
    if not model_step.model or not model_step.provider:
        raise ValueError("Model step must have both model and provider specified")

    if model_step.call_type != "llm":
        raise ValueError("Model step must have call_type 'llm'")

    # Validate temperature for LLM steps
    if model_step.temperature is None:
        raise ValueError("Temperature must be specified for LLM model steps")
    if not (0.0 <= model_step.temperature <= 1.0):
        raise ValueError(f"Temperature must be between 0.0 and 1.0, got {model_step.temperature}")

    # Validate input fields
    input_field_names = {field.name for field in model_step.input_fields}
    if "leadin" not in input_field_names or "part" not in input_field_names:
        raise ValueError("Model step must have 'leadin' and 'part' input fields")

    # Validate output fields
    output_field_names = {field.name for field in model_step.output_fields}
    required_outputs = {"answer", "confidence", "explanation"}
    if not all(out in output_field_names for out in required_outputs):
        raise ValueError("Model step must have all required output fields: answer, confidence, explanation")

    # Validate confidence output field is of type float
    for field in model_step.output_fields:
        if field.name == "confidence" and field.type != "float":
            raise ValueError("The 'confidence' output field must be of type 'float'")
class BonusInterface:
    """Gradio interface for the Bonus mode."""

    def __init__(self, app: gr.Blocks, dataset: Dataset, model_options: dict, defaults: dict):
        """Initialize the Bonus interface."""
        logging.info(f"Initializing Bonus interface with dataset size: {len(dataset)}")
        self.ds = dataset
        self.model_options = model_options
        self.app = app
        self.defaults = defaults
        self.output_state = gr.State(value="{}")
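        # output_state holds the JSON produced by initialize_eval_interface
        # (the current example's parts and model outputs), consumed by update_plot.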
        self.render()

    def _render_model_interface(self, workflow: Workflow, simple: bool = True):
        """Render the model interface."""
        self.pipeline_interface = PipelineInterface(
            workflow,
            simple=simple,
            model_options=list(self.model_options.keys()),
        )
        with gr.Row():
            self.run_btn = gr.Button("Run Bonus", variant="primary")
    def _render_qb_interface(self):
        """Render the quizbowl interface."""
        with gr.Row():
            self.qid_selector = gr.Number(
                label="Question ID", value=1, precision=0, minimum=1, maximum=len(self.ds), show_label=True, scale=0
            )
            self.answer_display = gr.Textbox(
                label="Answers", elem_id="answer-display", elem_classes="answer-box", interactive=False, scale=1
            )
            self.clean_answer_display = gr.Textbox(
                label="Acceptable Answers",
                elem_id="answer-display-2",
                elem_classes="answer-box",
                interactive=False,
                scale=2,
            )

        self.question_display = gr.HTML(label="Question", elem_id="question-display")

        with gr.Row():
            self.confidence_plot = gr.Plot(
                label="Part Confidence",
                format="webp",
            )
            self.results_table = gr.DataFrame(
                label="Model Outputs",
                value=pd.DataFrame(columns=["Part", "Correct?", "Confidence", "Prediction", "Explanation"]),
            )

        with gr.Row():
            self.eval_btn = gr.Button("Evaluate")

        with gr.Accordion("Model Submission", elem_classes="model-submission-accordion", open=True):
            with gr.Row():
                self.model_name_input = gr.Textbox(label="Model Name")
                self.description_input = gr.Textbox(label="Description")
            with gr.Row():
                gr.LoginButton()
                self.submit_btn = gr.Button("Submit")
            self.submit_status = gr.HTML(label="Submission Status")
    def render(self):
        """Create the Gradio interface."""
        self.hidden_input = gr.Textbox(value="", visible=False, elem_id="hidden-index")
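        # Hidden field; when its value changes, the confidence plot is re-rendered
        # via update_plot (wired up in _setup_event_listeners).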
        workflow = self.defaults["init_workflow"]

        with gr.Row():
            # Model Panel
            with gr.Column(scale=1):
                self._render_model_interface(workflow, simple=self.defaults["simple_workflow"])
            with gr.Column(scale=1):
                self._render_qb_interface()

        self._setup_event_listeners()
    def get_new_question_html(self, question_id: int):
        """Get the HTML for a new question."""
        example = self.ds[question_id - 1]
        leadin = example["leadin"]
        parts = example["parts"]

        # Create HTML for leadin and parts
        leadin_html = f"<div class='leadin'>{leadin}</div>"
        parts_html = []
        for i, part in enumerate(parts):
            parts_html.append(f"<div class='part'>{part['part']}</div>")
        parts_html_str = "<br>".join(parts_html)
        html_content = (
            f"<div class='token-container'>{leadin_html}<div class='parts-container'><br>{parts_html_str}</div></div>"
        )

        # Format answers
        primary_answers = [f"{i + 1}. {part['answer_primary']}" for i, part in enumerate(parts)]
        clean_answers = []
        for i, part in enumerate(parts):
            part_answers = [a for a in part["clean_answers"] if len(a.split()) <= 6]
            clean_answers.append(f"{i + 1}. {', '.join(part_answers)}")

        return html_content, "\n".join(primary_answers), "\n".join(clean_answers)
    def get_model_outputs(self, example: dict, pipeline_state: PipelineState):
        """Run the workflow agent on each part of an example and score the outputs."""
        outputs = []
        leadin = example["leadin"]
        workflow = pipeline_state.workflow
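        # Workflows with more than one step are handled by the multi-step agent;
        # single-step workflows use the simple agent.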
        if len(workflow.steps) > 1:
            agent = MultiStepBonusAgent(workflow)
        else:
            agent = SimpleBonusAgent(workflow)

        for i, part in enumerate(example["parts"]):
            # Run model for each part
            part_output = agent.run(leadin, part["part"])

            # Add part number and evaluate score
            part_output["part_number"] = i + 1
            part_output["score"] = evaluate_bonus_part(part_output["answer"], part["clean_answers"])
            outputs.append(part_output)

        return outputs
    def run_bonus(
        self,
        question_id: int,
        pipeline_state: PipelineState,
    ) -> tuple[str, Any, Any, Any]:
        """Run the agent in bonus mode."""
        # The four return values map to question_display, confidence_plot, output_state,
        # and results_table, in the order wired up in _setup_event_listeners.
        try:
            # Validate inputs; convert the 1-based selector value to a 0-based dataset index
            question_id = int(question_id - 1)
            if not self.ds or question_id < 0 or question_id >= len(self.ds):
                return "Invalid question ID or dataset not loaded", None, None, None

            example = self.ds[question_id]
            outputs = self.get_model_outputs(example, pipeline_state)

            # Process results and prepare visualization data
            html_content, plot_data, output_state = initialize_eval_interface(example, outputs)
            df = process_bonus_results(outputs)

            return (
                html_content,
                gr.update(value=plot_data, label=f"Part Confidence on Question {question_id + 1}"),
                gr.update(value=output_state),
                gr.update(value=df, label=f"Model Outputs for Question {question_id + 1}"),
            )
        except Exception as e:
            import traceback

            error_msg = f"Error: {str(e)}\n{traceback.format_exc()}"
            return error_msg, None, None, None
    def evaluate_bonus(self, pipeline_state: PipelineState, progress: gr.Progress = gr.Progress()):
        """Evaluate the bonus questions."""
        try:
            # Validate inputs
            if not self.ds or not self.ds.num_rows:
                logging.warning("No dataset loaded for bonus evaluation")
                return gr.update(), gr.update()

            total_correct = 0
            total_parts = 0
            part_scores = []
            part_numbers = []

            for example in progress.tqdm(self.ds, desc="Evaluating bonus questions"):
                model_outputs = self.get_model_outputs(example, pipeline_state)

                for output in model_outputs:
                    total_parts += 1
                    if output["score"] == 1:
                        total_correct += 1
                    part_scores.append(output["score"])
                    part_numbers.append(output["part_number"])

            accuracy = total_correct / total_parts
            df = pd.DataFrame(
                [
                    {
                        "Part Accuracy": f"{accuracy:.2%}",
                        "Total Score": f"{total_correct}/{total_parts}",
                        "Questions Evaluated": len(self.ds),
                    }
                ]
            )

            plot_data = create_scatter_pyplot(part_numbers, part_scores)
            return (
                gr.update(value=df, label="Scores on Sample Set"),
                gr.update(value=plot_data, label="Part Scores on Sample Set"),
            )
        except Exception:
            import traceback

            logging.error(f"Error evaluating bonus: {traceback.format_exc()}")
            # Leave the results table and plot unchanged on failure (only two outputs are wired up).
            return gr.update(), gr.update()
    def submit_model(
        self, model_name: str, description: str, pipeline_state: PipelineState, profile: gr.OAuthProfile = None
    ):
        """Submit the model output."""
        return submit.submit_model(model_name, description, pipeline_state.workflow, "bonus", profile)
    def _setup_event_listeners(self):
        # Load the default question (ID 1) on app load and whenever the selector changes
        gr.on(
            triggers=[self.app.load, self.qid_selector.change],
            fn=self.get_new_question_html,
            inputs=[self.qid_selector],
            outputs=[self.question_display, self.answer_display, self.clean_answer_display],
        )
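        # Validate the workflow first; the bonus run only fires if validation succeeds.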
        self.run_btn.click(
            self.pipeline_interface.validate_workflow,
            inputs=[self.pipeline_interface.pipeline_state],
            outputs=[self.pipeline_interface.pipeline_state],
        ).success(
            self.run_bonus,
            inputs=[
                self.qid_selector,
                self.pipeline_interface.pipeline_state,
            ],
            outputs=[
                self.question_display,
                self.confidence_plot,
                self.output_state,
                self.results_table,
            ],
        )

        self.eval_btn.click(
            fn=self.evaluate_bonus,
            inputs=[self.pipeline_interface.pipeline_state],
            outputs=[self.results_table, self.confidence_plot],
        )

        self.submit_btn.click(
            fn=self.submit_model,
            inputs=[
                self.model_name_input,
                self.description_input,
                self.pipeline_interface.pipeline_state,
            ],
            outputs=[self.submit_status],
        )

        self.hidden_input.change(
            fn=update_plot,
            inputs=[self.hidden_input, self.output_state],
            outputs=[self.confidence_plot],
        )