Spaces · Running

Maharshi Gor committed · f10a835 · 1 Parent(s): 9b07040
Major update:

* Moved login to the top.
* Browser state for pipeline states, to retain user changes to the pipeline from before login.
* Plots and metrics for tossup single runs and eval.
* Refactored the model_submission pane.
* TypedDicts for pipeline interface defaults.
- app.py +65 -10
- src/components/commons.py +22 -0
- src/components/quizbowl/bonus.py +84 -45
- src/components/quizbowl/plotting.py +335 -64
- src/components/quizbowl/tossup.py +141 -91
- src/components/quizbowl/utils.py +0 -1
- src/components/typed_dicts.py +16 -0
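The 16 new lines in src/components/typed_dicts.py back the `defaults` objects constructed in app.py below. A minimal sketch of what these TypedDicts plausibly look like, inferred from the call sites (only `init_workflow` is confirmed by the diff; the remaining keys are splatted in from `DEFAULT_SELECTIONS` and are assumptions here):

    from typing import TypedDict

    from workflows.structs import TossupWorkflow, Workflow


    class PipelineInterfaceDefaults(TypedDict, total=False):
        # Confirmed by the BonusInterface call site in app.py; the real
        # definition presumably also declares the DEFAULT_SELECTIONS keys.
        init_workflow: Workflow


    class TossupInterfaceDefaults(TypedDict, total=False):
        # Tossup workflows are a distinct type (see the TossupWorkflow
        # import in tossup.py), hence the separate defaults type.
        init_workflow: TossupWorkflow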
app.py
CHANGED
@@ -1,3 +1,5 @@
+import json
+
 import datasets
 import gradio as gr
 from apscheduler.schedulers.background import BackgroundScheduler
@@ -9,6 +11,7 @@ from about import LEADERBOARD_INTRODUCTION_TEXT, LEADERBOARD_TITLE
 from app_configs import DEFAULT_SELECTIONS, THEME
 from components.quizbowl.bonus import BonusInterface
 from components.quizbowl.tossup import TossupInterface
+from components.typed_dicts import PipelineInterfaceDefaults, TossupInterfaceDefaults
 from display.css_html_js import fonts_header, js_head, leaderboard_css
 from display.custom_css import css_bonus, css_pipeline, css_tossup
 from display.guide import BUILDING_MARKDOWN, GUIDE_MARKDOWN, QUICKSTART_MARKDOWN
@@ -76,6 +79,26 @@ def get_default_tab_id(request: gr.Request):
     return gr.update(selected=tab_key_value)


+def presave_pipeline_state(
+    login_btn,
+    browser_state: dict,
+    tossup_pipeline_state: dict,
+    tossup_output_state: dict,
+    bonus_pipeline_state: dict,
+    bonus_output_state: dict,
+):
+    browser_state.setdefault("tossup", {})
+    browser_state["tossup"]["pipeline_state"] = tossup_pipeline_state
+    browser_state["tossup"]["output_state"] = tossup_output_state
+    browser_state.setdefault("bonus", {})
+    browser_state["bonus"]["pipeline_state"] = bonus_pipeline_state
+    browser_state["bonus"]["output_state"] = bonus_output_state
+    logger.debug(
+        f"Pipeline state before login. Login button: {login_btn}, browser state: {json.dumps(browser_state, indent=4)}"
+    )
+    return login_btn, browser_state
+
+
 if __name__ == "__main__":
     scheduler = BackgroundScheduler()
     scheduler.add_job(restart_space, "interval", seconds=SERVER_REFRESH_INTERVAL)
@@ -91,19 +114,36 @@ if __name__ == "__main__":
         theme=THEME,
         title="Quizbowl Bot",
     ) as demo:
+        browser_state = gr.BrowserState(
+            {
+                "tossup": {"pipeline_state": None, "output_state": None},
+                "bonus": {"pipeline_state": None, "output_state": None},
+            }
+        )
         with gr.Row():
-            gr.…
+            with gr.Column(scale=5):
+                gr.Markdown(
+                    "## Welcome to Quizbowl Arena! \n### Create, play around, and submit your quizbowl agents.",
+                    elem_classes="welcome-text",
+                )
+            login_btn = gr.LoginButton(scale=1)
+        gr.Markdown(
+            "**First time here?** Check out the [❓ Help](#help) tab for a quick introduction and the "
+            "[walkthrough documentation](https://github.com/stanford-crfm/quizbowl-arena/blob/main/docs/walkthrough.md) "
+            "for detailed examples and tutorials on how to create and compete with your own QuizBowl agents.",
+            elem_classes="help-text",
+        )
         with gr.Tabs() as gtab:
             with gr.Tab("🛎️ Tossup Agents", id="tossup"):
-                defaults = …
-                    "…
-                …
-                tossup_interface = TossupInterface(demo, tossup_ds, AVAILABLE_MODELS, defaults)
+                defaults = TossupInterfaceDefaults(
+                    **DEFAULT_SELECTIONS["tossup"], init_workflow=factory.create_simple_qb_tossup_workflow()
+                )
+                tossup_interface = TossupInterface(demo, browser_state, tossup_ds, AVAILABLE_MODELS, defaults)
             with gr.Tab("🙋🏻‍♂️ Bonus Round Agents", id="bonus"):
-                defaults = …
-                    "…
-                …
-                bonus_interface = BonusInterface(demo, bonus_ds, AVAILABLE_MODELS, defaults)
+                defaults = PipelineInterfaceDefaults(
+                    **DEFAULT_SELECTIONS["bonus"], init_workflow=factory.create_simple_qb_bonus_workflow()
+                )
+                bonus_interface = BonusInterface(demo, browser_state, bonus_ds, AVAILABLE_MODELS, defaults)
             with gr.Tab("🏅 Leaderboard", elem_id="llm-benchmark-tab-table", id="leaderboard"):
                 leaderboard_timer = gr.Timer(LEADERBOARD_REFRESH_INTERVAL)
                 gr.Markdown("<a id='leaderboard' href='#leaderboard'>QANTA Leaderboard</a>")
@@ -126,4 +166,19 @@ if __name__ == "__main__":
             with gr.Column():
                 gr.Markdown(BUILDING_MARKDOWN)

-    …
+        # Event Listeners
+
+        login_btn.click(
+            fn=presave_pipeline_state,
+            inputs=[
+                login_btn,
+                browser_state,
+                tossup_interface.pipeline_state,
+                tossup_interface.output_state,
+                bonus_interface.pipeline_state,
+                bonus_interface.output_state,
+            ],
+            outputs=[login_btn, browser_state],
+        )
+
+    demo.queue(default_concurrency_limit=40).launch()
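What makes the pre-login save/restore work is that `gr.BrowserState`, unlike `gr.State`, keeps its value in the browser's local storage, so it survives the full-page redirect of an OAuth login. A minimal self-contained sketch of the same pattern, outside this repo (requires a Gradio version that ships BrowserState):

    import gradio as gr

    with gr.Blocks() as demo:
        # Persisted in localStorage, so it survives reloads and OAuth redirects.
        saved = gr.BrowserState({"draft": ""})
        box = gr.Textbox(label="Draft")

        def save(text: str, state: dict) -> dict:
            # Analogous to presave_pipeline_state: stash UI state before navigating away.
            state["draft"] = text
            return state

        def restore(state: dict) -> str:
            # Analogous to load_presaved_pipeline_state: rehydrate the UI on app load.
            return state.get("draft", "")

        box.change(save, inputs=[box, saved], outputs=[saved])
        demo.load(restore, inputs=[saved], outputs=[box])

    demo.launch()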
src/components/commons.py
CHANGED
@@ -33,3 +33,25 @@ def get_panel_header(header: str, subheader: str | None = None):
     with gr.Row(elem_classes="md panel-header-container") as row:
         gr.HTML(html)
     return row
+
+
+def get_model_submission_accordion(app: gr.Blocks):
+    with gr.Accordion(
+        "Feel happy with your agent? Make a submission!", elem_classes="model-submission-accordion", open=True
+    ):
+        with gr.Row():
+            model_name_input = gr.Textbox(label="Submission Name")
+            description_input = gr.Textbox(label="Submission Description")
+        with gr.Row():
+            # login_btn = gr.LoginButton()
+            submit_btn = gr.Button("Submit", variant="primary", interactive=False)
+
+        submit_status = gr.HTML(label="Submission Status")
+
+        def check_user_login(profile: gr.OAuthProfile | None):
+            if profile is not None:
+                return gr.update(interactive=True, value="Submit Agent")
+            return gr.update(interactive=False, value="Login to submit your agent")
+
+        gr.on(triggers=app.load, fn=check_user_login, inputs=[], outputs=[submit_btn])
+    return model_name_input, description_input, submit_btn, submit_status
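A sketch of how this helper is wired into a Blocks app. The `submit_fn` handler here is hypothetical (the real interfaces call `submit.submit_model`); the point is that the button stays disabled until `check_user_login` observes an OAuth profile on page load:

    import gradio as gr

    from components import commons


    def submit_fn(name: str, description: str) -> str:
        # Hypothetical stand-in for the real submission handler.
        return f"<p>Submitted {name}: {description}</p>"


    with gr.Blocks() as app:
        name_in, desc_in, submit_btn, status = commons.get_model_submission_accordion(app)
        submit_btn.click(fn=submit_fn, inputs=[name_in, desc_in], outputs=[status])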
src/components/quizbowl/bonus.py
CHANGED
@@ -8,12 +8,12 @@ from loguru import logger

 from app_configs import CONFIGS, UNSELECTED_PIPELINE_NAME
 from components import commons
-from components.model_pipeline.model_pipeline import PipelineInterface, PipelineState
+from components.model_pipeline.model_pipeline import PipelineInterface, PipelineState
 from components.typed_dicts import PipelineStateDict
 from display.formatting import styled_error
 from submission import submit
+from workflows import factory
 from workflows.qb_agents import QuizBowlBonusAgent
-from workflows.structs import ModelStep, Workflow

 from . import populate, validation
 from .plotting import create_bonus_confidence_plot, create_bonus_html
@@ -56,9 +56,10 @@ def initialize_eval_interface(example: dict, model_outputs: list[dict]):
 class BonusInterface:
     """Gradio interface for the Bonus mode."""

-    def __init__(self, app: gr.Blocks, dataset: Dataset, model_options: dict, defaults: dict):
+    def __init__(self, app: gr.Blocks, browser_state: dict, dataset: Dataset, model_options: dict, defaults: dict):
         """Initialize the Bonus interface."""
         logger.info(f"Initializing Bonus interface with dataset size: {len(dataset)}")
+        self.browser_state = browser_state
         self.ds = dataset
         self.model_options = model_options
         self.app = app
@@ -66,7 +67,24 @@ class BonusInterface:
         self.output_state = gr.State(value="{}")
         self.render()

-    def …
+    # ------------------------------------- LOAD PIPELINE STATE FROM BROWSER STATE -------------------------------------
+
+    def load_presaved_pipeline_state(self, browser_state: dict, pipeline_change: bool):
+        logger.debug(f"Loading presaved pipeline state from browser state:\n{json.dumps(browser_state, indent=4)}")
+        try:
+            state_dict = browser_state["bonus"].get("pipeline_state", {})
+            pipeline_state = PipelineState.model_validate(state_dict)
+            pipeline_state_dict = pipeline_state.model_dump()
+            output_state = browser_state["bonus"].get("output_state", "{}")
+        except Exception as e:
+            logger.warning(f"Error loading presaved pipeline state: {e}")
+            output_state = "{}"
+            workflow = self.defaults["init_workflow"]
+            pipeline_state_dict = PipelineState.from_workflow(workflow).model_dump()
+        return browser_state, not pipeline_change, pipeline_state_dict, output_state
+
+    # ------------------------------------------ INTERFACE RENDER FUNCTIONS -------------------------------------------
+    def _render_pipeline_interface(self, pipeline_state: PipelineState):
         """Render the model interface."""
         with gr.Row(elem_classes="bonus-header-row form-inline"):
             self.pipeline_selector = commons.get_pipeline_selector([])
@@ -74,7 +92,8 @@ class BonusInterface:
         self.import_error_display = gr.HTML(label="Import Error", elem_id="import-error-display", visible=False)
         self.pipeline_interface = PipelineInterface(
             self.app,
-            workflow,
+            pipeline_state.workflow,
+            ui_state=pipeline_state.ui_state,
             model_options=list(self.model_options.keys()),
             config=self.defaults,
         )
@@ -97,24 +116,20 @@ class BonusInterface:
         with gr.Row():
             self.eval_btn = gr.Button("Evaluate", variant="primary")

-        …
-        …
-        …
-        self.description_input = gr.Textbox(label="Description")
-        with gr.Row():
-            gr.LoginButton()
-            self.submit_btn = gr.Button("Submit", variant="primary")
-        self.submit_status = gr.HTML(label="Submission Status")
+        self.model_name_input, self.description_input, self.submit_btn, self.submit_status = (
+            commons.get_model_submission_accordion(self.app)
+        )

     def render(self):
         """Create the Gradio interface."""
         self.hidden_input = gr.Textbox(value="", visible=False, elem_id="hidden-index")
-        workflow = …
+        workflow = factory.create_empty_tossup_workflow()
+        pipeline_state = PipelineState.from_workflow(workflow)

         with gr.Row():
             # Model Panel
             with gr.Column(scale=1):
-                self._render_pipeline_interface(…
+                self._render_pipeline_interface(pipeline_state)

             with gr.Column(scale=1):
                 self._render_qb_interface()
@@ -150,7 +165,27 @@ class BonusInterface:
         except Exception as e:
             return f"Error loading question: {str(e)}"

-    def …
+    def get_pipeline_names(self, profile: gr.OAuthProfile | None) -> list[str]:
+        names = [UNSELECTED_PIPELINE_NAME] + populate.get_pipeline_names("bonus", profile)
+        return gr.update(choices=names, value=UNSELECTED_PIPELINE_NAME)
+
+    def load_pipeline(
+        self, model_name: str, pipeline_change: bool, profile: gr.OAuthProfile | None
+    ) -> tuple[str, bool, PipelineStateDict, dict]:
+        try:
+            workflow = populate.load_workflow("bonus", model_name, profile)
+            if workflow is None:
+                logger.warning(f"Could not load workflow for {model_name}")
+                return UNSELECTED_PIPELINE_NAME, gr.skip(), gr.skip(), gr.update(visible=False)
+            pipeline_state_dict = PipelineState.from_workflow(workflow).model_dump()
+            return UNSELECTED_PIPELINE_NAME, not pipeline_change, pipeline_state_dict, gr.update(visible=True)
+        except Exception as e:
+            error_msg = styled_error(f"Error loading pipeline: {str(e)}")
+            return UNSELECTED_PIPELINE_NAME, gr.skip(), gr.skip(), gr.update(visible=True, value=error_msg)
+
+    # ------------------------------------- Agent Functions -----------------------------------------------------------
+
+    def get_agent_outputs(self, example: dict, pipeline_state: PipelineState):
         """Get the model outputs for a given question ID."""
         outputs = []
         leadin = example["leadin"]
@@ -168,30 +203,21 @@ class BonusInterface:

         return outputs

-    def get_pipeline_names(self, profile: gr.OAuthProfile | None) -> list[str]:
-        names = [UNSELECTED_PIPELINE_NAME] + populate.get_pipeline_names("bonus", profile)
-        return gr.update(choices=names, value=UNSELECTED_PIPELINE_NAME)
-
-    def load_pipeline(
-        self, model_name: str, pipeline_change: bool, profile: gr.OAuthProfile | None
-    ) -> tuple[str, PipelineStateDict, bool, dict]:
-        try:
-            workflow = populate.load_workflow("bonus", model_name, profile)
-            if workflow is None:
-                logger.warning(f"Could not load workflow for {model_name}")
-                return UNSELECTED_PIPELINE_NAME, gr.skip(), gr.skip(), gr.update(visible=False)
-            pipeline_state_dict = PipelineState.from_workflow(workflow).model_dump()
-            return UNSELECTED_PIPELINE_NAME, pipeline_state_dict, not pipeline_change, gr.update(visible=True)
-        except Exception as e:
-            error_msg = styled_error(f"Error loading pipeline: {str(e)}")
-            return UNSELECTED_PIPELINE_NAME, gr.skip(), gr.skip(), gr.update(visible=True, value=error_msg)
-
     def single_run(
         self,
         question_id: int,
         state_dict: PipelineStateDict,
     ) -> tuple[str, Any, Any]:
-        """Run the agent in bonus mode.
-        …
+        """Run the agent in bonus mode and updates the interface.
+
+        Returns:
+            tuple: Contains the following components:
+                - question_display: HTML display content of the question
+                - output_state: Updated state with question parts and outputs
+                - results_table: DataFrame with model predictions and scores
+                - model_outputs_display: Detailed step outputs from the model
+                - error_display: Any error messages (if applicable)
+        """
         try:
             pipeline_state = validation.validate_bonus_workflow(state_dict)
             question_id = int(question_id - 1)
@@ -199,7 +225,7 @@ class BonusInterface:
                 raise gr.Error("Invalid question ID or dataset not loaded")

             example = self.ds[question_id]
-            outputs = self.…
+            outputs = self.get_agent_outputs(example, pipeline_state)

             # Process results and prepare visualization data
             html_content, plot_data, output_state = initialize_eval_interface(example, outputs)
@@ -239,7 +265,7 @@ class BonusInterface:
             part_numbers = []

             for example in progress.tqdm(self.ds, desc="Evaluating bonus questions"):
-                model_outputs = self.…
+                model_outputs = self.get_agent_outputs(example, pipeline_state)

                 for output in model_outputs:
                     total_parts += 1
@@ -263,11 +289,12 @@ class BonusInterface:
             return (
                 gr.update(value=df, label="Scores on Sample Set"),
                 gr.update(visible=False),
+                gr.update(visible=False),
             )
         except Exception as e:
             error_msg = styled_error(f"Error evaluating bonus: {e.args}")
             logger.exception(f"Error evaluating bonus: {e.args}")
-            return gr.skip(), gr.update(visible=True, value=error_msg)
+            return gr.skip(), gr.skip(), gr.update(visible=True, value=error_msg)
@@ -280,6 +307,12 @@ class BonusInterface:
         pipeline_state = PipelineState(**state_dict)
         return submit.submit_model(model_name, description, pipeline_state.workflow, "bonus", profile)

+    @property
+    def pipeline_state(self):
+        return self.pipeline_interface.pipeline_state
+
+    # ------------------------------------- Event Listeners -----------------------------------------------------------
+
     def _setup_event_listeners(self):
         # Initialize with the default question (ID 0)

@@ -296,20 +329,26 @@ class BonusInterface:
             outputs=[self.pipeline_selector],
         )

-        pipeline_state = self.pipeline_interface.pipeline_state
         pipeline_change = self.pipeline_interface.pipeline_change
+
+        gr.on(
+            triggers=[self.app.load],
+            fn=self.load_presaved_pipeline_state,
+            inputs=[self.browser_state, pipeline_change],
+            outputs=[self.browser_state, pipeline_change, self.pipeline_state, self.output_state],
+        )
         self.load_btn.click(
             fn=self.load_pipeline,
             inputs=[self.pipeline_selector, pipeline_change],
-            outputs=[self.pipeline_selector, …
+            outputs=[self.pipeline_selector, pipeline_change, self.pipeline_state, self.import_error_display],
         )
-        self.pipeline_interface.add_triggers_for_pipeline_export([pipeline_state.change], pipeline_state)
+        self.pipeline_interface.add_triggers_for_pipeline_export([self.pipeline_state.change], self.pipeline_state)

         self.run_btn.click(
             self.single_run,
             inputs=[
                 self.qid_selector,
-                self.…
+                self.pipeline_state,
             ],
             outputs=[
                 self.question_display,
@@ -322,8 +361,8 @@ class BonusInterface:

         self.eval_btn.click(
             fn=self.evaluate,
-            inputs=[self.…
-            outputs=[self.results_table, self.error_display],
+            inputs=[self.pipeline_state],
+            outputs=[self.results_table, self.model_outputs_display, self.error_display],
         )

         self.submit_btn.click(
@@ -331,7 +370,7 @@ class BonusInterface:
             inputs=[
                 self.model_name_input,
                 self.description_input,
-                self.…
+                self.pipeline_state,
             ],
             outputs=[self.submit_status],
         )
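Taken together with `presave_pipeline_state` in app.py, the new `load_presaved_pipeline_state` is one half of a pydantic round trip through the browser's local storage. A condensed sketch of the mechanism (assuming `PipelineState` is a pydantic model, which its `model_validate`/`model_dump` calls imply):

    # On login click (app.py): pydantic state -> plain dict -> gr.BrowserState.
    browser_state["bonus"]["pipeline_state"] = pipeline_state.model_dump()

    # On app load (load_presaved_pipeline_state): stored dict -> validated model.
    # model_validate raises on a stale or malformed dict, and the except branch
    # then falls back to a fresh state built from defaults["init_workflow"].
    restored = PipelineState.model_validate(browser_state["bonus"]["pipeline_state"])

Returning `not pipeline_change` toggles a flag that downstream listeners watch, which is presumably what forces the pipeline UI to re-render with the restored state.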
src/components/quizbowl/plotting.py
CHANGED
@@ -1,9 +1,11 @@
+# %%
 import json
 import logging
 import re
 from collections import Counter

 import matplotlib.pyplot as plt
+import numpy as np
 import pandas as pd

@@ -25,7 +27,7 @@ def _get_token_classes(confidence, buzz, score) -> str:
     if confidence is None:
         return "token"
     elif not buzz:
-        return "token guess-point…
+        return f"token guess-point buzz-{score}"
     else:
         return f"token guess-point buzz-{score}"

@@ -44,12 +46,19 @@ def _create_token_tooltip_html(values) -> str:

     color = "#a3c9a3" if score else "#ebbec4"  # Light green for correct, light pink for incorrect

+    if values.get("logprob", None) is not None:
+        prob = np.exp(values["logprob"])
+        prob_str = f"<p style='margin: 0 0 4px; color: #000;'> 📈 <b style='color: #000;'>Output Probability:</b> {prob:.3f}</p>"
+    else:
+        prob_str = ""
+
     return f"""
     <div class="tooltip card" style="background-color: {color}; border-radius: 8px; padding: 12px; box-shadow: 2px 4px 8px rgba(0, 0, 0, 0.15);">
         <div class="tooltip-content" style="font-family: 'Arial', sans-serif; color: #000;">
             <h4 style="margin: 0 0 8px; color: #000;">💡 Answer</h4>
-            <p style="font-weight: bold; margin: 0 0 8px; color: #000;">{answer}</p>
-            <p style="margin: 0 0 4px; color: #000;"…
+            <p><code style="font-weight: bold; margin: 0 0 8px; color: #000;">{answer}</code></p>
+            <p style="margin: 0 0 4px; color: #000;">📈 <b style="color: #000;">Confidence:</b> {confidence:.2f}</p>
+            {prob_str}
             <p style="margin: 0; color: #000;">🔍 <b style="color: #000;">Status:</b> {"✅ Correct" if score else "❌ Incorrect" if buzz else "🚫 No Buzz"}</p>
         </div>
     </div>
@@ -145,86 +154,48 @@ def create_bonus_html(leadin: str, parts: list[dict]) -> str:
     return html_content


-def create_line_plot(eval_points: list[tuple[int, dict]], highlighted_index: int = -1) -> pd.DataFrame:
-    """Create a Gradio LinePlot of token values with optional highlighting using DataFrame."""
-    try:
-        # Create base confidence data
-        data = []
-
-        # Add buzz points to the plot
-        for i, (v, b) in eval_points:
-            color = "#ff4444" if b == 0 else "#228b22"
-            data.append(
-                {
-                    "position": i,
-                    "value": v,
-                    "type": "buzz",
-                    "highlight": True,
-                    "color": color,
-                }
-            )
-
-        if highlighted_index >= 0:
-            # Add vertical line for the highlighted token
-            data.extend(
-                [
-                    {
-                        "position": highlighted_index,
-                        "value": 0,
-                        "type": "hover-line",
-                        "color": "#000000",
-                        "highlight": True,
-                    },
-                    {
-                        "position": highlighted_index,
-                        "value": 1,
-                        "type": "hover-line",
-                        "color": "#000000",
-                        "highlight": True,
-                    },
-                ]
-            )
-
-        return pd.DataFrame(data)
-    except Exception as e:
-        logging.error(f"Error creating line plot: {e}", exc_info=True)
-        # Return an empty DataFrame with the expected columns
-        return pd.DataFrame(columns=["position", "value", "type", "highlight", "color"])
-
-
 def create_tossup_confidence_pyplot(
-    tokens: list[str],
-    …
+    tokens: list[str],
+    eval_points: list[tuple[int, dict]],
+    confidence_threshold: float = 0.5,
+    prob_threshold: float | None = None,
 ) -> plt.Figure:
     """Create a pyplot of token values with optional highlighting."""
     plt.style.use("ggplot")  # Set theme to grid paper
-    fig = plt.figure(figsize=(…
+    fig = plt.figure(figsize=(10, 4), dpi=300)  # Set figure size to 11x5
     ax = fig.add_subplot(111)
-    x = [0]…
-
-    for …
-        y.append(v["confidence"])
+    x = [0] + [int(i + 1) for i, _ in eval_points]
+    y_conf = [0] + [v["confidence"] for _, v in eval_points]
+    logprob_values = [v["logprob"] for _, v in eval_points if v["logprob"] is not None]
+    y_prob = [0] + [np.exp(v) for v in logprob_values]

-    ax.plot(x, …
+    ax.plot(x, y_prob, "o-", color="#f2b150", label="Probability")
+    ax.plot(x, y_conf, "o-", color="#4996de", label="Confidence")
     for i, v in eval_points:
         if not v["buzz"]:
             continue
-        confidence = v["confidence"]
         color = "green" if v["score"] else "red"
+        conf = v["confidence"]
+        ax.plot(i + 1, conf, "o", color=color, markerfacecolor="none", markersize=12, markeredgewidth=2.5)
+        if v["logprob"] is not None:
+            prob = np.exp(v["logprob"])
+            ax.plot(i + 1, prob, "o", color=color, markerfacecolor="none", markersize=12, markeredgewidth=2.5)
         if i >= len(tokens):
             print(f"Token index {i} is out of bounds for n_tokens: {len(tokens)}")
-        ax.annotate(f"{tokens[i]}", (i + 1, …
+        ax.annotate(f"{tokens[i]}", (i + 1, conf), textcoords="offset points", xytext=(0, 10), ha="center")

+    # Add horizontal dashed line for confidence threshold
+    ax.axhline(y=confidence_threshold, color="#9370DB", linestyle="--", xmin=0, xmax=1, label="Confidence Threshold")
+    # Add horizontal dashed line for probability threshold if provided
+    if prob_threshold is not None:
+        ax.axhline(y=prob_threshold, color="#cf5757", linestyle="--", xmin=0, xmax=1, label="Probability Threshold")

     ax.set_title("Buzz Confidence")
     ax.set_xlabel("Token Index")
     ax.set_ylabel("Confidence")
     ax.set_xticks(x)
     ax.set_xticklabels(x)
+    ax.legend()
     return fig

@@ -300,3 +271,303 @@ def update_tossup_plot(highlighted_index: int, state: str) -> pd.DataFrame:
     except Exception as e:
         logging.error(f"Error updating plot: {e}")
         return pd.DataFrame()
+
+
+# %%
+
+
+def create_df_entry(run_indices: list[int], run_outputs: list[dict]) -> dict:
+    """Create a dataframe entry from a list of model outputs."""
+    chosen_idx = None
+    earliest_ok_idx = None
+    is_correct = None
+    for i, o in enumerate(run_outputs):
+        if chosen_idx is None and o["buzz"]:
+            chosen_idx = run_indices[o["position"] - 1] + 1
+            is_correct = o["score"]
+        if earliest_ok_idx is None and o["score"]:
+            earliest_ok_idx = run_indices[o["position"] - 1] + 1
+    if is_correct is None:
+        is_correct = False
+
+    # if buzz is not the last index, correct scores 10, incorrect scores -5
+    # if buzz is the final index, correct scores 5, incorrect scores 0
+
+    if chosen_idx == -1:
+        tossup_score = 0
+    elif chosen_idx == run_indices[-1] + 1:
+        tossup_score = 5 if is_correct else 0
+    else:
+        tossup_score = 10 if is_correct else -5
+
+    gap = None if (chosen_idx is None or earliest_ok_idx is None) else chosen_idx - earliest_ok_idx
+    if earliest_ok_idx is None:
+        cls = "hopeless"
+    elif chosen_idx is None:
+        cls = "never-buzzed"  # Opportunity missed to score
+    elif chosen_idx == earliest_ok_idx:
+        cls = "best-buzz"  # Perfect timing
+    elif chosen_idx > earliest_ok_idx:
+        cls = "late-buzz"  # Opportunity missed to buzz earlier
+    elif chosen_idx < earliest_ok_idx:
+        cls = "premature"  # Opportunity missed to score
+
+    return {
+        "chosen_idx": chosen_idx,
+        "earliest_ok_idx": earliest_ok_idx,
+        "gap": gap,
+        "cls": cls,
+        "tossup_score": tossup_score,
+        "is_correct": int(is_correct),
+    }
+
+
+def prepare_tossup_results_df(run_indices: list[list[int]], model_outputs: list[list[dict]]) -> pd.DataFrame:
+    """Create a dataframe from a list of model outputs."""
+    records = []
+    for indices, outputs in zip(run_indices, model_outputs):
+        entry = create_df_entry(indices, outputs)
+        records.append(entry)
+    return pd.DataFrame.from_records(records)
+
+
+def create_tossup_eval_table(df: pd.DataFrame) -> pd.DataFrame:
+    """Create a table from a dataframe."""
+    # Prepare a dataframe of aggregated metrics:
+    # - Mean Tossup Score
+    # - Buzz Accuracy
+    # - Mean +ve Gap
+    # - Mean -ve Gap
+    # - Mean Buzz Position
+
+    positions = df["chosen_idx"].dropna()
+    gaps = df["gap"].dropna()
+    pos_gaps = gaps.loc[gaps >= 0]
+    neg_gaps = gaps.loc[gaps < 0]
+
+    mean_tossup_score = df["tossup_score"].sum() / len(df)
+
+    return pd.DataFrame(
+        [
+            {
+                "Tossup Score (10)": f"{mean_tossup_score:5.1f}",
+                "Buzz Accuracy": f"{df['is_correct'].mean():5.1%}",
+                "Buzz Position": f"{np.mean(positions):5.1f}",
+                "+ve Gap": f"{pos_gaps.mean():5.1f}",
+                "-ve Gap": f"{neg_gaps.mean():5.1f}",
+            }
+        ]
+    )
+
+
+def create_tossup_eval_dashboard(run_indices: list[list[int]], df: pd.DataFrame, *, figsize=(15, 8), title_prefix=""):
+    """
+    Visualise buzzing behaviour with three sub-plots:
+
+    1. Ceiling-accuracy vs. prefix length
+    2. Scatter of earliest-correct idx vs. chosen-buzz idx
+    3. Frequency distribution of narrative classes (vertical bars)
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        Output of `build_buzz_dataframe` – must contain
+        columns: earliest_ok_idx, chosen_idx, cls.
+    eval_indices : sequence[int]
+        Token positions at which the model was probed.
+    figsize : tuple, optional
+        Figure size passed to `plt.subplots`.
+    title_prefix : str, optional
+        Prepended to each subplot title (useful when comparing models).
+    """
+    # ------------------------------------------------------------------
+    # 0. Prep (variables reused throughout the function)
+    # ------------------------------------------------------------------
+    # Collect all evaluation indices across questions so we know the
+    # x-axis domain and the padding for NaNs.
+    eval_indices = np.asarray(sorted({idx for indices in run_indices for idx in indices}))
+
+    # Narrative classes and their colours
+    classes = [
+        "best-buzz",
+        "late-buzz",
+        "never-buzzed",
+        "premature",
+        "hopeless",
+    ]
+    colors = ["tab:green", "tab:olive", "tab:orange", "tab:red", "tab:gray"]
+    palette = dict(zip(classes, colors))
+
+    max_idx = eval_indices.max() * 1.25  # padding for NaN replacement / axis limits
+
+    # ------------------------------------------------------------------
+    # 1. Figure / axes layout
+    # ------------------------------------------------------------------
+    # GridSpec layout → 2 rows × 3 cols.
+    # ┌────────────┬────────────┬────────┐
+    # │  Ceiling   │  Scatter   │  Bars  │  (row 0)
+    # ├────────────┴────────────┴────────┤
+    # │ Descriptions (spans all 3 cols)  │  (row 1)
+    # └──────────────────────────────────┘
+    # Having a dedicated row for the narrative-class descriptions avoids
+    # overlapping with sub-plots and makes the whole figure more compact.
+
+    plt.style.use("ggplot")
+    fig = plt.figure(figsize=figsize)
+    gs = fig.add_gridspec(
+        nrows=2,
+        ncols=3,
+        height_ratios=[5, 1],  # extra space for plots vs. descriptions
+        width_ratios=[2.2, 2.2, 1],
+        hspace=0.2,  # reduced vertical spacing between plots
+        wspace=0.2,  # reduced horizontal spacing between plots
+        left=0.05,  # reduced left margin
+        right=0.95,  # reduced right margin
+        top=0.9,  # reduced top margin
+        bottom=0.05,  # reduced bottom margin
+    )
+
+    ax_ceiling = fig.add_subplot(gs[0, 0])  # Ceiling accuracy curve
+    ax_scatter = fig.add_subplot(gs[0, 1])  # Earliest vs. chosen scatter
+    ax_bars = fig.add_subplot(gs[0, 2])  # Outcome distribution bars
+    ax_desc = fig.add_subplot(gs[1, :])  # Textual descriptions
+    ax_desc.axis("off")
+
+    fig.suptitle("Buzzing behaviour", fontsize=16, fontweight="bold")
+
+    # ------------------------------------------------------------------
+    # 2. Ceiling accuracy curve
+    # ------------------------------------------------------------------
+    ceiling = [((df["earliest_ok_idx"].notna()) & (df["earliest_ok_idx"] <= idx)).mean() for idx in eval_indices]
+    ax_ceiling.plot(eval_indices, ceiling, marker="o", color="#4698cf")
+    ax_ceiling.set_xlabel("Token index shown")
+    ax_ceiling.set_ylabel("Proportion of questions correct")
+    ax_ceiling.set_ylim(0, 1.01)
+    ax_ceiling.set_title(f"{title_prefix}Ceiling accuracy vs. prefix")
+
+    # ------------------------------------------------------------------
+    # 3. Earliest-vs-Chosen scatter
+    # ------------------------------------------------------------------
+    for cls in classes:
+        sub = df[df["cls"] == cls]
+        if sub.empty:
+            continue
+        x = sub["earliest_ok_idx"].fillna(max_idx)
+        y = sub["chosen_idx"].fillna(max_idx)
+        ax_scatter.scatter(
+            x,
+            y,
+            label=cls,
+            alpha=0.7,
+            edgecolor="black",
+            linewidth=1,
+            marker="o",
+            s=90,
+            c=palette[cls],
+            facecolor="none",
+        )
+
+    lim = max_idx
+    ax_scatter.plot([0, lim], [0, lim], linestyle=":", linewidth=1)
+    ax_scatter.set_xlim(0, lim)
+    ax_scatter.set_ylim(0, lim)
+    ax_scatter.set_xlabel("Earliest index with correct answer")
+    ax_scatter.set_ylabel("Chosen buzz index")
+    ax_scatter.set_title(f"{title_prefix}Earliest vs. chosen index")
+    ax_scatter.legend(frameon=False, fontsize="small")
+
+    # ------------------------------------------------------------------
+    # 4. Outcome distribution (horizontal bars)
+    # ------------------------------------------------------------------
+    counts = df["cls"].value_counts().reindex(classes).fillna(0)
+    ax_bars.barh(
+        counts.index,
+        counts.values,
+        color=[palette[c] for c in counts.index],
+        alpha=0.7,
+        edgecolor="black",
+        linewidth=1,
+    )
+    ax_bars.set_xlabel("Number of questions")
+    ax_bars.set_title(f"{title_prefix}Outcome distribution")
+
+    # Ensure x-axis shows integer ticks only
+    from matplotlib.ticker import MaxNLocator
+
+    ax_bars.xaxis.set_major_locator(MaxNLocator(integer=True))
+
+    # ------------------------------------------------------------------
+    # 5. Narrative-class descriptions (bottom panel)
+    # ------------------------------------------------------------------
+    descriptions = {
+        "best-buzz": "Perfect timing. Buzzed at the earliest possible correct position",
+        "late-buzz": "Missed opportunity. Buzzed correctly but later than optimal",
+        "never-buzzed": "Missed opportunity. Never buzzed despite knowing the answer",
+        "premature": "Incorrect buzz. Buzzing at a later position could have been correct",
+        "hopeless": "Never knew the answer. No correct answer at any position",
+    }
+
+    y_pos = 1.0  # start at top of the description axis
+
+    for cls, color in zip(classes, colors):
+        ax_desc.text(
+            0.01,
+            y_pos,
+            f"■ {cls}: {descriptions[cls]}",
+            ha="left",
+            va="top",
+            color=color,
+            fontweight="bold",
+            fontsize=11,  # increased font size from 9 to 11
+            transform=ax_desc.transAxes,
+        )
+
+        y_pos -= 0.25  # increased vertical step inside the axis for more line height
+
+    # ------------------------------------------------------------------
+    # 6. Return the final figure
+    # ------------------------------------------------------------------
+    return fig
+
+
+# %%
+
+
+# Create dummy data for testing
+def create_dummy_model_outputs(n_entries=10, n_positions=5):
+    """Create dummy model outputs for testing."""
+    np.random.seed(42)
+    dummy_outputs = []
+
+    for _ in range(n_entries):
+        run_indices = sorted(np.random.choice(range(10, 50), n_positions, replace=False))
+        outputs = []
+
+        for i in range(n_positions):
+            # Randomly decide if model will buzz at this position
+            will_buzz = np.random.random() > 0.7
+            # Randomly decide if answer is correct
+            is_correct = np.random.random() > 0.4
+
+            outputs.append(
+                {
+                    "position": i + 1,
+                    "buzz": will_buzz,
+                    "score": 1 if is_correct else 0,
+                    "confidence": np.random.random(),
+                    "logprob": np.log(np.random.random()),
+                    "answer": f"Answer {i + 1}",
+                }
+            )
+
+        dummy_outputs.append({"run_indices": run_indices, "outputs": outputs})
+
+    return dummy_outputs
+
+
+# dummy_data = create_dummy_model_outputs()
+# dummy_df = pd.DataFrame([create_df_entry(entry["run_indices"], entry["outputs"]) for entry in dummy_data])
+# dummy_df
+# plot_buzz_dashboard(dummy_df, dummy_data[0]["run_indices"])
+
+# %%
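To make the scoring and classification rules in `create_df_entry` concrete: probing happens at a handful of token indices, `chosen_idx` is where the agent first buzzed, and `earliest_ok_idx` is where it first could have buzzed correctly. A hand-worked example with hypothetical outputs, following directly from the code above:

    # Probed at token indices 10, 20, 30; the model first answers correctly at
    # position 2 (token 20) but only buzzes at position 3 (token 30, the final run).
    run_indices = [10, 20, 30]
    run_outputs = [
        {"position": 1, "buzz": False, "score": 0},
        {"position": 2, "buzz": False, "score": 1},  # earliest correct: 20 + 1 = 21
        {"position": 3, "buzz": True, "score": 1},   # chosen buzz: 30 + 1 = 31 (final index)
    ]

    entry = create_df_entry(run_indices, run_outputs)
    # chosen_idx=31, earliest_ok_idx=21, gap=10, cls="late-buzz",
    # tossup_score=5 (a correct buzz on the final index scores 5, not 10), is_correct=1

    df = prepare_tossup_results_df([run_indices], [run_outputs])
    create_tossup_eval_table(df)  # one-row summary table

With no premature buzzes in the sample, the "-ve Gap" cell of the summary comes out as NaN, since `neg_gaps` is an empty series.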
src/components/quizbowl/tossup.py
CHANGED
@@ -9,20 +9,20 @@ from loguru import logger
|
|
9 |
|
10 |
from app_configs import CONFIGS, UNSELECTED_PIPELINE_NAME
|
11 |
from components import commons
|
12 |
-
from components.model_pipeline.model_pipeline import PipelineInterface, PipelineState, PipelineUIState
|
13 |
from components.model_pipeline.tossup_pipeline import TossupPipelineInterface, TossupPipelineState
|
14 |
-
from components.typed_dicts import
|
15 |
from display.formatting import styled_error
|
16 |
from submission import submit
|
|
|
17 |
from workflows.qb_agents import QuizBowlTossupAgent, TossupResult
|
18 |
-
from workflows.structs import ModelStep, TossupWorkflow
|
19 |
|
20 |
from . import populate, validation
|
21 |
from .plotting import (
|
22 |
-
create_scatter_pyplot,
|
23 |
create_tossup_confidence_pyplot,
|
|
|
|
|
24 |
create_tossup_html,
|
25 |
-
|
26 |
)
|
27 |
from .utils import evaluate_prediction
|
28 |
|
@@ -53,13 +53,16 @@ def prepare_buzz_evals(
|
|
53 |
logger.warning("No run indices provided, returning empty results")
|
54 |
return [], []
|
55 |
eval_points = []
|
56 |
-
for
|
57 |
-
|
|
|
58 |
|
59 |
return eval_points
|
60 |
|
61 |
|
62 |
-
def initialize_eval_interface(
|
|
|
|
|
63 |
"""Initialize the interface with example text."""
|
64 |
try:
|
65 |
tokens = example["question"].split()
|
@@ -70,9 +73,8 @@ def initialize_eval_interface(example, model_outputs: list[dict]):
|
|
70 |
|
71 |
if not tokens:
|
72 |
return "<div>No tokens found in the provided text.</div>", pd.DataFrame(), "{}"
|
73 |
-
highlighted_index = next((int(i) for i, v in eval_points if v["buzz"] == 1), -1)
|
74 |
html_content = create_tossup_html(tokens, answer, clean_answers, run_indices, eval_points)
|
75 |
-
plot_data = create_tossup_confidence_pyplot(tokens, eval_points,
|
76 |
|
77 |
# Store tokens, values, and buzzes as JSON for later use
|
78 |
state = json.dumps({"tokens": tokens, "values": eval_points})
|
@@ -83,30 +85,36 @@ def initialize_eval_interface(example, model_outputs: list[dict]):
|
|
83 |
return f"<div>Error initializing interface: {str(e)}</div>", pd.DataFrame(), "{}"
|
84 |
|
85 |
|
86 |
-
def process_tossup_results(results: list[dict]
|
87 |
"""Process results from tossup mode and prepare visualization data."""
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
]
|
101 |
-
)
|
102 |
|
103 |
|
104 |
class TossupInterface:
|
105 |
"""Gradio interface for the Tossup mode."""
|
106 |
|
107 |
-
def __init__(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
108 |
"""Initialize the Tossup interface."""
|
109 |
logger.info(f"Initializing Tossup interface with dataset size: {len(dataset)}")
|
|
|
110 |
self.ds = dataset
|
111 |
self.model_options = model_options
|
112 |
self.app = app
|
@@ -114,7 +122,25 @@ class TossupInterface:
|
|
114 |
self.output_state = gr.State(value="{}")
|
115 |
self.render()
|
116 |
|
117 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
118 |
"""Render the model interface."""
|
119 |
with gr.Row(elem_classes="bonus-header-row form-inline"):
|
120 |
self.pipeline_selector = commons.get_pipeline_selector([])
|
@@ -122,7 +148,8 @@ class TossupInterface:
|
|
122 |
self.import_error_display = gr.HTML(label="Import Error", elem_id="import-error-display", visible=False)
|
123 |
self.pipeline_interface = TossupPipelineInterface(
|
124 |
self.app,
|
125 |
-
workflow,
|
|
|
126 |
model_options=list(self.model_options.keys()),
|
127 |
config=self.defaults,
|
128 |
)
|
@@ -154,32 +181,29 @@ class TossupInterface:
|
|
154 |
with gr.Row():
|
155 |
self.eval_btn = gr.Button("Evaluate", variant="primary")
|
156 |
|
157 |
-
|
158 |
-
|
159 |
-
|
160 |
-
self.description_input = gr.Textbox(label="Description")
|
161 |
-
with gr.Row():
|
162 |
-
gr.LoginButton()
|
163 |
-
self.submit_btn = gr.Button("Submit", variant="primary")
|
164 |
-
self.submit_status = gr.HTML(label="Submission Status")
|
165 |
|
166 |
def render(self):
|
167 |
"""Create the Gradio interface."""
|
|
|
|
|
168 |
|
169 |
self.hidden_input = gr.Textbox(value="", visible=False, elem_id="hidden-index")
|
170 |
|
171 |
-
workflow = self.defaults["init_workflow"]
|
172 |
-
|
173 |
with gr.Row():
|
174 |
# Model Panel
|
175 |
with gr.Column(scale=1):
|
176 |
-
self._render_pipeline_interface(
|
177 |
|
178 |
with gr.Column(scale=1):
|
179 |
self._render_qb_interface()
|
180 |
|
181 |
self._setup_event_listeners()
|
182 |
|
|
|
|
|
183 |
def get_new_question_html(self, question_id: int) -> str:
|
184 |
"""Get the HTML for a new question."""
|
185 |
if question_id is None:
|
@@ -194,62 +218,89 @@ class TossupInterface:
|
|
194 |
except Exception as e:
|
195 |
return f"Error loading question: {str(e)}"
|
196 |
|
197 |
-
def get_model_outputs(
|
198 |
-
self, example: dict, pipeline_state: PipelineState, early_stop: bool
|
199 |
-
) -> list[ScoredTossupResult]:
|
200 |
-
"""Get the model outputs for a given question ID."""
|
201 |
-
question_runs = []
|
202 |
-
tokens = example["question"].split()
|
203 |
-
for run_idx in example["run_indices"]:
|
204 |
-
question_runs.append(" ".join(tokens[: run_idx + 1]))
|
205 |
-
agent = QuizBowlTossupAgent(pipeline_state.workflow)
|
206 |
-
outputs = list(agent.run(question_runs, early_stop=early_stop))
|
207 |
-
outputs = add_model_scores(outputs, example["clean_answers"], example["run_indices"])
|
208 |
-
return outputs
|
209 |
-
|
210 |
def get_pipeline_names(self, profile: gr.OAuthProfile | None) -> list[str]:
|
211 |
names = [UNSELECTED_PIPELINE_NAME] + populate.get_pipeline_names("tossup", profile)
|
212 |
return gr.update(choices=names, value=UNSELECTED_PIPELINE_NAME)
|
213 |
|
214 |
def load_pipeline(
|
215 |
self, model_name: str, pipeline_change: bool, profile: gr.OAuthProfile | None
|
216 |
-
) -> tuple[str,
|
217 |
try:
|
218 |
workflow = populate.load_workflow("tossup", model_name, profile)
|
219 |
if workflow is None:
|
220 |
logger.warning(f"Could not load workflow for {model_name}")
|
221 |
return UNSELECTED_PIPELINE_NAME, gr.skip(), gr.skip(), gr.update(visible=False)
|
222 |
pipeline_state_dict = TossupPipelineState.from_workflow(workflow).model_dump()
|
223 |
-
return UNSELECTED_PIPELINE_NAME,
|
224 |
except Exception as e:
|
225 |
logger.exception(e)
|
226 |
error_msg = styled_error(f"Error loading pipeline: {str(e)}")
|
227 |
return UNSELECTED_PIPELINE_NAME, gr.skip(), gr.skip(), gr.update(visible=True, value=error_msg)
|
228 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
229 |
def single_run(
|
230 |
self,
|
231 |
question_id: int,
|
232 |
state_dict: TossupPipelineStateDict,
|
233 |
early_stop: bool = True,
|
234 |
) -> tuple[str, Any, Any]:
|
235 |
-
"""Run the agent in tossup mode with a system prompt.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
236 |
try:
|
237 |
             pipeline_state = validation.validate_tossup_workflow(state_dict)
             # Validate inputs
             question_id = int(question_id - 1)
             if not self.ds or question_id < 0 or question_id >= len(self.ds):
                 raise gr.Error("Invalid question ID or dataset not loaded")
             example = self.ds[question_id]
-            outputs = self.…
 
             # Process results and prepare visualization data
-            …
             df = process_tossup_results(outputs)
-            …
             return (
                 tokens_html,
                 gr.update(value=output_state),
-                gr.update(value=plot_data, label=f"Buzz Confidence on Question {question_id + 1}"),
                 gr.update(value=df, label=f"Model Outputs for Question {question_id + 1}", visible=True),
                 gr.update(value=step_outputs, label=f"Step Outputs for Question {question_id + 1}", visible=True),
                 gr.update(visible=False),
@@ -274,32 +325,17 @@ class TossupInterface:
             if not self.ds or not self.ds.num_rows:
                 return "No dataset loaded", None, None
             pipeline_state = validation.validate_tossup_workflow(state_dict)
-            …
-            correct_buzzes = 0
-            token_positions = []
-            correctness = []
             for example in progress.tqdm(self.ds, desc="Evaluating tossup questions"):
-                …
-                token_positions.append(model_outputs[-1]["token_position"])
-                correctness.append(model_outputs[-1]["score"])
-            buzz_accuracy = correct_buzzes / buzz_counts
-            df = pd.DataFrame(
-                [
-                    {
-                        "Avg Buzz Position": f"{np.mean(token_positions):.2f}",
-                        "Buzz Accuracy": f"{buzz_accuracy:.2%}",
-                        "Total Score": f"{correct_buzzes}/{len(self.ds)}",
-                    }
-                ]
-            )
-            plot_data = create_scatter_pyplot(token_positions, correctness)
             return (
-                gr.update(value=plot_data, label="Buzz Positions on Sample Set"),
-                gr.update(value=…
                 gr.update(visible=False),
             )
         except Exception as e:
@@ -309,7 +345,8 @@ class TossupInterface:
             return (
                 gr.skip(),
                 gr.update(visible=False),
-                gr.update(visible=…
             )
 
     def submit_model(
@@ -327,6 +364,12 @@ class TossupInterface:
             logger.exception(f"Error submitting model: {e.args}")
             return styled_error(f"Error: {str(e)}")
 
     def _setup_event_listeners(self):
         gr.on(
             triggers=[self.app.load, self.qid_selector.change],
@@ -341,20 +384,27 @@ class TossupInterface:
             outputs=[self.pipeline_selector],
         )
 
-        pipeline_state = self.pipeline_interface.pipeline_state
         pipeline_change = self.pipeline_interface.pipeline_change
         self.load_btn.click(
             fn=self.load_pipeline,
             inputs=[self.pipeline_selector, pipeline_change],
-            outputs=[self.pipeline_selector, …
         )
-        self.pipeline_interface.add_triggers_for_pipeline_export([pipeline_state.change], pipeline_state)
 
         self.run_btn.click(
             self.single_run,
             inputs=[
                 self.qid_selector,
-                self.…
                 self.early_stop_checkbox,
             ],
             outputs=[
@@ -369,8 +419,8 @@ class TossupInterface:
 
         self.eval_btn.click(
             fn=self.evaluate,
-            inputs=[self.…
-            outputs=[self.confidence_plot, self.results_table, self.error_display],
         )
 
         self.submit_btn.click(
@@ -378,7 +428,7 @@ class TossupInterface:
             inputs=[
                 self.model_name_input,
                 self.description_input,
-                self.…
             ],
             outputs=[self.submit_status],
         )
@@ … +9 @@
 
 from app_configs import CONFIGS, UNSELECTED_PIPELINE_NAME
 from components import commons
 from components.model_pipeline.tossup_pipeline import TossupPipelineInterface, TossupPipelineState
+from components.typed_dicts import TossupInterfaceDefaults, TossupPipelineStateDict
 from display.formatting import styled_error
 from submission import submit
+from workflows import factory
 from workflows.qb_agents import QuizBowlTossupAgent, TossupResult
 
 from . import populate, validation
 from .plotting import (
     create_tossup_confidence_pyplot,
+    create_tossup_eval_dashboard,
+    create_tossup_eval_table,
     create_tossup_html,
+    prepare_tossup_results_df,
 )
 from .utils import evaluate_prediction
@@ … +53 @@
         logger.warning("No run indices provided, returning empty results")
         return [], []
     eval_points = []
+    for o in model_outputs:
+        token_position = run_indices[o["position"] - 1]
+        eval_points.append((token_position, o))
 
     return eval_points
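The `position` field on each model output is 1-based, so the loop above translates it into an absolute token index via `run_indices`. A self-contained sketch of that mapping with hypothetical data:

# Hypothetical run indices: 0-based token positions where each run ends.
run_indices = [9, 19, 29]
# The agent answered after the 1st and 3rd runs; "position" is 1-based.
model_outputs = [
    {"position": 1, "answer": "Paris"},
    {"position": 3, "answer": "Tokyo"},
]

eval_points = []
for o in model_outputs:
    token_position = run_indices[o["position"] - 1]
    eval_points.append((token_position, o))

print(eval_points)
# [(9, {'position': 1, 'answer': 'Paris'}), (29, {'position': 3, 'answer': 'Tokyo'})]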
@@ … +63 @@
+def initialize_eval_interface(
+    example: dict, model_outputs: list[dict], confidence_threshold: float, prob_threshold: float | None = None
+):
     """Initialize the interface with example text."""
     try:
         tokens = example["question"].split()
 …
 
         if not tokens:
             return "<div>No tokens found in the provided text.</div>", pd.DataFrame(), "{}"
         html_content = create_tossup_html(tokens, answer, clean_answers, run_indices, eval_points)
+        plot_data = create_tossup_confidence_pyplot(tokens, eval_points, confidence_threshold, prob_threshold)
 
         # Store tokens, values, and buzzes as JSON for later use
         state = json.dumps({"tokens": tokens, "values": eval_points})
 …
         return f"<div>Error initializing interface: {str(e)}</div>", pd.DataFrame(), "{}"
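`initialize_eval_interface` now forwards two buzzer thresholds to the confidence plot: a confidence score and, when the model reports log-probabilities, an answer probability. A minimal sketch of how such a dual gate could work; the threshold values are made up, the AND-of-gates semantics is an assumption, and the real decision lives in the workflow's buzzer, not here:

import numpy as np

CONFIDENCE_THRESHOLD = 0.85
PROB_THRESHOLD = 0.6  # None when the model exposes no logprobs

def should_buzz(confidence: float, logprob: float | None) -> bool:
    # Gate 1: the model's self-reported confidence must clear its threshold.
    if confidence < CONFIDENCE_THRESHOLD:
        return False
    # Gate 2: if logprobs are available, the answer probability must too.
    if PROB_THRESHOLD is not None and logprob is not None:
        return float(np.exp(logprob)) >= PROB_THRESHOLD
    return True

print(should_buzz(0.9, np.log(0.7)))  # True
print(should_buzz(0.9, np.log(0.4)))  # False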
@@ … +88 @@
+def process_tossup_results(results: list[dict]) -> pd.DataFrame:
     """Process results from tossup mode and prepare visualization data."""
+    data = []
+    for r in results:
+        entry = {
+            "Token Position": r["token_position"],
+            "Correct?": "✅" if r["score"] == 1 else "❌",
+            "Confidence": r["confidence"],
+        }
+        if r["logprob"] is not None:
+            entry["Probability"] = f"{np.exp(r['logprob']):.3f}"
+        entry["Prediction"] = r["answer"]
+        data.append(entry)
+    return pd.DataFrame(data)
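Fed results shaped like the agent's per-run outputs, the helper above yields one table row per run; the "Probability" column only gets a value for runs that carry a log-probability. A usage sketch with hypothetical values:

results = [
    {"token_position": 12, "score": 0, "confidence": 0.41, "logprob": None, "answer": "Rome"},
    {"token_position": 24, "score": 1, "confidence": 0.88, "logprob": -0.11, "answer": "Paris"},
]
# process_tossup_results(results) would then produce roughly:
#    Token Position Correct?  Confidence Probability Prediction
# 0              12       ❌        0.41         NaN       Rome
# 1              24       ✅        0.88       0.896      Paris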
@@ … +104 @@
 class TossupInterface:
     """Gradio interface for the Tossup mode."""
 
+    def __init__(
+        self,
+        app: gr.Blocks,
+        browser_state: gr.BrowserState,
+        dataset: Dataset,
+        model_options: dict,
+        defaults: TossupInterfaceDefaults,
+    ):
         """Initialize the Tossup interface."""
         logger.info(f"Initializing Tossup interface with dataset size: {len(dataset)}")
+        self.browser_state = browser_state
         self.ds = dataset
         self.model_options = model_options
         self.app = app
 …
         self.output_state = gr.State(value="{}")
         self.render()
 
+    # ------------------------------------- LOAD PIPELINE STATE FROM BROWSER STATE -------------------------------------
+
+    def load_presaved_pipeline_state(self, browser_state: dict, pipeline_change: bool):
+        logger.debug(f"Loading presaved pipeline state from browser state:\n{json.dumps(browser_state, indent=4)}")
+        try:
+            state_dict = browser_state["tossup"].get("pipeline_state", {})
+            pipeline_state = TossupPipelineState.model_validate(state_dict)
+            pipeline_state_dict = pipeline_state.model_dump()
+            output_state = browser_state["tossup"].get("output_state", "{}")
+        except Exception as e:
+            logger.warning(f"Error loading presaved pipeline state: {e}")
+            output_state = "{}"
+            workflow = self.defaults["init_workflow"]
+            pipeline_state_dict = TossupPipelineState.from_workflow(workflow).model_dump()
+        return browser_state, not pipeline_change, pipeline_state_dict, output_state
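`load_presaved_pipeline_state` reads the payload persisted before login; on any validation failure it falls back to `defaults["init_workflow"]` and flips `pipeline_change` so downstream listeners re-render. A sketch of the payload's assumed shape, with keys taken from the accessors above and values hypothetical:

import json

browser_state = {
    "tossup": {
        # A serialized TossupPipelineState (workflow plus UI state)...
        "pipeline_state": {"workflow": {"steps": {}}, "ui_state": {}},
        # ...and the last run's output state, stored as a JSON string.
        "output_state": "{}",
    },
    "bonus": {},
}
state_dict = browser_state["tossup"].get("pipeline_state", {})
output_state = browser_state["tossup"].get("output_state", "{}")
print(json.dumps(state_dict), output_state)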
@@ … +141 @@ class TossupInterface:
+    # ------------------------------------------ INTERFACE RENDER FUNCTIONS -------------------------------------------
+
+    def _render_pipeline_interface(self, pipeline_state: TossupPipelineState):
         """Render the model interface."""
         with gr.Row(elem_classes="bonus-header-row form-inline"):
             self.pipeline_selector = commons.get_pipeline_selector([])
 …
         self.import_error_display = gr.HTML(label="Import Error", elem_id="import-error-display", visible=False)
         self.pipeline_interface = TossupPipelineInterface(
             self.app,
+            pipeline_state.workflow,
+            ui_state=pipeline_state.ui_state,
             model_options=list(self.model_options.keys()),
             config=self.defaults,
         )
 …
         with gr.Row():
             self.eval_btn = gr.Button("Evaluate", variant="primary")
 
+        self.model_name_input, self.description_input, self.submit_btn, self.submit_status = (
+            commons.get_model_submission_accordion(self.app)
+        )
 
     def render(self):
         """Create the Gradio interface."""
+        workflow = factory.create_empty_tossup_workflow()
+        pipeline_state = TossupPipelineState.from_workflow(workflow)
 
         self.hidden_input = gr.Textbox(value="", visible=False, elem_id="hidden-index")
 
         with gr.Row():
             # Model Panel
             with gr.Column(scale=1):
+                self._render_pipeline_interface(pipeline_state)
 
             with gr.Column(scale=1):
                 self._render_qb_interface()
 
         self._setup_event_listeners()
 
+    # ------------------------------------- Component Updates Functions ---------------------------------------------
+
     def get_new_question_html(self, question_id: int) -> str:
         """Get the HTML for a new question."""
         if question_id is None:
 …
         except Exception as e:
             return f"Error loading question: {str(e)}"
 
     def get_pipeline_names(self, profile: gr.OAuthProfile | None) -> list[str]:
         names = [UNSELECTED_PIPELINE_NAME] + populate.get_pipeline_names("tossup", profile)
         return gr.update(choices=names, value=UNSELECTED_PIPELINE_NAME)
 
     def load_pipeline(
         self, model_name: str, pipeline_change: bool, profile: gr.OAuthProfile | None
+    ) -> tuple[str, bool, TossupPipelineStateDict, dict]:
         try:
             workflow = populate.load_workflow("tossup", model_name, profile)
             if workflow is None:
                 logger.warning(f"Could not load workflow for {model_name}")
                 return UNSELECTED_PIPELINE_NAME, gr.skip(), gr.skip(), gr.update(visible=False)
             pipeline_state_dict = TossupPipelineState.from_workflow(workflow).model_dump()
+            return UNSELECTED_PIPELINE_NAME, not pipeline_change, pipeline_state_dict, gr.update(visible=True)
         except Exception as e:
             logger.exception(e)
             error_msg = styled_error(f"Error loading pipeline: {str(e)}")
             return UNSELECTED_PIPELINE_NAME, gr.skip(), gr.skip(), gr.update(visible=True, value=error_msg)
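`load_pipeline` returns `not pipeline_change` on success: the boolean state is a change signal, and flipping it fires every listener bound to its `.change` event even when no other output differs. A stripped-down sketch of the pattern, assuming a Gradio version where `gr.State` emits `.change` events, as this codebase does:

import gradio as gr

with gr.Blocks() as demo:
    pipeline_change = gr.State(False)  # boolean "something changed" signal
    status = gr.Textbox(label="status")
    load_btn = gr.Button("Import Pipeline")

    def load(change_signal: bool):
        # ...load and validate the workflow here...
        return not change_signal  # flip the bit so .change listeners fire

    load_btn.click(fn=load, inputs=[pipeline_change], outputs=[pipeline_change])
    pipeline_change.change(fn=lambda: "pipeline reloaded", outputs=[status])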
+    # ------------------------------------- Agent Functions -----------------------------------------------------------
+    def get_agent_outputs(
+        self, example: dict, pipeline_state: TossupPipelineState, early_stop: bool
+    ) -> list[ScoredTossupResult]:
+        """Get the model outputs for a given question ID."""
+        question_runs = []
+        tokens = example["question"].split()
+        for run_idx in example["run_indices"]:
+            question_runs.append(" ".join(tokens[: run_idx + 1]))
+        agent = QuizBowlTossupAgent(pipeline_state.workflow)
+        outputs = list(agent.run(question_runs, early_stop=early_stop))
+        outputs = add_model_scores(outputs, example["clean_answers"], example["run_indices"])
+        return outputs
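`get_agent_outputs` turns one question into incremental "runs": each run is the question prefix up to and including a run index, and the agent answers each prefix in turn. The prefix construction in isolation, with a hypothetical question and indices:

example = {
    "question": "This city on the Seine is home to the Louvre museum",
    "run_indices": [3, 7, 10],  # 0-based index of each run's final token
}
tokens = example["question"].split()
question_runs = [" ".join(tokens[: idx + 1]) for idx in example["run_indices"]]
print(question_runs)
# ['This city on the',
#  'This city on the Seine is home to',
#  'This city on the Seine is home to the Louvre museum']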
@@ … +254 @@ class TossupInterface:
     def single_run(
         self,
         question_id: int,
         state_dict: TossupPipelineStateDict,
         early_stop: bool = True,
     ) -> tuple[str, Any, Any]:
+        """Run the agent in tossup mode with a system prompt.
+
+        Returns:
+            tuple: A tuple containing:
+                - tokens_html (str): HTML representation of the tossup question with buzz indicators
+                - output_state (gr.update): Update for the output state component
+                - plot_data (gr.update): Update for the confidence plot with label and visibility
+                - df (gr.update): Update for the dataframe component showing model outputs
+                - step_outputs (gr.update): Update for the step outputs component
+                - error_msg (gr.update): Update for the error message component (hidden if no errors)
+        """
+
         try:
             pipeline_state = validation.validate_tossup_workflow(state_dict)
+            workflow = pipeline_state.workflow
             # Validate inputs
             question_id = int(question_id - 1)
             if not self.ds or question_id < 0 or question_id >= len(self.ds):
                 raise gr.Error("Invalid question ID or dataset not loaded")
             example = self.ds[question_id]
+            outputs = self.get_agent_outputs(example, pipeline_state, early_stop)
 
             # Process results and prepare visualization data
+            confidence_threshold = workflow.buzzer.confidence_threshold
+            prob_threshold = workflow.buzzer.prob_threshold
+            tokens_html, plot_data, output_state = initialize_eval_interface(
+                example, outputs, confidence_threshold, prob_threshold
+            )
             df = process_tossup_results(outputs)
+            tokens = example["question"].split()
+            step_outputs = {}
+            for output in outputs:
+                pos = output["token_position"]
+                token = tokens[pos - 1]
+                key = f"{pos}:{token}"
+                step_outputs[key] = {k: v for k, v in output["step_outputs"].items() if k not in workflow.inputs}
+                if output["logprob"] is not None:
+                    step_outputs[key]["logprob"] = output["logprob"]
+                    step_outputs[key]["prob"] = float(np.exp(output["logprob"]))
+
             return (
                 tokens_html,
                 gr.update(value=output_state),
+                gr.update(value=plot_data, label=f"Buzz Confidence on Question {question_id + 1}", show_label=True),
                 gr.update(value=df, label=f"Model Outputs for Question {question_id + 1}", visible=True),
                 gr.update(value=step_outputs, label=f"Step Outputs for Question {question_id + 1}", visible=True),
                 gr.update(visible=False),
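The step-outputs JSON view is keyed by "<token position>:<token>" so each entry lines up with a buzz point in the rendered question. The keying logic in isolation; the output dict here is a hypothetical agent result:

import numpy as np

tokens = "This city on the Seine is home to the Louvre museum".split()
output = {
    "token_position": 11,  # 1-based position of the buzz token
    "logprob": -0.094,
    "step_outputs": {"guesser": {"answer": "Louvre", "confidence": 0.91}},
}

key = f"{output['token_position']}:{tokens[output['token_position'] - 1]}"
entry = dict(output["step_outputs"])
if output["logprob"] is not None:
    entry["logprob"] = output["logprob"]
    entry["prob"] = float(np.exp(output["logprob"]))
print({key: entry})  # {'11:museum': {...}}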
@@ -274,32 +325,17 @@ class TossupInterface:
             if not self.ds or not self.ds.num_rows:
                 return "No dataset loaded", None, None
             pipeline_state = validation.validate_tossup_workflow(state_dict)
+            model_outputs = []
             for example in progress.tqdm(self.ds, desc="Evaluating tossup questions"):
+                run_outputs = self.get_agent_outputs(example, pipeline_state, early_stop=True)
+                model_outputs.append(run_outputs)
+            eval_df = prepare_tossup_results_df(self.ds["run_indices"], model_outputs)
+            plot_data = create_tossup_eval_dashboard(self.ds["run_indices"], eval_df)
+            output_df = create_tossup_eval_table(eval_df)
             return (
+                gr.update(value=plot_data, label="Buzz Positions on Sample Set", show_label=False),
+                gr.update(value=output_df, label="(Mean) Metrics on Sample Set", visible=True),
+                gr.update(visible=False),
                 gr.update(visible=False),
             )
         except Exception as e:
@@ -309,7 +345,8 @@ class TossupInterface:
             return (
                 gr.skip(),
                 gr.update(visible=False),
+                gr.update(visible=False),
+                gr.update(visible=True, value=styled_error(f"Error: {str(e)}")),
             )
 
     def submit_model(
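The old inline evaluation removed earlier in this diff reduced each question to its final buzz and reported three aggregates; the new `prepare_tossup_results_df` / `create_tossup_eval_table` pair replaces it. The same numbers can still be computed directly; a sketch over hypothetical final outputs, not the helpers' actual implementation:

import numpy as np
import pandas as pd

# One final (buzz) output per question: where the model buzzed, and whether
# the buzz was correct.
final_outputs = [
    {"token_position": 45, "score": 1},
    {"token_position": 62, "score": 0},
    {"token_position": 38, "score": 1},
]

token_positions = [o["token_position"] for o in final_outputs]
correct_buzzes = sum(o["score"] for o in final_outputs)

metrics = pd.DataFrame([{
    "Avg Buzz Position": f"{np.mean(token_positions):.2f}",   # 48.33
    "Buzz Accuracy": f"{correct_buzzes / len(final_outputs):.2%}",  # 66.67%
    "Total Score": f"{correct_buzzes}/{len(final_outputs)}",  # 2/3
}])
print(metrics.to_string(index=False))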
@@ -327,6 +364,12 @@ class TossupInterface:
             logger.exception(f"Error submitting model: {e.args}")
             return styled_error(f"Error: {str(e)}")
 
+    @property
+    def pipeline_state(self):
+        return self.pipeline_interface.pipeline_state
+
+    # ------------------------------------- Event Listeners -----------------------------------------------------------
+
     def _setup_event_listeners(self):
         gr.on(
             triggers=[self.app.load, self.qid_selector.change],
@@ -341,20 +384,27 @@ class TossupInterface:
             outputs=[self.pipeline_selector],
         )
 
         pipeline_change = self.pipeline_interface.pipeline_change
+
+        gr.on(
+            triggers=[self.app.load],
+            fn=self.load_presaved_pipeline_state,
+            inputs=[self.browser_state, pipeline_change],
+            outputs=[self.browser_state, pipeline_change, self.pipeline_state, self.output_state],
+        )
+
         self.load_btn.click(
             fn=self.load_pipeline,
             inputs=[self.pipeline_selector, pipeline_change],
+            outputs=[self.pipeline_selector, pipeline_change, self.pipeline_state, self.import_error_display],
         )
+        self.pipeline_interface.add_triggers_for_pipeline_export([self.pipeline_state.change], self.pipeline_state)
 
         self.run_btn.click(
             self.single_run,
             inputs=[
                 self.qid_selector,
+                self.pipeline_state,
                 self.early_stop_checkbox,
             ],
             outputs=[
@@ -369,8 +419,8 @@ class TossupInterface:
 
         self.eval_btn.click(
             fn=self.evaluate,
+            inputs=[self.pipeline_state],
+            outputs=[self.confidence_plot, self.results_table, self.model_outputs_display, self.error_display],
         )
 
         self.submit_btn.click(
@@ -378,7 +428,7 @@ class TossupInterface:
             inputs=[
                 self.model_name_input,
                 self.description_input,
+                self.pipeline_state,
             ],
             outputs=[self.submit_status],
         )
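The new `gr.on(triggers=[self.app.load], ...)` block above is what restores a user's in-progress pipeline after a page reload, for example around login. A minimal sketch of the same restore-on-load wiring; component names are placeholders, and it assumes a Gradio version that ships `gr.BrowserState`, which persists across reloads where a plain `gr.State` does not:

import gradio as gr

with gr.Blocks() as app:
    browser_state = gr.BrowserState({"tossup": {}, "bonus": {}})
    status = gr.Textbox(label="status")

    def restore(saved: dict):
        # Runs once per page load with whatever the browser persisted.
        return f"restored keys: {sorted(saved)}"

    gr.on(triggers=[app.load], fn=restore, inputs=[browser_state], outputs=[status])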
src/components/quizbowl/utils.py
CHANGED
@@ -14,7 +14,6 @@ def evaluate_prediction(prediction: str, clean_answers: list[str] | str) -> int:
     for answer in clean_answers:
         answer = answer.strip().lower()
         if answer and answer in pred:
-            print(f"Found {answer} in {pred}")
             return 1
     return 0
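With the stray debug `print` dropped, the helper is a plain normalized-substring match. A self-contained version for reference; the diff only shows the loop, so the handling of `pred` and of a bare-string `clean_answers` before line 14 is an assumption:

def evaluate_prediction(prediction: str, clean_answers: list[str] | str) -> int:
    # Assumed preamble (not shown in the diff): normalize the inputs.
    if isinstance(clean_answers, str):
        clean_answers = [clean_answers]
    pred = prediction.strip().lower()
    # The loop below is the part shown in the diff.
    for answer in clean_answers:
        answer = answer.strip().lower()
        if answer and answer in pred:
            return 1
    return 0

print(evaluate_prediction("The answer is Paris.", ["paris", "city of light"]))  # 1
print(evaluate_prediction("Rome", ["paris"]))  # 0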
src/components/typed_dicts.py
CHANGED
@@ -1,5 +1,7 @@
 from typing import Any, Dict, List, Literal, Optional, TypedDict, Union
 
+from workflows.structs import TossupWorkflow, Workflow
+
 
 # TypedDicts for workflows/structs.py
 class InputFieldDict(TypedDict):
@@ -62,3 +64,17 @@ class PipelineStateDict(TypedDict):
 
 class TossupPipelineStateDict(PipelineStateDict):
     workflow: TossupWorkflowDict
+
+
+class PipelineInterfaceDefaults(TypedDict):
+    init_workflow: Workflow
+    simple_workflow: bool
+    model: str
+    temperature: float
+    max_temperature: float
+
+
+class TossupInterfaceDefaults(PipelineInterfaceDefaults):
+    init_workflow: TossupWorkflow
+    confidence_threshold: float
+    early_stop: bool
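These TypedDicts document the `defaults` payload that app.py passes into each interface, so a type checker can flag missing or misspelled keys. A hypothetical construction; the field values are illustrative only:

from components.typed_dicts import TossupInterfaceDefaults
from workflows import factory  # provides the default tossup workflow, per the diff above

defaults: TossupInterfaceDefaults = {
    "init_workflow": factory.create_empty_tossup_workflow(),
    "simple_workflow": False,
    "model": "gpt-4o-mini",
    "temperature": 0.2,
    "max_temperature": 5.0,
    "confidence_threshold": 0.85,
    "early_stop": True,
}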