Spaces:

umdclip
/

quizbowl-submission

Running

App Files Files Community

Maharshi Gor committed 26 days ago

Commit

3b39b49

1 Parent(s): 22e8b31

Updates and Refactor in QB Interfaces:

Browse files

* Tooltips for model buzz, card style for questions
* Remove reset button for temperature
* Remove unused logging

Files changed (9) hide show

app.py +2 -2
src/components/model_pipeline/state_manager.py +0 -2
src/components/model_step/model_step.py +1 -0
src/components/quizbowl/bonus.py +34 -94
src/components/quizbowl/commons.py +15 -0
src/components/quizbowl/plotting.py +172 -64
src/components/quizbowl/tossup.py +39 -69
src/components/quizbowl/utils.py +16 -25
src/display/custom_css.py +113 -1

app.py CHANGED Viewed

@@ -6,7 +6,7 @@ from huggingface_hub import snapshot_download
 from app_configs import AVAILABLE_MODELS, DEFAULT_SELECTIONS, THEME
 from components.quizbowl.bonus import BonusInterface
 from components.quizbowl.tossup import TossupInterface
-from display.custom_css import css_pipeline, css_tossup
 from display.guide import GUIDE_MARKDOWN
 # Constants
@@ -148,7 +148,7 @@ if __name__ == "__main__":
     scheduler.add_job(restart_space, "interval", seconds=1800)
     scheduler.start()
-    full_css = css_pipeline + css_tossup
     tossup_ds = load_dataset("tossup")
     bonus_ds = load_dataset("bonus")
     with gr.Blocks(

 from app_configs import AVAILABLE_MODELS, DEFAULT_SELECTIONS, THEME
 from components.quizbowl.bonus import BonusInterface
 from components.quizbowl.tossup import TossupInterface
+from display.custom_css import css_bonus, css_pipeline, css_tossup
 from display.guide import GUIDE_MARKDOWN
 # Constants
     scheduler.add_job(restart_space, "interval", seconds=1800)
     scheduler.start()
+    full_css = css_pipeline + css_tossup + css_bonus
     tossup_ds = load_dataset("tossup")
     bonus_ds = load_dataset("bonus")
     with gr.Blocks(

src/components/model_pipeline/state_manager.py CHANGED Viewed

@@ -1,5 +1,4 @@
 import json
-import logging
 from typing import Any, Literal
 import gradio as gr
@@ -31,7 +30,6 @@ class ModelStepUIState(BaseModel):
     def update(self, key: str, value: Any) -> "ModelStepUIState":
         """Update the UI state."""
         new_state = self.model_copy(update={key: value})
-        logging.warning("UI state updated: %s", self)
         return new_state

 import json
 from typing import Any, Literal
 import gradio as gr
     def update(self, key: str, value: Any) -> "ModelStepUIState":
         """Update the UI state."""
         new_state = self.model_copy(update={key: value})
         return new_state

src/components/model_step/model_step.py CHANGED Viewed

@@ -244,6 +244,7 @@ class ModelStepComponent(FormComponent):
                 step=0.05,
                 info="Temperature",
                 show_label=False,
             )
     def render(self):

                 step=0.05,
                 info="Temperature",
                 show_label=False,
+                show_reset_button=False,
             )
     def render(self):

src/components/quizbowl/bonus.py CHANGED Viewed

@@ -3,8 +3,6 @@ import logging
 from typing import Any
 import gradio as gr
-import matplotlib.pyplot as plt
-import numpy as np
 import pandas as pd
 from datasets import Dataset
@@ -14,17 +12,14 @@ from workflows.qb.multi_step_agent import MultiStepBonusAgent
 from workflows.qb.simple_agent import SimpleBonusAgent
 from workflows.structs import ModelStep, Workflow
 from .plotting import (
-    create_pyplot,
     create_scatter_pyplot,
-    evaluate_buzz,
-    update_plot,
 )
-def evaluate_bonus_part(prediction: str, clean_answers: list[str]) -> float:
-    """Evaluate a single bonus part."""
-    return evaluate_buzz(prediction, clean_answers)
 def process_bonus_results(results: list[dict]) -> pd.DataFrame:
@@ -46,13 +41,7 @@ def process_bonus_results(results: list[dict]) -> pd.DataFrame:
 def initialize_eval_interface(example: dict, model_outputs: list[dict]):
     """Initialize the interface with example text."""
     try:
-        # Create HTML for leadin and parts
-        leadin_html = f"<div class='leadin'>{example['leadin']}</div>"
-        parts_html = []
-        for i, part in enumerate(example["parts"]):
-            parts_html.append(f"<div class='part'><b>Part {i + 1}:</b> {part['part']}</div>")
-        html_content = f"{leadin_html}<div class='parts-container'>{''.join(parts_html)}</div>"
         # Create confidence plot data
         plot_data = create_bonus_confidence_plot(example["parts"], model_outputs)
@@ -66,33 +55,6 @@ def initialize_eval_interface(example: dict, model_outputs: list[dict]):
         return f"<div>Error initializing interface: {str(e)}</div>", pd.DataFrame(), "{}"
-def create_bonus_confidence_plot(parts: list[dict], model_outputs: list[dict]):
-    """Create confidence plot for bonus parts."""
-    plt.style.use("ggplot")
-    fig = plt.figure(figsize=(10, 6))
-    ax = fig.add_subplot(111)
-    # Plot confidence for each part
-    x = range(1, len(parts) + 1)
-    confidences = [output["confidence"] for output in model_outputs]
-    scores = [output["score"] for output in model_outputs]
-    # Plot confidence bars
-    bars = ax.bar(x, confidences, color="#4698cf")
-    # Color bars based on correctness
-    for i, score in enumerate(scores):
-        bars[i].set_color("green" if score == 1 else "red")
-    ax.set_title("Part Confidence")
-    ax.set_xlabel("Part Number")
-    ax.set_ylabel("Confidence")
-    ax.set_xticks(x)
-    ax.set_xticklabels([f"Part {i}" for i in x])
-    return fig
 def validate_workflow(workflow: Workflow):
     """Validate that a workflow is properly configured for the bonus task."""
     if not workflow.steps:
@@ -165,27 +127,14 @@ class BonusInterface:
             simple=simple,
             model_options=list(self.model_options.keys()),
         )
-        with gr.Row():
-            self.run_btn = gr.Button("Run Bonus", variant="primary")
     def _render_qb_interface(self):
         """Render the quizbowl interface."""
-        with gr.Row():
-            self.qid_selector = gr.Number(
-                label="Question ID", value=1, precision=0, minimum=1, maximum=len(self.ds), show_label=True, scale=0
-            )
-            self.answer_display = gr.Textbox(
-                label="Answers", elem_id="answer-display", elem_classes="answer-box", interactive=False, scale=1
-            )
-            self.clean_answer_display = gr.Textbox(
-                label="Acceptable Answers",
-                elem_id="answer-display-2",
-                elem_classes="answer-box",
-                interactive=False,
-                scale=2,
-            )
-        self.question_display = gr.HTML(label="Question", elem_id="question-display")
         with gr.Row():
             self.confidence_plot = gr.Plot(
                 label="Part Confidence",
@@ -198,7 +147,7 @@ class BonusInterface:
         )
         with gr.Row():
-            self.eval_btn = gr.Button("Evaluate")
         with gr.Accordion("Model Submission", elem_classes="model-submission-accordion", open=True):
             with gr.Row():
@@ -206,7 +155,7 @@ class BonusInterface:
                 self.description_input = gr.Textbox(label="Description")
             with gr.Row():
                 gr.LoginButton()
-                self.submit_btn = gr.Button("Submit")
             self.submit_status = gr.HTML(label="Submission Status")
     def render(self):
@@ -226,30 +175,20 @@ class BonusInterface:
     def get_new_question_html(self, question_id: int):
         """Get the HTML for a new question."""
-        example = self.ds[question_id - 1]
-        leadin = example["leadin"]
-        parts = example["parts"]
-        # Create HTML for leadin and parts
-        leadin_html = f"<div class='leadin'>{leadin}</div>"
-        parts_html = []
-        for i, part in enumerate(parts):
-            parts_html.append(f"<div class='part'>{part['part']}</div>")
-        parts_html_str = "<br>".join(parts_html)
-        html_content = (
-            f"<div class='token-container'>{leadin_html}<div class='parts-container'><br>{parts_html_str}</div></div>"
-        )
-        # Format answers
-        primary_answers = [f"{i + 1}. {part['answer_primary']}" for i, part in enumerate(parts)]
-        clean_answers = []
-        for i, part in enumerate(parts):
-            part_answers = [a for a in part["clean_answers"] if len(a.split()) <= 6]
-            clean_answers.append(f"{i + 1}. {', '.join(part_answers)}")
-        return html_content, "\n".join(primary_answers), "\n".join(clean_answers)
     def get_model_outputs(self, example: dict, pipeline_state: PipelineState):
         """Get the model outputs for a given question ID."""
@@ -267,13 +206,13 @@ class BonusInterface:
             # Add part number and evaluate score
             part_output["part_number"] = i + 1
-            part_output["score"] = evaluate_bonus_part(part_output["answer"], part["clean_answers"])
             outputs.append(part_output)
         return outputs
-    def run_bonus(
         self,
         question_id: int,
         pipeline_state: PipelineState,
@@ -302,9 +241,9 @@ class BonusInterface:
             import traceback
             error_msg = f"Error: {str(e)}\n{traceback.format_exc()}"
-            return error_msg, None, None
-    def evaluate_bonus(self, pipeline_state: PipelineState, progress: gr.Progress = gr.Progress()):
         """Evaluate the bonus questions."""
         try:
             # Validate inputs
@@ -361,14 +300,15 @@ class BonusInterface:
             triggers=[self.app.load, self.qid_selector.change],
             fn=self.get_new_question_html,
             inputs=[self.qid_selector],
-            outputs=[self.question_display, self.answer_display, self.clean_answer_display],
         )
         self.run_btn.click(
             self.pipeline_interface.validate_workflow,
             inputs=[self.pipeline_interface.pipeline_state],
             outputs=[self.pipeline_interface.pipeline_state],
         ).success(
-            self.run_bonus,
             inputs=[
                 self.qid_selector,
                 self.pipeline_interface.pipeline_state,
@@ -382,7 +322,7 @@ class BonusInterface:
         )
         self.eval_btn.click(
-            fn=self.evaluate_bonus,
             inputs=[self.pipeline_interface.pipeline_state],
             outputs=[self.results_table, self.confidence_plot],
         )
@@ -397,7 +337,7 @@ class BonusInterface:
             outputs=[self.submit_status],
         )
         self.hidden_input.change(
-            fn=update_plot,
             inputs=[self.hidden_input, self.output_state],
             outputs=[self.confidence_plot],
         )

 from typing import Any
 import gradio as gr
 import pandas as pd
 from datasets import Dataset
 from workflows.qb.simple_agent import SimpleBonusAgent
 from workflows.structs import ModelStep, Workflow
+from .commons import get_qid_selector
 from .plotting import (
+    create_bonus_confidence_plot,
+    create_bonus_html,
     create_scatter_pyplot,
+    update_tossup_plot,
 )
+from .utils import evaluate_prediction
 def process_bonus_results(results: list[dict]) -> pd.DataFrame:
 def initialize_eval_interface(example: dict, model_outputs: list[dict]):
     """Initialize the interface with example text."""
     try:
+        html_content = create_bonus_html(example["leadin"], example["parts"])
         # Create confidence plot data
         plot_data = create_bonus_confidence_plot(example["parts"], model_outputs)
         return f"<div>Error initializing interface: {str(e)}</div>", pd.DataFrame(), "{}"
 def validate_workflow(workflow: Workflow):
     """Validate that a workflow is properly configured for the bonus task."""
     if not workflow.steps:
             simple=simple,
             model_options=list(self.model_options.keys()),
         )
     def _render_qb_interface(self):
         """Render the quizbowl interface."""
+        with gr.Row(elem_classes="bonus-header-row form-inline"):
+            self.qid_selector = get_qid_selector(len(self.ds))
+            self.run_btn = gr.Button("Run on Bonus Question", variant="secondary")
+        self.question_display = gr.HTML(label="Question", elem_id="bonus-question-display")
         with gr.Row():
             self.confidence_plot = gr.Plot(
                 label="Part Confidence",
         )
         with gr.Row():
+            self.eval_btn = gr.Button("Evaluate", variant="primary")
         with gr.Accordion("Model Submission", elem_classes="model-submission-accordion", open=True):
             with gr.Row():
                 self.description_input = gr.Textbox(label="Description")
             with gr.Row():
                 gr.LoginButton()
+                self.submit_btn = gr.Button("Submit", variant="primary")
             self.submit_status = gr.HTML(label="Submission Status")
     def render(self):
     def get_new_question_html(self, question_id: int):
         """Get the HTML for a new question."""
+        if question_id is None:
+            logging.error("Question ID is None. Setting to 1")
+            question_id = 1
+        try:
+            question_id = int(question_id) - 1
+            if not self.ds or question_id < 0 or question_id >= len(self.ds):
+                return "Invalid question ID or dataset not loaded"
+            example = self.ds[question_id]
+            leadin = example["leadin"]
+            parts = example["parts"]
+            return create_bonus_html(leadin, parts)
+        except Exception as e:
+            return f"Error loading question: {str(e)}"
     def get_model_outputs(self, example: dict, pipeline_state: PipelineState):
         """Get the model outputs for a given question ID."""
             # Add part number and evaluate score
             part_output["part_number"] = i + 1
+            part_output["score"] = evaluate_prediction(part_output["answer"], part["clean_answers"])
             outputs.append(part_output)
         return outputs
+    def single_run(
         self,
         question_id: int,
         pipeline_state: PipelineState,
             import traceback
             error_msg = f"Error: {str(e)}\n{traceback.format_exc()}"
+            return error_msg, gr.skip(), gr.skip(), gr.skip()
+    def evaluate(self, pipeline_state: PipelineState, progress: gr.Progress = gr.Progress()):
         """Evaluate the bonus questions."""
         try:
             # Validate inputs
             triggers=[self.app.load, self.qid_selector.change],
             fn=self.get_new_question_html,
             inputs=[self.qid_selector],
+            outputs=[self.question_display],
         )
         self.run_btn.click(
             self.pipeline_interface.validate_workflow,
             inputs=[self.pipeline_interface.pipeline_state],
             outputs=[self.pipeline_interface.pipeline_state],
         ).success(
+            self.single_run,
             inputs=[
                 self.qid_selector,
                 self.pipeline_interface.pipeline_state,
         )
         self.eval_btn.click(
+            fn=self.evaluate,
             inputs=[self.pipeline_interface.pipeline_state],
             outputs=[self.results_table, self.confidence_plot],
         )
             outputs=[self.submit_status],
         )
         self.hidden_input.change(
+            fn=update_tossup_plot,
             inputs=[self.hidden_input, self.output_state],
             outputs=[self.confidence_plot],
         )

src/components/quizbowl/commons.py ADDED Viewed

	@@ -0,0 +1,15 @@

+import gradio as gr
def get_qid_selector(dataset_size: int):
    """Build the question-ID number input.

    The input starts at 1 and is bounded to the range 1..``dataset_size``.
    It is rendered without a label or container; "Question ID" is supplied
    as the info text instead.
    """
    selector_options = {
        "info": "Question ID",
        "value": 1,
        "precision": 0,
        "minimum": 1,
        "maximum": dataset_size,
        "show_label": False,
        "scale": 0,
        "container": False,
        "elem_classes": "qid-selector",
    }
    return gr.Number(**selector_options)

src/components/quizbowl/plotting.py CHANGED Viewed

@@ -7,67 +7,145 @@ import matplotlib.pyplot as plt
 import pandas as pd
-def evaluate_buzz(prediction: str, clean_answers: list[str] | str) -> int:
-    """Evaluate the buzz of a prediction against the clean answers."""
-    if isinstance(clean_answers, str):
-        print("clean_answers is a string")
-        clean_answers = [clean_answers]
-    pred = prediction.lower().strip()
-    if not pred:
-        return 0
-    for answer in clean_answers:
-        answer = answer.strip().lower()
-        if answer and answer in pred:
-            print(f"Found {answer} in {pred}")
-            return 1
-    return 0
-def create_answer_html(answer: str):
-    """Create HTML for the answer."""
-    return f"<div class='answer-header'>Answer:<br>{answer}</div>"
-def create_tokens_html(tokens: list[str], eval_points: list[tuple], answer: str, marker_indices: list[int] = None):
     """Create HTML for tokens with hover capability and a colored header for the answer."""
     try:
-        html_parts = []
         ep = dict(eval_points)
-        marker_indices = set(marker_indices) if isinstance(marker_indices, list) else set()
-        # Add a colored header for the answer
-        # html_parts.append(create_answer_html(answer))
         for i, token in enumerate(tokens):
-            # Check if this token is a buzz point
-            values = ep.get(i, (None, 0, 0))
-            confidence, buzz_point, score = values
-            # Replace non-word characters for proper display in HTML
-            display_token = token
-            if not re.match(r"\w+", token):
-                display_token = token.replace(" ", "&nbsp;")
-            # Add buzz marker class if it's a buzz point
-            if confidence is None:
-                css_class = ""
-            elif not buzz_point:
-                css_class = " guess-point no-buzz"
-            else:
-                css_class = f" guess-point buzz-{score}"
-            token_html = f'<span id="token-{i}" class="token{css_class}" data-index="{i}">{display_token}</span>'
-            if i in marker_indices:
-                token_html += "<span style='color: rgba(0,0,255,0.3);'>|</span>"
-            html_parts.append(token_html)
-        return f"<div class='token-container'>{''.join(html_parts)}</div>"
     except Exception as e:
         logging.error(f"Error creating token HTML: {e}", exc_info=True)
         return f"<div class='token-container'>Error creating tokens: {str(e)}</div>"
-def create_line_plot(eval_points, highlighted_index=-1):
     """Create a Gradio LinePlot of token values with optional highlighting using DataFrame."""
     try:
         # Create base confidence data
@@ -114,26 +192,29 @@ def create_line_plot(eval_points, highlighted_index=-1):
         return pd.DataFrame(columns=["position", "value", "type", "highlight", "color"])
-def create_pyplot(tokens, eval_points, highlighted_index=-1):
     """Create a pyplot of token values with optional highlighting."""
     plt.style.use("ggplot")  # Set theme to grid paper
-    fig = plt.figure(figsize=(10, 6))  # Set figure size
     ax = fig.add_subplot(111)
     x = [0]
     y = [0]
-    for i, (v, b, s) in eval_points:
         x.append(i + 1)
-        y.append(v)
     ax.plot(x, y, "o--", color="#4698cf")
-    for i, (v, b, s) in eval_points:
-        if not b:
             continue
-        color = "green" if s else "red"
-        ax.plot(i + 1, v, "o", color=color)
         if i >= len(tokens):
             print(f"Token index {i} is out of bounds for n_tokens: {len(tokens)}")
-        ax.annotate(f"{tokens[i]}", (i + 1, v), textcoords="offset points", xytext=(0, 10), ha="center")
     if highlighted_index >= 0:
         # Add light vertical line for the highlighted token from 0 to 1
@@ -147,10 +228,10 @@ def create_pyplot(tokens, eval_points, highlighted_index=-1):
     return fig
-def create_scatter_pyplot(token_positions, scores):
     """Create a scatter plot of token positions and scores."""
     plt.style.use("ggplot")
-    fig = plt.figure(figsize=(10, 6))
     ax = fig.add_subplot(111)
     counts = Counter(zip(token_positions, scores))
@@ -167,7 +248,34 @@ def create_scatter_pyplot(token_positions, scores):
     return fig
-def update_plot(highlighted_index, state):
     """Update the plot when a token is hovered; add a vertical line on the plot."""
     try:
         if not state or state == "{}":
@@ -187,7 +295,7 @@ def update_plot(highlighted_index, state):
         # Create updated plot with highlighting of the token point
         # plot_data = create_line_plot(values, highlighted_index)
-        plot_data = create_pyplot(tokens, values, highlighted_index)
         return plot_data
     except Exception as e:
         logging.error(f"Error updating plot: {e}")

 import pandas as pd
+def _make_answer_html(answer: str, clean_answers: list[str] = []) -> str:
+    clean_answers = [a for a in clean_answers if len(a.split()) <= 6 and a != answer]
+    additional_answers_html = ""
+    if clean_answers:
+        additional_answers_html = f"<span class='bonus-answer-text'> [or {', '.join(clean_answers)}]</span>"
+    return f"""
+        <div class='bonus-answer'>
+            <span class='bonus-answer-label'>Answer: </span>
+            <span class='bonus-answer-text'>{answer}</span>
+            {additional_answers_html}
+        </div>
+    """
+def _get_token_classes(confidence, buzz, score) -> str:
+    if confidence is None:
+        return "token"
+    elif not buzz:
+        return "token guess-point no-buzz"
+    else:
+        return f"token guess-point buzz-{score}"
+def _create_token_tooltip_html(values) -> str:
+    if not values:
+        return ""
+    confidence = values.get("confidence", 0)
+    buzz = values.get("buzz", 0)
+    score = values.get("score", 0)
+    answer = values.get("answer", "")
+    answer_tokens = answer.split()
+    if len(answer_tokens) > 10:
+        k = len(answer_tokens) - 10
+        answer = " ".join(answer_tokens[:10]) + f"...[{k} more words]"
+    color = "#a3c9a3" if score else "#ebbec4"  # Light green for correct, light pink for incorrect
+    return f"""
+        <div class="tooltip card" style="background-color: {color}; border-radius: 8px; padding: 12px; box-shadow: 2px 4px 8px rgba(0, 0, 0, 0.15);">
+            <div class="tooltip-content" style="font-family: 'Arial', sans-serif; color: #333;">
+                <h4 style="margin: 0 0 8px;">💡 Answer</h4>
+                <p style="font-weight: bold; margin: 0 0 8px;">{answer}</p>
+                <p style="margin: 0 0 4px;">📊 <strong>Confidence:</strong> {confidence:.2f}</p>
+                <p style="margin: 0;">🔍 <strong>Status:</strong> {"✅ Correct" if score else "❌ Incorrect" if buzz else "🚫 No Buzz"}</p>
+            </div>
+        </div>
+    """
def create_token_html(token: str, values: dict, i: int) -> str:
    """Render one question token as a ``<span>`` with an optional tooltip.

    Args:
        token: Token text.
        values: Model output at this token (keys ``"confidence"``, ``"buzz"``,
            ``"score"``, ``"answer"``), or an empty dict when the model was
            not evaluated here.
        i: Index written to the span's ``id`` (``token-{i}``) and
            ``data-index`` attributes.

    Returns:
        The ``<span>`` markup. Evaluated tokens get a trailing marker
        (🚨 for a buzz, 💭 for a guess without a buzz) and an embedded tooltip.
    """
    confidence = values.get("confidence", None)
    buzz = values.get("buzz", 0)
    score = values.get("score", 0)

    # Tokens that do not start with a word character get their spaces made
    # non-breaking. Do this BEFORE appending the marker: previously this step
    # overwrote the marked text, so tokens starting with punctuation (an
    # opening quote or bracket, for example) lost their 🚨/💭 marker.
    display_token = token if re.match(r"\w+", token) else token.replace(" ", "&nbsp;")
    if buzz:
        display_token = f"{display_token} 🚨"
    elif values:
        display_token = f"{display_token} 💭"

    css_class = _get_token_classes(confidence, buzz, score)
    # The tooltip is empty when there are no values for this token.
    tooltip_html = _create_token_tooltip_html(values)
    return f'<span id="token-{i}" class="{css_class}" data-index="{i}">{display_token}{tooltip_html}</span>'
def create_tossup_html(
    tokens: list[str],
    answer_primary: str,
    clean_answers: list[str],
    marker_indices: list[int] | None = None,
    eval_points: list[tuple[int, dict]] | None = None,
) -> str:
    """Create the tossup card: hoverable question tokens followed by the answer.

    Args:
        tokens: Question tokens in display order.
        answer_primary: Primary answer shown under the question.
        clean_answers: Alternate acceptable answers passed to the answer line.
        marker_indices: Accepted for interface compatibility; not used by the
            current rendering.
        eval_points: ``(token_index, values)`` pairs keyed by 0-based token
            position; ``values`` holds the model output at that token.

    Returns:
        The card HTML, or an error ``<div>`` if rendering raises.
    """
    try:
        # ``None`` defaults replace the previous shared mutable ``[]`` defaults.
        ep = dict(eval_points or [])
        # Span ids / data-index are 1-based (i + 1) while ``ep`` is looked up
        # with the 0-based position.
        html_tokens = [create_token_html(token, ep.get(i, {}), i + 1) for i, token in enumerate(tokens)]
        answer_html = _make_answer_html(answer_primary, clean_answers)
        return f"""
        <div class='bonus-container'>
            <div class='bonus-card'>
                <div class='tossup-question'>
                    {"".join(html_tokens)}
            </div>
                {answer_html}
            </div>
        </div>
        """
    except Exception as e:
        logging.error(f"Error creating token HTML: {e}", exc_info=True)
        return f"<div class='token-container'>Error creating tokens: {str(e)}</div>"
def create_bonus_html(leadin: str, parts: list[dict]) -> str:
    """Create the bonus card: the leadin followed by each numbered part.

    Args:
        leadin: Introductory text for the bonus.
        parts: One dict per part, read for the keys ``"part"`` (question
            text), ``"answer_primary"`` and ``"clean_answers"``.

    Returns:
        The card HTML, with each part's answer line rendered beneath its text.
    """
    leadin_html = f"<div class='bonus-leadin'>{leadin}</div>"
    parts_html = []
    for i, part in enumerate(parts):
        question_text = part["part"]
        answer_html = _make_answer_html(part["answer_primary"], part["clean_answers"])
        part_html = f"""
                <div class='bonus-part'>
                    <div class='bonus-part-text'><b>#{i + 1}.</b> {question_text}</div>
                    {answer_html}
                </div>
            """
        parts_html.append(part_html)

    # Removed from the previous version: a bare string statement that had no
    # effect, and a trailing loop that built a list which was never returned.
    return f"""
            <div class='bonus-container'>
                <div class='bonus-card'>
                    {leadin_html}
                    {"".join(parts_html)}
                </div>
            </div>
        """
+def create_line_plot(eval_points: list[tuple[int, dict]], highlighted_index: int = -1) -> pd.DataFrame:
     """Create a Gradio LinePlot of token values with optional highlighting using DataFrame."""
     try:
         # Create base confidence data
         return pd.DataFrame(columns=["position", "value", "type", "highlight", "color"])
+def create_tossup_confidence_pyplot(
+    tokens: list[str], eval_points: list[tuple[int, dict]], highlighted_index: int = -1
+) -> plt.Figure:
     """Create a pyplot of token values with optional highlighting."""
     plt.style.use("ggplot")  # Set theme to grid paper
+    fig = plt.figure(figsize=(11, 5))  # Set figure size to 11x5
     ax = fig.add_subplot(111)
     x = [0]
     y = [0]
+    for i, v in eval_points:
         x.append(i + 1)
+        y.append(v["confidence"])
     ax.plot(x, y, "o--", color="#4698cf")
+    for i, v in eval_points:
+        if not v["buzz"]:
             continue
+        confidence = v["confidence"]
+        color = "green" if v["score"] else "red"
+        ax.plot(i + 1, confidence, "o", color=color)
         if i >= len(tokens):
             print(f"Token index {i} is out of bounds for n_tokens: {len(tokens)}")
+        ax.annotate(f"{tokens[i]}", (i + 1, confidence), textcoords="offset points", xytext=(0, 10), ha="center")
     if highlighted_index >= 0:
         # Add light vertical line for the highlighted token from 0 to 1
     return fig
+def create_scatter_pyplot(token_positions: list[int], scores: list[int]) -> plt.Figure:
     """Create a scatter plot of token positions and scores."""
     plt.style.use("ggplot")
+    fig = plt.figure(figsize=(11, 5))
     ax = fig.add_subplot(111)
     counts = Counter(zip(token_positions, scores))
     return fig
def create_bonus_confidence_plot(parts: list[dict], model_outputs: list[dict]) -> plt.Figure:
    """Draw a bar chart of the model's confidence on each bonus part.

    Bar positions are numbered 1..len(parts); heights come from each output's
    "confidence". A bar is green when the matching output's "score" equals 1
    and red otherwise.
    """
    plt.style.use("ggplot")
    figure = plt.figure(figsize=(10, 6))
    axes = figure.add_subplot(111)

    part_numbers = range(1, len(parts) + 1)
    confidences = [output["confidence"] for output in model_outputs]
    scores = [output["score"] for output in model_outputs]

    # Draw in the default blue, then recolour each bar by correctness.
    bars = axes.bar(part_numbers, confidences, color="#4698cf")
    for position, score in enumerate(scores):
        bar_color = "green" if score == 1 else "red"
        bars[position].set_color(bar_color)

    axes.set_title("Part Confidence")
    axes.set_xlabel("Part Number")
    axes.set_ylabel("Confidence")
    axes.set_xticks(part_numbers)
    tick_labels = [f"Part {number}" for number in part_numbers]
    axes.set_xticklabels(tick_labels)

    return figure
+def update_tossup_plot(highlighted_index: int, state: str) -> pd.DataFrame:
     """Update the plot when a token is hovered; add a vertical line on the plot."""
     try:
         if not state or state == "{}":
         # Create updated plot with highlighting of the token point
         # plot_data = create_line_plot(values, highlighted_index)
+        plot_data = create_tossup_confidence_pyplot(tokens, values, highlighted_index)
         return plot_data
     except Exception as e:
         logging.error(f"Error updating plot: {e}")

src/components/quizbowl/tossup.py CHANGED Viewed

@@ -13,14 +13,14 @@ from workflows.qb.multi_step_agent import MultiStepTossupAgent
 from workflows.qb.simple_agent import SimpleTossupAgent
 from workflows.structs import ModelStep, Workflow
 from .plotting import (
-    create_answer_html,
-    create_pyplot,
     create_scatter_pyplot,
-    create_tokens_html,
-    evaluate_buzz,
-    update_plot,
 )
 # TODO: Error handling on run tossup and evaluate tossup and show correct messages
 # TODO: ^^ Same for Bonus
@@ -29,7 +29,7 @@ from .plotting import (
 def add_model_scores(model_outputs: list[dict], clean_answers: list[str], run_indices: list[int]) -> list[dict]:
     """Add model scores to the model outputs."""
     for output, run_idx in zip(model_outputs, run_indices):
-        output["score"] = evaluate_buzz(output["answer"], clean_answers)
         output["token_position"] = run_idx + 1
     return model_outputs
@@ -43,26 +43,25 @@ def prepare_buzz_evals(
         return [], []
     eval_points = []
     for i, v in zip(run_indices, model_outputs):
-        eval_point = v["confidence"], v["buzz"], v["score"]
-        eval_points.append((int(i), eval_point))
     return eval_points
 def initialize_eval_interface(example, model_outputs: list[dict]):
     """Initialize the interface with example text."""
-    tokens = example["question"].split()
-    run_indices = example["run_indices"]
-    answer = example["answer_primary"]
     try:
         eval_points = prepare_buzz_evals(run_indices, model_outputs)
         if not tokens:
             return "<div>No tokens found in the provided text.</div>", pd.DataFrame(), "{}"
-        highlighted_index = next((int(i) for i, (_, b, _) in eval_points if b == 1), -1)
-        html_content = create_tokens_html(tokens, eval_points, answer)
-        plot_data = create_pyplot(tokens, eval_points, highlighted_index)
         # Store tokens, values, and buzzes as JSON for later use
         state = json.dumps({"tokens": tokens, "values": eval_points})
@@ -195,26 +194,13 @@ class TossupInterface:
                 label="Early Stop",
                 info="Stop early if already buzzed",
             )
-            self.run_btn = gr.Button("Run Tossup", variant="primary")
     def _render_qb_interface(self):
         """Render the quizbowl interface."""
-        with gr.Row():
-            self.qid_selector = gr.Number(
-                label="Question ID", value=1, precision=0, minimum=1, maximum=len(self.ds), show_label=True, scale=0
-            )
-            self.answer_display = gr.Textbox(
-                label="PrimaryAnswer", elem_id="answer-display", elem_classes="answer-box", interactive=False, scale=1
-            )
-            self.clean_answer_display = gr.Textbox(
-                label="Acceptable Answers",
-                elem_id="answer-display-2",
-                elem_classes="answer-box",
-                interactive=False,
-                scale=2,
-            )
-            # self.answer_display = gr.HTML(label="Answer", elem_id="answer-display")
-        self.question_display = gr.HTML(label="Question", elem_id="question-display")
         with gr.Row():
             self.confidence_plot = gr.Plot(
                 label="Buzz Confidence",
@@ -225,7 +211,7 @@ class TossupInterface:
             value=pd.DataFrame(columns=["Token Position", "Correct?", "Confidence", "Prediction"]),
         )
         with gr.Row():
-            self.eval_btn = gr.Button("Evaluate")
         with gr.Accordion("Model Submission", elem_classes="model-submission-accordion", open=True):
             with gr.Row():
@@ -233,7 +219,7 @@ class TossupInterface:
                 self.description_input = gr.Textbox(label="Description")
             with gr.Row():
                 gr.LoginButton()
-                self.submit_btn = gr.Button("Submit")
             self.submit_status = gr.HTML(label="Submission Status")
     def render(self):
@@ -253,22 +239,6 @@ class TossupInterface:
         self._setup_event_listeners()
-    def get_full_question(self, question_id: int) -> str:
-        """Get the full question text for a given question ID."""
-        try:
-            question_id = int(question_id - 1)
-            if not self.ds or question_id < 0 or question_id >= len(self.ds):
-                return "Invalid question ID or dataset not loaded"
-            question_data = self.ds[question_id]
-            # Get the full question text (the last element in question_runs)
-            full_question = question_data["question"]
-            gold_label = question_data["answer_primary"]
-            return f"Question: {full_question}\n\nCorrect Answer: {gold_label}"
-        except Exception as e:
-            return f"Error loading question: {str(e)}"
     def validate_workflow(self, pipeline_state: PipelineState):
         """Validate the workflow."""
         try:
@@ -276,17 +246,19 @@ class TossupInterface:
         except Exception as e:
             raise gr.Error(f"Error validating workflow: {str(e)}")
-    def get_new_question_html(self, question_id: int):
         """Get the HTML for a new question."""
-        example = self.ds[question_id - 1]
-        question = example["question"]
-        gold_label = example["answer_primary"]
-        marker_indices = example["run_indices"]
-        tokens = question.split()
-        question_html = create_tokens_html(tokens, [], gold_label, marker_indices)
-        clean_answers = [a for a in example["clean_answers"] if len(a.split()) <= 6]
-        clean_answers = ", ".join(clean_answers)
-        return question_html, gold_label, clean_answers
     def get_model_outputs(self, example: dict, pipeline_state: PipelineState, buzz_threshold: float, early_stop: bool):
         """Get the model outputs for a given question ID."""
@@ -304,7 +276,7 @@ class TossupInterface:
         outputs = add_model_scores(outputs, example["clean_answers"], example["run_indices"])
         return outputs
-    def run_tossup(
         self,
         question_id: int,
         pipeline_state: PipelineState,
@@ -335,10 +307,8 @@ class TossupInterface:
             error_msg = f"Error: {str(e)}\n{traceback.format_exc()}"
             return error_msg, None, None
-    def evaluate_tossups(
-        self, pipeline_state: PipelineState, buzz_threshold: float, progress: gr.Progress = gr.Progress()
-    ):
-        """Evaluate the tossup."""
         try:
             # Validate inputs
             if not self.ds or not self.ds.num_rows:
@@ -388,7 +358,7 @@ class TossupInterface:
             triggers=[self.app.load, self.qid_selector.change],
             fn=self.get_new_question_html,
             inputs=[self.qid_selector],
-            outputs=[self.question_display, self.answer_display, self.clean_answer_display],
         )
         self.run_btn.click(
@@ -396,7 +366,7 @@ class TossupInterface:
             inputs=[self.pipeline_interface.pipeline_state],
             outputs=[self.pipeline_interface.pipeline_state],
         ).success(
-            self.run_tossup,
             inputs=[
                 self.qid_selector,
                 self.pipeline_interface.pipeline_state,
@@ -412,7 +382,7 @@ class TossupInterface:
         )
         self.eval_btn.click(
-            fn=self.evaluate_tossups,
             inputs=[self.pipeline_interface.pipeline_state, self.buzz_t_slider],
             outputs=[self.results_table, self.confidence_plot],
         )
@@ -428,7 +398,7 @@ class TossupInterface:
         )
         self.hidden_input.change(
-            fn=update_plot,
             inputs=[self.hidden_input, self.output_state],
             outputs=[self.confidence_plot],
         )

 from workflows.qb.simple_agent import SimpleTossupAgent
 from workflows.structs import ModelStep, Workflow
+from .commons import get_qid_selector
 from .plotting import (
     create_scatter_pyplot,
+    create_tossup_confidence_pyplot,
+    create_tossup_html,
+    update_tossup_plot,
 )
+from .utils import evaluate_prediction
 # TODO: Add error handling to the single-run and evaluate paths, and show accurate error messages in the UI
 # TODO: Do the same for the Bonus interface
 def add_model_scores(model_outputs: list[dict], clean_answers: list[str], run_indices: list[int]) -> list[dict]:
     """Annotate each model output with a correctness score and a token position.

     Outputs are paired with run indices in order; pairing stops at the shorter
     of the two sequences. Each paired output dict is updated in place with:

     * ``"score"``: result of ``evaluate_prediction`` on the output's
       ``"answer"`` against ``clean_answers``.
     * ``"token_position"``: the paired run index plus one.

     Returns:
         The same ``model_outputs`` list that was passed in.
     """
     for token_index, prediction in zip(run_indices, model_outputs):
         prediction["score"] = evaluate_prediction(prediction["answer"], clean_answers)
         prediction["token_position"] = token_index + 1
     return model_outputs
         return [], []
     eval_points = []
     for i, v in zip(run_indices, model_outputs):
+        eval_points.append((int(i), v))
     return eval_points
 def initialize_eval_interface(example, model_outputs: list[dict]):
     """Initialize the interface with example text."""
     try:
+        tokens = example["question"].split()
+        run_indices = example["run_indices"]
+        answer = example["answer_primary"]
+        clean_answers = example["clean_answers"]
         eval_points = prepare_buzz_evals(run_indices, model_outputs)
         if not tokens:
             return "<div>No tokens found in the provided text.</div>", pd.DataFrame(), "{}"
+        highlighted_index = next((int(i) for i, v in eval_points if v["buzz"] == 1), -1)
+        html_content = create_tossup_html(tokens, answer, clean_answers, run_indices, eval_points)
+        plot_data = create_tossup_confidence_pyplot(tokens, eval_points, highlighted_index)
         # Store tokens, values, and buzzes as JSON for later use
         state = json.dumps({"tokens": tokens, "values": eval_points})
                 label="Early Stop",
                 info="Stop early if already buzzed",
             )
     def _render_qb_interface(self):
         """Render the quizbowl interface."""
+        with gr.Row(elem_classes="bonus-header-row form-inline"):
+            self.qid_selector = get_qid_selector(len(self.ds))
+            self.run_btn = gr.Button("Run on Tossup Question", variant="secondary")
+        self.question_display = gr.HTML(label="Question", elem_id="tossup-question-display")
         with gr.Row():
             self.confidence_plot = gr.Plot(
                 label="Buzz Confidence",
             value=pd.DataFrame(columns=["Token Position", "Correct?", "Confidence", "Prediction"]),
         )
         with gr.Row():
+            self.eval_btn = gr.Button("Evaluate", variant="primary")
         with gr.Accordion("Model Submission", elem_classes="model-submission-accordion", open=True):
             with gr.Row():
                 self.description_input = gr.Textbox(label="Description")
             with gr.Row():
                 gr.LoginButton()
+                self.submit_btn = gr.Button("Submit", variant="primary")
             self.submit_status = gr.HTML(label="Submission Status")
     def render(self):
         self._setup_event_listeners()
     def validate_workflow(self, pipeline_state: PipelineState):
         """Validate the workflow."""
         try:
         except Exception as e:
             raise gr.Error(f"Error validating workflow: {str(e)}")
def get_new_question_html(self, question_id: int) -> str:
    """Render the tossup question with the given 1-based ID as HTML.

    Args:
        question_id: 1-based position of the question in ``self.ds``.
            ``None`` is logged as an error and treated as 1.

    Returns:
        The value returned by ``create_tossup_html`` for the question's
        whitespace-split tokens, primary answer, clean answers and run
        indices. On any failure, including an out-of-range ID, a string
        starting with ``"Error loading question: "`` is returned instead
        of raising.
    """
    if question_id is None:
        logging.error("Question ID is None. Setting to 1")
        question_id = 1
    try:
        index = int(question_id) - 1
        total = len(self.ds)
        if not 0 <= index < total:
            # Without this check an ID of 0 or below becomes a negative index
            # and silently shows a question from the end of the dataset.
            raise IndexError(f"Question ID {question_id} is out of range (1-{total})")
        example = self.ds[index]
        question_tokens = example["question"].split()
        return create_tossup_html(
            question_tokens, example["answer_primary"], example["clean_answers"], example["run_indices"]
        )
    except Exception as e:
        # Event handler boundary: report the problem in the display rather than crash.
        return f"Error loading question: {str(e)}"
     def get_model_outputs(self, example: dict, pipeline_state: PipelineState, buzz_threshold: float, early_stop: bool):
         """Get the model outputs for a given question ID."""
         outputs = add_model_scores(outputs, example["clean_answers"], example["run_indices"])
         return outputs
+    def single_run(
         self,
         question_id: int,
         pipeline_state: PipelineState,
             error_msg = f"Error: {str(e)}\n{traceback.format_exc()}"
             return error_msg, None, None
+    def evaluate(self, pipeline_state: PipelineState, buzz_threshold: float, progress: gr.Progress = gr.Progress()):
+        """Evaluate the tossup questions."""
         try:
             # Validate inputs
             if not self.ds or not self.ds.num_rows:
             triggers=[self.app.load, self.qid_selector.change],
             fn=self.get_new_question_html,
             inputs=[self.qid_selector],
+            outputs=[self.question_display],
         )
         self.run_btn.click(
             inputs=[self.pipeline_interface.pipeline_state],
             outputs=[self.pipeline_interface.pipeline_state],
         ).success(
+            self.single_run,
             inputs=[
                 self.qid_selector,
                 self.pipeline_interface.pipeline_state,
         )
         self.eval_btn.click(
+            fn=self.evaluate,
             inputs=[self.pipeline_interface.pipeline_state, self.buzz_t_slider],
             outputs=[self.results_table, self.confidence_plot],
         )
         )
         self.hidden_input.change(
+            fn=update_tossup_plot,
             inputs=[self.hidden_input, self.output_state],
             outputs=[self.confidence_plot],
         )

src/components/quizbowl/utils.py CHANGED Viewed

@@ -3,6 +3,22 @@ from typing import Any, Dict, List
 import pandas as pd
 def _create_confidence_plot_data(results: List[Dict], top_k_mode: bool = False) -> pd.DataFrame:
     """Create a DataFrame for the confidence plot."""
     if not top_k_mode:
@@ -59,28 +75,3 @@ def _create_top_k_dataframe(results: List[Dict]) -> pd.DataFrame:
                 }
             )
     return pd.DataFrame(df_rows)
-def _format_buzz_result(buzzed: bool, results: List[Dict], gold_label: str, top_k_mode: bool) -> tuple[str, str, bool]:
-    """Format the result text based on whether the agent buzzed."""
-    if not buzzed:
-        return f"Did not buzz. Correct answer was: {gold_label}", "No buzz", False
-    buzz_position = next(i for i, r in enumerate(results) if r.get("buzz", False))
-    buzz_result = results[buzz_position]
-    if top_k_mode:
-        # For top-k, check if any of the top guesses match
-        top_answers = [g.get("answer", "").lower() for g in buzz_result.get("guesses", [])]
-        correct = gold_label.lower() in [a.lower() for a in top_answers]
-        final_answer = top_answers[0] if top_answers else "No answer"
-    else:
-        # For regular mode
-        final_answer = buzz_result["answer"]
-        correct = final_answer.lower() == gold_label.lower()
-    result_text = f"BUZZED at position {buzz_position + 1} with answer: {final_answer}\n"
-    result_text += f"Correct answer: {gold_label}\n"
-    result_text += f"Result: {'CORRECT' if correct else 'INCORRECT'}"
-    return result_text, final_answer, correct

 import pandas as pd
+def evaluate_prediction(prediction: str, clean_answers: list[str] | str) -> int:
+    """Evaluate the buzz of a prediction against the clean answers."""
+    if isinstance(clean_answers, str):
+        print("clean_answers is a string")
+        clean_answers = [clean_answers]
+    pred = prediction.lower().strip()
+    if not pred:
+        return 0
+    for answer in clean_answers:
+        answer = answer.strip().lower()
+        if answer and answer in pred:
+            print(f"Found {answer} in {pred}")
+            return 1
+    return 0
 def _create_confidence_plot_data(results: List[Dict], top_k_mode: bool = False) -> pd.DataFrame:
     """Create a DataFrame for the confidence plot."""
     if not top_k_mode:
                 }
             )
     return pd.DataFrame(df_rows)

src/display/custom_css.py CHANGED Viewed

@@ -420,7 +420,7 @@ css_tossup = """
 .token.buzz-1 {
     border-color: #228b22; /* Darker and slightly muted green */
 }
-.token-container {
     line-height: 1.7;
     padding: 5px;
     margin-left: 4px;
@@ -429,4 +429,116 @@ css_tossup = """
     border-radius: 8px;
     margin-bottom: 10px;
 }
 """

 .token.buzz-1 {
     border-color: #228b22; /* Darker and slightly muted green */
 }
+.tossup-question {
     line-height: 1.7;
     padding: 5px;
     margin-left: 4px;
     border-radius: 8px;
     margin-bottom: 10px;
 }
+/* Tooltip styles */
+.tooltip {
+    display: none;
+    position: fixed;  /* Changed to fixed for better positioning */
+    padding: 12px 16px;
+    border-radius: 8px;
+    font-size: 13px;
+    white-space: normal;
+    z-index: 1000;
+    box-shadow: 0 4px 12px rgba(0, 0, 0, 0.15);
+    min-width: 300px;
+    max-width: 400px;
+    backdrop-filter: blur(4px);
+    border: 1px solid rgba(255, 255, 255, 0.2);
+}
+.tooltip-content {
+    color: #2c3e50;  /* Darker text for better readability */
+}
+.tooltip-content div {
+    margin: 4px 0;
+    line-height: 1.4;
+}
+/* When hovering over a token, show its tooltip */
+.token:hover .tooltip {
+    display: block;
+}
+/* Add a small arrow to the tooltip */
+.tooltip::after {
+    content: '';
+    position: absolute;
+    bottom: -8px;
+    left: 50%;
+    transform: translateX(-50%);
+    border-left: 8px solid transparent;
+    border-right: 8px solid transparent;
+    border-top: 8px solid currentColor;
+}
+"""
+css_bonus = """
+.qid-selector {
+    box-shadow: 0 0 0 0 !important;
+}
+.qid-selector input {
+    border-radius: 12px !important;
+}
+.bonus-header-row {
+    align-items: flex-end;
+}
+.bonus-card {
+    background-color: var(--card-bg-color);
+    border-radius: 12px;
+    padding: 12px;
+    margin: 0px 0px;
+    box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
+}
+.bonus-leadin {
+    font-size: 14px;
+    font-weight: 500;
+    margin-bottom: 12px;
+    line-height: 1.5;
+}
+.bonus-part {
+    background-color: var(--answer-bg-color);
+    border-radius: 8px;
+    padding: 12px;
+    margin: 8px 0;
+}
+.bonus-part-number {
+    font-weight: 600;
+    color: #666;
+    margin-bottom: 4px;
+}
+.bonus-part-text {
+    margin-bottom: 8px;
+    line-height: 1.5;
+}
+.bonus-answer {
+    background-color: #fff5f5;
+    border-radius: 6px;
+    padding: 8px 12px;
+    margin-top: 8px;
+    font-size: 14px;
+    border-left: 3px solid #ff6b6b;
+}
+.bonus-answer-label {
+    font-weight: 500;
+    color: #666;
+    margin-bottom: 4px;
+}
+.bonus-answer-text {
+    color: #333;
+}
+.bonus-container {
+    max-width: 800px;
+    margin: 0 auto;
+    padding-left: 8px;
+    padding-right: 8px;
+}
 """