# %%
import json
import logging
import re
from collections import Counter
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
def _make_answer_html(answer: str, clean_answers: list[str] = []) -> str:
clean_answers = [a for a in clean_answers if len(a.split()) <= 6 and a != answer]
additional_answers_html = ""
if clean_answers:
additional_answers_html = f" [or {', '.join(clean_answers)}]"
return f"""
Answer:
{answer}
{additional_answers_html}
"""
def _get_token_classes(confidence, buzz, score) -> str:
if confidence is None:
return "token"
elif not buzz:
return f"token guess-point buzz-{score}"
else:
return f"token guess-point buzz-{score}"
def _create_token_tooltip_html(values) -> str:
if not values:
return ""
confidence = values.get("confidence", 0)
buzz = values.get("buzz", 0)
score = values.get("score", 0)
answer = values.get("answer", "")
answer_tokens = answer.split()
if len(answer_tokens) > 10:
k = len(answer_tokens) - 10
answer = " ".join(answer_tokens[:10]) + f"...[{k} more words]"
color = "#a3c9a3" if score else "#ebbec4" # Light green for correct, light pink for incorrect
if values.get("logprob", None) is not None:
prob = np.exp(values["logprob"])
prob_str = f" 📈 Output Probability: {prob:.3f}
"
else:
prob_str = ""
return f"""
"""
def create_token_html(token: str, values: dict, i: int) -> str:
    """Render a single question token, followed by its tooltip (if any).

    Args:
        token: Token text.
        values: Evaluation data for this token; ``{}`` when the token is not
            an evaluation point.
        i: Token index supplied by the caller (currently unused, see below).

    Returns:
        The display text for the token followed by the tooltip fragment.
        Tokens that start with a word character get a trailing 🚨 when the
        model buzzed here, or 💭 when there is evaluation data without a buzz.
        Tokens that do not start with a word character are emitted without a
        marker and with their spaces made non-breaking.
    """
    confidence = values.get("confidence", None)
    buzz = values.get("buzz", 0)
    score = values.get("score", 0)

    if not re.match(r"\w+", token):
        # Ordinary spaces collapse when HTML is rendered. The previous
        # space-for-space replacement was a no-op, so emit the non-breaking
        # space entity instead.
        display_token = token.replace(" ", "&nbsp;")
    elif buzz:
        display_token = f"{token} 🚨"
    elif values:
        display_token = f"{token} 💭"
    else:
        display_token = token

    # NOTE(review): ``css_class`` and ``i`` are computed/accepted but never
    # reach the returned string; the wrapping element that would carry them
    # appears to be missing. Confirm against the front-end before relying on
    # this output.
    css_class = _get_token_classes(confidence, buzz, score)

    # Empty string when there are no values for this token.
    tooltip_html = _create_token_tooltip_html(values)
    return f"{display_token}{tooltip_html}"
def create_tossup_html(
tokens: list[str],
answer_primary: str,
clean_answers: list[str],
marker_indices: list[int] = [],
eval_points: list[tuple[int, dict]] = [],
) -> str:
"""Create HTML for tokens with hover capability and a colored header for the answer."""
try:
ep = dict(eval_points)
marker_indices = set(marker_indices)
html_tokens = []
for i, token in enumerate(tokens):
token_html = create_token_html(token, ep.get(i, {}), i + 1)
html_tokens.append(token_html)
answer_html = _make_answer_html(answer_primary, clean_answers)
return f"""
{"".join(html_tokens)}
{answer_html}
"""
except Exception as e:
logging.error(f"Error creating token HTML: {e}", exc_info=True)
return f"Error creating tokens: {str(e)}
"
def create_bonus_html(leadin: str, parts: list[dict]) -> str:
    """Render a bonus question: the lead-in followed by each numbered part.

    Args:
        leadin: Lead-in text shown before the parts.
        parts: One dict per part; the keys read are ``part`` (question text),
            ``answer_primary`` and ``clean_answers``.

    Returns:
        The rendered fragment: lead-in, then for each part its ``#<n>.``
        question line and answer block.
    """
    leadin_html = f"{leadin}\n"

    parts_html = []
    for number, part in enumerate(parts, start=1):
        question_text = part["part"]
        answer_html = _make_answer_html(part["answer_primary"], part["clean_answers"])
        parts_html.append(f"\n#{number}. {question_text}\n{answer_html}\n")

    # A stray no-op string statement and a loop that built an unused
    # ``clean_answers`` list were removed; neither affected the output.
    joined_parts = "".join(parts_html)
    return f"\n{leadin_html}\n{joined_parts}\n"
def create_tossup_confidence_pyplot(
    tokens: list[str],
    eval_points: list[tuple[int, dict]],
    confidence_threshold: float = 0.5,
    prob_threshold: float | None = None,
) -> plt.Figure:
    """Plot confidence and output probability at every evaluation point.

    Args:
        tokens: Question tokens; used to label buzz points.
        eval_points: ``(token_index, values)`` pairs. ``values`` must contain
            ``confidence``, ``buzz`` and ``score``; ``logprob`` is optional
            and may be ``None``.
        confidence_threshold: y-position of the dashed confidence line.
        prob_threshold: y-position of the dashed probability line; omitted
            when ``None``.

    Returns:
        The matplotlib figure.
    """
    plt.style.use("ggplot")
    fig = plt.figure(figsize=(10, 4), dpi=300)
    ax = fig.add_subplot(111)

    # Both series start at the origin; evaluation point ``i`` is drawn at
    # x = i + 1.
    x = [0] + [int(i + 1) for i, _ in eval_points]
    y_conf = [0] + [v["confidence"] for _, v in eval_points]

    # Missing log-probabilities become NaN so that the probability series
    # keeps the same length as ``x`` (matplotlib leaves NaN points out).
    # Dropping them instead made ``ax.plot`` fail with a shape mismatch.
    logprobs = [v.get("logprob") for _, v in eval_points]
    y_prob = [0] + [np.nan if lp is None else np.exp(lp) for lp in logprobs]

    ax.plot(x, y_prob, "o-", color="#f2b150", label="Probability")
    ax.plot(x, y_conf, "o-", color="#4996de", label="Confidence")

    # Ring every buzz point (green = correct, red = incorrect) and label it
    # with the token it falls on.
    for i, v in eval_points:
        if not v["buzz"]:
            continue
        color = "green" if v["score"] else "red"
        conf = v["confidence"]
        ax.plot(i + 1, conf, "o", color=color, markerfacecolor="none", markersize=12, markeredgewidth=2.5)
        logprob = v.get("logprob")
        if logprob is not None:
            prob = np.exp(logprob)
            ax.plot(i + 1, prob, "o", color=color, markerfacecolor="none", markersize=12, markeredgewidth=2.5)
        if i >= len(tokens):
            # Previously this was reported and then indexed anyway, raising
            # IndexError; skip the label instead.
            logging.warning("Token index %s is out of bounds for n_tokens: %s", i, len(tokens))
            continue
        ax.annotate(f"{tokens[i]}", (i + 1, conf), textcoords="offset points", xytext=(0, 10), ha="center")

    # Horizontal dashed line for the confidence threshold
    ax.axhline(y=confidence_threshold, color="#9370DB", linestyle="--", xmin=0, xmax=1, label="Confidence Threshold")
    # Horizontal dashed line for the probability threshold, if provided
    if prob_threshold is not None:
        ax.axhline(y=prob_threshold, color="#cf5757", linestyle="--", xmin=0, xmax=1, label="Probability Threshold")

    ax.set_title("Buzz Confidence")
    ax.set_xlabel("Token Index")
    ax.set_ylabel("Confidence")
    ax.set_xticks(x)
    ax.set_xticklabels(x)
    ax.legend()
    return fig
def create_scatter_pyplot(token_positions: list[int], scores: list[int]) -> plt.Figure:
    """Scatter token positions against scores, sizing markers by frequency.

    Each distinct ``(position, score)`` pair is drawn once; its marker area
    is 20 times the number of occurrences of that pair.
    """
    plt.style.use("ggplot")
    figure = plt.figure(figsize=(11, 5))
    axes = figure.add_subplot(111)

    pair_counts = Counter(zip(token_positions, scores))
    xs = [position for position, _ in pair_counts]
    ys = [score for _, score in pair_counts]
    sizes = [occurrences * 20 for occurrences in pair_counts.values()]

    axes.scatter(xs, ys, color="#4698cf", s=sizes)
    return figure
def create_bonus_confidence_plot(parts: list[dict], model_outputs: list[dict]) -> plt.Figure:
    """Draw one confidence bar per bonus part, coloured by correctness.

    ``parts`` only determines how many bars there are; bar heights and
    colours come from the ``confidence`` and ``score`` entries of
    ``model_outputs``. A score of 1 gives a green bar, anything else red.
    """
    plt.style.use("ggplot")
    figure = plt.figure(figsize=(10, 6))
    axes = figure.add_subplot(111)

    part_numbers = range(1, len(parts) + 1)
    bar_heights = [output["confidence"] for output in model_outputs]
    part_scores = [output["score"] for output in model_outputs]

    # Bars are created with a neutral colour and then recoloured per score.
    bars = axes.bar(part_numbers, bar_heights, color="#4698cf")
    for bar, part_score in zip(bars, part_scores):
        if part_score == 1:
            bar.set_color("green")
        else:
            bar.set_color("red")

    axes.set_title("Part Confidence")
    axes.set_xlabel("Part Number")
    axes.set_ylabel("Confidence")
    axes.set_xticks(part_numbers)
    axes.set_xticklabels([f"Part {number}" for number in part_numbers])
    return figure
def update_tossup_plot(highlighted_index: int, state: str) -> "plt.Figure | pd.DataFrame":
    """Rebuild the confidence plot when a token is hovered.

    Args:
        highlighted_index: Index of the hovered token, or ``None`` / ``""``
            when nothing is highlighted.
        state: JSON string with ``tokens`` and ``values`` entries.

    Returns:
        The confidence figure with a vertical line at ``highlighted_index``
        (when given). An empty ``DataFrame`` is returned when the state is
        empty or incomplete, or when any error occurs (the error is logged).
    """
    try:
        if not state or state == "{}":
            logging.warning("Empty state provided to update_plot")
            return pd.DataFrame()

        # Test for "missing" explicitly: a plain truthiness check would
        # discard index 0.
        highlighted_index = None if highlighted_index in (None, "") else int(highlighted_index)
        logging.info(f"Update plot triggered with token index: {highlighted_index}")

        data = json.loads(state)
        tokens = data.get("tokens", [])
        values = data.get("values", [])
        if not tokens or not values:
            logging.warning("No tokens or values found in state")
            return pd.DataFrame()

        # The index used to be passed as the third positional argument, where
        # it was interpreted as ``confidence_threshold``.
        fig = create_tossup_confidence_pyplot(tokens, values)
        if highlighted_index is not None:
            # NOTE(review): assumes ``highlighted_index`` is already on the
            # plot's x scale (evaluation point i is drawn at x = i + 1).
            # Confirm against the caller that supplies the index.
            fig.axes[0].axvline(x=highlighted_index, color="gray", linestyle=":", linewidth=1.5)
        return fig
    except Exception as e:
        logging.error(f"Error updating plot: {e}")
        return pd.DataFrame()
# %%
def create_df_entry(run_indices: list[int], run_outputs: list[dict]) -> dict:
    """Summarise one question's model outputs as a single record.

    Args:
        run_indices: Token indices at which the model was probed.
        run_outputs: One dict per probe with a 1-based ``position`` into
            ``run_indices`` plus ``buzz`` and ``score``.

    Returns:
        A dict with:
        - ``chosen_idx``: index (``run_indices`` value + 1) of the first
          buzz, or ``None`` if the model never buzzed.
        - ``earliest_ok_idx``: index (``run_indices`` value + 1) of the first
          correct output, or ``None`` if there was none.
        - ``gap``: ``chosen_idx - earliest_ok_idx``, or ``None`` if either is
          missing.
        - ``cls``: one of ``hopeless``, ``never-buzzed``, ``best-buzz``,
          ``late-buzz``, ``premature``.
        - ``tossup_score``: 0 without a buzz; 5/0 (correct/incorrect) for a
          buzz at the final probe; 10/-5 for an earlier buzz.
        - ``is_correct``: 1 if the first buzz was correct, else 0.
    """
    chosen_idx = None
    earliest_ok_idx = None
    is_correct = False

    for output in run_outputs:
        if chosen_idx is None and output["buzz"]:
            chosen_idx = run_indices[output["position"] - 1] + 1
            is_correct = output["score"]
        if earliest_ok_idx is None and output["score"]:
            earliest_ok_idx = run_indices[output["position"] - 1] + 1

    # "Never buzzed" is represented by ``None``. The previous ``== -1`` test
    # could never match, so unbuzzed questions fell through to the -5 penalty.
    if chosen_idx is None:
        tossup_score = 0
    elif chosen_idx == run_indices[-1] + 1:
        # Buzz at the final probe: correct scores 5, incorrect scores 0.
        tossup_score = 5 if is_correct else 0
    else:
        # Earlier buzz: correct scores 10, incorrect scores -5.
        tossup_score = 10 if is_correct else -5

    if chosen_idx is None or earliest_ok_idx is None:
        gap = None
    else:
        gap = chosen_idx - earliest_ok_idx

    if earliest_ok_idx is None:
        cls = "hopeless"
    elif chosen_idx is None:
        cls = "never-buzzed"  # Opportunity missed to score
    elif chosen_idx == earliest_ok_idx:
        cls = "best-buzz"  # Perfect timing
    elif chosen_idx > earliest_ok_idx:
        cls = "late-buzz"  # Opportunity missed to buzz earlier
    else:
        cls = "premature"  # Opportunity missed to score

    return {
        "chosen_idx": chosen_idx,
        "earliest_ok_idx": earliest_ok_idx,
        "gap": gap,
        "cls": cls,
        "tossup_score": tossup_score,
        "is_correct": int(is_correct),
    }
def prepare_tossup_results_df(run_indices: list[list[int]], model_outputs: list[list[dict]]) -> pd.DataFrame:
    """Build a results dataframe with one row per question.

    ``run_indices`` and ``model_outputs`` are paired up element by element
    and each pair is summarised by ``create_df_entry``.
    """
    rows = [
        create_df_entry(question_indices, question_outputs)
        for question_indices, question_outputs in zip(run_indices, model_outputs)
    ]
    return pd.DataFrame.from_records(rows)
def create_tossup_eval_table(df: pd.DataFrame) -> pd.DataFrame:
    """Aggregate per-question results into a one-row table of formatted metrics.

    The row contains, as strings:
    - Tossup Score (10): sum of ``tossup_score`` divided by the number of rows
    - Buzz Accuracy: mean of ``is_correct`` as a percentage
    - Buzz Position: mean of the non-null ``chosen_idx`` values
    - +ve Gap: mean of the non-null gaps that are >= 0
    - -ve Gap: mean of the non-null gaps that are < 0
    """
    buzz_positions = df["chosen_idx"].dropna()
    known_gaps = df["gap"].dropna()
    non_negative_gaps = known_gaps.loc[known_gaps >= 0]
    negative_gaps = known_gaps.loc[known_gaps < 0]

    score_per_question = df["tossup_score"].sum() / len(df)
    buzz_accuracy = df["is_correct"].mean()
    mean_position = np.mean(buzz_positions)

    summary_row = {
        "Tossup Score (10)": f"{score_per_question:5.1f}",
        "Buzz Accuracy": f"{buzz_accuracy:5.1%}",
        "Buzz Position": f"{mean_position:5.1f}",
        "+ve Gap": f"{non_negative_gaps.mean():5.1f}",
        "-ve Gap": f"{negative_gaps.mean():5.1f}",
    }
    return pd.DataFrame([summary_row])
def create_tossup_eval_dashboard(run_indices: list[list[int]], df: pd.DataFrame, *, figsize=(15, 8), title_prefix=""):
    """
    Visualise buzzing behaviour with three sub-plots and a description panel:

    1. Ceiling-accuracy vs. prefix length
    2. Scatter of earliest-correct idx vs. chosen-buzz idx
    3. Frequency distribution of narrative classes (horizontal bars)

    Parameters
    ----------
    run_indices : list[list[int]]
        Per-question token positions at which the model was probed. The
        sorted union of all positions is the x-axis of the ceiling curve.
    df : pd.DataFrame
        One row per question; must contain the columns earliest_ok_idx,
        chosen_idx and cls (the records built by `create_df_entry` /
        `prepare_tossup_results_df` have these).
    figsize : tuple, optional
        Figure size passed to `plt.figure`.
    title_prefix : str, optional
        Prepended to each subplot title (useful when comparing models).
        The figure-level title is fixed.

    Returns
    -------
    The matplotlib figure.
    """
    # ------------------------------------------------------------------
    # 0. Prep (variables reused throughout the function)
    # ------------------------------------------------------------------
    # Collect all evaluation indices across questions so we know the
    # x-axis domain and the padding for NaNs.
    eval_indices = np.asarray(sorted({idx for indices in run_indices for idx in indices}))
    # Narrative classes and their colours
    classes = [
        "best-buzz",
        "late-buzz",
        "never-buzzed",
        "premature",
        "hopeless",
    ]
    colors = ["tab:green", "tab:olive", "tab:orange", "tab:red", "tab:gray"]
    palette = dict(zip(classes, colors))
    # NOTE(review): `.max()` raises on an empty array, i.e. when `run_indices`
    # contains no positions at all.
    max_idx = eval_indices.max() * 1.25  # padding for NaN replacement / axis limits
    # ------------------------------------------------------------------
    # 1. Figure / axes layout
    # ------------------------------------------------------------------
    # GridSpec layout → 2 rows × 3 cols.
    # ┌────────────┬────────────┬────────┐
    # │ Ceiling    │ Scatter    │ Bars   │ (row 0)
    # ├────────────┴────────────┴────────┤
    # │ Descriptions (spans all 3 cols)  │ (row 1)
    # └──────────────────────────────────┘
    # Having a dedicated row for the narrative-class descriptions avoids
    # overlapping with sub-plots and makes the whole figure more compact.
    plt.style.use("ggplot")
    fig = plt.figure(figsize=figsize)
    gs = fig.add_gridspec(
        nrows=2,
        ncols=3,
        height_ratios=[5, 1],  # extra space for plots vs. descriptions
        width_ratios=[2.2, 2.2, 1],
        hspace=0.2,  # reduced vertical spacing between plots
        wspace=0.2,  # reduced horizontal spacing between plots
        left=0.05,  # reduced left margin
        right=0.95,  # reduced right margin
        top=0.9,  # reduced top margin
        bottom=0.05,  # reduced bottom margin
    )
    ax_ceiling = fig.add_subplot(gs[0, 0])  # Ceiling accuracy curve
    ax_scatter = fig.add_subplot(gs[0, 1])  # Earliest vs. chosen scatter
    ax_bars = fig.add_subplot(gs[0, 2])  # Outcome distribution bars
    ax_desc = fig.add_subplot(gs[1, :])  # Textual descriptions
    ax_desc.axis("off")
    fig.suptitle("Buzzing behaviour", fontsize=16, fontweight="bold")
    # ------------------------------------------------------------------
    # 2. Ceiling accuracy curve
    # ------------------------------------------------------------------
    # For each probed index: the fraction of questions whose earliest correct
    # index exists and is at or before that index.
    ceiling = [((df["earliest_ok_idx"].notna()) & (df["earliest_ok_idx"] <= idx)).mean() for idx in eval_indices]
    ax_ceiling.plot(eval_indices, ceiling, marker="o", color="#4698cf")
    ax_ceiling.set_xlabel("Token index shown")
    ax_ceiling.set_ylabel("Proportion of questions correct")
    ax_ceiling.set_ylim(0, 1.01)
    ax_ceiling.set_title(f"{title_prefix}Ceiling accuracy vs. prefix")
    # ------------------------------------------------------------------
    # 3. Earliest-vs-Chosen scatter
    # ------------------------------------------------------------------
    for cls in classes:
        sub = df[df["cls"] == cls]
        if sub.empty:
            continue
        # Missing indices (no correct answer / no buzz) are placed at max_idx,
        # i.e. on the upper/right edge of the axes.
        x = sub["earliest_ok_idx"].fillna(max_idx)
        y = sub["chosen_idx"].fillna(max_idx)
        # NOTE(review): both `c` and `facecolor="none"` are passed; confirm
        # which fill is intended, since they ask for different marker faces.
        ax_scatter.scatter(
            x,
            y,
            label=cls,
            alpha=0.7,
            edgecolor="black",
            linewidth=1,
            marker="o",
            s=90,
            c=palette[cls],
            facecolor="none",
        )
    lim = max_idx
    # Diagonal reference line: points on it buzzed exactly at the earliest
    # correct index.
    ax_scatter.plot([0, lim], [0, lim], linestyle=":", linewidth=1)
    ax_scatter.set_xlim(0, lim)
    ax_scatter.set_ylim(0, lim)
    ax_scatter.set_xlabel("Earliest index with correct answer")
    ax_scatter.set_ylabel("Chosen buzz index")
    ax_scatter.set_title(f"{title_prefix}Earliest vs. chosen index")
    ax_scatter.legend(frameon=False, fontsize="small")
    # ------------------------------------------------------------------
    # 4. Outcome distribution (horizontal bars)
    # ------------------------------------------------------------------
    # Reindex so every class gets a bar (count 0 when absent), in the fixed
    # order of `classes`.
    counts = df["cls"].value_counts().reindex(classes).fillna(0)
    ax_bars.barh(
        counts.index,
        counts.values,
        color=[palette[c] for c in counts.index],
        alpha=0.7,
        edgecolor="black",
        linewidth=1,
    )
    ax_bars.set_xlabel("Number of questions")
    ax_bars.set_title(f"{title_prefix}Outcome distribution")
    # Ensure x-axis shows integer ticks only
    from matplotlib.ticker import MaxNLocator
    ax_bars.xaxis.set_major_locator(MaxNLocator(integer=True))
    # ------------------------------------------------------------------
    # 5. Narrative-class descriptions (bottom panel)
    # ------------------------------------------------------------------
    descriptions = {
        "best-buzz": "Perfect timing. Buzzed at the earliest possible correct position",
        "late-buzz": "Missed opportunity. Buzzed correctly but later than optimal",
        "never-buzzed": "Missed opportunity. Never buzzed despite knowing the answer",
        "premature": "Incorrect buzz. Buzzing at a later position could have been correct",
        "hopeless": "Never knew the answer. No correct answer at any position",
    }
    y_pos = 1.0  # start at top of the description axis
    for cls, color in zip(classes, colors):
        ax_desc.text(
            0.01,
            y_pos,
            f"■ {cls}: {descriptions[cls]}",
            ha="left",
            va="top",
            color=color,
            fontweight="bold",
            fontsize=11,
            transform=ax_desc.transAxes,
        )
        y_pos -= 0.25  # vertical step between description lines, in axes coordinates
    # ------------------------------------------------------------------
    # 6. Return the final figure
    # ------------------------------------------------------------------
    return fig
# %%
# Create dummy data for testing
def create_dummy_model_outputs(n_entries=10, n_positions=5):
    """Generate reproducible fake model outputs for testing.

    Reseeds NumPy's global random generator with 42 on every call, so
    repeated calls with the same arguments return the same data.

    Returns a list of ``n_entries`` dicts, each with ``run_indices``
    (``n_positions`` sorted, distinct values drawn from 10..49) and
    ``outputs`` (one record per position with ``position``, ``buzz``,
    ``score``, ``confidence``, ``logprob`` and ``answer``).
    """
    np.random.seed(42)
    entries = []
    for _ in range(n_entries):
        probe_indices = sorted(np.random.choice(range(10, 50), n_positions, replace=False))
        records = []
        for position in range(1, n_positions + 1):
            # The four draws below must stay in this order to reproduce the
            # same random stream: buzz, correctness, confidence, logprob.
            buzzes = np.random.random() > 0.7
            correct = np.random.random() > 0.4
            confidence = np.random.random()
            logprob = np.log(np.random.random())
            records.append(
                {
                    "position": position,
                    "buzz": buzzes,
                    "score": 1 if correct else 0,
                    "confidence": confidence,
                    "logprob": logprob,
                    "answer": f"Answer {position}",
                }
            )
        entries.append({"run_indices": probe_indices, "outputs": records})
    return entries
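# Example usage (left commented out so that importing this module has no side effects):
# dummy_data = create_dummy_model_outputs()
# dummy_df = pd.DataFrame([create_df_entry(entry["run_indices"], entry["outputs"]) for entry in dummy_data])
# dummy_df
# create_tossup_eval_dashboard([entry["run_indices"] for entry in dummy_data], dummy_df)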
# %%