Spaces:

qanta-challenge
/

quizbowl-submission

Running

quizbowl-submission / src /components /quizbowl /plotting.py

Maharshi Gor

Updated UI for bonus questions and playground dataset reference

ee5d50c about 2 months ago

21.3 kB

	# %%
	import json
	import logging
	import re
	from collections import Counter

	import matplotlib.pyplot as plt
	import numpy as np
	import pandas as pd


	def _make_answer_html(answer: str, clean_answers: list[str] = []) -> str:
	clean_answers = [a for a in clean_answers if len(a.split()) <= 6 and a != answer]
	additional_answers_html = ""
	if clean_answers:
	additional_answers_html = f"<span class='bonus-answer-text'> [or {', '.join(clean_answers)}]</span>"
	return f"""
	<div class='bonus-answer'>
	<span class='bonus-answer-label'>Answer: </span>
	<span class='bonus-answer-text'>{answer}</span>
	{additional_answers_html}
	</div>
	"""


	def _make_answer_line_html(answer_line: str) -> str:
	return f"""
	<div class='bonus-answer'>
	<span class='bonus-answer-label'>Answer: </span>
	<span class='bonus-answer-text'>{answer_line}</span>
	</div>
	"""


	def _make_model_response_html(part_output: dict, explanation_token_limit: int = 25) -> str:
	guess = part_output.get("guess", "")
	confidence = float(part_output.get("confidence", 0.0))
	explanation = part_output.get("explanation", "")
	expl_tokens = explanation.split()
	if len(expl_tokens) > explanation_token_limit:
	k = len(expl_tokens) - explanation_token_limit
	explanation = " ".join(expl_tokens[:explanation_token_limit]) + f"...[{k} more words]"

	correct = part_output.get("correct", 0)
	emoji = "✅" if correct else "❌"
	answer_class = "correct-answer" if correct else "incorrect-answer"

	return f"""
	<div class='bonus-answer {answer_class}'>
	<div class="bonus-answer-row" style="margin-bottom: 4px;">
	<span class='bonus-answer-label' style='font-size: 1.2em;'>🤖 Guess: </span>
	<span class='bonus-model-guess'>{guess} {emoji}</span>
	<span class='confidence-badge' style='float: right'>⚡️ Confidence: {confidence:.2f}</span>
	</div>
	<div class='bonus-explanation'>
	<span class='bonus-answer-label'>💬 Explanation:</span>
	<span class='bonus-explanation-text' style='font-style: italic;'>{explanation}</span>
	</div>
	</div>
	"""


	def _get_token_classes(confidence, buzz, score) -> str:
	if confidence is None:
	return "token"
	elif not buzz:
	return f"token guess-point buzz-{score}"
	else:
	return f"token guess-point buzz-{score}"


	def _create_token_tooltip_html(values) -> str:
	if not values:
	return ""
	confidence = values.get("confidence", 0)
	buzz = values.get("buzz", 0)
	correct = values.get("correct", 0)
	guess = values.get("guess", "")
	guess_tokens = guess.split()
	if len(guess_tokens) > 10:
	k = len(guess_tokens) - 10
	guess = " ".join(guess_tokens[:10]) + f"...[{k} more words]"

	color = "#a3c9a3" if correct else "#ebbec4" # Light green for correct, light pink for incorrect

	if values.get("logprob", None) is not None:
	prob = np.exp(values["logprob"])
	prob_str = f"<p style='margin: 0 0 4px; color: #000;'> 📈 <b style='color: #000;'>Output Probability:</b> {prob:.3f}</p>"
	else:
	prob_str = ""

	return f"""
	<div class="tooltip card" style="background-color: {color}; border-radius: 8px; padding: 12px; box-shadow: 2px 4px 8px rgba(0, 0, 0, 0.15);">
	<div class="tooltip-content" style="font-family: 'Arial', sans-serif; color: #000;">
	<h4 style="margin: 0 0 8px; color: #000;">💡 Answer</h4>
	<p><code style="font-weight: bold; margin: 0 0 8px; color: #000;">{guess}</code></p>
	<p style="margin: 0 0 4px; color: #000;">📈 <b style="color: #000;">Confidence:</b> {confidence:.2f}</p>
	{prob_str}
	<p style="margin: 0; color: #000;">🔍 <b style="color: #000;">Status:</b> {"✅ Correct" if correct else "❌ Incorrect" if buzz else "🚫 No Buzz"}</p>
	</div>
	</div>
	"""


	def create_token_html(token: str, values: dict, i: int) -> str:
	    """Render one question token as a ``<span>`` with an optional hover tooltip.

	    Args:
	        token: Token text to display.
	        values: Model output recorded at this token (``confidence``, ``buzz``
	            and ``correct`` are read here); an empty dict means no output.
	        i: Index written to the span's ``id`` and ``data-index`` attributes.

	    Returns:
	        The HTML for the token span, including its tooltip markup (if any).
	    """
	    confidence = values.get("confidence", None)
	    buzz = values.get("buzz", 0)
	    correct = values.get("correct", 0)

	    # Tokens carrying a model output get a marker: 🚨 for a buzz, 💭 otherwise.
	    display_token = f"{token} 🚨" if buzz else f"{token} 💭" if values else token
	    if not re.match(r"\w+", token):
	        # Tokens that do not start with a word character are shown without a
	        # marker. Their spaces are emitted as explicit non-breaking-space
	        # entities so the browser does not collapse them.
	        display_token = token.replace(" ", "&nbsp;")

	    css_class = _get_token_classes(confidence, buzz, correct)
	    # The tooltip is an empty string when there are no values for this token.
	    tooltip_html = _create_token_tooltip_html(values)

	    return f'<span id="token-{i}" class="{css_class}" data-index="{i}">{display_token}{tooltip_html}</span>'


	def create_tossup_html(
	    tokens: list[str],
	    example: dict,
	    eval_points: list[tuple[int, dict]] | None = None,
	) -> str:
	    """Create HTML for tokens with hover capability and a header for the answer.

	    Args:
	        tokens: Question tokens, rendered in order and numbered from 1.
	        example: Question record. ``answer_line`` is used when present;
	            otherwise ``answer_primary`` and ``clean_answers`` are read.
	        eval_points: Optional ``(token_index, model_output)`` pairs; indices
	            are matched against the 1-based token numbering. ``None`` (the
	            default) means no model outputs.

	    Returns:
	        The question card HTML, or a small error ``<div>`` if anything raised
	        while building it (the error is logged with its traceback).
	    """
	    try:
	        # ``None`` replaces the former mutable ``[]`` default.
	        ep = dict(eval_points or [])

	        html_tokens = [create_token_html(token, ep.get(i, {}), i) for i, token in enumerate(tokens, start=1)]

	        if "answer_line" in example:
	            answer_html = _make_answer_line_html(example["answer_line"])
	        else:
	            answer_html = _make_answer_html(example["answer_primary"], example["clean_answers"])
	        return f"""
	<div class='bonus-container'>
	<div class='bonus-card'>
	<div class='tossup-question'>
	{"".join(html_tokens)}
	</div>
	{answer_html}
	</div>
	</div>
	"""
	    except Exception as e:
	        # UI boundary: report the failure in place of the card rather than
	        # letting the exception propagate.
	        logging.error(f"Error creating token HTML: {e}", exc_info=True)
	        return f"<div class='token-container'>Error creating tokens: {str(e)}</div>"


	def create_bonus_html(example: dict, part_outputs: list[dict] | None = None) -> str:
	    """Create HTML for a bonus question: the lead-in followed by each part.

	    Args:
	        example: Bonus record with a ``leadin`` string and a ``parts`` list.
	            Each part provides ``question`` and either ``answer_line`` or
	            ``answer_primary`` plus ``clean_answers``.
	        part_outputs: Optional model outputs, indexed in parallel with
	            ``example["parts"]``. When ``None``, no model response is shown.

	    Returns:
	        The bonus card HTML.
	    """
	    leadin_html = f"<div class='bonus-leadin'>{example['leadin']}</div>"
	    parts_html = []

	    for i, part in enumerate(example["parts"]):
	        question_text = part["question"]
	        if "answer_line" in part:
	            answer_html = _make_answer_line_html(part["answer_line"])
	        else:
	            answer_html = _make_answer_html(part["answer_primary"], part["clean_answers"])

	        model_response_html = ""
	        if part_outputs is not None:
	            model_response_html = _make_model_response_html(part_outputs[i])

	        # Parts are numbered from 1 in the rendered text.
	        part_html = f"""
	<div class='bonus-part'>
	<div class='bonus-part-text'><b>#{i + 1}.</b> {question_text}</div>
	{answer_html}
	{model_response_html}
	</div>
	"""
	        parts_html.append(part_html)

	    html_content = f"""
	<div class='bonus-container'>
	<div class='bonus-card'>
	{leadin_html}
	{"".join(parts_html)}
	</div>
	</div>
	"""

	    return html_content


	def create_tossup_confidence_pyplot(
	    tokens: list[str],
	    run_outputs: list[dict],
	    confidence_threshold: float = 0.5,
	    prob_threshold: float | None = None,
	) -> plt.Figure:
	    """Plot model confidence (and optionally output probability) per token position.

	    Args:
	        tokens: Question tokens; used to label buzz points (1-indexed lookup).
	        run_outputs: Model outputs; each dict provides ``token_position``,
	            ``confidence``, ``buzz`` and ``correct``, and may provide ``logprob``.
	        confidence_threshold: Height of the dashed confidence-threshold line.
	        prob_threshold: When given, the probability curve and a dashed
	            probability-threshold line are drawn as well.

	    Returns:
	        The matplotlib figure.
	    """
	    plt.style.use("ggplot")
	    fig = plt.figure(figsize=(10, 4), dpi=300)
	    ax = fig.add_subplot(111)

	    # Both curves start from an artificial (0, 0) origin point.
	    x = [0] + [o["token_position"] for o in run_outputs]
	    y_conf = [0] + [o["confidence"] for o in run_outputs]

	    if prob_threshold is not None:
	        # Keep one y value per x value: a missing logprob becomes NaN (a gap in
	        # the line) instead of being dropped, which made x and y differ in length.
	        y_prob = [0] + [np.exp(o["logprob"]) if o.get("logprob") is not None else np.nan for o in run_outputs]
	        ax.plot(x, y_prob, "o-", color="#f2b150", label="Probability")
	    ax.plot(x, y_conf, "o-", color="#4996de", label="Confidence")

	    # Ring every buzz point: green when the guess was correct, red otherwise.
	    for o in run_outputs:
	        if not o["buzz"]:
	            continue
	        color = "green" if o["correct"] else "red"
	        conf = o["confidence"]
	        i = o["token_position"]
	        ax.plot(i, conf, "o", color=color, markerfacecolor="none", markersize=12, markeredgewidth=2.5)
	        if o.get("logprob") is not None and prob_threshold is not None:
	            prob = np.exp(o["logprob"])
	            if prob > prob_threshold:
	                ax.plot(i, prob, "o", color=color, markerfacecolor="none", markersize=12, markeredgewidth=2.5)
	        if 1 <= i <= len(tokens):
	            ax.annotate(f"{tokens[i - 1]}", (i, conf), textcoords="offset points", xytext=(0, 10), ha="center")
	        else:
	            # Skip the label rather than indexing past the token list.
	            logging.warning(f"1-indexed token index {i} is out of bounds for n_tokens: {len(tokens)}")

	    # Horizontal dashed line for the confidence threshold.
	    ax.axhline(y=confidence_threshold, color="#9370DB", linestyle="--", xmin=0, xmax=1, label="Confidence Threshold")
	    # Horizontal dashed line for the probability threshold, if provided.
	    if prob_threshold is not None:
	        ax.axhline(y=prob_threshold, color="#cf5757", linestyle="--", xmin=0, xmax=1, label="Probability Threshold")

	    ax.set_title("Buzz Confidence")
	    ax.set_xlabel("Token Index")
	    ax.set_ylabel("Confidence")
	    ax.set_xticks(x)
	    ax.set_xticklabels(x)
	    ax.legend()
	    return fig


	def create_scatter_pyplot(token_positions: list[int], scores: list[int]) -> plt.Figure:
	    """Scatter token positions against scores, sizing markers by frequency.

	    Each distinct ``(position, score)`` pair is drawn once; its marker area is
	    20 times the number of occurrences of that pair.
	    """
	    plt.style.use("ggplot")
	    fig = plt.figure(figsize=(11, 5))
	    ax = fig.add_subplot(111)

	    pair_counts = Counter(zip(token_positions, scores))
	    xs = [pos for pos, _ in pair_counts]
	    ys = [score for _, score in pair_counts]
	    sizes = [occurrences * 20 for occurrences in pair_counts.values()]

	    ax.scatter(xs, ys, color="#4698cf", s=sizes)

	    return fig


	def create_bonus_confidence_plot(parts: list[dict], model_outputs: list[dict]) -> plt.Figure:
	    """Draw one confidence bar per bonus part, coloured by correctness.

	    Args:
	        parts: Bonus parts; only their count is used (for the x positions).
	        model_outputs: Per-part model outputs providing ``confidence`` and
	            ``correct``.

	    Returns:
	        The matplotlib figure.
	    """
	    plt.style.use("ggplot")
	    fig = plt.figure(figsize=(10, 6))
	    ax = fig.add_subplot(111)

	    part_numbers = range(1, len(parts) + 1)
	    confidences = [out["confidence"] for out in model_outputs]
	    scores = [out["correct"] for out in model_outputs]

	    bars = ax.bar(part_numbers, confidences, color="#4698cf")

	    # Recolour each bar: green when the part was answered correctly, else red.
	    for bar, score in zip(bars, scores):
	        bar.set_color("green" if score == 1 else "red")

	    ax.set_title("Part Confidence")
	    ax.set_xlabel("Part Number")
	    ax.set_ylabel("Confidence")
	    ax.set_xticks(part_numbers)
	    ax.set_xticklabels([f"Part {number}" for number in part_numbers])

	    return fig


	def update_tossup_plot(highlighted_index: int, state: str) -> plt.Figure | pd.DataFrame:
	    """Update the plot when a token is hovered; add a vertical line on the plot.

	    Args:
	        highlighted_index: Index of the hovered token; a falsy value means no
	            token is highlighted.
	        state: JSON string with ``tokens`` and ``values`` entries.

	    Returns:
	        The confidence figure, with a vertical marker at ``highlighted_index``
	        when one is given. An empty ``DataFrame`` is returned when the state is
	        empty, lacks tokens/values, or an error occurs (the error is logged).
	    """
	    try:
	        if not state or state == "{}":
	            logging.warning("Empty state provided to update_plot")
	            return pd.DataFrame()

	        highlighted_index = int(highlighted_index) if highlighted_index else None
	        logging.info(f"Update plot triggered with token index: {highlighted_index}")

	        data = json.loads(state)
	        tokens = data.get("tokens", [])
	        values = data.get("values", [])

	        if not tokens or not values:
	            logging.warning("No tokens or values found in state")
	            return pd.DataFrame()

	        # The index must not be passed positionally: the third parameter of
	        # create_tossup_confidence_pyplot is the confidence threshold.
	        fig = create_tossup_confidence_pyplot(tokens, values)
	        if highlighted_index is not None:
	            fig.axes[0].axvline(x=highlighted_index, color="gray", linestyle=":", linewidth=1.5)
	        return fig
	    except Exception as e:
	        logging.error(f"Error updating plot: {e}", exc_info=True)
	        return pd.DataFrame()


	def create_tossup_eval_table(df: pd.DataFrame) -> pd.DataFrame:
	    """Aggregate per-question tossup results into a one-row summary table.

	    Args:
	        df: Per-question results with the columns ``chosen_idx``, ``gap``,
	            ``raw_score``, ``expected_score``, ``is_correct`` and ``buzz``.

	    Returns:
	        A single-row DataFrame of formatted strings with the columns
	        "Raw Score", "Expected Score", "Buzz Precision", "Buzz Position",
	        "+ve Gap" and "-ve Gap". A metric with nothing to average over (no
	        rows, no buzzes, no gaps of that sign) is reported as ``nan``.
	    """

	    def _ratio(numerator, denominator) -> float:
	        # NaN instead of a ZeroDivisionError / numpy divide warning.
	        return float(numerator) / float(denominator) if denominator else float("nan")

	    positions = df["chosen_idx"].dropna()
	    gaps = df["gap"].dropna()
	    pos_gaps = gaps.loc[gaps >= 0]
	    neg_gaps = gaps.loc[gaps < 0]

	    n_questions = len(df)
	    mean_tossup_score = _ratio(df["raw_score"].sum(), n_questions)
	    expected_score = _ratio(df["expected_score"].sum(), n_questions)
	    # Share of buzzes that were correct.
	    buzz_precision = _ratio(df["is_correct"].sum(), df["buzz"].sum())

	    return pd.DataFrame(
	        [
	            {
	                "Raw Score": f"{mean_tossup_score:5.1f}",
	                "Expected Score": f"{expected_score:5.1f}",
	                "Buzz Precision": f"{buzz_precision:5.1%}",
	                "Buzz Position": f"{positions.mean():5.1f}",
	                "+ve Gap": f"{pos_gaps.mean():5.1f}",
	                "-ve Gap": f"{neg_gaps.mean():5.1f}",
	            }
	        ]
	    )


	def create_tossup_eval_dashboard(run_indices: list[list[int]], df: pd.DataFrame, *, figsize=(15, 8), title_prefix=""):
	    """
	    Visualise buzzing behaviour with three sub-plots and a legend panel:

	    1. Ceiling-accuracy vs. prefix length
	    2. Scatter of earliest-correct idx vs. chosen-buzz idx
	    3. Frequency distribution of narrative classes (horizontal bars)

	    Parameters
	    ----------
	    run_indices : list[list[int]]
	        For each question, the token positions at which the model was probed.
	        The union of all positions forms the x-axis of the ceiling plot.
	    df : pd.DataFrame
	        Output of `build_buzz_dataframe` – must contain
	        columns: earliest_ok_idx, chosen_idx, cls.
	    figsize : tuple, optional
	        Figure size passed to `plt.figure`.
	    title_prefix : str, optional
	        Prepended to each subplot title (useful when comparing models).

	    Returns
	    -------
	    matplotlib figure containing the dashboard.

	    Raises
	    ------
	    ValueError
	        If `run_indices` contains no evaluation index at all.
	    """
	    # ------------------------------------------------------------------
	    # 0. Prep (variables reused throughout the function)
	    # ------------------------------------------------------------------
	    # Collect all evaluation indices across questions so we know the
	    # x-axis domain and the padding for NaNs.
	    all_indices = sorted({idx for indices in run_indices for idx in indices})
	    if not all_indices:
	        # Fail with a clear message instead of numpy's zero-size reduction error.
	        raise ValueError("run_indices must contain at least one evaluation index")
	    eval_indices = np.asarray(all_indices)

	    # Narrative classes and their colours
	    classes = [
	        "best-buzz",
	        "late-buzz",
	        "never-buzzed",
	        "premature",
	        "hopeless",
	    ]
	    colors = ["tab:green", "tab:olive", "tab:orange", "tab:red", "tab:gray"]
	    palette = dict(zip(classes, colors))

	    max_idx = eval_indices.max() * 1.25  # padding for NaN replacement / axis limits

	    # ------------------------------------------------------------------
	    # 1. Figure / axes layout
	    # ------------------------------------------------------------------
	    # GridSpec layout → 2 rows × 3 cols.
	    # ┌────────────┬────────────┬────────┐
	    # │ Ceiling    │ Scatter    │ Bars   │ (row 0)
	    # ├────────────┴────────────┴────────┤
	    # │ Descriptions (spans all 3 cols)  │ (row 1)
	    # └──────────────────────────────────┘
	    # Having a dedicated row for the narrative-class descriptions avoids
	    # overlapping with sub-plots and makes the whole figure more compact.

	    plt.style.use("ggplot")
	    fig = plt.figure(figsize=figsize)
	    gs = fig.add_gridspec(
	        nrows=2,
	        ncols=3,
	        height_ratios=[5, 1],  # extra space for plots vs. descriptions
	        width_ratios=[2.2, 2.2, 1],
	        hspace=0.2,  # reduced vertical spacing between plots
	        wspace=0.2,  # reduced horizontal spacing between plots
	        left=0.05,  # reduced left margin
	        right=0.95,  # reduced right margin
	        top=0.9,  # reduced top margin
	        bottom=0.05,  # reduced bottom margin
	    )

	    ax_ceiling = fig.add_subplot(gs[0, 0])  # Ceiling accuracy curve
	    ax_scatter = fig.add_subplot(gs[0, 1])  # Earliest vs. chosen scatter
	    ax_bars = fig.add_subplot(gs[0, 2])  # Outcome distribution bars
	    ax_desc = fig.add_subplot(gs[1, :])  # Textual descriptions
	    ax_desc.axis("off")

	    fig.suptitle("Buzzing behaviour", fontsize=16, fontweight="bold")

	    # ------------------------------------------------------------------
	    # 2. Ceiling accuracy curve
	    # ------------------------------------------------------------------
	    # For each probed index: share of questions whose earliest correct index
	    # exists and is at or before that index.
	    ceiling = [((df["earliest_ok_idx"].notna()) & (df["earliest_ok_idx"] <= idx)).mean() for idx in eval_indices]
	    ax_ceiling.plot(eval_indices, ceiling, marker="o", color="#4698cf")
	    ax_ceiling.set_xlabel("Token index shown")
	    ax_ceiling.set_ylabel("Proportion of questions correct")
	    ax_ceiling.set_ylim(0, 1.01)
	    ax_ceiling.set_title(f"{title_prefix}Ceiling accuracy vs. prefix")

	    # ------------------------------------------------------------------
	    # 3. Earliest-vs-Chosen scatter
	    # ------------------------------------------------------------------
	    for cls in classes:
	        sub = df[df["cls"] == cls]
	        if sub.empty:
	            continue
	        # Missing indices are placed at the padded maximum so they stay visible.
	        x = sub["earliest_ok_idx"].fillna(max_idx)
	        y = sub["chosen_idx"].fillna(max_idx)
	        ax_scatter.scatter(
	            x,
	            y,
	            label=cls,
	            alpha=0.7,
	            edgecolor="black",
	            linewidth=1,
	            marker="o",
	            s=90,
	            c=palette[cls],
	            facecolor="none",
	        )

	    lim = max_idx
	    ax_scatter.plot([0, lim], [0, lim], linestyle=":", linewidth=1)
	    ax_scatter.set_xlim(0, lim)
	    ax_scatter.set_ylim(0, lim)
	    ax_scatter.set_xlabel("Earliest index with correct answer")
	    ax_scatter.set_ylabel("Chosen buzz index")
	    ax_scatter.set_title(f"{title_prefix}Earliest vs. chosen index")
	    ax_scatter.legend(frameon=False, fontsize="small")

	    # ------------------------------------------------------------------
	    # 4. Outcome distribution (horizontal bars)
	    # ------------------------------------------------------------------
	    counts = df["cls"].value_counts().reindex(classes).fillna(0)
	    ax_bars.barh(
	        counts.index,
	        counts.values,
	        color=[palette[c] for c in counts.index],
	        alpha=0.7,
	        edgecolor="black",
	        linewidth=1,
	    )
	    ax_bars.set_xlabel("Number of questions")
	    ax_bars.set_title(f"{title_prefix}Outcome distribution")

	    # Ensure x-axis shows integer ticks only
	    from matplotlib.ticker import MaxNLocator

	    ax_bars.xaxis.set_major_locator(MaxNLocator(integer=True))

	    # ------------------------------------------------------------------
	    # 5. Narrative-class descriptions (bottom panel)
	    # ------------------------------------------------------------------
	    descriptions = {
	        "best-buzz": "Perfect timing. Buzzed at the earliest possible correct position",
	        "late-buzz": "Missed opportunity. Buzzed correctly but later than optimal",
	        "never-buzzed": "Missed opportunity. Never buzzed despite knowing the answer",
	        "premature": "Incorrect buzz. Buzzing at a later position could have been correct",
	        "hopeless": "Never knew the answer. No correct answer at any position",
	    }

	    y_pos = 1.0  # start at top of the description axis

	    for cls, color in zip(classes, colors):
	        ax_desc.text(
	            0.01,
	            y_pos,
	            f"■ {cls}: {descriptions[cls]}",
	            ha="left",
	            va="top",
	            color=color,
	            fontweight="bold",
	            fontsize=11,
	            transform=ax_desc.transAxes,
	        )

	        y_pos -= 0.25  # vertical step between description lines

	    # ------------------------------------------------------------------
	    # 6. Return the final figure
	    # ------------------------------------------------------------------
	    return fig


	# %%


	# Create dummy data for testing
	def create_dummy_model_outputs(n_entries=10, n_positions=5):
	    """Generate reproducible fake model outputs for testing.

	    The global NumPy RNG is seeded with 42 on every call. Each entry holds
	    ``run_indices`` (sorted positions sampled without replacement from
	    10..49) and ``run_outputs`` (one output dict per position).
	    """
	    np.random.seed(42)

	    def _fake_output(position: int) -> dict:
	        # The draw order (buzz, correct, confidence, logprob) determines the
	        # generated values, so it must stay fixed.
	        buzzed = np.random.random() > 0.7
	        answered_right = np.random.random() > 0.4
	        return {
	            "run_idx": position + 1,
	            "buzz": buzzed,
	            "correct": 1 if answered_right else 0,
	            "confidence": np.random.random(),
	            "logprob": np.log(np.random.random()),
	            "guess": f"Answer {position + 1}",
	        }

	    entries = []
	    for _ in range(n_entries):
	        # Positions are sampled before the outputs of the same entry.
	        sampled_positions = sorted(np.random.choice(range(10, 50), n_positions, replace=False))
	        fake_outputs = [_fake_output(position) for position in range(n_positions)]
	        entries.append({"run_indices": sampled_positions, "run_outputs": fake_outputs})

	    return entries


	# dummy_data = create_dummy_model_outputs()
	# dummy_df = pd.DataFrame([create_df_entry(entry["run_indices"], entry["outputs"]) for entry in dummy_data])
	# dummy_df
	# plot_buzz_dashboard(dummy_df, dummy_data[0]["run_indices"])

	# %%