Maharshi Gor committed on
Commit
ee5d50c
·
1 Parent(s): a4396db

Updated UI for bonus questions and playground dataset reference

Browse files
app.py CHANGED
@@ -29,7 +29,7 @@ from envs import (
29
  LEADERBOARD_REFRESH_INTERVAL,
30
  LEADERBOARD_URL,
31
  LOG_LEVEL,
32
- PLAYGROUND_DATASET_NAMES,
33
  QUEUE_REPO,
34
  REGISTRATION_URL,
35
  REPO_ID,
@@ -59,13 +59,8 @@ def filter_qids(qid: str, packet_ids: list[int]) -> bool:
59
  return packet_id in packet_ids
60
 
61
 
62
- def load_dataset(mode: str, max_questions: int = 10):
63
- if mode == "tossup":
64
- ds = datasets.load_dataset(PLAYGROUND_DATASET_NAMES["tossup"], split="eval")
65
- elif mode == "bonus":
66
- ds = datasets.load_dataset(PLAYGROUND_DATASET_NAMES["bonus"], split="eval")
67
- else:
68
- raise ValueError(f"Invalid mode: {mode}")
69
 
70
  return ds.filter(lambda x: filter_qids(x["qid"], [1])).select(range(max_questions))
71
 
 
29
  LEADERBOARD_REFRESH_INTERVAL,
30
  LEADERBOARD_URL,
31
  LOG_LEVEL,
32
+ PLAYGROUND_DATASET,
33
  QUEUE_REPO,
34
  REGISTRATION_URL,
35
  REPO_ID,
 
59
  return packet_id in packet_ids
60
 
61
 
62
def load_dataset(config_name: str, max_questions: int = 10):
    """Load a small sample of playground questions for one dataset config.

    Args:
        config_name: Configuration of ``PLAYGROUND_DATASET`` to load; it is
            forwarded to ``datasets.load_dataset`` as the config name.
        max_questions: Upper bound on the number of rows returned.

    Returns:
        The ``"eval"`` split restricted to rows whose ``qid`` passes
        ``filter_qids(qid, [1])``, truncated to at most ``max_questions`` rows.
    """
    ds = datasets.load_dataset(PLAYGROUND_DATASET, config_name, split="eval")
    ds = ds.filter(lambda x: filter_qids(x["qid"], [1]))
    # Clamp the selection: asking select() for indices past the end of the
    # filtered split raises, so a short split should simply yield fewer rows.
    return ds.select(range(min(max_questions, len(ds))))
66
 
requirements.txt CHANGED
@@ -28,4 +28,5 @@ langchain-openai
28
  langchain-cohere
29
  langchain-deepseek
30
  json_repair
 
31
  loguru
 
28
  langchain-cohere
29
  langchain-deepseek
30
  json_repair
31
+ unidecode
32
  loguru
shared/workflows CHANGED
@@ -1 +1 @@
1
- Subproject commit 7f0d4f60746e4911abd1af80d8fbce1fff906549
 
1
+ Subproject commit 5a20959b241e3e73ed3112fb263ad51a8f63e381
src/components/quizbowl/bonus.py CHANGED
@@ -42,7 +42,7 @@ def process_bonus_results(results: list[dict]) -> pd.DataFrame:
42
  def initialize_eval_interface(example: dict, part_outputs: list[dict], input_vars: list[str]):
43
  """Initialize the interface with example text."""
44
  try:
45
- html_content = create_bonus_html(example)
46
 
47
  # Create confidence plot data
48
  plot_data = create_bonus_confidence_plot(example["parts"], part_outputs)
@@ -221,7 +221,7 @@ class BonusInterface:
221
  return (
222
  html_content,
223
  gr.update(value=output_state),
224
- gr.update(value=df, label=f"Model Outputs for Question {question_id + 1}", visible=True),
225
  gr.update(value=step_outputs, label=f"Step Outputs for Question {question_id + 1}", visible=True),
226
  gr.update(visible=False),
227
  )
@@ -248,17 +248,18 @@ class BonusInterface:
248
  model_outputs = run_and_eval_bonus_dataset(
249
  agent, self.ds, num_workers=2, return_extras=True, tqdm_provider=progress.tqdm
250
  )
251
- n_parts_correct = 0
252
  total_parts = 0
253
- n_questions_correct = 0
254
- for model_output in model_outputs:
255
  part_outputs = model_output["part_outputs"]
256
- n_parts_correct += sum(output["correct"] for output in part_outputs)
 
257
  total_parts += len(part_outputs)
258
- n_questions_correct += int(n_parts_correct == len(part_outputs))
259
 
260
- p_accuracy = n_parts_correct / total_parts
261
- q_accuracy = n_questions_correct / len(self.ds)
262
  df = pd.DataFrame(
263
  [
264
  {
@@ -271,7 +272,7 @@ class BonusInterface:
271
 
272
  # plot_data = create_scatter_pyplot(part_numbers, part_scores)
273
  return (
274
- gr.update(value=df, label="Scores on Sample Set"),
275
  gr.update(visible=False),
276
  gr.update(visible=False),
277
  )
 
42
  def initialize_eval_interface(example: dict, part_outputs: list[dict], input_vars: list[str]):
43
  """Initialize the interface with example text."""
44
  try:
45
+ html_content = create_bonus_html(example, part_outputs)
46
 
47
  # Create confidence plot data
48
  plot_data = create_bonus_confidence_plot(example["parts"], part_outputs)
 
221
  return (
222
  html_content,
223
  gr.update(value=output_state),
224
+ gr.update(value=df, label=f"Model Outputs for Question {question_id + 1}", visible=False),
225
  gr.update(value=step_outputs, label=f"Step Outputs for Question {question_id + 1}", visible=True),
226
  gr.update(visible=False),
227
  )
 
248
  model_outputs = run_and_eval_bonus_dataset(
249
  agent, self.ds, num_workers=2, return_extras=True, tqdm_provider=progress.tqdm
250
  )
251
+ total_parts_correct = 0
252
  total_parts = 0
253
+ total_questions_correct = 0
254
+ for i, model_output in enumerate(model_outputs):
255
  part_outputs = model_output["part_outputs"]
256
+ n_parts_correct = sum(output["correct"] for output in part_outputs)
257
+ total_parts_correct += n_parts_correct
258
  total_parts += len(part_outputs)
259
+ total_questions_correct += int(n_parts_correct == len(part_outputs))
260
 
261
+ p_accuracy = total_parts_correct / total_parts
262
+ q_accuracy = total_questions_correct / len(self.ds)
263
  df = pd.DataFrame(
264
  [
265
  {
 
272
 
273
  # plot_data = create_scatter_pyplot(part_numbers, part_scores)
274
  return (
275
+ gr.update(value=df, label="Scores on Sample Set", visible=True),
276
  gr.update(visible=False),
277
  gr.update(visible=False),
278
  )
src/components/quizbowl/plotting.py CHANGED
@@ -32,6 +32,34 @@ def _make_answer_line_html(answer_line: str) -> str:
32
  """
33
 
34
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
  def _get_token_classes(confidence, buzz, score) -> str:
36
  if confidence is None:
37
  return "token"
@@ -127,7 +155,7 @@ def create_tossup_html(
127
  return f"<div class='token-container'>Error creating tokens: {str(e)}</div>"
128
 
129
 
130
- def create_bonus_html(example: dict) -> str:
131
  # Create HTML for leadin and parts with answers
132
  leadin_html = f"<div class='bonus-leadin'>{example['leadin']}</div>"
133
  parts_html = []
@@ -139,11 +167,16 @@ def create_bonus_html(example: dict) -> str:
139
  else:
140
  answer_html = _make_answer_html(part["answer_primary"], part["clean_answers"])
141
 
 
 
 
 
142
  "<div class='bonus-part-number'>Part {i + 1}</div>"
143
  part_html = f"""
144
  <div class='bonus-part'>
145
  <div class='bonus-part-text'><b>#{i + 1}.</b> {question_text}</div>
146
  {answer_html}
 
147
  </div>
148
  """
149
  parts_html.append(part_html)
 
32
  """
33
 
34
 
35
+ def _make_model_response_html(part_output: dict, explanation_token_limit: int = 25) -> str:
36
+ guess = part_output.get("guess", "")
37
+ confidence = float(part_output.get("confidence", 0.0))
38
+ explanation = part_output.get("explanation", "")
39
+ expl_tokens = explanation.split()
40
+ if len(expl_tokens) > explanation_token_limit:
41
+ k = len(expl_tokens) - explanation_token_limit
42
+ explanation = " ".join(expl_tokens[:explanation_token_limit]) + f"...[{k} more words]"
43
+
44
+ correct = part_output.get("correct", 0)
45
+ emoji = "✅" if correct else "❌"
46
+ answer_class = "correct-answer" if correct else "incorrect-answer"
47
+
48
+ return f"""
49
+ <div class='bonus-answer {answer_class}'>
50
+ <div class="bonus-answer-row" style="margin-bottom: 4px;">
51
+ <span class='bonus-answer-label' style='font-size: 1.2em;'>🤖 Guess: </span>
52
+ <span class='bonus-model-guess'>{guess} {emoji}</span>
53
+ <span class='confidence-badge' style='float: right'>⚡️ Confidence: {confidence:.2f}</span>
54
+ </div>
55
+ <div class='bonus-explanation'>
56
+ <span class='bonus-answer-label'>💬 Explanation:</span>
57
+ <span class='bonus-explanation-text' style='font-style: italic;'>{explanation}</span>
58
+ </div>
59
+ </div>
60
+ """
61
+
62
+
63
  def _get_token_classes(confidence, buzz, score) -> str:
64
  if confidence is None:
65
  return "token"
 
155
  return f"<div class='token-container'>Error creating tokens: {str(e)}</div>"
156
 
157
 
158
+ def create_bonus_html(example: dict, part_outputs: list[dict] | None = None) -> str:
159
  # Create HTML for leadin and parts with answers
160
  leadin_html = f"<div class='bonus-leadin'>{example['leadin']}</div>"
161
  parts_html = []
 
167
  else:
168
  answer_html = _make_answer_html(part["answer_primary"], part["clean_answers"])
169
 
170
+ model_response_html = ""
171
+ if part_outputs is not None:
172
+ model_response_html = _make_model_response_html(part_outputs[i])
173
+
174
  "<div class='bonus-part-number'>Part {i + 1}</div>"
175
  part_html = f"""
176
  <div class='bonus-part'>
177
  <div class='bonus-part-text'><b>#{i + 1}.</b> {question_text}</div>
178
  {answer_html}
179
+ {model_response_html}
180
  </div>
181
  """
182
  parts_html.append(part_html)
src/display/css_html_js.py CHANGED
@@ -132,6 +132,17 @@ fonts_header = """
132
 
133
  js_head = """
134
  <script>
 
 
 
 
 
 
 
 
 
 
 
135
  const gradioApp = document.getElementsByTagName('gradio-app')[0];
136
  console.log("Gradio app:", gradioApp);
137
  console.log(gradioApp.querySelectorAll('.token'));
 
132
 
133
  js_head = """
134
  <script>
135
+ function refresh() {
136
+ const url = new URL(window.location);
137
+ console.log("URL:", url);
138
+ const theme = url.searchParams.get('__theme');
139
+ console.log("Theme:", theme);
140
+ if (!theme || theme === 'dark') {
141
+ url.searchParams.set('__theme', 'light');
142
+ console.log("Setting theme to light");
143
+ }
144
+ window.location.href = url.href;
145
+ }
146
  const gradioApp = document.getElementsByTagName('gradio-app')[0];
147
  console.log("Gradio app:", gradioApp);
148
  console.log(gradioApp.querySelectorAll('.token'));
src/display/custom_css.py CHANGED
@@ -68,6 +68,21 @@ input[type=range][disabled] {
68
  opacity: .3;
69
  }
70
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71
  .json-node {
72
  /* On a light background (usually white), use darker and vivid colors */
73
  font-size: var(--text-sm) !important;
@@ -582,20 +597,54 @@ css_bonus = """
582
  }
583
 
584
  .bonus-answer {
585
- background-color: #fff5f5;
586
  border-radius: 6px;
587
  padding: 8px 12px;
588
  margin-top: 8px;
589
  font-size: 14px;
590
- border-left: 3px solid #ff6b6b;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
591
  }
592
 
593
  .bonus-answer-label {
594
  font-weight: 500;
595
- color: #666;
596
  margin-bottom: 4px;
597
  }
598
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
599
  .bonus-answer-text {
600
  color: #333;
601
  }
 
68
  opacity: .3;
69
  }
70
 
71
+ b, i, u, em, strong {
72
+ color: inherit !important;
73
+ /* Then override specific properties you want to keep */
74
+ }
75
+
76
+ b {font-weight: bold !important;}
77
+
78
+ i {font-style: italic !important;}
79
+
80
+ u {text-decoration: underline !important;}
81
+
82
+ em {font-style: italic !important;}
83
+
84
+ strong {font-weight: bold !important;}
85
+
86
  .json-node {
87
  /* On a light background (usually white), use darker and vivid colors */
88
  font-size: var(--text-sm) !important;
 
597
  }
598
 
599
  .bonus-answer {
600
+ background-color: #bde1ff;
601
  border-radius: 6px;
602
  padding: 8px 12px;
603
  margin-top: 8px;
604
  font-size: 14px;
605
+ border-left: 3px solid #133cba !important;
606
+ }
607
+
608
+ .confidence-badge {
609
+ font-size: 0.9em;
610
+ background-color: #2c3e50;
611
+ color: #fff;
612
+ padding: 2px 8px;
613
+ border-radius: 12px;
614
+ margin-left: 8px;
615
+ }
616
+
617
+ .correct-answer {
618
+ background-color: #b3f2ce !important;
619
+ border-left: 3px solid #228b22 !important;
620
+ }
621
+
622
+ .incorrect-answer {
623
+ background-color: #ffd1c9 !important;
624
+ border-left: 3px solid #ff4444 !important;
625
  }
626
 
627
  .bonus-answer-label {
628
  font-weight: 500;
629
+ color: #133cba; /* Royal Blue */
630
  margin-bottom: 4px;
631
  }
632
 
633
+ .bonus-model-guess {
634
+ border: 2px dotted #666 !important;
635
+ padding: 4px 8px !important;
636
+ border-radius: 4px !important;
637
+ font-weight: bold !important;
638
+ }
639
+
640
+ .correct-label {
641
+ color: #228b22;
642
+ }
643
+
644
+ .incorrect-label {
645
+ color: #ff4444;
646
+ }
647
+
648
  .bonus-answer-text {
649
  color: #333;
650
  }
src/envs.py CHANGED
@@ -32,10 +32,7 @@ REGISTRATION_URL = "https://huggingface.co/spaces/qanta-challenge/register"
32
  LEADERBOARD_URL = "https://huggingface.co/spaces/qanta-challenge/leaderboard"
33
  EXAMPLES_PATH = "examples"
34
 
35
- PLAYGROUND_DATASET_NAMES = {
36
- "tossup": f"{OWNER}/acf-co24-tossups",
37
- "bonus": f"{OWNER}/acf-co24-bonuses",
38
- }
39
 
40
  # ----------------------------------
41
 
 
32
  LEADERBOARD_URL = "https://huggingface.co/spaces/qanta-challenge/leaderboard"
33
  EXAMPLES_PATH = "examples"
34
 
35
+ PLAYGROUND_DATASET = f"{OWNER}/acf-co24"
 
 
 
36
 
37
  # ----------------------------------
38