Maharshi Gor
committed on
Commit
·
ee5d50c
1
Parent(s):
a4396db
Updated UI for bonus questions and playground dataset reference
Browse files- app.py +3 -8
- requirements.txt +1 -0
- shared/workflows +1 -1
- src/components/quizbowl/bonus.py +11 -10
- src/components/quizbowl/plotting.py +34 -1
- src/display/css_html_js.py +11 -0
- src/display/custom_css.py +52 -3
- src/envs.py +1 -4
app.py
CHANGED
@@ -29,7 +29,7 @@ from envs import (
|
|
29 |
LEADERBOARD_REFRESH_INTERVAL,
|
30 |
LEADERBOARD_URL,
|
31 |
LOG_LEVEL,
|
32 |
-
|
33 |
QUEUE_REPO,
|
34 |
REGISTRATION_URL,
|
35 |
REPO_ID,
|
@@ -59,13 +59,8 @@ def filter_qids(qid: str, packet_ids: list[int]) -> bool:
|
|
59 |
return packet_id in packet_ids
|
60 |
|
61 |
|
62 |
-
def load_dataset(
|
63 |
-
|
64 |
-
ds = datasets.load_dataset(PLAYGROUND_DATASET_NAMES["tossup"], split="eval")
|
65 |
-
elif mode == "bonus":
|
66 |
-
ds = datasets.load_dataset(PLAYGROUND_DATASET_NAMES["bonus"], split="eval")
|
67 |
-
else:
|
68 |
-
raise ValueError(f"Invalid mode: {mode}")
|
69 |
|
70 |
return ds.filter(lambda x: filter_qids(x["qid"], [1])).select(range(max_questions))
|
71 |
|
|
|
29 |
LEADERBOARD_REFRESH_INTERVAL,
|
30 |
LEADERBOARD_URL,
|
31 |
LOG_LEVEL,
|
32 |
+
PLAYGROUND_DATASET,
|
33 |
QUEUE_REPO,
|
34 |
REGISTRATION_URL,
|
35 |
REPO_ID,
|
|
|
59 |
return packet_id in packet_ids
|
60 |
|
61 |
|
62 |
+
def load_dataset(config_name: str, max_questions: int = 10):
    """Load a capped sample of the playground dataset's ``eval`` split.

    Args:
        config_name: Configuration of ``PLAYGROUND_DATASET`` to load.
        max_questions: Upper bound on the number of questions returned.

    Returns:
        The filtered dataset, truncated to at most ``max_questions`` rows.
    """
    ds = datasets.load_dataset(PLAYGROUND_DATASET, config_name, split="eval")
    # Keep only the rows that filter_qids accepts for packet id 1.
    ds = ds.filter(lambda x: filter_qids(x["qid"], [1]))
    # Dataset.select rejects out-of-range indices, so clamp the sample size to
    # the number of rows that survived the filter instead of assuming there
    # are always at least `max_questions` of them.
    return ds.select(range(min(max_questions, len(ds))))
|
66 |
|
requirements.txt
CHANGED
@@ -28,4 +28,5 @@ langchain-openai
|
|
28 |
langchain-cohere
|
29 |
langchain-deepseek
|
30 |
json_repair
|
|
|
31 |
loguru
|
|
|
28 |
langchain-cohere
|
29 |
langchain-deepseek
|
30 |
json_repair
|
31 |
+
unidecode
|
32 |
loguru
|
shared/workflows
CHANGED
@@ -1 +1 @@
|
|
1 |
-
Subproject commit
|
|
|
1 |
+
Subproject commit 5a20959b241e3e73ed3112fb263ad51a8f63e381
|
src/components/quizbowl/bonus.py
CHANGED
@@ -42,7 +42,7 @@ def process_bonus_results(results: list[dict]) -> pd.DataFrame:
|
|
42 |
def initialize_eval_interface(example: dict, part_outputs: list[dict], input_vars: list[str]):
|
43 |
"""Initialize the interface with example text."""
|
44 |
try:
|
45 |
-
html_content = create_bonus_html(example)
|
46 |
|
47 |
# Create confidence plot data
|
48 |
plot_data = create_bonus_confidence_plot(example["parts"], part_outputs)
|
@@ -221,7 +221,7 @@ class BonusInterface:
|
|
221 |
return (
|
222 |
html_content,
|
223 |
gr.update(value=output_state),
|
224 |
-
gr.update(value=df, label=f"Model Outputs for Question {question_id + 1}", visible=
|
225 |
gr.update(value=step_outputs, label=f"Step Outputs for Question {question_id + 1}", visible=True),
|
226 |
gr.update(visible=False),
|
227 |
)
|
@@ -248,17 +248,18 @@ class BonusInterface:
|
|
248 |
model_outputs = run_and_eval_bonus_dataset(
|
249 |
agent, self.ds, num_workers=2, return_extras=True, tqdm_provider=progress.tqdm
|
250 |
)
|
251 |
-
|
252 |
total_parts = 0
|
253 |
-
|
254 |
-
for model_output in model_outputs:
|
255 |
part_outputs = model_output["part_outputs"]
|
256 |
-
n_parts_correct
|
|
|
257 |
total_parts += len(part_outputs)
|
258 |
-
|
259 |
|
260 |
-
p_accuracy =
|
261 |
-
q_accuracy =
|
262 |
df = pd.DataFrame(
|
263 |
[
|
264 |
{
|
@@ -271,7 +272,7 @@ class BonusInterface:
|
|
271 |
|
272 |
# plot_data = create_scatter_pyplot(part_numbers, part_scores)
|
273 |
return (
|
274 |
-
gr.update(value=df, label="Scores on Sample Set"),
|
275 |
gr.update(visible=False),
|
276 |
gr.update(visible=False),
|
277 |
)
|
|
|
42 |
def initialize_eval_interface(example: dict, part_outputs: list[dict], input_vars: list[str]):
|
43 |
"""Initialize the interface with example text."""
|
44 |
try:
|
45 |
+
html_content = create_bonus_html(example, part_outputs)
|
46 |
|
47 |
# Create confidence plot data
|
48 |
plot_data = create_bonus_confidence_plot(example["parts"], part_outputs)
|
|
|
221 |
return (
|
222 |
html_content,
|
223 |
gr.update(value=output_state),
|
224 |
+
gr.update(value=df, label=f"Model Outputs for Question {question_id + 1}", visible=False),
|
225 |
gr.update(value=step_outputs, label=f"Step Outputs for Question {question_id + 1}", visible=True),
|
226 |
gr.update(visible=False),
|
227 |
)
|
|
|
248 |
model_outputs = run_and_eval_bonus_dataset(
|
249 |
agent, self.ds, num_workers=2, return_extras=True, tqdm_provider=progress.tqdm
|
250 |
)
|
251 |
+
total_parts_correct = 0
|
252 |
total_parts = 0
|
253 |
+
total_questions_correct = 0
|
254 |
+
for i, model_output in enumerate(model_outputs):
|
255 |
part_outputs = model_output["part_outputs"]
|
256 |
+
n_parts_correct = sum(output["correct"] for output in part_outputs)
|
257 |
+
total_parts_correct += n_parts_correct
|
258 |
total_parts += len(part_outputs)
|
259 |
+
total_questions_correct += int(n_parts_correct == len(part_outputs))
|
260 |
|
261 |
+
p_accuracy = total_parts_correct / total_parts
|
262 |
+
q_accuracy = total_questions_correct / len(self.ds)
|
263 |
df = pd.DataFrame(
|
264 |
[
|
265 |
{
|
|
|
272 |
|
273 |
# plot_data = create_scatter_pyplot(part_numbers, part_scores)
|
274 |
return (
|
275 |
+
gr.update(value=df, label="Scores on Sample Set", visible=True),
|
276 |
gr.update(visible=False),
|
277 |
gr.update(visible=False),
|
278 |
)
|
src/components/quizbowl/plotting.py
CHANGED
@@ -32,6 +32,34 @@ def _make_answer_line_html(answer_line: str) -> str:
|
|
32 |
"""
|
33 |
|
34 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
35 |
def _get_token_classes(confidence, buzz, score) -> str:
|
36 |
if confidence is None:
|
37 |
return "token"
|
@@ -127,7 +155,7 @@ def create_tossup_html(
|
|
127 |
return f"<div class='token-container'>Error creating tokens: {str(e)}</div>"
|
128 |
|
129 |
|
130 |
-
def create_bonus_html(example: dict) -> str:
|
131 |
# Create HTML for leadin and parts with answers
|
132 |
leadin_html = f"<div class='bonus-leadin'>{example['leadin']}</div>"
|
133 |
parts_html = []
|
@@ -139,11 +167,16 @@ def create_bonus_html(example: dict) -> str:
|
|
139 |
else:
|
140 |
answer_html = _make_answer_html(part["answer_primary"], part["clean_answers"])
|
141 |
|
|
|
|
|
|
|
|
|
142 |
"<div class='bonus-part-number'>Part {i + 1}</div>"
|
143 |
part_html = f"""
|
144 |
<div class='bonus-part'>
|
145 |
<div class='bonus-part-text'><b>#{i + 1}.</b> {question_text}</div>
|
146 |
{answer_html}
|
|
|
147 |
</div>
|
148 |
"""
|
149 |
parts_html.append(part_html)
|
|
|
32 |
"""
|
33 |
|
34 |
|
35 |
+
def _make_model_response_html(part_output: dict, explanation_token_limit: int = 25) -> str:
|
36 |
+
guess = part_output.get("guess", "")
|
37 |
+
confidence = float(part_output.get("confidence", 0.0))
|
38 |
+
explanation = part_output.get("explanation", "")
|
39 |
+
expl_tokens = explanation.split()
|
40 |
+
if len(expl_tokens) > explanation_token_limit:
|
41 |
+
k = len(expl_tokens) - explanation_token_limit
|
42 |
+
explanation = " ".join(expl_tokens[:explanation_token_limit]) + f"...[{k} more words]"
|
43 |
+
|
44 |
+
correct = part_output.get("correct", 0)
|
45 |
+
emoji = "✅" if correct else "❌"
|
46 |
+
answer_class = "correct-answer" if correct else "incorrect-answer"
|
47 |
+
|
48 |
+
return f"""
|
49 |
+
<div class='bonus-answer {answer_class}'>
|
50 |
+
<div class="bonus-answer-row" style="margin-bottom: 4px;">
|
51 |
+
<span class='bonus-answer-label' style='font-size: 1.2em;'>🤖 Guess: </span>
|
52 |
+
<span class='bonus-model-guess'>{guess} {emoji}</span>
|
53 |
+
<span class='confidence-badge' style='float: right'>⚡️ Confidence: {confidence:.2f}</span>
|
54 |
+
</div>
|
55 |
+
<div class='bonus-explanation'>
|
56 |
+
<span class='bonus-answer-label'>💬 Explanation:</span>
|
57 |
+
<span class='bonus-explanation-text' style='font-style: italic;'>{explanation}</span>
|
58 |
+
</div>
|
59 |
+
</div>
|
60 |
+
"""
|
61 |
+
|
62 |
+
|
63 |
def _get_token_classes(confidence, buzz, score) -> str:
|
64 |
if confidence is None:
|
65 |
return "token"
|
|
|
155 |
return f"<div class='token-container'>Error creating tokens: {str(e)}</div>"
|
156 |
|
157 |
|
158 |
+
def create_bonus_html(example: dict, part_outputs: list[dict] | None = None) -> str:
|
159 |
# Create HTML for leadin and parts with answers
|
160 |
leadin_html = f"<div class='bonus-leadin'>{example['leadin']}</div>"
|
161 |
parts_html = []
|
|
|
167 |
else:
|
168 |
answer_html = _make_answer_html(part["answer_primary"], part["clean_answers"])
|
169 |
|
170 |
+
model_response_html = ""
|
171 |
+
if part_outputs is not None:
|
172 |
+
model_response_html = _make_model_response_html(part_outputs[i])
|
173 |
+
|
174 |
"<div class='bonus-part-number'>Part {i + 1}</div>"
|
175 |
part_html = f"""
|
176 |
<div class='bonus-part'>
|
177 |
<div class='bonus-part-text'><b>#{i + 1}.</b> {question_text}</div>
|
178 |
{answer_html}
|
179 |
+
{model_response_html}
|
180 |
</div>
|
181 |
"""
|
182 |
parts_html.append(part_html)
|
src/display/css_html_js.py
CHANGED
@@ -132,6 +132,17 @@ fonts_header = """
|
|
132 |
|
133 |
js_head = """
|
134 |
<script>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
135 |
const gradioApp = document.getElementsByTagName('gradio-app')[0];
|
136 |
console.log("Gradio app:", gradioApp);
|
137 |
console.log(gradioApp.querySelectorAll('.token'));
|
|
|
132 |
|
133 |
js_head = """
|
134 |
<script>
|
135 |
+
/**
 * Force the light theme through the `__theme` query parameter.
 *
 * Reads the current URL; when `__theme` is absent or set to 'dark', rewrites
 * it to 'light' and navigates to the updated URL. When the parameter already
 * holds any other value the function does nothing.
 */
function refresh() {
    const url = new URL(window.location);
    console.log("URL:", url);
    const theme = url.searchParams.get('__theme');
    console.log("Theme:", theme);
    if (!theme || theme === 'dark') {
        url.searchParams.set('__theme', 'light');
        console.log("Setting theme to light");
        // Navigate only when the URL was actually changed. Assigning the
        // unchanged URL would reload the page on every call, which becomes an
        // endless reload loop if this runs on page load.
        window.location.href = url.href;
    }
}
|
146 |
const gradioApp = document.getElementsByTagName('gradio-app')[0];
|
147 |
console.log("Gradio app:", gradioApp);
|
148 |
console.log(gradioApp.querySelectorAll('.token'));
|
src/display/custom_css.py
CHANGED
@@ -68,6 +68,21 @@ input[type=range][disabled] {
|
|
68 |
opacity: .3;
|
69 |
}
|
70 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
71 |
.json-node {
|
72 |
/* On a light background (usually white), use darker and vivid colors */
|
73 |
font-size: var(--text-sm) !important;
|
@@ -582,20 +597,54 @@ css_bonus = """
|
|
582 |
}
|
583 |
|
584 |
.bonus-answer {
|
585 |
-
background-color: #
|
586 |
border-radius: 6px;
|
587 |
padding: 8px 12px;
|
588 |
margin-top: 8px;
|
589 |
font-size: 14px;
|
590 |
-
border-left: 3px solid #
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
591 |
}
|
592 |
|
593 |
.bonus-answer-label {
|
594 |
font-weight: 500;
|
595 |
-
color: #
|
596 |
margin-bottom: 4px;
|
597 |
}
|
598 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
599 |
.bonus-answer-text {
|
600 |
color: #333;
|
601 |
}
|
|
|
68 |
opacity: .3;
|
69 |
}
|
70 |
|
71 |
+
b, i, u, em, strong {
|
72 |
+
color: inherit !important;
|
73 |
+
/* Then override specific properties you want to keep */
|
74 |
+
}
|
75 |
+
|
76 |
+
b {font-weight: bold !important;}
|
77 |
+
|
78 |
+
i {font-style: italic !important;}
|
79 |
+
|
80 |
+
u {text-decoration: underline !important;}
|
81 |
+
|
82 |
+
em {font-style: italic !important;}
|
83 |
+
|
84 |
+
strong {font-weight: bold !important;}
|
85 |
+
|
86 |
.json-node {
|
87 |
/* On a light background (usually white), use darker and vivid colors */
|
88 |
font-size: var(--text-sm) !important;
|
|
|
597 |
}
|
598 |
|
599 |
.bonus-answer {
|
600 |
+
background-color: #bde1ff;
|
601 |
border-radius: 6px;
|
602 |
padding: 8px 12px;
|
603 |
margin-top: 8px;
|
604 |
font-size: 14px;
|
605 |
+
border-left: 3px solid #133cba !important;
|
606 |
+
}
|
607 |
+
|
608 |
+
.confidence-badge {
|
609 |
+
font-size: 0.9em;
|
610 |
+
background-color: #2c3e50;
|
611 |
+
color: #fff;
|
612 |
+
padding: 2px 8px;
|
613 |
+
border-radius: 12px;
|
614 |
+
margin-left: 8px;
|
615 |
+
}
|
616 |
+
|
617 |
+
.correct-answer {
|
618 |
+
background-color: #b3f2ce !important;
|
619 |
+
border-left: 3px solid #228b22 !important;
|
620 |
+
}
|
621 |
+
|
622 |
+
.incorrect-answer {
|
623 |
+
background-color: #ffd1c9 !important;
|
624 |
+
border-left: 3px solid #ff4444 !important;
|
625 |
}
|
626 |
|
627 |
.bonus-answer-label {
|
628 |
font-weight: 500;
|
629 |
+
color: #133cba; /* Royal Blue */
|
630 |
margin-bottom: 4px;
|
631 |
}
|
632 |
|
633 |
+
.bonus-model-guess {
|
634 |
+
border: 2px dotted #666 !important;
|
635 |
+
padding: 4px 8px !important;
|
636 |
+
border-radius: 4px !important;
|
637 |
+
font-weight: bold !important;
|
638 |
+
}
|
639 |
+
|
640 |
+
.correct-label {
|
641 |
+
color: #228b22;
|
642 |
+
}
|
643 |
+
|
644 |
+
.incorrect-label {
|
645 |
+
color: #ff4444;
|
646 |
+
}
|
647 |
+
|
648 |
.bonus-answer-text {
|
649 |
color: #333;
|
650 |
}
|
src/envs.py
CHANGED
@@ -32,10 +32,7 @@ REGISTRATION_URL = "https://huggingface.co/spaces/qanta-challenge/register"
|
|
32 |
LEADERBOARD_URL = "https://huggingface.co/spaces/qanta-challenge/leaderboard"
|
33 |
EXAMPLES_PATH = "examples"
|
34 |
|
35 |
-
|
36 |
-
"tossup": f"{OWNER}/acf-co24-tossups",
|
37 |
-
"bonus": f"{OWNER}/acf-co24-bonuses",
|
38 |
-
}
|
39 |
|
40 |
# ----------------------------------
|
41 |
|
|
|
32 |
LEADERBOARD_URL = "https://huggingface.co/spaces/qanta-challenge/leaderboard"
|
33 |
EXAMPLES_PATH = "examples"
|
34 |
|
35 |
+
PLAYGROUND_DATASET = f"{OWNER}/acf-co24"
|
|
|
|
|
|
|
36 |
|
37 |
# ----------------------------------
|
38 |
|