Maharshi Gor committed · Commit 02b7dec · Parent(s): 0bab47c
Adds quizbowl pipeline support for bonus and tossup questions
Implements new YAML configurations for bonus and tossup pipelines, enhancing the quizbowl application.
Introduces a two-step process for evaluating answers and confidence levels.
Updates UI components to facilitate pipeline selection and loading, improving user experience.
- examples/bonus/simple-bonus-pipeline.yaml +55 -0
- examples/tossup/simple-tossup-pipeline.yaml +36 -0
- examples/tossup/two-step-justified-confidence.yaml +49 -0
- src/app_configs.py +1 -0
- src/components/model_pipeline/tossup_pipeline.py +10 -8
- src/components/quizbowl/bonus.py +53 -49
- src/components/quizbowl/commons.py +2 -0
- src/components/quizbowl/plotting.py +5 -5
- src/components/quizbowl/populate.py +38 -0
- src/components/quizbowl/tossup.py +34 -31
- src/display/custom_css.py +59 -16
- src/display/guide.py +23 -7
- src/envs.py +3 -1
- src/submission/structs.py +5 -2
- src/submission/submit.py +32 -5
- src/workflows/structs.py +6 -0
examples/bonus/simple-bonus-pipeline.yaml
ADDED
```yaml
inputs:
  - leadin
  - part
outputs:
  answer: A.answer
  confidence: A.confidence
  explanation: A.explanation
steps:
  - id: A
    name: Bonus Agent
    model: gpt-4o-mini
    provider: OpenAI
    temperature: 0.3
    system_prompt: 'You are an expert quizbowl player specializing in answering bonus questions across various academic domains including history, literature, science, fine arts, and social sciences.

      Quizbowl bonus questions consist of a leadin paragraph followed by multiple parts. Each part tests specific knowledge within the topic introduced in the leadin.

      For each bonus part you receive:

      1. Carefully analyze both the leadin context and the specific part text
      2. Identify key clues, terms, and relationships mentioned
      3. Determine the most precise answer based on the specific wording and context
      4. Provide your answer in the standard accepted format (e.g., full name for people, complete titles for works)
      5. Assess your confidence on a scale from 0.0 (complete guess) to 1.0 (absolute certainty)
      6. Explain your reasoning process, citing specific clues from the question that led to your answer

      Remember that bonus questions often follow patterns:
      - Parts typically increase in difficulty from easy (15 points) to medium (10 points) to hard (5 points)
      - Later parts may build on information from earlier parts
      - Specific answer formats may be requested (e.g., "name the author", "identify the compound")

      Format your response precisely as:

      ANSWER: <your specific, concise answer>

      CONFIDENCE: <numerical value between 0.0 and 1.0>

      EXPLANATION: <detailed reasoning that connects specific clues to your answer, demonstrates your thought process, and justifies your confidence level>'
    input_fields:
      - name: question_leadin
        description: The introductory paragraph that establishes the topic and context for all parts of the bonus question
        variable: leadin
      - name: question_part
        description: The specific part text containing clues that should lead to a single answer
        variable: part
    output_fields:
      - name: answer
        description: The precise predicted answer that directly responds to what the question is asking for
        type: str
      - name: confidence
        type: float
        description: A numerical assessment (0.0-1.0) of certainty in the provided answer, where 0.0 indicates a complete guess and 1.0 indicates absolute certainty
      - name: explanation
        description: A detailed justification that connects specific clues from the question to the answer and explains the reasoning process
        type: str
```
examples/tossup/simple-tossup-pipeline.yaml
ADDED
```yaml
inputs:
  - question_text
outputs:
  answer: A.answer
  confidence: A.confidence
steps:
  - id: A
    name: Tossup Agent
    model: gpt-4o-mini
    provider: OpenAI
    temperature: 0.1
    system_prompt: |
      You are a professional quizbowl player answering tossup questions.
      Given a progressively revealed question text, provide your best guess at the answer and your confidence level.

      Your task:
      1. Analyze the clues provided in the question text
      2. Determine the most likely answer based on the information available
      3. Assess your confidence in your answer on a scale from 0.0 (complete guess) to 1.0 (absolute certainty)

      Keep your answer direct and concise, limited to a couple of words.
      Your confidence should reflect how certain you are based on the clues revealed so far.
    input_fields:
      - name: question
        description: The progressively revealed question text so far.
        variable: question_text
    output_fields:
      - name: answer
        description: Your best guess at the answer to the revealed question text.
        type: str
      - name: confidence
        type: float
        description: Your confidence in the answer, ranging from 0.0 (complete guess) to 1.0 (absolute certainty) in increments of 0.01.
buzzer:
  method: AND
  confidence_threshold: 0.8
```
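These pipeline YAMLs are parsed straight into the repo's pydantic workflow models (see `load_demo_example` in `src/submission/submit.py` below). A minimal loading sketch for the tossup example above, assuming `TossupWorkflow` exposes the buzzer settings as model attributes:

```python
# Minimal loading sketch, mirroring load_demo_example() further down;
# assumes TossupWorkflow exposes buzzer settings as model attributes.
import yaml

from workflows.structs import TossupWorkflow

with open("examples/tossup/simple-tossup-pipeline.yaml") as f:
    yaml_data = yaml.safe_load(f)

# Pydantic validation rejects malformed pipelines at load time
# (e.g., a confidence_threshold outside [0.0, 1.0]).
workflow = TossupWorkflow.model_validate(yaml_data)
print(workflow.buzzer.method, workflow.buzzer.confidence_threshold)  # AND 0.8
```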
examples/tossup/two-step-justified-confidence.yaml
ADDED
```yaml
inputs:
  - question_text
outputs:
  answer: A.answer
  confidence: B.confidence
  justification: B.justification
steps:
  - id: A
    name: Answer Generator
    model: gpt-4o-mini
    provider: OpenAI
    temperature: 0.1
    system_prompt: You are a professional quizbowl player answering tossup questions.
      Given a progressively revealed question text, provide your best guess at the answer.
      Keep your answer direct and concise, limited to a couple of words.
      Focus only on determining the correct answer based on the clues provided so far.
    input_fields:
      - name: question
        description: The progressively revealed question text so far.
        variable: question_text
    output_fields:
      - name: answer
        description: Your best guess at the answer to the revealed question text.
  - id: B
    name: Confidence Evaluator
    model: command-r-plus
    provider: Cohere
    temperature: 0.1
    system_prompt: You are a professional quizbowl player evaluating answer confidence.
      Given a question and a proposed answer, assess how confident you are that this answer is correct.
      Provide a confidence score from 0.0 (completely certain the answer is incorrect) to 1.0 (completely certain the answer is correct) in increments of 0.01.
      0.5 means you are highly uncertain about the correctness of the answer.
      Also provide a single-line justification explaining why you assigned this confidence level.
    input_fields:
      - name: question
        description: The progressively revealed question text so far.
        variable: question_text
      - name: proposed_answer
        description: The answer proposed by the first step.
        variable: A.answer
    output_fields:
      - name: confidence
        type: float
        description: Your confidence in the proposed answer, ranging from 0.0 to 1.0 in increments of 0.01.
      - name: justification
        description: A single-line explanation justifying your confidence score.
buzzer:
  method: AND
  confidence_threshold: 0.8
```
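In this two-step pipeline, step B consumes step A's output through the `variable: A.answer` reference, and the top-level `outputs` block maps pipeline outputs to `<step_id>.<field>` names. A hypothetical sketch of how such references could be resolved; the actual executor (`workflows/qb_agents.py`) is not part of this diff:

```python
# Hypothetical resolver for "<step_id>.<field>" references like A.answer;
# the real execution logic lives in workflows/qb_agents.py, not shown here.
def resolve_reference(ref: str, step_outputs: dict[str, dict]) -> object:
    step_id, field = ref.split(".", 1)
    return step_outputs[step_id][field]

# After step A runs, its outputs become available to later steps:
step_outputs = {"A": {"answer": "the Krebs cycle"}}
proposed_answer = resolve_reference("A.answer", step_outputs)  # input to step B
```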
src/app_configs.py
CHANGED
```diff
@@ -2,6 +2,7 @@ THEME = "gstaff/sketch"
 
 UNSELECTED_VAR_NAME = "Select Variable..."
 UNSELECTED_MODEL_NAME = "Select Model..."
+UNSELECTED_PIPELINE_NAME = "Select Pipeline to Import..."
 AVAILABLE_MODELS = {
     "OpenAI/gpt-4o": {
         "model": "gpt-4o-2024-11-20",
```
src/components/model_pipeline/tossup_pipeline.py
CHANGED
```diff
@@ -52,15 +52,13 @@ class TossupPipelineInterface(PipelineInterface):
         tokens_prob: float | None,
     ):
         """Update the buzzer."""
-        else:
-            log_prob_thresh = None
+
+        prob_threshold = float(tokens_prob) if tokens_prob and tokens_prob > 0 else None
         state.workflow.buzzer = state.workflow.buzzer.model_copy(
             update={
                 "method": method,
                 "confidence_threshold": confidence_threshold,
+                "prob_threshold": prob_threshold,
             }
         )
         Buzzer.model_validate(state.workflow.buzzer)
@@ -108,14 +106,17 @@ class TossupPipelineInterface(PipelineInterface):
         )
         dropdowns[output_field] = dropdown
         with gr.Row(elem_classes="output-fields-header"):
-            gr.Markdown(
+            gr.Markdown(
+                "#### Buzzer settings:\n Set your thresholds for confidence and output tokens probability."
+            )
         with gr.Row(elem_classes="control-panel"):
             self.confidence_slider = gr.Slider(
                 minimum=0.0,
                 maximum=1.0,
                 value=self.defaults.get("confidence_threshold", 0.85),
                 step=0.01,
-                label="Confidence
+                label="Confidence",
+                elem_classes="slider-container",
             )
             self.buzzer_method_dropdown = gr.Dropdown(
                 choices=["AND", "OR"],
@@ -127,10 +128,11 @@ class TossupPipelineInterface(PipelineInterface):
             )
             self.prob_slider = gr.Slider(
                 value=self.defaults.get("logits_prob", 0.0),
-                label="Probability
+                label="Probability",
                 minimum=0.0,
                 maximum=1.0,
                 step=0.001,
+                elem_classes="slider-container",
             )
 
     def update_choices(available_variables):
```
src/components/quizbowl/bonus.py
CHANGED
```diff
@@ -6,13 +6,14 @@ import pandas as pd
 from datasets import Dataset
 from loguru import logger
 
+from app_configs import UNSELECTED_PIPELINE_NAME
 from components.model_pipeline.model_pipeline import PipelineInterface, PipelineState, PipelineUIState
 from display.formatting import styled_error
 from submission import submit
 from workflows.qb_agents import QuizBowlBonusAgent
 from workflows.structs import ModelStep, Workflow
 
-from . import commons
+from . import commons, populate
 from .plotting import (
     create_bonus_confidence_plot,
     create_bonus_html,
@@ -120,10 +121,11 @@ class BonusInterface:
         self.output_state = gr.State(value="{}")
         self.render()
 
-    def
+    def _render_pipeline_interface(self, workflow: Workflow, simple: bool = True):
         """Render the model interface."""
-        with gr.Row():
-            self.
+        with gr.Row(elem_classes="bonus-header-row form-inline"):
+            self.pipeline_selector = commons.get_pipeline_selector([])
+            self.load_btn = gr.Button("⬇️ Import Pipeline", variant="secondary")
         self.pipeline_interface = PipelineInterface(
             workflow,
             simple=simple,
@@ -137,16 +139,13 @@ class BonusInterface:
         self.run_btn = gr.Button("Run on Bonus Question", variant="secondary")
 
         self.question_display = gr.HTML(label="Question", elem_id="bonus-question-display")
-
-        self.confidence_plot = gr.Plot(
-            label="Part Confidence",
-            format="webp",
-        )
-
+        self.error_display = gr.HTML(label="Error", elem_id="bonus-error-display", visible=False)
         self.results_table = gr.DataFrame(
             label="Model Outputs",
             value=pd.DataFrame(columns=["Part", "Correct?", "Confidence", "Prediction", "Explanation"]),
+            visible=False,
         )
+        self.model_outputs_display = gr.JSON(label="Model Outputs", value="{}", show_indices=True, visible=False)
 
         with gr.Row():
             self.eval_btn = gr.Button("Evaluate", variant="primary")
@@ -168,7 +167,7 @@ class BonusInterface:
         with gr.Row():
             # Model Panel
             with gr.Column(scale=1):
-                self.
+                self._render_pipeline_interface(workflow, simple=self.defaults["simple_workflow"])
 
             with gr.Column(scale=1):
                 self._render_qb_interface()
@@ -192,20 +191,6 @@ class BonusInterface:
         except Exception as e:
             return f"Error loading question: {str(e)}"
 
-    def get_user_submission_names(self, profile: gr.OAuthProfile | None) -> list[str]:
-        if profile is None:
-            logger.error("Authentication required. Please log in to view your submissions.")
-            return []
-        model_names = submit.get_user_submission_names("bonus", profile)
-        logger.info("Loaded model names: {model_names}")
-        return gr.update(choices=model_names, value=None)
-
-    def load_user_submission(self, model_name: str, profile: gr.OAuthProfile | None) -> PipelineState:
-        if profile is None:
-            return styled_error("Authentication required. Please log in to view your submissions.")
-        submission = submit.load_submission(model_name, "tossup", profile)
-        return PipelineState(workflow=submission.workflow, ui_state=PipelineUIState.from_workflow(submission.workflow))
-
     def get_model_outputs(self, example: dict, pipeline_state: PipelineState):
         """Get the model outputs for a given question ID."""
         outputs = []
@@ -224,6 +209,20 @@ class BonusInterface:
 
         return outputs
 
+    def get_pipeline_names(self, profile: gr.OAuthProfile | None) -> list[str]:
+        names = [UNSELECTED_PIPELINE_NAME] + populate.get_pipeline_names("bonus", profile)
+        return gr.update(choices=names, value=UNSELECTED_PIPELINE_NAME)
+
+    def load_pipeline(self, model_name: str, profile: gr.OAuthProfile | None) -> tuple[str, PipelineState]:
+        try:
+            pipeline_state = populate.load_pipeline("bonus", model_name, profile)
+            if pipeline_state is None:
+                return UNSELECTED_PIPELINE_NAME, gr.skip(), gr.update(visible=False)
+            return UNSELECTED_PIPELINE_NAME, pipeline_state, gr.update(visible=True)
+        except Exception as e:
+            error_msg = styled_error(f"Error loading pipeline: {str(e)}")
+            return UNSELECTED_PIPELINE_NAME, gr.skip(), gr.update(visible=True, value=error_msg)
+
     def single_run(
         self,
         question_id: int,
@@ -242,18 +241,26 @@ class BonusInterface:
             # Process results and prepare visualization data
             html_content, plot_data, output_state = initialize_eval_interface(example, outputs)
             df = process_bonus_results(outputs)
+            step_outputs = [output["step_outputs"] for output in outputs]
 
             return (
                 html_content,
-                gr.update(value=plot_data, label=f"Part Confidence on Question {question_id + 1}"),
                 gr.update(value=output_state),
-                gr.update(value=df, label=f"Model Outputs for Question {question_id + 1}"),
+                gr.update(value=df, label=f"Model Outputs for Question {question_id + 1}", visible=True),
+                gr.update(value=step_outputs, label=f"Step Outputs for Question {question_id + 1}", visible=True),
+                gr.update(visible=False),
             )
         except Exception as e:
             import traceback
 
             error_msg = f"Error: {str(e)}\n{traceback.format_exc()}"
-            return
+            return (
+                gr.skip(),
+                gr.skip(),
+                gr.update(visible=False),
+                gr.update(visible=False),
+                gr.update(visible=True, value=error_msg),
+            )
 
     def evaluate(self, pipeline_state: PipelineState, progress: gr.Progress = gr.Progress()):
         """Evaluate the bonus questions."""
@@ -288,14 +295,15 @@ class BonusInterface:
                 ]
             )
 
-            plot_data = create_scatter_pyplot(part_numbers, part_scores)
+            # plot_data = create_scatter_pyplot(part_numbers, part_scores)
             return (
                 gr.update(value=df, label="Scores on Sample Set"),
-                gr.update(
+                gr.update(visible=False),
             )
         except Exception as e:
+            error_msg = styled_error(f"Error evaluating bonus: {e.args}")
             logger.exception(f"Error evaluating bonus: {e.args}")
-            return
+            return gr.skip(), gr.update(visible=True, value=error_msg)
 
     def submit_model(
         self, model_name: str, description: str, pipeline_state: PipelineState, profile: gr.OAuthProfile = None
@@ -315,19 +323,19 @@ class BonusInterface:
 
         gr.on(
             triggers=[self.app.load],
-            fn=self.
-            outputs=[self.
+            fn=self.get_pipeline_names,
+            outputs=[self.pipeline_selector],
         )
 
+        self.new_loaded_pipeline_state = gr.State(value=None)
+        self.load_btn.click(
+            fn=self.load_pipeline,
+            inputs=[self.pipeline_selector],
+            outputs=[self.pipeline_selector, self.new_loaded_pipeline_state, self.error_display],
+        )
+        self.pipeline_interface.add_triggers_for_pipeline_export(
+            [self.new_loaded_pipeline_state.change], self.new_loaded_pipeline_state
+        )
 
         self.run_btn.click(
             self.pipeline_interface.validate_workflow,
@@ -341,16 +349,17 @@ class BonusInterface:
             ],
             outputs=[
                 self.question_display,
-                self.confidence_plot,
                 self.output_state,
                 self.results_table,
+                self.model_outputs_display,
+                self.error_display,
             ],
         )
 
         self.eval_btn.click(
             fn=self.evaluate,
             inputs=[self.pipeline_interface.pipeline_state],
-            outputs=[self.results_table, self.
+            outputs=[self.results_table, self.error_display],
         )
 
         self.submit_btn.click(
@@ -362,8 +371,3 @@ class BonusInterface:
             ],
             outputs=[self.submit_status],
         )
-        self.hidden_input.change(
-            fn=update_tossup_plot,
-            inputs=[self.hidden_input, self.output_state],
-            outputs=[self.confidence_plot],
-        )
```
src/components/quizbowl/commons.py
CHANGED
```diff
@@ -21,4 +21,6 @@ def get_pipeline_selector(model_options: list[str]):
         choices=model_options,
         value="",
         interactive=True,
+        container=False,
+        elem_classes="pipeline-selector",
     )
```
src/components/quizbowl/plotting.py
CHANGED
```diff
@@ -46,11 +46,11 @@ def _create_token_tooltip_html(values) -> str:
 
     return f"""
     <div class="tooltip card" style="background-color: {color}; border-radius: 8px; padding: 12px; box-shadow: 2px 4px 8px rgba(0, 0, 0, 0.15);">
-        <div class="tooltip-content" style="font-family: 'Arial', sans-serif; color: #
-            <h4 style="margin: 0 0 8px;">💡 Answer</h4>
-            <p style="font-weight: bold; margin: 0 0 8px;">{answer}</p>
-            <p style="margin: 0 0 4px;">📊 <
-            <p style="margin: 0;">🔍 <
+        <div class="tooltip-content" style="font-family: 'Arial', sans-serif; color: #000;">
+            <h4 style="margin: 0 0 8px; color: #000;">💡 Answer</h4>
+            <p style="font-weight: bold; margin: 0 0 8px; color: #000;">{answer}</p>
+            <p style="margin: 0 0 4px; color: #000;">📊 <b style="color: #000;">Confidence:</b> {confidence:.2f}</p>
+            <p style="margin: 0; color: #000;">🔍 <b style="color: #000;">Status:</b> {"✅ Correct" if score else "❌ Incorrect" if buzz else "🚫 No Buzz"}</p>
         </div>
     </div>
     """
```
src/components/quizbowl/populate.py
ADDED
```python
from typing import Optional

import gradio as gr
from loguru import logger

from app_configs import UNSELECTED_PIPELINE_NAME
from components.model_pipeline.model_pipeline import PipelineState, PipelineUIState
from display.formatting import styled_error
from submission import submit


def get_user_submission_names(profile: gr.OAuthProfile | None) -> list[str]:
    if profile is None:
        logger.error("Authentication required. Please log in to view your submissions.")
        return []
    return submit.get_user_submission_names("tossup", profile)


def get_pipeline_names(competition_type: str, profile: gr.OAuthProfile | None) -> list[str]:
    demo_example_names = submit.get_demo_example_submissions(competition_type)
    user_model_names = submit.get_user_submission_names(competition_type, profile)
    all_names = demo_example_names + user_model_names
    logger.info(f"Loaded model names: {all_names}")
    return all_names


def load_pipeline(competition_type: str, model_name: str, profile: gr.OAuthProfile | None) -> Optional[PipelineState]:
    if not model_name or model_name == UNSELECTED_PIPELINE_NAME:
        return None
    username, model_name = model_name.split("/")
    if username == "umdclip":
        workflow = submit.load_demo_example(model_name, competition_type)
    elif profile is not None:
        submission = submit.load_submission(model_name, competition_type, profile)
        workflow = submission.workflow
    else:
        raise gr.Error("Authentication required. Please log in to view your submissions.")
    return PipelineState(workflow=workflow, ui_state=PipelineUIState.from_workflow(workflow))
```
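`load_pipeline` expects fully qualified names of the form `<username>/<model_name>`; the reserved `umdclip` namespace routes to the bundled demo examples, while any other name requires a logged-in user. A usage sketch, where `profile` stands for the `gr.OAuthProfile` that Gradio injects for logged-in users:

```python
# Usage sketch for populate.load_pipeline; profile is the gr.OAuthProfile
# Gradio injects for a logged-in user, or None otherwise.
from components.quizbowl import populate

# Demo pipelines need no login: the "umdclip" namespace maps to examples/.
state = populate.load_pipeline("tossup", "umdclip/simple-tossup-pipeline", profile=None)

# User submissions need authentication; without it, gr.Error is raised.
# state = populate.load_pipeline("bonus", "some-user/my-bonus-model", profile)
```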
src/components/quizbowl/tossup.py
CHANGED
```diff
@@ -7,6 +7,7 @@ import pandas as pd
 from datasets import Dataset
 from loguru import logger
 
+from app_configs import UNSELECTED_PIPELINE_NAME
 from components.model_pipeline.model_pipeline import PipelineInterface, PipelineState, PipelineUIState
 from components.model_pipeline.tossup_pipeline import TossupPipelineInterface, TossupPipelineState
 from display.formatting import styled_error
@@ -14,7 +15,7 @@ from submission import submit
 from workflows.qb_agents import QuizBowlTossupAgent, TossupResult
 from workflows.structs import ModelStep, TossupWorkflow
 
-from . import commons
+from . import commons, populate
 from .plotting import (
     create_scatter_pyplot,
     create_tossup_confidence_pyplot,
@@ -190,8 +191,9 @@ class TossupInterface:
 
     def _render_pipeline_interface(self, workflow: TossupWorkflow, simple: bool = True):
         """Render the model interface."""
-        with gr.Row():
-            self.
+        with gr.Row(elem_classes="bonus-header-row form-inline"):
+            self.pipeline_selector = commons.get_pipeline_selector([])
+            self.load_btn = gr.Button("⬇️ Import Pipeline", variant="secondary")
         self.pipeline_interface = TossupPipelineInterface(
             workflow,
             simple=simple,
@@ -217,10 +219,11 @@ class TossupInterface:
             label="Buzz Confidence",
             format="webp",
         )
-        self.model_outputs_display = gr.JSON(label="Model Outputs", value="{}", visible=False)
+        self.model_outputs_display = gr.JSON(label="Model Outputs", value="{}", show_indices=True, visible=False)
         self.results_table = gr.DataFrame(
             label="Model Outputs",
             value=pd.DataFrame(columns=["Token Position", "Correct?", "Confidence", "Prediction"]),
+            visible=False,
         )
         with gr.Row():
             self.eval_btn = gr.Button("Evaluate", variant="primary")
@@ -285,19 +288,19 @@ class TossupInterface:
         outputs = add_model_scores(outputs, example["clean_answers"], example["run_indices"])
         return outputs
 
-    def get_user_submission_names(self, profile: gr.OAuthProfile | None) -> list[str]:
-        if profile is None:
-            return []
-        model_names = submit.get_user_submission_names("tossup", profile)
-        logger.info("Loaded model names: {model_names}")
-        return gr.update(choices=model_names, value=None)
+    def get_pipeline_names(self, profile: gr.OAuthProfile | None) -> list[str]:
+        names = [UNSELECTED_PIPELINE_NAME] + populate.get_pipeline_names("tossup", profile)
+        return gr.update(choices=names, value=UNSELECTED_PIPELINE_NAME)
 
-    def load_user_submission(self, model_name: str, profile: gr.OAuthProfile | None) -> PipelineState:
-        if profile is None:
-            return styled_error("Authentication required. Please log in to view your submissions.")
-        submission = submit.load_submission(model_name, "tossup", profile)
-        return PipelineState(workflow=submission.workflow, ui_state=PipelineUIState.from_workflow(submission.workflow))
+    def load_pipeline(self, model_name: str, profile: gr.OAuthProfile | None) -> tuple[str, PipelineState]:
+        try:
+            pipeline_state = populate.load_pipeline("tossup", model_name, profile)
+            if pipeline_state is None:
+                return UNSELECTED_PIPELINE_NAME, gr.skip(), gr.update(visible=False)
+            return UNSELECTED_PIPELINE_NAME, pipeline_state, gr.update(visible=True)
+        except Exception as e:
+            error_msg = styled_error(f"Error loading pipeline: {str(e)}")
+            return UNSELECTED_PIPELINE_NAME, gr.skip(), gr.update(visible=True, value=error_msg)
 
     def single_run(
         self,
@@ -322,7 +325,7 @@ class TossupInterface:
                 tokens_html,
                 gr.update(value=output_state),
                 gr.update(value=plot_data, label=f"Buzz Confidence on Question {question_id + 1}"),
-                gr.update(value=df, label=f"Model Outputs for Question {question_id + 1}"),
+                gr.update(value=df, label=f"Model Outputs for Question {question_id + 1}", visible=True),
                 gr.update(value=step_outputs, label=f"Step Outputs for Question {question_id + 1}", visible=True),
                 gr.update(visible=False),
             )
@@ -334,7 +337,7 @@ class TossupInterface:
                 gr.skip(),
                 gr.skip(),
                 gr.skip(),
-                gr.
+                gr.update(visible=False),
                 gr.update(visible=False),
                 gr.update(visible=True, value=error_msg),
             )
@@ -371,7 +374,7 @@ class TossupInterface:
             plot_data = create_scatter_pyplot(token_positions, correctness)
             return (
                 gr.update(value=plot_data, label="Buzz Positions on Sample Set"),
-                gr.update(value=df, label="Scores on Sample Set"),
+                gr.update(value=df, label="Scores on Sample Set", visible=True),
                 gr.update(visible=False),
             )
         except Exception as e:
@@ -380,7 +383,7 @@ class TossupInterface:
             logger.exception(f"Error evaluating tossups: {e.args}")
             return (
                 gr.skip(),
-                gr.
+                gr.update(visible=False),
                 gr.update(visible=True, value=styled_error(f"Error: {str(e)}\n{traceback.format_exc()}")),
             )
 
@@ -400,19 +403,19 @@ class TossupInterface:
 
         gr.on(
             triggers=[self.app.load],
-            fn=self.
-            outputs=[self.
+            fn=self.get_pipeline_names,
+            outputs=[self.pipeline_selector],
        )
 
+        self.new_loaded_pipeline_state = gr.State(value=None)
+        self.load_btn.click(
+            fn=self.load_pipeline,
+            inputs=[self.pipeline_selector],
+            outputs=[self.pipeline_selector, self.new_loaded_pipeline_state, self.error_display],
+        )
+        self.pipeline_interface.add_triggers_for_pipeline_export(
+            [self.new_loaded_pipeline_state.change], self.new_loaded_pipeline_state
+        )
 
         self.run_btn.click(
             self.pipeline_interface.validate_workflow,
```
src/display/custom_css.py
CHANGED
```diff
@@ -1,19 +1,22 @@
 css_pipeline = """
 :root {
     color-scheme: light !important;
-    --block-border-width: 0;
-    --section-header-text-weight: 600;
-    --section-header-text-size: 14px;
-    --input-radius: var(--radius-xl);
-    --font-mono: "Space Mono", monospace;
-    --text-
-    --text-
+    --block-border-width: 0 !important;
+    --section-header-text-weight: 600 !important;
+    --section-header-text-size: 14px !important;
+    --input-radius: var(--radius-xl) !important;
+    --font-mono: "Space Mono", monospace !important;
+    --text-sm: 12px !important;
+    --text-md: 14px !important;
+    --text-lg: 16px !important;
+    --input-text-size: var(--text-sm) !important;
     --body-text-size: 14px !important;
     --input-background-fill-focus: var(--secondary-300) !important;
 
     // Button Colors
     --button-primary-background-fill: var(--primary-800) !important;
     --button-secondary-background-fill: var(--secondary-600) !important;
+    --checkbox-label-text-color: var(--body-text-color) !important;
 
 
     --card-bg-color: #fcecd4;
@@ -23,14 +26,25 @@ css_pipeline = """
     --hover-border-color: #121212;
 }
 
-.dark {
-    --
-    --
+:root .dark {
+    color-scheme: dark !important;
+    --block-border-width: 0 !important;
+    --section-header-text-weight: 600 !important;
+    --section-header-text-size: 14px !important;
+    --input-radius: var(--radius-xl) !important;
+    --font-mono: "Space Mono", monospace !important;
+    --text-sm: 12px !important;
+    --text-md: 14px !important;
+    --text-lg: 16px !important;
+    --input-text-size: var(--text-sm) !important;
+    --body-text-size: 14px !important;
+
+    --button-primary-background-fill: var(--neutral-100) !important;
+    --button-secondary-background-fill: var(--secondary-300) !important;
     --button-primary-text-color: black !important;
-    --button-secondary-text-color: black !important
+    --button-secondary-text-color: black !important;
+    --checkbox-label-text-color: var(--body-text-color) !important;
 
-    --block-border-width: 0;
     --card-bg-color: #383127;
     --answer-bg-color: #1a2b3c;
     --hover-border-color: #ffffff;
@@ -44,10 +58,32 @@ css_pipeline = """
     box-shadow: 0 0 0 0 !important;
 }
 
+.slider-container .wrap {
+    gap: var(--spacing-md) !important;
+}
+
+.json-node {
+    /* On a light background (usually white), use darker and vivid colors */
+    font-size: var(--text-sm) !important;
+    --text-color: #2e2e2e; /* Dark grey text for overall readability */
+    --key-color: #d73a49; /* Bright red for keys */
+    --string-color: #22863a; /* Bold green for strings */
+    --number-color: #0366d6; /* Vivid blue for numbers */
+    --bracket-color: #6f42c1; /* Distinct purple for regular brackets */
+    --square-bracket-color: #e36209; /* Eye-popping orange for square brackets */
+    --punctuation-color: #17a2b8; /* Turquoise punctuation */
+    --line-number-color: #6a737d; /* Used for line numbers if shown */
+    --separator-color: var(--line-number-color);
+}
+
 .head {
     margin-bottom: 0px;
 }
 
+.icon-wrap {
+    right: var(--size-1) !important;
+}
+
 .gradio-container {
     max-width: 1500px;
     margin: 0 auto;
@@ -85,6 +121,14 @@ css_pipeline = """
     font-size: 12px;
 }
 
+.control-panel {
+    gap: var(--spacing-lg) !important;
+}
+
+.toggleable {
+    gap: var(--spacing-xs) !important;
+}
+
 .output-fields-panel {
     background-color: var(--card-bg-color);
     border: 0px solid #e0e0e0 !important;
@@ -143,13 +187,12 @@ css_pipeline = """
 }
 
 .model-dropdown input {
-    font-size: 14px;
     padding-bottom: 2px;
     padding-top: 2px;
 }
 
 .step-name input {
-    font-size:
+    font-size: var(--text-md);
     font-weight: bold;
     padding-bottom: 8px;
     margin-bottom: 4px;
@@ -421,7 +464,7 @@ css_tossup = """
     border-color: #228b22; /* Darker and slightly muted green */
 }
 .tossup-question {
-    line-height: 1.
+    line-height: 1.5;
     padding: 5px;
     margin-left: 4px;
     margin-right: 4px;
```
src/display/guide.py
CHANGED
```diff
@@ -12,7 +12,7 @@ GUIDE_MARKDOWN = """
 ## Competition Rules
 
 ### 🧠 Tossup Questions
-- **Format**: Individual questions
+- **Format**: Individual questions progressively revealed. Questions get easier as they are revealed.
 - **Scoring**:
   - Correct early buzz: +10 points
   - Incorrect early buzz: -5 points
@@ -20,10 +20,16 @@ GUIDE_MARKDOWN = """
 - **Required Outputs**:
   - `answer`: Your predicted answer
   - `confidence`: Score between 0-1
-  -
+  - `buzzer`: When to attempt answering
+    - Configure with confidence threshold (0.0-1.0)
+    - Optional token probability threshold for more control
+    - Combine thresholds using AND/OR logic (buzz when both/either condition met)
 
 ### 🎁 Bonus Questions
-- **Format**:
+- **Format**:
+  - Consists of a `leadin` paragraph that introduces the topic
+  - Followed by three related `parts` (A, B, C) that test specific knowledge
+  - Each part is worth 10 points
 - **Scoring**: +10 points per correct part (max 30)
 - **Required Outputs**:
   - `answer`: Your predicted answer
@@ -40,7 +46,17 @@ GUIDE_MARKDOWN = """
   - System prompt
   - Required outputs
 
-### 2.
+### 2. Using Demo Pipelines
+- Load existing demo pipelines as starting points
+- Modify configurations:
+  - Adjust model parameters
+  - Update system prompts
+  - Change confidence thresholds
+  - Add/remove pipeline steps
+- Save modified versions as new pipelines
+- Test changes incrementally
+
+### 3. Testing Your Pipeline
 1. Select an example question
 2. For Tossup:
    - Set buzz threshold (0.5-1.0)
@@ -50,15 +66,15 @@ GUIDE_MARKDOWN = """
    - Confidence scores
    - Performance metrics
 
-###
+### 4. Evaluation
 - Test on multiple questions
 - Monitor:
   - Accuracy
   - Confidence patterns
   - Response times
 
-###
-1. Log in
+### 5. Submission
+1. Log in to Hugging Face
 2. Name your model
 3. Add description
 4. Submit for evaluation
```
src/envs.py
CHANGED
```diff
@@ -14,10 +14,12 @@ OWNER = (
 )
 # ----------------------------------
 
-REPO_ID = f"{OWNER}/
+REPO_ID = f"{OWNER}/quizbowl-submission"
 QUEUE_REPO = f"{OWNER}/advcal-requests"
 RESULTS_REPO = f"{OWNER}/advcal-results"
 
+EXAMPLES_PATH = "examples"
+
 PLAYGROUND_DATASET_NAMES = {
     "tossup": "umdclip/acf-co24-tossups",
     "bonus": "umdclip/acf-co24-bonuses",
```
src/submission/structs.py
CHANGED
```diff
@@ -3,7 +3,7 @@ from typing import Dict, List, Literal, Optional
 
 from pydantic import BaseModel, Field
 
-from workflows.structs import Workflow
+from workflows.structs import TossupWorkflow, Workflow
 
 CompetitionType = Literal["tossup", "bonus"]
 SubmissionType = Literal["python_file", "simple_workflow", "complex_workflow"]
@@ -54,5 +54,8 @@ class Submission(BaseModel):
     def from_dict(cls, data: Dict) -> "Submission":
         """Create instance from dictionary format used in HF datasets"""
         if data.get("workflow"):
-            data["
+            if data["competition_type"] == "tossup":
+                data["workflow"] = TossupWorkflow.model_validate(data["workflow"])
+            else:
+                data["workflow"] = Workflow.model_validate(data["workflow"])
         return cls.model_validate(data)
```
src/submission/submit.py
CHANGED
```diff
@@ -1,3 +1,4 @@
+import glob
 import json
 import logging
 import os
@@ -5,12 +6,14 @@ import traceback
 from datetime import datetime, timedelta, timezone
 
 import gradio as gr
+import yaml
+from loguru import logger
 
 from app_configs import DAILY_SUBMISSION_LIMIT_PER_USER
 from display.formatting import styled_error, styled_message
-from envs import API, EVAL_REQUESTS_PATH, QUEUE_REPO
+from envs import API, EVAL_REQUESTS_PATH, EXAMPLES_PATH, QUEUE_REPO
 from submission.structs import CompetitionType, Submission, SubmissionStatus
-from workflows.structs import Workflow
+from workflows.structs import TossupWorkflow, Workflow
 
 
 def get_user_submissions(username: str, competition_type: str, pattern: str = None) -> list[Submission]:
@@ -24,18 +27,28 @@ def get_user_submissions(username: str, competition_type: str, pattern: str = None) -> list[Submission]:
             continue
         if pattern is not None and pattern not in file:
             continue
-        with open(os.path.join(out_dir, file), "r") as f:
-            submission = Submission.from_dict(json.load(f))
+        try:
+            with open(os.path.join(out_dir, file), "r") as f:
+                submission = Submission.from_dict(json.load(f))
             submissions.append(submission)
+        except Exception as e:
+            logger.error(f"Error loading submission {file}: {e}")
     return submissions
 
 
 def get_user_submission_names(competition_type: str, profile: gr.OAuthProfile | None) -> list[str]:
     """Get all submission model names for a user."""
     if profile is None:
+        logger.warning("No user profile provided. Returning empty list.")
         return []
     submissions = get_user_submissions(profile.username, competition_type)
-    return [s.model_name for s in submissions]
+    return [f"{s.username}/{s.model_name}" for s in submissions]
+
+
+def get_demo_example_submissions(competition_type: str) -> list[str]:
+    """Get all submissions for a demo example."""
+    examples_dir = f"{EXAMPLES_PATH}/{competition_type}"
+    return [f"umdclip/{os.path.basename(f).removesuffix('.yaml')}" for f in glob.glob(f"{examples_dir}/*.yaml")]
 
 
 def get_user_submissions_today(username: str, competition_type: str) -> list[Submission]:
@@ -170,6 +183,20 @@ def submit_model(
         return styled_error(f"Error submitting model: {str(e)}")
 
 
+def load_demo_example(model_name: str, competition_type: CompetitionType) -> Workflow:
+    """Load a demo example submission."""
+    examples_dir = f"{EXAMPLES_PATH}/{competition_type}"
+    filepath = f"{examples_dir}/{model_name}.yaml"
+    if not os.path.exists(filepath):
+        raise ValueError(f"Demo example file {filepath} not found")
+    with open(filepath, "r") as f:
+        yaml_data = yaml.safe_load(f)
+    if competition_type == "tossup":
+        return TossupWorkflow.model_validate(yaml_data)
+    else:
+        return Workflow.model_validate(yaml_data)
+
+
 def load_submission(model_name: str, competition_type: CompetitionType, profile: gr.OAuthProfile | None) -> Submission:
     if profile is None:
         logging.error("Authentication required. Please log in to view your submissions.")
```
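`get_demo_example_submissions` and `load_demo_example` agree on a simple naming convention: the dropdown entry `umdclip/<stem>` corresponds to `examples/<competition_type>/<stem>.yaml`. A sketch of the round trip:

```python
# Round trip between the two new demo-example helpers:
# list entries as "umdclip/<stem>", then load a stem back from examples/.
names = get_demo_example_submissions("tossup")
# e.g. ["umdclip/simple-tossup-pipeline", "umdclip/two-step-justified-confidence"]

stem = names[0].split("/", 1)[1]              # "simple-tossup-pipeline"
workflow = load_demo_example(stem, "tossup")  # validates the YAML shown earlier
```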
src/workflows/structs.py
CHANGED
```diff
@@ -107,6 +107,9 @@ class ModelStep(BaseModel):
     input_fields: list[InputField]
     output_fields: list[OutputField]
 
+    class Config:
+        use_enum_values = True
+
     def fields(self, field_type: FieldType) -> list[InputField | OutputField]:
         return self.input_fields if field_type == "input" else self.output_fields
 
@@ -252,6 +255,9 @@ class Buzzer(BaseModel):
     confidence_threshold: float = Field(default=0.8, ge=0.0, le=1.0)  # Minimum confidence to trigger a buzz
     prob_threshold: float | None = None  # Optional log probability threshold
 
+    class Config:
+        use_enum_values = True
+
     def run(self, confidence: float, prob: float | None = None, logprob: float | None = None) -> bool:
         """Run the buzzer logic."""
         if logprob is not None and prob is not None:
```