Maharshi Gor committed
Commit cd9f5b3 · 1 Parent(s): f10a835

minor interface refactor
src/components/quizbowl/bonus.py
CHANGED
@@ -2,6 +2,7 @@ import json
 from typing import Any
 
 import gradio as gr
+import numpy as np
 import pandas as pd
 from datasets import Dataset
 from loguru import logger
@@ -36,7 +37,7 @@ def process_bonus_results(results: list[dict]) -> pd.DataFrame:
     )
 
 
-def initialize_eval_interface(example: dict, model_outputs: list[dict]):
+def initialize_eval_interface(example: dict, model_outputs: list[dict], input_vars: list[str]):
     """Initialize the interface with example text."""
     try:
         html_content = create_bonus_html(example["leadin"], example["parts"])
@@ -45,12 +46,20 @@ def initialize_eval_interface(example: dict, model_outputs: list[dict]):
         plot_data = create_bonus_confidence_plot(example["parts"], model_outputs)
 
         # Store state
-        state =
+        state = {"parts": example["parts"], "outputs": model_outputs}
 
-        return html_content, plot_data, state
+        # Preparing step outputs for the model
+        step_outputs = {}
+        for i, output in enumerate(model_outputs):
+            key = f"part {i + 1}"
+            step_outputs[key] = {k: v for k, v in output["step_outputs"].items() if k not in input_vars}
+            if output["logprob"] is not None:
+                step_outputs[key]["output_probability"] = float(np.exp(output["logprob"]))
+
+        return html_content, plot_data, state, step_outputs
     except Exception as e:
         logger.exception(f"Error initializing interface: {e.args}")
-        return f"<div>Error initializing interface: {str(e)}</div>", pd.DataFrame(),
+        return f"<div>Error initializing interface: {str(e)}</div>", pd.DataFrame(), {}, {}
@@ -64,7 +73,7 @@ class BonusInterface:
         self.model_options = model_options
         self.app = app
         self.defaults = defaults
-        self.output_state = gr.State(value=
+        self.output_state = gr.State(value={})
         self.render()
 
     # ------------------------------------- LOAD PIPELINE STATE FROM BROWSER STATE -------------------------------------
@@ -75,10 +84,10 @@ class BonusInterface:
             state_dict = browser_state["bonus"].get("pipeline_state", {})
             pipeline_state = PipelineState.model_validate(state_dict)
             pipeline_state_dict = pipeline_state.model_dump()
-            output_state = browser_state["bonus"].get("output_state",
+            output_state = browser_state["bonus"].get("output_state", {})
         except Exception as e:
             logger.warning(f"Error loading presaved pipeline state: {e}")
-            output_state =
+            output_state = {}
             workflow = self.defaults["init_workflow"]
             pipeline_state_dict = PipelineState.from_workflow(workflow).model_dump()
         return browser_state, not pipeline_change, pipeline_state_dict, output_state
@@ -228,9 +237,10 @@ class BonusInterface:
         outputs = self.get_agent_outputs(example, pipeline_state)
 
         # Process results and prepare visualization data
-        html_content, plot_data, output_state = initialize_eval_interface(example, outputs)
+        html_content, plot_data, output_state, step_outputs = initialize_eval_interface(
+            example, outputs, pipeline_state.workflow.inputs
+        )
         df = process_bonus_results(outputs)
-        step_outputs = [output["step_outputs"] for output in outputs]
 
         return (
             html_content,
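
For reference, the new per-part step-output preparation can be exercised on its own. The sketch below is not part of the commit; the model_outputs records and input_vars list are made-up placeholders that assume the record shape the new code reads (a nested "step_outputs" dict plus an optional "logprob"):

import numpy as np

# Hypothetical bonus model outputs: one record per bonus part.
model_outputs = [
    {"step_outputs": {"question": "...", "guess": "Paris", "confidence": 0.9}, "logprob": -0.11},
    {"step_outputs": {"question": "...", "guess": "Lyon", "confidence": 0.4}, "logprob": None},
]
input_vars = ["question"]  # workflow input variables are filtered out of the display

step_outputs = {}
for i, output in enumerate(model_outputs):
    key = f"part {i + 1}"
    # Keep only the intermediate step outputs, dropping the workflow's own inputs.
    step_outputs[key] = {k: v for k, v in output["step_outputs"].items() if k not in input_vars}
    if output["logprob"] is not None:
        # Convert the answer log-probability into a plain probability for display.
        step_outputs[key]["output_probability"] = float(np.exp(output["logprob"]))

print(step_outputs)
# {'part 1': {'guess': 'Paris', 'confidence': 0.9, 'output_probability': 0.8958...},
#  'part 2': {'guess': 'Lyon', 'confidence': 0.4}}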
src/components/quizbowl/tossup.py
CHANGED
@@ -37,12 +37,12 @@ class ScoredTossupResult(TossupResult):
     token_position: int  # Position in the question where prediction was made
 
 
-def add_model_scores(
+def add_model_scores(run_outputs: list[dict], clean_answers: list[str], run_indices: list[int]) -> list[dict]:
     """Add model scores to the model outputs."""
-    for output
+    for output in run_outputs:
         output["score"] = evaluate_prediction(output["answer"], clean_answers)
-        output["token_position"] =
-    return
+        output["token_position"] = run_indices[output["position"] - 1]
+    return run_outputs
 
 
 def prepare_buzz_evals(
@@ -61,7 +61,11 @@ def prepare_buzz_evals(
 
 
 def initialize_eval_interface(
-    example: dict,
+    example: dict,
+    run_outputs: list[dict],
+    input_vars: list,
+    confidence_threshold: float,
+    prob_threshold: float | None = None,
 ):
     """Initialize the interface with example text."""
     try:
@@ -69,7 +73,7 @@ def initialize_eval_interface(
         run_indices = example["run_indices"]
         answer = example["answer_primary"]
         clean_answers = example["clean_answers"]
-        eval_points =
+        eval_points = [(o["token_position"], o) for o in run_outputs]
 
         if not tokens:
             return "<div>No tokens found in the provided text.</div>", pd.DataFrame(), "{}"
@@ -77,12 +81,21 @@ def initialize_eval_interface(
         plot_data = create_tossup_confidence_pyplot(tokens, eval_points, confidence_threshold, prob_threshold)
 
         # Store tokens, values, and buzzes as JSON for later use
-        state =
-
-        return html_content, plot_data, state
+        state = {"tokens": tokens, "values": eval_points}
+
+        # Preparing step outputs for the model
+        step_outputs = {}
+        for output in run_outputs:
+            tok_pos = output["token_position"]
+            key = "{pos}:{token}".format(pos=tok_pos + 1, token=tokens[tok_pos])
+            step_outputs[key] = {k: v for k, v in output["step_outputs"].items() if k not in input_vars}
+            if output["logprob"] is not None:
+                step_outputs[key]["output_probability"] = float(np.exp(output["logprob"]))
+
+        return html_content, plot_data, state, step_outputs
     except Exception as e:
         logger.exception(f"Error initializing interface: {e.args}")
-        return f"<div>Error initializing interface: {str(e)}</div>", pd.DataFrame(), "{}"
+        return f"<div>Error initializing interface: {str(e)}</div>", pd.DataFrame(), "{}", {}
@@ -119,7 +132,7 @@ class TossupInterface:
         self.model_options = model_options
         self.app = app
         self.defaults = defaults
-        self.output_state = gr.State(value=
+        self.output_state = gr.State(value={})
         self.render()
 
     # ------------------------------------- LOAD PIPELINE STATE FROM BROWSER STATE -------------------------------------
@@ -130,10 +143,10 @@ class TossupInterface:
            state_dict = browser_state["tossup"].get("pipeline_state", {})
            pipeline_state = TossupPipelineState.model_validate(state_dict)
            pipeline_state_dict = pipeline_state.model_dump()
-            output_state = browser_state["tossup"].get("output_state",
+            output_state = browser_state["tossup"].get("output_state", {})
         except Exception as e:
             logger.warning(f"Error loading presaved pipeline state: {e}")
-            output_state =
+            output_state = {}
             workflow = self.defaults["init_workflow"]
             pipeline_state_dict = TossupPipelineState.from_workflow(workflow).model_dump()
         return browser_state, not pipeline_change, pipeline_state_dict, output_state
@@ -282,20 +295,10 @@ class TossupInterface:
         # Process results and prepare visualization data
         confidence_threshold = workflow.buzzer.confidence_threshold
         prob_threshold = workflow.buzzer.prob_threshold
-        tokens_html, plot_data, output_state = initialize_eval_interface(
-            example, outputs, confidence_threshold, prob_threshold
+        tokens_html, plot_data, output_state, step_outputs = initialize_eval_interface(
+            example, outputs, workflow.inputs, confidence_threshold, prob_threshold
         )
         df = process_tossup_results(outputs)
-        tokens = example["question"].split()
-        step_outputs = {}
-        for output in outputs:
-            pos = output["token_position"]
-            token = tokens[pos - 1]
-            key = f"{pos}:{token}"
-            step_outputs[key] = {k: v for k, v in output["step_outputs"].items() if k not in workflow.inputs}
-            if output["logprob"] is not None:
-                step_outputs[key]["logprob"] = output["logprob"]
-                step_outputs[key]["prob"] = float(np.exp(output["logprob"]))
 
         return (
             tokens_html,
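
The tossup variant keys each entry by buzz position rather than part number. Again a standalone sketch with made-up tokens and run_outputs, assuming token_position is a 0-based index into the question tokens (note the removed caller-side code indexed tokens[pos - 1] and stored separate logprob/prob fields, while the new helper indexes tokens[tok_pos] directly and exposes a single output_probability):

import numpy as np

tokens = "This composer wrote the Eroica symphony".split()
# Hypothetical run outputs, one record per evaluated buzz position.
run_outputs = [
    {"token_position": 4, "step_outputs": {"question": "...", "guess": "Beethoven"}, "logprob": -0.05},
]
input_vars = ["question"]

step_outputs = {}
for output in run_outputs:
    tok_pos = output["token_position"]
    # Human-readable key: 1-based position plus the token buzzed on.
    key = "{pos}:{token}".format(pos=tok_pos + 1, token=tokens[tok_pos])
    step_outputs[key] = {k: v for k, v in output["step_outputs"].items() if k not in input_vars}
    if output["logprob"] is not None:
        step_outputs[key]["output_probability"] = float(np.exp(output["logprob"]))

print(step_outputs)
# {'5:Eroica': {'guess': 'Beethoven', 'output_probability': 0.9512...}}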