Maharshi Gor committed
Commit · 38e3800
1 Parent(s): 2900a81

Refactored single step and multi step qb agents into one module as QB Agents.
Files changed:
- src/components/quizbowl/bonus.py  +2 -7
- src/components/quizbowl/tossup.py  +2 -8
- src/workflows/executors.py  +119 -43
- src/workflows/qb/__init__.py  +0 -0
- src/workflows/qb/simple_agent.py  +0 -186
- src/workflows/{qb/multi_step_agent.py → qb_agents.py}  +27 -23
- src/workflows/quizbowl_agent.py  +0 -269
- src/workflows/structs.py  +7 -3
- tests/test_executors.py  +72 -12
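
At the call sites, the consolidation removes the per-step-count branching entirely; a minimal sketch of the new entry points, lifted from the __main__ example in qb_agents.py below (the factory helpers are assumed to exist in workflows.factory exactly as used there):

    from workflows.factory import create_quizbowl_bonus_workflow, create_quizbowl_tossup_workflow
    from workflows.qb_agents import QuizBowlBonusAgent, QuizBowlTossupAgent

    # One agent class per question type, regardless of how many steps the workflow has;
    # execute_workflow() now dispatches on len(workflow.steps) internally.
    tossup_agent = QuizBowlTossupAgent(workflow=create_quizbowl_tossup_workflow(), buzz_threshold=0.9)
    bonus_agent = QuizBowlBonusAgent(workflow=create_quizbowl_bonus_workflow())
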
src/components/quizbowl/bonus.py
CHANGED
@@ -9,8 +9,7 @@ from loguru import logger
 from components.model_pipeline.model_pipeline import PipelineInterface, PipelineState, PipelineUIState
 from display.formatting import styled_error
 from submission import submit
-from workflows.qb.multi_step_agent import MultiStepBonusAgent
-from workflows.qb.simple_agent import SimpleBonusAgent
+from workflows.qb_agents import QuizBowlBonusAgent
 from workflows.structs import ModelStep, Workflow

 from . import commons

@@ -211,11 +210,7 @@ class BonusInterface:
         """Get the model outputs for a given question ID."""
         outputs = []
         leadin = example["leadin"]
-
-        if len(workflow.steps) > 1:
-            agent = MultiStepBonusAgent(workflow)
-        else:
-            agent = SimpleBonusAgent(workflow)
+        agent = QuizBowlBonusAgent(pipeline_state.workflow)

         for i, part in enumerate(example["parts"]):
             # Run model for each part
src/components/quizbowl/tossup.py
CHANGED
@@ -10,8 +10,7 @@ from loguru import logger
 from components.model_pipeline.model_pipeline import PipelineInterface, PipelineState, PipelineUIState
 from display.formatting import styled_error
 from submission import submit
-from workflows.qb.multi_step_agent import MultiStepTossupAgent
-from workflows.qb.simple_agent import SimpleTossupAgent
+from workflows.qb_agents import QuizBowlTossupAgent
 from workflows.structs import ModelStep, Workflow

 from . import commons

@@ -275,12 +274,7 @@ class TossupInterface:
         tokens = example["question"].split()
         for run_idx in example["run_indices"]:
             question_runs.append(" ".join(tokens[: run_idx + 1]))
-
-        workflow = pipeline_state.workflow
-        if len(workflow.steps) > 1:
-            agent = MultiStepTossupAgent(workflow, buzz_threshold)
-        else:
-            agent = SimpleTossupAgent(workflow, buzz_threshold)
+        agent = QuizBowlTossupAgent(pipeline_state.workflow, buzz_threshold)
         outputs = list(agent.run(question_runs, early_stop=early_stop))
         outputs = add_model_scores(outputs, example["clean_answers"], example["run_indices"])
         return outputs
src/workflows/executors.py
CHANGED
@@ -1,5 +1,4 @@
 # %%
-import json
 from typing import Any

 import pydantic

@@ -178,42 +177,10 @@ def execute_model_step(
     return outputs


-# Example usage
-if __name__ == "__main__":
-    # Define a simple model step
-    model_step = ModelStep(
-        id="step1",
-        model="gpt-4o-mini",
-        provider="OpenAI",
-        call_type="llm",
-        system_prompt="You are a simple NLP tool that takes a string, and a number N, and return the first N entities in the string, and the total count of entities in the string.",
-        input_fields=[
-            InputField(name="sentence", description="The sentence to process", variable="sentence", func=None),
-            InputField(name="n", description="The number of entities to return", variable="n", func=None),
-        ],
-        output_fields=[
-            OutputField(
-                name="entities",
-                description="The first N entities in the string as a list of strings",
-                type="list[str]",
-                func=None,
-            ),
-            OutputField(name="count", description="The total count of entities in the string", type="int", func=None),
-        ],
-    )
-
-    # Define processed inputs
-    processed_inputs = {"sentence": "Abdul Akbar is a good person, but Jesus is the son of God.", "n": 3}
-
-    # Execute the model step
-    outputs = execute_model_step(model_step, processed_inputs)
-    print(outputs)
-
-
 # %%
-def execute_workflow(
+def execute_multi_step_workflow(
     workflow: Workflow, input_values: dict[str, Any], return_full_content: bool = False
-) -> dict[str, Any]:
+) -> tuple[dict[str, Any], dict[str, Any], dict[str, Any]]:
     """
     Execute the given workflow as a computational graph.

@@ -234,10 +201,14 @@ def execute_workflow(
             dependencies, and input/output specifications.
         input_values (dict[str, Any]): External input values to be used by the workflow.
            Keys should match the required workflow.inputs.
+        return_full_content (bool, optional): If True, returns the full content of each step.
+            Defaults to False.

     Returns:
-
-
+        A tuple containing:
+        - A dictionary of the workflow's outputs, with keys matching the variables defined in workflow.outputs.
+        - A dictionary of all computed values during workflow execution, including intermediate results.
+        - A dictionary of step contents, only populated if return_full_content is True.

     Raises:
         UnknownVariableError: If an input_field references a variable that is not

@@ -255,12 +226,12 @@ def execute_workflow(
         ...         "analyze": ModelStep(...)  # A step that analyzes the entities
         ...     },
         ...     inputs=["text"],
-        ...     outputs=
+        ...     outputs={"sentiment": "analyze.sentiment", "entities": "extract.entities"}
         ... )
-        >>>
-        >>> print(
+        >>> final_outputs, computed_values, step_contents = execute_workflow(workflow, {"text": "Apple is launching a new product tomorrow."})
+        >>> print(final_outputs["sentiment"])
        "positive"
-        >>> print(
+        >>> print(final_outputs["entities"])
        ["Apple", "product"]
    """
    # Step 1: Pre-populate computed values with external workflow inputs.

@@ -280,11 +251,15 @@ def execute_workflow(
    execution_order = topological_sort(dependencies)

    # Step 4: Execute steps in topological order.
+    step_contents: dict[str, Any] = {}
    for step_id in execution_order:
        step = workflow.steps[step_id]

+        outputs = execute_model_step(step, computed_values, return_full_content=return_full_content)
        # Execute the step
-
+        if return_full_content:
+            outputs, content = outputs
+            step_contents[step_id] = content
        outputs = {f"{step_id}.{k}": v for k, v in outputs.items()}
        computed_values.update(outputs)

@@ -295,7 +270,77 @@ def execute_workflow(
            raise WorkflowError(f"Workflow output variable {var} was not produced")
        final_outputs[target] = computed_values[var]

-
+    step_outputs = {k: v for k, v in computed_values.items() if k not in workflow.inputs}
+
+    return final_outputs, step_outputs, step_contents
+
+
+def execute_simple_workflow(
+    workflow: Workflow, input_values: dict[str, Any], return_full_content: bool = False
+) -> tuple[dict[str, Any], dict[str, Any], dict[str, Any]]:
+    """Execute a simple workflow with a single step.
+
+    This is a simplified version of execute_workflow for workflows with only one step.
+
+    Args:
+        workflow: The workflow to execute
+        input_values: Dictionary of input values
+        return_full_content: Whether to return the full content of each step
+
+    Returns:
+        Tuple containing:
+        - final_outputs: Dictionary of workflow outputs
+        - computed_values: Dictionary of all computed values
+        - step_contents: Dictionary of step contents (if return_full_content=True)
+
+    Raises:
+        WorkflowError: If the workflow has more than one step
+    """
+    if len(workflow.steps) != 1:
+        raise WorkflowError("Simple workflow must have exactly one step")
+
+    # Get the single step
+    step = list(workflow.steps.values())[0]
+
+    # Validate inputs
+    for var in workflow.inputs:
+        if var not in input_values:
+            raise WorkflowError(f"Missing required workflow input: {var}")
+
+    # Execute the step
+    if return_full_content:
+        step_outputs, content = execute_model_step(step, input_values, return_full_content=True)
+        step_contents = {step.id: content}
+    else:
+        step_outputs = execute_model_step(step, input_values, return_full_content=False)
+        step_contents = {}
+
+    # Prepare the final outputs
+    final_outputs = {}
+    for target, var in workflow.outputs.items():
+        if var.startswith(f"{step.id}."):
+            output_key = var.split(".", 1)[1]
+            if output_key in step_outputs:
+                final_outputs[target] = step_outputs[output_key]
+            else:
+                raise WorkflowError(f"Workflow output variable {var} was not produced")
+        else:
+            raise WorkflowError(f"Invalid output mapping: {var} does not match step ID {step.id}")
+
+    # Prepare computed values (prefixed with step ID)
+    computed_values = input_values.copy()
+    computed_values.update({f"{step.id}.{k}": v for k, v in step_outputs.items()})
+
+    return final_outputs, computed_values, step_contents
+
+
+def execute_workflow(
+    workflow: Workflow, input_values: dict[str, Any], return_full_content: bool = False
+) -> tuple[dict[str, Any], dict[str, Any], dict[str, Any]]:
+    if len(workflow.steps) > 1:
+        return execute_multi_step_workflow(workflow, input_values, return_full_content)
+    else:
+        return execute_simple_workflow(workflow, input_values, return_full_content)


 def run_examples():

@@ -438,3 +483,34 @@ if __name__ == "__main__":
     run_examples()

 # %%
+
+# Example usage
+if __name__ == "__main__":
+    # Define a simple model step
+    model_step = ModelStep(
+        id="step1",
+        model="gpt-4o-mini",
+        provider="OpenAI",
+        call_type="llm",
+        system_prompt="You are a simple NLP tool that takes a string, and a number N, and return the first N entities in the string, and the total count of entities in the string.",
+        input_fields=[
+            InputField(name="sentence", description="The sentence to process", variable="sentence", func=None),
+            InputField(name="n", description="The number of entities to return", variable="n", func=None),
+        ],
+        output_fields=[
+            OutputField(
+                name="entities",
+                description="The first N entities in the string as a list of strings",
+                type="list[str]",
+                func=None,
+            ),
+            OutputField(name="count", description="The total count of entities in the string", type="int", func=None),
+        ],
+    )
+
+    # Define processed inputs
+    processed_inputs = {"sentence": "Abdul Akbar is a good person, but Jesus is the son of God.", "n": 3}
+
+    # Execute the model step
+    outputs = execute_model_step(model_step, processed_inputs)
+    print(outputs)
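
The executors change is easiest to read from the return value: both execute_*_workflow variants now return the same three-element tuple, and execute_workflow() is a thin dispatcher over them. A short usage sketch (the workflow is assumed to be the two-step extract/analyze example from the docstring above; key names follow that example):

    final_outputs, computed_values, step_contents = execute_workflow(
        workflow, {"text": "Apple is launching a new product tomorrow."}, return_full_content=True
    )
    print(final_outputs["sentiment"])            # mapped workflow outputs, e.g. "positive"
    print(computed_values["extract.entities"])   # intermediate values, keyed "<step_id>.<field>"
    print(step_contents["analyze"])              # raw model content per step, only filled when return_full_content=True
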
src/workflows/qb/__init__.py
DELETED
File without changes
src/workflows/qb/simple_agent.py
DELETED
@@ -1,186 +0,0 @@
-import time
-from typing import Any, Iterable
-
-# from litellm import completion
-from llms import completion
-from workflows.executors import execute_model_step, execute_workflow
-from workflows.structs import ModelStep, Workflow
-
-
-def _get_agent_response(self, prompt: str, system_prompt: str) -> dict:
-    """Get response from the LLM model."""
-    messages = [{"role": "system", "content": system_prompt}, {"role": "user", "content": prompt}]
-
-    start_time = time.time()
-    response = completion(
-        model=self.model,
-        messages=messages,
-        temperature=self.temperature,
-        max_tokens=150,  # Limit token usage for faster responses
-    )
-    response_time = time.time() - start_time
-
-    return response, response_time
-
-
-def _get_model_step_response(
-    model_step: ModelStep, available_vars: dict[str, Any]
-) -> tuple[dict[str, Any], str, float]:
-    """Get response from the LLM model."""
-    start_time = time.time()
-    response, content = execute_model_step(model_step, available_vars, return_full_content=True)
-    response_time = time.time() - start_time
-    return response, content, response_time
-
-
-class SimpleTossupAgent:
-    external_input_variable = "question_text"
-    output_variables = ["answer", "confidence"]
-
-    def __init__(self, workflow: Workflow, buzz_threshold: float):
-        steps = list(workflow.steps.values())
-        assert len(steps) == 1, "Only one step is allowed in a simple workflow"
-        self.model_step = steps[0]
-        self.buzz_threshold = buzz_threshold
-        self.output_variables = list(workflow.outputs.keys())
-
-        if self.external_input_variable not in workflow.inputs:
-            raise ValueError(f"External input variable {self.external_input_variable} not found in model step inputs")
-
-        for out_var in self.output_variables:
-            if out_var not in workflow.outputs:
-                raise ValueError(f"Output variable {out_var} not found in the workflow outputs")
-
-    def run(self, question_runs: list[str], early_stop: bool = True) -> Iterable[dict]:
-        """
-        Process a tossup question and decide when to buzz based on confidence.
-
-        Args:
-            question_runs: Progressive reveals of the question text
-            early_stop: Whether to stop after the first buzz
-
-        Yields:
-            Dict with answer, confidence, and whether to buzz
-        """
-
-        for i, question_text in enumerate(question_runs):
-            response, content, response_time = _get_model_step_response(
-                self.model_step, {self.external_input_variable: question_text}
-            )
-            buzz = response["confidence"] >= self.buzz_threshold
-            result = {
-                "answer": response["answer"],
-                "confidence": response["confidence"],
-                "buzz": buzz,
-                "question_fragment": question_text,
-                "position": i + 1,
-                "full_response": content,
-                "response_time": response_time,
-            }
-
-            yield result
-
-            # If we've reached the confidence threshold, buzz and stop
-            if early_stop and buzz:
-                return
-
-
-class SimpleBonusAgent:
-    external_input_variables = ["leadin", "part"]
-    output_variables = ["answer", "confidence", "explanation"]
-
-    def __init__(self, workflow: Workflow):
-        steps = list(workflow.steps.values())
-        assert len(steps) == 1, "Only one step is allowed in a simple workflow"
-        self.model_step = steps[0]
-        self.output_variables = list(workflow.outputs.keys())
-
-        # Validate input variables
-        for input_var in self.external_input_variables:
-            if input_var not in workflow.inputs:
-                raise ValueError(f"External input variable {input_var} not found in model step inputs")
-
-        # Validate output variables
-        for out_var in self.output_variables:
-            if out_var not in workflow.outputs:
-                raise ValueError(f"Output variable {out_var} not found in the workflow outputs")
-
-    def run(self, leadin: str, part: str) -> dict:
-        """
-        Process a bonus part with the given leadin.
-
-        Args:
-            leadin: The leadin text for the bonus question
-            part: The specific part text to answer
-
-        Returns:
-            Dict with answer, confidence, and explanation
-        """
-        response, content, response_time = _get_model_step_response(
-            self.model_step,
-            {
-                "leadin": leadin,
-                "part": part,
-            },
-        )
-
-        return {
-            "answer": response["answer"],
-            "confidence": response["confidence"],
-            "explanation": response["explanation"],
-            "full_response": content,
-            "response_time": response_time,
-        }
-
-
-# Example usage
-if __name__ == "__main__":
-    # Load the Quizbowl dataset
-    from datasets import load_dataset
-
-    from workflows.factory import create_quizbowl_bonus_step_initial_setup, create_quizbowl_simple_step_initial_setup
-
-    ds_name = "umdclip/leaderboard_co_set"
-    ds = load_dataset(ds_name, split="train")
-
-    # Create the agents
-    tossup_step = create_quizbowl_simple_step_initial_setup()
-    tossup_step.model = "gpt-4"
-    tossup_step.provider = "openai"
-    tossup_agent = SimpleTossupAgent(workflow=tossup_step, buzz_threshold=0.9)
-
-    bonus_step = create_quizbowl_bonus_step_initial_setup()
-    bonus_step.model = "gpt-4"
-    bonus_step.provider = "openai"
-    bonus_agent = SimpleBonusAgent(workflow=bonus_step)
-
-    # Example for tossup mode
-    print("\n=== TOSSUP MODE EXAMPLE ===")
-    sample_question = ds[30]
-    print(sample_question["question_runs"][-1])
-    print(sample_question["gold_label"])
-    print()
-    question_runs = sample_question["question_runs"]
-
-    results = tossup_agent.run(question_runs, early_stop=True)
-    for result in results:
-        print(result["full_response"])
-        print(f"Guess at position {result['position']}: {result['answer']}")
-        print(f"Confidence: {result['confidence']}")
-        if result["buzz"]:
-            print("Buzzed!\n")
-
-    # Example for bonus mode
-    print("\n=== BONUS MODE EXAMPLE ===")
-    sample_bonus = ds[31]  # Assuming this is a bonus question
-    leadin = sample_bonus["leadin"]
-    parts = sample_bonus["parts"]
-
-    print(f"Leadin: {leadin}")
-    for i, part in enumerate(parts):
-        print(f"\nPart {i + 1}: {part['part']}")
-        result = bonus_agent.run(leadin, part["part"])
-        print(f"Answer: {result['answer']}")
-        print(f"Confidence: {result['confidence']}")
-        print(f"Explanation: {result['explanation']}")
-        print(f"Response time: {result['response_time']:.2f}s")
src/workflows/{qb/multi_step_agent.py → qb_agents.py}
RENAMED
@@ -5,15 +5,19 @@ from workflows.executors import execute_workflow
 from workflows.structs import Workflow


-def _get_workflow_response(
+def _get_workflow_response(
+    workflow: Workflow, available_vars: dict[str, Any]
+) -> tuple[dict[str, Any], dict[str, Any], dict[str, Any], float]:
     """Get response from executing a complete workflow."""
     start_time = time.time()
-
+    final_outputs, computed_values, step_contents = execute_workflow(
+        workflow, available_vars, return_full_content=True
+    )
     response_time = time.time() - start_time
-    return
+    return final_outputs, computed_values, step_contents, response_time


-class MultiStepTossupAgent:
+class QuizBowlTossupAgent:
     """Agent for handling tossup questions with multiple steps in the workflow."""

     external_input_variable = "question_text"

@@ -53,26 +57,26 @@ class MultiStepTossupAgent:
                 - buzz: Whether to buzz
                 - question_fragment: Current question text
                 - position: Current position in question
-                -
+                - step_contents: String content outputs of each step
                 - response_time: Time taken for response
                 - step_outputs: Outputs from each step
        """
        for i, question_text in enumerate(question_runs):
            # Execute the complete workflow
-
+            final_outputs, computed_values, step_contents, response_time = _get_workflow_response(
                self.workflow, {self.external_input_variable: question_text}
            )
-
-            buzz =
+            print(f"Workflow response: {final_outputs}")
+            buzz = final_outputs["confidence"] >= self.buzz_threshold
            result = {
-                "answer":
-                "confidence":
+                "answer": final_outputs["answer"],
+                "confidence": final_outputs["confidence"],
                "buzz": buzz,
                "question_fragment": question_text,
                "position": i + 1,
-                "
+                "step_contents": step_contents,
                "response_time": response_time,
-                "step_outputs":
+                "step_outputs": computed_values,  # Include intermediate step outputs
            }

            yield result

@@ -82,7 +86,7 @@ class MultiStepTossupAgent:
                return


-class MultiStepBonusAgent:
+class QuizBowlBonusAgent:
    """Agent for handling bonus questions with multiple steps in the workflow."""

    external_input_variables = ["leadin", "part"]

@@ -119,11 +123,11 @@ class MultiStepBonusAgent:
                - answer: The model's answer
                - confidence: Confidence score
                - explanation: Explanation for the answer
-                -
+                - step_contents: String content outputs of each step
                - response_time: Time taken for response
                - step_outputs: Outputs from each step
        """
-
+        final_outputs, computed_values, step_contents, response_time = _get_workflow_response(
            self.workflow,
            {
                "leadin": leadin,

@@ -132,12 +136,12 @@ class MultiStepBonusAgent:
        )

        return {
-            "answer":
-            "confidence":
-            "explanation":
-            "
+            "answer": final_outputs["answer"],
+            "confidence": final_outputs["confidence"],
+            "explanation": final_outputs["explanation"],
+            "step_contents": step_contents,
            "response_time": response_time,
-            "step_outputs":
+            "step_outputs": computed_values,  # Include intermediate step outputs
        }


@@ -153,10 +157,10 @@ if __name__ == "__main__":

    # Create the agents with multi-step workflows
    tossup_workflow = create_quizbowl_tossup_workflow()
-    tossup_agent =
+    tossup_agent = QuizBowlTossupAgent(workflow=tossup_workflow, buzz_threshold=0.9)

    bonus_workflow = create_quizbowl_bonus_workflow()
-    bonus_agent =
+    bonus_agent = QuizBowlBonusAgent(workflow=bonus_workflow)

    # Example for tossup mode
    print("\n=== TOSSUP MODE EXAMPLE ===")

@@ -168,7 +172,7 @@ if __name__ == "__main__":

    results = tossup_agent.run(question_runs, early_stop=True)
    for result in results:
-        print(result["
+        print(result["step_contents"])
        print(f"Guess at position {result['position']}: {result['answer']}")
        print(f"Confidence: {result['confidence']}")
        print("Step outputs:", result["step_outputs"])
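
For reference, each dict yielded by QuizBowlTossupAgent.run() now carries both the final answer fields and the per-step intermediates; a sketch of a consumer loop (question_runs is assumed to be the list of progressive question reveals, as in the __main__ block above):

    agent = QuizBowlTossupAgent(workflow=create_quizbowl_tossup_workflow(), buzz_threshold=0.9)
    for result in agent.run(question_runs, early_stop=True):
        # Keys per result: answer, confidence, buzz, question_fragment, position,
        # step_contents, response_time, step_outputs.
        print(result["position"], result["answer"], result["confidence"])
        if result["buzz"]:
            break  # with early_stop=True the generator stops after the first buzz anyway
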
src/workflows/quizbowl_agent.py
DELETED
@@ -1,269 +0,0 @@
-# %%
-import json
-import os
-import time
-from typing import Dict, Iterable, List, Optional, Tuple, Union
-
-import litellm
-from datasets import load_dataset
-from litellm import completion
-
-litellm.drop_params = True
-
-# Set your API key - you can replace this with your actual key or use environment variables
-os.environ["OPENAI_API_KEY"] = (
-    "sk-proj-..."
-)
-
-DEFAULT_SYS_PROMPT = """
-You are a Quizbowl expert. You will be given a question that's progressively revealed.
-Your goal is to identify the answer as quickly as possible with high confidence.
-Respond with a JSON object with two fields:
-1. "answer": Your best guess for the answer
-2. "confidence": Your confidence in your answer from 0.0 to 1.0
-
-DO NOT include any explanation. ONLY return the JSON object.
-"""
-
-
-class QuizbowlAgent:
-    """
-    An agent for playing Quizbowl with two modes:
-    1. Tossup mode: Fast and direct with confidence calibration for buzzing
-    2. Bonus round mode: Provides guess, rationale, and confidence
-    """
-
-    def __init__(
-        self,
-        model: str = "gpt-4o-mini",
-        buzz_threshold: float = 0.85,
-        temperature: float = 0.2,
-        system_prompt: str = DEFAULT_SYS_PROMPT,
-    ):
-        """
-        Initialize the QuizbowlAgent.
-
-        Args:
-            model: The LLM model to use for answering
-            buzz_threshold: Confidence threshold for buzzing in tossup mode (0-1)
-            temperature: Temperature for model sampling
-        """
-        self.model = model
-        self.buzz_threshold = buzz_threshold
-        self.temperature = temperature
-        self.system_prompt = system_prompt
-
-    def _process_question_runs(self, question_runs: List[str]) -> List[str]:
-        """Process question runs to extract increasing amounts of text."""
-        # For simpler testing, just return the runs as they are in the dataset
-        return question_runs
-
-    def _get_agent_response(self, prompt: str, system_prompt: str) -> Dict:
-        """Get response from the LLM model."""
-        messages = [{"role": "system", "content": system_prompt}, {"role": "user", "content": prompt}]
-
-        start_time = time.time()
-        response = completion(
-            model=self.model,
-            messages=messages,
-            temperature=self.temperature,
-            max_tokens=150,  # Limit token usage for faster responses
-        )
-        response_time = time.time() - start_time
-
-        return response, response_time
-
-    def _extract_confidence_and_answer(self, content: str) -> Tuple[str, float]:
-        """Extract the answer and confidence score from the model response."""
-        try:
-            # Try to parse JSON from the response
-            data = json.loads(content)
-            answer = data.get("answer", "")
-            confidence = float(data.get("confidence", 0.0))
-            return answer, confidence
-        except (json.JSONDecodeError, ValueError):
-            # Fallback if parsing fails
-            lines = content.strip().split("\n")
-            answer = lines[0] if lines else ""
-            confidence = 0.5  # Default confidence
-
-            # Try to extract confidence from text
-            for line in lines:
-                if "confidence:" in line.lower():
-                    try:
-                        confidence = float(line.lower().split("confidence:")[1].strip())
-                    except (ValueError, IndexError):
-                        pass
-
-            return answer, confidence
-
-    def tossup_mode(self, question_runs: List[str]) -> Iterable[Dict]:
-        """
-        Process a tossup question and decide when to buzz based on confidence.
-
-        Args:
-            question_runs: Progressive reveals of the question text
-
-        Yields:
-            Dict with answer, confidence, and whether to buzz
-        """
-
-        for i, question_text in enumerate(question_runs):
-            prompt = f"Question: {question_text}\n\nProvide your answer and confidence level:"
-
-            response, response_time = self._get_agent_response(prompt, DEFAULT_SYS_PROMPT)
-            content = response.choices[0].message.content
-
-            answer, confidence = self._extract_confidence_and_answer(content)
-
-            result = {
-                "answer": answer,
-                "confidence": confidence,
-                "buzz": confidence >= self.buzz_threshold,
-                "question_fragment": question_text,
-                "position": i + 1,
-                "full_response": content,
-                "response_time": response_time,
-            }
-
-            yield result
-
-            # If we've reached the confidence threshold, buzz and stop
-            if confidence >= self.buzz_threshold:
-                return
-
-    def tossup_mode_top5(self, question_runs: List[str]) -> Iterable[Dict]:
-        """
-        Process a tossup question and provide the top 5 guesses with confidence levels.
-
-        Args:
-            question_runs: Progressive reveals of the question text
-
-        Returns:
-            Dict with top 5 answers, their confidences, and whether to buzz
-        """
-
-        for i, question_text in enumerate(question_runs):
-            prompt = f"Question: {question_text}\n\nProvide your top 5 answers and confidence levels."
-
-            response, response_time = self._get_agent_response(prompt, self.system_prompt)
-            content = response.choices[0].message.content
-
-            try:
-                # Try to parse JSON from the response
-                data = json.loads(content)
-                guesses = data.get("guesses", [])
-            except (json.JSONDecodeError, ValueError):
-                # Fallback if parsing fails
-                guesses = []
-
-            result = {
-                "guesses": guesses,
-                "buzz": any(guess["confidence"] >= self.buzz_threshold for guess in guesses),
-                "question_fragment": question_text,
-                "position": i + 1,
-                "full_response": content,
-                "response_time": response_time,
-            }
-
-            yield result
-
-            # If any guess reaches the confidence threshold, buzz and stop
-            if result["buzz"]:
-                return
-
-    def bonus_round_mode(self, question: str) -> Dict:
-        """
-        Process a bonus round question with detailed analysis.
-
-        Args:
-            question: The bonus question text
-
-        Returns:
-            Dict with answer, rationale, and confidence
-        """
-        system_prompt = """
-        You are a Quizbowl expert answering a bonus question. Provide:
-        1. Your direct answer
-        2. A very brief and crisp one line rationale for your answer (key clues that led to it)
-        3. Your confidence level (0.0-1.0)
-
-        Respond with a JSON object with these three fields:
-        {
-            "answer": "Your answer here",
-            "rationale": "Your reasoning here",
-            "confidence": 0.XX
-        }
-        """
-
-        prompt = f"Bonus Question: {question}\n\nProvide your answer, rationale, and confidence:"
-
-        response = self._get_agent_response(prompt, system_prompt)
-        content = response.choices[0].message.content
-
-        try:
-            # Try to parse JSON
-            result = json.loads(content)
-            # Ensure all fields are present
-            if not all(k in result for k in ["answer", "rationale", "confidence"]):
-                raise ValueError("Missing fields in response")
-        except (json.JSONDecodeError, ValueError):
-            # If parsing fails, extract manually
-            lines = content.strip().split("\n")
-            result = {"answer": "", "rationale": "", "confidence": 0.5}
-
-            for line in lines:
-                if line.lower().startswith("answer:"):
-                    result["answer"] = line[7:].strip()
-                elif line.lower().startswith("rationale:"):
-                    result["rationale"] = line[10:].strip()
-                elif line.lower().startswith("confidence:"):
-                    try:
-                        result["confidence"] = float(line[11:].strip())
-                    except ValueError:
-                        pass
-
-        return result
-
-
-# %%
-# Example usage
-if __name__ == "__main__":
-    # Load the Quizbowl dataset
-    ds_name = "umdclip/leaderboard_co_set"
-    ds = load_dataset(ds_name, split="train")
-
-    # Create the agent
-    agent = QuizbowlAgent(model="gpt-4-turbo", buzz_threshold=0.85)
-
-    # Example for tossup mode
-    print("\n=== TOSSUP MODE EXAMPLE ===")
-    sample_question = ds[0]
-    print(sample_question["question_runs"][-1])
-    print(sample_question["gold_label"])
-    question_runs = sample_question["question_runs"]
-
-    results = agent.tossup_mode(question_runs)
-    for result in results:
-        print(f"Guess at position {result['position']}: {result['answer']}")
-        print(f"Confidence: {result['confidence']}")
-        if result["buzz"]:
-            print("Buzzed!\n")
-
-    results = agent.tossup_mode_top5(question_runs)
-    for result in results:
-        guesses = [f"{guess['answer']} ({guess['confidence']})" for guess in result["guesses"]]
-        print(f"Guesses at position {result['position']}: {', '.join(guesses)}")
-        if result["buzz"]:
-            print("Buzzed!")
-
-    # Example for bonus round mode
-    print("\n=== BONUS ROUND MODE EXAMPLE ===")
-    bonus_question = sample_question["question_runs"][-1]
-
-    bonus_result = agent.bonus_round_mode(bonus_question)
-    print(f"Answer: {bonus_result['answer']}")
-    print(f"Rationale: {bonus_result['rationale']}")
-    print(f"Confidence: {bonus_result['confidence']}")
-
-# %%
src/workflows/structs.py
CHANGED
@@ -202,9 +202,13 @@ class Workflow(BaseModel):
         if "steps" in data and isinstance(data["steps"], list):
             steps_dict = {}
             for step in data["steps"]:
-                if step
-
-
+                if isinstance(step, ModelStep):
+                    step_id = step.id
+                else:
+                    step_id = step["id"]
+                if step_id in steps_dict:
+                    raise ValueError(f"Duplicate step ID: {step_id}")
+                steps_dict[step_id] = step
             data["steps"] = steps_dict
         return data

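
The validator change means Workflow.steps can now be declared as a list of ModelStep objects (or plain dicts carrying an "id"), which is normalized into the id-keyed mapping, with duplicate IDs rejected. A small sketch using the test fixtures' field values (the field values themselves are placeholders, not meaningful configuration):

    step = ModelStep(
        id="summarize", name="Summarize Text", model="gpt-3.5-turbo", provider="openai",
        call_type="llm", system_prompt="Summarize the text", input_fields=[], output_fields=[],
    )
    # Both forms are accepted; the list form is normalized to {"summarize": step}.
    wf = Workflow(steps=[step], inputs=["input_text"], outputs={"summary": "summarize.summary"})
    # Repeating an ID in the list form now raises ValueError("Duplicate step ID: summarize").
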
tests/test_executors.py
CHANGED
@@ -45,6 +45,7 @@ def test_create_processed_inputs_basic():
     """Test basic input processing without transformations."""
     step = ModelStep(
         id="test_step",
+        name="Test Step",
         model="gpt-4",
         provider="openai",
         call_type="llm",

@@ -62,6 +63,7 @@ def test_create_processed_inputs_with_transformation():
     """Test input processing with transformation functions."""
     step = ModelStep(
         id="test_step",
+        name="Test Step",
         model="gpt-4",
         provider="openai",
         call_type="llm",

@@ -82,6 +84,7 @@ def test_create_processed_inputs_missing_var():
     """Test that appropriate error is raised when a variable is missing."""
     step = ModelStep(
         id="test_step",
+        name="Test Step",
         model="gpt-4",
         provider="openai",
         call_type="llm",

@@ -99,6 +102,7 @@ def test_create_processed_inputs_unknown_func():
     """Test that appropriate error is raised when an unknown function is specified."""
     step = ModelStep(
         id="test_step",
+        name="Test Step",
         model="gpt-4",
         provider="openai",
         call_type="llm",

@@ -116,18 +120,22 @@
 # Tests for execute_model_step


-@patch("workflows.executors.
+@patch("workflows.executors.completion")
 def test_execute_model_step_success(mock_completion):
     """Test successful execution of a model step with mocked litellm response."""
     # Mock the litellm response
-    mock_response = {
+    mock_response = {
+        "content": json.dumps({"summary": "This is a summary"}),
+        "output": {"summary": "This is a summary"},
+    }
     mock_completion.return_value = mock_response

     # Create a test step
     step = ModelStep(
         id="summarize",
+        name="Summarize Text",
         model="gpt-3.5-turbo",
-        provider="
+        provider="OpenAI",
         call_type="llm",
         system_prompt="Summarize the text",
         input_fields=[InputField(name="text", description="Text to summarize", variable="input_text")],

@@ -143,11 +151,11 @@
     # Verify the litellm call was made correctly
     mock_completion.assert_called_once()
     args, kwargs = mock_completion.call_args
-    assert kwargs["model"] == "gpt-3.5-turbo"
-    assert "Summarize the text" in kwargs["
+    assert kwargs["model"] == "OpenAI/gpt-3.5-turbo"
+    assert "Summarize the text" in kwargs["system"]


-@patch("workflows.executors.
+@patch("workflows.executors.completion")
 def test_execute_model_step_error(mock_completion):
     """Test handling of errors in model step execution."""
     # Make litellm raise an exception

@@ -156,6 +164,7 @@ def test_execute_model_step_error(mock_completion):
     # Create a test step
     step = ModelStep(
         id="summarize",
+        name="Summarize Text",
         model="gpt-3.5-turbo",
         provider="openai",
         call_type="llm",

@@ -181,6 +190,7 @@ def test_execute_workflow_simple(mock_execute_step):
     # Create a simple workflow
     step = ModelStep(
         id="summarize",
+        name="Summarize Text",
         model="gpt-3.5-turbo",
         provider="openai",
         call_type="llm",

@@ -192,10 +202,14 @@
     workflow = Workflow(steps={"summarize": step}, inputs=["input_text"], outputs={"summary": "summarize.summary"})

     # Execute the workflow
-
+    final_outputs, computed_values, step_contents = execute_workflow(
+        workflow, {"input_text": "Long text to be summarized..."}
+    )

     # Verify the results
-    assert
+    assert final_outputs == {"summary": "This is a summary"}
+    assert computed_values == {"input_text": "Long text to be summarized...", "summarize.summary": "This is a summary"}
+    assert step_contents == {}

     # Verify execute_model_step was called correctly
     mock_execute_step.assert_called_once()

@@ -206,7 +220,7 @@ def test_execute_workflow_multi_step(mock_execute_step):
     """Test execution of a multi-step workflow with dependencies."""

     # Configure mock to return different values based on the step
-    def side_effect(step, available_vars):
+    def side_effect(step, available_vars, return_full_content=False):
         if step.id == "extract":
             return {"entities": ["Apple", "product"]}
         elif step.id == "analyze":

@@ -218,6 +232,7 @@ def test_execute_workflow_multi_step(mock_execute_step):
     # Create extract step
     extract_step = ModelStep(
         id="extract",
+        name="Extract Entities",
         model="gpt-3.5-turbo",
         provider="openai",
         call_type="llm",

@@ -229,6 +244,7 @@ def test_execute_workflow_multi_step(mock_execute_step):
     # Create analyze step that depends on extract step
     analyze_step = ModelStep(
         id="analyze",
+        name="Analyze Sentiment",
         model="gpt-4",
         provider="openai",
         call_type="llm",

@@ -244,10 +260,18 @@
     )

     # Execute the workflow
-
+    final_outputs, computed_values, step_contents = execute_workflow(
+        workflow, {"input_text": "Apple is launching a new product tomorrow."}
+    )

     # Verify the results
-    assert
+    assert final_outputs == {"entities": ["Apple", "product"], "sentiment": "positive"}
+    assert computed_values == {
+        "input_text": "Apple is launching a new product tomorrow.",
+        "extract.entities": ["Apple", "product"],
+        "analyze.sentiment": "positive",
+    }
+    assert step_contents == {}

     # Verify execute_model_step was called twice (once for each step)
     assert mock_execute_step.call_count == 2

@@ -257,6 +281,7 @@ def test_execute_workflow_missing_input():
     """Test that an error is raised when a required input is missing."""
     step = ModelStep(
         id="summarize",
+        name="Summarize Text",
         model="gpt-3.5-turbo",
         provider="openai",
         call_type="llm",

@@ -280,6 +305,7 @@ def test_execute_workflow_cyclic_dependency(mock_dependency_graph):

     step = ModelStep(
         id="test",
+        name="Test Step",
         model="gpt-3.5-turbo",
         provider="openai",
         call_type="llm",

@@ -288,8 +314,42 @@
         output_fields=[],
     )

-    workflow = Workflow(steps=
+    workflow = Workflow(steps=[step], inputs=[], outputs={})

     # This should propagate the CyclicDependencyError
     with pytest.raises(CyclicDependencyError):
         execute_workflow(workflow, {})
+
+
+@patch("workflows.executors.execute_model_step")
+def test_execute_workflow_with_full_content(mock_execute_step):
+    """Test execution of a workflow with return_full_content=True."""
+    # Configure mock to return expected outputs and content
+    mock_execute_step.return_value = ({"summary": "This is a summary"}, "Full model response content")
+
+    # Create a simple workflow
+    step = ModelStep(
+        id="summarize",
+        name="Summarize Text",
+        model="gpt-3.5-turbo",
+        provider="openai",
+        call_type="llm",
+        system_prompt="Summarize the text",
+        input_fields=[InputField(name="text", description="Text to summarize", variable="input_text")],
+        output_fields=[OutputField(name="summary", description="Summary of the text", type="str")],
+    )
+
+    workflow = Workflow(steps=[step], inputs=["input_text"], outputs={"summary": "summarize.summary"})
+
+    # Execute the workflow with return_full_content=True
+    final_outputs, computed_values, step_contents = execute_workflow(
+        workflow, {"input_text": "Long text to be summarized..."}, return_full_content=True
+    )
+
+    # Verify the results
+    assert final_outputs == {"summary": "This is a summary"}
+    assert computed_values == {"input_text": "Long text to be summarized...", "summarize.summary": "This is a summary"}
+    assert step_contents == {"summarize": "Full model response content"}
+
+    # Verify execute_model_step was called correctly with return_full_content=True
+    mock_execute_step.assert_called_once_with(step, computed_values, return_full_content=True)
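
The new and updated tests can presumably be run in isolation from the repository root with the usual pytest invocation, e.g.:

    pytest tests/test_executors.py -q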