Maharshi Gor committed
Commit · 38e3800
1 Parent(s): 2900a81

Refactored single step and multi step qb agents into one module as QB Agents.
Files changed:
- src/components/quizbowl/bonus.py  +2 -7
- src/components/quizbowl/tossup.py  +2 -8
- src/workflows/executors.py  +119 -43
- src/workflows/qb/__init__.py  +0 -0
- src/workflows/qb/simple_agent.py  +0 -186
- src/workflows/{qb/multi_step_agent.py → qb_agents.py}  +27 -23
- src/workflows/quizbowl_agent.py  +0 -269
- src/workflows/structs.py  +7 -3
- tests/test_executors.py  +72 -12
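
At the call sites, the consolidation removes the per-step-count branching entirely; a minimal sketch of the new entry points, lifted from the __main__ example in qb_agents.py below (the factory helpers are assumed to exist in workflows.factory exactly as used there):

    from workflows.factory import create_quizbowl_bonus_workflow, create_quizbowl_tossup_workflow
    from workflows.qb_agents import QuizBowlBonusAgent, QuizBowlTossupAgent

    # One agent class per question type, regardless of how many steps the workflow has;
    # execute_workflow() now dispatches on len(workflow.steps) internally.
    tossup_agent = QuizBowlTossupAgent(workflow=create_quizbowl_tossup_workflow(), buzz_threshold=0.9)
    bonus_agent = QuizBowlBonusAgent(workflow=create_quizbowl_bonus_workflow())
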
src/components/quizbowl/bonus.py
CHANGED
@@ -9,8 +9,7 @@ from loguru import logger
 from components.model_pipeline.model_pipeline import PipelineInterface, PipelineState, PipelineUIState
 from display.formatting import styled_error
 from submission import submit
-from workflows.qb.multi_step_agent import MultiStepBonusAgent
-from workflows.qb.simple_agent import SimpleBonusAgent
+from workflows.qb_agents import QuizBowlBonusAgent
 from workflows.structs import ModelStep, Workflow

 from . import commons

@@ -211,11 +210,7 @@ class BonusInterface:
         """Get the model outputs for a given question ID."""
         outputs = []
         leadin = example["leadin"]
-
-        if len(workflow.steps) > 1:
-            agent = MultiStepBonusAgent(workflow)
-        else:
-            agent = SimpleBonusAgent(workflow)
+        agent = QuizBowlBonusAgent(pipeline_state.workflow)

         for i, part in enumerate(example["parts"]):
             # Run model for each part
src/components/quizbowl/tossup.py
CHANGED
@@ -10,8 +10,7 @@ from loguru import logger
 from components.model_pipeline.model_pipeline import PipelineInterface, PipelineState, PipelineUIState
 from display.formatting import styled_error
 from submission import submit
-from workflows.qb.multi_step_agent import MultiStepTossupAgent
-from workflows.qb.simple_agent import SimpleTossupAgent
+from workflows.qb_agents import QuizBowlTossupAgent
 from workflows.structs import ModelStep, Workflow

 from . import commons

@@ -275,12 +274,7 @@ class TossupInterface:
         tokens = example["question"].split()
         for run_idx in example["run_indices"]:
             question_runs.append(" ".join(tokens[: run_idx + 1]))
-
-        workflow = pipeline_state.workflow
-        if len(workflow.steps) > 1:
-            agent = MultiStepTossupAgent(workflow, buzz_threshold)
-        else:
-            agent = SimpleTossupAgent(workflow, buzz_threshold)
+        agent = QuizBowlTossupAgent(pipeline_state.workflow, buzz_threshold)
         outputs = list(agent.run(question_runs, early_stop=early_stop))
         outputs = add_model_scores(outputs, example["clean_answers"], example["run_indices"])
         return outputs
src/workflows/executors.py
CHANGED
@@ -1,5 +1,4 @@
 # %%
-import json
 from typing import Any

 import pydantic

@@ -178,42 +177,10 @@ def execute_model_step(
     return outputs


-# Example usage
-if __name__ == "__main__":
-    # Define a simple model step
-    model_step = ModelStep(
-        id="step1",
-        model="gpt-4o-mini",
-        provider="OpenAI",
-        call_type="llm",
-        system_prompt="You are a simple NLP tool that takes a string, and a number N, and return the first N entities in the string, and the total count of entities in the string.",
-        input_fields=[
-            InputField(name="sentence", description="The sentence to process", variable="sentence", func=None),
-            InputField(name="n", description="The number of entities to return", variable="n", func=None),
-        ],
-        output_fields=[
-            OutputField(
-                name="entities",
-                description="The first N entities in the string as a list of strings",
-                type="list[str]",
-                func=None,
-            ),
-            OutputField(name="count", description="The total count of entities in the string", type="int", func=None),
-        ],
-    )
-
-    # Define processed inputs
-    processed_inputs = {"sentence": "Abdul Akbar is a good person, but Jesus is the son of God.", "n": 3}
-
-    # Execute the model step
-    outputs = execute_model_step(model_step, processed_inputs)
-    print(outputs)
-
-
 # %%
-def execute_workflow(
+def execute_multi_step_workflow(
     workflow: Workflow, input_values: dict[str, Any], return_full_content: bool = False
-) -> dict[str, Any]:
+) -> tuple[dict[str, Any], dict[str, Any], dict[str, Any]]:
     """
     Execute the given workflow as a computational graph.

@@ -234,10 +201,14 @@ def execute_workflow(
             dependencies, and input/output specifications.
         input_values (dict[str, Any]): External input values to be used by the workflow.
            Keys should match the required workflow.inputs.
+        return_full_content (bool, optional): If True, returns the full content of each step.
+            Defaults to False.

     Returns:
-
-
+        A tuple containing:
+        - A dictionary of the workflow's outputs, with keys matching the variables defined in workflow.outputs.
+        - A dictionary of all computed values during workflow execution, including intermediate results.
+        - A dictionary of step contents, only populated if return_full_content is True.

     Raises:
         UnknownVariableError: If an input_field references a variable that is not

@@ -255,12 +226,12 @@ def execute_workflow(
         ...         "analyze": ModelStep(...)  # A step that analyzes the entities
         ...     },
         ...     inputs=["text"],
-        ...     outputs=
+        ...     outputs={"sentiment": "analyze.sentiment", "entities": "extract.entities"}
         ... )
-        >>>
-        >>> print(
+        >>> final_outputs, computed_values, step_contents = execute_workflow(workflow, {"text": "Apple is launching a new product tomorrow."})
+        >>> print(final_outputs["sentiment"])
        "positive"
-        >>> print(
+        >>> print(final_outputs["entities"])
        ["Apple", "product"]
    """
    # Step 1: Pre-populate computed values with external workflow inputs.

@@ -280,11 +251,15 @@ def execute_workflow(
    execution_order = topological_sort(dependencies)

    # Step 4: Execute steps in topological order.
+    step_contents: dict[str, Any] = {}
    for step_id in execution_order:
        step = workflow.steps[step_id]

+        outputs = execute_model_step(step, computed_values, return_full_content=return_full_content)
        # Execute the step
-
+        if return_full_content:
+            outputs, content = outputs
+            step_contents[step_id] = content
        outputs = {f"{step_id}.{k}": v for k, v in outputs.items()}
        computed_values.update(outputs)

@@ -295,7 +270,77 @@ def execute_workflow(
            raise WorkflowError(f"Workflow output variable {var} was not produced")
        final_outputs[target] = computed_values[var]

-
+    step_outputs = {k: v for k, v in computed_values.items() if k not in workflow.inputs}
+
+    return final_outputs, step_outputs, step_contents
+
+
+def execute_simple_workflow(
+    workflow: Workflow, input_values: dict[str, Any], return_full_content: bool = False
+) -> tuple[dict[str, Any], dict[str, Any], dict[str, Any]]:
+    """Execute a simple workflow with a single step.
+
+    This is a simplified version of execute_workflow for workflows with only one step.
+
+    Args:
+        workflow: The workflow to execute
+        input_values: Dictionary of input values
+        return_full_content: Whether to return the full content of each step
+
+    Returns:
+        Tuple containing:
+        - final_outputs: Dictionary of workflow outputs
+        - computed_values: Dictionary of all computed values
+        - step_contents: Dictionary of step contents (if return_full_content=True)
+
+    Raises:
+        WorkflowError: If the workflow has more than one step
+    """
+    if len(workflow.steps) != 1:
+        raise WorkflowError("Simple workflow must have exactly one step")
+
+    # Get the single step
+    step = list(workflow.steps.values())[0]
+
+    # Validate inputs
+    for var in workflow.inputs:
+        if var not in input_values:
+            raise WorkflowError(f"Missing required workflow input: {var}")
+
+    # Execute the step
+    if return_full_content:
+        step_outputs, content = execute_model_step(step, input_values, return_full_content=True)
+        step_contents = {step.id: content}
+    else:
+        step_outputs = execute_model_step(step, input_values, return_full_content=False)
+        step_contents = {}
+
+    # Prepare the final outputs
+    final_outputs = {}
+    for target, var in workflow.outputs.items():
+        if var.startswith(f"{step.id}."):
+            output_key = var.split(".", 1)[1]
+            if output_key in step_outputs:
+                final_outputs[target] = step_outputs[output_key]
+            else:
+                raise WorkflowError(f"Workflow output variable {var} was not produced")
+        else:
+            raise WorkflowError(f"Invalid output mapping: {var} does not match step ID {step.id}")
+
+    # Prepare computed values (prefixed with step ID)
+    computed_values = input_values.copy()
+    computed_values.update({f"{step.id}.{k}": v for k, v in step_outputs.items()})
+
+    return final_outputs, computed_values, step_contents
+
+
+def execute_workflow(
+    workflow: Workflow, input_values: dict[str, Any], return_full_content: bool = False
+) -> tuple[dict[str, Any], dict[str, Any], dict[str, Any]]:
+    if len(workflow.steps) > 1:
+        return execute_multi_step_workflow(workflow, input_values, return_full_content)
+    else:
+        return execute_simple_workflow(workflow, input_values, return_full_content)


 def run_examples():

@@ -438,3 +483,34 @@ if __name__ == "__main__":
     run_examples()

 # %%
+
+# Example usage
+if __name__ == "__main__":
+    # Define a simple model step
+    model_step = ModelStep(
+        id="step1",
+        model="gpt-4o-mini",
+        provider="OpenAI",
+        call_type="llm",
+        system_prompt="You are a simple NLP tool that takes a string, and a number N, and return the first N entities in the string, and the total count of entities in the string.",
+        input_fields=[
+            InputField(name="sentence", description="The sentence to process", variable="sentence", func=None),
+            InputField(name="n", description="The number of entities to return", variable="n", func=None),
+        ],
+        output_fields=[
+            OutputField(
+                name="entities",
+                description="The first N entities in the string as a list of strings",
+                type="list[str]",
+                func=None,
+            ),
+            OutputField(name="count", description="The total count of entities in the string", type="int", func=None),
+        ],
+    )
+
+    # Define processed inputs
+    processed_inputs = {"sentence": "Abdul Akbar is a good person, but Jesus is the son of God.", "n": 3}
+
+    # Execute the model step
+    outputs = execute_model_step(model_step, processed_inputs)
+    print(outputs)
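
The executors change is easiest to read from the return value: both execute_*_workflow variants now return the same three-element tuple, and execute_workflow() is a thin dispatcher over them. A short usage sketch (the workflow is assumed to be the two-step extract/analyze example from the docstring above; key names follow that example):

    final_outputs, computed_values, step_contents = execute_workflow(
        workflow, {"text": "Apple is launching a new product tomorrow."}, return_full_content=True
    )
    print(final_outputs["sentiment"])            # mapped workflow outputs, e.g. "positive"
    print(computed_values["extract.entities"])   # intermediate values, keyed "<step_id>.<field>"
    print(step_contents["analyze"])              # raw model content per step, only filled when return_full_content=True
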
src/workflows/qb/__init__.py
DELETED
File without changes
src/workflows/qb/simple_agent.py
DELETED
@@ -1,186 +0,0 @@
-import time
-from typing import Any, Iterable
-
-# from litellm import completion
-from llms import completion
-from workflows.executors import execute_model_step, execute_workflow
-from workflows.structs import ModelStep, Workflow
-
-
-def _get_agent_response(self, prompt: str, system_prompt: str) -> dict:
-    """Get response from the LLM model."""
-    messages = [{"role": "system", "content": system_prompt}, {"role": "user", "content": prompt}]
-
-    start_time = time.time()
-    response = completion(
-        model=self.model,
-        messages=messages,
-        temperature=self.temperature,
-        max_tokens=150,  # Limit token usage for faster responses
-    )
-    response_time = time.time() - start_time
-
-    return response, response_time
-
-
-def _get_model_step_response(
-    model_step: ModelStep, available_vars: dict[str, Any]
-) -> tuple[dict[str, Any], str, float]:
-    """Get response from the LLM model."""
-    start_time = time.time()
-    response, content = execute_model_step(model_step, available_vars, return_full_content=True)
-    response_time = time.time() - start_time
-    return response, content, response_time
-
-
-class SimpleTossupAgent:
-    external_input_variable = "question_text"
-    output_variables = ["answer", "confidence"]
-
-    def __init__(self, workflow: Workflow, buzz_threshold: float):
-        steps = list(workflow.steps.values())
-        assert len(steps) == 1, "Only one step is allowed in a simple workflow"
-        self.model_step = steps[0]
-        self.buzz_threshold = buzz_threshold
-        self.output_variables = list(workflow.outputs.keys())
-
-        if self.external_input_variable not in workflow.inputs:
-            raise ValueError(f"External input variable {self.external_input_variable} not found in model step inputs")
-
-        for out_var in self.output_variables:
-            if out_var not in workflow.outputs:
-                raise ValueError(f"Output variable {out_var} not found in the workflow outputs")
-
-    def run(self, question_runs: list[str], early_stop: bool = True) -> Iterable[dict]:
-        """
-        Process a tossup question and decide when to buzz based on confidence.
-
-        Args:
-            question_runs: Progressive reveals of the question text
-            early_stop: Whether to stop after the first buzz
-
-        Yields:
-            Dict with answer, confidence, and whether to buzz
-        """
-
-        for i, question_text in enumerate(question_runs):
-            response, content, response_time = _get_model_step_response(
-                self.model_step, {self.external_input_variable: question_text}
-            )
-            buzz = response["confidence"] >= self.buzz_threshold
-            result = {
-                "answer": response["answer"],
-                "confidence": response["confidence"],
-                "buzz": buzz,
-                "question_fragment": question_text,
-                "position": i + 1,
-                "full_response": content,
-                "response_time": response_time,
-            }
-
-            yield result
-
-            # If we've reached the confidence threshold, buzz and stop
-            if early_stop and buzz:
-                return
-
-
-class SimpleBonusAgent:
-    external_input_variables = ["leadin", "part"]
-    output_variables = ["answer", "confidence", "explanation"]
-
-    def __init__(self, workflow: Workflow):
-        steps = list(workflow.steps.values())
-        assert len(steps) == 1, "Only one step is allowed in a simple workflow"
-        self.model_step = steps[0]
-        self.output_variables = list(workflow.outputs.keys())
-
-        # Validate input variables
-        for input_var in self.external_input_variables:
-            if input_var not in workflow.inputs:
-                raise ValueError(f"External input variable {input_var} not found in model step inputs")
-
-        # Validate output variables
-        for out_var in self.output_variables:
-            if out_var not in workflow.outputs:
-                raise ValueError(f"Output variable {out_var} not found in the workflow outputs")
-
-    def run(self, leadin: str, part: str) -> dict:
-        """
-        Process a bonus part with the given leadin.
-
-        Args:
-            leadin: The leadin text for the bonus question
-            part: The specific part text to answer
-
-        Returns:
-            Dict with answer, confidence, and explanation
-        """
-        response, content, response_time = _get_model_step_response(
-            self.model_step,
-            {
-                "leadin": leadin,
-                "part": part,
-            },
-        )
-
-        return {
-            "answer": response["answer"],
-            "confidence": response["confidence"],
-            "explanation": response["explanation"],
-            "full_response": content,
-            "response_time": response_time,
-        }
-
-
-# Example usage
-if __name__ == "__main__":
-    # Load the Quizbowl dataset
-    from datasets import load_dataset
-
-    from workflows.factory import create_quizbowl_bonus_step_initial_setup, create_quizbowl_simple_step_initial_setup
-
-    ds_name = "umdclip/leaderboard_co_set"
-    ds = load_dataset(ds_name, split="train")
-
-    # Create the agents
-    tossup_step = create_quizbowl_simple_step_initial_setup()
-    tossup_step.model = "gpt-4"
-    tossup_step.provider = "openai"
-    tossup_agent = SimpleTossupAgent(workflow=tossup_step, buzz_threshold=0.9)
-
-    bonus_step = create_quizbowl_bonus_step_initial_setup()
-    bonus_step.model = "gpt-4"
-    bonus_step.provider = "openai"
-    bonus_agent = SimpleBonusAgent(workflow=bonus_step)
-
-    # Example for tossup mode
-    print("\n=== TOSSUP MODE EXAMPLE ===")
-    sample_question = ds[30]
-    print(sample_question["question_runs"][-1])
-    print(sample_question["gold_label"])
-    print()
-    question_runs = sample_question["question_runs"]
-
-    results = tossup_agent.run(question_runs, early_stop=True)
-    for result in results:
-        print(result["full_response"])
-        print(f"Guess at position {result['position']}: {result['answer']}")
-        print(f"Confidence: {result['confidence']}")
-        if result["buzz"]:
-            print("Buzzed!\n")
-
-    # Example for bonus mode
-    print("\n=== BONUS MODE EXAMPLE ===")
-    sample_bonus = ds[31]  # Assuming this is a bonus question
-    leadin = sample_bonus["leadin"]
-    parts = sample_bonus["parts"]
-
-    print(f"Leadin: {leadin}")
-    for i, part in enumerate(parts):
-        print(f"\nPart {i + 1}: {part['part']}")
-        result = bonus_agent.run(leadin, part["part"])
-        print(f"Answer: {result['answer']}")
-        print(f"Confidence: {result['confidence']}")
-        print(f"Explanation: {result['explanation']}")
-        print(f"Response time: {result['response_time']:.2f}s")
src/workflows/{qb/multi_step_agent.py → qb_agents.py}
RENAMED
@@ -5,15 +5,19 @@ from workflows.executors import execute_workflow
 from workflows.structs import Workflow


-def _get_workflow_response(
+def _get_workflow_response(
+    workflow: Workflow, available_vars: dict[str, Any]
+) -> tuple[dict[str, Any], dict[str, Any], dict[str, Any], float]:
     """Get response from executing a complete workflow."""
     start_time = time.time()
-
+    final_outputs, computed_values, step_contents = execute_workflow(
+        workflow, available_vars, return_full_content=True
+    )
     response_time = time.time() - start_time
-    return
+    return final_outputs, computed_values, step_contents, response_time


-class MultiStepTossupAgent:
+class QuizBowlTossupAgent:
     """Agent for handling tossup questions with multiple steps in the workflow."""

     external_input_variable = "question_text"

@@ -53,26 +57,26 @@ class MultiStepTossupAgent:
                 - buzz: Whether to buzz
                 - question_fragment: Current question text
                 - position: Current position in question
-                -
+                - step_contents: String content outputs of each step
                 - response_time: Time taken for response
                 - step_outputs: Outputs from each step
        """
        for i, question_text in enumerate(question_runs):
            # Execute the complete workflow
-
+            final_outputs, computed_values, step_contents, response_time = _get_workflow_response(
                self.workflow, {self.external_input_variable: question_text}
            )
-
-            buzz =
+            print(f"Workflow response: {final_outputs}")
+            buzz = final_outputs["confidence"] >= self.buzz_threshold
            result = {
-                "answer":
-                "confidence":
+                "answer": final_outputs["answer"],
+                "confidence": final_outputs["confidence"],
                "buzz": buzz,
                "question_fragment": question_text,
                "position": i + 1,
-                "
+                "step_contents": step_contents,
                "response_time": response_time,
-                "step_outputs":
+                "step_outputs": computed_values,  # Include intermediate step outputs
            }

            yield result

@@ -82,7 +86,7 @@ class MultiStepTossupAgent:
                return


-class MultiStepBonusAgent:
+class QuizBowlBonusAgent:
    """Agent for handling bonus questions with multiple steps in the workflow."""

    external_input_variables = ["leadin", "part"]

@@ -119,11 +123,11 @@ class MultiStepBonusAgent:
                - answer: The model's answer
                - confidence: Confidence score
                - explanation: Explanation for the answer
-                -
+                - step_contents: String content outputs of each step
                - response_time: Time taken for response
                - step_outputs: Outputs from each step
        """
-
+        final_outputs, computed_values, step_contents, response_time = _get_workflow_response(
            self.workflow,
            {
                "leadin": leadin,

@@ -132,12 +136,12 @@ class MultiStepBonusAgent:
        )

        return {
-            "answer":
-            "confidence":
-            "explanation":
-            "
+            "answer": final_outputs["answer"],
+            "confidence": final_outputs["confidence"],
+            "explanation": final_outputs["explanation"],
+            "step_contents": step_contents,
            "response_time": response_time,
-            "step_outputs":
+            "step_outputs": computed_values,  # Include intermediate step outputs
        }


@@ -153,10 +157,10 @@ if __name__ == "__main__":

    # Create the agents with multi-step workflows
    tossup_workflow = create_quizbowl_tossup_workflow()
-    tossup_agent =
+    tossup_agent = QuizBowlTossupAgent(workflow=tossup_workflow, buzz_threshold=0.9)

    bonus_workflow = create_quizbowl_bonus_workflow()
-    bonus_agent =
+    bonus_agent = QuizBowlBonusAgent(workflow=bonus_workflow)

    # Example for tossup mode
    print("\n=== TOSSUP MODE EXAMPLE ===")

@@ -168,7 +172,7 @@ if __name__ == "__main__":

    results = tossup_agent.run(question_runs, early_stop=True)
    for result in results:
-        print(result["
+        print(result["step_contents"])
        print(f"Guess at position {result['position']}: {result['answer']}")
        print(f"Confidence: {result['confidence']}")
        print("Step outputs:", result["step_outputs"])
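
For reference, each dict yielded by QuizBowlTossupAgent.run() now carries both the final answer fields and the per-step intermediates; a sketch of a consumer loop (question_runs is assumed to be the list of progressive question reveals, as in the __main__ block above):

    agent = QuizBowlTossupAgent(workflow=create_quizbowl_tossup_workflow(), buzz_threshold=0.9)
    for result in agent.run(question_runs, early_stop=True):
        # Keys per result: answer, confidence, buzz, question_fragment, position,
        # step_contents, response_time, step_outputs.
        print(result["position"], result["answer"], result["confidence"])
        if result["buzz"]:
            break  # with early_stop=True the generator stops after the first buzz anyway
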
src/workflows/quizbowl_agent.py
DELETED
@@ -1,269 +0,0 @@
-# %%
-import json
-import os
-import time
-from typing import Dict, Iterable, List, Optional, Tuple, Union
-
-import litellm
-from datasets import load_dataset
-from litellm import completion
-
-litellm.drop_params = True
-
-# Set your API key - you can replace this with your actual key or use environment variables
-os.environ["OPENAI_API_KEY"] = (
-    "sk-proj-..."
-)
-
-DEFAULT_SYS_PROMPT = """
-You are a Quizbowl expert. You will be given a question that's progressively revealed.
-Your goal is to identify the answer as quickly as possible with high confidence.
-Respond with a JSON object with two fields:
-1. "answer": Your best guess for the answer
-2. "confidence": Your confidence in your answer from 0.0 to 1.0
-
-DO NOT include any explanation. ONLY return the JSON object.
-"""
-
-
-class QuizbowlAgent:
-    """
-    An agent for playing Quizbowl with two modes:
-    1. Tossup mode: Fast and direct with confidence calibration for buzzing
-    2. Bonus round mode: Provides guess, rationale, and confidence
-    """
-
-    def __init__(
-        self,
-        model: str = "gpt-4o-mini",
-        buzz_threshold: float = 0.85,
-        temperature: float = 0.2,
-        system_prompt: str = DEFAULT_SYS_PROMPT,
-    ):
-        """
-        Initialize the QuizbowlAgent.
-
-        Args:
-            model: The LLM model to use for answering
-            buzz_threshold: Confidence threshold for buzzing in tossup mode (0-1)
-            temperature: Temperature for model sampling
-        """
-        self.model = model
-        self.buzz_threshold = buzz_threshold
-        self.temperature = temperature
-        self.system_prompt = system_prompt
-
-    def _process_question_runs(self, question_runs: List[str]) -> List[str]:
-        """Process question runs to extract increasing amounts of text."""
-        # For simpler testing, just return the runs as they are in the dataset
-        return question_runs
-
-    def _get_agent_response(self, prompt: str, system_prompt: str) -> Dict:
-        """Get response from the LLM model."""
-        messages = [{"role": "system", "content": system_prompt}, {"role": "user", "content": prompt}]
-
-        start_time = time.time()
-        response = completion(
-            model=self.model,
-            messages=messages,
-            temperature=self.temperature,
-            max_tokens=150,  # Limit token usage for faster responses
-        )
-        response_time = time.time() - start_time
-
-        return response, response_time
-
-    def _extract_confidence_and_answer(self, content: str) -> Tuple[str, float]:
-        """Extract the answer and confidence score from the model response."""
-        try:
-            # Try to parse JSON from the response
-            data = json.loads(content)
-            answer = data.get("answer", "")
-            confidence = float(data.get("confidence", 0.0))
-            return answer, confidence
-        except (json.JSONDecodeError, ValueError):
-            # Fallback if parsing fails
-            lines = content.strip().split("\n")
-            answer = lines[0] if lines else ""
-            confidence = 0.5  # Default confidence
-
-            # Try to extract confidence from text
-            for line in lines:
-                if "confidence:" in line.lower():
-                    try:
-                        confidence = float(line.lower().split("confidence:")[1].strip())
-                    except (ValueError, IndexError):
-                        pass
-
-            return answer, confidence
-
-    def tossup_mode(self, question_runs: List[str]) -> Iterable[Dict]:
-        """
-        Process a tossup question and decide when to buzz based on confidence.
-
-        Args:
-            question_runs: Progressive reveals of the question text
-
-        Yields:
-            Dict with answer, confidence, and whether to buzz
-        """
-
-        for i, question_text in enumerate(question_runs):
-            prompt = f"Question: {question_text}\n\nProvide your answer and confidence level:"
-
-            response, response_time = self._get_agent_response(prompt, DEFAULT_SYS_PROMPT)
-            content = response.choices[0].message.content
-
-            answer, confidence = self._extract_confidence_and_answer(content)
-
-            result = {
-                "answer": answer,
-                "confidence": confidence,
-                "buzz": confidence >= self.buzz_threshold,
-                "question_fragment": question_text,
-                "position": i + 1,
-                "full_response": content,
-                "response_time": response_time,
-            }
-
-            yield result
-
-            # If we've reached the confidence threshold, buzz and stop
-            if confidence >= self.buzz_threshold:
-                return
-
-    def tossup_mode_top5(self, question_runs: List[str]) -> Iterable[Dict]:
-        """
-        Process a tossup question and provide the top 5 guesses with confidence levels.
-
-        Args:
-            question_runs: Progressive reveals of the question text
-
-        Returns:
-            Dict with top 5 answers, their confidences, and whether to buzz
-        """
-
-        for i, question_text in enumerate(question_runs):
-            prompt = f"Question: {question_text}\n\nProvide your top 5 answers and confidence levels."
-
-            response, response_time = self._get_agent_response(prompt, self.system_prompt)
-            content = response.choices[0].message.content
-
-            try:
-                # Try to parse JSON from the response
-                data = json.loads(content)
-                guesses = data.get("guesses", [])
-            except (json.JSONDecodeError, ValueError):
-                # Fallback if parsing fails
-                guesses = []
-
-            result = {
-                "guesses": guesses,
-                "buzz": any(guess["confidence"] >= self.buzz_threshold for guess in guesses),
-                "question_fragment": question_text,
-                "position": i + 1,
-                "full_response": content,
-                "response_time": response_time,
-            }
-
-            yield result
-
-            # If any guess reaches the confidence threshold, buzz and stop
-            if result["buzz"]:
-                return
-
-    def bonus_round_mode(self, question: str) -> Dict:
-        """
-        Process a bonus round question with detailed analysis.
-
-        Args:
-            question: The bonus question text
-
-        Returns:
-            Dict with answer, rationale, and confidence
-        """
-        system_prompt = """
-        You are a Quizbowl expert answering a bonus question. Provide:
-        1. Your direct answer
-        2. A very brief and crisp one line rationale for your answer (key clues that led to it)
-        3. Your confidence level (0.0-1.0)
-
-        Respond with a JSON object with these three fields:
-        {
-            "answer": "Your answer here",
-            "rationale": "Your reasoning here",
-            "confidence": 0.XX
-        }
-        """
-
-        prompt = f"Bonus Question: {question}\n\nProvide your answer, rationale, and confidence:"
-
-        response = self._get_agent_response(prompt, system_prompt)
-        content = response.choices[0].message.content
-
-        try:
-            # Try to parse JSON
-            result = json.loads(content)
-            # Ensure all fields are present
-            if not all(k in result for k in ["answer", "rationale", "confidence"]):
-                raise ValueError("Missing fields in response")
-        except (json.JSONDecodeError, ValueError):
-            # If parsing fails, extract manually
-            lines = content.strip().split("\n")
-            result = {"answer": "", "rationale": "", "confidence": 0.5}
-
-            for line in lines:
-                if line.lower().startswith("answer:"):
-                    result["answer"] = line[7:].strip()
-                elif line.lower().startswith("rationale:"):
-                    result["rationale"] = line[10:].strip()
-                elif line.lower().startswith("confidence:"):
-                    try:
-                        result["confidence"] = float(line[11:].strip())
-                    except ValueError:
-                        pass
-
-        return result
-
-
-# %%
-# Example usage
-if __name__ == "__main__":
-    # Load the Quizbowl dataset
-    ds_name = "umdclip/leaderboard_co_set"
-    ds = load_dataset(ds_name, split="train")
-
-    # Create the agent
-    agent = QuizbowlAgent(model="gpt-4-turbo", buzz_threshold=0.85)
-
-    # Example for tossup mode
-    print("\n=== TOSSUP MODE EXAMPLE ===")
-    sample_question = ds[0]
-    print(sample_question["question_runs"][-1])
-    print(sample_question["gold_label"])
-    question_runs = sample_question["question_runs"]
-
-    results = agent.tossup_mode(question_runs)
-    for result in results:
-        print(f"Guess at position {result['position']}: {result['answer']}")
-        print(f"Confidence: {result['confidence']}")
-        if result["buzz"]:
-            print("Buzzed!\n")
-
-    results = agent.tossup_mode_top5(question_runs)
-    for result in results:
-        guesses = [f"{guess['answer']} ({guess['confidence']})" for guess in result["guesses"]]
-        print(f"Guesses at position {result['position']}: {', '.join(guesses)}")
-        if result["buzz"]:
-            print("Buzzed!")
-
-    # Example for bonus round mode
-    print("\n=== BONUS ROUND MODE EXAMPLE ===")
-    bonus_question = sample_question["question_runs"][-1]
-
-    bonus_result = agent.bonus_round_mode(bonus_question)
-    print(f"Answer: {bonus_result['answer']}")
-    print(f"Rationale: {bonus_result['rationale']}")
-    print(f"Confidence: {bonus_result['confidence']}")
-
-# %%
src/workflows/structs.py
CHANGED
@@ -202,9 +202,13 @@ class Workflow(BaseModel):
         if "steps" in data and isinstance(data["steps"], list):
             steps_dict = {}
             for step in data["steps"]:
-                if step
-
-
+                if isinstance(step, ModelStep):
+                    step_id = step.id
+                else:
+                    step_id = step["id"]
+                if step_id in steps_dict:
+                    raise ValueError(f"Duplicate step ID: {step_id}")
+                steps_dict[step_id] = step
             data["steps"] = steps_dict
         return data

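
The validator change means Workflow.steps can now be declared as a list of ModelStep objects (or plain dicts carrying an "id"), which is normalized into the id-keyed mapping, with duplicate IDs rejected. A small sketch using the test fixtures' field values (the field values themselves are placeholders, not meaningful configuration):

    step = ModelStep(
        id="summarize", name="Summarize Text", model="gpt-3.5-turbo", provider="openai",
        call_type="llm", system_prompt="Summarize the text", input_fields=[], output_fields=[],
    )
    # Both forms are accepted; the list form is normalized to {"summarize": step}.
    wf = Workflow(steps=[step], inputs=["input_text"], outputs={"summary": "summarize.summary"})
    # Repeating an ID in the list form now raises ValueError("Duplicate step ID: summarize").
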
tests/test_executors.py
CHANGED
@@ -45,6 +45,7 @@ def test_create_processed_inputs_basic():
     """Test basic input processing without transformations."""
     step = ModelStep(
         id="test_step",
+        name="Test Step",
         model="gpt-4",
         provider="openai",
         call_type="llm",

@@ -62,6 +63,7 @@ def test_create_processed_inputs_with_transformation():
     """Test input processing with transformation functions."""
     step = ModelStep(
         id="test_step",
+        name="Test Step",
         model="gpt-4",
         provider="openai",
         call_type="llm",

@@ -82,6 +84,7 @@ def test_create_processed_inputs_missing_var():
     """Test that appropriate error is raised when a variable is missing."""
     step = ModelStep(
         id="test_step",
+        name="Test Step",
         model="gpt-4",
         provider="openai",
         call_type="llm",

@@ -99,6 +102,7 @@ def test_create_processed_inputs_unknown_func():
     """Test that appropriate error is raised when an unknown function is specified."""
     step = ModelStep(
         id="test_step",
+        name="Test Step",
         model="gpt-4",
         provider="openai",
         call_type="llm",

@@ -116,18 +120,22 @@
 # Tests for execute_model_step


-@patch("workflows.executors.
+@patch("workflows.executors.completion")
 def test_execute_model_step_success(mock_completion):
     """Test successful execution of a model step with mocked litellm response."""
     # Mock the litellm response
-    mock_response = {
+    mock_response = {
+        "content": json.dumps({"summary": "This is a summary"}),
+        "output": {"summary": "This is a summary"},
+    }
     mock_completion.return_value = mock_response

     # Create a test step
     step = ModelStep(
         id="summarize",
+        name="Summarize Text",
         model="gpt-3.5-turbo",
-        provider="
+        provider="OpenAI",
         call_type="llm",
         system_prompt="Summarize the text",
         input_fields=[InputField(name="text", description="Text to summarize", variable="input_text")],

@@ -143,11 +151,11 @@
     # Verify the litellm call was made correctly
     mock_completion.assert_called_once()
     args, kwargs = mock_completion.call_args
-    assert kwargs["model"] == "gpt-3.5-turbo"
-    assert "Summarize the text" in kwargs["
+    assert kwargs["model"] == "OpenAI/gpt-3.5-turbo"
+    assert "Summarize the text" in kwargs["system"]


-@patch("workflows.executors.
+@patch("workflows.executors.completion")
 def test_execute_model_step_error(mock_completion):
     """Test handling of errors in model step execution."""
     # Make litellm raise an exception

@@ -156,6 +164,7 @@ def test_execute_model_step_error(mock_completion):
     # Create a test step
     step = ModelStep(
         id="summarize",
+        name="Summarize Text",
         model="gpt-3.5-turbo",
         provider="openai",
         call_type="llm",

@@ -181,6 +190,7 @@ def test_execute_workflow_simple(mock_execute_step):
     # Create a simple workflow
     step = ModelStep(
         id="summarize",
+        name="Summarize Text",
         model="gpt-3.5-turbo",
         provider="openai",
         call_type="llm",

@@ -192,10 +202,14 @@
     workflow = Workflow(steps={"summarize": step}, inputs=["input_text"], outputs={"summary": "summarize.summary"})

     # Execute the workflow
-
+    final_outputs, computed_values, step_contents = execute_workflow(
+        workflow, {"input_text": "Long text to be summarized..."}
+    )

     # Verify the results
-    assert
+    assert final_outputs == {"summary": "This is a summary"}
+    assert computed_values == {"input_text": "Long text to be summarized...", "summarize.summary": "This is a summary"}
+    assert step_contents == {}

     # Verify execute_model_step was called correctly
     mock_execute_step.assert_called_once()

@@ -206,7 +220,7 @@ def test_execute_workflow_multi_step(mock_execute_step):
     """Test execution of a multi-step workflow with dependencies."""

     # Configure mock to return different values based on the step
-    def side_effect(step, available_vars):
+    def side_effect(step, available_vars, return_full_content=False):
         if step.id == "extract":
             return {"entities": ["Apple", "product"]}
         elif step.id == "analyze":

@@ -218,6 +232,7 @@ def test_execute_workflow_multi_step(mock_execute_step):
     # Create extract step
     extract_step = ModelStep(
         id="extract",
+        name="Extract Entities",
         model="gpt-3.5-turbo",
         provider="openai",
         call_type="llm",

@@ -229,6 +244,7 @@ def test_execute_workflow_multi_step(mock_execute_step):
     # Create analyze step that depends on extract step
     analyze_step = ModelStep(
         id="analyze",
+        name="Analyze Sentiment",
         model="gpt-4",
         provider="openai",
         call_type="llm",

@@ -244,10 +260,18 @@
     )

     # Execute the workflow
-
+    final_outputs, computed_values, step_contents = execute_workflow(
+        workflow, {"input_text": "Apple is launching a new product tomorrow."}
+    )

     # Verify the results
-    assert
+    assert final_outputs == {"entities": ["Apple", "product"], "sentiment": "positive"}
+    assert computed_values == {
+        "input_text": "Apple is launching a new product tomorrow.",
+        "extract.entities": ["Apple", "product"],
+        "analyze.sentiment": "positive",
+    }
+    assert step_contents == {}

     # Verify execute_model_step was called twice (once for each step)
     assert mock_execute_step.call_count == 2

@@ -257,6 +281,7 @@ def test_execute_workflow_missing_input():
     """Test that an error is raised when a required input is missing."""
     step = ModelStep(
         id="summarize",
+        name="Summarize Text",
         model="gpt-3.5-turbo",
         provider="openai",
         call_type="llm",

@@ -280,6 +305,7 @@ def test_execute_workflow_cyclic_dependency(mock_dependency_graph):

     step = ModelStep(
         id="test",
+        name="Test Step",
         model="gpt-3.5-turbo",
         provider="openai",
         call_type="llm",

@@ -288,8 +314,42 @@
         output_fields=[],
     )

-    workflow = Workflow(steps=
+    workflow = Workflow(steps=[step], inputs=[], outputs={})

     # This should propagate the CyclicDependencyError
     with pytest.raises(CyclicDependencyError):
         execute_workflow(workflow, {})
+
+
+@patch("workflows.executors.execute_model_step")
+def test_execute_workflow_with_full_content(mock_execute_step):
+    """Test execution of a workflow with return_full_content=True."""
+    # Configure mock to return expected outputs and content
+    mock_execute_step.return_value = ({"summary": "This is a summary"}, "Full model response content")
+
+    # Create a simple workflow
+    step = ModelStep(
+        id="summarize",
+        name="Summarize Text",
+        model="gpt-3.5-turbo",
+        provider="openai",
+        call_type="llm",
+        system_prompt="Summarize the text",
+        input_fields=[InputField(name="text", description="Text to summarize", variable="input_text")],
+        output_fields=[OutputField(name="summary", description="Summary of the text", type="str")],
+    )
+
+    workflow = Workflow(steps=[step], inputs=["input_text"], outputs={"summary": "summarize.summary"})
+
+    # Execute the workflow with return_full_content=True
+    final_outputs, computed_values, step_contents = execute_workflow(
+        workflow, {"input_text": "Long text to be summarized..."}, return_full_content=True
+    )
+
+    # Verify the results
+    assert final_outputs == {"summary": "This is a summary"}
+    assert computed_values == {"input_text": "Long text to be summarized...", "summarize.summary": "This is a summary"}
+    assert step_contents == {"summarize": "Full model response content"}
+
+    # Verify execute_model_step was called correctly with return_full_content=True
+    mock_execute_step.assert_called_once_with(step, computed_values, return_full_content=True)
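
The new and updated tests can presumably be run in isolation from the repository root with the usual pytest invocation, e.g.:

    pytest tests/test_executors.py -q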