Maharshi Gor committed
Commit 0bab47c · 1 Parent(s): e1ce295

Refactors workflow management and model configurations


Moves llms.py to the workflows directory and updates imports.

Adds logprobs support to Cohere models and renames buzz_threshold to confidence_threshold for clarity.

Enhances PipelineStateManager to include buzzer configuration and implements cycle detection in workflow utilities for improved stability.

Relates to previous refactoring efforts.
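The cycle detection added to the workflow utilities can be summarized with a short sketch. The snippet below is illustrative only: the dict-of-sets graph shape and the exact signature are assumptions and do not reproduce the repository's `detect_cycles` implementation; it simply shows the standard depth-first search that reports the first dependency cycle it finds.

```python
# Illustrative sketch only: DFS-based cycle detection over a step dependency graph,
# in the spirit of the detect_cycles utility referenced in this commit.
# The dict-of-sets graph shape and signature are assumptions, not the repo's exact API.
def detect_cycles(dependencies: dict[str, set[str]]) -> list[str] | None:
    """Return one cyclic path of step ids if the graph has a cycle, else None."""
    visiting: set[str] = set()   # nodes on the current DFS path
    visited: set[str] = set()    # nodes fully explored
    path: list[str] = []

    def dfs(node: str) -> list[str] | None:
        visiting.add(node)
        path.append(node)
        for dep in dependencies.get(node, set()):
            if dep in visiting:
                # Found a back edge: return the cycle starting at `dep`.
                return path[path.index(dep):] + [dep]
            if dep not in visited:
                cycle = dfs(dep)
                if cycle is not None:
                    return cycle
        visiting.discard(node)
        visited.add(node)
        path.pop()
        return None

    for step_id in dependencies:
        if step_id not in visited:
            cycle = dfs(step_id)
            if cycle is not None:
                return cycle
    return None


if __name__ == "__main__":
    print(detect_cycles({"A": {"B"}, "B": {"A"}}))   # e.g. ['A', 'B', 'A']
    print(detect_cycles({"A": set(), "B": {"A"}}))   # None
```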

app.py CHANGED
@@ -3,7 +3,7 @@ import gradio as gr
3
  from apscheduler.schedulers.background import BackgroundScheduler
4
  from huggingface_hub import snapshot_download
5
 
6
- from app_configs import AVAILABLE_MODELS, DEFAULT_SELECTIONS, THEME
7
  from components.quizbowl.bonus import BonusInterface
8
  from components.quizbowl.tossup import TossupInterface
9
  from display.custom_css import css_bonus, css_pipeline, css_tossup
@@ -21,6 +21,7 @@ from envs import (
21
  TOKEN,
22
  )
23
  from workflows import factory
 
24
 
25
 
26
  def restart_space():
@@ -164,12 +165,12 @@ if __name__ == "__main__":
164
  with gr.Tabs():
165
  with gr.Tab("Tossup Agents"):
166
  defaults = DEFAULT_SELECTIONS["tossup"] | {
167
- "init_workflow": factory.create_quizbowl_simple_workflow(),
168
  }
169
  tossup_interface = TossupInterface(demo, tossup_ds, AVAILABLE_MODELS, defaults)
170
  with gr.Tab("Bonus Round Agents"):
171
  defaults = DEFAULT_SELECTIONS["bonus"] | {
172
- "init_workflow": factory.create_quizbowl_bonus_simple_workflow(),
173
  }
174
  bonus_interface = BonusInterface(demo, bonus_ds, AVAILABLE_MODELS, defaults)
175
 
 
3
  from apscheduler.schedulers.background import BackgroundScheduler
4
  from huggingface_hub import snapshot_download
5
 
6
+ from app_configs import DEFAULT_SELECTIONS, THEME
7
  from components.quizbowl.bonus import BonusInterface
8
  from components.quizbowl.tossup import TossupInterface
9
  from display.custom_css import css_bonus, css_pipeline, css_tossup
 
21
  TOKEN,
22
  )
23
  from workflows import factory
24
+ from workflows.configs import AVAILABLE_MODELS
25
 
26
 
27
  def restart_space():
 
165
  with gr.Tabs():
166
  with gr.Tab("Tossup Agents"):
167
  defaults = DEFAULT_SELECTIONS["tossup"] | {
168
+ "init_workflow": factory.create_simple_qb_tossup_workflow(),
169
  }
170
  tossup_interface = TossupInterface(demo, tossup_ds, AVAILABLE_MODELS, defaults)
171
  with gr.Tab("Bonus Round Agents"):
172
  defaults = DEFAULT_SELECTIONS["bonus"] | {
173
+ "init_workflow": factory.create_simple_qb_bonus_workflow(),
174
  }
175
  bonus_interface = BonusInterface(demo, bonus_ds, AVAILABLE_MODELS, defaults)
176
 
src/app_configs.py CHANGED
@@ -23,12 +23,15 @@ AVAILABLE_MODELS = {
23
  },
24
  "Cohere/command-r": {
25
  "model": "command-r-08-2024",
 
26
  },
27
  "Cohere/command-r-plus": {
28
  "model": "command-r-plus-08-2024",
 
29
  },
30
  "Cohere/command-r7b": {
31
  "model": "command-r7b-12-2024",
 
32
  },
33
  }
34
 
@@ -37,14 +40,14 @@ DEFAULT_SELECTIONS = {
37
  "simple_workflow": False,
38
  "model": "OpenAI/gpt-4o-mini",
39
  "temperature": 0.2,
40
- "buzz_threshold": 0.85,
41
  "early_stop": True,
42
  },
43
  "bonus": {
44
  "simple_workflow": False,
45
  "model": "OpenAI/gpt-4o-mini",
46
  "temperature": 0.2,
47
- "buzz_threshold": 0.85,
48
  "early_stop": True,
49
  },
50
  }
 
23
  },
24
  "Cohere/command-r": {
25
  "model": "command-r-08-2024",
26
+ "logprobs": True,
27
  },
28
  "Cohere/command-r-plus": {
29
  "model": "command-r-plus-08-2024",
30
+ "logprobs": True,
31
  },
32
  "Cohere/command-r7b": {
33
  "model": "command-r7b-12-2024",
34
+ "logprobs": False,
35
  },
36
  }
37
 
 
40
  "simple_workflow": False,
41
  "model": "OpenAI/gpt-4o-mini",
42
  "temperature": 0.2,
43
+ "confidence_threshold": 0.85,
44
  "early_stop": True,
45
  },
46
  "bonus": {
47
  "simple_workflow": False,
48
  "model": "OpenAI/gpt-4o-mini",
49
  "temperature": 0.2,
50
+ "confidence_threshold": 0.85,
51
  "early_stop": True,
52
  },
53
  }
src/components/model_pipeline/model_pipeline.py CHANGED
@@ -156,10 +156,10 @@ class PipelineInterface:
156
  )
157
  return add_step_btn
158
 
159
- def _render_output_fields(self, available_variables: list[str], pipeline_state: PipelineState):
160
  dropdowns = {}
161
  variable_options = [UNSELECTED_VAR_NAME] + [v for v in available_variables if v not in self.input_variables]
162
- with gr.Column(elem_classes="step-accordion"):
163
  with gr.Row(elem_classes="output-fields-header"):
164
  gr.Markdown("#### Final output variables mapping:")
165
  with gr.Row(elem_classes="output-fields-row"):
@@ -260,7 +260,7 @@ class PipelineInterface:
260
  concurrency_id="render_output_fields",
261
  )
262
  def render_output_fields(available_variables, pipeline_state):
263
- self._render_output_fields(available_variables, pipeline_state)
264
 
265
  export_btn = gr.Button("Export Pipeline", elem_classes="export-button")
266
  # components.append(export_btn)
 
156
  )
157
  return add_step_btn
158
 
159
+ def _render_output_panel(self, available_variables: list[str], pipeline_state: PipelineState):
160
  dropdowns = {}
161
  variable_options = [UNSELECTED_VAR_NAME] + [v for v in available_variables if v not in self.input_variables]
162
+ with gr.Column(elem_classes="step-accordion control-panel"):
163
  with gr.Row(elem_classes="output-fields-header"):
164
  gr.Markdown("#### Final output variables mapping:")
165
  with gr.Row(elem_classes="output-fields-row"):
 
260
  concurrency_id="render_output_fields",
261
  )
262
  def render_output_fields(available_variables, pipeline_state):
263
+ self._render_output_panel(available_variables, pipeline_state)
264
 
265
  export_btn = gr.Button("Export Pipeline", elem_classes="export-button")
266
  # components.append(export_btn)
src/components/model_pipeline/state_manager.py CHANGED
@@ -8,7 +8,7 @@ from pydantic import BaseModel, Field
8
 
9
  from components import utils
10
  from workflows.factory import create_new_llm_step
11
- from workflows.structs import ModelStep, Workflow
12
 
13
 
14
  def make_step_id(step_number: int):
@@ -133,6 +133,9 @@ class PipelineStateManager:
133
  def get_formatted_config(self, state: PipelineState, format: Literal["json", "yaml"] = "yaml"):
134
  """Get the full pipeline configuration."""
135
  config = state.workflow.model_dump(exclude_defaults=True)
 
 
 
136
  if format == "yaml":
137
  return yaml.dump(config, default_flow_style=False, sort_keys=False, indent=4)
138
  else:
 
8
 
9
  from components import utils
10
  from workflows.factory import create_new_llm_step
11
+ from workflows.structs import ModelStep, TossupWorkflow, Workflow
12
 
13
 
14
  def make_step_id(step_number: int):
 
133
  def get_formatted_config(self, state: PipelineState, format: Literal["json", "yaml"] = "yaml"):
134
  """Get the full pipeline configuration."""
135
  config = state.workflow.model_dump(exclude_defaults=True)
136
+ if isinstance(state.workflow, TossupWorkflow):
137
+ buzzer_config = state.workflow.buzzer.model_dump(exclude_defaults=False)
138
+ config["buzzer"] = buzzer_config
139
  if format == "yaml":
140
  return yaml.dump(config, default_flow_style=False, sort_keys=False, indent=4)
141
  else:
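The change to `get_formatted_config` follows a simple pattern: dump the workflow with defaults excluded, then explicitly attach the buzzer settings with defaults included so the exported config always spells them out. A reduced, self-contained sketch of that pattern is below; the minimal `Buzzer`/`TossupWorkflow` models are stand-ins for illustration, and only the field names mirror the diff.

```python
# Reduced sketch of the exported-config pattern from get_formatted_config.
# Buzzer/TossupWorkflow below are minimal stand-ins, not the project's classes.
import yaml
from pydantic import BaseModel


class Buzzer(BaseModel):
    method: str = "AND"
    confidence_threshold: float = 0.85
    log_prob_threshold: float | None = None


class TossupWorkflow(BaseModel):
    inputs: list[str] = []
    outputs: dict[str, str] = {}
    buzzer: Buzzer = Buzzer()


workflow = TossupWorkflow(inputs=["question_text"], outputs={"answer": "step1.answer"})

# Dump the workflow without defaults, then always include the full buzzer block.
config = workflow.model_dump(exclude_defaults=True)
config["buzzer"] = workflow.buzzer.model_dump(exclude_defaults=False)

print(yaml.dump(config, default_flow_style=False, sort_keys=False, indent=4))
```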
src/components/model_pipeline/tossup_pipeline.py ADDED
@@ -0,0 +1,171 @@
1
+ import gradio as gr
2
+ import numpy as np
3
+
4
+ from app_configs import AVAILABLE_MODELS, UNSELECTED_VAR_NAME
5
+ from workflows.structs import Buzzer, TossupWorkflow
6
+
7
+ from .model_pipeline import PipelineInterface, PipelineState, PipelineUIState
8
+
9
+
10
+ def toggleable_slider(
11
+ value, minimum, maximum, step, toggle_value=False, label=None, info=None, min_width=200, scale=1
12
+ ):
13
+ with gr.Column(elem_classes="toggleable", min_width=min_width, scale=scale):
14
+ show_label = label is not None
15
+ checkbox = gr.Checkbox(label=label, value=toggle_value, container=False, info=info, show_label=show_label)
16
+ slider = gr.Slider(
17
+ minimum=minimum,
18
+ maximum=maximum,
19
+ value=value,
20
+ step=step,
21
+ label="",
22
+ interactive=True,
23
+ show_label=False,
24
+ container=False,
25
+ )
26
+ checkbox.change(fn=lambda x: gr.update(interactive=x), inputs=[checkbox], outputs=[slider])
27
+ return checkbox, slider
28
+
29
+
30
+ class TossupPipelineState(PipelineState):
31
+ workflow: TossupWorkflow
32
+
33
+
34
+ class TossupPipelineInterface(PipelineInterface):
35
+ def __init__(
36
+ self,
37
+ workflow: TossupWorkflow,
38
+ ui_state: PipelineUIState | None = None,
39
+ model_options: list[str] = None,
40
+ simple: bool = False,
41
+ show_pipeline_selector: bool = False,
42
+ defaults: dict = {},
43
+ ):
44
+ super().__init__(workflow, ui_state, model_options, simple, show_pipeline_selector)
45
+ self.defaults = defaults
46
+
47
+ def update_buzzer(
48
+ self,
49
+ state: TossupPipelineState,
50
+ confidence_threshold: float,
51
+ method: str,
52
+ tokens_prob: float | None,
53
+ ):
54
+ """Update the buzzer."""
55
+ if tokens_prob and tokens_prob > 1e-5:
56
+ log_prob_thresh = float(np.log(tokens_prob)) if tokens_prob > 0 else None
57
+ else:
58
+ log_prob_thresh = None
59
+ state.workflow.buzzer = state.workflow.buzzer.model_copy(
60
+ update={
61
+ "method": method,
62
+ "confidence_threshold": confidence_threshold,
63
+ "log_prob_threshold": log_prob_thresh,
64
+ }
65
+ )
66
+ Buzzer.model_validate(state.workflow.buzzer)
67
+ return state
68
+
69
+ def update_prob_slider(self, state: TossupPipelineState, answer_var: str, tokens_prob: float | None):
70
+ """Update the probability slider based on the answer variable."""
71
+ if answer_var == UNSELECTED_VAR_NAME:
72
+ return gr.update(interactive=True)
73
+ step_id = answer_var.split(".")[0]
74
+ model_name = state.workflow.steps[step_id].model
75
+ model_config = AVAILABLE_MODELS[model_name]
76
+ is_model_with_logprobs = model_config.get("logprobs", False)
77
+ buzzer = state.workflow.buzzer
78
+ tokens_prob_threshold = tokens_prob if is_model_with_logprobs else None
79
+ state = self.update_buzzer(
80
+ state,
81
+ confidence_threshold=buzzer.confidence_threshold,
82
+ method=buzzer.method,
83
+ tokens_prob=tokens_prob_threshold,
84
+ )
85
+ return state, gr.update(interactive=not is_model_with_logprobs)
86
+
87
+ def _render_output_panel(self, available_variables: list[str], pipeline_state: TossupPipelineState):
88
+ dropdowns = {}
89
+ variable_options = [UNSELECTED_VAR_NAME] + [v for v in available_variables if v not in self.input_variables]
90
+ with gr.Column(elem_classes="step-accordion control-panel"):
91
+ with gr.Row(elem_classes="output-fields-header"):
92
+ gr.Markdown("#### Final output variables mapping:")
93
+ with gr.Row(elem_classes="output-fields-row"):
94
+ for output_field in self.required_output_variables:
95
+ value = pipeline_state.workflow.outputs.get(output_field, UNSELECTED_VAR_NAME)
96
+ dropdown = gr.Dropdown(
97
+ label=output_field,
98
+ value=value,
99
+ choices=variable_options,
100
+ interactive=True,
101
+ elem_classes="output-field-variable",
102
+ # show_label=False,
103
+ )
104
+ dropdown.change(
105
+ self.sm.update_output_variables,
106
+ inputs=[self.pipeline_state, gr.State(output_field), dropdown],
107
+ outputs=[self.pipeline_state],
108
+ )
109
+ dropdowns[output_field] = dropdown
110
+ with gr.Row(elem_classes="output-fields-header"):
111
+ gr.Markdown("#### Buzzer settings:")
112
+ with gr.Row(elem_classes="control-panel"):
113
+ self.confidence_slider = gr.Slider(
114
+ minimum=0.0,
115
+ maximum=1.0,
116
+ value=self.defaults.get("confidence_threshold", 0.85),
117
+ step=0.01,
118
+ label="Confidence Threshold",
119
+ )
120
+ self.buzzer_method_dropdown = gr.Dropdown(
121
+ choices=["AND", "OR"],
122
+ value=self.defaults.get("buzzer_method", "AND"),
123
+ label="Method",
124
+ interactive=True,
125
+ min_width=80,
126
+ scale=0,
127
+ )
128
+ self.prob_slider = gr.Slider(
129
+ value=self.defaults.get("logits_prob", 0.0),
130
+ label="Probability threshold",
131
+ minimum=0.0,
132
+ maximum=1.0,
133
+ step=0.001,
134
+ )
135
+
136
+ def update_choices(available_variables):
137
+ """Update the choices for the dropdowns"""
138
+ return [
139
+ gr.update(choices=available_variables, value=None, selected=None) for dropdown in dropdowns.values()
140
+ ]
141
+
142
+ self.variables_state.change(
143
+ update_choices,
144
+ inputs=[self.variables_state],
145
+ outputs=list(dropdowns.values()),
146
+ )
147
+
148
+ gr.on(
149
+ triggers=[
150
+ self.confidence_slider.input,
151
+ self.buzzer_method_dropdown.input,
152
+ self.prob_slider.input,
153
+ ],
154
+ fn=self.update_buzzer,
155
+ inputs=[
156
+ self.pipeline_state,
157
+ self.confidence_slider,
158
+ self.buzzer_method_dropdown,
159
+ self.prob_slider,
160
+ ],
161
+ outputs=[self.pipeline_state],
162
+ )
163
+
164
+ # TODO: Do Add model step change triggers as well. (Model name change triggers)
165
+ answer_dropdown = dropdowns["answer"]
166
+ if answer_dropdown is not None:
167
+ answer_dropdown.change(
168
+ self.update_prob_slider,
169
+ inputs=[self.pipeline_state, answer_dropdown, self.prob_slider],
170
+ outputs=[self.pipeline_state, self.prob_slider],
171
+ )
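In the panel above, the probability slider holds a plain token probability while `update_buzzer` stores `np.log(tokens_prob)` on the buzzer (or `None` when the slider is at zero or the selected model cannot return logprobs). The sketch below shows, under stated assumptions, how such a buzzer could combine the confidence threshold with the log-probability threshold using the AND/OR method; the `decide` helper is illustrative and not the project's `Buzzer` API.

```python
# Illustrative only: combining a confidence threshold with a token log-prob
# threshold via an AND/OR method. decide() is an assumed helper; only the field
# names (method, confidence_threshold, log_prob_threshold) come from the diff.
import numpy as np


def slider_prob_to_logprob(tokens_prob: float | None) -> float | None:
    """Mirror of update_buzzer's conversion: 0 or None disables the log-prob check."""
    if tokens_prob and tokens_prob > 1e-5:
        return float(np.log(tokens_prob))
    return None


def decide(confidence: float, logprob: float | None,
           confidence_threshold: float, log_prob_threshold: float | None,
           method: str = "AND") -> bool:
    conf_ok = confidence >= confidence_threshold
    if log_prob_threshold is None or logprob is None:
        return conf_ok  # only the confidence check is active
    prob_ok = logprob >= log_prob_threshold
    return (conf_ok and prob_ok) if method == "AND" else (conf_ok or prob_ok)


# Example: buzz only if confidence >= 0.85 AND answer-token probability >= 0.6
thresh = slider_prob_to_logprob(0.6)
print(decide(confidence=0.9, logprob=float(np.log(0.7)),
             confidence_threshold=0.85, log_prob_threshold=thresh, method="AND"))  # True
```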
src/components/quizbowl/tossup.py CHANGED
@@ -8,10 +8,11 @@ from datasets import Dataset
8
  from loguru import logger
9
 
10
  from components.model_pipeline.model_pipeline import PipelineInterface, PipelineState, PipelineUIState
 
11
  from display.formatting import styled_error
12
  from submission import submit
13
- from workflows.qb_agents import QuizBowlTossupAgent
14
- from workflows.structs import ModelStep, Workflow
15
 
16
  from . import commons
17
  from .plotting import (
@@ -26,6 +27,13 @@ from .utils import evaluate_prediction
26
  # TODO: ^^ Same for Bonus
27
 
28
 
 
 
 
 
 
 
 
29
  def add_model_scores(model_outputs: list[dict], clean_answers: list[str], run_indices: list[int]) -> list[dict]:
30
  """Add model scores to the model outputs."""
31
  for output, run_idx in zip(model_outputs, run_indices):
@@ -90,12 +98,12 @@ def process_tossup_results(results: list[dict], top_k_mode: bool = False) -> pd.
90
  )
91
 
92
 
93
- def validate_workflow(workflow: Workflow):
94
  """
95
  Validate that a workflow is properly configured for the tossup task.
96
 
97
  Args:
98
- workflow (Workflow): The workflow to validate
99
 
100
  Raises:
101
  ValueError: If the workflow is not properly configured
@@ -180,40 +188,36 @@ class TossupInterface:
180
  self.output_state = gr.State(value="{}")
181
  self.render()
182
 
183
- def _render_model_interface(self, workflow: Workflow, simple: bool = True):
184
  """Render the model interface."""
185
  with gr.Row():
186
  self.model_selector = commons.get_pipeline_selector([])
187
- self.pipeline_interface = PipelineInterface(
188
  workflow,
189
  simple=simple,
190
  model_options=list(self.model_options.keys()),
 
191
  )
192
- with gr.Row():
193
- self.buzz_t_slider = gr.Slider(
194
- minimum=0.5,
195
- maximum=1.0,
196
- value=self.defaults["buzz_threshold"],
197
- step=0.01,
198
- label="Buzz Threshold",
199
- )
200
- self.early_stop_checkbox = gr.Checkbox(
201
- value=self.defaults["early_stop"],
202
- label="Early Stop",
203
- info="Stop early if already buzzed",
204
- )
205
 
206
  def _render_qb_interface(self):
207
  """Render the quizbowl interface."""
208
  with gr.Row(elem_classes="bonus-header-row form-inline"):
209
  self.qid_selector = commons.get_qid_selector(len(self.ds))
 
 
 
 
 
 
210
  self.run_btn = gr.Button("Run on Tossup Question", variant="secondary")
211
  self.question_display = gr.HTML(label="Question", elem_id="tossup-question-display")
 
212
  with gr.Row():
213
  self.confidence_plot = gr.Plot(
214
  label="Buzz Confidence",
215
  format="webp",
216
  )
 
217
  self.results_table = gr.DataFrame(
218
  label="Model Outputs",
219
  value=pd.DataFrame(columns=["Token Position", "Correct?", "Confidence", "Prediction"]),
@@ -240,7 +244,7 @@ class TossupInterface:
240
  with gr.Row():
241
  # Model Panel
242
  with gr.Column(scale=1):
243
- self._render_model_interface(workflow, simple=self.defaults["simple_workflow"])
244
 
245
  with gr.Column(scale=1):
246
  self._render_qb_interface()
@@ -268,13 +272,15 @@ class TossupInterface:
268
  except Exception as e:
269
  return f"Error loading question: {str(e)}"
270
 
271
- def get_model_outputs(self, example: dict, pipeline_state: PipelineState, buzz_threshold: float, early_stop: bool):
 
 
272
  """Get the model outputs for a given question ID."""
273
  question_runs = []
274
  tokens = example["question"].split()
275
  for run_idx in example["run_indices"]:
276
  question_runs.append(" ".join(tokens[: run_idx + 1]))
277
- agent = QuizBowlTossupAgent(pipeline_state.workflow, buzz_threshold)
278
  outputs = list(agent.run(question_runs, early_stop=early_stop))
279
  outputs = add_model_scores(outputs, example["clean_answers"], example["run_indices"])
280
  return outputs
@@ -297,7 +303,6 @@ class TossupInterface:
297
  self,
298
  question_id: int,
299
  pipeline_state: PipelineState,
300
- buzz_threshold: float,
301
  early_stop: bool = True,
302
  ) -> tuple[str, Any, Any]:
303
  """Run the agent in tossup mode with a system prompt."""
@@ -307,24 +312,34 @@ class TossupInterface:
307
  if not self.ds or question_id < 0 or question_id >= len(self.ds):
308
  return "Invalid question ID or dataset not loaded", None, None
309
  example = self.ds[question_id]
310
- outputs = self.get_model_outputs(example, pipeline_state, buzz_threshold, early_stop)
311
 
312
  # Process results and prepare visualization data
313
  tokens_html, plot_data, output_state = initialize_eval_interface(example, outputs)
314
  df = process_tossup_results(outputs)
 
315
  return (
316
  tokens_html,
317
- gr.update(value=plot_data, label=f"Buzz Confidence on Question {question_id + 1}"),
318
  gr.update(value=output_state),
 
319
  gr.update(value=df, label=f"Model Outputs for Question {question_id + 1}"),
 
 
320
  )
321
  except Exception as e:
322
  import traceback
323
 
324
- error_msg = f"Error: {str(e)}\n{traceback.format_exc()}"
325
- return error_msg, None, None
 
 
 
 
 
 
 
326
 
327
- def evaluate(self, pipeline_state: PipelineState, buzz_threshold: float, progress: gr.Progress = gr.Progress()):
328
  """Evaluate the tossup questions."""
329
  try:
330
  # Validate inputs
@@ -336,7 +351,7 @@ class TossupInterface:
336
  token_positions = []
337
  correctness = []
338
  for example in progress.tqdm(self.ds, desc="Evaluating tossup questions"):
339
- model_outputs = self.get_model_outputs(example, pipeline_state, buzz_threshold, early_stop=True)
340
  if model_outputs[-1]["buzz"]:
341
  buzz_counts += 1
342
  if model_outputs[-1]["score"] == 1:
@@ -355,12 +370,19 @@ class TossupInterface:
355
  )
356
  plot_data = create_scatter_pyplot(token_positions, correctness)
357
  return (
358
- gr.update(value=df, label="Scores on Sample Set"),
359
  gr.update(value=plot_data, label="Buzz Positions on Sample Set"),
 
 
360
  )
361
  except Exception as e:
 
 
362
  logger.exception(f"Error evaluating tossups: {e.args}")
363
- return "Error evaluating tossups", None, None
 
 
 
 
364
 
365
  def submit_model(
366
  self, model_name: str, description: str, pipeline_state: PipelineState, profile: gr.OAuthProfile = None
@@ -401,21 +423,22 @@ class TossupInterface:
401
  inputs=[
402
  self.qid_selector,
403
  self.pipeline_interface.pipeline_state,
404
- self.buzz_t_slider,
405
  self.early_stop_checkbox,
406
  ],
407
  outputs=[
408
  self.question_display,
409
- self.confidence_plot,
410
  self.output_state,
 
411
  self.results_table,
 
 
412
  ],
413
  )
414
 
415
  self.eval_btn.click(
416
  fn=self.evaluate,
417
- inputs=[self.pipeline_interface.pipeline_state, self.buzz_t_slider],
418
- outputs=[self.results_table, self.confidence_plot],
419
  )
420
 
421
  self.submit_btn.click(
 
8
  from loguru import logger
9
 
10
  from components.model_pipeline.model_pipeline import PipelineInterface, PipelineState, PipelineUIState
11
+ from components.model_pipeline.tossup_pipeline import TossupPipelineInterface, TossupPipelineState
12
  from display.formatting import styled_error
13
  from submission import submit
14
+ from workflows.qb_agents import QuizBowlTossupAgent, TossupResult
15
+ from workflows.structs import ModelStep, TossupWorkflow
16
 
17
  from . import commons
18
  from .plotting import (
 
27
  # TODO: ^^ Same for Bonus
28
 
29
 
30
+ class ScoredTossupResult(TossupResult):
31
+ """Result of a tossup question with evaluation score and position."""
32
+
33
+ score: int # Correctness score of the answer
34
+ token_position: int # Position in the question where prediction was made
35
+
36
+
37
  def add_model_scores(model_outputs: list[dict], clean_answers: list[str], run_indices: list[int]) -> list[dict]:
38
  """Add model scores to the model outputs."""
39
  for output, run_idx in zip(model_outputs, run_indices):
 
98
  )
99
 
100
 
101
+ def validate_workflow(workflow: TossupWorkflow):
102
  """
103
  Validate that a workflow is properly configured for the tossup task.
104
 
105
  Args:
106
+ workflow (TossupWorkflow): The workflow to validate
107
 
108
  Raises:
109
  ValueError: If the workflow is not properly configured
 
188
  self.output_state = gr.State(value="{}")
189
  self.render()
190
 
191
+ def _render_pipeline_interface(self, workflow: TossupWorkflow, simple: bool = True):
192
  """Render the model interface."""
193
  with gr.Row():
194
  self.model_selector = commons.get_pipeline_selector([])
195
+ self.pipeline_interface = TossupPipelineInterface(
196
  workflow,
197
  simple=simple,
198
  model_options=list(self.model_options.keys()),
199
+ defaults=self.defaults,
200
  )
 
 
201
 
202
  def _render_qb_interface(self):
203
  """Render the quizbowl interface."""
204
  with gr.Row(elem_classes="bonus-header-row form-inline"):
205
  self.qid_selector = commons.get_qid_selector(len(self.ds))
206
+ self.early_stop_checkbox = gr.Checkbox(
207
+ value=self.defaults["early_stop"],
208
+ label="Early Stop",
209
+ info="Stop if already buzzed",
210
+ scale=0,
211
+ )
212
  self.run_btn = gr.Button("Run on Tossup Question", variant="secondary")
213
  self.question_display = gr.HTML(label="Question", elem_id="tossup-question-display")
214
+ self.error_display = gr.HTML(label="Error", elem_id="tossup-error-display", visible=False)
215
  with gr.Row():
216
  self.confidence_plot = gr.Plot(
217
  label="Buzz Confidence",
218
  format="webp",
219
  )
220
+ self.model_outputs_display = gr.JSON(label="Model Outputs", value="{}", visible=False)
221
  self.results_table = gr.DataFrame(
222
  label="Model Outputs",
223
  value=pd.DataFrame(columns=["Token Position", "Correct?", "Confidence", "Prediction"]),
 
244
  with gr.Row():
245
  # Model Panel
246
  with gr.Column(scale=1):
247
+ self._render_pipeline_interface(workflow, simple=self.defaults["simple_workflow"])
248
 
249
  with gr.Column(scale=1):
250
  self._render_qb_interface()
 
272
  except Exception as e:
273
  return f"Error loading question: {str(e)}"
274
 
275
+ def get_model_outputs(
276
+ self, example: dict, pipeline_state: PipelineState, early_stop: bool
277
+ ) -> list[ScoredTossupResult]:
278
  """Get the model outputs for a given question ID."""
279
  question_runs = []
280
  tokens = example["question"].split()
281
  for run_idx in example["run_indices"]:
282
  question_runs.append(" ".join(tokens[: run_idx + 1]))
283
+ agent = QuizBowlTossupAgent(pipeline_state.workflow)
284
  outputs = list(agent.run(question_runs, early_stop=early_stop))
285
  outputs = add_model_scores(outputs, example["clean_answers"], example["run_indices"])
286
  return outputs
 
303
  self,
304
  question_id: int,
305
  pipeline_state: PipelineState,
 
306
  early_stop: bool = True,
307
  ) -> tuple[str, Any, Any]:
308
  """Run the agent in tossup mode with a system prompt."""
 
312
  if not self.ds or question_id < 0 or question_id >= len(self.ds):
313
  return "Invalid question ID or dataset not loaded", None, None
314
  example = self.ds[question_id]
315
+ outputs = self.get_model_outputs(example, pipeline_state, early_stop)
316
 
317
  # Process results and prepare visualization data
318
  tokens_html, plot_data, output_state = initialize_eval_interface(example, outputs)
319
  df = process_tossup_results(outputs)
320
+ step_outputs = [output["step_outputs"] for output in outputs]
321
  return (
322
  tokens_html,
 
323
  gr.update(value=output_state),
324
+ gr.update(value=plot_data, label=f"Buzz Confidence on Question {question_id + 1}"),
325
  gr.update(value=df, label=f"Model Outputs for Question {question_id + 1}"),
326
+ gr.update(value=step_outputs, label=f"Step Outputs for Question {question_id + 1}", visible=True),
327
+ gr.update(visible=False),
328
  )
329
  except Exception as e:
330
  import traceback
331
 
332
+ error_msg = styled_error(f"Error: {str(e)}\n{traceback.format_exc()}")
333
+ return (
334
+ gr.skip(),
335
+ gr.skip(),
336
+ gr.skip(),
337
+ gr.skip(),
338
+ gr.update(visible=False),
339
+ gr.update(visible=True, value=error_msg),
340
+ )
341
 
342
+ def evaluate(self, pipeline_state: PipelineState, progress: gr.Progress = gr.Progress()):
343
  """Evaluate the tossup questions."""
344
  try:
345
  # Validate inputs
 
351
  token_positions = []
352
  correctness = []
353
  for example in progress.tqdm(self.ds, desc="Evaluating tossup questions"):
354
+ model_outputs = self.get_model_outputs(example, pipeline_state, early_stop=True)
355
  if model_outputs[-1]["buzz"]:
356
  buzz_counts += 1
357
  if model_outputs[-1]["score"] == 1:
 
370
  )
371
  plot_data = create_scatter_pyplot(token_positions, correctness)
372
  return (
 
373
  gr.update(value=plot_data, label="Buzz Positions on Sample Set"),
374
+ gr.update(value=df, label="Scores on Sample Set"),
375
+ gr.update(visible=False),
376
  )
377
  except Exception as e:
378
+ import traceback
379
+
380
  logger.exception(f"Error evaluating tossups: {e.args}")
381
+ return (
382
+ gr.skip(),
383
+ gr.skip(),
384
+ gr.update(visible=True, value=styled_error(f"Error: {str(e)}\n{traceback.format_exc()}")),
385
+ )
386
 
387
  def submit_model(
388
  self, model_name: str, description: str, pipeline_state: PipelineState, profile: gr.OAuthProfile = None
 
423
  inputs=[
424
  self.qid_selector,
425
  self.pipeline_interface.pipeline_state,
 
426
  self.early_stop_checkbox,
427
  ],
428
  outputs=[
429
  self.question_display,
 
430
  self.output_state,
431
+ self.confidence_plot,
432
  self.results_table,
433
+ self.model_outputs_display,
434
+ self.error_display,
435
  ],
436
  )
437
 
438
  self.eval_btn.click(
439
  fn=self.evaluate,
440
+ inputs=[self.pipeline_interface.pipeline_state],
441
+ outputs=[self.confidence_plot, self.results_table, self.error_display],
442
  )
443
 
444
  self.submit_btn.click(
src/workflows/README.md CHANGED
@@ -12,25 +12,44 @@ The workflows subpackage enables the creation and execution of workflows where m
12
 
13
  Contains the core data structures used throughout the workflow system:
14
 
15
- - `Field`: Represents an input or output field with name and type information
 
16
  - `ModelStep`: Represents a single step in a workflow with input fields, output fields, and model details
17
  - `Workflow`: A collection of ModelSteps with their identifiers
 
 
 
 
 
 
 
 
 
18
 
19
  ### `utils.py`
20
 
21
  Provides utility functions for workflow operations:
22
 
23
- - `_create_variable_step_mapping`: Maps variables to the steps that produce them
24
  - `create_dependency_graph`: Builds a dependency graph representing the execution order constraints
25
  - `topological_sort`: Sorts steps in execution order based on their dependencies
 
26
 
27
- ### `workflow_executor.py`
28
 
29
  Handles the execution of workflows:
30
 
31
- - Processes inputs and outputs between steps
32
- - Coordinates the execution of model steps in the correct order
33
- - Integrates with external model providers (e.g., via litellm)
 
 
 
 
 
 
 
 
 
34
 
35
  ### `errors.py`
36
 
@@ -43,36 +62,50 @@ Defines custom exceptions for workflow-related errors:
43
  ## Usage Example
44
 
45
  ```python
46
- from workflows.structs import Field, ModelStep, Workflow
47
 
48
  # Define a workflow with two steps
49
  step1 = ModelStep(
50
- input_fields=[Field(name="query", type="string")],
51
- output_fields=[Field(name="summary", type="string")],
52
- model="gpt-3.5-turbo",
53
- system_prompt="Summarize the following text"
 
 
 
54
  )
55
 
56
  step2 = ModelStep(
57
- input_fields=[Field(name="summary", type="string", variable="step1.summary")],
58
- output_fields=[Field(name="key_points", type="array")],
59
- model="gpt-4",
60
- system_prompt="Extract key points from the summary"
 
 
 
61
  )
62
 
63
- workflow = Workflow(steps={"step1": step1, "step2": step2})
 
 
 
 
64
 
65
  # Execute the workflow
66
- from workflows.workflow_executor import execute_workflow
67
 
68
  result = execute_workflow(
69
  workflow=workflow,
70
- input_values={"query": "Long text to summarize..."}
 
 
71
  )
72
 
73
  # Access results
74
- summary = result["step1.summary"]
75
- key_points = result["step2.key_points"]
 
 
76
  ```
77
 
78
  ## Error Handling
@@ -82,6 +115,8 @@ The workflows system provides robust error handling:
82
  - Detects cyclic dependencies in workflow definitions
83
  - Validates input/output variable references
84
  - Ensures all required inputs are provided
 
 
85
 
86
  ## Extending the Workflows System
87
 
@@ -89,4 +124,6 @@ To extend the workflows system:
89
 
90
  1. Add new model step types by extending the `ModelStep` class
91
  2. Create custom field types by extending validation in the execution logic
92
- 3. Implement additional error types in `errors.py` for specialized error handling
 
 
 
12
 
13
  Contains the core data structures used throughout the workflow system:
14
 
15
+ - `InputField`: Represents an input field with name, description, and variable reference
16
+ - `OutputField`: Represents an output field with name, type, and description
17
  - `ModelStep`: Represents a single step in a workflow with input fields, output fields, and model details
18
  - `Workflow`: A collection of ModelSteps with their identifiers
19
+ - `TossupWorkflow`: Specialized workflow for quizbowl tossup questions with buzzing capability
20
+
21
+ ### `configs.py`
22
+
23
+ Provides configuration settings and constants:
24
+
25
+ - `AVAILABLE_MODELS`: Supported model configurations from various providers
26
+ - `TYPE_MAP`: Mapping of supported field types to Python types
27
+ - `FUNCTION_MAP`: Built-in transformation functions for input/output processing
28
 
29
  ### `utils.py`
30
 
31
  Provides utility functions for workflow operations:
32
 
 
33
  - `create_dependency_graph`: Builds a dependency graph representing the execution order constraints
34
  - `topological_sort`: Sorts steps in execution order based on their dependencies
35
+ - `detect_cycles`: Identifies cyclic dependencies in workflow definitions
36
 
37
+ ### `executors.py`
38
 
39
  Handles the execution of workflows:
40
 
41
+ - `execute_model_step`: Executes a single model step with input processing and output collection
42
+ - `execute_simple_workflow`: Handles single-step workflows
43
+ - `execute_multi_step_workflow`: Manages multi-step workflows with dependency resolution
44
+ - `execute_workflow`: Main entry point that routes to appropriate executor based on workflow complexity
45
+
46
+ ### `validators.py`
47
+
48
+ Provides workflow validation functionality:
49
+
50
+ - `ValidationErrorType`: Enumeration of possible validation error types
51
+ - `WorkflowValidationError`: Base class for validation errors
52
+ - Validation functions for steps, DAGs, variables, and types
53
 
54
  ### `errors.py`
55
 
 
62
  ## Usage Example
63
 
64
  ```python
65
+ from workflows.structs import InputField, ModelStep, OutputField, Workflow
66
 
67
  # Define a workflow with two steps
68
  step1 = ModelStep(
69
+ id="step1",
70
+ model="gpt-4o-mini",
71
+ provider="OpenAI",
72
+ call_type="llm",
73
+ system_prompt="Step1 processing",
74
+ input_fields=[InputField(name="value", description="Input value", variable="input.value")],
75
+ output_fields=[OutputField(name="result", description="Processed result", type="str", func="upper")],
76
  )
77
 
78
  step2 = ModelStep(
79
+ id="step2",
80
+ model="gpt-4o-mini",
81
+ provider="OpenAI",
82
+ call_type="llm",
83
+ system_prompt="Step2 processing",
84
+ input_fields=[InputField(name="result", description="Result from step1", variable="step1.result")],
85
+ output_fields=[OutputField(name="final", description="Final output", type="str", func="lower")],
86
  )
87
 
88
+ workflow = Workflow(
89
+ steps={"step1": step1, "step2": step2},
90
+ inputs=["input.value"],
91
+ outputs={"final": "step2.final"}
92
+ )
93
 
94
  # Execute the workflow
95
+ from workflows.executors import execute_workflow
96
 
97
  result = execute_workflow(
98
  workflow=workflow,
99
+ input_values={"input.value": "Hello, World!"},
100
+ return_full_content=True,
101
+ logprob_step="step2"
102
  )
103
 
104
  # Access results
105
+ final_output = result["final_outputs"]["final"]
106
+ intermediate_results = result["intermediate_outputs"]
107
+ step_contents = result["step_contents"]
108
+ logprob = result["logprob"]
109
  ```
110
 
111
  ## Error Handling
 
115
  - Detects cyclic dependencies in workflow definitions
116
  - Validates input/output variable references
117
  - Ensures all required inputs are provided
118
+ - Supports custom validation rules through the validation system
119
+ - Provides detailed error messages for debugging
120
 
121
  ## Extending the Workflows System
122
 
 
124
 
125
  1. Add new model step types by extending the `ModelStep` class
126
  2. Create custom field types by extending validation in the execution logic
127
+ 3. Implement additional error types in `errors.py` for specialized error handling
128
+ 4. Add new transformation functions to `FUNCTION_MAP` in `configs.py`
129
+ 5. Create specialized workflow types by extending the `Workflow` class
src/workflows/configs.py ADDED
@@ -0,0 +1,51 @@
1
+ """
2
+ Configuration settings for the workflows package.
3
+
4
+ This module contains configuration settings and constants used across the workflows package,
5
+ including model configurations, workflow settings, and other package-wide constants.
6
+ """
7
+
8
+ AVAILABLE_MODELS = {
9
+ "OpenAI/gpt-4o": {
10
+ "model": "gpt-4o-2024-11-20",
11
+ },
12
+ "OpenAI/gpt-4o-mini": {
13
+ "model": "gpt-4o-mini-2024-07-18",
14
+ },
15
+ "OpenAI/gpt-3.5-turbo": {
16
+ "model": "gpt-3.5-turbo-0125",
17
+ },
18
+ "Anthropic/claude-3-7-sonnet": {
19
+ "model": "claude-3-7-sonnet-20250219",
20
+ },
21
+ "Anthropic/claude-3-5-sonnet": {
22
+ "model": "claude-3-5-sonnet-20241022",
23
+ },
24
+ "Anthropic/claude-3-5-haiku": {
25
+ "model": "claude-3-5-haiku-20241022",
26
+ },
27
+ "Cohere/command-r": {
28
+ "model": "command-r-08-2024",
29
+ },
30
+ "Cohere/command-r-plus": {
31
+ "model": "command-r-plus-08-2024",
32
+ },
33
+ "Cohere/command-r7b": {
34
+ "model": "command-r7b-12-2024",
35
+ },
36
+ }
37
+
38
+ # Function mapping for input/output transformations
39
+ TYPE_MAP = {
40
+ "str": str,
41
+ "int": int,
42
+ "float": float,
43
+ "bool": bool,
44
+ }
45
+
46
+ FUNCTION_MAP = {
47
+ "upper": str.upper,
48
+ "lower": str.lower,
49
+ "len": len,
50
+ "split": str.split,
51
+ }
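The new `configs.py` centralizes the lookups the executors use when coercing and transforming step outputs. A minimal usage sketch follows; the `apply_output_field` helper is an illustrative assumption, not the executor's actual API.

```python
# Minimal sketch of using TYPE_MAP / FUNCTION_MAP to post-process a field value.
# apply_output_field is an illustrative helper, not the project's executor API.
TYPE_MAP = {"str": str, "int": int, "float": float, "bool": bool}
FUNCTION_MAP = {"upper": str.upper, "lower": str.lower, "len": len, "split": str.split}


def apply_output_field(value, type_str: str = "str", func: str | None = None):
    if func is not None:
        value = FUNCTION_MAP[func](value)      # e.g. "upper" -> str.upper
    return TYPE_MAP.get(type_str, str)(value)  # coerce to the declared type


print(apply_output_field("Hello, World!", type_str="str", func="upper"))  # HELLO, WORLD!
print(apply_output_field("42", type_str="int"))                            # 42
```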
src/workflows/executors.py CHANGED
@@ -1,13 +1,3 @@
1
- # %%
2
- from typing import Any
3
-
4
- import pydantic
5
-
6
- from llms import completion
7
- from workflows.errors import WorkflowError
8
- from workflows.structs import InputField, ModelStep, OutputField, Workflow
9
- from workflows.utils import create_dependency_graph, topological_sort
10
-
11
  """
12
  Core workflow execution functionality.
13
 
@@ -18,42 +8,48 @@ with the litellm library to handle model interactions.
18
  Key components:
19
  - Utility functions for input/output transformation
20
  - Input processing and validation
21
- - Model step execution
22
  - Complete workflow execution with dependency resolution
 
 
23
 
24
  The module orchestrates the execution of steps in the correct order based on their
25
- dependencies and manages the flow of data between steps.
 
 
 
 
26
  """
27
 
 
28
 
29
- def upper(x):
30
- if isinstance(x, str):
31
- return x.upper()
32
- return x
33
 
 
 
 
 
 
34
 
35
- def lower(x):
36
- if isinstance(x, str):
37
- return x.lower()
38
- return x
39
 
 
 
 
40
 
41
- TYPE_MAP = {
42
- "str": str,
43
- "int": int,
44
- "float": float,
45
- "bool": bool,
46
- }
47
 
48
- FUNCTION_MAP = {
49
- "upper": upper,
50
- "lower": lower,
51
- "len": len,
52
- "split": str.split,
53
- }
54
 
 
 
55
 
56
- def get_type(type_str: str) -> type:
 
 
 
57
  return TYPE_MAP.get(type_str, eval(type_str))
58
 
59
 
@@ -95,10 +91,72 @@ def create_processed_inputs(model_step: ModelStep, available_vars: dict[str, Any
95
  return processed_inputs
96
 
97
 
98
  # %%
99
  def execute_model_step(
100
- model_step: ModelStep, available_vars: dict[str, Any], return_full_content: bool = False
101
- ) -> dict[str, Any] | tuple[dict[str, Any], str]:
 
 
 
102
  """
103
  Executes a model step using the provided available variables.
104
 
@@ -117,10 +175,14 @@ def execute_model_step(
117
  input/output specifications, and system prompt.
118
  available_vars (dict[str, Any]): A dictionary of all variables available to this step,
119
  including outputs from previous steps and external inputs.
 
 
 
 
120
 
121
  Returns:
122
- dict[str, Any]: A dictionary of processed outputs from the model step,
123
- with keys matching the output field names.
124
 
125
  Raises:
126
  WorkflowError: If there's an error in input processing, model execution,
@@ -136,8 +198,8 @@ def execute_model_step(
136
  ... input_fields=[InputField(name="text", variable="input_text", description="Text to summarize")],
137
  ... output_fields=[OutputField(name="summary", type="str", description="Summary of the text")]
138
  ... )
139
- >>> execute_model_step(step, {"input_text": "Long text to be summarized..."})
140
- {"summary": "A concise summary of the text."}
141
  """
142
  # Ensure inputs are processed using the specified functions in input_fields.
143
  processed_inputs = create_processed_inputs(model_step, available_vars)
@@ -159,28 +221,25 @@ def execute_model_step(
159
  system=model_step.system_prompt,
160
  prompt=step_result,
161
  response_format=ModelResponse,
 
162
  )
163
- # api_response = litellm.completion(
164
- # model=model_step.model,
165
- # messages=[{"role": "user", "content": step_result}],
166
- # response_format=ModelResponse,
167
- # )
168
-
169
- # Extract and parse the model response
170
- # model_response_content = api_response["choices"][0]["message"]["content"]
171
- # model_response = json.loads(model_response_content)
172
- model_response = api_response["output"]
173
  # Map the parsed response to the output fields
174
- outputs = {field.name: model_response[field.name] for field in model_step.output_fields}
 
175
  if return_full_content:
176
- return outputs, api_response["content"]
177
- return outputs
 
 
178
 
179
 
180
- # %%
181
  def execute_multi_step_workflow(
182
- workflow: Workflow, input_values: dict[str, Any], return_full_content: bool = False
183
- ) -> tuple[dict[str, Any], dict[str, Any], dict[str, Any]]:
 
 
 
184
  """
185
  Execute the given workflow as a computational graph.
186
 
@@ -203,12 +262,11 @@ def execute_multi_step_workflow(
203
  Keys should match the required workflow.inputs.
204
  return_full_content (bool, optional): If True, returns the full content of each step.
205
  Defaults to False.
 
 
206
 
207
  Returns:
208
- A tuple containing:
209
- - A dictionary of the workflow's outputs, with keys matching the variables defined in workflow.outputs.
210
- - A dictionary of all computed values during workflow execution, including intermediate results.
211
- - A dictionary of step contents, only populated if return_full_content is True.
212
 
213
  Raises:
214
  UnknownVariableError: If an input_field references a variable that is not
@@ -252,49 +310,75 @@ def execute_multi_step_workflow(
252
 
253
  # Step 4: Execute steps in topological order.
254
  step_contents: dict[str, Any] = {}
 
255
  for step_id in execution_order:
256
  step = workflow.steps[step_id]
257
-
258
- outputs = execute_model_step(step, computed_values, return_full_content=return_full_content)
259
  # Execute the step
 
 
 
 
 
260
  if return_full_content:
261
- outputs, content = outputs
262
- step_contents[step_id] = content
263
- outputs = {f"{step_id}.{k}": v for k, v in outputs.items()}
264
  computed_values.update(outputs)
265
 
266
  # Step 5: Gather and return workflow outputs.
267
  final_outputs: dict[str, Any] = {}
268
  for target, var in workflow.outputs.items():
269
  if var not in computed_values:
270
- raise WorkflowError(f"Workflow output variable {var} was not produced")
 
 
271
  final_outputs[target] = computed_values[var]
272
 
273
- step_outputs = {k: v for k, v in computed_values.items() if k not in workflow.inputs}
274
-
275
- return final_outputs, step_outputs, step_contents
 
 
 
276
 
277
 
278
  def execute_simple_workflow(
279
- workflow: Workflow, input_values: dict[str, Any], return_full_content: bool = False
280
- ) -> tuple[dict[str, Any], dict[str, Any], dict[str, Any]]:
281
- """Execute a simple workflow with a single step.
 
 
 
 
282
 
283
- This is a simplified version of execute_workflow for workflows with only one step.
 
 
284
 
285
  Args:
286
- workflow: The workflow to execute
287
- input_values: Dictionary of input values
288
- return_full_content: Whether to return the full content of each step
 
 
 
 
289
 
290
  Returns:
291
- Tuple containing:
292
- - final_outputs: Dictionary of workflow outputs
293
- - computed_values: Dictionary of all computed values
294
- - step_contents: Dictionary of step contents (if return_full_content=True)
295
 
296
  Raises:
297
- WorkflowError: If the workflow has more than one step
 
 
 
 
 
 
 
 
 
298
  """
299
  if len(workflow.steps) != 1:
300
  raise WorkflowError("Simple workflow must have exactly one step")
@@ -302,19 +386,17 @@ def execute_simple_workflow(
302
  # Get the single step
303
  step = list(workflow.steps.values())[0]
304
 
 
 
305
  # Validate inputs
306
  for var in workflow.inputs:
307
  if var not in input_values:
308
  raise WorkflowError(f"Missing required workflow input: {var}")
309
 
310
  # Execute the step
311
- if return_full_content:
312
- step_outputs, content = execute_model_step(step, input_values, return_full_content=True)
313
- step_contents = {step.id: content}
314
- else:
315
- step_outputs = execute_model_step(step, input_values, return_full_content=False)
316
- step_contents = {}
317
-
318
  # Prepare the final outputs
319
  final_outputs = {}
320
  for target, var in workflow.outputs.items():
@@ -328,27 +410,101 @@ def execute_simple_workflow(
328
  raise WorkflowError(f"Invalid output mapping: {var} does not match step ID {step.id}")
329
 
330
  # Prepare computed values (prefixed with step ID)
331
- computed_values = input_values.copy()
332
- computed_values.update({f"{step.id}.{k}": v for k, v in step_outputs.items()})
333
 
334
- return final_outputs, computed_values, step_contents
 
 
 
 
 
335
 
336
 
337
  def execute_workflow(
338
- workflow: Workflow, input_values: dict[str, Any], return_full_content: bool = False
339
- ) -> tuple[dict[str, Any], dict[str, Any], dict[str, Any]]:
340
  if len(workflow.steps) > 1:
341
- return execute_multi_step_workflow(workflow, input_values, return_full_content)
342
  else:
343
- return execute_simple_workflow(workflow, input_values, return_full_content)
344
 
345
 
346
  def run_examples():
347
  """
348
- Runs three example workflows demonstrating:
349
- 1. A successful (linear) workflow execution.
350
- 2. A cyclic dependency error.
351
- 3. An unknown variable dependency error.
352
  """
353
  print("Example 1: Successful Workflow Execution")
354
  # Example 1: Simple linear workflow.
 
1
  """
2
  Core workflow execution functionality.
3
 
 
8
  Key components:
9
  - Utility functions for input/output transformation
10
  - Input processing and validation
11
+ - Model step execution with support for log probabilities
12
  - Complete workflow execution with dependency resolution
13
+ - Support for both simple (single-step) and multi-step workflows
14
+ - Structured output collection with intermediate results
15
 
16
  The module orchestrates the execution of steps in the correct order based on their
17
+ dependencies and manages the flow of data between steps. It supports:
18
+ - Full content tracking for debugging
19
+ - Log probability calculation for specific steps
20
+ - Flexible input/output transformations
21
+ - Error handling and validation
22
  """
23
 
24
+ from typing import Any, TypedDict
25
 
26
+ import pydantic
 
 
 
27
 
28
+ from .configs import FUNCTION_MAP, TYPE_MAP
29
+ from .errors import WorkflowError
30
+ from .llms import completion
31
+ from .structs import InputField, ModelStep, OutputField, Workflow
32
+ from .utils import create_dependency_graph, topological_sort
33
 
 
 
 
 
34
 
35
+ def get_type(type_str: str) -> type:
36
+ """
37
+ Converts a type string to its corresponding Python type.
38
 
39
+ This function maps type strings to their actual Python type objects. It first checks
40
+ the TYPE_MAP dictionary for predefined mappings, and if not found, falls back to
41
+ evaluating the type string directly.
 
 
 
42
 
43
+ Args:
44
+ type_str (str): A string representation of a type (e.g., "str", "int", "list[str]")
 
 
 
 
45
 
46
+ Returns:
47
+ type: The corresponding Python type object
48
 
49
+ Note:
50
+ Uses eval() for non-predefined types, which has security implications if used
51
+ with untrusted input. This is intended for internal use with validated type strings.
52
+ """
53
  return TYPE_MAP.get(type_str, eval(type_str))
54
 
55
 
 
91
  return processed_inputs
92
 
93
 
94
+ class ModelStepResult(TypedDict):
95
+ """
96
+ Result of executing a model step.
97
+
98
+ This TypedDict contains the outputs and metadata from executing a single model step,
99
+ including the processed output values, the full response content, and log probability
100
+ information when requested.
101
+
102
+ Attributes:
103
+ outputs (dict[str, Any]): A dictionary of processed outputs from the model step,
104
+ with keys matching the output field names.
105
+ content (str | None): The full content of the model's response, only populated
106
+ if return_full_content is True.
107
+ logprob (float | None): The log probability of the model step output, only populated
108
+ if logprobs is True.
109
+ """
110
+
111
+ # A dictionary of processed outputs from the model step,
112
+ # with keys matching the output field names.
113
+ outputs: dict[str, Any]
114
+
115
+ # The full content of the model step.
116
+ content: str | None
117
+
118
+ # The log probability of the model step output if requested.
119
+ logprob: float | None
120
+
121
+
122
+ class WorkflowOutput(TypedDict):
123
+ """
124
+ Result of executing a complete workflow.
125
+
126
+ This TypedDict contains the outputs and metadata from executing a workflow,
127
+ including final outputs, intermediate values, step contents, and log probabilities.
128
+
129
+ Attributes:
130
+ final_outputs (dict[str, Any]): The final output values produced by the workflow,
131
+ with keys matching the names defined in workflow.outputs.
132
+ intermediate_outputs (dict[str, Any]): All computed values during workflow execution,
133
+ including both external inputs and outputs from all steps.
134
+ step_contents (dict[str, Any]): Full response content for each step, keyed by step ID.
135
+ Only populated if return_full_content is True.
136
+ logprob (float | None): The log probability of the specified step's output.
137
+ Only populated if logprob_step is specified.
138
+ """
139
+
140
+ # A dictionary of the workflow's outputs, with keys matching the variables defined in workflow.outputs.
141
+ final_outputs: dict[str, Any]
142
+
143
+ # A dictionary of all computed values during workflow execution, including intermediate results.
144
+ intermediate_outputs: dict[str, Any]
145
+
146
+ # A dictionary of step contents, only populated if return_full_content is True.
147
+ step_contents: dict[str, Any]
148
+
149
+ # The log probability of the workflow output if requested.
150
+ logprob: float | None
151
+
152
+
153
  # %%
154
  def execute_model_step(
155
+ model_step: ModelStep,
156
+ available_vars: dict[str, Any],
157
+ return_full_content: bool = False,
158
+ logprobs: bool = False,
159
+ ) -> ModelStepResult:
160
  """
161
  Executes a model step using the provided available variables.
162
 
 
175
  input/output specifications, and system prompt.
176
  available_vars (dict[str, Any]): A dictionary of all variables available to this step,
177
  including outputs from previous steps and external inputs.
178
+ return_full_content (bool, optional): If True, includes the full model response content
179
+ in the result. Defaults to False.
180
+ logprobs (bool, optional): If True, calculates and returns log probability information
181
+ for the model response. Defaults to False.
182
 
183
  Returns:
184
+ ModelStepResult: A TypedDict containing processed outputs, optional full content,
185
+ and optional log probability information.
186
 
187
  Raises:
188
  WorkflowError: If there's an error in input processing, model execution,
 
198
  ... input_fields=[InputField(name="text", variable="input_text", description="Text to summarize")],
199
  ... output_fields=[OutputField(name="summary", type="str", description="Summary of the text")]
200
  ... )
201
+ >>> result = execute_model_step(step, {"input_text": "Long text to be summarized..."})
202
+ >>> summary = result["outputs"]["summary"]
203
  """
204
  # Ensure inputs are processed using the specified functions in input_fields.
205
  processed_inputs = create_processed_inputs(model_step, available_vars)
 
221
  system=model_step.system_prompt,
222
  prompt=step_result,
223
  response_format=ModelResponse,
224
+ logprobs=logprobs,
225
  )
226
+
  # Map the parsed response to the output fields
228
+ outputs = {field.name: api_response["output"][field.name] for field in model_step.output_fields}
229
+ result = ModelStepResult(outputs=outputs, content=None, logprob=None)
230
  if return_full_content:
231
+ result["content"] = api_response["content"]
232
+ if logprobs:
233
+ result["logprob"] = api_response["log_prob"]
234
+ return result
235
 
236
 
 
237
  def execute_multi_step_workflow(
238
+ workflow: Workflow,
239
+ input_values: dict[str, Any],
240
+ return_full_content: bool = False,
241
+ logprob_step: str | None = None,
242
+ ) -> WorkflowOutput:
243
  """
244
  Execute the given workflow as a computational graph.
245
 
 
262
  Keys should match the required workflow.inputs.
263
  return_full_content (bool, optional): If True, returns the full content of each step.
264
  Defaults to False.
265
+ logprob_step (str, optional): The ID of the step to use for log probability calculation.
266
+ Defaults to None.
267
 
268
  Returns:
269
+ WorkflowOutput: A dictionary of workflow outputs, including final outputs, intermediate outputs, and step contents.
 
 
 
270
 
271
  Raises:
272
  UnknownVariableError: If an input_field references a variable that is not
 
310
 
311
  # Step 4: Execute steps in topological order.
312
  step_contents: dict[str, Any] = {}
313
+ logprob = None
314
  for step_id in execution_order:
315
  step = workflow.steps[step_id]
316
+ return_logprobs = logprob_step == step_id
 
317
  # Execute the step
318
+ result = execute_model_step(
319
+ step, computed_values, return_full_content=return_full_content, logprobs=return_logprobs
320
+ )
321
+ if return_logprobs:
322
+ logprob = result["logprob"]
323
  if return_full_content:
324
+ step_contents[step_id] = result["content"]
325
+ outputs = {f"{step_id}.{k}": v for k, v in result["outputs"].items()}
 
326
  computed_values.update(outputs)
327
 
328
  # Step 5: Gather and return workflow outputs.
329
  final_outputs: dict[str, Any] = {}
330
  for target, var in workflow.outputs.items():
331
  if var not in computed_values:
332
+ raise WorkflowError(
333
+ f"Workflow output variable {var} was not produced. Computed values: {computed_values.keys()}"
334
+ )
335
  final_outputs[target] = computed_values[var]
336
 
337
+ return WorkflowOutput(
338
+ final_outputs=final_outputs,
339
+ intermediate_outputs=computed_values,
340
+ step_contents=step_contents,
341
+ logprob=logprob,
342
+ )
343
 
344
 
345
  def execute_simple_workflow(
346
+ workflow: Workflow,
347
+ input_values: dict[str, Any],
348
+ return_full_content: bool = False,
349
+ logprob_step: bool | str = False,
350
+ ) -> WorkflowOutput:
351
+ """
352
+ Execute a simple workflow with a single step.
353
 
354
+ This is an optimized version of workflow execution for workflows containing only one step.
355
+ It bypasses the dependency graph building and topological sorting steps, providing a more
356
+ direct execution path for simple workflows.
357
 
358
  Args:
359
+ workflow (Workflow): The workflow to execute, which must contain exactly one step.
360
+ input_values (dict[str, Any]): External input values to be used by the workflow.
361
+ Keys should match the required workflow.inputs.
362
+ return_full_content (bool, optional): If True, includes the full model response content
363
+ in the result. Defaults to False.
364
+ logprobs (bool, optional): If True, calculates and returns log probability information
365
+ for the model response. Defaults to False.
366
 
367
  Returns:
368
+ WorkflowOutput: A TypedDict containing the workflow outputs, intermediate values,
369
+ optional step contents, and optional log probability information.
 
 
370
 
371
  Raises:
372
+ WorkflowError: If the workflow has more than one step or if required inputs are missing.
373
+
374
+ Example:
375
+ >>> workflow = Workflow(
376
+ ... steps={"extract": ModelStep(...)},
377
+ ... inputs=["text"],
378
+ ... outputs={"entities": "extract.entities"}
379
+ ... )
380
+ >>> result = execute_simple_workflow(workflow, {"text": "Apple is launching a new product."})
381
+ >>> entities = result["final_outputs"]["entities"]
382
  """
383
  if len(workflow.steps) != 1:
384
  raise WorkflowError("Simple workflow must have exactly one step")
 
386
  # Get the single step
387
  step = list(workflow.steps.values())[0]
388
 
389
+ logprobs = logprob_step is True or logprob_step == step.id
390
+
391
  # Validate inputs
392
  for var in workflow.inputs:
393
  if var not in input_values:
394
  raise WorkflowError(f"Missing required workflow input: {var}")
395
 
396
  # Execute the step
397
+ step_result = execute_model_step(step, input_values, return_full_content=return_full_content, logprobs=logprobs)
398
+ step_outputs = step_result["outputs"]
399
+ step_contents = {step.id: step_result["content"]} if return_full_content else {}
 
 
 
 
400
  # Prepare the final outputs
401
  final_outputs = {}
402
  for target, var in workflow.outputs.items():
 
410
  raise WorkflowError(f"Invalid output mapping: {var} does not match step ID {step.id}")
411
 
412
  # Prepare computed values (prefixed with step ID)
413
+ computed_values = input_values | {f"{step.id}.{k}": v for k, v in step_outputs.items()}
 
414
 
415
+ return WorkflowOutput(
416
+ final_outputs=final_outputs,
417
+ intermediate_outputs=computed_values,
418
+ step_contents=step_contents,
419
+ logprob=step_result.get("logprob"),
420
+ )
421
 
422
 
423
  def execute_workflow(
424
+ workflow: Workflow,
425
+ input_values: dict[str, Any],
426
+ return_full_content: bool = False,
427
+ logprob_step: str | bool = False,
428
+ ) -> WorkflowOutput:
429
+ """
430
+ Main entry point for executing workflows of any complexity.
431
+
432
+ This function serves as a router that delegates to the appropriate specialized
433
+ execution function based on the complexity of the workflow:
434
+ - For single-step workflows, it calls execute_simple_workflow
435
+ - For multi-step workflows, it calls execute_multi_step_workflow
436
+
437
+ This abstraction allows callers to use a consistent interface regardless of
438
+ the workflow's complexity.
439
+
440
+ Args:
441
+ workflow (Workflow): The workflow to execute, containing steps, their
442
+ dependencies, and input/output specifications.
443
+ input_values (dict[str, Any]): External input values to be used by the workflow.
444
+ Keys should match the required workflow.inputs.
445
+ return_full_content (bool, optional): If True, includes the full model response
446
+ content in the result. Defaults to False.
447
+ logprob_step (str | bool, optional): Either a string with the ID of the step for which
448
+ to calculate log probability, or a boolean flag.
449
+ If False, no log probabilities are calculated.
450
+ Defaults to False.
451
+
452
+ Returns:
453
+ WorkflowOutput: A TypedDict containing the workflow outputs, intermediate values,
454
+ optional step contents, and optional log probability information.
455
+
456
+ Raises:
457
+ WorkflowError: For any workflow-related errors, such as missing required inputs,
458
+ circular dependencies, or invalid variable references.
459
+
460
+ Example:
461
+ >>> workflow = Workflow(
462
+ ... steps={"extract": ModelStep(...), "analyze": ModelStep(...)},
463
+ ... inputs=["text"],
464
+ ... outputs={"sentiment": "analyze.sentiment"}
465
+ ... )
466
+ >>> result = execute_workflow(
467
+ ... workflow,
468
+ ... {"text": "Apple is launching a new product."},
469
+ ... return_full_content=True,
470
+ ... logprob_step="analyze"
471
+ ... )
472
+ >>> print(result["final_outputs"]["sentiment"])
473
+ "positive"
474
+ """
475
  if len(workflow.steps) > 1:
476
+ return execute_multi_step_workflow(workflow, input_values, return_full_content, logprob_step)
477
  else:
478
+ return execute_simple_workflow(workflow, input_values, return_full_content, logprob_step)
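The two executors and this dispatcher can be exercised end to end. The sketch below is illustrative only: it assumes the package imports as `workflows` (as the tests in this commit do), that provider credentials are configured, and that the selected provider reports log probabilities; the model and question strings are made-up examples.

```python
# Minimal sketch of the logprob_step plumbing on a single-step workflow.
from workflows.executors import execute_workflow
from workflows.structs import CallType, InputField, ModelStep, OutputField, Workflow

step = ModelStep(
    id="guess",
    name="Guess Answer",
    model="gpt-4o-mini",
    provider="OpenAI",
    call_type=CallType.LLM,
    temperature=0.2,
    system_prompt="Answer the question in a few words.",
    input_fields=[InputField(name="question", description="Question text", variable="question_text")],
    output_fields=[OutputField(name="answer", description="Best guess", type="str")],
)
workflow = Workflow(steps={"guess": step}, inputs=["question_text"], outputs={"answer": "guess.answer"})

# logprob_step="guess" asks the executor to return this step's response log probability.
result = execute_workflow(workflow, {"question_text": "Who wrote Don Quixote?"}, logprob_step="guess")
print(result["final_outputs"]["answer"], result["logprob"])
```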
479
 
480
 
481
  def run_examples():
482
  """
483
+ Runs example workflows demonstrating key functionality and error handling.
484
+
485
+ This function creates and executes three different example workflows to showcase:
486
+
487
+ 1. Successful workflow execution:
488
+ - A linear two-step workflow with proper dependency flow
489
+ - Input transformation using the 'upper' function
490
+ - Output transformation using the 'lower' function
491
+ - Proper variable passing between steps
492
+
493
+ 2. Cyclic dependency detection:
494
+ - A workflow with two steps that depend on each other circularly
495
+ - Demonstrates the error handling for cyclic dependencies
496
+ - Shows how the system prevents infinite execution loops
497
+
498
+ 3. Unknown variable detection:
499
+ - A workflow that references a variable not provided as input or by any step
500
+ - Demonstrates validation of variable references
501
+ - Shows error handling for missing dependencies
502
+
503
+ Each example prints its result or the error encountered, making this function
504
+ useful for testing and demonstration purposes.
505
+
506
+ Returns:
507
+ None: This function prints its results and doesn't return a value.
508
  """
509
  print("Example 1: Successful Workflow Execution")
510
  # Example 1: Simple linear workflow.
src/workflows/factory.py CHANGED
@@ -1,5 +1,14 @@
1
  # %%
2
- from workflows.structs import Field, InputField, ModelStep, OutputField, Workflow
 
 
 
 
 
 
 
 
 
3
 
4
  INITIAL_SYS_PROMPT = """You are a helpful performant question answering bot.
5
  Given a question clue, output your most likely guess in a couple words with a calibrated confidence for the guess.
@@ -71,8 +80,8 @@ def create_first_llm_step() -> ModelStep:
71
  )
72
 
73
 
74
- def create_quizbowl_simple_workflow():
75
- return Workflow(
76
  inputs=["question_text"],
77
  outputs={"answer": "A.answer", "confidence": "A.confidence"},
78
  steps={
@@ -99,6 +108,11 @@ def create_quizbowl_simple_workflow():
99
  ],
100
  )
101
  },
 
 
 
 
 
102
  )
103
 
104
 
@@ -114,7 +128,7 @@ CONFIDENCE: <0-1>
114
  EXPLANATION: <your reasoning>"""
115
 
116
 
117
- def create_quizbowl_bonus_simple_workflow() -> Workflow:
118
  """Create a simple model step for bonus questions."""
119
  return Workflow(
120
  inputs=["leadin", "part"],
@@ -126,7 +140,7 @@ def create_quizbowl_bonus_simple_workflow() -> Workflow:
126
  model="gpt-4o-mini",
127
  provider="OpenAI",
128
  temperature=0.3,
129
- call_type="llm",
130
  system_prompt=BONUS_SYS_PROMPT,
131
  input_fields=[
132
  InputField(
 
1
  # %%
2
+ from .structs import (
3
+ Buzzer,
4
+ BuzzerMethod,
5
+ CallType,
6
+ InputField,
7
+ ModelStep,
8
+ OutputField,
9
+ TossupWorkflow,
10
+ Workflow,
11
+ )
12
 
13
  INITIAL_SYS_PROMPT = """You are a helpful performant question answering bot.
14
  Given a question clue, output your most likely guess in a couple words with a calibrated confidence for the guess.
 
80
  )
81
 
82
 
83
+ def create_simple_qb_tossup_workflow():
84
+ return TossupWorkflow(
85
  inputs=["question_text"],
86
  outputs={"answer": "A.answer", "confidence": "A.confidence"},
87
  steps={
 
108
  ],
109
  )
110
  },
111
+ buzzer=Buzzer(
112
+ confidence_threshold=0.75,
113
+ prob_threshold=None,
114
+ method=BuzzerMethod.AND,
115
+ ),
116
  )
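For reference, the renamed factory helper can be inspected without making any model call; the sketch below only looks at the Buzzer the factory attaches to the returned TossupWorkflow (assuming the package imports as `workflows`, as in app.py).

```python
# Sketch: inspect the buzzer configuration produced by the factory (no model call involved).
from workflows import factory

workflow = factory.create_simple_qb_tossup_workflow()
print(type(workflow).__name__)               # TossupWorkflow
print(workflow.buzzer.confidence_threshold)  # 0.75
print(workflow.buzzer.run(confidence=0.8))   # True: prob_threshold is None, so only confidence is checked
```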
117
 
118
 
 
128
  EXPLANATION: <your reasoning>"""
129
 
130
 
131
+ def create_simple_qb_bonus_workflow() -> Workflow:
132
  """Create a simple model step for bonus questions."""
133
  return Workflow(
134
  inputs=["leadin", "part"],
 
140
  model="gpt-4o-mini",
141
  provider="OpenAI",
142
  temperature=0.3,
143
+ call_type=CallType.LLM,
144
  system_prompt=BONUS_SYS_PROMPT,
145
  input_fields=[
146
  InputField(
src/{llms.py → workflows/llms.py} RENAMED
@@ -13,7 +13,7 @@ from openai import OpenAI
13
  from pydantic import BaseModel, Field
14
  from rich import print as rprint
15
 
16
- from app_configs import AVAILABLE_MODELS
17
 
18
 
19
  def _openai_is_json_mode_supported(model_name: str) -> bool:
 
13
  from pydantic import BaseModel, Field
14
  from rich import print as rprint
15
 
16
+ from .configs import AVAILABLE_MODELS
17
 
18
 
19
  def _openai_is_json_mode_supported(model_name: str) -> bool:
src/workflows/qb_agents.py CHANGED
@@ -1,20 +1,36 @@
1
  import time
2
- from typing import Any, Iterable
3
 
4
- from workflows.executors import execute_workflow
5
- from workflows.structs import Workflow
6
 
7
 
8
- def _get_workflow_response(
9
- workflow: Workflow, available_vars: dict[str, Any]
10
- ) -> tuple[dict[str, Any], dict[str, Any], dict[str, Any], float]:
11
  """Get response from executing a complete workflow."""
12
  start_time = time.time()
13
- final_outputs, computed_values, step_contents = execute_workflow(
14
- workflow, available_vars, return_full_content=True
15
- )
16
  response_time = time.time() - start_time
17
- return final_outputs, computed_values, step_contents, response_time
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
 
19
 
20
  class QuizBowlTossupAgent:
@@ -23,7 +39,7 @@ class QuizBowlTossupAgent:
23
  external_input_variable = "question_text"
24
  output_variables = ["answer", "confidence"]
25
 
26
- def __init__(self, workflow: Workflow, buzz_threshold: float):
27
  """Initialize the multi-step tossup agent.
28
 
29
  Args:
@@ -31,7 +47,6 @@ class QuizBowlTossupAgent:
31
  buzz_threshold: Confidence threshold for buzzing
32
  """
33
  self.workflow = workflow
34
- self.buzz_threshold = buzz_threshold
35
  self.output_variables = list(workflow.outputs.keys())
36
 
37
  # Validate input variables
@@ -43,7 +58,7 @@ class QuizBowlTossupAgent:
43
  if out_var not in workflow.outputs:
44
  raise ValueError(f"Output variable {out_var} not found in workflow outputs")
45
 
46
- def run(self, question_runs: list[str], early_stop: bool = True) -> Iterable[dict]:
47
  """Process a tossup question and decide when to buzz based on confidence.
48
 
49
  Args:
@@ -63,26 +78,26 @@ class QuizBowlTossupAgent:
63
  """
64
  for i, question_text in enumerate(question_runs):
65
  # Execute the complete workflow
66
- final_outputs, computed_values, step_contents, response_time = _get_workflow_response(
67
  self.workflow, {self.external_input_variable: question_text}
68
  )
69
- print(f"Workflow response: {final_outputs}")
70
- buzz = final_outputs["confidence"] >= self.buzz_threshold
71
- result = {
 
72
  "answer": final_outputs["answer"],
73
  "confidence": final_outputs["confidence"],
74
  "buzz": buzz,
75
  "question_fragment": question_text,
76
- "position": i + 1,
77
- "step_contents": step_contents,
78
  "response_time": response_time,
79
- "step_outputs": computed_values, # Include intermediate step outputs
80
  }
81
 
82
  yield result
83
 
84
  # If we've reached the confidence threshold, buzz and stop
85
- if early_stop and buzz:
86
  return
87
 
88
 
@@ -111,7 +126,7 @@ class QuizBowlBonusAgent:
111
  if out_var not in workflow.outputs:
112
  raise ValueError(f"Output variable {out_var} not found in workflow outputs")
113
 
114
- def run(self, leadin: str, part: str) -> dict:
115
  """Process a bonus part with the given leadin.
116
 
117
  Args:
@@ -127,21 +142,21 @@ class QuizBowlBonusAgent:
127
  - response_time: Time taken for response
128
  - step_outputs: Outputs from each step
129
  """
130
- final_outputs, computed_values, step_contents, response_time = _get_workflow_response(
131
  self.workflow,
132
  {
133
  "leadin": leadin,
134
  "part": part,
135
  },
136
  )
137
-
138
  return {
139
  "answer": final_outputs["answer"],
140
  "confidence": final_outputs["confidence"],
141
  "explanation": final_outputs["explanation"],
142
- "step_contents": step_contents,
143
  "response_time": response_time,
144
- "step_outputs": computed_values, # Include intermediate step outputs
145
  }
146
 
147
 
 
1
  import time
2
+ from typing import Any, Iterable, TypedDict
3
 
4
+ from .executors import WorkflowOutput, execute_workflow
5
+ from .structs import TossupWorkflow, Workflow
6
 
7
 
8
+ def _get_workflow_response(workflow: Workflow, available_vars: dict[str, Any]) -> tuple[WorkflowOutput, float]:
 
 
9
  """Get response from executing a complete workflow."""
10
  start_time = time.time()
11
+ workflow_output = execute_workflow(workflow, available_vars, return_full_content=True)
 
 
12
  response_time = time.time() - start_time
13
+ return workflow_output, response_time
14
+
15
+
16
+ class TossupResult(TypedDict):
17
+ answer: str
18
+ confidence: float
19
+ buzz: bool
20
+ question_fragment: str
21
+ position: int
22
+ step_contents: dict[str, Any]
23
+ response_time: float
24
+ step_outputs: dict[str, Any]
25
+
26
+
27
+ class BonusResult(TypedDict):
28
+ answer: str
29
+ confidence: float
30
+ explanation: str
31
+ response_time: float
32
+ step_contents: dict[str, Any]
33
+ step_outputs: dict[str, Any]
34
 
35
 
36
  class QuizBowlTossupAgent:
 
39
  external_input_variable = "question_text"
40
  output_variables = ["answer", "confidence"]
41
 
42
+ def __init__(self, workflow: TossupWorkflow):
43
  """Initialize the multi-step tossup agent.
44
 
45
  Args:
 
47
  buzz_threshold: Confidence threshold for buzzing
48
  """
49
  self.workflow = workflow
 
50
  self.output_variables = list(workflow.outputs.keys())
51
 
52
  # Validate input variables
 
58
  if out_var not in workflow.outputs:
59
  raise ValueError(f"Output variable {out_var} not found in workflow outputs")
60
 
61
+ def run(self, question_runs: list[str], early_stop: bool = True) -> Iterable[TossupResult]:
62
  """Process a tossup question and decide when to buzz based on confidence.
63
 
64
  Args:
 
78
  """
79
  for i, question_text in enumerate(question_runs):
80
  # Execute the complete workflow
81
+ workflow_output, response_time = _get_workflow_response(
82
  self.workflow, {self.external_input_variable: question_text}
83
  )
84
+ final_outputs = workflow_output["final_outputs"]
85
+ buzz = self.workflow.buzzer.run(final_outputs["confidence"], logprob=workflow_output.get("logprob"))
86
+ result: TossupResult = {
87
+ "position": i + 1,
88
  "answer": final_outputs["answer"],
89
  "confidence": final_outputs["confidence"],
90
  "buzz": buzz,
91
  "question_fragment": question_text,
92
+ "step_contents": workflow_output["step_contents"],
93
+ "step_outputs": workflow_output["intermediate_outputs"], # Include intermediate step outputs
94
  "response_time": response_time,
 
95
  }
96
 
97
  yield result
98
 
99
  # If we've reached the confidence threshold, buzz and stop
100
+ if early_stop and result["buzz"]:
101
  return
102
 
103
 
 
126
  if out_var not in workflow.outputs:
127
  raise ValueError(f"Output variable {out_var} not found in workflow outputs")
128
 
129
+ def run(self, leadin: str, part: str) -> BonusResult:
130
  """Process a bonus part with the given leadin.
131
 
132
  Args:
 
142
  - response_time: Time taken for response
143
  - step_outputs: Outputs from each step
144
  """
145
+ workflow_output, response_time = _get_workflow_response(
146
  self.workflow,
147
  {
148
  "leadin": leadin,
149
  "part": part,
150
  },
151
  )
152
+ final_outputs = workflow_output["final_outputs"]
153
  return {
154
  "answer": final_outputs["answer"],
155
  "confidence": final_outputs["confidence"],
156
  "explanation": final_outputs["explanation"],
157
+ "step_contents": workflow_output["step_contents"],
158
  "response_time": response_time,
159
+ "step_outputs": workflow_output["intermediate_outputs"], # Include intermediate step outputs
160
  }
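With this change the agent defers buzzing entirely to the workflow's Buzzer. A hedged usage sketch follows, assuming the module resolves as `workflows.qb_agents` and that provider credentials are available, since each run executes the underlying model step; the question text is a made-up example.

```python
# Sketch: drive the tossup agent with the factory workflow and let its Buzzer decide when to stop.
from workflows import factory
from workflows.qb_agents import QuizBowlTossupAgent

agent = QuizBowlTossupAgent(factory.create_simple_qb_tossup_workflow())
question_runs = [
    "This Spanish author wrote about a knight who tilted at windmills.",
    "This Spanish author wrote about a knight who tilted at windmills. "
    "For 10 points, name this author of Don Quixote.",
]
for result in agent.run(question_runs, early_stop=True):
    print(result["position"], result["answer"], result["confidence"], result["buzz"])
```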
161
 
162
 
src/workflows/structs.py CHANGED
@@ -1,6 +1,8 @@
1
  # %%
 
2
  from typing import Any, Literal, Optional
3
 
 
4
  from pydantic import BaseModel, Field, model_validator
5
 
6
  """
@@ -20,6 +22,7 @@ All classes use Pydantic's BaseModel for validation and serialization support.
20
  """
21
  FieldType = Literal["input", "output"]
22
 
 
23
  SUPPORTED_TYPES = Literal["str", "int", "float", "bool", "list[str]", "list[int]", "list[float]", "list[bool]"]
24
  """Supported field types for input and output fields"""
25
 
@@ -68,6 +71,12 @@ class OutputField(BaseModel):
68
  func: str | None = None
69
 
70
 
 
 
 
 
 
 
71
  class ModelStep(BaseModel):
72
  """
73
  Represents a single step in a workflow.
@@ -89,7 +98,7 @@ class ModelStep(BaseModel):
89
  name: str
90
  model: str
91
  provider: str
92
- call_type: str = "llm" # llm, search, etc # TODO: make this enum or provide explicit options using Literal
93
 
94
  # TODO: Validate that this is not None for call_type = llm
95
  temperature: Optional[float] = None
@@ -231,4 +240,42 @@ class Workflow(BaseModel):
231
  return list(variables)
232
 
233
 
234
- # %%
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  # %%
2
+ from enum import Enum
3
  from typing import Any, Literal, Optional
4
 
5
+ import numpy as np
6
  from pydantic import BaseModel, Field, model_validator
7
 
8
  """
 
22
  """
23
  FieldType = Literal["input", "output"]
24
 
25
+
26
  SUPPORTED_TYPES = Literal["str", "int", "float", "bool", "list[str]", "list[int]", "list[float]", "list[bool]"]
27
  """Supported field types for input and output fields"""
28
 
 
71
  func: str | None = None
72
 
73
 
74
+ class CallType(str, Enum):
75
+ LLM = "llm"
76
+ SEARCH = "search"
77
+ PYTHON_FUNC = "python_func"
78
+
79
+
80
  class ModelStep(BaseModel):
81
  """
82
  Represents a single step in a workflow.
 
98
  name: str
99
  model: str
100
  provider: str
101
+ call_type: CallType = CallType.LLM
102
 
103
  # TODO: Validate that this is not None for call_type = llm
104
  temperature: Optional[float] = None
 
240
  return list(variables)
241
 
242
 
243
+ class BuzzerMethod(str, Enum):
244
+ AND = "AND"
245
+ OR = "OR"
246
+
247
+
248
+ class Buzzer(BaseModel):
249
+ """Configuration for when to buzz in a tossup question."""
250
+
251
+ method: BuzzerMethod = BuzzerMethod.AND # Logic to combine thresholds
252
+ confidence_threshold: float = Field(default=0.8, ge=0.0, le=1.0) # Minimum confidence to trigger a buzz
253
+ prob_threshold: float | None = None # Optional log probability threshold
254
+
255
+ def run(self, confidence: float, prob: float | None = None, logprob: float | None = None) -> bool:
256
+ """Run the buzzer logic."""
257
+ if logprob is not None and prob is not None:
258
+ raise ValueError("Cannot provide both logprob and prob")
259
+ if logprob is not None:
260
+ prob = np.exp(logprob)
261
+ if self.prob_threshold is None:
262
+ return confidence >= self.confidence_threshold
263
+ if self.method == BuzzerMethod.AND:
264
+ return confidence >= self.confidence_threshold and prob >= self.prob_threshold
265
+ elif self.method == BuzzerMethod.OR:
266
+ return confidence >= self.confidence_threshold or prob >= self.prob_threshold
267
+ else:
268
+ raise ValueError(f"Invalid buzzer method: {self.method}")
269
+
270
+ @model_validator(mode="after")
271
+ def validate_method_with_log_prob(cls, data):
272
+ """Validate that if prob_threshold is None, method must be 'and'."""
273
+ if data.prob_threshold is None and data.method != BuzzerMethod.AND:
274
+ raise ValueError("If prob_threshold is None, method must be 'and'")
275
+ return data
276
+
277
+
278
+ class TossupWorkflow(Workflow):
279
+ """Workflow specialized for tossup questions with buzzing capability."""
280
+
281
+ buzzer: Buzzer
src/workflows/utils.py CHANGED
@@ -1,8 +1,8 @@
1
  from collections import deque
2
- from typing import Any
3
 
4
- from workflows.errors import CyclicDependencyError, UnknownVariableError, WorkflowError
5
- from workflows.structs import ModelStep, Workflow
6
 
7
  """
8
  Utilities for workflow dependency management and execution order determination.
@@ -98,6 +98,40 @@ def create_dependency_graph(workflow: Workflow, input_values: dict[str, Any]) ->
98
  return dependencies
99
 
100
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
  def topological_sort(dependencies: dict[str, set[str]]) -> list[str]:
102
  """
103
  Performs a topological sort on a dependency graph and detects cycles using Kahn's algorithm.
 
1
  from collections import deque
2
+ from typing import Any, Iterable
3
 
4
+ from .errors import CyclicDependencyError, UnknownVariableError, WorkflowError
5
+ from .structs import Workflow
6
 
7
  """
8
  Utilities for workflow dependency management and execution order determination.
 
98
  return dependencies
99
 
100
 
101
+ def detect_cycles(dep_graph: dict[str, Iterable[str]]) -> str | None:
102
+ """Detects cycles in the dependency graph.
103
+ Args:
104
+ dep_graph: A dictionary mapping each node ID to the set of node IDs it depends on
105
+ Returns:
106
+ The ID of the first step found to be part of a cycle, or None if no cycles are found
107
+ """
108
+ # Check for cycles in step dependencies
109
+ visited = set()
110
+ path = set()
111
+
112
+ def has_cycle(node: str) -> bool:
113
+ if node in path:
114
+ return True
115
+ if node in visited:
116
+ return False
117
+
118
+ visited.add(node)
119
+ path.add(node)
120
+
121
+ for neighbor in dep_graph.get(node, set()):
122
+ if has_cycle(neighbor):
123
+ return True
124
+
125
+ path.remove(node)
126
+ return False
127
+
128
+ # Check each step for cycles
129
+ for node_id in dep_graph:
130
+ if has_cycle(node_id):
131
+ return node_id
132
+ return None
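A quick sanity check of the new helper on a toy graph (assuming the module imports as `workflows.utils`):

```python
# Sketch: detect_cycles returns a node that participates in a cycle, or None.
from workflows.utils import detect_cycles

acyclic = {"A": {"B"}, "B": set()}
cyclic = {"A": {"B"}, "B": {"A"}}
print(detect_cycles(acyclic))  # None
print(detect_cycles(cyclic))   # "A" (the first node found on a cycle)
```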
133
+
134
+
135
  def topological_sort(dependencies: dict[str, set[str]]) -> list[str]:
136
  """
137
  Performs a topological sort on a dependency graph and detects cycles using Kahn's algorithm.
src/workflows/validators.py CHANGED
@@ -2,9 +2,10 @@ import keyword
2
  import re
3
  from dataclasses import dataclass
4
  from enum import Enum
5
- from typing import Any, Dict, List, Optional, Set, Tuple, Union
6
 
7
- from .structs import InputField, ModelStep, OutputField, Workflow
 
8
 
9
  SUPPORTED_TYPES = {"str", "int", "float", "bool", "list[str]", "list[int]", "list[float]", "list[bool]"}
10
 
@@ -13,7 +14,7 @@ MAX_FIELD_NAME_LENGTH = 50
13
  MAX_DESCRIPTION_LENGTH = 200
14
  MAX_SYSTEM_PROMPT_LENGTH = 4000
15
  MIN_TEMPERATURE = 0.0
16
- MAX_TEMPERATURE = 1.0
17
 
18
 
19
  class ValidationErrorType(Enum):
@@ -42,16 +43,42 @@ class ValidationError:
42
  class WorkflowValidationError(Exception):
43
  """Base class for workflow validation errors"""
44
 
45
- def __init__(self, errors: List[ValidationError]):
46
  self.errors = errors
47
  super().__init__(f"Workflow validation failed with {len(errors)} errors")
48
 
49
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
  class WorkflowValidator:
51
  """Validates workflows for correctness and consistency"""
52
 
53
  def __init__(self):
54
- self.errors: List[ValidationError] = []
55
  self.workflow: Optional[Workflow] = None
56
 
57
  def validate(self, workflow: Workflow) -> bool:
@@ -106,7 +133,7 @@ class WorkflowValidator:
106
  return False
107
 
108
  # Verify the output field exists in the step
109
- _, field_name = self._parse_variable_reference(output_var)
110
  if not any(field.name == field_name for field in step.output_fields):
111
  self.errors.append(
112
  ValidationError(
@@ -153,7 +180,7 @@ class WorkflowValidator:
153
  return False
154
 
155
  # Verify the output field exists in the referenced step
156
- step_id, field_name = self._parse_variable_reference(output_var)
157
  if step_id not in workflow.steps:
158
  self.errors.append(
159
  ValidationError(ValidationErrorType.VARIABLE, f"Referenced step '{step_id}' not found")
@@ -172,47 +199,22 @@ class WorkflowValidator:
172
  )
173
  return False
174
 
175
- # Build dependency graph
176
- dep_graph: Dict[str, Set[str]] = {}
177
- for step_id, step in workflow.steps.items():
178
- dep_graph[step_id] = self._get_step_dependencies(step)
179
-
180
- # Check for cycles in step dependencies
181
- visited = set()
182
- path = set()
183
-
184
- def has_cycle(node: str) -> bool:
185
- if node in path:
186
- return True
187
- if node in visited:
188
- return False
189
-
190
- visited.add(node)
191
- path.add(node)
192
-
193
- for neighbor in dep_graph.get(node, set()):
194
- if has_cycle(neighbor):
195
- return True
196
-
197
- path.remove(node)
198
- return False
199
-
200
- # Check each step for cycles
201
- for step_id in workflow.steps:
202
- if has_cycle(step_id):
203
- self.errors.append(
204
- ValidationError(ValidationErrorType.DAG, f"Circular dependency detected involving step: {step_id}")
205
  )
206
- return False
 
207
 
208
  # Check for orphaned steps (steps that aren't used by any other step)
209
  used_steps = set()
210
  for deps in dep_graph.values():
211
  used_steps.update(deps)
212
- print("Used steps: ", used_steps)
213
  for step_id in workflow.steps:
214
  if step_id not in used_steps and not any(
215
- output_var and self._parse_variable_reference(output_var)[0] == step_id
216
  for output_var in workflow.outputs.values()
217
  ):
218
  self.errors.append(ValidationError(ValidationErrorType.DAG, f"Orphaned step detected: {step_id}"))
@@ -277,7 +279,7 @@ class WorkflowValidator:
277
  return False
278
 
279
  # Validate temperature for LLM call type
280
- if step.call_type == "llm":
281
  if step.temperature is None:
282
  self.errors.append(
283
  ValidationError(ValidationErrorType.STEP, "LLM step must specify temperature", step.id)
@@ -295,7 +297,7 @@ class WorkflowValidator:
295
  return False
296
 
297
  # Validate system prompt for LLM call type
298
- if step.call_type == "llm":
299
  if not step.system_prompt:
300
  self.errors.append(
301
  ValidationError(ValidationErrorType.STEP, "LLM step must specify system prompt", step.id)
@@ -477,50 +479,32 @@ class WorkflowValidator:
477
  def _validate_variable_dependencies(self, workflow: Workflow) -> bool:
478
  """Validates variable dependencies between steps"""
479
  # Build variable dependency graph
480
- var_graph: Dict[str, Set[str]] = {}
481
-
482
- for step_id, step in workflow.steps.items():
483
- for field in step.input_fields:
484
- if field.variable not in var_graph:
485
- var_graph[field.variable] = set()
486
-
487
- # Add dependency from input variable to step's outputs
488
- for output in step.output_fields:
489
- var_graph[field.variable].add(f"{step_id}.{output.name}")
 
 
490
 
491
  # Check for cycles in variable dependencies
492
- visited = set()
493
- path = set()
494
-
495
- def has_cycle(node: str) -> bool:
496
- if node in path:
497
- return True
498
- if node in visited:
499
- return False
500
-
501
- visited.add(node)
502
- path.add(node)
503
-
504
- for neighbor in var_graph.get(node, set()):
505
- if has_cycle(neighbor):
506
- return True
507
-
508
- path.remove(node)
509
  return False
510
 
511
- # Check each variable for cycles
512
- for var in var_graph:
513
- if has_cycle(var):
514
- self.errors.append(
515
- ValidationError(ValidationErrorType.VARIABLE, f"Circular variable dependency detected: {var}")
516
- )
517
- return False
518
-
519
  # Validate external input existence
520
  external_inputs = set(workflow.inputs)
521
  for step in workflow.steps.values():
522
  for field in step.input_fields:
523
- step_id, field_name = self._parse_variable_reference(field.variable)
524
  if not step_id and field_name not in external_inputs:
525
  self.errors.append(
526
  ValidationError(
@@ -533,22 +517,6 @@ class WorkflowValidator:
533
 
534
  return True
535
 
536
- def _get_step_dependencies(self, step: ModelStep) -> Set[str]:
537
- """Gets set of step IDs that this step depends on"""
538
- deps = set()
539
- for field in step.input_fields:
540
- step_id = self._parse_variable_reference(field.variable)[0]
541
- if step_id:
542
- deps.add(step_id)
543
- return deps
544
-
545
- def _parse_variable_reference(self, var: str) -> Tuple[Optional[str], str]:
546
- """Extracts step_id and field_name from variable reference"""
547
- parts = var.split(".")
548
- if len(parts) == 1:
549
- return None, parts[0]
550
- return parts[0], parts[1]
551
-
552
  def _is_valid_variable_reference(self, var: str) -> bool:
553
  """Validates if a variable reference is properly formatted"""
554
  if not self.workflow:
 
2
  import re
3
  from dataclasses import dataclass
4
  from enum import Enum
5
+ from typing import Optional
6
 
7
+ from .structs import CallType, InputField, ModelStep, OutputField, Workflow
8
+ from .utils import detect_cycles
9
 
10
  SUPPORTED_TYPES = {"str", "int", "float", "bool", "list[str]", "list[int]", "list[float]", "list[bool]"}
11
 
 
14
  MAX_DESCRIPTION_LENGTH = 200
15
  MAX_SYSTEM_PROMPT_LENGTH = 4000
16
  MIN_TEMPERATURE = 0.0
17
+ MAX_TEMPERATURE = 10.0
18
 
19
 
20
  class ValidationErrorType(Enum):
 
43
  class WorkflowValidationError(Exception):
44
  """Base class for workflow validation errors"""
45
 
46
+ def __init__(self, errors: list[ValidationError]):
47
  self.errors = errors
48
  super().__init__(f"Workflow validation failed with {len(errors)} errors")
49
 
50
 
51
+ def _parse_variable_reference(var: str) -> tuple[Optional[str], str]:
52
+ """Extracts step_id and field_name from variable reference"""
53
+ parts = var.split(".")
54
+ if len(parts) == 1:
55
+ return None, parts[0]
56
+ return parts[0], parts[1]
57
+
58
+
59
+ def _get_step_dependencies(step: ModelStep) -> set[str]:
60
+ """Gets set of step IDs that this step depends on"""
61
+ deps = set()
62
+ for field in step.input_fields:
63
+ step_id, _ = _parse_variable_reference(field.variable)
64
+ if step_id:
65
+ deps.add(step_id)
66
+ return deps
67
+
68
+
69
+ def create_step_dep_graph(workflow: Workflow) -> dict[str, set[str]]:
70
+ """Creates a dependency graph of steps"""
71
+ dep_graph: dict[str, set[str]] = {}
72
+ for step_id, step in workflow.steps.items():
73
+ dep_graph[step_id] = _get_step_dependencies(step)
74
+ return dep_graph
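Because these helpers are now plain module functions, the step-level dependency graph the validator feeds to detect_cycles can be built and inspected directly. A hedged sketch, reusing the ModelStep fields shown elsewhere in this commit (model and prompt strings are placeholders):

```python
# Sketch: build the step dependency graph that the validator hands to detect_cycles.
from workflows.structs import CallType, InputField, ModelStep, OutputField, Workflow
from workflows.validators import create_step_dep_graph

extract = ModelStep(
    id="extract", name="Extract", model="gpt-4o-mini", provider="OpenAI",
    call_type=CallType.LLM, temperature=0.3, system_prompt="Extract entities.",
    input_fields=[InputField(name="text", description="Raw text", variable="input_text")],
    output_fields=[OutputField(name="entities", description="Entities", type="list[str]")],
)
analyze = ModelStep(
    id="analyze", name="Analyze", model="gpt-4o-mini", provider="OpenAI",
    call_type=CallType.LLM, temperature=0.3, system_prompt="Analyze sentiment.",
    input_fields=[InputField(name="entities", description="Entities", variable="extract.entities")],
    output_fields=[OutputField(name="sentiment", description="Sentiment", type="str")],
)
workflow = Workflow(
    steps={"extract": extract, "analyze": analyze},
    inputs=["input_text"],
    outputs={"sentiment": "analyze.sentiment"},
)

print(create_step_dep_graph(workflow))  # {'extract': set(), 'analyze': {'extract'}}
```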
75
+
76
+
77
  class WorkflowValidator:
78
  """Validates workflows for correctness and consistency"""
79
 
80
  def __init__(self):
81
+ self.errors: list[ValidationError] = []
82
  self.workflow: Optional[Workflow] = None
83
 
84
  def validate(self, workflow: Workflow) -> bool:
 
133
  return False
134
 
135
  # Verify the output field exists in the step
136
+ _, field_name = _parse_variable_reference(output_var)
137
  if not any(field.name == field_name for field in step.output_fields):
138
  self.errors.append(
139
  ValidationError(
 
180
  return False
181
 
182
  # Verify the output field exists in the referenced step
183
+ step_id, field_name = _parse_variable_reference(output_var)
184
  if step_id not in workflow.steps:
185
  self.errors.append(
186
  ValidationError(ValidationErrorType.VARIABLE, f"Referenced step '{step_id}' not found")
 
199
  )
200
  return False
201
 
202
+ dep_graph = create_step_dep_graph(workflow)
203
+ if cycle_step_id := detect_cycles(dep_graph):
204
+ self.errors.append(
205
+ ValidationError(
206
+ ValidationErrorType.DAG, f"Circular dependency detected involving step: {cycle_step_id}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
207
  )
208
+ )
209
+ return False
210
 
211
  # Check for orphaned steps (steps that aren't used by any other step)
212
  used_steps = set()
213
  for deps in dep_graph.values():
214
  used_steps.update(deps)
 
215
  for step_id in workflow.steps:
216
  if step_id not in used_steps and not any(
217
+ output_var and _parse_variable_reference(output_var)[0] == step_id
218
  for output_var in workflow.outputs.values()
219
  ):
220
  self.errors.append(ValidationError(ValidationErrorType.DAG, f"Orphaned step detected: {step_id}"))
 
279
  return False
280
 
281
  # Validate temperature for LLM call type
282
+ if step.call_type == CallType.LLM:
283
  if step.temperature is None:
284
  self.errors.append(
285
  ValidationError(ValidationErrorType.STEP, "LLM step must specify temperature", step.id)
 
297
  return False
298
 
299
  # Validate system prompt for LLM call type
300
+ if step.call_type == CallType.LLM:
301
  if not step.system_prompt:
302
  self.errors.append(
303
  ValidationError(ValidationErrorType.STEP, "LLM step must specify system prompt", step.id)
 
479
  def _validate_variable_dependencies(self, workflow: Workflow) -> bool:
480
  """Validates variable dependencies between steps"""
481
  # Build variable dependency graph
482
+ var_graph: dict[str, set[str]] = {}
483
+
484
+ def create_var_dep_graph(workflow: Workflow) -> dict[str, set[str]]:
485
+ var_graph: dict[str, set[str]] = {}
486
+ for step_id, step in workflow.steps.items():
487
+ for field in step.input_fields:
488
+ if field.variable not in var_graph:
489
+ var_graph[field.variable] = set()
490
+ # Add dependency from input variable to step's outputs
491
+ for output in step.output_fields:
492
+ var_graph[field.variable].add(f"{step_id}.{output.name}")
493
+ return var_graph
494
 
495
  # Check for cycles in variable dependencies
496
+ var_graph = create_var_dep_graph(workflow)
497
+ if cycle_var := detect_cycles(var_graph):
498
+ self.errors.append(
499
+ ValidationError(ValidationErrorType.VARIABLE, f"Circular variable dependency detected: {cycle_var}")
500
+ )
 
 
 
 
 
 
 
 
 
 
 
 
501
  return False
502
 
 
 
 
 
 
 
 
 
503
  # Validate external input existence
504
  external_inputs = set(workflow.inputs)
505
  for step in workflow.steps.values():
506
  for field in step.input_fields:
507
+ step_id, field_name = _parse_variable_reference(field.variable)
508
  if not step_id and field_name not in external_inputs:
509
  self.errors.append(
510
  ValidationError(
 
517
 
518
  return True
519
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
520
  def _is_valid_variable_reference(self, var: str) -> bool:
521
  """Validates if a variable reference is properly formatted"""
522
  if not self.workflow:
tests/test_executors.py CHANGED
@@ -8,37 +8,33 @@ from workflows.executors import (
8
  create_processed_inputs,
9
  execute_model_step,
10
  execute_workflow,
11
- lower,
12
- upper,
13
  )
14
- from workflows.structs import InputField, ModelStep, OutputField, Workflow
15
 
16
  # Tests for utility functions
 
 
17
 
18
 
19
- def test_upper():
20
- """Test the upper function with different input types."""
21
- assert upper("hello") == "HELLO"
22
- assert upper("Hello World") == "HELLO WORLD"
23
- assert upper("") == ""
24
- # Non-string inputs should be returned unchanged
25
- assert upper(123) == 123
26
- assert upper([1, 2, 3]) == [1, 2, 3]
27
- assert upper(None) is None
28
 
29
 
30
- def test_lower():
31
- """Test the lower function with different input types."""
32
- assert lower("HELLO") == "hello"
33
- assert lower("Hello World") == "hello world"
34
- assert lower("") == ""
35
- # Non-string inputs should be returned unchanged
36
- assert lower(123) == 123
37
- assert lower([1, 2, 3]) == [1, 2, 3]
38
- assert lower(None) is None
39
 
40
 
41
- # Tests for create_processed_inputs
 
 
 
 
42
 
43
 
44
  def test_create_processed_inputs_basic():
@@ -47,8 +43,8 @@ def test_create_processed_inputs_basic():
47
  id="test_step",
48
  name="Test Step",
49
  model="gpt-4",
50
- provider="openai",
51
- call_type="llm",
52
  system_prompt="Test prompt",
53
  input_fields=[InputField(name="text", description="Input text", variable="input_text")],
54
  output_fields=[],
@@ -65,8 +61,8 @@ def test_create_processed_inputs_with_transformation():
65
  id="test_step",
66
  name="Test Step",
67
  model="gpt-4",
68
- provider="openai",
69
- call_type="llm",
70
  system_prompt="Test prompt",
71
  input_fields=[
72
  InputField(name="upper_text", description="Uppercase text", variable="input_text", func="upper"),
@@ -86,8 +82,8 @@ def test_create_processed_inputs_missing_var():
86
  id="test_step",
87
  name="Test Step",
88
  model="gpt-4",
89
- provider="openai",
90
- call_type="llm",
91
  system_prompt="Test prompt",
92
  input_fields=[InputField(name="text", description="Input text", variable="missing_var")],
93
  output_fields=[],
@@ -104,8 +100,8 @@ def test_create_processed_inputs_unknown_func():
104
  id="test_step",
105
  name="Test Step",
106
  model="gpt-4",
107
- provider="openai",
108
- call_type="llm",
109
  system_prompt="Test prompt",
110
  input_fields=[InputField(name="text", description="Input text", variable="input_text", func="unknown_func")],
111
  output_fields=[],
@@ -136,7 +132,7 @@ def test_execute_model_step_success(mock_completion):
136
  name="Summarize Text",
137
  model="gpt-3.5-turbo",
138
  provider="OpenAI",
139
- call_type="llm",
140
  system_prompt="Summarize the text",
141
  input_fields=[InputField(name="text", description="Text to summarize", variable="input_text")],
142
  output_fields=[OutputField(name="summary", description="Summary of the text", type="str")],
@@ -146,7 +142,13 @@ def test_execute_model_step_success(mock_completion):
146
  result = execute_model_step(step, {"input_text": "Long text to be summarized..."})
147
 
148
  # Verify the results
149
- assert result == {"summary": "This is a summary"}
 
 
 
 
 
 
150
 
151
  # Verify the litellm call was made correctly
152
  mock_completion.assert_called_once()
@@ -155,6 +157,77 @@ def test_execute_model_step_success(mock_completion):
155
  assert "Summarize the text" in kwargs["system"]
156
 
157
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
158
  @patch("workflows.executors.completion")
159
  def test_execute_model_step_error(mock_completion):
160
  """Test handling of errors in model step execution."""
@@ -166,8 +239,8 @@ def test_execute_model_step_error(mock_completion):
166
  id="summarize",
167
  name="Summarize Text",
168
  model="gpt-3.5-turbo",
169
- provider="openai",
170
- call_type="llm",
171
  system_prompt="Summarize the text",
172
  input_fields=[InputField(name="text", description="Text to summarize", variable="input_text")],
173
  output_fields=[OutputField(name="summary", description="Summary of the text", type="str")],
@@ -185,15 +258,16 @@ def test_execute_model_step_error(mock_completion):
185
  def test_execute_workflow_simple(mock_execute_step):
186
  """Test execution of a simple workflow with a single step."""
187
  # Configure mock to return expected outputs
188
- mock_execute_step.return_value = {"summary": "This is a summary"}
 
189
 
190
  # Create a simple workflow
191
  step = ModelStep(
192
  id="summarize",
193
  name="Summarize Text",
194
  model="gpt-3.5-turbo",
195
- provider="openai",
196
- call_type="llm",
197
  system_prompt="Summarize the text",
198
  input_fields=[InputField(name="text", description="Text to summarize", variable="input_text")],
199
  output_fields=[OutputField(name="summary", description="Summary of the text", type="str")],
@@ -202,15 +276,21 @@ def test_execute_workflow_simple(mock_execute_step):
202
  workflow = Workflow(steps={"summarize": step}, inputs=["input_text"], outputs={"summary": "summarize.summary"})
203
 
204
  # Execute the workflow
205
- final_outputs, computed_values, step_contents = execute_workflow(
206
- workflow, {"input_text": "Long text to be summarized..."}
207
- )
208
 
209
  # Verify the results
210
- assert final_outputs == {"summary": "This is a summary"}
211
- assert computed_values == {"input_text": "Long text to be summarized...", "summarize.summary": "This is a summary"}
212
- assert step_contents == {}
213
-
 
 
 
 
 
 
 
 
214
  # Verify execute_model_step was called correctly
215
  mock_execute_step.assert_called_once()
216
 
@@ -220,12 +300,12 @@ def test_execute_workflow_multi_step(mock_execute_step):
220
  """Test execution of a multi-step workflow with dependencies."""
221
 
222
  # Configure mock to return different values based on the step
223
- def side_effect(step, available_vars, return_full_content=False):
224
  if step.id == "extract":
225
- return {"entities": ["Apple", "product"]}
226
  elif step.id == "analyze":
227
- return {"sentiment": "positive"}
228
- return {}
229
 
230
  mock_execute_step.side_effect = side_effect
231
 
@@ -234,8 +314,8 @@ def test_execute_workflow_multi_step(mock_execute_step):
234
  id="extract",
235
  name="Extract Entities",
236
  model="gpt-3.5-turbo",
237
- provider="openai",
238
- call_type="llm",
239
  system_prompt="Extract entities",
240
  input_fields=[InputField(name="text", description="Text to analyze", variable="input_text")],
241
  output_fields=[OutputField(name="entities", description="Extracted entities", type="list[str]")],
@@ -246,8 +326,8 @@ def test_execute_workflow_multi_step(mock_execute_step):
246
  id="analyze",
247
  name="Analyze Sentiment",
248
  model="gpt-4",
249
- provider="openai",
250
- call_type="llm",
251
  system_prompt="Analyze sentiment",
252
  input_fields=[InputField(name="entities", description="Entities to analyze", variable="extract.entities")],
253
  output_fields=[OutputField(name="sentiment", description="Sentiment analysis", type="str")],
@@ -260,19 +340,22 @@ def test_execute_workflow_multi_step(mock_execute_step):
260
  )
261
 
262
  # Execute the workflow
263
- final_outputs, computed_values, step_contents = execute_workflow(
264
- workflow, {"input_text": "Apple is launching a new product tomorrow."}
 
 
 
 
 
 
 
 
 
 
 
 
265
  )
266
 
267
- # Verify the results
268
- assert final_outputs == {"entities": ["Apple", "product"], "sentiment": "positive"}
269
- assert computed_values == {
270
- "input_text": "Apple is launching a new product tomorrow.",
271
- "extract.entities": ["Apple", "product"],
272
- "analyze.sentiment": "positive",
273
- }
274
- assert step_contents == {}
275
-
276
  # Verify execute_model_step was called twice (once for each step)
277
  assert mock_execute_step.call_count == 2
278
 
@@ -283,8 +366,8 @@ def test_execute_workflow_missing_input():
283
  id="summarize",
284
  name="Summarize Text",
285
  model="gpt-3.5-turbo",
286
- provider="openai",
287
- call_type="llm",
288
  system_prompt="Summarize the text",
289
  input_fields=[InputField(name="text", description="Text to summarize", variable="input_text")],
290
  output_fields=[OutputField(name="summary", description="Summary of the text", type="str")],
@@ -297,24 +380,32 @@ def test_execute_workflow_missing_input():
297
  execute_workflow(workflow, {})
298
 
299
 
300
- @patch("workflows.executors.create_dependency_graph")
301
- def test_execute_workflow_cyclic_dependency(mock_dependency_graph):
302
  """Test that a cyclic dependency in the workflow raises an appropriate error."""
303
  # Make create_dependency_graph raise a CyclicDependencyError
304
- mock_dependency_graph.side_effect = CyclicDependencyError()
305
 
306
- step = ModelStep(
307
- id="test",
308
- name="Test Step",
309
  model="gpt-3.5-turbo",
310
- provider="openai",
311
- call_type="llm",
312
  system_prompt="Test",
313
- input_fields=[],
314
- output_fields=[],
 
 
 
 
 
 
 
 
 
 
315
  )
316
 
317
- workflow = Workflow(steps=[step], inputs=[], outputs={})
318
 
319
  # This should propagate the CyclicDependencyError
320
  with pytest.raises(CyclicDependencyError):
@@ -325,15 +416,20 @@ def test_execute_workflow_cyclic_dependency(mock_dependency_graph):
325
  def test_execute_workflow_with_full_content(mock_execute_step):
326
  """Test execution of a workflow with return_full_content=True."""
327
  # Configure mock to return expected outputs and content
328
- mock_execute_step.return_value = ({"summary": "This is a summary"}, "Full model response content")
 
 
 
 
 
329
 
330
  # Create a simple workflow
331
  step = ModelStep(
332
  id="summarize",
333
  name="Summarize Text",
334
  model="gpt-3.5-turbo",
335
- provider="openai",
336
- call_type="llm",
337
  system_prompt="Summarize the text",
338
  input_fields=[InputField(name="text", description="Text to summarize", variable="input_text")],
339
  output_fields=[OutputField(name="summary", description="Summary of the text", type="str")],
@@ -342,14 +438,64 @@ def test_execute_workflow_with_full_content(mock_execute_step):
342
  workflow = Workflow(steps=[step], inputs=["input_text"], outputs={"summary": "summarize.summary"})
343
 
344
  # Execute the workflow with return_full_content=True
345
- final_outputs, computed_values, step_contents = execute_workflow(
346
- workflow, {"input_text": "Long text to be summarized..."}, return_full_content=True
 
 
 
 
 
 
 
 
 
 
 
 
347
  )
348
 
349
- # Verify the results
350
- assert final_outputs == {"summary": "This is a summary"}
351
- assert computed_values == {"input_text": "Long text to be summarized...", "summarize.summary": "This is a summary"}
352
- assert step_contents == {"summarize": "Full model response content"}
353
-
354
  # Verify execute_model_step was called correctly with return_full_content=True
355
- mock_execute_step.assert_called_once_with(step, computed_values, return_full_content=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  create_processed_inputs,
9
  execute_model_step,
10
  execute_workflow,
 
 
11
  )
12
+ from workflows.structs import CallType, InputField, ModelStep, OutputField, Workflow
13
 
14
  # Tests for utility functions
15
+ lower = str.lower
16
+ upper = str.upper
17
 
18
 
19
+ # Tests for create_processed_inputs
 
 
 
 
 
 
 
 
20
 
21
 
22
+ def assert_model_step_result(result: dict, expected_result: dict):
23
+ # Verify the results
24
+ assert isinstance(result, dict)
25
+ assert "outputs" in result
26
+ assert "content" in result
27
+ assert "logprob" in result
28
+ assert result["outputs"] == expected_result["outputs"]
29
+ assert result["content"] == expected_result["content"]
30
+ assert result["logprob"] == expected_result["logprob"]
31
 
32
 
33
+ def assert_workflow_output(output: dict, expected_output: dict):
34
+ assert isinstance(output, dict)
35
+ for key in ["final_outputs", "intermediate_outputs", "step_contents", "logprob"]:
36
+ assert key in output
37
+ assert output[key] == expected_output[key]
38
 
39
 
40
  def test_create_processed_inputs_basic():
 
43
  id="test_step",
44
  name="Test Step",
45
  model="gpt-4",
46
+ provider="OpenAI",
47
+ call_type=CallType.LLM,
48
  system_prompt="Test prompt",
49
  input_fields=[InputField(name="text", description="Input text", variable="input_text")],
50
  output_fields=[],
 
61
  id="test_step",
62
  name="Test Step",
63
  model="gpt-4",
64
+ provider="OpenAI",
65
+ call_type=CallType.LLM,
66
  system_prompt="Test prompt",
67
  input_fields=[
68
  InputField(name="upper_text", description="Uppercase text", variable="input_text", func="upper"),
 
82
  id="test_step",
83
  name="Test Step",
84
  model="gpt-4",
85
+ provider="OpenAI",
86
+ call_type=CallType.LLM,
87
  system_prompt="Test prompt",
88
  input_fields=[InputField(name="text", description="Input text", variable="missing_var")],
89
  output_fields=[],
 
100
  id="test_step",
101
  name="Test Step",
102
  model="gpt-4",
103
+ provider="OpenAI",
104
+ call_type=CallType.LLM,
105
  system_prompt="Test prompt",
106
  input_fields=[InputField(name="text", description="Input text", variable="input_text", func="unknown_func")],
107
  output_fields=[],
 
132
  name="Summarize Text",
133
  model="gpt-3.5-turbo",
134
  provider="OpenAI",
135
+ call_type=CallType.LLM,
136
  system_prompt="Summarize the text",
137
  input_fields=[InputField(name="text", description="Text to summarize", variable="input_text")],
138
  output_fields=[OutputField(name="summary", description="Summary of the text", type="str")],
 
142
  result = execute_model_step(step, {"input_text": "Long text to be summarized..."})
143
 
144
  # Verify the results
145
+ assert isinstance(result, dict)
146
+ assert "outputs" in result
147
+ assert "content" in result
148
+ assert "logprob" in result
149
+ assert result["outputs"] == {"summary": "This is a summary"}
150
+ assert result["content"] is None
151
+ assert result["logprob"] is None
152
 
153
  # Verify the litellm call was made correctly
154
  mock_completion.assert_called_once()
 
157
  assert "Summarize the text" in kwargs["system"]
158
 
159
 
160
+ @patch("workflows.executors.completion")
161
+ def test_execute_model_step_with_full_content(mock_completion):
162
+ """Test execution of a model step with full content returned."""
163
+ # Mock the litellm response
164
+ mock_response = {
165
+ "content": "Full model response content",
166
+ "output": {"summary": "This is a summary"},
167
+ }
168
+ mock_completion.return_value = mock_response
169
+
170
+ # Create a test step
171
+ step = ModelStep(
172
+ id="summarize",
173
+ name="Summarize Text",
174
+ model="gpt-3.5-turbo",
175
+ provider="OpenAI",
176
+ call_type=CallType.LLM,
177
+ system_prompt="Summarize the text",
178
+ input_fields=[InputField(name="text", description="Text to summarize", variable="input_text")],
179
+ output_fields=[OutputField(name="summary", description="Summary of the text", type="str")],
180
+ )
181
+
182
+ # Execute the step with return_full_content=True
183
+ result = execute_model_step(step, {"input_text": "Long text to be summarized..."}, return_full_content=True)
184
+
185
+ # Verify the results
186
+ assert isinstance(result, dict)
187
+ assert "outputs" in result
188
+ assert "content" in result
189
+ assert "logprob" in result
190
+ assert result["outputs"] == {"summary": "This is a summary"}
191
+ assert result["content"] == "Full model response content"
192
+ assert result["logprob"] is None
193
+
194
+
195
+ @patch("workflows.executors.completion")
196
+ def test_execute_model_step_with_logprobs(mock_completion):
197
+ """Test execution of a model step with log probabilities."""
198
+ # Mock the litellm response with log probability
199
+ mock_response = {
200
+ "content": json.dumps({"summary": "This is a summary"}),
201
+ "output": {"summary": "This is a summary"},
202
+ "log_prob": -2.5,
203
+ }
204
+ mock_completion.return_value = mock_response
205
+
206
+ # Create a test step
207
+ step = ModelStep(
208
+ id="summarize",
209
+ name="Summarize Text",
210
+ model="gpt-3.5-turbo",
211
+ provider="OpenAI",
212
+ call_type=CallType.LLM,
213
+ system_prompt="Summarize the text",
214
+ input_fields=[InputField(name="text", description="Text to summarize", variable="input_text")],
215
+ output_fields=[OutputField(name="summary", description="Summary of the text", type="str")],
216
+ )
217
+
218
+ # Execute the step with logprobs=True
219
+ result = execute_model_step(step, {"input_text": "Long text to be summarized..."}, logprobs=True)
220
+
221
+ # Verify the results
222
+ assert isinstance(result, dict)
223
+ assert "outputs" in result
224
+ assert "content" in result
225
+ assert "logprob" in result
226
+ assert result["outputs"] == {"summary": "This is a summary"}
227
+ assert result["content"] is None
228
+ assert result["logprob"] == -2.5
229
+
230
+
231
  @patch("workflows.executors.completion")
232
  def test_execute_model_step_error(mock_completion):
233
  """Test handling of errors in model step execution."""
 
239
  id="summarize",
240
  name="Summarize Text",
241
  model="gpt-3.5-turbo",
242
+ provider="OpenAI",
243
+ call_type=CallType.LLM,
244
  system_prompt="Summarize the text",
245
  input_fields=[InputField(name="text", description="Text to summarize", variable="input_text")],
246
  output_fields=[OutputField(name="summary", description="Summary of the text", type="str")],
 
258
  def test_execute_workflow_simple(mock_execute_step):
259
  """Test execution of a simple workflow with a single step."""
260
  # Configure mock to return expected outputs
261
+ mock_result = {"outputs": {"summary": "This is a summary"}, "content": None, "logprob": None}
262
+ mock_execute_step.return_value = mock_result
263
 
264
  # Create a simple workflow
265
  step = ModelStep(
266
  id="summarize",
267
  name="Summarize Text",
268
  model="gpt-3.5-turbo",
269
+ provider="OpenAI",
270
+ call_type=CallType.LLM,
271
  system_prompt="Summarize the text",
272
  input_fields=[InputField(name="text", description="Text to summarize", variable="input_text")],
273
  output_fields=[OutputField(name="summary", description="Summary of the text", type="str")],
 
276
  workflow = Workflow(steps={"summarize": step}, inputs=["input_text"], outputs={"summary": "summarize.summary"})
277
 
278
  # Execute the workflow
279
+ result = execute_workflow(workflow, {"input_text": "Long text to be summarized..."})
 
 
280
 
281
  # Verify the results
282
+ assert_workflow_output(
283
+ result,
284
+ {
285
+ "final_outputs": {"summary": "This is a summary"},
286
+ "intermediate_outputs": {
287
+ "input_text": "Long text to be summarized...",
288
+ "summarize.summary": "This is a summary",
289
+ },
290
+ "step_contents": {},
291
+ "logprob": None,
292
+ },
293
+ )
294
  # Verify execute_model_step was called correctly
295
  mock_execute_step.assert_called_once()
296
 
 
300
  """Test execution of a multi-step workflow with dependencies."""
301
 
302
  # Configure mock to return different values based on the step
303
+ def side_effect(step, available_vars, return_full_content=False, logprobs=False):
304
  if step.id == "extract":
305
+ return {"outputs": {"entities": ["Apple", "product"]}, "content": None, "logprob": None}
306
  elif step.id == "analyze":
307
+ return {"outputs": {"sentiment": "positive"}, "content": None, "logprob": None}
308
+ return {"outputs": {}, "content": None, "logprob": None}
309
 
310
  mock_execute_step.side_effect = side_effect
311
 
 
314
  id="extract",
315
  name="Extract Entities",
316
  model="gpt-3.5-turbo",
317
+ provider="OpenAI",
318
+ call_type=CallType.LLM,
319
  system_prompt="Extract entities",
320
  input_fields=[InputField(name="text", description="Text to analyze", variable="input_text")],
321
  output_fields=[OutputField(name="entities", description="Extracted entities", type="list[str]")],
 
326
  id="analyze",
327
  name="Analyze Sentiment",
328
  model="gpt-4",
329
+ provider="OpenAI",
330
+ call_type=CallType.LLM,
331
  system_prompt="Analyze sentiment",
332
  input_fields=[InputField(name="entities", description="Entities to analyze", variable="extract.entities")],
333
  output_fields=[OutputField(name="sentiment", description="Sentiment analysis", type="str")],
 
340
  )
341
 
342
  # Execute the workflow
343
+ result = execute_workflow(workflow, {"input_text": "Apple is launching a new product tomorrow."})
344
+
345
+ assert_workflow_output(
346
+ result,
347
+ {
348
+ "final_outputs": {"entities": ["Apple", "product"], "sentiment": "positive"},
349
+ "intermediate_outputs": {
350
+ "input_text": "Apple is launching a new product tomorrow.",
351
+ "extract.entities": ["Apple", "product"],
352
+ "analyze.sentiment": "positive",
353
+ },
354
+ "step_contents": {},
355
+ "logprob": None,
356
+ },
357
  )
358
 
 
 
 
 
 
 
 
 
 
359
  # Verify execute_model_step was called twice (once for each step)
360
  assert mock_execute_step.call_count == 2
361
 
 
366
  id="summarize",
367
  name="Summarize Text",
368
  model="gpt-3.5-turbo",
369
+ provider="OpenAI",
370
+ call_type=CallType.LLM,
371
  system_prompt="Summarize the text",
372
  input_fields=[InputField(name="text", description="Text to summarize", variable="input_text")],
373
  output_fields=[OutputField(name="summary", description="Summary of the text", type="str")],
 
380
  execute_workflow(workflow, {})
381
 
382
 
383
+ def test_execute_workflow_cyclic_dependency():
 
384
  """Test that a cyclic dependency in the workflow raises an appropriate error."""
385
  # Make create_dependency_graph raise a CyclicDependencyError
 
386
 
387
+ step1 = ModelStep(
388
+ id="t1",
389
+ name="Test Step 1",
390
  model="gpt-3.5-turbo",
391
+ provider="OpenAI",
392
+ call_type=CallType.LLM,
393
  system_prompt="Test",
394
+ input_fields=[InputField(name="v1", description="", variable="t2.var")],
395
+ output_fields=[OutputField(name="out", description="")],
396
+ )
397
+ step2 = ModelStep(
398
+ id="t2",
399
+ name="Test Step 2",
400
+ model="gpt-3.5-turbo",
401
+ provider="OpenAI",
402
+ call_type=CallType.LLM,
403
+ system_prompt="Test",
404
+ input_fields=[InputField(name="v2", description="", variable="t1.out")],
405
+ output_fields=[OutputField(name="var", description="")],
406
  )
407
 
408
+ workflow = Workflow(steps=[step1, step2], inputs=[], outputs={})
409
 
410
  # This should propagate the CyclicDependencyError
411
  with pytest.raises(CyclicDependencyError):
 
416
  def test_execute_workflow_with_full_content(mock_execute_step):
417
  """Test execution of a workflow with return_full_content=True."""
418
  # Configure mock to return expected outputs and content
419
+ mock_result = {
420
+ "outputs": {"summary": "This is a summary"},
421
+ "content": "Full model response content",
422
+ "logprob": None,
423
+ }
424
+ mock_execute_step.return_value = mock_result
425
 
426
  # Create a simple workflow
427
  step = ModelStep(
428
  id="summarize",
429
  name="Summarize Text",
430
  model="gpt-3.5-turbo",
431
+ provider="OpenAI",
432
+ call_type=CallType.LLM,
433
  system_prompt="Summarize the text",
434
  input_fields=[InputField(name="text", description="Text to summarize", variable="input_text")],
435
  output_fields=[OutputField(name="summary", description="Summary of the text", type="str")],
 
438
  workflow = Workflow(steps=[step], inputs=["input_text"], outputs={"summary": "summarize.summary"})
439
 
440
  # Execute the workflow with return_full_content=True
441
+ inputs = {"input_text": "Long text to be summarized..."}
442
+ result = execute_workflow(workflow, inputs, return_full_content=True)
443
+
444
+ assert_workflow_output(
445
+ result,
446
+ {
447
+ "final_outputs": {"summary": "This is a summary"},
448
+ "intermediate_outputs": {
449
+ "input_text": "Long text to be summarized...",
450
+ "summarize.summary": "This is a summary",
451
+ },
452
+ "step_contents": {"summarize": "Full model response content"},
453
+ "logprob": None,
454
+ },
455
  )
456
 
 
 
 
 
 
457
  # Verify execute_model_step was called correctly with return_full_content=True
458
+ mock_execute_step.assert_called_once_with(step, inputs, return_full_content=True, logprobs=False)
459
+
460
+
461
+ @patch("workflows.executors.execute_model_step")
462
+ def test_execute_workflow_with_logprob(mock_execute_step):
463
+ """Test execution of a workflow with logprob_step specified."""
464
+ # Configure mock to return expected outputs with logprob
465
+ mock_result = {"outputs": {"summary": "This is a summary"}, "content": None, "logprob": -2.5}
466
+ mock_execute_step.return_value = mock_result
467
+
468
+ # Create a simple workflow
469
+ step = ModelStep(
470
+ id="summarize",
471
+ name="Summarize Text",
472
+ model="gpt-3.5-turbo",
473
+ provider="OpenAI",
474
+ call_type=CallType.LLM,
475
+ system_prompt="Summarize the text",
476
+ input_fields=[InputField(name="text", description="Text to summarize", variable="input_text")],
477
+ output_fields=[OutputField(name="summary", description="Summary of the text", type="str")],
478
+ )
479
+
480
+ workflow = Workflow(steps={"summarize": step}, inputs=["input_text"], outputs={"summary": "summarize.summary"})
481
+
482
+ # Execute the workflow with logprob_step specified
483
+ result = execute_workflow(workflow, {"input_text": "Long text to be summarized..."}, logprob_step="summarize")
484
+
485
+ # Verify the results
486
+ assert_workflow_output(
487
+ result,
488
+ {
489
+ "final_outputs": {"summary": "This is a summary"},
490
+ "logprob": -2.5,
491
+ "intermediate_outputs": {
492
+ "input_text": "Long text to be summarized...",
493
+ "summarize.summary": "This is a summary",
494
+ },
495
+ "step_contents": {},
496
+ },
497
+ )
498
+ # Verify execute_model_step was called with logprobs=True
499
+ mock_execute_step.assert_called_once()
500
+ args, kwargs = mock_execute_step.call_args
501
+ assert kwargs["logprobs"] is True
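Read together, the executor tests pin down a contract for `execute_workflow`: it forwards `return_full_content` to every step, enables `logprobs` only for the step named by `logprob_step`, and repackages each step result into `final_outputs`, `intermediate_outputs` (keyed as `step_id.field`), `step_contents`, and a top-level `logprob`. A minimal sketch consistent with those assertions, assuming steps are stored in execution order; this is an illustration, not the repository's actual implementation:

# Hypothetical executor loop matching what the mocks above assert.
from workflows.executors import execute_model_step
from workflows.structs import Workflow


def execute_workflow_sketch(
    workflow: Workflow,
    inputs: dict,
    return_full_content: bool = False,
    logprob_step: str | None = None,
) -> dict:
    intermediate = dict(inputs)  # external inputs stay visible to later steps
    step_contents: dict = {}
    logprob = None

    for step_id, step in workflow.steps.items():  # assumes steps behaves like an ordered mapping
        result = execute_model_step(
            step,
            dict(intermediate),  # snapshot of everything produced so far
            return_full_content=return_full_content,
            logprobs=(step_id == logprob_step),
        )
        for name, value in result["outputs"].items():
            intermediate[f"{step_id}.{name}"] = value
        if return_full_content:
            step_contents[step_id] = result["content"]
        if step_id == logprob_step:
            logprob = result["logprob"]

    final_outputs = {name: intermediate[ref] for name, ref in workflow.outputs.items()}
    return {
        "final_outputs": final_outputs,
        "intermediate_outputs": intermediate,
        "step_contents": step_contents,
        "logprob": logprob,
    }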
tests/test_validators.py CHANGED
@@ -1,21 +1,21 @@
1
- from typing import Dict, List
2
 
3
  import pytest
4
  from pydantic import ValidationError as PydanticValidationError
5
 
6
- from workflows.structs import InputField, ModelStep, OutputField, Workflow
7
- from workflows.validators import ValidationError, ValidationErrorType, WorkflowValidator
8
 
9
 
10
  # Test Data
11
- def create_basic_step(step_id: str = "step1") -> ModelStep:
12
  """Creates a basic valid step for testing"""
13
  return ModelStep(
14
  id=step_id,
15
  name="Test Step",
16
  model="gpt-4",
17
  provider="openai",
18
- call_type="llm",
19
  temperature=0.7,
20
  system_prompt="Test prompt",
21
  input_fields=[],
@@ -23,16 +23,32 @@ def create_basic_step(step_id: str = "step1") -> ModelStep:
23
  )
24
 
25
 
26
- def create_basic_workflow(steps: List[ModelStep] | None = None) -> Workflow:
27
  """Creates a basic valid workflow for testing"""
28
  if steps is None:
29
- steps = [create_basic_step()]
30
  return Workflow(inputs=[], outputs={}, steps={step.id: step for step in steps})
31
 
32
 
33
  # Additional Test Data
34
  def create_step_with_fields(
35
- step_id: str, input_fields: List[InputField], output_fields: List[OutputField]
36
  ) -> ModelStep:
37
  """Creates a step with specific input and output fields"""
38
  return ModelStep(
@@ -40,7 +56,7 @@ def create_step_with_fields(
40
  name="Test Step",
41
  model="gpt-4",
42
  provider="openai",
43
- call_type="llm",
44
  temperature=0.7,
45
  system_prompt="Test prompt",
46
  input_fields=input_fields,
@@ -117,15 +133,15 @@ class TestStepValidation:
117
  name="", # Missing name
118
  model="", # Missing model
119
  provider="", # Missing provider
120
- call_type="", # Missing call_type
121
  temperature=0.7,
122
  system_prompt="Test prompt",
123
  input_fields=[],
124
  output_fields=[],
125
  )
126
  workflow = create_basic_workflow([step])
127
- workflow.inputs = ["input"]
128
- workflow.outputs = {"output": "step1.field"}
129
  assert not validator.validate(workflow)
130
  assert len(validator.errors) == 1
131
  assert validator.errors[0].error_type == ValidationErrorType.STEP
@@ -135,32 +151,34 @@ class TestStepValidation:
135
  validator = WorkflowValidator()
136
  step = create_basic_step("123invalid") # Invalid ID format
137
  workflow = create_basic_workflow([step])
138
- workflow.inputs = ["input"]
139
- workflow.outputs = {"output": "step1.field"}
140
  assert not validator.validate(workflow)
141
  assert len(validator.errors) == 1
142
  assert validator.errors[0].error_type == ValidationErrorType.NAMING
143
 
144
- def test_llm_temperature_validation(self):
145
  """Test validation of LLM step temperature"""
146
  validator = WorkflowValidator()
147
 
148
  # Test invalid temperature
149
  step = create_basic_step()
150
- step.temperature = 1.5 # Invalid temperature
151
  workflow = create_basic_workflow([step])
152
- workflow.inputs = ["input"]
153
- workflow.outputs = {"output": "step1.field"}
154
  assert not validator.validate(workflow)
155
  assert len(validator.errors) == 1
156
  assert validator.errors[0].error_type == ValidationErrorType.RANGE
157
 
 
158
  # Test missing temperature
 
159
  step = create_basic_step()
160
  step.temperature = None # Missing temperature
161
  workflow = create_basic_workflow([step])
162
- workflow.inputs = ["input"]
163
- workflow.outputs = {"output": "step1.field"}
164
  assert not validator.validate(workflow)
165
  assert len(validator.errors) == 1
166
  assert validator.errors[0].error_type == ValidationErrorType.STEP
@@ -173,8 +191,8 @@ class TestStepValidation:
173
  step = create_basic_step()
174
  step.system_prompt = "" # Missing system prompt
175
  workflow = create_basic_workflow([step])
176
- workflow.inputs = ["input"]
177
- workflow.outputs = {"output": "step1.field"}
178
  assert not validator.validate(workflow)
179
  assert len(validator.errors) == 1
180
  assert validator.errors[0].error_type == ValidationErrorType.STEP
@@ -183,8 +201,8 @@ class TestStepValidation:
183
  step = create_basic_step()
184
  step.system_prompt = "x" * 4001 # Too long
185
  workflow = create_basic_workflow([step])
186
- workflow.inputs = ["input"]
187
- workflow.outputs = {"output": "step1.field"}
188
  assert not validator.validate(workflow)
189
  assert len(validator.errors) == 1
190
  assert validator.errors[0].error_type == ValidationErrorType.LENGTH
@@ -477,39 +495,6 @@ class TestTypeCompatibility:
477
  workflow.outputs = {"output": "step2.output"}
478
  assert validator.validate(workflow)
479
 
480
- # def test_list_type_compatibility(self):
481
- # """Test validation of list type compatibility"""
482
- # validator = WorkflowValidator()
483
-
484
- # # Test compatible list types
485
- # step1 = create_step_with_fields(
486
- # "step1", [], [OutputField(name="output", description="test", type="list[str]")]
487
- # )
488
- # step2 = create_step_with_fields(
489
- # "step2", [InputField(name="input", description="test", variable="step1.output")], []
490
- # )
491
-
492
- # workflow = create_basic_workflow([step1, step2])
493
- # workflow.inputs = ["input"]
494
- # workflow.outputs = {"output": "step2.output"}
495
- # assert validator.validate(workflow)
496
- # assert len(validator.errors) == 0
497
-
498
- # # Test incompatible list types
499
- # step1 = create_step_with_fields(
500
- # "step1", [], [OutputField(name="output", description="test", type="list[int]")]
501
- # )
502
- # step2 = create_step_with_fields(
503
- # "step2", [InputField(name="input", description="test", variable="step1.output")], []
504
- # )
505
-
506
- # workflow = create_basic_workflow([step1, step2])
507
- # workflow.inputs = ["input"]
508
- # workflow.outputs = {"output": "step2.output"}
509
- # assert not validator.validate(workflow)
510
- # assert len(validator.errors) == 1
511
- # assert validator.errors[0].error_type == ValidationErrorType.TYPE
512
-
513
 
514
  # Complex Workflow Tests
515
  class TestComplexWorkflows:
@@ -569,6 +554,92 @@ class TestComplexWorkflows:
569
  assert len(validator.errors) == 0
570
 
571
 
572
  # External Input Tests
573
  class TestExternalInputs:
574
  def test_external_input_existence(self):
@@ -645,3 +716,48 @@ class TestEdgeCases:
645
  assert not validator.validate(workflow)
646
  assert len(validator.errors) == 1
647
  assert validator.errors[0].error_type == ValidationErrorType.STEP
 
1
+ from typing import Any
2
 
3
  import pytest
4
  from pydantic import ValidationError as PydanticValidationError
5
 
6
+ from workflows.structs import CallType, InputField, ModelStep, OutputField, Workflow
7
+ from workflows.validators import ValidationError, ValidationErrorType, WorkflowValidator, _parse_variable_reference
8
 
9
 
10
  # Test Data
11
+ def create_empty_step(step_id: str = "step1") -> ModelStep:
12
  """Creates a basic valid step for testing"""
13
  return ModelStep(
14
  id=step_id,
15
  name="Test Step",
16
  model="gpt-4",
17
  provider="openai",
18
+ call_type=CallType.LLM,
19
  temperature=0.7,
20
  system_prompt="Test prompt",
21
  input_fields=[],
 
23
  )
24
 
25
 
26
+ # Test Data
27
+ def create_basic_step(step_id: str = "step1") -> ModelStep:
28
+ """Creates a basic valid step for testing"""
29
+ return ModelStep(
30
+ id=step_id,
31
+ name="Test Step",
32
+ model="gpt-4",
33
+ provider="openai",
34
+ call_type=CallType.LLM,
35
+ temperature=0.7,
36
+ system_prompt="Test prompt",
37
+ input_fields=[InputField(name="input", description="test", variable="external_input")],
38
+ output_fields=[OutputField(name="output", description="test", type="str")],
39
+ )
40
+
41
+
42
+ def create_basic_workflow(steps: list[ModelStep] | None = None) -> Workflow:
43
  """Creates a basic valid workflow for testing"""
44
  if steps is None:
45
+ steps = [create_empty_step()]
46
  return Workflow(inputs=[], outputs={}, steps={step.id: step for step in steps})
47
 
48
 
49
  # Additional Test Data
50
  def create_step_with_fields(
51
+ step_id: str, input_fields: list[InputField], output_fields: list[OutputField]
52
  ) -> ModelStep:
53
  """Creates a step with specific input and output fields"""
54
  return ModelStep(
 
56
  name="Test Step",
57
  model="gpt-4",
58
  provider="openai",
59
+ call_type=CallType.LLM,
60
  temperature=0.7,
61
  system_prompt="Test prompt",
62
  input_fields=input_fields,
 
133
  name="", # Missing name
134
  model="", # Missing model
135
  provider="", # Missing provider
136
+ call_type=CallType.LLM,  # call_type is now an enum and cannot be left empty
137
  temperature=0.7,
138
  system_prompt="Test prompt",
139
  input_fields=[],
140
  output_fields=[],
141
  )
142
  workflow = create_basic_workflow([step])
143
+ workflow.inputs = ["external_input"]
144
+ workflow.outputs = {"output": "step1.output"}
145
  assert not validator.validate(workflow)
146
  assert len(validator.errors) == 1
147
  assert validator.errors[0].error_type == ValidationErrorType.STEP
 
151
  validator = WorkflowValidator()
152
  step = create_basic_step("123invalid") # Invalid ID format
153
  workflow = create_basic_workflow([step])
154
+ workflow.inputs = ["external_input"]
155
+ workflow.outputs = {"output": "step1.output"}
156
  assert not validator.validate(workflow)
157
  assert len(validator.errors) == 1
158
  assert validator.errors[0].error_type == ValidationErrorType.NAMING
159
 
160
+ def test_llm_temperature_validation_invalid(self):
161
  """Test validation of LLM step temperature"""
162
  validator = WorkflowValidator()
163
 
164
  # Test invalid temperature
165
  step = create_basic_step()
166
+ step.temperature = -0.5 # Invalid temperature
167
  workflow = create_basic_workflow([step])
168
+ workflow.inputs = ["external_input"]
169
+ workflow.outputs = {"output": "step1.output"}
170
  assert not validator.validate(workflow)
171
  assert len(validator.errors) == 1
172
  assert validator.errors[0].error_type == ValidationErrorType.RANGE
173
 
174
+ def test_llm_temperature_validation_missing(self):
175
  # Test missing temperature
176
+ validator = WorkflowValidator()
177
  step = create_basic_step()
178
  step.temperature = None # Missing temperature
179
  workflow = create_basic_workflow([step])
180
+ workflow.inputs = ["external_input"]
181
+ workflow.outputs = {"output": "step1.output"}
182
  assert not validator.validate(workflow)
183
  assert len(validator.errors) == 1
184
  assert validator.errors[0].error_type == ValidationErrorType.STEP
 
191
  step = create_basic_step()
192
  step.system_prompt = "" # Missing system prompt
193
  workflow = create_basic_workflow([step])
194
+ workflow.inputs = ["external_input"]
195
+ workflow.outputs = {"output": "step1.output"}
196
  assert not validator.validate(workflow)
197
  assert len(validator.errors) == 1
198
  assert validator.errors[0].error_type == ValidationErrorType.STEP
 
201
  step = create_basic_step()
202
  step.system_prompt = "x" * 4001 # Too long
203
  workflow = create_basic_workflow([step])
204
+ workflow.inputs = ["external_input"]
205
+ workflow.outputs = {"output": "step1.output"}
206
  assert not validator.validate(workflow)
207
  assert len(validator.errors) == 1
208
  assert validator.errors[0].error_type == ValidationErrorType.LENGTH
 
495
  workflow.outputs = {"output": "step2.output"}
496
  assert validator.validate(workflow)
497
 
498
 
499
  # Complex Workflow Tests
500
  class TestComplexWorkflows:
 
554
  assert len(validator.errors) == 0
555
 
556
 
557
+ # Log Probability Validation Tests
558
+ class TestLogProbabilityValidation:
559
+ def test_logprob_step_validation(self):
560
+ """Test validation of log probability step references"""
561
+ validator = WorkflowValidator()
562
+
563
+ # Create a workflow with multiple steps
564
+ step1 = create_step_with_fields(
565
+ "step1",
566
+ [InputField(name="input", description="test", variable="external_input")],
567
+ [OutputField(name="output", description="test", type="str")],
568
+ )
569
+ step2 = create_step_with_fields(
570
+ "step2",
571
+ [InputField(name="input", description="test", variable="step1.output")],
572
+ [OutputField(name="output", description="test", type="str")],
573
+ )
574
+
575
+ workflow = create_basic_workflow([step1, step2])
576
+ workflow.inputs = ["external_input"]
577
+ workflow.outputs = {"output": "step2.output"}
578
+
579
+ # Validate the workflow first
580
+ assert validator.validate(workflow)
581
+ validator.errors = [] # Clear any previous errors
582
+
583
+ # Test that a valid step ID is accepted
584
+ valid_logprob_step = "step1"
585
+ assert valid_logprob_step in workflow.steps
586
+ # A validator for logprob_step would check if the step exists in workflow.steps
587
+
588
+ # Test that an invalid step ID is caught
589
+ invalid_logprob_step = "nonexistent_step"
590
+ assert invalid_logprob_step not in workflow.steps
591
+ # A validator for logprob_step would report an error for a non-existent step
592
+
593
+
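The comments in this test describe the check a logprob-aware validator would perform without implementing it. A hedged sketch of what that check could look like; the subclass, the method name, and the ValidationError constructor signature are all assumptions:

# Hypothetical extension of WorkflowValidator; not part of the current API.
from workflows.structs import Workflow
from workflows.validators import ValidationError, ValidationErrorType, WorkflowValidator


class LogprobAwareValidator(WorkflowValidator):
    def validate_logprob_step(self, workflow: Workflow, logprob_step: str | None) -> bool:
        """Record a STEP error when logprob_step names a step missing from the workflow."""
        if logprob_step is None or logprob_step in workflow.steps:
            return True
        self.errors.append(
            ValidationError(  # constructor arguments assumed: (error_type, message)
                ValidationErrorType.STEP,
                f"logprob_step '{logprob_step}' does not exist in the workflow",
            )
        )
        return False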
594
+ # Output Structure Tests
595
+ class TestOutputStructure:
596
+ def test_workflow_output_structure(self):
597
+ """Test the expected structure of workflow outputs"""
598
+ # Sample output dictionary matching WorkflowOutput structure
599
+ output: dict[str, Any] = {
600
+ "final_outputs": {},
601
+ "intermediate_outputs": {},
602
+ "step_contents": {},
603
+ "logprob": None,
604
+ }
605
+
606
+ # Verify that all expected keys are present
607
+ assert "final_outputs" in output
608
+ assert "intermediate_outputs" in output
609
+ assert "step_contents" in output
610
+ assert "logprob" in output
611
+
612
+ # Test with populated values
613
+ output = {
614
+ "final_outputs": {"output": "result"},
615
+ "intermediate_outputs": {"step1.output": "result", "input": "value"},
616
+ "step_contents": {"step1": "Full content"},
617
+ "logprob": -2.5,
618
+ }
619
+
620
+ assert output["final_outputs"] == {"output": "result"}
621
+ assert output["intermediate_outputs"]["step1.output"] == "result"
622
+ assert output["step_contents"]["step1"] == "Full content"
623
+ assert output["logprob"] == -2.5
624
+
625
+ def test_model_step_result_structure(self):
626
+ """Test the expected structure of model step results"""
627
+ # Sample result dictionary matching ModelStepResult structure
628
+ result: dict[str, Any] = {"outputs": {}, "content": None, "logprob": None}
629
+
630
+ # Verify that all expected keys are present
631
+ assert "outputs" in result
632
+ assert "content" in result
633
+ assert "logprob" in result
634
+
635
+ # Test with populated values
636
+ result = {"outputs": {"field": "value"}, "content": "Full response", "logprob": -1.5}
637
+
638
+ assert result["outputs"] == {"field": "value"}
639
+ assert result["content"] == "Full response"
640
+ assert result["logprob"] == -1.5
641
+
642
+
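The structure tests spell out the two result shapes by example. Written down as typed shapes, the same expectations look roughly like the following; the names match the structures the tests reference (`ModelStepResult`, `WorkflowOutput`), but the actual definitions in `workflows` may differ:

# Hypothetical TypedDict shapes inferred from the assertions in TestOutputStructure.
from typing import Any, TypedDict


class ModelStepResult(TypedDict):
    outputs: dict[str, Any]   # parsed output fields of a single step
    content: str | None       # full model response, when requested
    logprob: float | None     # log probability, when requested


class WorkflowOutput(TypedDict):
    final_outputs: dict[str, Any]          # values resolved from workflow.outputs
    intermediate_outputs: dict[str, Any]   # external inputs plus "step_id.field" values
    step_contents: dict[str, str]          # full step contents keyed by step id
    logprob: float | None                  # logprob of the designated step, if any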
643
  # External Input Tests
644
  class TestExternalInputs:
645
  def test_external_input_existence(self):
 
716
  assert not validator.validate(workflow)
717
  assert len(validator.errors) == 1
718
  assert validator.errors[0].error_type == ValidationErrorType.STEP
719
+
720
+
721
+ # Extended validator tests for actual implementation
722
+ class TestExtendedValidation:
723
+ def test_parse_variable_reference(self):
724
+ """Test the _parse_variable_reference method"""
725
+ validator = WorkflowValidator()
726
+
727
+ # Test external input reference
728
+ step_id, field_name = _parse_variable_reference("input_var")
729
+ assert step_id is None
730
+ assert field_name == "input_var"
731
+
732
+ # Test step output reference
733
+ step_id, field_name = _parse_variable_reference("step1.output")
734
+ assert step_id == "step1"
735
+ assert field_name == "output"
736
+
737
+ def test_is_valid_identifier(self):
738
+ """Test the _is_valid_identifier method"""
739
+ validator = WorkflowValidator()
740
+
741
+ # Valid identifiers
742
+ assert validator._is_valid_identifier("valid_name")
743
+ assert validator._is_valid_identifier("ValidName")
744
+ assert validator._is_valid_identifier("name123")
745
+
746
+ # Invalid identifiers
747
+ assert not validator._is_valid_identifier("") # Empty
748
+ assert not validator._is_valid_identifier(" ") # Whitespace
749
+ assert not validator._is_valid_identifier("123name") # Starts with number
750
+ assert not validator._is_valid_identifier("name-with-hyphens") # Has hyphens
751
+ assert not validator._is_valid_identifier("name.with.dots") # Has dots
752
+
753
+ def test_is_valid_external_input(self):
754
+ """Test the _is_valid_external_input method"""
755
+ validator = WorkflowValidator()
756
+
757
+ # Valid external inputs
758
+ assert validator._is_valid_external_input("input_var")
759
+
760
+ # Invalid external inputs
761
+ assert not validator._is_valid_external_input("") # Empty
762
+ assert not validator._is_valid_external_input("input.var") # Contains dot
763
+ assert not validator._is_valid_external_input("123input") # Starts with number
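The helper tests above fully determine the behaviour of the small utilities they cover. A sketch consistent with those assertions, shown as free functions for brevity (in the source, `_is_valid_identifier` and `_is_valid_external_input` are WorkflowValidator methods, and the real implementations may differ in detail):

# Hypothetical implementations matching the assertions in TestExtendedValidation.
def _parse_variable_reference(variable: str) -> tuple[str | None, str]:
    """Split a "step_id.field" reference; a bare name is an external input."""
    if "." not in variable:
        return None, variable
    step_id, field_name = variable.split(".", maxsplit=1)
    return step_id, field_name


def _is_valid_identifier(name: str) -> bool:
    """Accept Python-style identifiers only (no dots, hyphens, or leading digits)."""
    return bool(name) and name.isidentifier()


def _is_valid_external_input(name: str) -> bool:
    """External inputs are plain identifiers with no step prefix."""
    return _is_valid_identifier(name) and "." not in name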