Spaces:
Running
Running
Maharshi Gor
commited on
Commit
·
193db9d
1
Parent(s):
a562808
First Working commit
Browse files- .gitignore +23 -0
- Makefile +13 -0
- app.py +125 -0
- pyproject.toml +13 -0
- requirements.txt +27 -0
- src/components/__init__.py +1 -0
- src/components/model_pipeline/__init__.py +0 -0
- src/components/model_pipeline/model_pipeline.py +291 -0
- src/components/model_pipeline/state_manager.py +180 -0
- src/components/model_step/__init__.py +0 -0
- src/components/model_step/model_step.py +477 -0
- src/components/model_step/state_manager.py +152 -0
- src/components/model_step/ui_components.py +91 -0
- src/components/quizbowl/__init__.py +0 -0
- src/components/quizbowl/bonus.py +399 -0
- src/components/quizbowl/plotting.py +194 -0
- src/components/quizbowl/tossup.py +426 -0
- src/components/quizbowl/utils.py +86 -0
- src/components/utils.py +29 -0
- src/display/__init__.py +0 -0
- src/display/css_html_js.py +122 -0
- src/display/custom_css.py +413 -0
- src/display/formatting.py +27 -0
- src/display/utils.py +110 -0
- src/envs.py +86 -0
- src/submission/structs.py +58 -0
- src/submission/submit.py +170 -0
- src/utils.py +38 -0
- src/workflows/README.md +92 -0
- src/workflows/errors.py +63 -0
- src/workflows/executors.py +440 -0
- src/workflows/factory.py +150 -0
- src/workflows/qb/__init__.py +0 -0
- src/workflows/qb/simple_agent.py +194 -0
- src/workflows/quizbowl_agent.py +269 -0
- src/workflows/structs.py +229 -0
- src/workflows/utils.py +161 -0
- src/workflows/validators.py +586 -0
- tests/conftest.py +5 -0
- tests/test_executors.py +295 -0
- tests/test_utils.py +159 -0
- tests/test_validators.py +647 -0
.gitignore
ADDED
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Purpose: Ignore certain files and directories in git
|
2 |
+
*.pyc
|
3 |
+
*.pyi
|
4 |
+
*.pyo
|
5 |
+
*.pyd
|
6 |
+
*.pyw
|
7 |
+
*.pyz
|
8 |
+
*.pywz
|
9 |
+
*.pywz
|
10 |
+
|
11 |
+
auto_evals/
|
12 |
+
venv/
|
13 |
+
__pycache__/
|
14 |
+
.env
|
15 |
+
.ipynb_checkpoints
|
16 |
+
*ipynb
|
17 |
+
.vscode/
|
18 |
+
|
19 |
+
eval-queue/
|
20 |
+
eval-results/
|
21 |
+
eval-queue-bk/
|
22 |
+
eval-results-bk/
|
23 |
+
logs/
|
Makefile
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
.PHONY: style format
|
2 |
+
|
3 |
+
|
4 |
+
style:
|
5 |
+
python -m black --line-length 119 .
|
6 |
+
python -m isort .
|
7 |
+
ruff check --fix .
|
8 |
+
|
9 |
+
|
10 |
+
quality:
|
11 |
+
python -m black --check --line-length 119 .
|
12 |
+
python -m isort --check-only .
|
13 |
+
ruff check .
|
app.py
ADDED
@@ -0,0 +1,125 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import datasets
|
2 |
+
import gradio as gr
|
3 |
+
|
4 |
+
from components.quizbowl.bonus import BonusInterface
|
5 |
+
from components.quizbowl.tossup import TossupInterface
|
6 |
+
from display.custom_css import css_pipeline, css_tossup
|
7 |
+
|
8 |
+
# Constants
|
9 |
+
from envs import AVAILABLE_MODELS, DEFAULT_SELECTIONS, PLAYGROUND_DATASET_NAMES, THEME
|
10 |
+
from workflows import factory
|
11 |
+
|
12 |
+
js_preamble = """
|
13 |
+
<link href="https://fonts.cdnfonts.com/css/roboto-mono" rel="stylesheet">
|
14 |
+
|
15 |
+
<script>
|
16 |
+
const gradioApp = document.getElementsByTagName('gradio-app')[0];
|
17 |
+
console.log("Gradio app:", gradioApp);
|
18 |
+
console.log(gradioApp.querySelectorAll('.token'));
|
19 |
+
console.log(document.querySelectorAll('.token'));
|
20 |
+
|
21 |
+
// Function to trigger Python callback
|
22 |
+
const setHiddenIndex = (index) => {
|
23 |
+
console.log("Setting hidden index to:", index);
|
24 |
+
const hiddenIndex = gradioApp.querySelector("#hidden-index textarea");
|
25 |
+
if (hiddenIndex) {
|
26 |
+
hiddenIndex.value = index;
|
27 |
+
let event = new Event("input", { bubbles: true});
|
28 |
+
Object.defineProperty(event, "target", { value: hiddenIndex});
|
29 |
+
hiddenIndex.dispatchEvent(event);
|
30 |
+
}
|
31 |
+
};
|
32 |
+
|
33 |
+
// Add event listeners to all tokens
|
34 |
+
function setupTokenListeners() {
|
35 |
+
const tokens = gradioApp.querySelectorAll('.token');
|
36 |
+
console.log("Tokens:", tokens);
|
37 |
+
tokens.forEach(token => {
|
38 |
+
token.addEventListener('mouseover', function() {
|
39 |
+
const index = parseInt(this.getAttribute('data-index'));
|
40 |
+
console.log("Mouseover token index:", index);
|
41 |
+
|
42 |
+
// Reset all tokens
|
43 |
+
gradioApp.querySelectorAll('.token').forEach(el => {
|
44 |
+
el.classList.remove('highlighted');
|
45 |
+
});
|
46 |
+
|
47 |
+
// Highlight this token
|
48 |
+
this.classList.add('highlighted');
|
49 |
+
|
50 |
+
// Update the hidden index to trigger the Python callback
|
51 |
+
setHiddenIndex(index);
|
52 |
+
});
|
53 |
+
});
|
54 |
+
}
|
55 |
+
console.log("Preamble complete");
|
56 |
+
|
57 |
+
document.addEventListener("DOMContentLoaded", function() {
|
58 |
+
// Setup initial listeners
|
59 |
+
console.log("DOM fully loaded and parsed");
|
60 |
+
setupTokenListeners();
|
61 |
+
|
62 |
+
// Setup a mutation observer to handle dynamically added tokens
|
63 |
+
const observer = new MutationObserver(function(mutations) {
|
64 |
+
mutations.forEach(function(mutation) {
|
65 |
+
if (mutation.addedNodes.length) {
|
66 |
+
setupTokenListeners();
|
67 |
+
}
|
68 |
+
});
|
69 |
+
});
|
70 |
+
|
71 |
+
// Start observing the token container for changes
|
72 |
+
const tokenContainer = gradioApp.querySelector('.token-container');
|
73 |
+
console.log("Token container:", tokenContainer);
|
74 |
+
if (tokenContainer) {
|
75 |
+
observer.observe(tokenContainer.parentNode, { childList: true, subtree: true });
|
76 |
+
}
|
77 |
+
console.log("Listener setup complete");
|
78 |
+
});
|
79 |
+
</script>
|
80 |
+
"""
|
81 |
+
|
82 |
+
|
83 |
+
def load_dataset(mode: str):
|
84 |
+
if mode == "tossup":
|
85 |
+
ds = datasets.load_dataset(PLAYGROUND_DATASET_NAMES["tossup"], split="eval")
|
86 |
+
ds = ds.filter(lambda x: x["qid"].split("-")[2] == "1" and int(x["qid"].split("-")[3]) <= 10)
|
87 |
+
elif mode == "bonus":
|
88 |
+
ds = datasets.load_dataset(PLAYGROUND_DATASET_NAMES["bonus"], split="eval")
|
89 |
+
ds = ds.filter(lambda x: x["qid"].split("-")[2] == "1" and int(x["qid"].split("-")[3]) <= 10)
|
90 |
+
else:
|
91 |
+
raise ValueError(f"Invalid mode: {mode}")
|
92 |
+
|
93 |
+
return ds
|
94 |
+
|
95 |
+
|
96 |
+
def main():
|
97 |
+
tossup_ds = load_dataset("tossup")
|
98 |
+
bonus_ds = load_dataset("bonus")
|
99 |
+
app = gr.Blocks(
|
100 |
+
css=css_pipeline + css_tossup,
|
101 |
+
head=js_preamble,
|
102 |
+
theme=THEME,
|
103 |
+
title="Quizbowl Bot",
|
104 |
+
)
|
105 |
+
with app:
|
106 |
+
with gr.Tabs():
|
107 |
+
with gr.Tab("Tossup Agents"):
|
108 |
+
defaults = DEFAULT_SELECTIONS["tossup"] | {
|
109 |
+
"init_workflow": factory.create_quizbowl_simple_workflow(),
|
110 |
+
"simple_workflow": False,
|
111 |
+
}
|
112 |
+
tossup_interface = TossupInterface(app, tossup_ds, AVAILABLE_MODELS, defaults)
|
113 |
+
# ModelStepComponent(value=factory.create_quizbowl_simple_step())
|
114 |
+
with gr.Tab("Bonus Round Agents"):
|
115 |
+
defaults = DEFAULT_SELECTIONS["bonus"] | {
|
116 |
+
"init_workflow": factory.create_quizbowl_bonus_simple_workflow(),
|
117 |
+
"simple_workflow": True,
|
118 |
+
}
|
119 |
+
bonus_interface = BonusInterface(app, bonus_ds, AVAILABLE_MODELS, defaults)
|
120 |
+
|
121 |
+
app.queue(api_open=True).launch()
|
122 |
+
|
123 |
+
|
124 |
+
if __name__ == "__main__":
|
125 |
+
main()
|
pyproject.toml
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[tool.ruff]
|
2 |
+
# Enable pycodestyle (`E`) and Pyflakes (`F`) codes by default.
|
3 |
+
select = ["E", "F"]
|
4 |
+
ignore = ["E501"] # line too long (black is taking care of this)
|
5 |
+
line-length = 119
|
6 |
+
fixable = ["A", "B", "C", "D", "E", "F", "G", "I", "N", "Q", "S", "T", "W", "ANN", "ARG", "BLE", "COM", "DJ", "DTZ", "EM", "ERA", "EXE", "FBT", "ICN", "INP", "ISC", "NPY", "PD", "PGH", "PIE", "PL", "PT", "PTH", "PYI", "RET", "RSE", "RUF", "SIM", "SLF", "TCH", "TID", "TRY", "UP", "YTT"]
|
7 |
+
|
8 |
+
[tool.isort]
|
9 |
+
profile = "black"
|
10 |
+
line_length = 119
|
11 |
+
|
12 |
+
[tool.black]
|
13 |
+
line-length = 119
|
requirements.txt
ADDED
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
APScheduler
|
2 |
+
black
|
3 |
+
datasets
|
4 |
+
gradio
|
5 |
+
modelscope_studio
|
6 |
+
gradio[oauth]
|
7 |
+
gradio_leaderboard
|
8 |
+
gradio_client
|
9 |
+
huggingface-hub>=0.18.0
|
10 |
+
matplotlib
|
11 |
+
numpy<2.0.0
|
12 |
+
pandas
|
13 |
+
python-dateutil
|
14 |
+
tqdm
|
15 |
+
transformers
|
16 |
+
tokenizers>=0.15.0
|
17 |
+
sentencepiece
|
18 |
+
litellm
|
19 |
+
openai
|
20 |
+
anthropic
|
21 |
+
cohere
|
22 |
+
langchain
|
23 |
+
langchain-core
|
24 |
+
langchain-community
|
25 |
+
langchain-anthropic
|
26 |
+
langchain-openai
|
27 |
+
langchain-cohere
|
src/components/__init__.py
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
# Components package
|
src/components/model_pipeline/__init__.py
ADDED
File without changes
|
src/components/model_pipeline/model_pipeline.py
ADDED
@@ -0,0 +1,291 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
|
3 |
+
import gradio as gr
|
4 |
+
import yaml
|
5 |
+
|
6 |
+
from components.model_pipeline.state_manager import (
|
7 |
+
ModelStepUIState,
|
8 |
+
PipelineState,
|
9 |
+
PipelineStateManager,
|
10 |
+
PipelineUIState,
|
11 |
+
)
|
12 |
+
from components.model_step.model_step import ModelStepComponent
|
13 |
+
from components.utils import make_state
|
14 |
+
from workflows.structs import ModelStep, Workflow
|
15 |
+
from workflows.validators import WorkflowValidator
|
16 |
+
|
17 |
+
|
18 |
+
def validate_simple_workflow(workflow: Workflow, required_output_variables: list[str]) -> Workflow:
|
19 |
+
"""Validate the workflow."""
|
20 |
+
step = next(iter(workflow.steps.values()))
|
21 |
+
if not step.output_fields:
|
22 |
+
raise ValueError("No output fields found in the workflow")
|
23 |
+
output_field_names = {output.name for output in step.output_fields}
|
24 |
+
if not set(required_output_variables) <= output_field_names:
|
25 |
+
missing_vars = required_output_variables - output_field_names
|
26 |
+
raise ValueError(f"Missing required output variables: {missing_vars}")
|
27 |
+
return workflow
|
28 |
+
|
29 |
+
|
30 |
+
def validate_complex_workflow(workflow: Workflow, required_output_variables: list[str]) -> Workflow:
|
31 |
+
"""Validate the workflow."""
|
32 |
+
print("Validating complex workflow.")
|
33 |
+
return workflow
|
34 |
+
step = next(iter(workflow.steps.values()))
|
35 |
+
if not step.output_fields:
|
36 |
+
raise ValueError("No output fields found in the workflow")
|
37 |
+
output_field_names = {output.name for output in step.output_fields}
|
38 |
+
if not output_field_names <= set(required_output_variables):
|
39 |
+
missing_vars = output_field_names - set(required_output_variables)
|
40 |
+
raise ValueError(f"Missing required output variables: {missing_vars}")
|
41 |
+
return workflow
|
42 |
+
|
43 |
+
|
44 |
+
def parse_yaml_workflow(yaml_str: str) -> Workflow:
|
45 |
+
"""Parse a YAML workflow."""
|
46 |
+
workflow = yaml.safe_load(yaml_str)
|
47 |
+
return Workflow(**workflow)
|
48 |
+
|
49 |
+
|
50 |
+
def update_workflow_from_code(yaml_str: str, ui_state: PipelineUIState) -> PipelineState:
|
51 |
+
"""Update a workflow from a YAML string."""
|
52 |
+
workflow = parse_yaml_workflow(yaml_str)
|
53 |
+
ui_state = PipelineUIState.from_workflow(workflow)
|
54 |
+
return PipelineState(workflow=workflow, ui_state=ui_state)
|
55 |
+
|
56 |
+
|
57 |
+
class PipelineInterface:
|
58 |
+
"""UI for the pipeline."""
|
59 |
+
|
60 |
+
def __init__(
|
61 |
+
self,
|
62 |
+
workflow: Workflow,
|
63 |
+
ui_state: PipelineUIState | None = None,
|
64 |
+
model_options: list[str] = None,
|
65 |
+
simple: bool = False,
|
66 |
+
):
|
67 |
+
self.model_options = model_options
|
68 |
+
self.simple = simple
|
69 |
+
if not ui_state:
|
70 |
+
ui_state = PipelineUIState.from_workflow(workflow)
|
71 |
+
self.ui_state = make_state(ui_state)
|
72 |
+
self.pipeline_state = make_state(PipelineState(workflow=workflow, ui_state=ui_state))
|
73 |
+
self.variables_state = make_state(workflow.get_available_variables())
|
74 |
+
|
75 |
+
self.sm = PipelineStateManager()
|
76 |
+
self.input_variables = workflow.inputs
|
77 |
+
self.required_output_variables = list(workflow.outputs.keys())
|
78 |
+
|
79 |
+
# UI elements
|
80 |
+
self.steps_container = None
|
81 |
+
self.components = []
|
82 |
+
|
83 |
+
# Render the pipeline UI
|
84 |
+
self.render()
|
85 |
+
|
86 |
+
def _render_step(
|
87 |
+
self,
|
88 |
+
model_step: ModelStep,
|
89 |
+
step_ui_state: ModelStepUIState,
|
90 |
+
available_variables: list[str],
|
91 |
+
position: int = 0,
|
92 |
+
):
|
93 |
+
with gr.Column(elem_classes="step-container"):
|
94 |
+
# Create the step component
|
95 |
+
step_interface = ModelStepComponent(
|
96 |
+
value=model_step,
|
97 |
+
ui_state=step_ui_state,
|
98 |
+
model_options=self.model_options,
|
99 |
+
input_variables=available_variables,
|
100 |
+
pipeline_state_manager=self.sm,
|
101 |
+
)
|
102 |
+
|
103 |
+
step_interface.on_model_step_change(
|
104 |
+
self.sm.update_model_step_state,
|
105 |
+
inputs=[self.pipeline_state, step_interface.model_step_state, step_interface.ui_state],
|
106 |
+
outputs=[self.pipeline_state, self.ui_state, self.variables_state],
|
107 |
+
)
|
108 |
+
|
109 |
+
step_interface.on_ui_change(
|
110 |
+
self.sm.update_model_step_ui,
|
111 |
+
inputs=[self.pipeline_state, step_interface.ui_state, gr.State(model_step.id)],
|
112 |
+
outputs=[self.pipeline_state, self.ui_state],
|
113 |
+
)
|
114 |
+
|
115 |
+
if self.simple:
|
116 |
+
return step_interface
|
117 |
+
|
118 |
+
# Add step controls below
|
119 |
+
with gr.Row(elem_classes="step-controls"):
|
120 |
+
up_button = gr.Button("⬆️ Move Up", elem_classes="step-control-btn")
|
121 |
+
down_button = gr.Button("⬇️ Move Down", elem_classes="step-control-btn")
|
122 |
+
remove_button = gr.Button("🗑️ Remove", elem_classes="step-control-btn")
|
123 |
+
|
124 |
+
buttons = (up_button, down_button, remove_button)
|
125 |
+
self._assign_step_controls(buttons, position)
|
126 |
+
|
127 |
+
return (step_interface, *buttons)
|
128 |
+
|
129 |
+
def _assign_step_controls(self, buttons: tuple[gr.Button, gr.Button, gr.Button], position: int):
|
130 |
+
up_button, down_button, remove_button = buttons
|
131 |
+
position = gr.State(position)
|
132 |
+
up_button.click(self.sm.move_up, inputs=[self.ui_state, position], outputs=self.ui_state)
|
133 |
+
down_button.click(self.sm.move_down, inputs=[self.ui_state, position], outputs=self.ui_state)
|
134 |
+
remove_button.click(
|
135 |
+
self.sm.remove_step,
|
136 |
+
inputs=[self.pipeline_state, position],
|
137 |
+
outputs=[self.pipeline_state, self.ui_state, self.variables_state],
|
138 |
+
)
|
139 |
+
|
140 |
+
def _render_add_step_button(self, position: int):
|
141 |
+
if position not in {0, -1}:
|
142 |
+
raise ValueError("Position must be 0 or -1")
|
143 |
+
row_class = "pipeline-header" if position == 0 else "pipeline-footer"
|
144 |
+
with gr.Row(elem_classes=row_class):
|
145 |
+
add_step_btn = gr.Button("➕ Add Step", elem_classes="add-step-button")
|
146 |
+
add_step_btn.click(
|
147 |
+
self.sm.add_step,
|
148 |
+
inputs=[self.pipeline_state, gr.State(position)],
|
149 |
+
outputs=[self.pipeline_state, self.ui_state, self.variables_state],
|
150 |
+
)
|
151 |
+
return add_step_btn
|
152 |
+
|
153 |
+
def _render_output_fields(self, available_variables: list[str], pipeline_state: PipelineState):
|
154 |
+
dropdowns = {}
|
155 |
+
UNSET_VALUE = "Choose variable..."
|
156 |
+
variable_options = [UNSET_VALUE] + [v for v in available_variables if v not in self.input_variables]
|
157 |
+
with gr.Column(elem_classes="step-accordion"):
|
158 |
+
with gr.Row(elem_classes="output-fields-header"):
|
159 |
+
gr.Markdown("#### Final output variables mapping:")
|
160 |
+
with gr.Row(elem_classes="output-fields-row"):
|
161 |
+
for output_field in self.required_output_variables:
|
162 |
+
value = pipeline_state.workflow.outputs[output_field]
|
163 |
+
if not value:
|
164 |
+
value = UNSET_VALUE
|
165 |
+
dropdown = gr.Dropdown(
|
166 |
+
label=output_field,
|
167 |
+
value=value,
|
168 |
+
choices=variable_options,
|
169 |
+
interactive=True,
|
170 |
+
elem_classes="output-field-variable",
|
171 |
+
# show_label=False,
|
172 |
+
)
|
173 |
+
dropdown.change(
|
174 |
+
self.sm.update_output_variables,
|
175 |
+
inputs=[self.pipeline_state, gr.State(output_field), dropdown],
|
176 |
+
outputs=[self.pipeline_state],
|
177 |
+
)
|
178 |
+
dropdowns[output_field] = dropdown
|
179 |
+
|
180 |
+
def update_choices(available_variables):
|
181 |
+
"""Update the choices for the dropdowns"""
|
182 |
+
return [
|
183 |
+
gr.update(choices=available_variables, value=None, selected=None) for dropdown in dropdowns.values()
|
184 |
+
]
|
185 |
+
|
186 |
+
self.variables_state.change(
|
187 |
+
update_choices,
|
188 |
+
inputs=[self.variables_state],
|
189 |
+
outputs=list(dropdowns.values()),
|
190 |
+
)
|
191 |
+
return dropdowns
|
192 |
+
|
193 |
+
def validate_workflow(self, state: PipelineState) -> PipelineState:
|
194 |
+
"""Validate the workflow."""
|
195 |
+
try:
|
196 |
+
if self.simple:
|
197 |
+
workflow = validate_simple_workflow(state.workflow, self.required_output_variables)
|
198 |
+
else:
|
199 |
+
workflow = validate_complex_workflow(state.workflow, self.required_output_variables)
|
200 |
+
state.workflow = workflow
|
201 |
+
return state
|
202 |
+
except ValueError as e:
|
203 |
+
raise gr.Error(e)
|
204 |
+
|
205 |
+
def _render_pipeline_header(self):
|
206 |
+
# Add Step button at top
|
207 |
+
input_variables_str = ", ".join([f"`{variable}`" for variable in self.input_variables])
|
208 |
+
output_variables_str = ", ".join([f"`{variable}`" for variable in self.required_output_variables])
|
209 |
+
if self.simple:
|
210 |
+
instruction = "Create a simple single LLM call pipeline that takes in the following input variables and outputs the following output variables:"
|
211 |
+
else:
|
212 |
+
instruction = "Create a pipeline that takes in the following input variables and outputs the following output variables:"
|
213 |
+
gr.Markdown(f"### {instruction}")
|
214 |
+
gr.Markdown(f"Input Variables: {input_variables_str}")
|
215 |
+
gr.Markdown(f"Output Variables: {output_variables_str}")
|
216 |
+
|
217 |
+
# if not self.simple:
|
218 |
+
# self._render_add_step_button(0)
|
219 |
+
|
220 |
+
def render(self):
|
221 |
+
"""Render the pipeline UI."""
|
222 |
+
# Create a placeholder for all the step components
|
223 |
+
self.all_components = []
|
224 |
+
|
225 |
+
# self.pipeline_state.change(
|
226 |
+
# lambda x, y: print(f"Pipeline state changed! UI:\n{x}\n\n Data:\n{y}"),
|
227 |
+
# inputs=[self.ui_state, self.pipeline_state],
|
228 |
+
# outputs=[],
|
229 |
+
# )
|
230 |
+
|
231 |
+
self._render_pipeline_header()
|
232 |
+
|
233 |
+
# Function to render all steps
|
234 |
+
@gr.render(inputs=[self.pipeline_state, self.ui_state])
|
235 |
+
def render_steps(state, ui_state):
|
236 |
+
"""Render all steps in the pipeline"""
|
237 |
+
workflow = state.workflow
|
238 |
+
print(f"\nRerender triggered! Current UI State:{ui_state}")
|
239 |
+
components = []
|
240 |
+
|
241 |
+
step_objects = [] # Reset step objects list
|
242 |
+
for i, step_id in enumerate(ui_state.step_ids):
|
243 |
+
step_data = workflow.steps[step_id]
|
244 |
+
step_ui_state = ui_state.steps[step_id]
|
245 |
+
available_variables = self.sm.get_all_variables(state, step_id)
|
246 |
+
sub_components = self._render_step(step_data, step_ui_state, available_variables, i)
|
247 |
+
step_objects.append(sub_components)
|
248 |
+
|
249 |
+
components.append(step_objects)
|
250 |
+
|
251 |
+
# Bottom buttons
|
252 |
+
if not self.simple:
|
253 |
+
self._render_add_step_button(-1)
|
254 |
+
|
255 |
+
@gr.render(inputs=[self.variables_state, self.pipeline_state])
|
256 |
+
def render_output_fields(available_variables, pipeline_state):
|
257 |
+
return self._render_output_fields(available_variables, pipeline_state)
|
258 |
+
|
259 |
+
export_btn = gr.Button("Export Pipeline", elem_classes="export-button")
|
260 |
+
# components.append(export_btn)
|
261 |
+
|
262 |
+
# Add a code box to display the workflow JSON
|
263 |
+
# with gr.Column(elem_classes="workflow-json-container"):
|
264 |
+
with gr.Accordion("Pipeline Preview", open=False, elem_classes="pipeline-preview") as config_accordion:
|
265 |
+
config_output = gr.Code(
|
266 |
+
label="Workflow Configuration",
|
267 |
+
language="yaml",
|
268 |
+
elem_classes="workflow-json",
|
269 |
+
interactive=True,
|
270 |
+
autocomplete=True,
|
271 |
+
)
|
272 |
+
# components.append(config_accordion)
|
273 |
+
|
274 |
+
config_output.blur(
|
275 |
+
fn=update_workflow_from_code,
|
276 |
+
inputs=[config_output, self.ui_state],
|
277 |
+
outputs=[self.pipeline_state],
|
278 |
+
)
|
279 |
+
|
280 |
+
# Connect the export button to show the workflow JSON
|
281 |
+
export_btn.click(self.validate_workflow, inputs=[self.pipeline_state], outputs=[self.pipeline_state]).success(
|
282 |
+
fn=lambda: gr.update(visible=True, open=True), outputs=[config_accordion]
|
283 |
+
)
|
284 |
+
export_btn.click(
|
285 |
+
fn=self.sm.get_formatted_config,
|
286 |
+
inputs=[self.pipeline_state, gr.State("yaml")],
|
287 |
+
outputs=[config_output],
|
288 |
+
js="() => {document.querySelector('.pipeline-preview').scrollIntoView({behavior: 'smooth'})}",
|
289 |
+
)
|
290 |
+
|
291 |
+
# self.all_components = components
|
src/components/model_pipeline/state_manager.py
ADDED
@@ -0,0 +1,180 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
import logging
|
3 |
+
from typing import Any, Literal
|
4 |
+
|
5 |
+
import gradio as gr
|
6 |
+
import yaml
|
7 |
+
from pydantic import BaseModel, Field
|
8 |
+
|
9 |
+
from components import utils
|
10 |
+
from workflows.factory import create_new_llm_step
|
11 |
+
from workflows.structs import ModelStep, Workflow
|
12 |
+
|
13 |
+
|
14 |
+
def make_step_id(step_id: int):
|
15 |
+
"""Make a step id from a step name."""
|
16 |
+
if step_id < 26:
|
17 |
+
return chr(ord("A") + step_id)
|
18 |
+
else:
|
19 |
+
# For more than 26 steps, use AA, AB, AC, etc.
|
20 |
+
first_char = chr(ord("A") + (step_id // 26) - 1)
|
21 |
+
second_char = chr(ord("A") + (step_id % 26))
|
22 |
+
return f"{first_char}{second_char}"
|
23 |
+
|
24 |
+
|
25 |
+
class ModelStepUIState(BaseModel):
|
26 |
+
"""Represents the UI state for a model step component."""
|
27 |
+
|
28 |
+
expanded: bool = True
|
29 |
+
active_tab: Literal["model-tab", "inputs-tab", "outputs-tab"] = "model-tab"
|
30 |
+
|
31 |
+
def update(self, key: str, value: Any) -> "ModelStepUIState":
|
32 |
+
"""Update the UI state."""
|
33 |
+
new_state = self.model_copy(update={key: value})
|
34 |
+
logging.warning("UI state updated: %s", self)
|
35 |
+
return new_state
|
36 |
+
|
37 |
+
|
38 |
+
class PipelineUIState(BaseModel):
|
39 |
+
"""Represents the UI state for a pipeline component."""
|
40 |
+
|
41 |
+
step_ids: list[str] = Field(default_factory=list)
|
42 |
+
steps: dict[str, ModelStepUIState] = Field(default_factory=dict)
|
43 |
+
|
44 |
+
def model_post_init(self, __context: utils.Any) -> None:
|
45 |
+
if not self.steps and self.step_ids:
|
46 |
+
self.steps = {step_id: ModelStepUIState() for step_id in self.step_ids}
|
47 |
+
return super().model_post_init(__context)
|
48 |
+
|
49 |
+
def get_step_position(self, step_id: str):
|
50 |
+
"""Get the position of a step in the pipeline."""
|
51 |
+
return next((i for i, step in enumerate(self.step_ids) if step == step_id), None)
|
52 |
+
|
53 |
+
@classmethod
|
54 |
+
def from_workflow(cls, workflow: Workflow):
|
55 |
+
"""Create a pipeline UI state from a workflow."""
|
56 |
+
return PipelineUIState(
|
57 |
+
step_ids=list(workflow.steps.keys()),
|
58 |
+
steps={step_id: ModelStepUIState() for step_id in workflow.steps.keys()},
|
59 |
+
)
|
60 |
+
|
61 |
+
|
62 |
+
class PipelineState(BaseModel):
|
63 |
+
"""Represents the state for a pipeline component."""
|
64 |
+
|
65 |
+
workflow: Workflow
|
66 |
+
ui_state: PipelineUIState
|
67 |
+
|
68 |
+
def insert_step(self, position: int, step: ModelStep):
|
69 |
+
if step.id in self.workflow.steps:
|
70 |
+
raise ValueError(f"Step {step.id} already exists in pipeline")
|
71 |
+
|
72 |
+
# Validate position
|
73 |
+
if position != -1 and (position < 0 or position > self.n_steps):
|
74 |
+
raise ValueError(f"Invalid position: {position}. Must be between 0 and {self.n_steps} or -1")
|
75 |
+
|
76 |
+
self.workflow.steps[step.id] = step
|
77 |
+
|
78 |
+
self.ui_state = self.ui_state.model_copy()
|
79 |
+
self.ui_state.steps[step.id] = ModelStepUIState()
|
80 |
+
if position == -1:
|
81 |
+
self.ui_state.step_ids.append(step.id)
|
82 |
+
else:
|
83 |
+
self.ui_state.step_ids.insert(position, step.id)
|
84 |
+
return self
|
85 |
+
|
86 |
+
def remove_step(self, position: int):
|
87 |
+
step_id = self.ui_state.step_ids.pop(position)
|
88 |
+
self.workflow.steps.pop(step_id)
|
89 |
+
self.ui_state = self.ui_state.model_copy()
|
90 |
+
self.ui_state.steps.pop(step_id)
|
91 |
+
self.update_output_variables_mapping()
|
92 |
+
|
93 |
+
def update_output_variables_mapping(self):
|
94 |
+
available_variables = set(self.available_variables)
|
95 |
+
for output_field in self.workflow.outputs:
|
96 |
+
if self.workflow.outputs[output_field] not in available_variables:
|
97 |
+
self.workflow.outputs[output_field] = None
|
98 |
+
return self
|
99 |
+
|
100 |
+
@property
|
101 |
+
def available_variables(self):
|
102 |
+
return self.workflow.get_available_variables()
|
103 |
+
|
104 |
+
@property
|
105 |
+
def n_steps(self):
|
106 |
+
return len(self.workflow.steps)
|
107 |
+
|
108 |
+
|
109 |
+
class PipelineStateManager:
|
110 |
+
"""Manages a pipeline of multiple steps."""
|
111 |
+
|
112 |
+
def get_formatted_config(self, state: PipelineState, format: Literal["json", "yaml"] = "yaml"):
|
113 |
+
"""Get the full pipeline configuration."""
|
114 |
+
config = state.workflow.model_dump(exclude_defaults=True)
|
115 |
+
if format == "yaml":
|
116 |
+
return yaml.dump(config, default_flow_style=False, sort_keys=False, indent=4)
|
117 |
+
else:
|
118 |
+
return json.dumps(config, indent=4, sort_keys=False)
|
119 |
+
|
120 |
+
def count_state(self):
|
121 |
+
return gr.State(len(self.steps))
|
122 |
+
|
123 |
+
def add_step(self, state: PipelineState, position: int = -1, name=""):
|
124 |
+
"""Create a new step and return its state."""
|
125 |
+
step_id = make_step_id(state.n_steps)
|
126 |
+
step_name = name or f"Step {state.n_steps + 1}"
|
127 |
+
new_step = create_new_llm_step(step_id=step_id, name=step_name)
|
128 |
+
state = state.insert_step(position, new_step)
|
129 |
+
return state, state.ui_state, state.available_variables
|
130 |
+
|
131 |
+
def remove_step(self, state: PipelineState, position: int):
|
132 |
+
"""Remove a step from the pipeline."""
|
133 |
+
if 0 <= position < state.n_steps:
|
134 |
+
state = state.remove_step(position)
|
135 |
+
else:
|
136 |
+
raise ValueError(f"Invalid step position: {position}")
|
137 |
+
return state, state.ui_state, state.available_variables
|
138 |
+
|
139 |
+
def move_up(self, ui_state: PipelineUIState, position: int):
|
140 |
+
"""Move a step up in the pipeline."""
|
141 |
+
utils.move_item(ui_state.step_ids, position, "up")
|
142 |
+
return ui_state.model_copy()
|
143 |
+
|
144 |
+
def move_down(self, ui_state: PipelineUIState, position: int):
|
145 |
+
"""Move a step down in the pipeline."""
|
146 |
+
utils.move_item(ui_state.step_ids, position, "down")
|
147 |
+
return ui_state.model_copy()
|
148 |
+
|
149 |
+
def update_model_step_state(self, state: PipelineState, model_step: ModelStep, ui_state: ModelStepUIState):
|
150 |
+
"""Update a step in the pipeline."""
|
151 |
+
state.workflow.steps[model_step.id] = model_step.model_copy()
|
152 |
+
state.ui_state.steps[model_step.id] = ui_state.model_copy()
|
153 |
+
state.ui_state = state.ui_state.model_copy()
|
154 |
+
state.update_output_variables_mapping()
|
155 |
+
return state, state.ui_state, state.available_variables
|
156 |
+
|
157 |
+
def update_output_variables(self, state: PipelineState, target: str, produced_variable: str):
|
158 |
+
if produced_variable == "Choose variable...":
|
159 |
+
produced_variable = None
|
160 |
+
"""Update the output variables for a step."""
|
161 |
+
state.workflow.outputs.update({target: produced_variable})
|
162 |
+
return state
|
163 |
+
|
164 |
+
def update_model_step_ui(self, state: PipelineState, step_ui: ModelStepUIState, step_id: str):
|
165 |
+
"""Update a step in the pipeline."""
|
166 |
+
state.ui_state.steps[step_id] = step_ui.model_copy()
|
167 |
+
return state, state.ui_state
|
168 |
+
|
169 |
+
def get_all_variables(self, state: PipelineState, model_step_id: str | None = None) -> list[str]:
|
170 |
+
"""Get all variables from all steps."""
|
171 |
+
available_variables = state.available_variables
|
172 |
+
if model_step_id is None:
|
173 |
+
return available_variables
|
174 |
+
else:
|
175 |
+
prefix = f"{model_step_id}."
|
176 |
+
return [var for var in available_variables if not var.startswith(prefix)]
|
177 |
+
|
178 |
+
def get_pipeline_config(self):
|
179 |
+
"""Get the full pipeline configuration."""
|
180 |
+
return self.workflow
|
src/components/model_step/__init__.py
ADDED
File without changes
|
src/components/model_step/model_step.py
ADDED
@@ -0,0 +1,477 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
from typing import Any
|
3 |
+
|
4 |
+
import gradio as gr
|
5 |
+
from gradio.components import FormComponent
|
6 |
+
|
7 |
+
from components.model_pipeline.state_manager import ModelStepUIState, PipelineState, PipelineStateManager
|
8 |
+
from utils import get_full_model_name
|
9 |
+
from workflows.structs import ModelStep
|
10 |
+
|
11 |
+
from .state_manager import ModelStepStateManager
|
12 |
+
from .ui_components import InputRowButtonGroup, OutputRowButtonGroup
|
13 |
+
|
14 |
+
|
15 |
+
def _make_accordion_label(model_step: ModelStep):
    """Build the accordion title: '<id>: <name> (<inputs>) → (<outputs>)'.

    Falls back to "Untitled" when the step has no name.
    """
    display_name = model_step.name or "Untitled"
    inputs_str = ", ".join(field.name for field in model_step.input_fields)
    outputs_str = ", ".join(field.name for field in model_step.output_fields)
    return f"{model_step.id}: {display_name} ({inputs_str}) → ({outputs_str})"
24 |
+
class ModelStepComponent(FormComponent):
    """
    A custom Gradio component representing a single Step in a pipeline.
    It contains:
    1. Model Provider & System Prompt
    2. Inputs – fields with name, description, and variable used
    3. Outputs – fields with name, description, and variable used

    Listens to events:
    - on_model_step_change
    - on_ui_change
    """

    def __init__(
        self,
        value: ModelStep | gr.State,
        ui_state: ModelStepUIState | gr.State | None = None,
        model_options: list[str] | None = None,
        input_variables: list[str] | None = None,
        max_input_fields=5,
        max_output_fields=5,
        pipeline_state_manager: PipelineStateManager | None = None,
        **kwargs,
    ):
        # Upper bound on how many input/output rows are pre-rendered; hidden
        # rows are toggled visible as fields are added/removed.
        self.max_fields = {
            "input": max_input_fields,
            "output": max_output_fields,
        }
        self.model_options = model_options
        self.input_variables = input_variables
        self.sm = ModelStepStateManager(max_input_fields, max_output_fields)
        self.pipeline_sm: PipelineStateManager = pipeline_state_manager

        self.model_step_state = gr.State(value)
        ui_state = ui_state or ModelStepUIState()
        if not isinstance(ui_state, gr.State):
            ui_state = gr.State(ui_state)
        self.ui_state: gr.State = ui_state

        # NOTE(review): assumes `value` is a ModelStep here; if a gr.State is
        # passed (as the annotation allows), `value.input_fields` would fail —
        # confirm callers only pass ModelStep instances.
        self.inputs_count_state = gr.State(len(value.input_fields))
        self.outputs_count_state = gr.State(len(value.output_fields))

        # UI components that will be created in render
        self.accordion = None
        self.ui = None
        self.step_name_input = None
        self.model_selection = None
        self.system_prompt = None
        self.input_rows = []
        self.output_rows = []

        super().__init__(**kwargs)
        # self.render()
        self.setup_event_listeners()
|
79 |
+
@property
def model_step(self) -> ModelStep:
    """The current ModelStep value held in the gr.State wrapper."""
    return self.model_step_state.value

@property
def step_id(self) -> str:
    """Unique identifier of the wrapped step."""
    return self.model_step.id

def get_step_config(self) -> dict:
    """Serialize the current step to a plain dict."""
    return self.model_step.model_dump()

# UI state accessors
def is_open(self) -> bool:
    """Whether the step's accordion is currently expanded."""
    return self.ui_state.value.expanded

def get_active_tab(self) -> str:
    """Get the current active tab."""
    return self.ui_state.value.active_tab
98 |
+
def _render_input_row(self, i: int) -> tuple[gr.Row, tuple, tuple]:
    """Render a single input row at index i.

    Returns (row, (name_textbox, variable_dropdown, description_textbox),
    button_group). Rows beyond the current number of input fields are rendered
    hidden so they can be toggled visible later without re-rendering.
    """
    inputs = self.model_step.input_fields
    is_visible = i < len(inputs)
    # Only the first row shows widget labels to keep the column headers single.
    label_visible = i == 0
    initial_name = inputs[i].name if is_visible else ""
    initial_desc = inputs[i].description if is_visible else ""
    initial_var = inputs[i].variable if is_visible else "question_text"

    with gr.Row(visible=is_visible, elem_classes="field-row form") as row:
        button_group = InputRowButtonGroup()

        inp_name = gr.Textbox(
            label="Input Name",
            placeholder="Field name",
            value=initial_name,
            elem_classes="field-name",
            scale=1,
            show_label=label_visible,
        )

        # Get variable choices safely
        # variable_choices = []
        # if self.pipeline_sm is not None:
        #     variable_choices = self.pipeline_sm.get_all_variables(self.step_id)

        inp_var = gr.Dropdown(
            choices=self.input_variables,
            label="Variable Used",
            value=initial_var,
            elem_classes="field-variable",
            scale=1,
            show_label=label_visible,
        )
        inp_desc = gr.Textbox(
            label="Description",
            placeholder="Field description",
            value=initial_desc,
            elem_classes="field-description",
            scale=3,
            show_label=label_visible,
        )
    fields = (inp_name, inp_var, inp_desc)
    # buttons = (delete_button, add_button)
    return row, fields, button_group
144 |
+
def _render_output_row(self, i: int) -> tuple[gr.Row, tuple, tuple]:
    """Render a single output row at index i.

    Returns (row, (name_textbox, type_dropdown, description_textbox),
    button_group). Like input rows, slots past the current field count are
    rendered hidden.
    """
    outputs = self.model_step.output_fields
    is_visible = i < len(outputs)
    # Only the first row shows widget labels.
    label_visible = i == 0
    initial_name = outputs[i].name if is_visible else ""
    initial_desc = outputs[i].description if is_visible else ""
    initial_type = outputs[i].type if is_visible else "str"
    with gr.Row(visible=is_visible, elem_classes="field-row") as row:
        button_group = OutputRowButtonGroup()

        out_name = gr.Textbox(
            label="Output Field",
            placeholder="Variable identifier",
            value=initial_name,
            elem_classes="field-name",
            scale=1,
            show_label=label_visible,
        )
        out_type = gr.Dropdown(
            choices=["str", "int", "float", "bool"],
            allow_custom_value=True,
            label="Type",
            value=initial_type,
            elem_classes="field-type",
            scale=0,
            show_label=label_visible,
            interactive=True,
        )
        out_desc = gr.Textbox(
            label="Description",
            placeholder="Field description",
            value=initial_desc,
            elem_classes="field-description",
            scale=3,
            show_label=label_visible,
        )

    fields = (out_name, out_type, out_desc)
    return row, fields, button_group
185 |
+
def _render_prompt_tab_content(self):
    """Render the Model tab body: the step's system prompt editor."""
    self.system_prompt = gr.Textbox(
        label="System Prompt",
        placeholder="Enter the system prompt for this step",
        lines=5,
        value=self.model_step.system_prompt,
        elem_classes="system-prompt",
    )
194 |
+
def _render_inputs_tab_content(self):
    """Render the Inputs tab: a fixed pool of input rows (extra ones hidden)."""
    with gr.Column(variant="panel", elem_classes="fields-panel") as self.inputs_column:
        # Render input rows using helper method
        for i in range(self.max_fields["input"]):
            row = self._render_input_row(i)
            self.input_rows.append(row)
201 |
+
def _render_outputs_tab_content(self):
    """Render the Outputs tab: a fixed pool of output rows (extra ones hidden)."""
    with gr.Column(variant="panel", elem_classes="fields-panel") as self.outputs_column:
        # Render output rows using helper method
        for i in range(self.max_fields["output"]):
            row = self._render_output_row(i)
            self.output_rows.append(row)
208 |
+
def _render_tab_content(self, tab_id: str):
|
209 |
+
if tab_id == "model-tab":
|
210 |
+
self._render_prompt_tab_content()
|
211 |
+
elif tab_id == "inputs-tab":
|
212 |
+
self._render_inputs_tab_content()
|
213 |
+
elif tab_id == "outputs-tab":
|
214 |
+
self._render_outputs_tab_content()
|
215 |
+
|
216 |
+
def _render_header(self, model_options: list[str]):
    """Render the accordion header row: step name, model picker, temperature slider.

    Note: `model_options` must be a list — it is concatenated with a list of
    the placeholder choice below.
    """
    # Header with step name
    with gr.Row(elem_classes="step-header-row"):
        self.step_name_input = gr.Textbox(
            label="",
            value=self.model_step.name,
            elem_classes="step-name",
            show_label=False,
            placeholder="Model name...",
        )
        unselected_choice = "Select Model..."
        # Show the fully-qualified model name, or the placeholder if unset.
        current_value = (
            get_full_model_name(self.model_step.model, self.model_step.provider)
            if self.model_step.model
            else unselected_choice
        )
        self.model_selection = gr.Dropdown(
            choices=[unselected_choice] + model_options,
            label="Model Provider",
            show_label=False,
            value=current_value,
            elem_classes="model-dropdown",
            scale=1,
        )
        self.temperature_slider = gr.Slider(
            value=self.model_step.temperature,
            minimum=0.0,
            maximum=5,
            step=0.05,
            info="Temperature",
            show_label=False,
        )
249 |
+
def render(self):
    """Render the component UI: an accordion containing header and config tabs."""
    # Reset UI component lists
    self.input_rows = []
    self.output_rows = []
    self.tabs = {}

    # Create the accordion for this step
    accordion_label = _make_accordion_label(self.model_step)
    self.accordion = gr.Accordion(label=accordion_label, open=self.is_open(), elem_classes="step-accordion")

    # Create the UI content inside the accordion
    with self.accordion:
        self._render_header(self.model_options)

        # Configuration tabs; re-select whichever tab was active in the UI state.
        selected_tab = self.get_active_tab()
        with gr.Tabs(elem_classes="step-tabs", selected=selected_tab):
            tab_ids = ("model-tab", "inputs-tab", "outputs-tab")
            tab_labels = ("Model", "Inputs", "Outputs")
            for tab_id, label in zip(tab_ids, tab_labels):
                with gr.TabItem(label, elem_classes="tab-content", id=tab_id) as tab:
                    self._render_tab_content(tab_id)
                self.tabs[tab_id] = tab

    return self.accordion
276 |
+
def _setup_event_listeners_for_view_change(self):
    """Persist tab selection and accordion expand/collapse into the UI state."""
    for tab_id, tab in self.tabs.items():
        tab.select(
            fn=self.sm.update_ui_state,
            inputs=[self.ui_state, gr.State("active_tab"), gr.State(tab_id)],
            outputs=[self.ui_state],
        )
    self.accordion.collapse(
        fn=self.sm.update_ui_state,
        inputs=[self.ui_state, gr.State("expanded"), gr.State(False)],
        outputs=[self.ui_state],
    )
    self.accordion.expand(
        fn=self.sm.update_ui_state,
        inputs=[self.ui_state, gr.State("expanded"), gr.State(True)],
        outputs=[self.ui_state],
    )
294 |
+
def _setup_event_listeners_model_tab(self):
    """Wire header + Model-tab widgets to their state-update handlers."""
    # Step name change: also refreshes the accordion label.
    self.step_name_input.blur(
        fn=self._update_state_and_label,
        inputs=[self.model_step_state, self.step_name_input],
        outputs=[self.model_step_state, self.accordion],
    )

    self.temperature_slider.release(
        fn=self.sm.update_temperature,
        inputs=[self.model_step_state, self.temperature_slider],
        outputs=[self.model_step_state],
    )

    # Model and system prompt
    self.model_selection.change(
        fn=self.sm.update_model_and_provider,
        inputs=[self.model_step_state, self.model_selection],
        outputs=[self.model_step_state],
    )

    self.system_prompt.blur(
        fn=self.sm.update_system_prompt,
        inputs=[self.model_step_state, self.system_prompt],
        outputs=[self.model_step_state],
    )
321 |
+
def _setup_event_listeners_inputs_tab(self):
    """Wire blur/change and add/delete button handlers for every input row."""
    # Setup input row events
    for i, (row, fields, button_group) in enumerate(self.input_rows):
        inp_name, inp_var, inp_desc = fields
        row_index = gr.State(i)

        # Field change handlers
        inp_name.blur(
            fn=self.sm.update_input_field_name,
            inputs=[self.model_step_state, inp_name, row_index],
            outputs=[self.model_step_state],
        )

        inp_var.change(
            fn=self.sm.update_input_field_variable,
            inputs=[self.model_step_state, inp_var, row_index],
            outputs=[self.model_step_state],
        )

        inp_desc.blur(
            fn=self.sm.update_input_field_description,
            inputs=[self.model_step_state, inp_desc, row_index],
            outputs=[self.model_step_state],
        )

        # Recomputed per iteration; every row's buttons update all rows/fields.
        rows = [row for (row, _, _) in self.input_rows]
        input_fields = [field for (_, fields, _) in self.input_rows for field in fields]

        # Button handlers
        button_group.delete(
            fn=self.sm.delete_input_field,
            inputs=[self.model_step_state, row_index],
            outputs=[self.model_step_state, self.inputs_count_state] + rows + input_fields,
        )

        button_group.add(
            fn=self.sm.add_input_field,
            inputs=[self.model_step_state, row_index],
            outputs=[self.model_step_state, self.inputs_count_state] + rows + input_fields,
        )
362 |
+
def _setup_event_listeners_outputs_tab(self):
    """Wire blur/change, add/delete, and reorder handlers for every output row."""
    # Setup output row events
    for i, (row, fields, button_group) in enumerate(self.output_rows):
        out_name, out_type, out_desc = fields

        row_index = gr.State(i)

        # Field change handlers
        out_name.blur(
            fn=self.sm.update_output_field_name,
            inputs=[self.model_step_state, out_name, row_index],
            outputs=[self.model_step_state],
        )

        out_type.change(
            fn=self.sm.update_output_field_type,
            inputs=[self.model_step_state, out_type, row_index],
            outputs=[self.model_step_state],
        )

        out_desc.blur(
            fn=self.sm.update_output_field_description,
            inputs=[self.model_step_state, out_desc, row_index],
            outputs=[self.model_step_state],
        )

        # Recomputed per iteration; every row's buttons update all rows/fields.
        rows = [row for (row, _, _) in self.output_rows]
        output_fields = [field for (_, fields, _) in self.output_rows for field in fields]

        # Button handlers
        button_group.delete(
            fn=self.sm.delete_output_field,
            inputs=[self.model_step_state, row_index],
            outputs=[self.model_step_state, self.outputs_count_state] + rows + output_fields,
        )

        button_group.add(
            fn=self.sm.add_output_field,
            inputs=[self.model_step_state, row_index],
            outputs=[self.model_step_state, self.outputs_count_state] + rows + output_fields,
        )

        button_group.up(
            fn=self.sm.move_output_field,
            inputs=[self.model_step_state, row_index, gr.State("up")],
            outputs=[self.model_step_state] + output_fields,
        )

        button_group.down(
            fn=self.sm.move_output_field,
            inputs=[self.model_step_state, row_index, gr.State("down")],
            outputs=[self.model_step_state] + output_fields,
        )
416 |
+
# Function to set up event listeners - call this separately after all components are rendered
|
417 |
+
def setup_event_listeners(self):
    """Set up all event listeners for this component.

    Must be called after all child components are rendered, since the handlers
    reference the tab, header, and row widgets.
    """
    # Removed dead debug helpers (`state_str`, `log_step_states`) and the
    # commented-out listeners that were their only callers.
    self._setup_event_listeners_for_view_change()
    self._setup_event_listeners_model_tab()
    self._setup_event_listeners_inputs_tab()
    self._setup_event_listeners_outputs_tab()
445 |
+
def on_model_step_change(self, fn, inputs, outputs):
    """Set up an event listener for the model change event."""
    self.model_step_state.change(fn, inputs, outputs)
449 |
+
def on_ui_change(self, fn, inputs, outputs):
    """Set up an event listener for the UI change event."""
    self.ui_state.change(fn, inputs, outputs)
453 |
+
def _update_state_and_label(self, model_step: ModelStep, name: str):
    """Update both the step state (new name) and the accordion label.

    Returns (new_step, gr.update(...)) matching the blur handler's two outputs.
    """
    new_model_step = self.sm.update_step_name(model_step, name)
    new_label = _make_accordion_label(new_model_step)
    return new_model_step, gr.update(label=new_label)
459 |
+
def refresh_variable_dropdowns(self, pipeline_state: PipelineState):
    # TODO: Fix this. Not sure why this is needed.
    """Refresh the variable dropdown options in all input rows.

    NOTE(review): `Component.update(...)` as an instance method was removed in
    Gradio 4; this call may be a no-op or an error — verify against the
    installed Gradio version.
    """
    variable_choices = []
    if self.pipeline_sm is not None:
        variable_choices = self.pipeline_sm.get_all_variables(pipeline_state)

    for _, fields, _ in self.input_rows:
        _, inp_var, _ = fields
        inp_var.update(choices=variable_choices)
470 |
+
def _update_model_and_refresh_ui(self, updated_model_step):
    """Update the model step state and refresh UI elements that depend on it.

    NOTE(review): like refresh_variable_dropdowns, `accordion.update(...)` uses
    the pre-Gradio-4 instance-update API — verify.
    """
    self.model_step_state.value = updated_model_step
    # Update accordion label
    new_label = _make_accordion_label(updated_model_step)
    if self.accordion:
        self.accordion.update(label=new_label)
    return updated_model_step
src/components/model_step/state_manager.py
ADDED
@@ -0,0 +1,152 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import Any, Literal, Union
|
2 |
+
|
3 |
+
import gradio as gr
|
4 |
+
|
5 |
+
from components.model_pipeline.state_manager import ModelStepUIState
|
6 |
+
from components.utils import DIRECTIONS, move_item
|
7 |
+
from utils import get_model_and_provider
|
8 |
+
from workflows.structs import FieldType, ModelStep
|
9 |
+
|
10 |
+
|
11 |
+
class ModelStepStateManager:
    """State-only helpers for editing a single ModelStep from Gradio callbacks.

    All update methods return new step objects (via ModelStep's update helpers)
    rather than mutating their arguments.
    """

    def __init__(self, max_input_fields: int, max_output_fields: int):
        # Maximum number of input/output rows the UI pre-renders.
        self.max_fields = {
            "input": max_input_fields,
            "output": max_output_fields,
        }

    # UI state update functions
    def update_ui_state(self, ui_state: ModelStepUIState, key: str, value: Any) -> ModelStepUIState:
        """Return a UI state with `key` set to `value`."""
        return ui_state.update(key, value)

    # Property update functions
    def update_step_name(self, model_step: ModelStep, value: str) -> ModelStep:
        """Update the step name in state and accordion label."""
        return model_step.update_property("name", value)

    def update_temperature(self, model_step: ModelStep, value: float) -> ModelStep:
        """Update the sampling temperature on the step."""
        return model_step.update_property("temperature", value)

    def update_model_and_provider(self, model_step: ModelStep, value: str) -> ModelStep:
        """Update the model provider in the state.

        `value` is the combined dropdown string; it is split into model and
        provider before being stored.
        """
        model, provider = get_model_and_provider(value)
        return model_step.update({"model": model, "provider": provider})

    def update_system_prompt(self, model_step: ModelStep, value: str) -> ModelStep:
        """Update the system prompt in the state."""
        return model_step.update_property("system_prompt", value)

    # Field update functions
    def update_input_field_name(self, model_step: ModelStep, value: str, index: int) -> ModelStep:
        """Update the name of the input field at `index`."""
        return model_step.update_field("input", index, "name", value)

    def update_input_field_variable(self, model_step: ModelStep, value: str, index: int) -> ModelStep:
        """Update the bound variable of the input field at `index`."""
        return model_step.update_field("input", index, "variable", value)

    def update_input_field_description(self, model_step: ModelStep, value: str, index: int) -> ModelStep:
        """Update the description of the input field at `index`."""
        return model_step.update_field("input", index, "description", value)

    def update_output_field_name(self, model_step: ModelStep, value: str, index: int) -> ModelStep:
        """Update the name of the output field at `index`."""
        return model_step.update_field("output", index, "name", value)

    def update_output_field_type(self, model_step: ModelStep, value: str, index: int) -> ModelStep:
        """Update the declared type of the output field at `index`."""
        return model_step.update_field("output", index, "type", value)

    def update_output_field_variable(self, model_step: ModelStep, value: str, index: int) -> ModelStep:
        """Update the bound variable of the output field at `index`."""
        return model_step.update_field("output", index, "variable", value)

    def update_output_field_description(self, model_step: ModelStep, value: str, index: int) -> ModelStep:
        """Update the description of the output field at `index`."""
        return model_step.update_field("output", index, "description", value)
68 |
+
def make_input_field_updates(self, model_step: ModelStep) -> list[gr.State | dict[str, Any]]:
    """Build per-widget value updates for every input row slot.

    Emits three updates (name, variable, description) per slot; slots beyond
    the step's current fields get gr.skip() so their widgets are untouched.
    """
    fields = model_step.input_fields
    updates = []
    for slot in range(self.max_fields["input"]):
        if slot >= len(fields):
            updates.extend([gr.skip(), gr.skip(), gr.skip()])
            continue
        field = fields[slot]
        updates.extend(
            [
                gr.update(value=field.name),
                gr.update(value=field.variable),
                gr.update(value=field.description),
            ]
        )
    return updates
84 |
+
def make_output_field_updates(self, model_step: ModelStep) -> list[gr.State | dict[str, Any]]:
    """Build per-widget value updates for every output row slot.

    Emits three updates (name, type, description) per slot; slots beyond the
    step's current fields get gr.skip() so their widgets are untouched.
    """
    fields = model_step.output_fields
    updates = []
    for slot in range(self.max_fields["output"]):
        if slot >= len(fields):
            updates.extend([gr.skip(), gr.skip(), gr.skip()])
            continue
        field = fields[slot]
        updates.extend(
            [
                gr.update(value=field.name),
                gr.update(value=field.type),
                gr.update(value=field.description),
            ]
        )
    return updates
100 |
+
def _add_field(
    self, model_step: ModelStep, field_type: FieldType, index: int = -1, input_var: str | None = None
) -> tuple[Union[ModelStep, int, dict[str, Any]], ...]:
    """Insert a field of `field_type` into a new copy of the step.

    Returns (new_step, field_count, *row_visibility_updates), one visibility
    update per pre-rendered UI row slot.
    """
    new_step = model_step.add_field(field_type, index, input_var)
    fields = new_step.fields(field_type)
    row_updates = [gr.update(visible=i < len(fields)) for i in range(self.max_fields[field_type])]
    return new_step, len(fields), *row_updates
108 |
+
def _delete_field(
    self, model_step: ModelStep, field_type: FieldType, index: int
) -> tuple[Union[ModelStep, int, dict[str, Any]], ...]:
    """Remove the field at `index` from a new copy of the step.

    Returns (new_step, field_count, *row_visibility_updates), one visibility
    update per pre-rendered UI row slot.
    """
    new_step = model_step.delete_field(field_type, index)
    fields = new_step.fields(field_type)
    row_updates = [gr.update(visible=i < len(fields)) for i in range(self.max_fields[field_type])]
    return new_step, len(fields), *row_updates
116 |
+
# Field add/delete functions
|
117 |
+
def add_input_field(self, model_step: ModelStep, index: int = -1):
    """Add an input field after `index`.

    Returns (new_step, field_count, *row_visibility_updates, *field_value_updates).
    """
    new_step, n_fields, *row_updates = self._add_field(model_step, "input", index, input_var="question_text")
    # BUGFIX: compute field value updates from the UPDATED step rather than the
    # stale `model_step`, so the rendered rows reflect the newly added field.
    return new_step, n_fields, *row_updates, *self.make_input_field_updates(new_step)
121 |
+
def add_output_field(self, model_step: ModelStep, index: int = -1):
    """Add an output field after `index`.

    Returns (new_step, field_count, *row_visibility_updates, *field_value_updates).
    """
    new_step, n_fields, *row_updates = self._add_field(model_step, "output", index)
    # BUGFIX: value updates must come from the updated step, not the stale input.
    return new_step, n_fields, *row_updates, *self.make_output_field_updates(new_step)
125 |
+
def delete_input_field(self, model_step: ModelStep, index: int):
    """Delete the input field at `index`.

    Returns (new_step, field_count, *row_visibility_updates, *field_value_updates).
    """
    new_step, n_fields, *row_updates = self._delete_field(model_step, "input", index)
    # BUGFIX: previously the value updates were built from the pre-deletion
    # step, leaving stale values in the remaining rows.
    return new_step, n_fields, *row_updates, *self.make_input_field_updates(new_step)
129 |
+
def delete_output_field(self, model_step: ModelStep, index: int):
    """Delete the output field at `index`.

    Returns (new_step, field_count, *row_visibility_updates, *field_value_updates).
    """
    new_step, n_fields, *row_updates = self._delete_field(model_step, "output", index)
    # BUGFIX: previously the value updates were built from the pre-deletion
    # step, leaving stale values in the remaining rows.
    return new_step, n_fields, *row_updates, *self.make_output_field_updates(new_step)
133 |
+
def move_output_field(
    self, model_step: ModelStep, index: int, direction: DIRECTIONS
) -> list[gr.State | dict[str, Any]]:
    """
    Move an output field in the list either up or down.

    Args:
        index: Index of the output field to move
        direction: Direction to move the field ('up' or 'down')

    Returns:
        list: A list containing [updated_state, field_value_updates...]
    """
    new_step = model_step.model_copy()
    # NOTE(review): model_copy() is shallow by default, so output_fields may be
    # shared with the original step and move_item mutates it in place — confirm
    # whether model_copy(deep=True) is intended.
    move_item(new_step.output_fields, index, direction)

    # Update all output fields to reflect the new order
    updates = self.make_output_field_updates(new_step)

    return new_step, *updates
src/components/model_step/ui_components.py
ADDED
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
from gradio.components import FormComponent
|
3 |
+
|
4 |
+
|
5 |
+
class ButtonGroup:
    """Base class for button groups with common functionality.

    Subclasses define EVENTS and override render() to create the gr.Button
    instances; handler registration is deferred via `click_args` so handlers
    survive a re-render.
    """

    def __init__(self, events: list[str], *args, **kwargs):
        # event name -> gr.Button (created by the subclass's render()).
        self.buttons = {event: None for event in events}
        # event name -> (fn, inputs, outputs) tuple, captured by _setup_button.
        self.click_args = {event: None for event in events}
        self.render()

    def render(self):
        """Render the buttons and set up their event handlers."""
        # Re-attach any handlers that were registered before this render.
        for event, button in self.buttons.items():
            if self.click_args[event]:
                button.click(*self.click_args[event])

    def _setup_button(self, event, fn, inputs, outputs):
        """Set up a button's click event handler (stored, and attached if rendered)."""
        self.click_args[event] = fn, inputs, outputs
        if self.buttons[event]:
            self.buttons[event].click(fn, inputs, outputs)

    def api_info(self):
        # NOTE(review): EVENTS is only defined on subclasses; calling api_info
        # on a direct ButtonGroup instance would raise AttributeError.
        return {
            "name": self.__class__.__name__,
            "events": self.EVENTS,
            "inputs": [],
            "outputs": [],
        }

    def example_payload(self):
        """Return None since this component doesn't have direct input values."""
        return None

    def example_value(self):
        """Return None since this component doesn't have direct output values."""
        return None
41 |
+
|
42 |
+
class InputRowButtonGroup(ButtonGroup):
    """Button group for input rows with delete and add buttons."""

    EVENTS = ["delete", "add"]

    def __init__(self, *args, **kwargs):
        super().__init__(self.EVENTS, *args, **kwargs)

    def render(self):
        """Create the delete/add buttons, then attach any pending handlers."""
        with gr.Column(scale=0, min_width=40, elem_classes="button-column"):
            self.buttons["delete"] = gr.Button("❌", elem_classes="icon-button delete-button", scale=0)
            self.buttons["add"] = gr.Button("➕", elem_classes="icon-button add-field-button", scale=0)
        super().render()

    def delete(self, fn, inputs, outputs):
        """Register the handler for the delete button."""
        self._setup_button("delete", fn, inputs, outputs)

    def add(self, fn, inputs, outputs):
        """Register the handler for the add button."""
        self._setup_button("add", fn, inputs, outputs)
63 |
+
class OutputRowButtonGroup(ButtonGroup):
    """Button group for output rows with delete, add, up, and down buttons."""

    EVENTS = ["delete", "add", "up", "down"]

    def __init__(self, *args, **kwargs):
        super().__init__(self.EVENTS, *args, **kwargs)

    def render(self):
        """Create the four buttons in two columns, then attach pending handlers."""
        with gr.Column(scale=0, elem_classes="button-column", min_width=40):
            self.buttons["delete"] = gr.Button("❌", elem_classes="icon-button delete-button", scale=0)
            self.buttons["add"] = gr.Button("➕", elem_classes="icon-button add-field-button", scale=0)

        with gr.Column(scale=0, elem_classes="button-column", min_width=40):
            self.buttons["up"] = gr.Button("⬆️", elem_classes="icon-button up-button", scale=0)
            self.buttons["down"] = gr.Button("⬇️", elem_classes="icon-button down-button", scale=0)
        return super().render()

    def delete(self, fn, inputs, outputs):
        """Register the handler for the delete button."""
        self._setup_button("delete", fn, inputs, outputs)

    def add(self, fn, inputs, outputs):
        """Register the handler for the add button."""
        self._setup_button("add", fn, inputs, outputs)

    def up(self, fn, inputs, outputs):
        """Register the handler for the move-up button."""
        self._setup_button("up", fn, inputs, outputs)

    def down(self, fn, inputs, outputs):
        """Register the handler for the move-down button."""
        self._setup_button("down", fn, inputs, outputs)
src/components/quizbowl/__init__.py
ADDED
File without changes
|
src/components/quizbowl/bonus.py
ADDED
@@ -0,0 +1,399 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
import logging
|
3 |
+
from typing import Any
|
4 |
+
|
5 |
+
import gradio as gr
|
6 |
+
import matplotlib.pyplot as plt
|
7 |
+
import numpy as np
|
8 |
+
import pandas as pd
|
9 |
+
from datasets import Dataset
|
10 |
+
|
11 |
+
from components.model_pipeline.model_pipeline import PipelineInterface, PipelineState
|
12 |
+
from submission import submit
|
13 |
+
from workflows import factory
|
14 |
+
from workflows.qb.simple_agent import SimpleBonusAgent
|
15 |
+
from workflows.structs import ModelStep, Workflow
|
16 |
+
|
17 |
+
from .plotting import (
|
18 |
+
create_pyplot,
|
19 |
+
create_scatter_pyplot,
|
20 |
+
evaluate_buzz,
|
21 |
+
update_plot,
|
22 |
+
)
|
23 |
+
|
24 |
+
|
25 |
+
def evaluate_bonus_part(prediction: str, clean_answers: list[str]) -> float:
    """Score one bonus part by matching the prediction against its accepted answers."""
    part_score = evaluate_buzz(prediction, clean_answers)
    return part_score
|
28 |
+
|
29 |
+
|
30 |
+
def process_bonus_results(results: list[dict]) -> pd.DataFrame:
    """Convert per-part bonus results into a display-ready DataFrame.

    Each result dict must provide 'part_number', 'score', 'confidence',
    'answer', and 'explanation'.
    """
    rows = []
    for result in results:
        rows.append(
            {
                "Part": f"Part {result['part_number']}",
                "Correct?": "✅" if result["score"] == 1 else "❌",
                "Confidence": result["confidence"],
                "Prediction": result["answer"],
                "Explanation": result["explanation"],
            }
        )
    return pd.DataFrame(rows)
|
44 |
+
|
45 |
+
|
46 |
+
def initialize_eval_interface(example: dict, model_outputs: list[dict]):
    """Build the question HTML, confidence plot, and serialized state for one bonus example."""
    try:
        parts = example["parts"]

        # Lead-in paragraph followed by each numbered part prompt.
        leadin_html = f"<div class='leadin'>{example['leadin']}</div>"
        part_divs = [
            f"<div class='part'><b>Part {idx + 1}:</b> {part['part']}</div>" for idx, part in enumerate(parts)
        ]
        html_content = f"{leadin_html}<div class='parts-container'>{''.join(part_divs)}</div>"

        plot_data = create_bonus_confidence_plot(parts, model_outputs)

        # Serialize what later event handlers need to redraw the plot.
        state = json.dumps({"parts": parts, "outputs": model_outputs})
        return html_content, plot_data, state
    except Exception as e:
        logging.error(f"Error initializing interface: {e}", exc_info=True)
        return f"<div>Error initializing interface: {str(e)}</div>", pd.DataFrame(), "{}"
|
67 |
+
|
68 |
+
|
69 |
+
def create_bonus_confidence_plot(parts: list[dict], model_outputs: list[dict]):
    """Render a bar chart of per-part confidence, one bar per bonus part.

    Bars are recolored by correctness: green for score == 1, red otherwise.
    """
    plt.style.use("ggplot")
    fig = plt.figure(figsize=(10, 6))
    ax = fig.add_subplot(111)

    positions = range(1, len(parts) + 1)
    confidences = [out["confidence"] for out in model_outputs]
    scores = [out["score"] for out in model_outputs]

    bars = ax.bar(positions, confidences, color="#4698cf")
    for bar, score in zip(bars, scores):
        bar.set_color("green" if score == 1 else "red")

    ax.set_title("Part Confidence")
    ax.set_xlabel("Part Number")
    ax.set_ylabel("Confidence")
    ax.set_xticks(positions)
    ax.set_xticklabels([f"Part {p}" for p in positions])

    return fig
|
94 |
+
|
95 |
+
|
96 |
+
def validate_workflow(workflow: Workflow):
    """Validate that a workflow is usable for the bonus task; raise ValueError otherwise."""
    if not workflow.steps:
        raise ValueError("Workflow must have at least one step")

    # Every step must itself be a properly configured LLM step.
    for step in workflow.steps.values():
        validate_model_step(step)

    # The bonus task feeds the leadin plus one part per call.
    if not {"leadin", "part"}.issubset(set(workflow.inputs)):
        raise ValueError("Workflow must have 'leadin' and 'part' as inputs")

    if not {"answer", "confidence", "explanation"}.issubset(set(workflow.outputs)):
        raise ValueError("Workflow must produce 'answer', 'confidence', and 'explanation' as outputs")
|
113 |
+
|
114 |
+
|
115 |
+
def validate_model_step(model_step: ModelStep):
|
116 |
+
"""Validate that a model step is properly configured for the bonus task."""
|
117 |
+
# Check required fields
|
118 |
+
if not model_step.model or not model_step.provider:
|
119 |
+
raise ValueError("Model step must have both model and provider specified")
|
120 |
+
|
121 |
+
if model_step.call_type != "llm":
|
122 |
+
raise ValueError("Model step must have call_type 'llm'")
|
123 |
+
|
124 |
+
# Validate temperature for LLM steps
|
125 |
+
if model_step.temperature is None:
|
126 |
+
raise ValueError("Temperature must be specified for LLM model steps")
|
127 |
+
|
128 |
+
if not (0.0 <= model_step.temperature <= 1.0):
|
129 |
+
raise ValueError(f"Temperature must be between 0.0 and 1.0, got {model_step.temperature}")
|
130 |
+
|
131 |
+
# Validate input fields
|
132 |
+
input_field_names = {field.name for field in model_step.input_fields}
|
133 |
+
if "leadin" not in input_field_names or "part" not in input_field_names:
|
134 |
+
raise ValueError("Model step must have 'leadin' and 'part' input fields")
|
135 |
+
|
136 |
+
# Validate output fields
|
137 |
+
output_field_names = {field.name for field in model_step.output_fields}
|
138 |
+
required_outputs = {"answer", "confidence", "explanation"}
|
139 |
+
if not all(out in output_field_names for out in required_outputs):
|
140 |
+
raise ValueError("Model step must have all required output fields: answer, confidence, explanation")
|
141 |
+
|
142 |
+
# Validate confidence output field is of type float
|
143 |
+
for field in model_step.output_fields:
|
144 |
+
if field.name == "confidence" and field.type != "float":
|
145 |
+
raise ValueError("The 'confidence' output field must be of type 'float'")
|
146 |
+
|
147 |
+
|
148 |
+
class BonusInterface:
    """Gradio interface for the Bonus mode.

    Bug fixes vs. the original:
      * the submit button was wired to a nonexistent ``submit_model_output``
        method (the handler is ``submit_model``);
      * ``run_bonus`` error paths returned 3 values for 4 event outputs;
      * ``evaluate_bonus`` error path returned 3 values for 2 event outputs;
      * ``evaluate_bonus`` divided by zero on an empty evaluation run.
    """

    def __init__(self, app: gr.Blocks, dataset: Dataset, model_options: dict, defaults: dict):
        """Initialize the Bonus interface.

        Args:
            app: Enclosing Gradio Blocks app (used to hook the load event).
            dataset: Bonus question dataset; rows carry 'leadin' and 'parts'.
            model_options: Mapping of model display names to configurations.
            defaults: Defaults, including 'init_workflow' and 'simple_workflow'.
        """
        logging.info(f"Initializing Bonus interface with dataset size: {len(dataset)}")
        self.ds = dataset
        self.model_options = model_options
        self.app = app
        self.defaults = defaults
        self.output_state = gr.State(value="{}")
        self.render()

    def _render_model_interface(self, workflow: Workflow, simple: bool = True):
        """Render the pipeline editor panel and the run button."""
        self.pipeline_interface = PipelineInterface(
            workflow,
            simple=simple,
            model_options=list(self.model_options.keys()),
        )
        with gr.Row():
            self.run_btn = gr.Button("Run Bonus", variant="primary")

    def _render_qb_interface(self):
        """Render the quizbowl question/answer/evaluation panel."""
        with gr.Row():
            self.qid_selector = gr.Number(
                label="Question ID", value=1, precision=0, minimum=1, maximum=len(self.ds), show_label=True, scale=0
            )
            self.answer_display = gr.Textbox(
                label="Answers", elem_id="answer-display", elem_classes="answer-box", interactive=False, scale=1
            )
            self.clean_answer_display = gr.Textbox(
                label="Acceptable Answers",
                elem_id="answer-display-2",
                elem_classes="answer-box",
                interactive=False,
                scale=2,
            )

        self.question_display = gr.HTML(label="Question", elem_id="question-display")
        with gr.Row():
            self.confidence_plot = gr.Plot(
                label="Part Confidence",
                format="webp",
            )

        self.results_table = gr.DataFrame(
            label="Model Outputs",
            value=pd.DataFrame(columns=["Part", "Correct?", "Confidence", "Prediction", "Explanation"]),
        )

        with gr.Row():
            self.eval_btn = gr.Button("Evaluate")

        with gr.Accordion("Model Submission", elem_classes="model-submission-accordion", open=True):
            with gr.Row():
                self.model_name_input = gr.Textbox(label="Model Name")
                self.description_input = gr.Textbox(label="Description")
            with gr.Row():
                gr.LoginButton()
                self.submit_btn = gr.Button("Submit")
                self.submit_status = gr.HTML(label="Submission Status")

    def render(self):
        """Create the Gradio interface."""
        self.hidden_input = gr.Textbox(value="", visible=False, elem_id="hidden-index")
        workflow = self.defaults["init_workflow"]

        with gr.Row():
            # Model panel on the left, quizbowl panel on the right.
            with gr.Column(scale=1):
                self._render_model_interface(workflow, simple=self.defaults["simple_workflow"])

            with gr.Column(scale=1):
                self._render_qb_interface()

        self._setup_event_listeners()

    def get_new_question_html(self, question_id: int):
        """Build the question HTML plus primary/acceptable answer strings for one question."""
        example = self.ds[question_id - 1]
        leadin = example["leadin"]
        parts = example["parts"]

        leadin_html = f"<div class='leadin'>{leadin}</div>"
        parts_html = [f"<div class='part'>{part['part']}</div>" for part in parts]
        parts_html_str = "<br>".join(parts_html)

        html_content = (
            f"<div class='token-container'>{leadin_html}<div class='parts-container'><br>{parts_html_str}</div></div>"
        )

        primary_answers = [f"{i + 1}. {part['answer_primary']}" for i, part in enumerate(parts)]
        clean_answers = []
        for i, part in enumerate(parts):
            # Only list short acceptable answers to keep the display readable.
            part_answers = [a for a in part["clean_answers"] if len(a.split()) <= 6]
            clean_answers.append(f"{i + 1}. {', '.join(part_answers)}")

        return html_content, "\n".join(primary_answers), "\n".join(clean_answers)

    def get_model_outputs(self, example: dict, pipeline_state: PipelineState):
        """Run the agent on every part of one bonus question and score each output."""
        outputs = []
        leadin = example["leadin"]

        for i, part in enumerate(example["parts"]):
            agent = SimpleBonusAgent(workflow=pipeline_state.workflow)
            part_output = agent.run(leadin, part["part"])

            # Attach the 1-based part number and the correctness score.
            part_output["part_number"] = i + 1
            part_output["score"] = evaluate_bonus_part(part_output["answer"], part["clean_answers"])

            outputs.append(part_output)

        return outputs

    def run_bonus(
        self,
        question_id: int,
        pipeline_state: PipelineState,
    ) -> tuple[str, Any, Any, Any]:
        """Run the agent in bonus mode on a single question.

        Returns values for (question_display, confidence_plot, output_state,
        results_table) — all four on every path.
        """
        try:
            question_id = int(question_id - 1)
            if not self.ds or question_id < 0 or question_id >= len(self.ds):
                # Fix: the error path must match the four event outputs.
                return "Invalid question ID or dataset not loaded", None, None, None

            example = self.ds[question_id]
            outputs = self.get_model_outputs(example, pipeline_state)

            html_content, plot_data, output_state = initialize_eval_interface(example, outputs)
            df = process_bonus_results(outputs)

            return (
                html_content,
                gr.update(value=plot_data, label=f"Part Confidence on Question {question_id + 1}"),
                gr.update(value=output_state),
                gr.update(value=df, label=f"Model Outputs for Question {question_id + 1}"),
            )
        except Exception as e:
            import traceback

            error_msg = f"Error: {str(e)}\n{traceback.format_exc()}"
            # Fix: match the four event outputs on the exception path too.
            return error_msg, None, None, None

    def evaluate_bonus(self, pipeline_state: PipelineState, progress: gr.Progress = gr.Progress()):
        """Evaluate the agent over the whole sample set.

        Returns values for (results_table, confidence_plot) on every path.
        """
        try:
            if not self.ds or not self.ds.num_rows:
                return "No dataset loaded", None

            total_correct = 0
            total_parts = 0
            part_scores = []
            part_numbers = []

            for example in progress.tqdm(self.ds, desc="Evaluating bonus questions"):
                model_outputs = self.get_model_outputs(example, pipeline_state)

                for output in model_outputs:
                    total_parts += 1
                    if output["score"] == 1:
                        total_correct += 1
                    part_scores.append(output["score"])
                    part_numbers.append(output["part_number"])

            # Fix: guard against division by zero when nothing was evaluated.
            accuracy = total_correct / total_parts if total_parts else 0.0
            df = pd.DataFrame(
                [
                    {
                        "Part Accuracy": f"{accuracy:.2%}",
                        "Total Score": f"{total_correct}/{total_parts}",
                        "Questions Evaluated": len(self.ds),
                    }
                ]
            )

            plot_data = create_scatter_pyplot(part_numbers, part_scores)
            return (
                gr.update(value=df, label="Scores on Sample Set"),
                gr.update(value=plot_data, label="Part Scores on Sample Set"),
            )
        except Exception:
            import traceback

            logging.error(f"Error evaluating bonus: {traceback.format_exc()}")
            # Fix: match the two event outputs on the exception path.
            return "Error evaluating bonus", None

    def submit_model(
        self, model_name: str, description: str, pipeline_state: PipelineState, profile: gr.OAuthProfile = None
    ):
        """Submit the current workflow under the given name/description."""
        return submit.submit_model(model_name, description, pipeline_state.workflow, "bonus", profile)

    def _setup_event_listeners(self):
        """Wire UI events to their handlers."""
        # Populate the question display on load and whenever the ID changes.
        gr.on(
            triggers=[self.app.load, self.qid_selector.change],
            fn=self.get_new_question_html,
            inputs=[self.qid_selector],
            outputs=[self.question_display, self.answer_display, self.clean_answer_display],
        )
        # Validate the workflow first; only run the agent if validation succeeds.
        self.run_btn.click(
            self.pipeline_interface.validate_workflow,
            inputs=[self.pipeline_interface.pipeline_state],
            outputs=[self.pipeline_interface.pipeline_state],
        ).success(
            self.run_bonus,
            inputs=[
                self.qid_selector,
                self.pipeline_interface.pipeline_state,
            ],
            outputs=[
                self.question_display,
                self.confidence_plot,
                self.output_state,
                self.results_table,
            ],
        )

        self.eval_btn.click(
            fn=self.evaluate_bonus,
            inputs=[self.pipeline_interface.pipeline_state],
            outputs=[self.results_table, self.confidence_plot],
        )

        # Fix: the handler is `submit_model`; `submit_model_output` did not exist.
        self.submit_btn.click(
            fn=self.submit_model,
            inputs=[
                self.model_name_input,
                self.description_input,
                self.pipeline_interface.pipeline_state,
            ],
            outputs=[self.submit_status],
        )
        self.hidden_input.change(
            fn=update_plot,
            inputs=[self.hidden_input, self.output_state],
            outputs=[self.confidence_plot],
        )
|
src/components/quizbowl/plotting.py
ADDED
@@ -0,0 +1,194 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
import logging
|
3 |
+
import re
|
4 |
+
from collections import Counter
|
5 |
+
|
6 |
+
import matplotlib.pyplot as plt
|
7 |
+
import pandas as pd
|
8 |
+
|
9 |
+
|
10 |
+
def evaluate_buzz(prediction: str, clean_answers: list[str] | str) -> int:
|
11 |
+
"""Evaluate the buzz of a prediction against the clean answers."""
|
12 |
+
if isinstance(clean_answers, str):
|
13 |
+
print("clean_answers is a string")
|
14 |
+
clean_answers = [clean_answers]
|
15 |
+
pred = prediction.lower().strip()
|
16 |
+
if not pred:
|
17 |
+
return 0
|
18 |
+
for answer in clean_answers:
|
19 |
+
answer = answer.strip().lower()
|
20 |
+
if answer and answer in pred:
|
21 |
+
print(f"Found {answer} in {pred}")
|
22 |
+
return 1
|
23 |
+
return 0
|
24 |
+
|
25 |
+
|
26 |
+
def create_answer_html(answer: str):
    """Wrap the answer text in its styled header div."""
    return "<div class='answer-header'>Answer:<br>" + answer + "</div>"
|
29 |
+
|
30 |
+
|
31 |
+
def create_tokens_html(tokens: list[str], eval_points: list[tuple], answer: str, marker_indices: list[int] = None):
    """Create HTML for tokens with hover capability and a colored header for the answer."""
    try:
        point_map = dict(eval_points)
        markers = set(marker_indices) if isinstance(marker_indices, list) else set()

        # Answer header rendering is intentionally disabled for now.
        # html_parts.append(create_answer_html(answer))

        spans = []
        for idx, token in enumerate(tokens):
            # (confidence, buzz_point, score) for this token, if evaluated here.
            confidence, buzz_point, score = point_map.get(idx, (None, 0, 0))

            # NOTE(review): this replace is a no-op as written — it looks like the
            # original intended a non-breaking space for punctuation tokens; verify.
            display_token = token if re.match(r"\w+", token) else token.replace(" ", " ")

            if confidence is None:
                css_class = ""
            elif not buzz_point:
                css_class = " guess-point no-buzz"
            else:
                css_class = f" guess-point buzz-{score}"

            span = f'<span id="token-{idx}" class="token{css_class}" data-index="{idx}">{display_token}</span>'
            if idx in markers:
                span += "<span style='color: rgba(0,0,255,0.3);'>|</span>"
            spans.append(span)

        return f"<div class='token-container'>{''.join(spans)}</div>"
    except Exception as e:
        logging.error(f"Error creating token HTML: {e}", exc_info=True)
        return f"<div class='token-container'>Error creating tokens: {str(e)}</div>"
|
68 |
+
|
69 |
+
|
70 |
+
def create_line_plot(eval_points, highlighted_index=-1):
    """Create a Gradio LinePlot of token values with optional highlighting using DataFrame."""
    try:
        rows = []

        # One "buzz" row per evaluated token position, colored by correctness.
        for position, (value, buzz) in eval_points:
            rows.append(
                {
                    "position": position,
                    "value": value,
                    "type": "buzz",
                    "highlight": True,
                    "color": "#ff4444" if buzz == 0 else "#228b22",
                }
            )

        # Two stacked points at the hovered position draw a vertical line.
        if highlighted_index >= 0:
            for line_value in (0, 1):
                rows.append(
                    {
                        "position": highlighted_index,
                        "value": line_value,
                        "type": "hover-line",
                        "color": "#000000",
                        "highlight": True,
                    }
                )

        return pd.DataFrame(rows)
    except Exception as e:
        logging.error(f"Error creating line plot: {e}", exc_info=True)
        # Return an empty DataFrame with the expected columns
        return pd.DataFrame(columns=["position", "value", "type", "highlight", "color"])
|
115 |
+
|
116 |
+
|
117 |
+
def create_pyplot(tokens, eval_points, highlighted_index=-1):
    """Create a pyplot of token values with optional highlighting.

    Fix: previously an out-of-range token index was logged as a warning but
    still used in ``ax.annotate``, raising IndexError; such points are now
    plotted without a label instead of crashing.
    """
    plt.style.use("ggplot")  # Set theme to grid paper
    fig = plt.figure(figsize=(10, 6))  # Set figure size
    ax = fig.add_subplot(111)

    # Confidence curve, anchored at (0, 0) so the line starts at the origin.
    x = [0]
    y = [0]
    for i, (v, b, s) in eval_points:
        x.append(i + 1)
        y.append(v)
    ax.plot(x, y, "o--", color="#4698cf")

    # Mark buzz points green (correct) or red (incorrect), labeled with the token.
    for i, (v, b, s) in eval_points:
        if not b:
            continue
        color = "green" if s else "red"
        ax.plot(i + 1, v, "o", color=color)
        if i >= len(tokens):
            logging.warning(f"Token index {i} is out of bounds for n_tokens: {len(tokens)}")
            continue  # Fix: skip the annotation instead of indexing past the end.
        ax.annotate(f"{tokens[i]}", (i + 1, v), textcoords="offset points", xytext=(0, 10), ha="center")

    if highlighted_index >= 0:
        # Add light vertical line for the highlighted token from 0 to 1
        ax.axvline(x=highlighted_index + 1, color="#ff9900", linestyle="--", ymin=0, ymax=1)

    ax.set_title("Buzz Confidence")
    ax.set_xlabel("Token Index")
    ax.set_ylabel("Confidence")
    ax.set_xticks(x)
    ax.set_xticklabels(x)
    return fig
|
148 |
+
|
149 |
+
|
150 |
+
def create_scatter_pyplot(token_positions, scores):
    """Scatter plot of buzz positions vs. scores; marker size scales with frequency."""
    plt.style.use("ggplot")
    fig = plt.figure(figsize=(10, 6))
    ax = fig.add_subplot(111)

    # Count duplicate (position, score) pairs so repeats render as bigger dots.
    frequency = Counter(zip(token_positions, scores))
    xs = [pos for pos, _ in frequency]
    ys = [score for _, score in frequency]
    sizes = [count * 20 for count in frequency.values()]

    ax.scatter(xs, ys, color="#4698cf", s=sizes)

    return fig
|
168 |
+
|
169 |
+
|
170 |
+
def update_plot(highlighted_index, state):
    """Update the plot when a token is hovered; add a vertical line on the plot.

    Fix: a falsy hover index previously became ``None``, which made
    ``create_pyplot`` evaluate ``None >= 0`` (TypeError, silently swallowed
    into an empty plot). A ``-1`` sentinel now means "no highlight".
    """
    try:
        if not state or state == "{}":
            logging.warning("Empty state provided to update_plot")
            return pd.DataFrame()

        # -1 disables highlighting; create_pyplot only highlights indices >= 0.
        highlighted_index = int(highlighted_index) if highlighted_index else -1
        logging.info(f"Update plot triggered with token index: {highlighted_index}")

        data = json.loads(state)
        tokens = data.get("tokens", [])
        values = data.get("values", [])

        if not tokens or not values:
            logging.warning("No tokens or values found in state")
            return pd.DataFrame()

        # Redraw the confidence plot with the hovered token marked.
        plot_data = create_pyplot(tokens, values, highlighted_index)
        return plot_data
    except Exception as e:
        logging.error(f"Error updating plot: {e}")
        return pd.DataFrame()
|
src/components/quizbowl/tossup.py
ADDED
@@ -0,0 +1,426 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
import logging
|
3 |
+
from typing import Any
|
4 |
+
|
5 |
+
import gradio as gr
|
6 |
+
import numpy as np
|
7 |
+
import pandas as pd
|
8 |
+
from datasets import Dataset
|
9 |
+
|
10 |
+
from components.model_pipeline.model_pipeline import PipelineInterface, PipelineState
|
11 |
+
from submission import submit
|
12 |
+
from workflows.qb.simple_agent import SimpleTossupAgent
|
13 |
+
from workflows.structs import ModelStep, Workflow
|
14 |
+
|
15 |
+
from .plotting import (
|
16 |
+
create_answer_html,
|
17 |
+
create_pyplot,
|
18 |
+
create_scatter_pyplot,
|
19 |
+
create_tokens_html,
|
20 |
+
evaluate_buzz,
|
21 |
+
update_plot,
|
22 |
+
)
|
23 |
+
|
24 |
+
|
25 |
+
def add_model_scores(model_outputs: list[dict], clean_answers: list[str], run_indices: list[int]) -> list[dict]:
    """Attach a correctness score and 1-based token position to each output (in place)."""
    for model_output, token_index in zip(model_outputs, run_indices):
        model_output["score"] = evaluate_buzz(model_output["answer"], clean_answers)
        model_output["token_position"] = token_index + 1
    return model_outputs
|
31 |
+
|
32 |
+
|
33 |
+
def prepare_buzz_evals(
    run_indices: list[int], model_outputs: list[dict]
) -> list[tuple[int, tuple[float, bool, int]]]:
    """Pair each run index with its output's (confidence, buzz, score) triple.

    Fixes vs. the original:
      * the empty-input path returned a ``([], [])`` tuple, inconsistent with
        the single list returned otherwise — it now returns ``[]``;
      * the return annotation and docstring were stale leftovers and have been
        corrected.
    """
    if not run_indices:
        logging.warning("No run indices provided, returning empty results")
        return []
    eval_points = []
    for token_index, output in zip(run_indices, model_outputs):
        eval_point = output["confidence"], output["buzz"], output["score"]
        eval_points.append((int(token_index), eval_point))

    return eval_points
|
46 |
+
|
47 |
+
|
48 |
+
def initialize_eval_interface(example, model_outputs: list[dict]):
    """Build the token HTML, confidence plot, and serialized state for one tossup example."""
    tokens = example["question"].split()
    run_indices = example["run_indices"]
    answer = example["answer_primary"]

    try:
        eval_points = prepare_buzz_evals(run_indices, model_outputs)

        if not tokens:
            return "<div>No tokens found in the provided text.</div>", pd.DataFrame(), "{}"

        # First buzz position drives the highlight; -1 when the model never buzzed.
        highlighted_index = next((int(i) for i, (_, b, _) in eval_points if b == 1), -1)
        html_content = create_tokens_html(tokens, eval_points, answer)
        plot_data = create_pyplot(tokens, eval_points, highlighted_index)

        # Persist tokens and eval points so hover events can redraw the plot.
        state = json.dumps({"tokens": tokens, "values": eval_points})

        return html_content, plot_data, state
    except Exception as e:
        logging.error(f"Error initializing interface: {e}", exc_info=True)
        return f"<div>Error initializing interface: {str(e)}</div>", pd.DataFrame(), "{}"
|
70 |
+
|
71 |
+
|
72 |
+
def process_tossup_results(results: list[dict], top_k_mode: bool = False) -> pd.DataFrame:
    """Convert per-position tossup results into a display-ready DataFrame.

    Raises:
        ValueError: if top_k_mode is requested (unsupported for tossups).
    """
    if top_k_mode:
        raise ValueError("Top-k mode not supported for tossup mode")
    rows = []
    for result in results:
        rows.append(
            {
                "Token Position": result["token_position"],
                "Correct?": "✅" if result["score"] == 1 else "❌",
                "Confidence": result["confidence"],
                "Prediction": result["answer"],
            }
        )
    return pd.DataFrame(rows)
|
88 |
+
|
89 |
+
|
90 |
+
def validate_workflow(workflow: Workflow):
    """
    Validate that a workflow is properly configured for the tossup task.

    Args:
        workflow (Workflow): The workflow to validate

    Raises:
        ValueError: If the workflow is not properly configured
    """
    if not workflow.steps:
        raise ValueError("Workflow must have at least one step")

    # Every step must itself be a properly configured LLM step.
    for step in workflow.steps.values():
        validate_model_step(step)

    if "question" not in set(workflow.inputs):
        raise ValueError("Workflow must have 'question' as an input")

    # Substring match — presumably output names can carry a step prefix; verify.
    output_vars = list(workflow.outputs)
    if not any("answer" in name for name in output_vars):
        raise ValueError("Workflow must produce an 'answer' as output")
    if not any("confidence" in name for name in output_vars):
        raise ValueError("Workflow must produce a 'confidence' score as output")
|
117 |
+
|
118 |
+
|
119 |
+
def validate_model_step(model_step: ModelStep):
    """
    Validate that a model step is properly configured for the tossup task.

    Args:
        model_step (ModelStep): The model step to validate

    Raises:
        ValueError: If the model step is not properly configured
    """
    # The step must be a runnable LLM call before anything else matters.
    if not (model_step.model and model_step.provider):
        raise ValueError("Model step must have both model and provider specified")
    if model_step.call_type != "llm":
        raise ValueError("Model step must have call_type 'llm'")

    # LLM steps need an explicit sampling temperature within [0.0, 1.0].
    temperature = model_step.temperature
    if temperature is None:
        raise ValueError("Temperature must be specified for LLM model steps")
    if temperature < 0.0 or temperature > 1.0:
        raise ValueError(f"Temperature must be between 0.0 and 1.0, got {temperature}")

    # The step must consume the question text...
    input_names = {field.name for field in model_step.input_fields}
    if "question" not in input_names:
        raise ValueError("Model step must have a 'question' input field")

    # ...and emit both an answer and a confidence score.
    output_names = {field.name for field in model_step.output_fields}
    if "answer" not in output_names:
        raise ValueError("Model step must have an 'answer' output field")
    if "confidence" not in output_names:
        raise ValueError("Model step must have a 'confidence' output field")

    # Any output field named "confidence" must be declared as a float.
    for field in model_step.output_fields:
        if field.name == "confidence" and field.type != "float":
            raise ValueError("The 'confidence' output field must be of type 'float'")
160 |
+
|
161 |
+
class TossupInterface:
    """Gradio interface for the Tossup mode.

    Left column hosts the model-pipeline editor plus run controls; right
    column hosts the question/answer display, confidence plot, results
    table, evaluation and submission widgets. All components are created
    in ``render()`` (called from ``__init__``) and wired together in
    ``_setup_event_listeners()``.
    """

    def __init__(self, app: gr.Blocks, dataset: Dataset, model_options: dict, defaults: dict):
        """Initialize the Tossup interface.

        Args:
            app: Enclosing Blocks app; its ``load`` event triggers the first
                question render.
            dataset: Tossup question dataset (one row per question).
            model_options: Model configs keyed by display name; only the keys
                are surfaced in the pipeline dropdowns.
            defaults: Default UI values — expects keys "buzz_threshold",
                "early_stop", "init_workflow" and "simple_workflow".
        """
        logging.info(f"Initializing Tossup interface with dataset size: {len(dataset)}")
        self.ds = dataset
        self.model_options = model_options
        self.app = app
        self.defaults = defaults
        # JSON-serialized evaluation state shared with the plot-update callback.
        self.output_state = gr.State(value="{}")
        self.render()

    def _render_model_interface(self, workflow: Workflow, simple: bool = True):
        """Render the model interface (pipeline editor + run controls)."""
        self.pipeline_interface = PipelineInterface(
            workflow,
            simple=simple,
            model_options=list(self.model_options.keys()),
        )
        with gr.Row():
            # Minimum confidence at which the agent commits to ("buzzes") an answer.
            self.buzz_t_slider = gr.Slider(
                minimum=0.5,
                maximum=1.0,
                value=self.defaults["buzz_threshold"],
                step=0.01,
                label="Buzz Threshold",
            )
            self.early_stop_checkbox = gr.Checkbox(
                value=self.defaults["early_stop"],
                label="Early Stop",
                info="Stop early if already buzzed",
            )
        self.run_btn = gr.Button("Run Tossup", variant="primary")

    def _render_qb_interface(self):
        """Render the quizbowl interface (question display + evaluation widgets)."""
        with gr.Row():
            self.qid_selector = gr.Number(
                label="Question ID", value=1, precision=0, minimum=1, maximum=len(self.ds), show_label=True, scale=0
            )
            self.answer_display = gr.Textbox(
                label="PrimaryAnswer", elem_id="answer-display", elem_classes="answer-box", interactive=False, scale=1
            )
            self.clean_answer_display = gr.Textbox(
                label="Acceptable Answers",
                elem_id="answer-display-2",
                elem_classes="answer-box",
                interactive=False,
                scale=2,
            )
        # self.answer_display = gr.HTML(label="Answer", elem_id="answer-display")
        self.question_display = gr.HTML(label="Question", elem_id="question-display")
        with gr.Row():
            self.confidence_plot = gr.Plot(
                label="Buzz Confidence",
                format="webp",
            )
            self.results_table = gr.DataFrame(
                label="Model Outputs",
                value=pd.DataFrame(columns=["Token Position", "Correct?", "Confidence", "Prediction"]),
            )
        with gr.Row():
            self.eval_btn = gr.Button("Evaluate")

        with gr.Accordion("Model Submission", elem_classes="model-submission-accordion", open=True):
            with gr.Row():
                self.model_name_input = gr.Textbox(label="Model Name")
                self.description_input = gr.Textbox(label="Description")
            with gr.Row():
                gr.LoginButton()
                self.submit_btn = gr.Button("Submit")
            self.submit_status = gr.HTML(label="Submission Status")

    def render(self):
        """Create the Gradio interface."""

        # Hidden textbox used by front-end JS to report the clicked token index.
        self.hidden_input = gr.Textbox(value="", visible=False, elem_id="hidden-index")

        workflow = self.defaults["init_workflow"]

        with gr.Row():
            # Model Panel
            with gr.Column(scale=1):
                self._render_model_interface(workflow, simple=self.defaults["simple_workflow"])

            with gr.Column(scale=1):
                self._render_qb_interface()

        self._setup_event_listeners()

    def get_full_question(self, question_id: int) -> str:
        """Get the full question text for a given question ID (1-indexed)."""
        try:
            # UI question IDs are 1-based; the dataset is 0-based.
            question_id = int(question_id - 1)
            if not self.ds or question_id < 0 or question_id >= len(self.ds):
                return "Invalid question ID or dataset not loaded"

            question_data = self.ds[question_id]
            # Get the full question text (the last element in question_runs)
            full_question = question_data["question"]
            gold_label = question_data["answer_primary"]

            return f"Question: {full_question}\n\nCorrect Answer: {gold_label}"
        except Exception as e:
            return f"Error loading question: {str(e)}"

    def validate_workflow(self, pipeline_state: PipelineState):
        """Validate the workflow, surfacing failures as a Gradio error popup."""
        try:
            validate_workflow(pipeline_state.workflow)
        except Exception as e:
            raise gr.Error(f"Error validating workflow: {str(e)}")

    def get_new_question_html(self, question_id: int):
        """Get the HTML for a new question.

        Returns:
            Tuple of (tokenized question HTML, primary answer,
            comma-joined acceptable answers).
        """
        example = self.ds[question_id - 1]
        question = example["question"]
        gold_label = example["answer_primary"]
        marker_indices = example["run_indices"]
        tokens = question.split()
        question_html = create_tokens_html(tokens, [], gold_label, marker_indices)
        # Keep only short alternative answers to avoid cluttering the textbox.
        clean_answers = [a for a in example["clean_answers"] if len(a.split()) <= 6]
        clean_answers = ", ".join(clean_answers)
        return question_html, gold_label, clean_answers

    def get_model_outputs(self, example: dict, pipeline_state: PipelineState, buzz_threshold: float, early_stop: bool):
        """Run the agent over each progressive question prefix and score its outputs."""
        # Build one cumulative prefix of the question per run index.
        question_runs = []
        tokens = example["question"].split()
        for run_idx in example["run_indices"]:
            question_runs.append(" ".join(tokens[: run_idx + 1]))

        agent = SimpleTossupAgent(workflow=pipeline_state.workflow, buzz_threshold=buzz_threshold)
        outputs = list(agent.run(question_runs, early_stop=early_stop))
        outputs = add_model_scores(outputs, example["clean_answers"], example["run_indices"])
        return outputs

    def run_tossup(
        self,
        question_id: int,
        pipeline_state: PipelineState,
        buzz_threshold: float,
        early_stop: bool = True,
    ) -> tuple[str, Any, Any, Any]:
        """Run the agent in tossup mode with a system prompt.

        Returns:
            (question HTML, confidence-plot update, output-state update,
            results-table update) on success; a 3-tuple
            (error message, None, None) on failure.
        """
        try:
            # Validate inputs
            question_id = int(question_id - 1)
            if not self.ds or question_id < 0 or question_id >= len(self.ds):
                return "Invalid question ID or dataset not loaded", None, None
            example = self.ds[question_id]
            outputs = self.get_model_outputs(example, pipeline_state, buzz_threshold, early_stop)

            # Process results and prepare visualization data
            tokens_html, plot_data, output_state = initialize_eval_interface(example, outputs)
            df = process_tossup_results(outputs)
            return (
                tokens_html,
                gr.update(value=plot_data, label=f"Buzz Confidence on Question {question_id + 1}"),
                gr.update(value=output_state),
                gr.update(value=df, label=f"Model Outputs for Question {question_id + 1}"),
            )
        except Exception as e:
            import traceback

            error_msg = f"Error: {str(e)}\n{traceback.format_exc()}"
            # NOTE(review): error path returns 3 values while the success path
            # returns 4 — the last output (results_table) is left un-updated.
            return error_msg, None, None

    def evaluate_tossups(
        self, pipeline_state: PipelineState, buzz_threshold: float, progress: gr.Progress = gr.Progress()
    ):
        """Evaluate the tossup agent over the whole sample dataset."""
        try:
            # Validate inputs
            if not self.ds or not self.ds.num_rows:
                return "No dataset loaded", None, None

            buzz_counts = 0
            correct_buzzes = 0
            token_positions = []
            correctness = []
            for example in progress.tqdm(self.ds, desc="Evaluating tossup questions"):
                # Only the final emitted output carries the buzz decision.
                model_outputs = self.get_model_outputs(example, pipeline_state, buzz_threshold, early_stop=True)
                if model_outputs[-1]["buzz"]:
                    buzz_counts += 1
                    if model_outputs[-1]["score"] == 1:
                        correct_buzzes += 1
                    token_positions.append(model_outputs[-1]["token_position"])
                    correctness.append(model_outputs[-1]["score"])
            # NOTE(review): raises ZeroDivisionError when no question triggers
            # a buzz (buzz_counts == 0) — guard before dividing. The generic
            # except below currently masks this as "Error evaluating tossups".
            buzz_accuracy = correct_buzzes / buzz_counts
            df = pd.DataFrame(
                [
                    {
                        "Avg Buzz Position": f"{np.mean(token_positions):.2f}",
                        "Buzz Accuracy": f"{buzz_accuracy:.2%}",
                        "Total Score": f"{correct_buzzes}/{len(self.ds)}",
                    }
                ]
            )
            plot_data = create_scatter_pyplot(token_positions, correctness)
            # NOTE(review): success path returns 2 values but both error paths
            # return 3, while the click handler declares 2 outputs — confirm.
            return (
                gr.update(value=df, label="Scores on Sample Set"),
                gr.update(value=plot_data, label="Buzz Positions on Sample Set"),
            )
        except Exception:
            import traceback

            logging.error(f"Error evaluating tossups: {traceback.format_exc()}")
            return "Error evaluating tossups", None, None

    def submit_model(
        self, model_name: str, description: str, pipeline_state: PipelineState, profile: gr.OAuthProfile = None
    ):
        """Submit the model output."""
        return submit.submit_model(model_name, description, pipeline_state.workflow, "tossup", profile)

    def _setup_event_listeners(self):
        # Refresh the question display on app load and whenever the ID changes.
        gr.on(
            triggers=[self.app.load, self.qid_selector.change],
            fn=self.get_new_question_html,
            inputs=[self.qid_selector],
            outputs=[self.question_display, self.answer_display, self.clean_answer_display],
        )

        # Validate the workflow first; only run the tossup if validation succeeds.
        self.run_btn.click(
            self.pipeline_interface.validate_workflow,
            inputs=[self.pipeline_interface.pipeline_state],
            outputs=[self.pipeline_interface.pipeline_state],
        ).success(
            self.run_tossup,
            inputs=[
                self.qid_selector,
                self.pipeline_interface.pipeline_state,
                self.buzz_t_slider,
                self.early_stop_checkbox,
            ],
            outputs=[
                self.question_display,
                self.confidence_plot,
                self.output_state,
                self.results_table,
            ],
        )

        self.eval_btn.click(
            fn=self.evaluate_tossups,
            inputs=[self.pipeline_interface.pipeline_state, self.buzz_t_slider],
            outputs=[self.results_table, self.confidence_plot],
        )

        self.submit_btn.click(
            fn=self.submit_model,
            inputs=[
                self.model_name_input,
                self.description_input,
                self.pipeline_interface.pipeline_state,
            ],
            outputs=[self.submit_status],
        )

        # Token clicks from the front-end arrive via the hidden textbox and
        # re-render the confidence plot at the clicked position.
        self.hidden_input.change(
            fn=update_plot,
            inputs=[self.hidden_input, self.output_state],
            outputs=[self.confidence_plot],
        )
src/components/quizbowl/utils.py
ADDED
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import Any, Dict, List
|
2 |
+
|
3 |
+
import pandas as pd
|
4 |
+
|
5 |
+
|
6 |
+
def _create_confidence_plot_data(results: List[Dict], top_k_mode: bool = False) -> pd.DataFrame:
|
7 |
+
"""Create a DataFrame for the confidence plot."""
|
8 |
+
if not top_k_mode:
|
9 |
+
return pd.DataFrame(
|
10 |
+
{
|
11 |
+
"position": [r["position"] for r in results],
|
12 |
+
"confidence": [r["confidence"] for r in results],
|
13 |
+
"answer": [r["answer"] for r in results],
|
14 |
+
}
|
15 |
+
)
|
16 |
+
|
17 |
+
# For top-k mode, extract and plot top answers
|
18 |
+
return _create_top_k_plot_data(results)
|
19 |
+
|
20 |
+
|
21 |
+
def _create_top_k_plot_data(results: List[Dict]) -> pd.DataFrame:
|
22 |
+
"""Create plot data for top-k mode."""
|
23 |
+
# Find top answers across all positions (limited to top 5)
|
24 |
+
top_answers = set()
|
25 |
+
for r in results:
|
26 |
+
for g in r.get("guesses", [])[:3]: # Get top 3 from each position
|
27 |
+
if g.get("answer"):
|
28 |
+
top_answers.add(g.get("answer"))
|
29 |
+
|
30 |
+
top_answers = list(top_answers)[:5] # Limit to 5 total answers
|
31 |
+
|
32 |
+
# Create plot data for each answer
|
33 |
+
all_data = []
|
34 |
+
for position_idx, result in enumerate(results):
|
35 |
+
position = result["position"]
|
36 |
+
for answer in top_answers:
|
37 |
+
confidence = 0
|
38 |
+
for guess in result.get("guesses", []):
|
39 |
+
if guess.get("answer") == answer:
|
40 |
+
confidence = guess.get("confidence", 0)
|
41 |
+
break
|
42 |
+
all_data.append({"position": position, "confidence": confidence, "answer": answer})
|
43 |
+
|
44 |
+
return pd.DataFrame(all_data)
|
45 |
+
|
46 |
+
|
47 |
+
def _create_top_k_dataframe(results: List[Dict]) -> pd.DataFrame:
|
48 |
+
"""Create a DataFrame for top-k results."""
|
49 |
+
df_rows = []
|
50 |
+
for result in results:
|
51 |
+
position = result["position"]
|
52 |
+
for i, guess in enumerate(result.get("guesses", [])):
|
53 |
+
df_rows.append(
|
54 |
+
{
|
55 |
+
"position": position,
|
56 |
+
"answer": guess.get("answer", ""),
|
57 |
+
"confidence": guess.get("confidence", 0),
|
58 |
+
"rank": i + 1,
|
59 |
+
}
|
60 |
+
)
|
61 |
+
return pd.DataFrame(df_rows)
|
62 |
+
|
63 |
+
|
64 |
+
def _format_buzz_result(buzzed: bool, results: List[Dict], gold_label: str, top_k_mode: bool) -> tuple[str, str, bool]:
|
65 |
+
"""Format the result text based on whether the agent buzzed."""
|
66 |
+
if not buzzed:
|
67 |
+
return f"Did not buzz. Correct answer was: {gold_label}", "No buzz", False
|
68 |
+
|
69 |
+
buzz_position = next(i for i, r in enumerate(results) if r.get("buzz", False))
|
70 |
+
buzz_result = results[buzz_position]
|
71 |
+
|
72 |
+
if top_k_mode:
|
73 |
+
# For top-k, check if any of the top guesses match
|
74 |
+
top_answers = [g.get("answer", "").lower() for g in buzz_result.get("guesses", [])]
|
75 |
+
correct = gold_label.lower() in [a.lower() for a in top_answers]
|
76 |
+
final_answer = top_answers[0] if top_answers else "No answer"
|
77 |
+
else:
|
78 |
+
# For regular mode
|
79 |
+
final_answer = buzz_result["answer"]
|
80 |
+
correct = final_answer.lower() == gold_label.lower()
|
81 |
+
|
82 |
+
result_text = f"BUZZED at position {buzz_position + 1} with answer: {final_answer}\n"
|
83 |
+
result_text += f"Correct answer: {gold_label}\n"
|
84 |
+
result_text += f"Result: {'CORRECT' if correct else 'INCORRECT'}"
|
85 |
+
|
86 |
+
return result_text, final_answer, correct
|
src/components/utils.py
ADDED
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import Any, Literal
|
2 |
+
|
3 |
+
import gradio as gr
|
4 |
+
|
5 |
+
DIRECTIONS = Literal["up", "down"]
|
6 |
+
|
7 |
+
|
8 |
+
def make_state(value: Any) -> gr.State:
    """Wrap ``value`` in a gr.State, passing through values that already are one."""
    return value if isinstance(value, gr.State) else gr.State(value)
+
|
15 |
+
|
16 |
+
# List utilities
|
17 |
+
def move_item(items: list, position: int, direction: DIRECTIONS):
    """Move an item up or down in a list, swapping it with its neighbor in place.

    A move past either end of the list is validated but silently ignored.

    Raises:
        ValueError: If ``items`` is not a list, ``position`` is not a valid
            index, or ``direction`` is not "up"/"down".
    """
    if not isinstance(items, list):
        raise ValueError("items must be a list")
    if not isinstance(position, int) or not (0 <= position < len(items)):
        raise ValueError("position must be a valid index in the list")
    if direction not in ("up", "down"):
        raise ValueError("direction must be 'up' or 'down'")

    neighbor = position - 1 if direction == "up" else position + 1
    if 0 <= neighbor < len(items):
        items[position], items[neighbor] = items[neighbor], items[position]
src/display/__init__.py
ADDED
File without changes
|
src/display/css_html_js.py
ADDED
@@ -0,0 +1,122 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
custom_css = """
|
2 |
+
|
3 |
+
.markdown-text {
|
4 |
+
font-size: 16px !important;
|
5 |
+
}
|
6 |
+
|
7 |
+
#models-to-add-text {
|
8 |
+
font-size: 18px !important;
|
9 |
+
}
|
10 |
+
|
11 |
+
#citation-button span {
|
12 |
+
font-size: 16px !important;
|
13 |
+
}
|
14 |
+
|
15 |
+
#citation-button textarea {
|
16 |
+
font-size: 16px !important;
|
17 |
+
}
|
18 |
+
|
19 |
+
#citation-button > label > button {
|
20 |
+
margin: 6px;
|
21 |
+
transform: scale(1.3);
|
22 |
+
}
|
23 |
+
|
24 |
+
#leaderboard-table {
|
25 |
+
margin-top: 15px
|
26 |
+
}
|
27 |
+
|
28 |
+
#leaderboard-table-lite {
|
29 |
+
margin-top: 15px
|
30 |
+
}
|
31 |
+
|
32 |
+
#search-bar-table-box > div:first-child {
|
33 |
+
background: none;
|
34 |
+
border: none;
|
35 |
+
}
|
36 |
+
|
37 |
+
#search-bar {
|
38 |
+
padding: 0px;
|
39 |
+
}
|
40 |
+
|
41 |
+
/* Limit the width of the first AutoEvalColumn so that names don't expand too much */
|
42 |
+
#leaderboard-table td:nth-child(2),
|
43 |
+
#leaderboard-table th:nth-child(2) {
|
44 |
+
max-width: 400px;
|
45 |
+
overflow: auto;
|
46 |
+
white-space: nowrap;
|
47 |
+
}
|
48 |
+
|
49 |
+
/* Workflow JSON styling */
|
50 |
+
.workflow-json-container {
|
51 |
+
margin-top: 20px;
|
52 |
+
margin-bottom: 30px;
|
53 |
+
}
|
54 |
+
|
55 |
+
.workflow-json {
|
56 |
+
border: 1px solid #ddd;
|
57 |
+
border-radius: 8px;
|
58 |
+
box-shadow: 0 2px 5px rgba(0,0,0,0.1);
|
59 |
+
}
|
60 |
+
|
61 |
+
.workflow-json pre {
|
62 |
+
max-height: 500px;
|
63 |
+
overflow-y: auto;
|
64 |
+
}
|
65 |
+
|
66 |
+
.tab-buttons button {
|
67 |
+
font-size: 20px;
|
68 |
+
}
|
69 |
+
|
70 |
+
#scale-logo {
|
71 |
+
border-style: none !important;
|
72 |
+
box-shadow: none;
|
73 |
+
display: block;
|
74 |
+
margin-left: auto;
|
75 |
+
margin-right: auto;
|
76 |
+
max-width: 600px;
|
77 |
+
}
|
78 |
+
|
79 |
+
#scale-logo .download {
|
80 |
+
display: none;
|
81 |
+
}
|
82 |
+
#filter_type{
|
83 |
+
border: 0;
|
84 |
+
padding-left: 0;
|
85 |
+
padding-top: 0;
|
86 |
+
}
|
87 |
+
#filter_type label {
|
88 |
+
display: flex;
|
89 |
+
}
|
90 |
+
#filter_type label > span{
|
91 |
+
margin-top: var(--spacing-lg);
|
92 |
+
margin-right: 0.5em;
|
93 |
+
}
|
94 |
+
#filter_type label > .wrap{
|
95 |
+
width: 103px;
|
96 |
+
}
|
97 |
+
#filter_type label > .wrap .wrap-inner{
|
98 |
+
padding: 2px;
|
99 |
+
}
|
100 |
+
#filter_type label > .wrap .wrap-inner input{
|
101 |
+
width: 1px
|
102 |
+
}
|
103 |
+
#filter-columns-type{
|
104 |
+
border:0;
|
105 |
+
padding:0.5;
|
106 |
+
}
|
107 |
+
#filter-columns-size{
|
108 |
+
border:0;
|
109 |
+
padding:0.5;
|
110 |
+
}
|
111 |
+
#box-filter > .form{
|
112 |
+
border: 0
|
113 |
+
}
|
114 |
+
"""
|
115 |
+
|
116 |
+
get_window_url_params = """
|
117 |
+
function(url_params) {
|
118 |
+
const params = new URLSearchParams(window.location.search);
|
119 |
+
url_params = Object.fromEntries(params);
|
120 |
+
return url_params;
|
121 |
+
}
|
122 |
+
"""
|
src/display/custom_css.py
ADDED
@@ -0,0 +1,413 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
css_pipeline = """
|
2 |
+
:root {
|
3 |
+
color-scheme: light !important;
|
4 |
+
--block-border-width: 0;
|
5 |
+
--section-header-text-weight: 600;
|
6 |
+
--section-header-text-size: 14px;
|
7 |
+
--mono-font-family: "Roboto Mono", monospace;
|
8 |
+
--body-text-size: 14px !important;
|
9 |
+
|
10 |
+
--card-bg-color: #fcecd4;
|
11 |
+
--card-btn-color: #D4E4FC;
|
12 |
+
--card-btn-color-hover: #7DAEF6;
|
13 |
+
--answer-bg-color: #f0f8ff;
|
14 |
+
--hover-border-color: #121212;
|
15 |
+
}
|
16 |
+
|
17 |
+
.dark {
|
18 |
+
--block-border-width: 0;
|
19 |
+
--card-bg-color: #383127;
|
20 |
+
--answer-bg-color: #1a2b3c;
|
21 |
+
--hover-border-color: #ffffff;
|
22 |
+
}
|
23 |
+
|
24 |
+
.gradio-app {
|
25 |
+
// font-family: Arial, sans-serif;
|
26 |
+
}
|
27 |
+
|
28 |
+
.form {
|
29 |
+
box-shadow: 0 0 0 0 !important;
|
30 |
+
}
|
31 |
+
|
32 |
+
.head {
|
33 |
+
margin-bottom: 0px;
|
34 |
+
}
|
35 |
+
|
36 |
+
.gradio-container {
|
37 |
+
max-width: 1500px;
|
38 |
+
margin: 0 auto;
|
39 |
+
padding: 0 8px;
|
40 |
+
}
|
41 |
+
|
42 |
+
.html-container {
|
43 |
+
padding: 0px 0px;
|
44 |
+
margin: 4px 0px;
|
45 |
+
border-radius: 12px;
|
46 |
+
gap: 0px
|
47 |
+
}
|
48 |
+
|
49 |
+
.step-container {
|
50 |
+
background-color: var(--card-bg-color);
|
51 |
+
padding: 0px 0px;
|
52 |
+
margin: 4px 0px;
|
53 |
+
box-shadow: 0 4px 8px rgba(0, 0, 0, 0.2);
|
54 |
+
border-radius: 12px;
|
55 |
+
gap: 0px
|
56 |
+
}
|
57 |
+
|
58 |
+
.step-container:hover {
|
59 |
+
border-color: var(--hover-border-color);
|
60 |
+
box-shadow: 0 2px 8px rgba(0, 0, 0, 0.2);
|
61 |
+
}
|
62 |
+
|
63 |
+
.step-accordion {
|
64 |
+
background-color: var(--card-bg-color);
|
65 |
+
border: 0px solid #e0e0e0 !important;
|
66 |
+
border-radius: 12px;
|
67 |
+
overflow: hidden;
|
68 |
+
// transition: box-shadow 0.3s ease, border-color 0.3s ease;
|
69 |
+
padding: 8px 8px;
|
70 |
+
font-size: 12px;
|
71 |
+
}
|
72 |
+
|
73 |
+
.output-fields-panel {
|
74 |
+
background-color: var(--card-bg-color);
|
75 |
+
border: 0px solid #e0e0e0 !important;
|
76 |
+
border-radius: 12px;
|
77 |
+
overflow: hidden;
|
78 |
+
transition: box-shadow 0.3s ease, border-color 0.3s ease;
|
79 |
+
padding: 8px 8px;
|
80 |
+
font-size: 12px;
|
81 |
+
}
|
82 |
+
|
83 |
+
.model-submission-accordion {
|
84 |
+
background-color: var(--card-bg-color);
|
85 |
+
border: 0px solid #e0e0e0 !important;
|
86 |
+
border-radius: 12px;
|
87 |
+
overflow: hidden;
|
88 |
+
transition: box-shadow 0.3s ease, border-color 0.3s ease;
|
89 |
+
font-size: 14px;
|
90 |
+
}
|
91 |
+
|
92 |
+
.model-submission-accordion > label-wrap {
|
93 |
+
font-size: 16px;
|
94 |
+
font-weight: bold !important;
|
95 |
+
}
|
96 |
+
|
97 |
+
.step-accordion:hover .step-name-input input {
|
98 |
+
font-weight: bold;
|
99 |
+
}
|
100 |
+
|
101 |
+
.step-accordion > label-wrap {
|
102 |
+
font-size: 14px;
|
103 |
+
font-weight: bold !important;
|
104 |
+
}
|
105 |
+
|
106 |
+
.step-header-row {
|
107 |
+
margin: 0px 0px;
|
108 |
+
padding: 0px 0px;
|
109 |
+
border: 0px !important;
|
110 |
+
}
|
111 |
+
|
112 |
+
.step-header-row form {
|
113 |
+
margin: 0px 0px;
|
114 |
+
padding: 0px 0px;
|
115 |
+
border: 0px !important;
|
116 |
+
}
|
117 |
+
|
118 |
+
.step-name {
|
119 |
+
margin: 0px
|
120 |
+
padding: 0px 0px;
|
121 |
+
// border-radius: 8px;
|
122 |
+
border: 0px !important
|
123 |
+
}
|
124 |
+
|
125 |
+
.model-dropdown {
|
126 |
+
margin: 0px
|
127 |
+
padding: 0px 8px;
|
128 |
+
}
|
129 |
+
|
130 |
+
.model-dropdown input {
|
131 |
+
font-size: 14px;
|
132 |
+
padding-bottom: 2px;
|
133 |
+
padding-top: 2px;
|
134 |
+
}
|
135 |
+
|
136 |
+
.step-name input {
|
137 |
+
font-size: 14px;
|
138 |
+
font-weight: bold;
|
139 |
+
padding-bottom: 8px;
|
140 |
+
margin-bottom: 4px;
|
141 |
+
border-radius: 12px !important;
|
142 |
+
}
|
143 |
+
|
144 |
+
.step-controls {
|
145 |
+
display: flex;
|
146 |
+
justify-content: flex-end;
|
147 |
+
gap: 12px;
|
148 |
+
background-color: var(--card-bg-color);
|
149 |
+
border-radius: 12px;
|
150 |
+
padding: 0px
|
151 |
+
border: 1px solid black;
|
152 |
+
}
|
153 |
+
|
154 |
+
.step-control-btn {
|
155 |
+
background-color: var(--card-btn-color);
|
156 |
+
font-size: 12px !important;
|
157 |
+
color: var(--body-text-color);
|
158 |
+
min-width: 36px !important;
|
159 |
+
min-height: 24px !important;
|
160 |
+
padding: 4px !important;
|
161 |
+
margin: 8px !important;
|
162 |
+
border-radius: 12px;
|
163 |
+
}
|
164 |
+
|
165 |
+
.step-control-btn:hover {
|
166 |
+
background-color: var(--card-btn-color-hover);
|
167 |
+
}
|
168 |
+
|
169 |
+
.step-tabs {
|
170 |
+
margin-top: 0px;
|
171 |
+
padding: 0px 0px;
|
172 |
+
border-radius: 0px;
|
173 |
+
border: 0px
|
174 |
+
background-color: transparent;
|
175 |
+
}
|
176 |
+
|
177 |
+
.tab-content {
|
178 |
+
padding: 0px 0px;
|
179 |
+
margin-bottom: 0px;
|
180 |
+
border-radius: 4px;
|
181 |
+
border: 0px solid #eee;
|
182 |
+
background-color: transparent !important;
|
183 |
+
}
|
184 |
+
|
185 |
+
.fields-panel {
|
186 |
+
background-color: transparent !important;
|
187 |
+
gap: 5px !important;
|
188 |
+
border-radius: 4px;
|
189 |
+
padding: 2px;
|
190 |
+
}
|
191 |
+
|
192 |
+
.field-row {
|
193 |
+
margin-bottom: 1px;
|
194 |
+
margin-top: 1px;
|
195 |
+
padding: 2px;
|
196 |
+
border-radius: 8px;
|
197 |
+
background-color: var(--block-background-fill) !important;
|
198 |
+
border: 0px solid #eee;
|
199 |
+
gap: 0px !important;
|
200 |
+
}
|
201 |
+
|
202 |
+
.output-field-row {
|
203 |
+
margin-bottom: 1px;
|
204 |
+
margin-top: 1px;
|
205 |
+
padding: 2px;
|
206 |
+
border-radius: 4px;
|
207 |
+
background-color: var(--block-background-fill) !important;
|
208 |
+
border: 0px solid #eee;
|
209 |
+
gap: 0px !important;
|
210 |
+
}
|
211 |
+
|
212 |
+
.output-fields-header {
|
213 |
+
padding: 0px 8px;
|
214 |
+
}
|
215 |
+
|
216 |
+
.output-fields-panel {
|
217 |
+
background-color: var(--block-background-fill) !important;
|
218 |
+
padding: 8px 8px;
|
219 |
+
}
|
220 |
+
|
221 |
+
.output-field-variable {
|
222 |
+
font-family: var(--mono-font-family) !important;
|
223 |
+
font-weight: 300 !important;
|
224 |
+
font-size: 12px !important;
|
225 |
+
padding: 8px 8px;
|
226 |
+
border-radius: 4px;
|
227 |
+
border: 0px solid #eee !important;
|
228 |
+
}
|
229 |
+
|
230 |
+
.output-field-variable span {
|
231 |
+
font-size: 12px !important;
|
232 |
+
}
|
233 |
+
|
234 |
+
.field-type {
|
235 |
+
min-width: 100px !important;
|
236 |
+
}
|
237 |
+
|
238 |
+
.field-name > label, .field-variable > label, .field-description > label, .field-type > label {
|
239 |
+
font-size: 12px !important;
|
240 |
+
}
|
241 |
+
|
242 |
+
.field-name input, .field-description input, .field-type input {
|
243 |
+
font-family: var(--mono-font-family) !important;
|
244 |
+
font-size: 12px !important;
|
245 |
+
}
|
246 |
+
|
247 |
+
.field-variable input, .field-type input, .output-field-variable input {
|
248 |
+
font-family: var(--mono-font-family) !important;
|
249 |
+
font-size: 12px !important;
|
250 |
+
padding-top: 3px;
|
251 |
+
padding-bottom: 3px;
|
252 |
+
}
|
253 |
+
|
254 |
+
.field-name listbox, .field-variable listbox, .field-type listbox {
|
255 |
+
font-family: var(--mono-font-family) !important;
|
256 |
+
padding-top: 2px;
|
257 |
+
padding-bottom: 2px;
|
258 |
+
font-size: 12px !important;
|
259 |
+
}
|
260 |
+
|
261 |
+
.field-description {
|
262 |
+
font-size: 12px !important;
|
263 |
+
}
|
264 |
+
|
265 |
+
/* Accordion button labels */
|
266 |
+
.step-accordion button.label-wrap {
|
267 |
+
font-size: 14px;
|
268 |
+
font-weight: bold !important;
|
269 |
+
font-family: var(--mono-font-family) !important;
|
270 |
+
}
|
271 |
+
|
272 |
+
.step-accordion button.label-wrap.open {
|
273 |
+
font-size: 14px;
|
274 |
+
font-weight: bold !important;
|
275 |
+
font-family: var(--mono-font-family) !important;
|
276 |
+
}
|
277 |
+
|
278 |
+
.button-column {
|
279 |
+
margin-top: 2px;
|
280 |
+
margin-bottom: 2px;
|
281 |
+
padding-top: 2px;
|
282 |
+
padding-bottom: 2px;
|
283 |
+
display: flex;
|
284 |
+
flex-direction: column;
|
285 |
+
justify-content: center;
|
286 |
+
align-items: center;
|
287 |
+
gap: 4x !important;
|
288 |
+
height: 100%;
|
289 |
+
}
|
290 |
+
|
291 |
+
.icon-button {
|
292 |
+
min-width: 28px !important;
|
293 |
+
max-width: 42px !important;
|
294 |
+
height: 28px !important;
|
295 |
+
max-height: 42px !important;
|
296 |
+
padding: 0 !important;
|
297 |
+
border-radius: 4px !important;
|
298 |
+
transition: background-color 0.2s ease, color 0.2s ease;
|
299 |
+
}
|
300 |
+
|
301 |
+
.delete-button {
|
302 |
+
background-color: #ffebee !important;
|
303 |
+
color: #d32f2f !important;
|
304 |
+
}
|
305 |
+
|
306 |
+
.delete-button:hover {
|
307 |
+
background-color: #ffcdd2 !important;
|
308 |
+
}
|
309 |
+
|
310 |
+
.up-button, .down-button {
|
311 |
+
background-color: #e3f2fd !important;
|
312 |
+
color: #1976d2 !important;
|
313 |
+
}
|
314 |
+
|
315 |
+
.up-button:hover, .down-button:hover {
|
316 |
+
background-color: #bbdefb !important;
|
317 |
+
}
|
318 |
+
|
319 |
+
.add-field-button {
|
320 |
+
background-color: #e8f5e9 !important;
|
321 |
+
color: #2e7d32 !important;
|
322 |
+
}
|
323 |
+
|
324 |
+
.add-field-button:hover, .add-step-button:hover {
|
325 |
+
background-color: #c8e6c9 !important;
|
326 |
+
}
|
327 |
+
|
328 |
+
.pipeline-controls {
|
329 |
+
border-top: 1px solid #eee;
|
330 |
+
padding-top: 8px;
|
331 |
+
}
|
332 |
+
|
333 |
+
.pipeline-header {
|
334 |
+
border-bottom: 1px solid #eee;
|
335 |
+
padding: 8px 0px;
|
336 |
+
}
|
337 |
+
|
338 |
+
.pipeline-footer {
|
339 |
+
border-top: 1px solid #eee;
|
340 |
+
padding: 8px 0px;
|
341 |
+
}
|
342 |
+
|
343 |
+
.add-step-button {
|
344 |
+
background-color: #e8f5e9 !important;
|
345 |
+
color: #2e7d32 !important;
|
346 |
+
border-radius: 12px;
|
347 |
+
}
|
348 |
+
|
349 |
+
.export-button {
|
350 |
+
background-color: #e0f7f5 !important;
|
351 |
+
color: #00796b !important;
|
352 |
+
border-radius: 12px;
|
353 |
+
}
|
354 |
+
|
355 |
+
.export-button:hover {
|
356 |
+
background-color: #b2dfdb !important;
|
357 |
+
}
|
358 |
+
|
359 |
+
.pipeline-preview {
|
360 |
+
background-color: var(--card-bg-color);
|
361 |
+
border-radius: 12px;
|
362 |
+
box-shadow: 0 0 0 0 !important;
|
363 |
+
}
|
364 |
+
"""
|
365 |
+
|
366 |
+
|
367 |
+
css_tossup = """
|
368 |
+
.token {
|
369 |
+
display: inline-block;
|
370 |
+
padding: 1px 3px;
|
371 |
+
margin: 1px;
|
372 |
+
border-radius: 4px;
|
373 |
+
cursor: pointer;
|
374 |
+
transition: background-color 0.2s;
|
375 |
+
}
|
376 |
+
.answer-header {
|
377 |
+
font-weight: 900;
|
378 |
+
font-size: 16px;
|
379 |
+
padding: 8px;
|
380 |
+
border-radius: 8px;
|
381 |
+
background-color: var(--answer-bg-color) !important;
|
382 |
+
}
|
383 |
+
.answer-box textarea {
|
384 |
+
font-size: 16px;
|
385 |
+
padding: 8px;
|
386 |
+
border-radius: 8px;
|
387 |
+
background-color: var(--answer-bg-color) !important;
|
388 |
+
}
|
389 |
+
.token:hover, .token.highlighted {
|
390 |
+
background-color: #ffcc00;
|
391 |
+
}
|
392 |
+
.token.guess-point {
|
393 |
+
border-bottom: 3px solid;
|
394 |
+
}
|
395 |
+
.token.no-buzz {
|
396 |
+
border-color: #6b96b3;
|
397 |
+
}
|
398 |
+
.token.buzz-0 {
|
399 |
+
border-color: #ff4444;
|
400 |
+
}
|
401 |
+
.token.buzz-1 {
|
402 |
+
border-color: #228b22; /* Darker and slightly muted green */
|
403 |
+
}
|
404 |
+
.token-container {
|
405 |
+
line-height: 1.7;
|
406 |
+
padding: 5px;
|
407 |
+
margin-left: 4px;
|
408 |
+
margin-right: 4px;
|
409 |
+
background-color: var(--answer-bg-color) !important;
|
410 |
+
border-radius: 8px;
|
411 |
+
margin-bottom: 10px;
|
412 |
+
}
|
413 |
+
"""
|
src/display/formatting.py
ADDED
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
def model_hyperlink(link, model_name):
|
2 |
+
return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
|
3 |
+
|
4 |
+
|
5 |
+
def make_clickable_model(model_name):
|
6 |
+
link = f"https://huggingface.co/{model_name}"
|
7 |
+
return model_hyperlink(link, model_name)
|
8 |
+
|
9 |
+
|
10 |
+
def styled_error(error):
|
11 |
+
return f"<p style='color: red; font-size: 20px; text-align: center;'>{error}</p>"
|
12 |
+
|
13 |
+
|
14 |
+
def styled_warning(warn):
|
15 |
+
return f"<p style='color: orange; font-size: 20px; text-align: center;'>{warn}</p>"
|
16 |
+
|
17 |
+
|
18 |
+
def styled_message(message):
|
19 |
+
return f"<p style='color: green; font-size: 20px; text-align: center;'>{message}</p>"
|
20 |
+
|
21 |
+
|
22 |
+
def has_no_nan_values(df, columns):
|
23 |
+
return df[columns].notna().all(axis=1)
|
24 |
+
|
25 |
+
|
26 |
+
def has_nan_values(df, columns):
|
27 |
+
return df[columns].isna().any(axis=1)
|
src/display/utils.py
ADDED
@@ -0,0 +1,110 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from dataclasses import dataclass, make_dataclass
|
2 |
+
from enum import Enum
|
3 |
+
|
4 |
+
import pandas as pd
|
5 |
+
|
6 |
+
from src.about import Tasks
|
7 |
+
|
8 |
+
def fields(raw_class):
|
9 |
+
return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
|
10 |
+
|
11 |
+
|
12 |
+
# These classes are for user facing column names,
|
13 |
+
# to avoid having to change them all around the code
|
14 |
+
# when a modif is needed
|
15 |
+
@dataclass
|
16 |
+
class ColumnContent:
|
17 |
+
name: str
|
18 |
+
type: str
|
19 |
+
displayed_by_default: bool
|
20 |
+
hidden: bool = False
|
21 |
+
never_hidden: bool = False
|
22 |
+
|
23 |
+
## Leaderboard columns
|
24 |
+
auto_eval_column_dict = []
|
25 |
+
# Init
|
26 |
+
auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
|
27 |
+
auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
|
28 |
+
#Scores
|
29 |
+
auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
|
30 |
+
for task in Tasks:
|
31 |
+
auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
|
32 |
+
# Model information
|
33 |
+
auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
|
34 |
+
auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
|
35 |
+
auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
|
36 |
+
auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
|
37 |
+
auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
|
38 |
+
auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
|
39 |
+
auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
|
40 |
+
auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
|
41 |
+
auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
|
42 |
+
|
43 |
+
# We use make dataclass to dynamically fill the scores from Tasks
|
44 |
+
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
|
45 |
+
|
46 |
+
## For the queue columns in the submission tab
|
47 |
+
@dataclass(frozen=True)
|
48 |
+
class EvalQueueColumn: # Queue column
|
49 |
+
model = ColumnContent("model", "markdown", True)
|
50 |
+
revision = ColumnContent("revision", "str", True)
|
51 |
+
private = ColumnContent("private", "bool", True)
|
52 |
+
precision = ColumnContent("precision", "str", True)
|
53 |
+
weight_type = ColumnContent("weight_type", "str", "Original")
|
54 |
+
status = ColumnContent("status", "str", True)
|
55 |
+
|
56 |
+
## All the model information that we might need
|
57 |
+
@dataclass
|
58 |
+
class ModelDetails:
|
59 |
+
name: str
|
60 |
+
display_name: str = ""
|
61 |
+
symbol: str = "" # emoji
|
62 |
+
|
63 |
+
|
64 |
+
class ModelType(Enum):
|
65 |
+
PT = ModelDetails(name="pretrained", symbol="🟢")
|
66 |
+
FT = ModelDetails(name="fine-tuned", symbol="🔶")
|
67 |
+
IFT = ModelDetails(name="instruction-tuned", symbol="⭕")
|
68 |
+
RL = ModelDetails(name="RL-tuned", symbol="🟦")
|
69 |
+
Unknown = ModelDetails(name="", symbol="?")
|
70 |
+
|
71 |
+
def to_str(self, separator=" "):
|
72 |
+
return f"{self.value.symbol}{separator}{self.value.name}"
|
73 |
+
|
74 |
+
@staticmethod
|
75 |
+
def from_str(type):
|
76 |
+
if "fine-tuned" in type or "🔶" in type:
|
77 |
+
return ModelType.FT
|
78 |
+
if "pretrained" in type or "🟢" in type:
|
79 |
+
return ModelType.PT
|
80 |
+
if "RL-tuned" in type or "🟦" in type:
|
81 |
+
return ModelType.RL
|
82 |
+
if "instruction-tuned" in type or "⭕" in type:
|
83 |
+
return ModelType.IFT
|
84 |
+
return ModelType.Unknown
|
85 |
+
|
86 |
+
class WeightType(Enum):
|
87 |
+
Adapter = ModelDetails("Adapter")
|
88 |
+
Original = ModelDetails("Original")
|
89 |
+
Delta = ModelDetails("Delta")
|
90 |
+
|
91 |
+
class Precision(Enum):
|
92 |
+
float16 = ModelDetails("float16")
|
93 |
+
bfloat16 = ModelDetails("bfloat16")
|
94 |
+
Unknown = ModelDetails("?")
|
95 |
+
|
96 |
+
def from_str(precision):
|
97 |
+
if precision in ["torch.float16", "float16"]:
|
98 |
+
return Precision.float16
|
99 |
+
if precision in ["torch.bfloat16", "bfloat16"]:
|
100 |
+
return Precision.bfloat16
|
101 |
+
return Precision.Unknown
|
102 |
+
|
103 |
+
# Column selection
|
104 |
+
COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
|
105 |
+
|
106 |
+
EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
|
107 |
+
EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
|
108 |
+
|
109 |
+
BENCHMARK_COLS = [t.value.col_name for t in Tasks]
|
110 |
+
|
src/envs.py
ADDED
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
|
3 |
+
from huggingface_hub import HfApi
|
4 |
+
|
5 |
+
# Info to change for your repository
|
6 |
+
# ----------------------------------
|
7 |
+
TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org
|
8 |
+
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
|
9 |
+
ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY")
|
10 |
+
COHERE_API_KEY = os.environ.get("COHERE_API_KEY")
|
11 |
+
|
12 |
+
OWNER = (
|
13 |
+
"umdclip" # Change to your org - don't forget to create a results and request dataset, with the correct format!
|
14 |
+
)
|
15 |
+
# ----------------------------------
|
16 |
+
|
17 |
+
REPO_ID = f"{OWNER}/advcal-leaderboard"
|
18 |
+
QUEUE_REPO = f"{OWNER}/advcal-requests"
|
19 |
+
RESULTS_REPO = f"{OWNER}/advcal-results"
|
20 |
+
|
21 |
+
PLAYGROUND_DATASET_NAMES = {
|
22 |
+
"tossup": "umdclip/acf-co24-tossups",
|
23 |
+
"bonus": "umdclip/acf-co24-bonuses",
|
24 |
+
}
|
25 |
+
|
26 |
+
# If you setup a cache later, just change HF_HOME
|
27 |
+
CACHE_PATH = os.getenv("HF_HOME", ".")
|
28 |
+
|
29 |
+
# Local caches
|
30 |
+
EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
|
31 |
+
EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
|
32 |
+
EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
|
33 |
+
EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
|
34 |
+
|
35 |
+
THEME = "gstaff/xkcd"
|
36 |
+
UNSELECTED_VAR_NAME = "Select Variable..."
|
37 |
+
UNSELECTED_MODEL_NAME = "Select Model..."
|
38 |
+
AVAILABLE_MODELS = {
|
39 |
+
"OpenAI/gpt-4o": {
|
40 |
+
"model": "gpt-4o-2024-11-20",
|
41 |
+
},
|
42 |
+
"OpenAI/gpt-4o-mini": {
|
43 |
+
"model": "gpt-4o-mini-2024-07-18",
|
44 |
+
},
|
45 |
+
"OpenAI/gpt-3.5-turbo": {
|
46 |
+
"model": "gpt-3.5-turbo-0125",
|
47 |
+
},
|
48 |
+
"Anthropic/claude-3-7-sonnet": {
|
49 |
+
"model": "claude-3-7-sonnet-20250219",
|
50 |
+
},
|
51 |
+
"Anthropic/claude-3-5-sonnet": {
|
52 |
+
"model": "claude-3-5-sonnet-20241022",
|
53 |
+
},
|
54 |
+
"Anthropic/claude-3-5-haiku": {
|
55 |
+
"model": "claude-3-5-haiku-20241022",
|
56 |
+
},
|
57 |
+
"Cohere/command-r": {
|
58 |
+
"model": "command-r-08-2024",
|
59 |
+
},
|
60 |
+
"Cohere/command-r-plus": {
|
61 |
+
"model": "command-r-plus-08-2024",
|
62 |
+
},
|
63 |
+
"Cohere/command-r7b": {
|
64 |
+
"model": "command-r7b-12-2024",
|
65 |
+
},
|
66 |
+
}
|
67 |
+
|
68 |
+
DEFAULT_SELECTIONS = {
|
69 |
+
"tossup": {
|
70 |
+
"simple_workflow": False,
|
71 |
+
"model": "OpenAI/gpt-4o-mini",
|
72 |
+
"temperature": 0.2,
|
73 |
+
"buzz_threshold": 0.85,
|
74 |
+
"early_stop": True,
|
75 |
+
},
|
76 |
+
"bonus": {
|
77 |
+
"simple_workflow": False,
|
78 |
+
"model": "OpenAI/gpt-4o-mini",
|
79 |
+
"temperature": 0.2,
|
80 |
+
"buzz_threshold": 0.85,
|
81 |
+
"early_stop": True,
|
82 |
+
},
|
83 |
+
}
|
84 |
+
|
85 |
+
DAILY_SUBMISSION_LIMIT_PER_USER = 5
|
86 |
+
API = HfApi(token=TOKEN)
|
src/submission/structs.py
ADDED
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from datetime import datetime
|
2 |
+
from typing import Dict, List, Literal, Optional
|
3 |
+
|
4 |
+
from pydantic import BaseModel, Field
|
5 |
+
|
6 |
+
from workflows.structs import Workflow
|
7 |
+
|
8 |
+
CompetitionType = Literal["tossup", "bonus"]
|
9 |
+
SubmissionType = Literal["python_file", "simple_workflow", "complex_workflow"]
|
10 |
+
SubmissionStatus = Literal["submitted", "in_progress", "completed", "failed"]
|
11 |
+
|
12 |
+
|
13 |
+
class Submission(BaseModel):
|
14 |
+
"""
|
15 |
+
Represents a submission in the competition system, formatted for HuggingFace datasets.
|
16 |
+
|
17 |
+
This model is designed to be easily serializable to/from HuggingFace dataset format
|
18 |
+
while maintaining type safety and validation through Pydantic.
|
19 |
+
|
20 |
+
Attributes:
|
21 |
+
id: Unique identifier for the submission
|
22 |
+
name: Display name of the submission
|
23 |
+
description: Detailed description of what the submission does
|
24 |
+
user_email: Email of the user who created the submission
|
25 |
+
competition_type: Type of competition (Tossup or Bonus)
|
26 |
+
submission_type: Format of the submission (python file or workflow)
|
27 |
+
workflow: Optional workflow definition for workflow submissions, stored as JSON
|
28 |
+
code: Optional code content for python file submissions
|
29 |
+
status: Current status of the submission
|
30 |
+
created_at: ISO format timestamp of creation
|
31 |
+
updated_at: ISO format timestamp of last update
|
32 |
+
"""
|
33 |
+
|
34 |
+
id: str = Field(description="Unique identifier for the submission")
|
35 |
+
model_name: str = Field(description="Display name of the submission")
|
36 |
+
username: str = Field(description="HuggingFace username of the user who created the submission")
|
37 |
+
description: str = Field(description="Detailed description of what the submission does")
|
38 |
+
competition_type: CompetitionType = Field(description="Type of competition (tossup or bonus)")
|
39 |
+
submission_type: SubmissionType = Field(description="Format of the submission (python file or workflow)")
|
40 |
+
workflow: Optional[Workflow] = Field(default=None, description="Optional workflow definition stored as JSON dict")
|
41 |
+
code: Optional[str] = Field(default=None, description="Optional code content for python file submissions")
|
42 |
+
status: SubmissionStatus = Field(description="Current status of the submission")
|
43 |
+
created_at: str = Field(description="ISO format timestamp of creation")
|
44 |
+
updated_at: str = Field(description="ISO format timestamp of last update")
|
45 |
+
|
46 |
+
def to_dict(self) -> Dict:
|
47 |
+
"""Convert to dictionary format suitable for HF datasets"""
|
48 |
+
data = self.model_dump()
|
49 |
+
if self.workflow:
|
50 |
+
data["workflow"] = self.workflow.model_dump(exclude_defaults=True)
|
51 |
+
return data
|
52 |
+
|
53 |
+
@classmethod
|
54 |
+
def from_dict(cls, data: Dict) -> "Submission":
|
55 |
+
"""Create instance from dictionary format used in HF datasets"""
|
56 |
+
if data.get("workflow"):
|
57 |
+
data["workflow"] = Workflow.model_validate(data["workflow"])
|
58 |
+
return cls.model_validate(data)
|
src/submission/submit.py
ADDED
@@ -0,0 +1,170 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
import os
|
3 |
+
import traceback
|
4 |
+
from datetime import datetime, timedelta, timezone
|
5 |
+
from typing import Optional
|
6 |
+
|
7 |
+
import gradio as gr
|
8 |
+
import yaml
|
9 |
+
|
10 |
+
from src.display.formatting import styled_error, styled_message
|
11 |
+
from src.envs import API, DAILY_SUBMISSION_LIMIT_PER_USER, EVAL_REQUESTS_PATH, QUEUE_REPO
|
12 |
+
from src.submission.structs import CompetitionType, Submission, SubmissionStatus
|
13 |
+
from workflows.structs import Workflow
|
14 |
+
|
15 |
+
|
16 |
+
def get_user_submissions_today(username: str, competition_type: str) -> list[Submission]:
|
17 |
+
today = datetime.now(timezone.utc).strftime("%Y%m%d")
|
18 |
+
if username is None:
|
19 |
+
raise gr.Error("Authentication required. Please log in to view your submissions.")
|
20 |
+
out_dir = f"{EVAL_REQUESTS_PATH}/{username}"
|
21 |
+
submissions = []
|
22 |
+
if not os.path.exists(out_dir):
|
23 |
+
return submissions
|
24 |
+
for file in os.listdir(out_dir):
|
25 |
+
if not file.startswith(f"{competition_type}_"):
|
26 |
+
continue
|
27 |
+
with open(os.path.join(out_dir, file), "r") as f:
|
28 |
+
submission = Submission.from_dict(json.load(f))
|
29 |
+
if submission.created_at.startswith(today):
|
30 |
+
submissions.append(submission)
|
31 |
+
return submissions
|
32 |
+
|
33 |
+
|
34 |
+
def get_time_until_next_submission(tz: timezone = timezone.utc) -> str:
|
35 |
+
next_day_00 = datetime.now(tz) + timedelta(days=1)
|
36 |
+
next_day_00 = next_day_00.replace(hour=0, minute=0, second=0, microsecond=0)
|
37 |
+
remaining_time = next_day_00 - datetime.now(tz)
|
38 |
+
hours = remaining_time.seconds // 3600
|
39 |
+
minutes = (remaining_time.seconds % 3600) // 60
|
40 |
+
remaining_time_str = f"{hours} hours {minutes} mins"
|
41 |
+
return remaining_time_str
|
42 |
+
|
43 |
+
|
44 |
+
def create_submission(
|
45 |
+
username: str,
|
46 |
+
model_name: str,
|
47 |
+
description: str,
|
48 |
+
workflow: Workflow,
|
49 |
+
competition_type: CompetitionType,
|
50 |
+
) -> Submission:
|
51 |
+
"""
|
52 |
+
Create a submission for a tossup model.
|
53 |
+
|
54 |
+
Args:
|
55 |
+
name: Display name of the submission
|
56 |
+
description: Detailed description of what the submission does
|
57 |
+
user_email: Email of the user who created the submission
|
58 |
+
workflow: The workflow configuration for the tossup model
|
59 |
+
|
60 |
+
Returns:
|
61 |
+
Submission object if successful, None if validation fails
|
62 |
+
"""
|
63 |
+
# Create the submission
|
64 |
+
dt = datetime.now(timezone.utc)
|
65 |
+
submission = Submission(
|
66 |
+
id=f"{competition_type}_{dt.strftime('%Y%m%d_%H%M%S')}_{model_name.lower().replace(' ', '_')}",
|
67 |
+
model_name=model_name,
|
68 |
+
username=username,
|
69 |
+
description=description,
|
70 |
+
competition_type=competition_type,
|
71 |
+
submission_type="simple_workflow",
|
72 |
+
workflow=workflow,
|
73 |
+
status="submitted",
|
74 |
+
created_at=dt.isoformat(),
|
75 |
+
updated_at=dt.isoformat(),
|
76 |
+
)
|
77 |
+
|
78 |
+
return submission
|
79 |
+
|
80 |
+
|
81 |
+
def submit_model(
|
82 |
+
model_name: str,
|
83 |
+
description: str,
|
84 |
+
workflow: Workflow,
|
85 |
+
competition_type: CompetitionType,
|
86 |
+
profile: gr.OAuthProfile | None,
|
87 |
+
) -> str:
|
88 |
+
"""
|
89 |
+
Submit a tossup model for evaluation.
|
90 |
+
|
91 |
+
Args:
|
92 |
+
name: Display name of the submission
|
93 |
+
description: Detailed description of what the submission does
|
94 |
+
user_email: Email of the user who created the submission
|
95 |
+
workflow: The workflow configuration for the tossup model
|
96 |
+
|
97 |
+
Returns:
|
98 |
+
Status message
|
99 |
+
"""
|
100 |
+
|
101 |
+
if profile is None:
|
102 |
+
return styled_error("Authentication required. Please log in first to submit your model.")
|
103 |
+
|
104 |
+
username = profile.username
|
105 |
+
|
106 |
+
if len(get_user_submissions_today(username)) >= DAILY_SUBMISSION_LIMIT_PER_USER:
|
107 |
+
time_str = get_time_until_next_submission()
|
108 |
+
return styled_error(
|
109 |
+
f"Daily submission limit of {DAILY_SUBMISSION_LIMIT_PER_USER} reached. Please try again in \n {time_str}."
|
110 |
+
)
|
111 |
+
try:
|
112 |
+
submission = create_submission(
|
113 |
+
username=username,
|
114 |
+
model_name=model_name,
|
115 |
+
description=description,
|
116 |
+
workflow=workflow,
|
117 |
+
competition_type=competition_type,
|
118 |
+
)
|
119 |
+
# Convert to dictionary format
|
120 |
+
submission_dict = submission.to_dict()
|
121 |
+
|
122 |
+
# Create output directory path
|
123 |
+
out_dir = f"{EVAL_REQUESTS_PATH}/{username}"
|
124 |
+
out_path = f"{out_dir}/{submission.id}.json"
|
125 |
+
|
126 |
+
# Upload to HuggingFace dataset
|
127 |
+
API.upload_file(
|
128 |
+
path_or_fileobj=json.dumps(submission_dict, indent=2).encode(),
|
129 |
+
path_in_repo=out_path.split("eval-queue/")[1],
|
130 |
+
repo_id=QUEUE_REPO,
|
131 |
+
repo_type="dataset",
|
132 |
+
commit_message=f"Add tossup submission {submission.id}",
|
133 |
+
)
|
134 |
+
|
135 |
+
return styled_message(
|
136 |
+
f"Successfully submitted tossup model!\n"
|
137 |
+
f"Submission ID: {submission.id}\n"
|
138 |
+
f"Name: {username}/{model_name}\n"
|
139 |
+
f"Please wait for up to an hour for the model to show in the PENDING list."
|
140 |
+
)
|
141 |
+
|
142 |
+
except Exception as e:
|
143 |
+
traceback.print_exc()
|
144 |
+
return styled_error(f"Error submitting model: {str(e)}")
|
145 |
+
|
146 |
+
|
147 |
+
if __name__ == "__main__":
|
148 |
+
# Example usage
|
149 |
+
from workflows.factory import create_quizbowl_simple_step_initial_setup
|
150 |
+
|
151 |
+
# Create workflow
|
152 |
+
model_step = create_quizbowl_simple_step_initial_setup()
|
153 |
+
model_step.model = "gpt-4"
|
154 |
+
model_step.provider = "openai"
|
155 |
+
model_step.temperature = 0.7
|
156 |
+
|
157 |
+
workflow = Workflow(
|
158 |
+
inputs=["question_text"],
|
159 |
+
outputs={"answer": "A.answer", "confidence": "A.confidence"},
|
160 |
+
steps={"A": model_step},
|
161 |
+
)
|
162 |
+
|
163 |
+
# Submit model
|
164 |
+
result = submit_model(
|
165 |
+
model_name="GPT-4 Tossup",
|
166 |
+
description="A simple GPT-4 model for tossup questions",
|
167 |
+
workflow=workflow,
|
168 |
+
competition_type="tossup",
|
169 |
+
)
|
170 |
+
print(result)
|
src/utils.py
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Description: Utility functions for the model_step component.
|
2 |
+
|
3 |
+
from envs import AVAILABLE_MODELS, UNSELECTED_MODEL_NAME
|
4 |
+
|
5 |
+
|
6 |
+
def guess_model_provider(model_name: str):
|
7 |
+
"""Guess the provider of a model name."""
|
8 |
+
model_name = model_name.lower()
|
9 |
+
if model_name.startswith("gpt-"):
|
10 |
+
return "OpenAI"
|
11 |
+
if "sonnet" in model_name or "claude" in model_name:
|
12 |
+
return "Anthropic"
|
13 |
+
raise ValueError(f"Model `{model_name}` not yet supported")
|
14 |
+
|
15 |
+
|
16 |
+
def get_model_and_provider(model_name: str):
|
17 |
+
"""Get the model and provider from a model name."""
|
18 |
+
if model_name == UNSELECTED_MODEL_NAME:
|
19 |
+
return "", ""
|
20 |
+
splits = model_name.split("/", maxsplit=1)
|
21 |
+
if len(splits) == 1:
|
22 |
+
full_model_name = AVAILABLE_MODELS.get(model_name, model_name)
|
23 |
+
provider = guess_model_provider(full_model_name)
|
24 |
+
return full_model_name, provider
|
25 |
+
if len(splits) == 2:
|
26 |
+
provider, model_name = splits
|
27 |
+
full_model_name = AVAILABLE_MODELS.get(model_name, model_name)
|
28 |
+
return full_model_name, provider
|
29 |
+
raise ValueError(f"Model `{model_name}` not yet supported")
|
30 |
+
|
31 |
+
|
32 |
+
def get_full_model_name(model_name: str, provider: str = ""):
|
33 |
+
"""Get the full model name from a model name."""
|
34 |
+
if model_name == "":
|
35 |
+
return UNSELECTED_MODEL_NAME
|
36 |
+
if not provider:
|
37 |
+
provider = guess_model_provider(model_name)
|
38 |
+
return f"{provider}/{model_name}"
|
src/workflows/README.md
ADDED
@@ -0,0 +1,92 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Workflows Subpackage
|
2 |
+
|
3 |
+
This subpackage provides a framework for defining, validating, and executing workflows composed of interconnected model steps with dependency management.
|
4 |
+
|
5 |
+
## Overview
|
6 |
+
|
7 |
+
The workflows subpackage enables the creation and execution of workflows where multiple model steps can be combined, with outputs from earlier steps feeding into inputs of later steps. The package handles dependency resolution, execution order, and error handling.
|
8 |
+
|
9 |
+
## Components
|
10 |
+
|
11 |
+
### `structs.py`
|
12 |
+
|
13 |
+
Contains the core data structures used throughout the workflow system:
|
14 |
+
|
15 |
+
- `Field`: Represents an input or output field with name and type information
|
16 |
+
- `ModelStep`: Represents a single step in a workflow with input fields, output fields, and model details
|
17 |
+
- `Workflow`: A collection of ModelSteps with their identifiers
|
18 |
+
|
19 |
+
### `utils.py`
|
20 |
+
|
21 |
+
Provides utility functions for workflow operations:
|
22 |
+
|
23 |
+
- `_create_variable_step_mapping`: Maps variables to the steps that produce them
|
24 |
+
- `create_dependency_graph`: Builds a dependency graph representing the execution order constraints
|
25 |
+
- `topological_sort`: Sorts steps in execution order based on their dependencies
|
26 |
+
|
27 |
+
### `workflow_executor.py`
|
28 |
+
|
29 |
+
Handles the execution of workflows:
|
30 |
+
|
31 |
+
- Processes inputs and outputs between steps
|
32 |
+
- Coordinates the execution of model steps in the correct order
|
33 |
+
- Integrates with external model providers (e.g., via litellm)
|
34 |
+
|
35 |
+
### `errors.py`
|
36 |
+
|
37 |
+
Defines custom exceptions for workflow-related errors:
|
38 |
+
|
39 |
+
- `WorkflowError`: Base class for workflow errors
|
40 |
+
- `CyclicDependencyError`: Raised when detecting cycles in the workflow graph
|
41 |
+
- `UnknownVariableError`: Raised when a step requires a variable that's not provided or produced
|
42 |
+
|
43 |
+
## Usage Example
|
44 |
+
|
45 |
+
```python
|
46 |
+
from workflows.structs import Field, ModelStep, Workflow
|
47 |
+
|
48 |
+
# Define a workflow with two steps
|
49 |
+
step1 = ModelStep(
|
50 |
+
input_fields=[Field(name="query", type="string")],
|
51 |
+
output_fields=[Field(name="summary", type="string")],
|
52 |
+
model="gpt-3.5-turbo",
|
53 |
+
system_prompt="Summarize the following text"
|
54 |
+
)
|
55 |
+
|
56 |
+
step2 = ModelStep(
|
57 |
+
input_fields=[Field(name="summary", type="string", variable="step1.summary")],
|
58 |
+
output_fields=[Field(name="key_points", type="array")],
|
59 |
+
model="gpt-4",
|
60 |
+
system_prompt="Extract key points from the summary"
|
61 |
+
)
|
62 |
+
|
63 |
+
workflow = Workflow(steps={"step1": step1, "step2": step2})
|
64 |
+
|
65 |
+
# Execute the workflow
|
66 |
+
from workflows.workflow_executor import execute_workflow
|
67 |
+
|
68 |
+
result = execute_workflow(
|
69 |
+
workflow=workflow,
|
70 |
+
input_values={"query": "Long text to summarize..."}
|
71 |
+
)
|
72 |
+
|
73 |
+
# Access results
|
74 |
+
summary = result["step1.summary"]
|
75 |
+
key_points = result["step2.key_points"]
|
76 |
+
```
|
77 |
+
|
78 |
+
## Error Handling
|
79 |
+
|
80 |
+
The workflows system provides robust error handling:
|
81 |
+
|
82 |
+
- Detects cyclic dependencies in workflow definitions
|
83 |
+
- Validates input/output variable references
|
84 |
+
- Ensures all required inputs are provided
|
85 |
+
|
86 |
+
## Extending the Workflows System
|
87 |
+
|
88 |
+
To extend the workflows system:
|
89 |
+
|
90 |
+
1. Add new model step types by extending the `ModelStep` class
|
91 |
+
2. Create custom field types by extending validation in the execution logic
|
92 |
+
3. Implement additional error types in `errors.py` for specialized error handling
|
src/workflows/errors.py
ADDED
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Custom exceptions for workflow validation and execution errors.
|
3 |
+
|
4 |
+
This module defines the exception hierarchy for the workflows package, enabling
|
5 |
+
specific error types to be raised and caught during workflow validation and execution.
|
6 |
+
Each exception provides detailed error messages to help diagnose and fix issues in
|
7 |
+
workflow definitions or execution.
|
8 |
+
|
9 |
+
Exception hierarchy:
|
10 |
+
- WorkflowError (base class)
|
11 |
+
- UnknownVariableError (missing variable reference)
|
12 |
+
- CyclicDependencyError (circular dependencies)
|
13 |
+
- FunctionNotFoundError (missing function reference)
|
14 |
+
"""
|
15 |
+
|
16 |
+
|
17 |
+
# Define custom exceptions for workflow errors
|
18 |
+
class WorkflowError(Exception):
|
19 |
+
"""
|
20 |
+
Base exception class for all workflow-related errors.
|
21 |
+
|
22 |
+
This is the parent class for all workflow-specific exceptions and can be used
|
23 |
+
to catch any error from the workflows package.
|
24 |
+
"""
|
25 |
+
|
26 |
+
pass
|
27 |
+
|
28 |
+
|
29 |
+
class UnknownVariableError(WorkflowError):
|
30 |
+
"""
|
31 |
+
Raised when a workflow step references a variable that doesn't exist.
|
32 |
+
|
33 |
+
This typically occurs when a step's input field references a variable that is neither
|
34 |
+
provided as an external input nor produced as an output by any previous step.
|
35 |
+
"""
|
36 |
+
|
37 |
+
def __init__(self, var: str):
|
38 |
+
super().__init__(f"Unknown variable referenced: {var}")
|
39 |
+
|
40 |
+
|
41 |
+
class CyclicDependencyError(WorkflowError):
|
42 |
+
"""
|
43 |
+
Raised when a cyclic dependency is detected in a workflow.
|
44 |
+
|
45 |
+
A cyclic dependency occurs when there is a circular reference in the workflow graph,
|
46 |
+
such as step A depending on step B, which depends on step A. Such workflows cannot
|
47 |
+
be executed because there's no valid order to process the steps.
|
48 |
+
"""
|
49 |
+
|
50 |
+
def __init__(self):
|
51 |
+
super().__init__("Cyclic dependency detected in workflow")
|
52 |
+
|
53 |
+
|
54 |
+
class FunctionNotFoundError(WorkflowError):
|
55 |
+
"""
|
56 |
+
Raised when a referenced function cannot be found during workflow execution.
|
57 |
+
|
58 |
+
This typically occurs when a step references a function that doesn't exist in
|
59 |
+
the available function registry or namespace.
|
60 |
+
"""
|
61 |
+
|
62 |
+
def __init__(self, func_name: str):
|
63 |
+
super().__init__(f"Function not found: {func_name}")
|
src/workflows/executors.py
ADDED
@@ -0,0 +1,440 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# %%
|
2 |
+
import json
|
3 |
+
from typing import Any
|
4 |
+
|
5 |
+
import pydantic
|
6 |
+
|
7 |
+
from llms import completion
|
8 |
+
from workflows.errors import WorkflowError
|
9 |
+
from workflows.structs import InputField, ModelStep, OutputField, Workflow
|
10 |
+
from workflows.utils import create_dependency_graph, topological_sort
|
11 |
+
|
12 |
+
"""
|
13 |
+
Core workflow execution functionality.
|
14 |
+
|
15 |
+
This module handles the execution of defined workflows, including input processing,
|
16 |
+
dependency-based execution order, model calling, and output collection. It integrates
|
17 |
+
with the litellm library to handle model interactions.
|
18 |
+
|
19 |
+
Key components:
|
20 |
+
- Utility functions for input/output transformation
|
21 |
+
- Input processing and validation
|
22 |
+
- Model step execution
|
23 |
+
- Complete workflow execution with dependency resolution
|
24 |
+
|
25 |
+
The module orchestrates the execution of steps in the correct order based on their
|
26 |
+
dependencies and manages the flow of data between steps.
|
27 |
+
"""
|
28 |
+
|
29 |
+
|
30 |
+
def upper(x):
    """Uppercase ``x`` when it is a string; any other value passes through unchanged."""
    return x.upper() if isinstance(x, str) else x
|
34 |
+
|
35 |
+
|
36 |
+
def lower(x):
    """Lowercase ``x`` when it is a string; any other value passes through unchanged."""
    return x.lower() if isinstance(x, str) else x
|
40 |
+
|
41 |
+
|
42 |
+
# Mapping from the type-name strings used in OutputField.type to Python types.
# Names not listed here fall through to eval() in get_type() (e.g. "list[str]").
TYPE_MAP = {
    "str": str,
    "int": int,
    "float": float,
    "bool": bool,
}

# Registry of transformation functions referenceable by name from
# InputField.func / OutputField.func. Names not listed here fall through to
# eval() in create_processed_inputs().
FUNCTION_MAP = {
    "upper": upper,
    "lower": lower,
    "len": len,
    "split": str.split,
}
|
55 |
+
|
56 |
+
|
57 |
+
def get_type(type_str: str) -> type:
    """Resolve a type-name string (e.g. ``"int"``, ``"list[str]"``) to a Python type.

    Simple names are looked up in ``TYPE_MAP``; anything else falls back to
    evaluating the string, which is what allows generic forms like
    ``"list[str]"`` to work.

    Args:
        type_str: The type name as written in an OutputField definition.

    Returns:
        The resolved Python type (or typing generic alias).

    Raises:
        ValueError: If the string cannot be resolved to a type. (Previously an
            unknown name leaked a bare ``NameError`` from ``eval``.)
    """
    if type_str in TYPE_MAP:
        return TYPE_MAP[type_str]
    try:
        # SECURITY: eval of a workflow-supplied string. Acceptable only because
        # workflow definitions are trusted; never expose this to untrusted input.
        return eval(type_str)  # noqa: S307
    except Exception as e:
        raise ValueError(f"Unknown type string: {type_str!r}") from e
|
59 |
+
|
60 |
+
|
61 |
+
def create_processed_inputs(model_step: ModelStep, available_vars: dict[str, Any]) -> dict[str, Any]:
    """
    Creates processed inputs for a model step.

    This function extracts and processes the required inputs for a model step based on
    its input field definitions. It retrieves values from the available variables dictionary
    and applies any specified transformations.

    Args:
        model_step (ModelStep): The model step for which to create processed inputs.
        available_vars (dict[str, Any]): Dictionary of variables available for use as inputs.
            Keys are variable names, values are the variable values.

    Returns:
        dict[str, Any]: A dictionary of processed inputs ready for use by the model step.
            Keys are input field names, values are the processed input values.

    Raises:
        WorkflowError: If a required variable is not found in available_vars,
            or if a specified transformation function is not available.

    Example:
        >>> available_vars = {"step1.output": "Hello World"}
        >>> create_processed_inputs(model_step, available_vars)
        {"input_field_name": "HELLO WORLD"}  # If upper transformation was specified
    """
    processed_inputs: dict[str, Any] = {}
    for input_field in model_step.input_fields:
        var = input_field.variable
        # BUGFIX: previously a missing variable raised a bare KeyError even though
        # the documented contract is WorkflowError.
        if var not in available_vars:
            raise WorkflowError(
                f"Variable '{var}' required by input field '{input_field.name}' is not available"
            )
        value = available_vars[var]
        if input_field.func is not None:
            func = FUNCTION_MAP.get(input_field.func)
            if func is None:
                # Fall back to eval for functions not in the registry; wrap any
                # resolution failure (previously a bare NameError) in WorkflowError.
                try:
                    func = eval(input_field.func)  # noqa: S307 - trusted workflow definitions only
                except Exception as e:
                    raise WorkflowError(f"Function '{input_field.func}' is not available") from e
            value = func(value)
        processed_inputs[input_field.name] = value
    return processed_inputs
|
97 |
+
|
98 |
+
|
99 |
+
# %%
|
100 |
+
def execute_model_step(
    model_step: ModelStep, available_vars: dict[str, Any], return_full_content: bool = False
) -> dict[str, Any] | tuple[dict[str, Any], str]:
    """
    Executes a model step using the provided available variables.

    This function handles the complete execution of a model step, including:
    1. Processing inputs using variable references and transformations
    2. Constructing the appropriate prompt for the model
    3. Calling the model via litellm with structured output
    4. Processing and validating the model's response
    5. Applying any output transformations

    The function supports different providers and model types through the litellm
    integration, allowing for a consistent interface regardless of the underlying model.

    Args:
        model_step (ModelStep): The model step to execute, containing model details,
            input/output specifications, and system prompt.
        available_vars (dict[str, Any]): A dictionary of all variables available to this step,
            including outputs from previous steps and external inputs.
        return_full_content (bool): When True, also return the raw model content
            alongside the parsed outputs.

    Returns:
        dict[str, Any]: A dictionary of processed outputs from the model step,
            with keys matching the output field names. When return_full_content
            is True, a tuple of (outputs, raw_content) is returned instead.

    Raises:
        WorkflowError: If there's an error in input processing, model execution,
            or output validation.

    Example:
        >>> step = ModelStep(
        ...     id="summarize",
        ...     model="gpt-3.5-turbo",
        ...     provider="openai",
        ...     call_type="llm",
        ...     system_prompt="Summarize the text",
        ...     input_fields=[InputField(name="text", variable="input_text", description="Text to summarize")],
        ...     output_fields=[OutputField(name="summary", type="str", description="Summary of the text")]
        ... )
        >>> execute_model_step(step, {"input_text": "Long text to be summarized..."})
        {"summary": "A concise summary of the text."}
    """
    # Ensure inputs are processed using the specified functions in input_fields.
    processed_inputs = create_processed_inputs(model_step, available_vars)

    # Construct the input prompt for the model. NOTE(review): the system prompt
    # is embedded here AND passed separately as `system=` below — confirm the
    # duplication is intentional for the llms.completion wrapper.
    input_str = ", ".join(f"{k}={v}" for k, v in processed_inputs.items())
    step_result = f"{model_step.system_prompt} | Inputs: {input_str}"

    # Define the expected output fields and their types as a dynamic pydantic
    # model so the provider can enforce structured output.
    fields = {
        field.name: (get_type(field.type), pydantic.Field(..., description=field.description))
        for field in model_step.output_fields
    }
    ModelResponse = pydantic.create_model("ModelResponse", **fields)

    # Execute the model step via the project's llms.completion wrapper
    # (provider-prefixed model id, e.g. "OpenAI/gpt-4o-mini").
    api_response = completion(
        model=f"{model_step.provider}/{model_step.model}",
        system=model_step.system_prompt,
        prompt=step_result,
        response_format=ModelResponse,
    )

    # Extract the parsed model response. Assumes the llms.completion wrapper
    # returns a mapping with "output" (parsed fields) and "content" (raw text)
    # keys — TODO confirm against the llms module.
    model_response = api_response["output"]
    # Map the parsed response to the output fields
    outputs = {field.name: model_response[field.name] for field in model_step.output_fields}
    if return_full_content:
        return outputs, api_response["content"]
    return outputs
|
179 |
+
|
180 |
+
|
181 |
+
# Example usage. NOTE(review): this module contains a SECOND
# `if __name__ == "__main__":` guard further down (near run_examples());
# both execute when the module is run as a script — consider merging them.
if __name__ == "__main__":
    # Define a simple model step
    model_step = ModelStep(
        id="step1",
        model="gpt-4o-mini",
        provider="OpenAI",
        call_type="llm",
        system_prompt="You are a simple NLP tool that takes a string, and a number N, and return the first N entities in the string, and the total count of entities in the string.",
        input_fields=[
            InputField(name="sentence", description="The sentence to process", variable="sentence", func=None),
            InputField(name="n", description="The number of entities to return", variable="n", func=None),
        ],
        output_fields=[
            OutputField(
                name="entities",
                description="The first N entities in the string as a list of strings",
                type="list[str]",
                func=None,
            ),
            OutputField(name="count", description="The total count of entities in the string", type="int", func=None),
        ],
    )

    # Define processed inputs
    processed_inputs = {"sentence": "Abdul Akbar is a good person, but Jesus is the son of God.", "n": 3}

    # Execute the model step (performs a live LLM call; requires credentials)
    outputs = execute_model_step(model_step, processed_inputs)
    print(outputs)
|
211 |
+
|
212 |
+
|
213 |
+
# %%
|
214 |
+
def execute_workflow(
    workflow: Workflow, input_values: dict[str, Any], return_full_content: bool = False
) -> dict[str, Any] | tuple[dict[str, Any], str]:
    """
    Execute the given workflow as a computational graph.

    This function orchestrates the complete execution of a workflow by:

    1. Validating and populating initial values using the provided external inputs
    2. Building a dependency graph between workflow steps
    3. Determining a valid execution order using topological sorting
    4. Executing each step in the correct order, with inputs from previous steps
    5. Collecting and returning the final outputs

    Args:
        workflow (Workflow): The workflow to execute, containing steps, their
            dependencies, and input/output specifications.
        input_values (dict[str, Any]): External input values to be used by the workflow.
            Keys should match the required workflow.inputs.
        return_full_content (bool): When True, also return the raw model content of
            every executed step, concatenated in execution order.

    Returns:
        dict[str, Any]: A dictionary of the workflow's outputs, with keys matching
            the variables defined in workflow.outputs. When return_full_content is
            True, a tuple of (outputs, full_content) is returned instead.

    Raises:
        UnknownVariableError: If an input_field references a variable that is not
            provided externally nor produced by any step.
        CyclicDependencyError: If the workflow contains a circular dependency that
            prevents a valid execution order.
        FunctionNotFoundError: If a transformation function specified in input_fields.func
            or output_fields.func is not available.
        WorkflowError: For any other workflow-related errors, such as missing required inputs.
    """
    # Step 1: Pre-populate computed values with external workflow inputs.
    computed_values: dict[str, Any] = {}
    for var in workflow.inputs:
        if var not in input_values:
            raise WorkflowError(f"Missing required workflow input: {var}")
        computed_values[var] = input_values[var]

    # Step 2: Build dependency graph among model steps.
    dependencies = create_dependency_graph(workflow, input_values)

    # Step 3: Determine the execution order of the steps using topological sort.
    # Raises an error if a cycle is detected.
    execution_order = topological_sort(dependencies)

    # Step 4: Execute steps in topological order.
    # BUGFIX: `return_full_content` was previously accepted but ignored, so callers
    # expecting a (outputs, content) tuple (e.g. simple_agent._get_workflow_response)
    # silently unpacked the outputs dict. We now collect per-step raw content.
    step_contents: list[str] = []
    for step_id in execution_order:
        step = workflow.steps[step_id]

        # Execute the step; namespace its outputs as "<step_id>.<field>".
        if return_full_content:
            outputs, content = execute_model_step(step, computed_values, return_full_content=True)
            step_contents.append(content)
        else:
            outputs = execute_model_step(step, computed_values)
        computed_values.update({f"{step_id}.{k}": v for k, v in outputs.items()})

    # Step 5: Gather and return workflow outputs.
    final_outputs: dict[str, Any] = {}
    for target, var in workflow.outputs.items():
        if var not in computed_values:
            raise WorkflowError(f"Workflow output variable {var} was not produced")
        final_outputs[target] = computed_values[var]

    if return_full_content:
        return final_outputs, "\n".join(step_contents)
    return final_outputs
|
299 |
+
|
300 |
+
|
301 |
+
def run_examples():
    """
    Runs three example workflows demonstrating:
    1. A successful (linear) workflow execution.
    2. A cyclic dependency error.
    3. An unknown variable dependency error.

    Each example prints either the workflow outputs or the WorkflowError raised.
    Example 1 performs live LLM calls and therefore needs provider credentials.
    """
    print("Example 1: Successful Workflow Execution")
    # Example 1: Simple linear workflow.
    # External input "input.value" is provided. Two steps:
    # - step1 takes "input.value" and produces "step1.result".
    # - step2 uses "step1.result" and produces "step2.final".
    from workflows.structs import ModelStep, Workflow

    workflow_success = Workflow(
        steps={
            "step1": ModelStep(
                id="step1",
                model="gpt-4o-mini",
                provider="OpenAI",
                call_type="llm",
                system_prompt="Step1 processing",
                input_fields=[InputField(name="value", description="Input value", variable="input.value")],
                output_fields=[OutputField(name="result", description="Processed result", type="str", func="upper")],
            ),
            "step2": ModelStep(
                id="step2",
                model="gpt-4o-mini",
                provider="OpenAI",
                call_type="llm",
                system_prompt="Step2 processing",
                input_fields=[InputField(name="result", description="Result from step1", variable="step1.result")],
                output_fields=[OutputField(name="final", description="Final output", type="str", func="lower")],
            ),
        },
        inputs=["input.value"],
        outputs={"final": "step2.final"},
    )
    input_values_success = {"input.value": "Hello, World!"}
    try:
        outputs = execute_workflow(workflow_success, input_values_success)
        print("Workflow outputs:", outputs)
    except WorkflowError as e:
        print("Workflow failed with error:", e)

    print("\nExample 2: Cyclic Dependency Workflow")
    # Example 2: Cyclic dependency.
    # stepA depends on an output from stepB and vice versa; topological sort
    # should detect the cycle and raise.
    workflow_cycle = Workflow(
        steps={
            "stepA": ModelStep(
                id="stepA",
                model="gpt-4o-mini",
                provider="OpenAI",
                call_type="llm",
                system_prompt="StepA processing",
                input_fields=[
                    InputField(name="input", description="Input from stepB", variable="stepB.output", func="identity")
                ],
                output_fields=[OutputField(name="output", description="Output from A", type="str", func="upper")],
            ),
            "stepB": ModelStep(
                id="stepB",
                model="gpt-4o-mini",
                provider="OpenAI",
                call_type="llm",
                system_prompt="StepB processing",
                input_fields=[
                    InputField(name="input", description="Input from stepA", variable="stepA.output", func="identity")
                ],
                output_fields=[OutputField(name="output", description="Output from B", type="str", func="upper")],
            ),
        },
        inputs=[],  # no external inputs
        outputs={"output": "stepB.output"},
    )
    try:
        outputs = execute_workflow(workflow_cycle, {})
        print("Workflow outputs:", outputs)
    except WorkflowError as e:
        print("Workflow failed with error:", e)

    print("\nExample 3: Unknown Variable Dependency Workflow")
    # Example 3: A workflow that references a variable not provided as an input or produced by any step.
    workflow_unknown = Workflow(
        steps={
            "stepX": ModelStep(
                id="stepX",
                model="gpt-4o-mini",
                provider="OpenAI",
                call_type="llm",
                system_prompt="StepX processing",
                input_fields=[
                    InputField(
                        name="input", description="Non-existent input", variable="nonexistent.value", func="identity"
                    )
                ],
                output_fields=[OutputField(name="output", description="Output from X", type="str", func="upper")],
            )
        },
        inputs=[],  # no external inputs
        outputs={"output": "stepX.output"},
    )
    try:
        outputs = execute_workflow(workflow_unknown, {})
        print("Workflow outputs:", outputs)
    except WorkflowError as e:
        print("Workflow failed with error:", e)
|
409 |
+
|
410 |
+
|
411 |
+
# NOTE(review): this is the module's SECOND `__main__` guard (an earlier one
# appears above execute_workflow); both run when the module is executed as a
# script, and this one rebuilds the same example ModelStep — consider merging.
if __name__ == "__main__":
    # create example of model_step
    model_step = ModelStep(
        id="step1",
        model="gpt-4o-mini",
        provider="OpenAI",
        call_type="llm",
        system_prompt="You are a simple NLP tool that takes a string, and a number N, and return the first N entities in the string, and the total count of entities in the string.",
        input_fields=[
            InputField(name="sentence", description="The sentence to process", variable="sentence", func=None),
            InputField(name="n", description="The number of entities to return", variable="n", func=None),
        ],
        output_fields=[
            OutputField(
                name="entities",
                description="The first N entities in the string as a list of strings",
                type="list[str]",
                func=None,
            ),
            OutputField(name="count", description="The total count of entities in the string", type="int", func=None),
        ],
    )

    processed_inputs = {"sentence": "Abdul Akbar is a good person, but Jesus is the son of God.", "n": 3}
    # Demonstrates input processing only (no LLM call); then run the examples,
    # which do perform live LLM calls.
    processed_inputs = create_processed_inputs(model_step, processed_inputs)
    print(processed_inputs)

    run_examples()
|
439 |
+
|
440 |
+
# %%
|
src/workflows/factory.py
ADDED
@@ -0,0 +1,150 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# %%
|
2 |
+
from workflows.structs import Field, InputField, ModelStep, OutputField, Workflow
|
3 |
+
|
4 |
+
INITIAL_SYS_PROMPT = """You are a helpful performant question answering bot.
|
5 |
+
Given a question clue, output your most likely guess in a couple words with a calibrated confidence for the guess.
|
6 |
+
"""
|
7 |
+
|
8 |
+
|
9 |
+
def create_simple_workflow():
    # TODO: unimplemented stub with no visible caller in this module; either
    # implement it (cf. create_quizbowl_simple_workflow below) or remove it.
    pass
|
11 |
+
|
12 |
+
|
13 |
+
def create_first_step_input_fields() -> list[InputField]:
    """Build the default input-field list for the first step of a pipeline."""
    question_field = InputField(
        name="question",
        description="The question text progressively revealed to the agent so far.",
        variable="question_text",
    )
    return [question_field]
|
21 |
+
|
22 |
+
|
23 |
+
def create_empty_input_field() -> list[InputField]:
    """Return a single blank input field pre-wired to the question text variable."""
    blank_field = InputField(name="", description="", variable="question_text")
    return [blank_field]
|
25 |
+
|
26 |
+
|
27 |
+
def create_quizbowl_simple_step_initial_setup():
    """Build the default single ModelStep for a simple tossup pipeline.

    The model and provider are deliberately left blank so the caller (e.g. the
    UI) can fill them in; the step answers a question and reports a confidence.
    """
    return ModelStep(
        id="simple_step",
        name="Quizbowl Simple Step",
        model="",
        provider="",
        temperature=0.7,
        call_type="llm",
        system_prompt=INITIAL_SYS_PROMPT,
        input_fields=[
            InputField(name="question", description="The question to answer", variable="question"),
        ],
        output_fields=[
            OutputField(name="answer", description="The most likely answer", type="str"),
            OutputField(name="confidence", description="The confidence of the answer", type="float"),
        ],
    )
|
44 |
+
|
45 |
+
|
46 |
+
def create_new_llm_step(step_id: str, name: str) -> ModelStep:
    """Create a blank LLM step with default model settings (gpt-4o / OpenAI).

    Args:
        step_id: Unique identifier for the new step.
        name: Human-readable step name.

    Returns:
        A ModelStep with one empty input field and one empty output field,
        ready to be filled in by the user.
    """
    return ModelStep(
        id=step_id,
        name=name,
        model="gpt-4o",
        provider="OpenAI",
        call_type="llm",
        temperature=0.7,
        system_prompt="",
        input_fields=create_empty_input_field(),
        output_fields=[OutputField(name="", description="")],
    )
|
58 |
+
|
59 |
+
|
60 |
+
def create_first_llm_step() -> ModelStep:
    """Create the initial LLM step ("A") for a new pipeline.

    Returns:
        A ModelStep whose input is the progressively revealed question text and
        whose single output field is left blank for the user to define.
    """
    return ModelStep(
        id="A",
        name="",
        model="gpt-4o",
        provider="OpenAI",
        call_type="llm",
        temperature=0.7,
        system_prompt="",
        # BUGFIX: create_first_step_input_fields() already returns a list of
        # InputField; the previous `[create_first_step_input_fields()]` nested
        # it into list[list[InputField]], which is not a valid input_fields value.
        input_fields=create_first_step_input_fields(),
        output_fields=[OutputField(name="", description="")],
    )
|
72 |
+
|
73 |
+
|
74 |
+
def create_quizbowl_simple_workflow():
    """Build a one-step tossup workflow.

    The single step "A" maps the external input `question_text` to an answer
    string and a float confidence, exposed as workflow outputs "answer" and
    "confidence".
    """
    return Workflow(
        inputs=["question_text"],
        outputs={"answer": "A.answer", "confidence": "A.confidence"},
        steps={
            "A": ModelStep(
                id="A",
                name="Tossup Agent",
                model="gpt-4o-mini",
                provider="OpenAI",
                call_type="llm",
                temperature=0.3,
                system_prompt="You are a helpful assistant that can answer questions.",
                input_fields=[InputField(name="question", description="The question text", variable="question_text")],
                output_fields=[
                    OutputField(
                        name="answer",
                        description="The best guess at the answer to the question",
                        type="str",
                    ),
                    OutputField(
                        name="confidence",
                        description="The confidence in the answer, ranging from 0 to 1 in increments of 0.05.",
                        type="float",
                    ),
                ],
            )
        },
    )
|
103 |
+
|
104 |
+
|
105 |
+
BONUS_SYS_PROMPT = """You are a quizbowl player answering bonus questions. For each part:
|
106 |
+
1. Read the leadin and part carefully
|
107 |
+
2. Provide a concise answer
|
108 |
+
3. Rate your confidence (0-1)
|
109 |
+
4. Explain your reasoning
|
110 |
+
|
111 |
+
Format your response as:
|
112 |
+
ANSWER: <your answer>
|
113 |
+
CONFIDENCE: <0-1>
|
114 |
+
EXPLANATION: <your reasoning>"""
|
115 |
+
|
116 |
+
|
117 |
+
def create_quizbowl_bonus_simple_workflow() -> Workflow:
    """Create a simple model step for bonus questions.

    The single step "A" consumes the external inputs `leadin` and `part` and
    produces an answer, a confidence in [0, 1], and a short explanation, all
    exposed as workflow outputs.
    """
    return Workflow(
        inputs=["leadin", "part"],
        outputs={"answer": "A.answer", "confidence": "A.confidence", "explanation": "A.explanation"},
        steps={
            "A": ModelStep(
                id="A",
                name="Bonus Agent",
                model="gpt-4o-mini",
                provider="OpenAI",
                temperature=0.3,
                call_type="llm",
                system_prompt=BONUS_SYS_PROMPT,
                input_fields=[
                    InputField(
                        name="question_leadin",
                        description="The leadin text for the bonus question",
                        variable="leadin",
                    ),
                    InputField(
                        name="question_part",
                        description="The specific part text to answer",
                        variable="part",
                    ),
                ],
                output_fields=[
                    OutputField(name="answer", description="The predicted answer", type="str"),
                    OutputField(name="confidence", description="Confidence in the answer (0-1)", type="float"),
                    OutputField(name="explanation", description="Short explanation for the answer", type="str"),
                ],
            )
        },
    )
|
src/workflows/qb/__init__.py
ADDED
File without changes
|
src/workflows/qb/simple_agent.py
ADDED
@@ -0,0 +1,194 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import time
|
2 |
+
from typing import Any, Iterable
|
3 |
+
|
4 |
+
# from litellm import completion
|
5 |
+
from llms import completion
|
6 |
+
from workflows.executors import execute_model_step, execute_workflow
|
7 |
+
from workflows.structs import ModelStep, Workflow
|
8 |
+
|
9 |
+
|
10 |
+
def _get_agent_response(self, prompt: str, system_prompt: str) -> tuple:
    """Get response from the LLM model.

    NOTE(review): this is a module-level function yet it takes ``self`` and
    reads ``self.model`` / ``self.temperature`` — it appears to be a method
    that was moved out of a class. No caller is visible in this module;
    confirm it is still needed before relying on it. Also note it returns a
    (response, elapsed_seconds) tuple, not the dict its original annotation
    claimed.
    """
    messages = [{"role": "system", "content": system_prompt}, {"role": "user", "content": prompt}]

    start_time = time.time()
    response = completion(
        model=self.model,
        messages=messages,
        temperature=self.temperature,
        max_tokens=150,  # Limit token usage for faster responses
    )
    response_time = time.time() - start_time

    return response, response_time
|
24 |
+
|
25 |
+
|
26 |
+
def _get_model_step_response(
    model_step: ModelStep, available_vars: dict[str, Any]
) -> tuple[dict[str, Any], str, float]:
    """Execute one model step and time the call.

    Returns:
        A (outputs, raw_content, elapsed_seconds) tuple.
    """
    started_at = time.time()
    outputs, raw_content = execute_model_step(model_step, available_vars, return_full_content=True)
    elapsed = time.time() - started_at
    return outputs, raw_content, elapsed
|
34 |
+
|
35 |
+
|
36 |
+
def _get_workflow_response(workflow: Workflow, available_vars: dict[str, Any]) -> tuple[dict[str, Any], str, float]:
    """Execute a whole workflow and time the call.

    Returns:
        A (outputs, raw_content, elapsed_seconds) tuple.
    """
    started_at = time.time()
    outputs, raw_content = execute_workflow(workflow, available_vars, return_full_content=True)
    elapsed = time.time() - started_at
    return outputs, raw_content, elapsed
|
42 |
+
|
43 |
+
|
44 |
+
class SimpleTossupAgent:
    """Wraps a single-step workflow and buzzes once confidence crosses a threshold."""

    external_input_variable = "question_text"
    # Output variables every tossup workflow must expose.
    output_variables = ["answer", "confidence"]

    def __init__(self, workflow: Workflow, buzz_threshold: float):
        steps = list(workflow.steps.values())
        assert len(steps) == 1, "Only one step is allowed in a simple workflow"
        self.model_step = steps[0]
        self.buzz_threshold = buzz_threshold

        if self.external_input_variable not in workflow.inputs:
            raise ValueError(f"External input variable {self.external_input_variable} not found in model step inputs")

        # BUGFIX: validate the *required* class-level output variables before
        # adopting the workflow's own output list. Previously the instance
        # attribute was overwritten first, which made this check a no-op
        # (it compared workflow.outputs' keys against workflow.outputs).
        for out_var in type(self).output_variables:
            if out_var not in workflow.outputs:
                raise ValueError(f"Output variable {out_var} not found in the workflow outputs")
        self.output_variables = list(workflow.outputs.keys())

    def run(self, question_runs: list[str], early_stop: bool = True) -> Iterable[dict]:
        """
        Process a tossup question and decide when to buzz based on confidence.

        Args:
            question_runs: Progressive reveals of the question text
            early_stop: Whether to stop after the first buzz

        Yields:
            Dict with answer, confidence, and whether to buzz
        """
        for i, question_text in enumerate(question_runs):
            response, content, response_time = _get_model_step_response(
                self.model_step, {self.external_input_variable: question_text}
            )
            buzz = response["confidence"] >= self.buzz_threshold
            result = {
                "answer": response["answer"],
                "confidence": response["confidence"],
                "buzz": buzz,
                "question_fragment": question_text,
                "position": i + 1,
                "full_response": content,
                "response_time": response_time,
            }

            yield result

            # If we've reached the confidence threshold, buzz and stop
            if early_stop and buzz:
                return
|
94 |
+
|
95 |
+
|
96 |
+
class SimpleBonusAgent:
    """Wraps a single-step workflow that answers one bonus part at a time."""

    external_input_variables = ["leadin", "part"]
    # Output variables every bonus workflow must expose.
    output_variables = ["answer", "confidence", "explanation"]

    def __init__(self, workflow: Workflow):
        steps = list(workflow.steps.values())
        assert len(steps) == 1, "Only one step is allowed in a simple workflow"
        self.model_step = steps[0]

        # Validate input variables
        for input_var in self.external_input_variables:
            if input_var not in workflow.inputs:
                raise ValueError(f"External input variable {input_var} not found in model step inputs")

        # BUGFIX: validate the *required* class-level output variables before
        # adopting the workflow's own output list. Previously the instance
        # attribute was overwritten first, which made this check a no-op.
        for out_var in type(self).output_variables:
            if out_var not in workflow.outputs:
                raise ValueError(f"Output variable {out_var} not found in the workflow outputs")
        self.output_variables = list(workflow.outputs.keys())

    def run(self, leadin: str, part: str) -> dict:
        """
        Process a bonus part with the given leadin.

        Args:
            leadin: The leadin text for the bonus question
            part: The specific part text to answer

        Returns:
            Dict with answer, confidence, and explanation
        """
        response, content, response_time = _get_model_step_response(
            self.model_step,
            {
                "leadin": leadin,
                "part": part,
            },
        )

        return {
            "answer": response["answer"],
            "confidence": response["confidence"],
            "explanation": response["explanation"],
            "full_response": content,
            "response_time": response_time,
        }
|
142 |
+
|
143 |
+
|
144 |
+
# Example usage
|
145 |
+
if __name__ == "__main__":
    # Load the Quizbowl dataset
    from datasets import load_dataset

    # BUGFIX: `create_quizbowl_bonus_step_initial_setup` does not exist in
    # workflows.factory (only the *workflow* factories do), and both agents
    # require a Workflow — the previous code passed a bare ModelStep.
    from workflows.factory import create_quizbowl_bonus_simple_workflow, create_quizbowl_simple_workflow

    ds_name = "umdclip/leaderboard_co_set"
    ds = load_dataset(ds_name, split="train")

    # Create the agents, overriding the factory-default model with gpt-4.
    tossup_workflow = create_quizbowl_simple_workflow()
    for step in tossup_workflow.steps.values():
        step.model = "gpt-4"
        step.provider = "openai"
    tossup_agent = SimpleTossupAgent(workflow=tossup_workflow, buzz_threshold=0.9)

    bonus_workflow = create_quizbowl_bonus_simple_workflow()
    for step in bonus_workflow.steps.values():
        step.model = "gpt-4"
        step.provider = "openai"
    bonus_agent = SimpleBonusAgent(workflow=bonus_workflow)

    # Example for tossup mode
    print("\n=== TOSSUP MODE EXAMPLE ===")
    sample_question = ds[30]
    print(sample_question["question_runs"][-1])
    print(sample_question["gold_label"])
    print()
    question_runs = sample_question["question_runs"]

    results = tossup_agent.run(question_runs, early_stop=True)
    for result in results:
        print(result["full_response"])
        print(f"Guess at position {result['position']}: {result['answer']}")
        print(f"Confidence: {result['confidence']}")
        if result["buzz"]:
            print("Buzzed!\n")

    # Example for bonus mode
    print("\n=== BONUS MODE EXAMPLE ===")
    sample_bonus = ds[31]  # Assuming this is a bonus question
    leadin = sample_bonus["leadin"]
    parts = sample_bonus["parts"]

    print(f"Leadin: {leadin}")
    for i, part in enumerate(parts):
        print(f"\nPart {i + 1}: {part['part']}")
        result = bonus_agent.run(leadin, part["part"])
        print(f"Answer: {result['answer']}")
        print(f"Confidence: {result['confidence']}")
        print(f"Explanation: {result['explanation']}")
        print(f"Response time: {result['response_time']:.2f}s")
|
src/workflows/quizbowl_agent.py
ADDED
@@ -0,0 +1,269 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# %%
import json
import os
import time
from typing import Dict, Iterable, List, Optional, Tuple, Union

import litellm
from datasets import load_dataset
from litellm import completion

# Drop sampling parameters that the selected provider/model does not support
# instead of erroring out.
litellm.drop_params = True

# SECURITY FIX: a live "sk-proj-..." API key was previously hard-coded here and
# committed to the repository. Never embed secrets in source. The key must be
# supplied via the OPENAI_API_KEY environment variable (shell export, .env
# loader, or deployment secret). The previously committed key must be revoked.
if not os.environ.get("OPENAI_API_KEY"):
    # Warn early rather than failing on the first LLM call with a cryptic error.
    print("WARNING: OPENAI_API_KEY is not set; LLM calls will fail until it is provided.")

# Default system prompt for tossup mode: single best answer plus a calibrated
# confidence, returned as strict JSON so it can be machine-parsed.
DEFAULT_SYS_PROMPT = """
You are a Quizbowl expert. You will be given a question that's progressively revealed.
Your goal is to identify the answer as quickly as possible with high confidence.
Respond with a JSON object with two fields:
1. "answer": Your best guess for the answer
2. "confidence": Your confidence in your answer from 0.0 to 1.0

DO NOT include any explanation. ONLY return the JSON object.
"""
|
27 |
+
|
28 |
+
|
29 |
+
class QuizbowlAgent:
    """
    An agent for playing Quizbowl with two modes:

    1. Tossup mode: Fast and direct with confidence calibration for buzzing
    2. Bonus round mode: Provides guess, rationale, and confidence
    """

    def __init__(
        self,
        model: str = "gpt-4o-mini",
        buzz_threshold: float = 0.85,
        temperature: float = 0.2,
        system_prompt: str = DEFAULT_SYS_PROMPT,
    ):
        """
        Initialize the QuizbowlAgent.

        Args:
            model: The LLM model to use for answering
            buzz_threshold: Confidence threshold for buzzing in tossup mode (0-1)
            temperature: Temperature for model sampling
            system_prompt: System prompt used by the tossup modes
        """
        self.model = model
        self.buzz_threshold = buzz_threshold
        self.temperature = temperature
        self.system_prompt = system_prompt

    def _process_question_runs(self, question_runs: List[str]) -> List[str]:
        """Process question runs to extract increasing amounts of text."""
        # For simpler testing, just return the runs as they are in the dataset
        return question_runs

    def _get_agent_response(self, prompt: str, system_prompt: str):
        """Call the LLM and measure the call latency.

        Args:
            prompt: The user message to send.
            system_prompt: The system message to send.

        Returns:
            Tuple of ``(response, response_time)`` where ``response`` is the raw
            litellm completion response and ``response_time`` is the wall-clock
            duration of the call in seconds. (The original ``-> Dict`` annotation
            was wrong; callers must unpack this tuple.)
        """
        messages = [{"role": "system", "content": system_prompt}, {"role": "user", "content": prompt}]

        start_time = time.time()
        response = completion(
            model=self.model,
            messages=messages,
            temperature=self.temperature,
            max_tokens=150,  # Limit token usage for faster responses
        )
        response_time = time.time() - start_time

        return response, response_time

    def _extract_confidence_and_answer(self, content: str) -> Tuple[str, float]:
        """Extract the answer and confidence score from the model response.

        Tries strict JSON first. On failure, falls back to treating the first
        line as the answer and scanning for a "confidence:" line (default 0.5).
        """
        try:
            # Try to parse JSON from the response
            data = json.loads(content)
            answer = data.get("answer", "")
            confidence = float(data.get("confidence", 0.0))
            return answer, confidence
        except (json.JSONDecodeError, ValueError):
            # Fallback if parsing fails
            lines = content.strip().split("\n")
            answer = lines[0] if lines else ""
            confidence = 0.5  # Default confidence

            # Try to extract confidence from text
            for line in lines:
                if "confidence:" in line.lower():
                    try:
                        confidence = float(line.lower().split("confidence:")[1].strip())
                    except (ValueError, IndexError):
                        pass

            return answer, confidence

    def tossup_mode(self, question_runs: List[str]) -> Iterable[Dict]:
        """
        Process a tossup question and decide when to buzz based on confidence.

        Args:
            question_runs: Progressive reveals of the question text

        Yields:
            Dict with answer, confidence, and whether to buzz
        """

        for i, question_text in enumerate(question_runs):
            prompt = f"Question: {question_text}\n\nProvide your answer and confidence level:"

            # FIX: honor the system prompt configured on this agent instead of
            # always using the module-level DEFAULT_SYS_PROMPT (which silently
            # ignored a custom `system_prompt` passed to __init__).
            response, response_time = self._get_agent_response(prompt, self.system_prompt)
            content = response.choices[0].message.content

            answer, confidence = self._extract_confidence_and_answer(content)

            result = {
                "answer": answer,
                "confidence": confidence,
                "buzz": confidence >= self.buzz_threshold,
                "question_fragment": question_text,
                "position": i + 1,
                "full_response": content,
                "response_time": response_time,
            }

            yield result

            # If we've reached the confidence threshold, buzz and stop
            if confidence >= self.buzz_threshold:
                return

    def tossup_mode_top5(self, question_runs: List[str]) -> Iterable[Dict]:
        """
        Process a tossup question and provide the top 5 guesses with confidence levels.

        Args:
            question_runs: Progressive reveals of the question text

        Yields:
            Dict with top 5 guesses, their confidences, and whether to buzz
        """

        for i, question_text in enumerate(question_runs):
            prompt = f"Question: {question_text}\n\nProvide your top 5 answers and confidence levels."

            response, response_time = self._get_agent_response(prompt, self.system_prompt)
            content = response.choices[0].message.content

            try:
                # Try to parse JSON from the response
                data = json.loads(content)
                guesses = data.get("guesses", [])
            except (json.JSONDecodeError, ValueError):
                # Fallback if parsing fails
                guesses = []

            result = {
                "guesses": guesses,
                # ROBUSTNESS: use .get() so a malformed guess that lacks a
                # "confidence" key cannot crash the run; missing counts as 0.0.
                "buzz": any(guess.get("confidence", 0.0) >= self.buzz_threshold for guess in guesses),
                "question_fragment": question_text,
                "position": i + 1,
                "full_response": content,
                "response_time": response_time,
            }

            yield result

            # If any guess reaches the confidence threshold, buzz and stop
            if result["buzz"]:
                return

    def bonus_round_mode(self, question: str) -> Dict:
        """
        Process a bonus round question with detailed analysis.

        Args:
            question: The bonus question text

        Returns:
            Dict with answer, rationale, confidence, and response_time
        """
        system_prompt = """
        You are a Quizbowl expert answering a bonus question. Provide:
        1. Your direct answer
        2. A very brief and crisp one line rationale for your answer (key clues that led to it)
        3. Your confidence level (0.0-1.0)

        Respond with a JSON object with these three fields:
        {
            "answer": "Your answer here",
            "rationale": "Your reasoning here",
            "confidence": 0.XX
        }
        """

        prompt = f"Bonus Question: {question}\n\nProvide your answer, rationale, and confidence:"

        # BUG FIX: _get_agent_response returns a (response, response_time) tuple.
        # The original code assigned the tuple to `response`, so the following
        # `.choices` access raised AttributeError on every bonus-round call.
        response, response_time = self._get_agent_response(prompt, system_prompt)
        content = response.choices[0].message.content

        try:
            # Try to parse JSON
            result = json.loads(content)
            # Ensure all fields are present
            if not all(k in result for k in ["answer", "rationale", "confidence"]):
                raise ValueError("Missing fields in response")
        except (json.JSONDecodeError, ValueError):
            # If parsing fails, extract manually
            lines = content.strip().split("\n")
            result = {"answer": "", "rationale": "", "confidence": 0.5}

            for line in lines:
                if line.lower().startswith("answer:"):
                    result["answer"] = line[7:].strip()
                elif line.lower().startswith("rationale:"):
                    result["rationale"] = line[10:].strip()
                elif line.lower().startswith("confidence:"):
                    try:
                        result["confidence"] = float(line[11:].strip())
                    except ValueError:
                        pass

        # Backward-compatible addition: expose call latency alongside the parsed
        # fields, matching the tossup result dicts.
        result["response_time"] = response_time
        return result
|
227 |
+
|
228 |
+
|
229 |
+
# %%
# Example usage
if __name__ == "__main__":
    # NOTE: this example performs live LLM calls via litellm and downloads a
    # dataset from the HuggingFace hub — it needs network access and a valid
    # OPENAI_API_KEY in the environment.
    # Load the Quizbowl dataset
    ds_name = "umdclip/leaderboard_co_set"
    ds = load_dataset(ds_name, split="train")

    # Create the agent
    agent = QuizbowlAgent(model="gpt-4-turbo", buzz_threshold=0.85)

    # Example for tossup mode
    print("\n=== TOSSUP MODE EXAMPLE ===")
    sample_question = ds[0]
    # Print the fully revealed question text and its gold answer for reference.
    print(sample_question["question_runs"][-1])
    print(sample_question["gold_label"])
    question_runs = sample_question["question_runs"]

    # tossup_mode is a generator: it yields one result per progressive reveal
    # and stops early once the agent's confidence crosses the buzz threshold.
    results = agent.tossup_mode(question_runs)
    for result in results:
        print(f"Guess at position {result['position']}: {result['answer']}")
        print(f"Confidence: {result['confidence']}")
        if result["buzz"]:
            print("Buzzed!\n")

    # Same question, but asking for the top-5 guesses at each reveal position.
    results = agent.tossup_mode_top5(question_runs)
    for result in results:
        guesses = [f"{guess['answer']} ({guess['confidence']})" for guess in result["guesses"]]
        print(f"Guesses at position {result['position']}: {', '.join(guesses)}")
        if result["buzz"]:
            print("Buzzed!")

    # Example for bonus round mode
    print("\n=== BONUS ROUND MODE EXAMPLE ===")
    # Reuses the full tossup text as a stand-in for a real bonus question.
    bonus_question = sample_question["question_runs"][-1]

    bonus_result = agent.bonus_round_mode(bonus_question)
    print(f"Answer: {bonus_result['answer']}")
    print(f"Rationale: {bonus_result['rationale']}")
    print(f"Confidence: {bonus_result['confidence']}")

# %%
|
src/workflows/structs.py
ADDED
@@ -0,0 +1,229 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# %%
from typing import Any, Literal, Optional

from pydantic import BaseModel, Field, model_validator

# NOTE(review): the string below sits *after* the imports, so Python treats it
# as a plain expression statement rather than the module docstring
# (module.__doc__ stays None). Consider moving it above the imports.
"""
Core data structures for defining workflows and their components.

This module defines the primary classes used to model workflows, steps, and their
input/output fields. These data structures serve as the foundation for workflow
definition, validation, and execution throughout the workflows package.

The primary components are:
- InputField: Represents an input to a model step with name and source variable
- OutputField: Represents an output from a model step with name and type
- ModelStep: Represents a single step in a workflow with inputs and outputs
- Workflow: A collection of interconnected steps with defined inputs and outputs

All classes use Pydantic's BaseModel for validation and serialization support.
"""
# Discriminator used by methods that operate on either input or output fields.
FieldType = Literal["input", "output"]

SUPPORTED_TYPES = Literal["str", "int", "float", "bool", "list[str]", "list[int]", "list[float]", "list[bool]"]
"""Supported field types for input and output fields"""
|
25 |
+
|
26 |
+
|
27 |
+
class InputField(BaseModel):
    """
    Defines an input field for a model step.

    An input field specifies what data a step requires, where it comes from,
    and optional pre-processing to apply before use.

    Attributes:
        name: The name of the input field within the step's context
        description: Human-readable description of the input's purpose
        variable: Reference to the source variable (format: "{step_id}.{field_name}" or external input name)
        func: Optional function name to transform the input value before use
    """

    name: str
    description: str
    # e.g. "extract.keywords" (another step's output) or an external input name.
    variable: str

    # function to call on the input before passing it to the model
    func: str | None = None
|
47 |
+
|
48 |
+
|
49 |
+
class OutputField(BaseModel):
    """
    Defines an output field produced by a model step.

    An output field specifies a value that the step will produce, including
    its data type and optional post-processing.

    Attributes:
        name: The name of the output field within the step's context
        description: Human-readable description of the output's purpose
        type: The data type of the output (one of SUPPORTED_TYPES)
        func: Optional function name to transform the raw output value
    """

    name: str
    # Defaults to "str" when not specified.
    type: SUPPORTED_TYPES = Field(default="str")
    description: str

    # function to call on the output string from the model
    func: str | None = None
|
69 |
+
|
70 |
+
|
71 |
+
class ModelStep(BaseModel):
    """
    Represents a single step in a workflow.

    A model step encapsulates the details of a specific operation within a workflow,
    including what model to use, what inputs it requires, and what outputs it produces.

    All `update_*`/`add_field`/`delete_field` helpers are non-mutating: they
    return a new ModelStep and leave this instance untouched.

    Attributes:
        id: Unique identifier for this step within a workflow
        name: Human-readable name of the step
        model: The model to use for this step (e.g., "gpt-4")
        provider: The provider of the model (e.g., "openai")
        call_type: The type of operation (e.g., "llm", "search")
        temperature: Sampling temperature (only meaningful for call_type "llm")
        system_prompt: Instructions for the model
        input_fields: List of input fields required by this step
        output_fields: List of output fields produced by this step
    """

    id: str
    name: str
    model: str
    provider: str
    call_type: str = "llm"  # llm, search, etc # TODO: make this enum or provide explicit options using Literal

    # TODO: Validate that this is not None for call_type = llm
    temperature: Optional[float] = None

    system_prompt: str
    input_fields: list[InputField]
    output_fields: list[OutputField]

    def fields(self, field_type: FieldType) -> list[InputField | OutputField]:
        """Return the input or output field list for the given field type."""
        return self.input_fields if field_type == "input" else self.output_fields

    def get_full_model_name(self):
        """Return the provider-qualified model name, e.g. "openai gpt-4"."""
        return f"{self.provider} {self.model}"

    def get_produced_variables(self) -> list[str]:
        """Return qualified variable names ("{id}.{field_name}") for all named outputs."""
        return [f"{self.id}.{field.name}" for field in self.output_fields if field.name]

    def update(self, update: dict[str, Any]) -> "ModelStep":
        """Return a copy of this step with the given attributes replaced."""
        return self.model_copy(update=update)

    def update_property(self, field: str, value: Any) -> "ModelStep":
        "Update the `field` key of the model step with `value`."
        return self.update({field: value})

    def update_field(self, field_type: FieldType, index: int, key: str, value: str) -> "ModelStep":
        """Update a specific field of an input or output field at the given index.

        Returns a new ModelStep; if `index` is out of range the step is returned
        unchanged (the original could fall through and return None here).

        BUG FIX: the original mutated the field entry through the list shared
        with `self` before copying — `model_copy()` is shallow, so the "copy"
        and the original step aliased the same list and both were modified.
        """
        if field_type == "input":
            fields = self.input_fields
        elif field_type == "output":
            fields = self.output_fields
        else:
            raise ValueError(f"Invalid field type: {field_type}")

        # Copy the list before replacing the element so this instance's field
        # list is never mutated through the shared reference.
        new_fields = list(fields)
        if index < len(new_fields):
            new_fields[index] = new_fields[index].model_copy(update={key: value})
        return self.model_copy(update={f"{field_type}_fields": new_fields})

    @staticmethod
    def create_new_field(field_type: FieldType, input_var: str | None = None) -> InputField | OutputField:
        """Create a blank field of the requested type.

        NOTE(review): InputField.variable is declared as `str`; passing
        input_var=None will fail pydantic validation — confirm callers always
        supply a variable name for input fields.
        """
        if field_type == "input":
            return InputField(name="", description="", variable=input_var)
        elif field_type == "output":
            return OutputField(name="", description="")
        else:
            raise ValueError(f"Invalid field type: {field_type}")

    def add_field(self, field_type: FieldType, index: int = -1, input_var: str | None = None) -> "ModelStep":
        """Add a new blank field to a copy of this step.

        Args:
            field_type: Type of field to add ('input' or 'output').
            index: Insert the new field *after* this position (-1 to append).
            input_var: Source variable for a new input field.
        Returns:
            A new ModelStep with the updated fields.

        BUG FIX: the original mutated the list returned by a shallow
        `model_copy()`, which is shared with `self`, so the field was also
        added to the original step.
        """
        new_field = ModelStep.create_new_field(field_type, input_var)
        fields = list(self.fields(field_type))
        if index == -1:
            fields.append(new_field)
        else:
            fields.insert(index + 1, new_field)
        return self.model_copy(update={f"{field_type}_fields": fields})

    def delete_field(self, field_type: FieldType, index: int) -> "ModelStep":
        """
        Delete an input or output field from a copy of this step.

        Args:
            field_type: Type of field to delete ('input' or 'output').
            index: Index of the field to delete. [-1 to delete the last field]

        Returns:
            A new ModelStep with the updated fields.

        BUG FIX: same shallow-copy aliasing as add_field — the list is now
        copied before popping so `self` keeps its field.
        """
        fields = list(self.fields(field_type))
        fields.pop(index)
        return self.model_copy(update={f"{field_type}_fields": fields})
|
169 |
+
|
170 |
+
|
171 |
+
class Workflow(BaseModel):
    """
    Represents a complete workflow composed of interconnected steps.

    A workflow defines a directed acyclic graph of model steps, where outputs
    from earlier steps can be used as inputs to later steps.

    Attributes:
        inputs: List of input variables required by the workflow
        outputs: List of output variables produced by the workflow
        steps: Dictionary mapping step IDs to ModelStep instances

    The inputs and outputs lists use the format "{step_id}.{field_name}"
    to uniquely identify variables within the workflow.
    """

    # variables of form {node}.{field}
    inputs: list[str] = Field(default_factory=list)

    # variables of form {node}.{field}
    outputs: dict[str, str | None] = Field(default_factory=dict)
    steps: dict[str, ModelStep] = Field(default_factory=dict)

    def model_dump(self, *args, **kwargs):
        # Serialize steps as a plain list (dropping the id -> step mapping);
        # the `dictify_steps` validator below re-keys them on load, so dumped
        # workflows round-trip.
        data = super().model_dump(*args, **kwargs)
        data["steps"] = list(data["steps"].values())
        return data

    @model_validator(mode="before")
    def dictify_steps(cls, data):
        # Accept `steps` in its serialized list form and rebuild the dict keyed
        # by step id, rejecting duplicate ids so two steps cannot silently
        # overwrite each other.
        # NOTE(review): assumes `data` is a mapping here — confirm this
        # validator is never invoked with a non-dict payload.
        if "steps" in data and isinstance(data["steps"], list):
            steps_dict = {}
            for step in data["steps"]:
                if step["id"] in steps_dict:
                    raise ValueError(f"Duplicate step ID: {step['id']}")
                steps_dict[step["id"]] = step
            data["steps"] = steps_dict
        return data

    def get_step_variables(self, step_id: str) -> list[str]:
        """Get all variables from a specific step."""
        # Unnamed outputs (empty name) are not addressable and are skipped.
        step = self.steps[step_id]
        variables = []
        for output in step.output_fields:
            if output.name == "":
                continue
            output_var = f"{step.id}.{output.name}"
            variables.append(output_var)
        return variables

    def get_available_variables(self) -> list[str]:
        """Get all output variables from all steps."""
        # External workflow inputs plus every step's named outputs; order is
        # unspecified because the values pass through a set.
        variables = set(self.inputs)
        for step in self.steps.values():
            variables.update(self.get_step_variables(step.id))
        return list(variables)
|
227 |
+
|
228 |
+
|
229 |
+
# %%
|
src/workflows/utils.py
ADDED
@@ -0,0 +1,161 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from collections import deque
|
2 |
+
from typing import Any
|
3 |
+
|
4 |
+
from workflows.errors import CyclicDependencyError, UnknownVariableError, WorkflowError
|
5 |
+
from workflows.structs import ModelStep, Workflow
|
6 |
+
|
7 |
+
"""
|
8 |
+
Utilities for workflow dependency management and execution order determination.
|
9 |
+
|
10 |
+
This module provides functions for analyzing workflows, determining dependencies between steps,
|
11 |
+
and calculating the correct execution order to ensure all dependencies are satisfied.
|
12 |
+
Key functionality includes:
|
13 |
+
|
14 |
+
- Variable to step mapping: Identifying which step produces each variable
|
15 |
+
- Dependency graph creation: Building a graph representing dependencies between steps
|
16 |
+
- Topological sorting: Determining a valid execution order based on dependencies
|
17 |
+
- Cycle detection: Identifying cyclic dependencies that would prevent execution
|
18 |
+
|
19 |
+
These utilities form the foundation for workflow validation and execution in the
|
20 |
+
workflow_executor module.
|
21 |
+
"""
|
22 |
+
|
23 |
+
|
24 |
+
def _create_variable_step_mapping(workflow: Workflow) -> dict[str, str]:
    """
    Map every produced variable name to the id of the step that produces it.

    Variable names are qualified as "{step_id}.{output name}".

    Args:
        workflow (Workflow): The workflow whose steps are inspected.

    Returns:
        dict[str, str]: Qualified variable name -> producing step id.

    Raises:
        WorkflowError: If the same qualified variable is produced more than once.

    Example:
        For a workflow with steps "extract" and "summarize" each producing outputs:
        >>> _create_variable_step_mapping(workflow)
        {'extract.keywords': 'extract', 'summarize.summary': 'summarize'}
    """
    mapping: dict[str, str] = {}  # qualified variable name -> producing step id
    for step_id, step in workflow.steps.items():
        for output in step.output_fields:
            qualified_name = f"{step_id}.{output.name}"
            if qualified_name in mapping:
                raise WorkflowError(f"Variable '{output.name}' has duplicate entry in step {step_id}")
            mapping[qualified_name] = step_id
    return mapping
|
51 |
+
|
52 |
+
|
53 |
+
def create_dependency_graph(workflow: Workflow, input_values: dict[str, Any]) -> dict[str, set[str]]:
    """
    Build the step dependency graph of a workflow.

    A step depends on another step when one of its input fields references a
    variable that the other step produces. Variables supplied externally via
    `input_values` create no dependency, and self-references are ignored.

    Args:
        workflow (Workflow): The workflow containing steps and their input/output fields.
        input_values (dict[str, Any]): External input values provided to the workflow.

    Returns:
        dict[str, set[str]]: step id -> set of step ids it depends on.

    Raises:
        UnknownVariableError: If an input field references a variable that is
            neither provided externally nor produced by any step.

    Example:
        For a workflow where step "classify" depends on output from "extract":
        >>> create_dependency_graph(workflow, {})
        {'extract': set(), 'classify': {'extract'}}
    """
    producers = _create_variable_step_mapping(workflow)
    graph: dict[str, set[str]] = {step_id: set() for step_id in workflow.steps}
    for step_id, step in workflow.steps.items():
        for input_field in step.input_fields:
            var = input_field.variable
            # Externally supplied values create no inter-step dependency.
            if var in input_values:
                continue
            if var not in producers:
                raise UnknownVariableError(f"Variable '{var}' is not provided externally nor produced by any step")
            producer = producers[var]
            # Ignore self-references; a step cannot depend on itself.
            if producer != step_id:
                graph[step_id].add(producer)
    return graph
|
99 |
+
|
100 |
+
|
101 |
+
def topological_sort(dependencies: dict[str, set[str]]) -> list[str]:
    """
    Order the nodes of a dependency graph so every node follows its dependencies.

    Implemented with Kahn's algorithm: repeatedly emit a node with no remaining
    unprocessed dependencies, then release the nodes waiting on it.

    Args:
        dependencies (dict[str, set[str]]): Maps each node to the set of nodes
            it depends on.

    Returns:
        list[str]: The nodes in a valid execution order.

    Raises:
        CyclicDependencyError: If the graph contains a cycle.

    Example:
        >>> topological_sort({'A': set(), 'B': {'A'}, 'C': {'B'}})
        ['A', 'B', 'C']
    """
    # Number of not-yet-satisfied dependencies per node (the node's in-degree).
    pending_deps: dict[str, int] = {node: len(deps) for node, deps in dependencies.items()}

    # Reverse adjacency: for each node, the nodes that depend on it.
    successors: dict[str, list[str]] = {node: [] for node in dependencies}
    for node, deps in dependencies.items():
        for dep in deps:
            successors[dep].append(node)

    # Seed the queue with the nodes that have no dependencies at all.
    ready = deque(node for node, count in pending_deps.items() if count == 0)
    ordered: list[str] = []

    while ready:
        current = ready.popleft()
        ordered.append(current)
        # Emitting `current` may free the nodes that were waiting on it.
        for follower in successors[current]:
            pending_deps[follower] -= 1
            if pending_deps[follower] == 0:
                ready.append(follower)

    # Any node left unordered sits on (or behind) a cycle.
    if len(ordered) != len(dependencies):
        raise CyclicDependencyError()
    return ordered
|
src/workflows/validators.py
ADDED
@@ -0,0 +1,586 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import keyword
|
2 |
+
import re
|
3 |
+
from dataclasses import dataclass
|
4 |
+
from enum import Enum
|
5 |
+
from typing import Any, Dict, List, Optional, Set, Tuple, Union
|
6 |
+
|
7 |
+
from .structs import InputField, ModelStep, OutputField, Workflow
|
8 |
+
|
9 |
+
# Output field types the workflow engine accepts when parsing model responses.
SUPPORTED_TYPES = {"str", "int", "float", "bool", "list[str]", "list[int]", "list[float]", "list[bool]"}

# Constants for validation
MAX_FIELD_NAME_LENGTH = 50  # max characters for an input/output field name
MAX_DESCRIPTION_LENGTH = 200  # max characters for a field description
MAX_SYSTEM_PROMPT_LENGTH = 4000  # max characters for an LLM step's system prompt
MIN_TEMPERATURE = 0.0  # inclusive lower bound for LLM sampling temperature
MAX_TEMPERATURE = 1.0  # inclusive upper bound for LLM sampling temperature
|
17 |
+
|
18 |
+
|
19 |
+
class ValidationErrorType(Enum):
    """Types of validation errors that can occur"""

    STEP = "step"  # a ModelStep is malformed (missing/duplicate fields, bad config)
    DAG = "dag"  # step dependency graph problems (cycles, orphaned steps)
    VARIABLE = "variable"  # missing or malformed variable references
    TYPE = "type"  # an output field declares a type outside SUPPORTED_TYPES
    GENERAL = "general"  # workflow-level structural problems
    NAMING = "naming"  # an identifier is not a valid Python name
    LENGTH = "length"  # a string exceeds its maximum allowed length
    RANGE = "range"  # a numeric value is outside its allowed range
|
30 |
+
|
31 |
+
|
32 |
+
@dataclass
class ValidationError:
    """Represents a validation error with type and message"""

    error_type: ValidationErrorType  # category of the failure
    message: str  # human-readable description of what went wrong
    step_id: Optional[str] = None  # offending step, when applicable
    field_name: Optional[str] = None  # offending field, when applicable
|
40 |
+
|
41 |
+
|
42 |
+
class WorkflowValidationError(Exception):
    """Base class for workflow validation errors"""

    def __init__(self, errors: List[ValidationError]):
        # Keep the individual errors so callers can inspect/report each one.
        self.errors = errors
        super().__init__(f"Workflow validation failed with {len(errors)} errors")
|
48 |
+
|
49 |
+
|
50 |
+
class WorkflowValidator:
    """Validates workflows for correctness and consistency.

    Checks performed: basic structure (non-empty inputs/outputs/steps, consistent
    step ids), per-step configuration (identifiers, LLM temperature/system prompt,
    field definitions), variable reference format, dependency cycles, and orphaned
    steps. Errors are accumulated in ``self.errors``; each ``validate*`` method
    stops and returns ``False`` at the first failure.
    """

    def __init__(self):
        self.errors: List[ValidationError] = []
        self.workflow: Optional[Workflow] = None

    def validate(self, workflow: Workflow) -> bool:
        """Main validation entry point. Returns True iff the workflow is valid."""
        self.errors = []
        self.workflow = workflow

        # Basic workflow validation
        if not self._validate_workflow_basic(workflow):
            return False

        # Single-step workflows take the simpler path; everything else is "complex".
        if len(workflow.steps) == 1:
            return self.validate_simple_workflow(workflow)
        return self.validate_complex_workflow(workflow)

    def validate_simple_workflow(self, workflow: Workflow) -> bool:
        """Validates a single-step workflow"""
        if not self.workflow:
            return False

        # The only step in the workflow.
        step = next(iter(workflow.steps.values()))

        if not self._validate_step(step):
            return False

        # External inputs must be plain identifiers (no dots).
        for input_var in workflow.inputs:
            if not self._is_valid_external_input(input_var):
                self.errors.append(
                    ValidationError(ValidationErrorType.VARIABLE, f"Invalid input variable format: {input_var}")
                )
                return False

        # Each workflow output must name an existing output field of the step.
        for output_name, output_var in workflow.outputs.items():
            if not output_var:
                self.errors.append(
                    ValidationError(ValidationErrorType.VARIABLE, f"Missing output variable for {output_name}")
                )
                return False

            if not self._is_valid_variable_reference(output_var):
                self.errors.append(
                    ValidationError(ValidationErrorType.VARIABLE, f"Invalid output variable reference: {output_var}")
                )
                return False

            _, field_name = self._parse_variable_reference(output_var)
            if not any(field.name == field_name for field in step.output_fields):
                self.errors.append(
                    ValidationError(
                        ValidationErrorType.VARIABLE,
                        f"Output field '{field_name}' not found in step '{step.id}'",
                        step.id,
                        field_name,
                    )
                )
                return False

        return True

    def validate_complex_workflow(self, workflow: Workflow) -> bool:
        """Validates a multi-step workflow"""
        if not self.workflow:
            return False

        for step in workflow.steps.values():
            if not self._validate_step(step):
                return False

        # External inputs must be plain identifiers (no dots).
        for input_var in workflow.inputs:
            if not self._is_valid_external_input(input_var):
                self.errors.append(
                    ValidationError(ValidationErrorType.VARIABLE, f"Invalid input variable format: {input_var}")
                )
                return False

        # Every workflow output must resolve to an existing step output field.
        for output_name, output_var in workflow.outputs.items():
            if not output_var:
                self.errors.append(
                    ValidationError(ValidationErrorType.VARIABLE, f"Missing output variable for {output_name}")
                )
                return False

            if not self._is_valid_variable_reference(output_var):
                self.errors.append(
                    ValidationError(ValidationErrorType.VARIABLE, f"Invalid output variable reference: {output_var}")
                )
                return False

            step_id, field_name = self._parse_variable_reference(output_var)
            if step_id not in workflow.steps:
                self.errors.append(
                    ValidationError(ValidationErrorType.VARIABLE, f"Referenced step '{step_id}' not found")
                )
                return False

            ref_step = workflow.steps[step_id]
            if not any(field.name == field_name for field in ref_step.output_fields):
                self.errors.append(
                    ValidationError(
                        ValidationErrorType.VARIABLE,
                        f"Output field '{field_name}' not found in step '{step_id}'",
                        step_id,
                        field_name,
                    )
                )
                return False

        # Build the dependency graph: step id -> ids of steps it consumes from.
        dep_graph: Dict[str, Set[str]] = {
            step_id: self._get_step_dependencies(step) for step_id, step in workflow.steps.items()
        }

        # Reject cyclic step dependencies.
        cyclic_step = self._find_cyclic_node(dep_graph)
        if cyclic_step is not None:
            self.errors.append(
                ValidationError(ValidationErrorType.DAG, f"Circular dependency detected involving step: {cyclic_step}")
            )
            return False

        # Orphaned steps: consumed by no other step and feeding no workflow output.
        used_steps: Set[str] = set()
        for deps in dep_graph.values():
            used_steps.update(deps)
        for step_id in workflow.steps:
            if step_id not in used_steps and not any(
                output_var and self._parse_variable_reference(output_var)[0] == step_id
                for output_var in workflow.outputs.values()
            ):
                self.errors.append(ValidationError(ValidationErrorType.DAG, f"Orphaned step detected: {step_id}"))
                return False

        # Validate variable dependencies
        if not self._validate_variable_dependencies(workflow):
            return False

        return True

    def _validate_workflow_basic(self, workflow: Workflow) -> bool:
        """Validates basic workflow properties"""
        # Check for at least one input
        if not workflow.inputs:
            self.errors.append(
                ValidationError(ValidationErrorType.GENERAL, "Workflow must contain at least one input")
            )
            return False

        if not workflow.outputs:
            self.errors.append(
                ValidationError(ValidationErrorType.GENERAL, "Workflow must contain at least one output")
            )
            return False

        for output_var in workflow.outputs.values():
            if output_var is None:
                self.errors.append(ValidationError(ValidationErrorType.GENERAL, "Output variable cannot be None"))
                return False

        # Check for empty workflow
        if not workflow.steps:
            self.errors.append(ValidationError(ValidationErrorType.GENERAL, "Workflow must contain at least one step"))
            return False

        # Mapping keys must agree with each step's own id.
        for step_id, step in workflow.steps.items():
            if step_id != step.id:
                self.errors.append(
                    ValidationError(ValidationErrorType.STEP, f"Step ID mismatch: {step_id} != {step.id}", step_id)
                )
                return False
        return True

    def _validate_step(self, step: ModelStep) -> bool:
        """Validates a single step"""
        # Validate required fields
        if not step.id or not step.name or not step.model or not step.provider or not step.call_type:
            self.errors.append(ValidationError(ValidationErrorType.STEP, "Step missing required fields", step.id))
            return False

        # Validate step ID
        if not self._is_valid_identifier(step.id):
            self.errors.append(
                ValidationError(
                    ValidationErrorType.NAMING,
                    f"Invalid step ID format: {step.id}. Must be a valid Python identifier.",
                    step.id,
                )
            )
            return False

        # LLM steps need a temperature within range and a bounded system prompt.
        if step.call_type == "llm":
            if step.temperature is None:
                self.errors.append(
                    ValidationError(ValidationErrorType.STEP, "LLM step must specify temperature", step.id)
                )
                return False

            if not MIN_TEMPERATURE <= step.temperature <= MAX_TEMPERATURE:
                self.errors.append(
                    ValidationError(
                        ValidationErrorType.RANGE,
                        f"Temperature must be between {MIN_TEMPERATURE} and {MAX_TEMPERATURE}",
                        step.id,
                    )
                )
                return False

            if not step.system_prompt:
                self.errors.append(
                    ValidationError(ValidationErrorType.STEP, "LLM step must specify system prompt", step.id)
                )
                return False

            if len(step.system_prompt) > MAX_SYSTEM_PROMPT_LENGTH:
                self.errors.append(
                    ValidationError(
                        ValidationErrorType.LENGTH,
                        f"System prompt exceeds maximum length of {MAX_SYSTEM_PROMPT_LENGTH} characters",
                        step.id,
                    )
                )
                return False

        # Input fields must be individually valid and unique by name.
        input_names = set()
        for field in step.input_fields:
            if not self._validate_input_field(field):
                return False
            if field.name in input_names:
                self.errors.append(
                    ValidationError(
                        ValidationErrorType.STEP, f"Duplicate input field name: {field.name}", step.id, field.name
                    )
                )
                return False
            input_names.add(field.name)

        # Output fields must be individually valid and unique by name.
        output_names = set()
        for field in step.output_fields:
            if not self._validate_output_field(field):
                return False
            if field.name in output_names:
                self.errors.append(
                    ValidationError(
                        ValidationErrorType.STEP, f"Duplicate output field name: {field.name}", step.id, field.name
                    )
                )
                return False
            output_names.add(field.name)

        return True

    def _validate_input_field(self, field: InputField) -> bool:
        """Validates an input field"""
        # Validate required fields
        if not field.name or not field.description or not field.variable:
            self.errors.append(
                ValidationError(ValidationErrorType.STEP, "Input field missing required fields", field_name=field.name)
            )
            return False

        # Validate field name
        if not self._is_valid_identifier(field.name):
            self.errors.append(
                ValidationError(
                    ValidationErrorType.NAMING,
                    f"Invalid field name format: {field.name}. Must be a valid Python identifier.",
                    field_name=field.name,
                )
            )
            return False

        # Validate field name length
        if len(field.name) > MAX_FIELD_NAME_LENGTH:
            self.errors.append(
                ValidationError(
                    ValidationErrorType.LENGTH,
                    f"Field name exceeds maximum length of {MAX_FIELD_NAME_LENGTH} characters",
                    field_name=field.name,
                )
            )
            return False

        # Validate description length
        if len(field.description) > MAX_DESCRIPTION_LENGTH:
            self.errors.append(
                ValidationError(
                    ValidationErrorType.LENGTH,
                    f"Description exceeds maximum length of {MAX_DESCRIPTION_LENGTH} characters",
                    field_name=field.name,
                )
            )
            return False

        # Validate variable reference
        if not self._is_valid_variable_reference(field.variable):
            self.errors.append(
                ValidationError(
                    ValidationErrorType.VARIABLE,
                    f"Invalid variable reference: {field.variable}",
                    field_name=field.name,
                )
            )
            return False

        return True

    def _validate_output_field(self, field: OutputField) -> bool:
        """Validates an output field"""
        # Validate required fields
        if not field.name or not field.description:
            self.errors.append(
                ValidationError(
                    ValidationErrorType.STEP, "Output field missing required fields", field_name=field.name
                )
            )
            return False

        # Validate field name
        if not self._is_valid_identifier(field.name):
            self.errors.append(
                ValidationError(
                    ValidationErrorType.NAMING,
                    f"Invalid field name format: {field.name}. Must be a valid Python identifier.",
                    field_name=field.name,
                )
            )
            return False

        # Validate field name length
        if len(field.name) > MAX_FIELD_NAME_LENGTH:
            self.errors.append(
                ValidationError(
                    ValidationErrorType.LENGTH,
                    f"Field name exceeds maximum length of {MAX_FIELD_NAME_LENGTH} characters",
                    field_name=field.name,
                )
            )
            return False

        # Validate description length
        if len(field.description) > MAX_DESCRIPTION_LENGTH:
            self.errors.append(
                ValidationError(
                    ValidationErrorType.LENGTH,
                    f"Description exceeds maximum length of {MAX_DESCRIPTION_LENGTH} characters",
                    field_name=field.name,
                )
            )
            return False

        # Validate type against the engine's supported output types.
        if field.type not in SUPPORTED_TYPES:
            self.errors.append(
                ValidationError(
                    ValidationErrorType.TYPE, f"Unsupported output type: {field.type}", field_name=field.name
                )
            )
            return False

        return True

    def _validate_simple_workflow_variables(self, workflow: Workflow) -> bool:
        """Validates variables in a simple workflow.

        NOTE(review): currently unused by ``validate``; kept for backward
        compatibility with external callers/tests.
        """
        step = next(iter(workflow.steps.values()))

        # Validate input variables
        for input_var in workflow.inputs:
            if not self._is_valid_external_input(input_var):
                self.errors.append(
                    ValidationError(ValidationErrorType.VARIABLE, f"Invalid input variable format: {input_var}")
                )
                return False

        # Validate output variables
        for output_name, output_var in workflow.outputs.items():
            if output_var and not self._is_valid_variable_reference(output_var):
                self.errors.append(
                    ValidationError(ValidationErrorType.VARIABLE, f"Invalid output variable reference: {output_var}")
                )
                return False

        return True

    def _validate_variable_dependencies(self, workflow: Workflow) -> bool:
        """Validates variable dependencies between steps"""
        # Build variable dependency graph: each input variable points at every
        # output variable produced by the step that consumes it.
        var_graph: Dict[str, Set[str]] = {}
        for step_id, step in workflow.steps.items():
            for field in step.input_fields:
                if field.variable not in var_graph:
                    var_graph[field.variable] = set()
                for output in step.output_fields:
                    var_graph[field.variable].add(f"{step_id}.{output.name}")

        cyclic_var = self._find_cyclic_node(var_graph)
        if cyclic_var is not None:
            self.errors.append(
                ValidationError(ValidationErrorType.VARIABLE, f"Circular variable dependency detected: {cyclic_var}")
            )
            return False

        # Every dot-free variable a step consumes must be a declared workflow input.
        external_inputs = set(workflow.inputs)
        for step in workflow.steps.values():
            for field in step.input_fields:
                step_id, field_name = self._parse_variable_reference(field.variable)
                if not step_id and field_name not in external_inputs:
                    self.errors.append(
                        ValidationError(
                            ValidationErrorType.VARIABLE,
                            f"External input '{field_name}' not found in workflow inputs",
                            field_name=field_name,
                        )
                    )
                    return False

        return True

    def _find_cyclic_node(self, graph: Dict[str, Set[str]]) -> Optional[str]:
        """Depth-first cycle check shared by step and variable validation.

        Returns the first start node (in graph iteration order) from which a
        cycle is reachable, or None if the graph is acyclic. Nodes absent from
        ``graph`` are treated as having no outgoing edges.
        """
        visited: Set[str] = set()
        path: Set[str] = set()

        def visit(node: str) -> bool:
            if node in path:  # back-edge: cycle found
                return True
            if node in visited:  # already fully explored, no cycle through here
                return False

            visited.add(node)
            path.add(node)

            for neighbor in graph.get(node, set()):
                if visit(neighbor):
                    return True

            path.remove(node)
            return False

        for node in graph:
            if visit(node):
                return node
        return None

    def _get_step_dependencies(self, step: ModelStep) -> Set[str]:
        """Gets set of step IDs that this step depends on"""
        deps = set()
        for field in step.input_fields:
            step_id = self._parse_variable_reference(field.variable)[0]
            if step_id:
                deps.add(step_id)
        return deps

    def _parse_variable_reference(self, var: str) -> Tuple[Optional[str], str]:
        """Extracts (step_id, field_name) from a variable reference.

        A bare name (no dot) is an external input and yields (None, name).
        """
        parts = var.split(".")
        if len(parts) == 1:
            return None, parts[0]
        return parts[0], parts[1]

    def _is_valid_variable_reference(self, var: str) -> bool:
        """Validates if a variable reference is properly formatted"""
        if not self.workflow:
            return False
        parts = var.split(".")
        if len(parts) == 1:
            return True  # External input
        if len(parts) != 2:
            return False
        step_id, field_name = parts
        # A dotted reference must name an existing step's output field.
        return step_id in self.workflow.steps and any(
            field.name == field_name for field in self.workflow.steps[step_id].output_fields
        )

    def _is_valid_external_input(self, var: str) -> bool:
        """Validates if a variable is a valid external input"""
        if not var:
            return False
        # _is_valid_identifier already rejects keywords and dots; the explicit
        # checks below are kept as defense in depth (behavior unchanged).
        if not self._is_valid_identifier(var):
            return False
        if keyword.iskeyword(var):
            return False
        if "." in var:  # External inputs should not contain dots
            return False
        return True

    def _is_valid_identifier(self, name: str) -> bool:
        """Validates if a string is a valid Python identifier"""
        if not name:
            return False
        if keyword.iskeyword(name):
            return False
        if not name.strip():  # Check for whitespace-only strings
            return False
        return bool(re.match(r"^[a-zA-Z_][a-zA-Z0-9_]*$", name))
|
tests/conftest.py
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
import sys

# Add the src directory to the PYTHONPATH so tests can import project
# packages (e.g. `workflows`) without installing the project.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../src"))
|
tests/test_executors.py
ADDED
@@ -0,0 +1,295 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
from unittest.mock import patch
|
3 |
+
|
4 |
+
import pytest
|
5 |
+
|
6 |
+
from workflows.errors import CyclicDependencyError, WorkflowError
|
7 |
+
from workflows.executors import (
|
8 |
+
create_processed_inputs,
|
9 |
+
execute_model_step,
|
10 |
+
execute_workflow,
|
11 |
+
lower,
|
12 |
+
upper,
|
13 |
+
)
|
14 |
+
from workflows.structs import InputField, ModelStep, OutputField, Workflow
|
15 |
+
|
16 |
+
# Tests for utility functions
|
17 |
+
|
18 |
+
|
19 |
+
def test_upper():
    """Test the upper function with different input types."""
    # String inputs are uppercased (empty string included).
    for raw, expected in [("hello", "HELLO"), ("Hello World", "HELLO WORLD"), ("", "")]:
        assert upper(raw) == expected
    # Non-string inputs should be returned unchanged
    assert upper(123) == 123
    assert upper([1, 2, 3]) == [1, 2, 3]
    assert upper(None) is None
|
28 |
+
|
29 |
+
|
30 |
+
def test_lower():
    """Test the lower function with different input types."""
    # String inputs are lowercased (empty string included).
    for raw, expected in [("HELLO", "hello"), ("Hello World", "hello world"), ("", "")]:
        assert lower(raw) == expected
    # Non-string inputs should be returned unchanged
    assert lower(123) == 123
    assert lower([1, 2, 3]) == [1, 2, 3]
    assert lower(None) is None
|
39 |
+
|
40 |
+
|
41 |
+
# Tests for create_processed_inputs
|
42 |
+
|
43 |
+
|
44 |
+
def test_create_processed_inputs_basic():
    """Test basic input processing without transformations."""
    # The step maps the workflow variable "input_text" onto its "text" input.
    step = ModelStep(
        id="test_step",
        model="gpt-4",
        provider="openai",
        call_type="llm",
        system_prompt="Test prompt",
        input_fields=[InputField(name="text", description="Input text", variable="input_text")],
        output_fields=[],
    )
    available_vars = {"input_text": "Hello World"}

    # Expect a dict keyed by field name, valued by the resolved variable.
    result = create_processed_inputs(step, available_vars)
    assert result == {"text": "Hello World"}
|
59 |
+
|
60 |
+
|
61 |
+
def test_create_processed_inputs_with_transformation():
    """Test input processing with transformation functions."""
    # Each input field names a transformation via `func` ("upper"/"lower")
    # that should be applied to the resolved variable value.
    step = ModelStep(
        id="test_step",
        model="gpt-4",
        provider="openai",
        call_type="llm",
        system_prompt="Test prompt",
        input_fields=[
            InputField(name="upper_text", description="Uppercase text", variable="input_text", func="upper"),
            InputField(name="lower_text", description="Lowercase text", variable="input_caps", func="lower"),
        ],
        output_fields=[],
    )
    available_vars = {"input_text": "hello", "input_caps": "WORLD"}

    result = create_processed_inputs(step, available_vars)
    assert result == {"upper_text": "HELLO", "lower_text": "world"}
|
79 |
+
|
80 |
+
|
81 |
+
def test_create_processed_inputs_missing_var():
    """Test that appropriate error is raised when a variable is missing."""
    # The step references "missing_var", which is absent from available_vars.
    fields = [InputField(name="text", description="Input text", variable="missing_var")]
    step = ModelStep(
        id="test_step",
        model="gpt-4",
        provider="openai",
        call_type="llm",
        system_prompt="Test prompt",
        input_fields=fields,
        output_fields=[],
    )

    # Resolving the missing variable must raise KeyError.
    with pytest.raises(KeyError):
        create_processed_inputs(step, {"input_text": "Hello World"})
|
96 |
+
|
97 |
+
|
98 |
+
def test_create_processed_inputs_unknown_func():
    """Test that appropriate error is raised when an unknown function is specified."""
    # The input field names a transformation func that does not exist.
    step = ModelStep(
        id="test_step",
        model="gpt-4",
        provider="openai",
        call_type="llm",
        system_prompt="Test prompt",
        input_fields=[InputField(name="text", description="Input text", variable="input_text", func="unknown_func")],
        output_fields=[],
    )
    available_vars = {"input_text": "Hello World"}

    # This should raise an error when the function isn't found
    with pytest.raises(Exception):
        create_processed_inputs(step, available_vars)
|
114 |
+
|
115 |
+
|
116 |
+
# Tests for execute_model_step
|
117 |
+
|
118 |
+
|
119 |
+
@patch("workflows.executors.litellm.completion")
def test_execute_model_step_success(mock_completion):
    """Test successful execution of a model step with mocked litellm response."""
    # Mock the litellm response: the model "returns" a JSON object whose keys
    # match the step's declared output fields.
    mock_response = {"choices": [{"message": {"content": json.dumps({"summary": "This is a summary"})}}]}
    mock_completion.return_value = mock_response

    # Create a test step
    step = ModelStep(
        id="summarize",
        model="gpt-3.5-turbo",
        provider="openai",
        call_type="llm",
        system_prompt="Summarize the text",
        input_fields=[InputField(name="text", description="Text to summarize", variable="input_text")],
        output_fields=[OutputField(name="summary", description="Summary of the text", type="str")],
    )

    # Execute the step
    result = execute_model_step(step, {"input_text": "Long text to be summarized..."})

    # Verify the parsed output matches the mocked model JSON.
    assert result == {"summary": "This is a summary"}

    # Verify the litellm call was made correctly: right model, and the system
    # prompt ends up in the first message.
    mock_completion.assert_called_once()
    args, kwargs = mock_completion.call_args
    assert kwargs["model"] == "gpt-3.5-turbo"
    assert "Summarize the text" in kwargs["messages"][0]["content"]
|
148 |
+
|
149 |
+
|
150 |
+
@patch("workflows.executors.litellm.completion")
def test_execute_model_step_error(mock_completion):
    """Test handling of errors in model step execution."""
    # Make litellm raise an exception to simulate a provider/API failure.
    mock_completion.side_effect = Exception("API Error")

    # Create a test step
    step = ModelStep(
        id="summarize",
        model="gpt-3.5-turbo",
        provider="openai",
        call_type="llm",
        system_prompt="Summarize the text",
        input_fields=[InputField(name="text", description="Text to summarize", variable="input_text")],
        output_fields=[OutputField(name="summary", description="Summary of the text", type="str")],
    )

    # Execute the step - the provider error should propagate (not be swallowed).
    with pytest.raises(Exception):
        execute_model_step(step, {"input_text": "Long text to be summarized..."})
|
170 |
+
|
171 |
+
|
172 |
+
# Tests for execute_workflow
|
173 |
+
|
174 |
+
|
175 |
+
@patch("workflows.executors.execute_model_step")
def test_execute_workflow_simple(mock_execute_step):
    """Test execution of a simple workflow with a single step."""
    # Configure mock to return expected outputs (no real model call is made).
    mock_execute_step.return_value = {"summary": "This is a summary"}

    # Create a simple one-step workflow.
    step = ModelStep(
        id="summarize",
        model="gpt-3.5-turbo",
        provider="openai",
        call_type="llm",
        system_prompt="Summarize the text",
        input_fields=[InputField(name="text", description="Text to summarize", variable="input_text")],
        output_fields=[OutputField(name="summary", description="Summary of the text", type="str")],
    )

    workflow = Workflow(steps={"summarize": step}, inputs=["input_text"], outputs={"summary": "summarize.summary"})

    # Execute the workflow
    result = execute_workflow(workflow, {"input_text": "Long text to be summarized..."})

    # The workflow output "summary" is wired to "summarize.summary".
    assert result == {"summary": "This is a summary"}

    # Verify execute_model_step was called correctly (exactly once).
    mock_execute_step.assert_called_once()
|
202 |
+
|
203 |
+
|
204 |
+
@patch("workflows.executors.execute_model_step")
def test_execute_workflow_multi_step(mock_execute_step):
    """Test execution of a multi-step workflow with dependencies."""

    # Configure mock to return different values based on the step id.
    def side_effect(step, available_vars):
        if step.id == "extract":
            return {"entities": ["Apple", "product"]}
        elif step.id == "analyze":
            return {"sentiment": "positive"}
        return {}

    mock_execute_step.side_effect = side_effect

    # Create extract step
    extract_step = ModelStep(
        id="extract",
        model="gpt-3.5-turbo",
        provider="openai",
        call_type="llm",
        system_prompt="Extract entities",
        input_fields=[InputField(name="text", description="Text to analyze", variable="input_text")],
        output_fields=[OutputField(name="entities", description="Extracted entities", type="list[str]")],
    )

    # Create analyze step that depends on extract step: its input variable
    # "extract.entities" forces the executor to run "extract" first.
    analyze_step = ModelStep(
        id="analyze",
        model="gpt-4",
        provider="openai",
        call_type="llm",
        system_prompt="Analyze sentiment",
        input_fields=[InputField(name="entities", description="Entities to analyze", variable="extract.entities")],
        output_fields=[OutputField(name="sentiment", description="Sentiment analysis", type="str")],
    )

    workflow = Workflow(
        steps={"extract": extract_step, "analyze": analyze_step},
        inputs=["input_text"],
        outputs={"entities": "extract.entities", "sentiment": "analyze.sentiment"},
    )

    # Execute the workflow
    result = execute_workflow(workflow, {"input_text": "Apple is launching a new product tomorrow."})

    # Both workflow outputs should be populated from their respective steps.
    assert result == {"entities": ["Apple", "product"], "sentiment": "positive"}

    # Verify execute_model_step was called twice (once for each step)
    assert mock_execute_step.call_count == 2
|
254 |
+
|
255 |
+
|
256 |
+
def test_execute_workflow_missing_input():
|
257 |
+
"""Test that an error is raised when a required input is missing."""
|
258 |
+
step = ModelStep(
|
259 |
+
id="summarize",
|
260 |
+
model="gpt-3.5-turbo",
|
261 |
+
provider="openai",
|
262 |
+
call_type="llm",
|
263 |
+
system_prompt="Summarize the text",
|
264 |
+
input_fields=[InputField(name="text", description="Text to summarize", variable="input_text")],
|
265 |
+
output_fields=[OutputField(name="summary", description="Summary of the text", type="str")],
|
266 |
+
)
|
267 |
+
|
268 |
+
workflow = Workflow(steps={"summarize": step}, inputs=["input_text"], outputs={"summary": "summarize.summary"})
|
269 |
+
|
270 |
+
# Execute with missing input
|
271 |
+
with pytest.raises(WorkflowError, match="Missing required workflow input"):
|
272 |
+
execute_workflow(workflow, {})
|
273 |
+
|
274 |
+
|
275 |
+
@patch("workflows.executors.create_dependency_graph")
|
276 |
+
def test_execute_workflow_cyclic_dependency(mock_dependency_graph):
|
277 |
+
"""Test that a cyclic dependency in the workflow raises an appropriate error."""
|
278 |
+
# Make create_dependency_graph raise a CyclicDependencyError
|
279 |
+
mock_dependency_graph.side_effect = CyclicDependencyError()
|
280 |
+
|
281 |
+
step = ModelStep(
|
282 |
+
id="test",
|
283 |
+
model="gpt-3.5-turbo",
|
284 |
+
provider="openai",
|
285 |
+
call_type="llm",
|
286 |
+
system_prompt="Test",
|
287 |
+
input_fields=[],
|
288 |
+
output_fields=[],
|
289 |
+
)
|
290 |
+
|
291 |
+
workflow = Workflow(steps={"test": step}, inputs=[], outputs=[])
|
292 |
+
|
293 |
+
# This should propagate the CyclicDependencyError
|
294 |
+
with pytest.raises(CyclicDependencyError):
|
295 |
+
execute_workflow(workflow, {})
|
tests/test_utils.py
ADDED
@@ -0,0 +1,159 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pytest
|
2 |
+
|
3 |
+
from workflows.errors import CyclicDependencyError, UnknownVariableError, WorkflowError
|
4 |
+
from workflows.utils import _create_variable_step_mapping, create_dependency_graph, topological_sort
|
5 |
+
|
6 |
+
|
7 |
+
# Dummy classes to simulate Workflow, Step, and Field
|
8 |
+
class DummyField:
    """Minimal stand-in for a workflow field used by these unit tests."""

    def __init__(self, name, type="str", variable=None):
        self.name = name
        self.type = type
        # Input fields are read through `variable`; default it to the field name.
        self.variable = name if variable is None else variable
|
14 |
+
|
15 |
+
|
16 |
+
class DummyStep:
    """Minimal stand-in for a workflow step: just carries its field lists."""

    def __init__(self, input_fields, output_fields):
        self.input_fields = input_fields
        self.output_fields = output_fields
|
20 |
+
|
21 |
+
|
22 |
+
class DummyWorkflow:
    """Minimal stand-in for a workflow: a mapping of step_id -> DummyStep."""

    def __init__(self, steps):
        self.steps = steps
|
26 |
+
|
27 |
+
|
28 |
+
# Tests for _create_variable_step_mapping
|
29 |
+
|
30 |
+
|
31 |
+
def test_create_variable_step_mapping_success():
    """Each step output maps as '<step_id>.<output_name>' -> step_id."""
    workflow = DummyWorkflow(
        {
            "A": DummyStep(input_fields=[], output_fields=[DummyField("out1")]),
            "B": DummyStep(input_fields=[], output_fields=[DummyField("out2")]),
        }
    )
    assert _create_variable_step_mapping(workflow) == {"A.out1": "A", "B.out2": "B"}
|
38 |
+
|
39 |
+
|
40 |
+
def test_create_variable_step_mapping_duplicate():
    """Two outputs with the same name within one step must raise WorkflowError."""
    dup_step = DummyStep(input_fields=[], output_fields=[DummyField("out"), DummyField("out")])
    with pytest.raises(WorkflowError):
        _create_variable_step_mapping(DummyWorkflow({"A": dup_step}))
|
46 |
+
|
47 |
+
|
48 |
+
def test_create_variable_step_mapping_empty():
    """A workflow with no steps yields an empty variable-to-step mapping."""
    assert _create_variable_step_mapping(DummyWorkflow({})) == {}
|
53 |
+
|
54 |
+
|
55 |
+
def test_create_variable_step_mapping_multiple_outputs():
    """A single step producing several uniquely named outputs maps each one."""
    multi_step = DummyStep(input_fields=[], output_fields=[DummyField("out1"), DummyField("out2")])
    mapping = _create_variable_step_mapping(DummyWorkflow({"A": multi_step}))
    assert mapping == {"A.out1": "A", "A.out2": "A"}
|
61 |
+
|
62 |
+
|
63 |
+
# Tests for create_dependency_graph
|
64 |
+
|
65 |
+
|
66 |
+
def test_create_dependency_graph_success_with_dependency():
    """Step B consuming 'A.out' must depend on step A; A itself has no deps."""
    producer = DummyStep(input_fields=[], output_fields=[DummyField("out")])
    # The consumer's input explicitly references A's output variable.
    consumer = DummyStep(
        input_fields=[DummyField("dummy", variable="A.out")],
        output_fields=[DummyField("result")],
    )
    workflow = DummyWorkflow({"A": producer, "B": consumer})

    # No external inputs, so "A.out" must be resolved via a step dependency.
    deps = create_dependency_graph(workflow, input_values={})
    assert deps["B"] == {"A"}
    assert deps["A"] == set()
|
78 |
+
|
79 |
+
|
80 |
+
def test_create_dependency_graph_success_with_external_input():
    """An input satisfied by an external value creates no inter-step dependency."""
    consumer = DummyStep(
        input_fields=[DummyField("param", variable="external_param")],
        output_fields=[DummyField("result")],
    )
    deps = create_dependency_graph(DummyWorkflow({"B": consumer}), input_values={"external_param": 42})
    assert deps["B"] == set()
|
90 |
+
|
91 |
+
|
92 |
+
def test_create_dependency_graph_unknown_variable():
    """A reference produced by no step and supplied by no input must fail."""
    consumer = DummyStep(
        input_fields=[DummyField("param", variable="non_existent")],
        output_fields=[DummyField("result")],
    )
    with pytest.raises(UnknownVariableError):
        create_dependency_graph(DummyWorkflow({"B": consumer}), input_values={})
|
100 |
+
|
101 |
+
|
102 |
+
def test_create_dependency_graph_complex():
    """A diamond-shaped workflow resolves every edge: B<-A, C<-B, D<-{A, B}."""
    step_a = DummyStep(input_fields=[], output_fields=[DummyField("out")])
    step_b = DummyStep(
        input_fields=[DummyField("inp", variable="A.out")],
        output_fields=[DummyField("out")],
    )
    step_c = DummyStep(
        input_fields=[DummyField("inp", variable="B.out")],
        output_fields=[DummyField("result")],
    )
    step_d = DummyStep(
        input_fields=[DummyField("inp1", variable="A.out"), DummyField("inp2", variable="B.out")],
        output_fields=[DummyField("final")],
    )
    workflow = DummyWorkflow({"A": step_a, "B": step_b, "C": step_c, "D": step_d})

    # No external inputs at all: every variable reference must be resolved by
    # the producing step, so all dependency edges are created.
    deps = create_dependency_graph(workflow, input_values={})

    assert deps["B"] == {"A"}
    assert deps["C"] == {"B"}
    assert deps["D"] == {"A", "B"}
|
127 |
+
|
128 |
+
|
129 |
+
# Tests for topological_sort
|
130 |
+
|
131 |
+
|
132 |
+
def test_topological_sort_success():
    """A linear chain A -> B -> C must order each prerequisite before its dependent."""
    order = topological_sort({"A": set(), "B": {"A"}, "C": {"B"}})
    assert order.index("A") < order.index("B") < order.index("C")
|
138 |
+
|
139 |
+
|
140 |
+
def test_topological_sort_cycle():
    """Mutually dependent nodes cannot be ordered and must raise."""
    cyclic = {"A": {"B"}, "B": {"A"}}
    with pytest.raises(CyclicDependencyError):
        topological_sort(cyclic)
|
145 |
+
|
146 |
+
|
147 |
+
def test_topological_sort_single_node():
    """A lone dependency-free node sorts to a one-element list."""
    assert topological_sort({"A": set()}) == ["A"]
|
152 |
+
|
153 |
+
|
154 |
+
def test_topological_sort_disconnected():
    """Independent nodes may come out in any order, but all must be present."""
    order = topological_sort({"A": set(), "B": set(), "C": set()})
    assert set(order) == {"A", "B", "C"}
|
tests/test_validators.py
ADDED
@@ -0,0 +1,647 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import Dict, List
|
2 |
+
|
3 |
+
import pytest
|
4 |
+
from pydantic import ValidationError as PydanticValidationError
|
5 |
+
|
6 |
+
from workflows.structs import InputField, ModelStep, OutputField, Workflow
|
7 |
+
from workflows.validators import ValidationError, ValidationErrorType, WorkflowValidator
|
8 |
+
|
9 |
+
|
10 |
+
# Test Data
|
11 |
+
def create_basic_step(step_id: str = "step1") -> ModelStep:
    """Builds a minimal, valid LLM step (no fields) for validator tests."""
    return ModelStep(
        id=step_id,
        name="Test Step",
        model="gpt-4",
        provider="openai",
        call_type="llm",
        temperature=0.7,
        system_prompt="Test prompt",
        input_fields=[],
        output_fields=[],
    )
|
24 |
+
|
25 |
+
|
26 |
+
def create_basic_workflow(steps: List[ModelStep] | None = None) -> Workflow:
    """Builds a workflow around `steps`, defaulting to a single basic step."""
    # Avoid reassigning the parameter; use a fresh local for the effective list.
    chosen_steps = [create_basic_step()] if steps is None else steps
    return Workflow(inputs=[], outputs={}, steps={step.id: step for step in chosen_steps})
|
31 |
+
|
32 |
+
|
33 |
+
# Additional Test Data
|
34 |
+
def create_step_with_fields(
    step_id: str, input_fields: List[InputField], output_fields: List[OutputField]
) -> ModelStep:
    """Builds a valid LLM step carrying the given input and output fields."""
    return ModelStep(
        id=step_id,
        name="Test Step",
        model="gpt-4",
        provider="openai",
        call_type="llm",
        temperature=0.7,
        system_prompt="Test prompt",
        input_fields=input_fields,
        output_fields=output_fields,
    )
|
49 |
+
|
50 |
+
|
51 |
+
def create_valid_workflow() -> Workflow:
    """Builds a single-step workflow expected to pass validation unchanged."""
    # One step wired from the external input to a single string output.
    only_step = create_step_with_fields(
        "step1",
        [InputField(name="input", description="test", variable="external_input")],
        [OutputField(name="output", description="test", type="str")],
    )

    workflow = create_basic_workflow([only_step])
    workflow.inputs = ["external_input"]
    workflow.outputs = {"output": "step1.output"}
    return workflow
|
64 |
+
|
65 |
+
|
66 |
+
# Basic Workflow Validation Tests
|
67 |
+
class TestBasicWorkflowValidation:
    """Checks the validator's top-level structural requirements."""

    def test_empty_workflow(self):
        """A workflow with no steps is rejected with a single GENERAL error."""
        validator = WorkflowValidator()
        workflow = Workflow(inputs=["input"], outputs={"output": "input"}, steps={})
        assert not validator.validate(workflow)
        assert len(validator.errors) == 1
        first_error = validator.errors[0]
        assert first_error.error_type == ValidationErrorType.GENERAL
        assert "must contain at least one step" in first_error.message

    def test_workflow_without_inputs(self):
        """A workflow with no declared inputs is rejected with a GENERAL error."""
        validator = WorkflowValidator()
        workflow = create_basic_workflow()
        workflow.inputs = []
        workflow.outputs = {"output": "step1.field"}
        assert not validator.validate(workflow)
        assert len(validator.errors) == 1
        first_error = validator.errors[0]
        assert first_error.error_type == ValidationErrorType.GENERAL
        assert "must contain at least one input" in first_error.message

    def test_workflow_without_outputs(self):
        """A workflow with no declared outputs is rejected with a GENERAL error."""
        validator = WorkflowValidator()
        workflow = create_basic_workflow()
        workflow.inputs = ["input"]
        workflow.outputs = {}
        assert not validator.validate(workflow)
        assert len(validator.errors) == 1
        first_error = validator.errors[0]
        assert first_error.error_type == ValidationErrorType.GENERAL
        assert "must contain at least one output" in first_error.message

    def test_single_step_workflow(self):
        """A well-formed single-step workflow passes with no errors."""
        validator = WorkflowValidator()
        workflow = create_valid_workflow()
        assert validator.validate(workflow)
        assert len(validator.errors) == 0
|
108 |
+
|
109 |
+
|
110 |
+
# Step Validation Tests
|
111 |
+
class TestStepValidation:
    """Checks validation of individual step attributes."""

    def _check_single_error(self, validator, step, expected_type):
        """Wraps `step` in a standard one-step workflow, validates it, and
        asserts exactly one error of `expected_type` is reported.

        Takes the validator as an argument so callers can reuse one instance
        across sub-cases, matching the original per-method call sequence.
        """
        workflow = create_basic_workflow([step])
        workflow.inputs = ["input"]
        workflow.outputs = {"output": "step1.field"}
        assert not validator.validate(workflow)
        assert len(validator.errors) == 1
        assert validator.errors[0].error_type == expected_type

    def test_missing_required_fields(self):
        """A step with empty name/model/provider/call_type yields a STEP error."""
        validator = WorkflowValidator()
        step = ModelStep(
            id="step1",
            name="",  # Missing name
            model="",  # Missing model
            provider="",  # Missing provider
            call_type="",  # Missing call_type
            temperature=0.7,
            system_prompt="Test prompt",
            input_fields=[],
            output_fields=[],
        )
        self._check_single_error(validator, step, ValidationErrorType.STEP)

    def test_invalid_step_id(self):
        """A step id that does not follow the naming rules yields a NAMING error."""
        validator = WorkflowValidator()
        step = create_basic_step("123invalid")  # Invalid ID format
        self._check_single_error(validator, step, ValidationErrorType.NAMING)

    def test_llm_temperature_validation(self):
        """Out-of-range and missing temperatures are both rejected."""
        validator = WorkflowValidator()

        # Temperature above the allowed range -> RANGE error.
        step = create_basic_step()
        step.temperature = 1.5
        self._check_single_error(validator, step, ValidationErrorType.RANGE)

        # Temperature absent entirely -> STEP error.
        step = create_basic_step()
        step.temperature = None
        self._check_single_error(validator, step, ValidationErrorType.STEP)

    def test_llm_system_prompt_validation(self):
        """Empty and over-length system prompts are both rejected."""
        validator = WorkflowValidator()

        # Empty system prompt -> STEP error.
        step = create_basic_step()
        step.system_prompt = ""
        self._check_single_error(validator, step, ValidationErrorType.STEP)

        # System prompt longer than the limit -> LENGTH error.
        step = create_basic_step()
        step.system_prompt = "x" * 4001
        self._check_single_error(validator, step, ValidationErrorType.LENGTH)
|
191 |
+
|
192 |
+
|
193 |
+
# Field Validation Tests
|
194 |
+
class TestFieldValidation:
    """Checks validation of step input/output field definitions."""

    def _check_single_error(self, validator, step, expected_type):
        """Wraps `step` in a standard one-step workflow, validates it, and
        asserts exactly one error of `expected_type` is reported.

        The validator is passed in so each test method can reuse a single
        instance across its sub-cases, as the original code did.
        """
        workflow = create_basic_workflow([step])
        workflow.inputs = ["input"]
        workflow.outputs = {"output": "step1.field"}
        assert not validator.validate(workflow)
        assert len(validator.errors) == 1
        assert validator.errors[0].error_type == expected_type

    def test_input_field_validation(self):
        """Empty parts, bad names, and long descriptions on input fields fail."""
        validator = WorkflowValidator()

        # All-empty input field -> STEP error.
        step = create_basic_step()
        step.input_fields = [InputField(name="", description="", variable="")]
        self._check_single_error(validator, step, ValidationErrorType.STEP)

        # Name starting with a digit -> NAMING error.
        step = create_basic_step()
        step.input_fields = [InputField(name="123invalid", description="test", variable="test")]
        self._check_single_error(validator, step, ValidationErrorType.NAMING)

        # Description over the limit -> LENGTH error.
        step = create_basic_step()
        step.input_fields = [InputField(name="test", description="x" * 201, variable="test")]
        self._check_single_error(validator, step, ValidationErrorType.LENGTH)

    def test_output_field_validation(self):
        """Empty parts and bad names on output fields fail."""
        validator = WorkflowValidator()

        # All-empty output field -> STEP error.
        step = create_basic_step()
        step.output_fields = [OutputField(name="", description="", type="str")]
        self._check_single_error(validator, step, ValidationErrorType.STEP)

        # Name starting with a digit -> NAMING error.
        step = create_basic_step()
        step.output_fields = [OutputField(name="123invalid", description="test", type="str")]
        self._check_single_error(validator, step, ValidationErrorType.NAMING)

    def test_field_name_length(self):
        """A field name longer than the limit yields a LENGTH error."""
        validator = WorkflowValidator()
        step = create_basic_step()
        step.input_fields = [InputField(name="x" * 51, description="test", variable="test")]
        self._check_single_error(validator, step, ValidationErrorType.LENGTH)

    def test_field_description_length(self):
        """A field description longer than the limit yields a LENGTH error."""
        validator = WorkflowValidator()
        step = create_basic_step()
        step.input_fields = [InputField(name="test", description="x" * 201, variable="test")]
        self._check_single_error(validator, step, ValidationErrorType.LENGTH)

    def test_whitespace_only_strings(self):
        """A whitespace-only field name yields a NAMING error."""
        validator = WorkflowValidator()
        step = create_basic_step()
        step.input_fields = [InputField(name="   ", description="test", variable="test")]
        self._check_single_error(validator, step, ValidationErrorType.NAMING)

    def test_special_characters(self):
        """A field name containing special characters yields a NAMING error."""
        validator = WorkflowValidator()
        step = create_basic_step()
        step.input_fields = [InputField(name="test@field", description="test", variable="test")]
        self._check_single_error(validator, step, ValidationErrorType.NAMING)
|
308 |
+
|
309 |
+
|
310 |
+
# Variable Reference Tests
|
311 |
+
class TestVariableReference:
    """Checks validation of external-input and step-output variable references."""

    def test_external_input_validation(self):
        """A dotted name in `inputs` is not a valid external input."""
        validator = WorkflowValidator()

        workflow = create_valid_workflow()
        workflow.inputs = ["step1.field"]  # Invalid format
        assert not validator.validate(workflow)
        assert len(validator.errors) == 1
        assert validator.errors[0].error_type == ValidationErrorType.VARIABLE

    def test_step_output_reference(self):
        """Output references must point at an existing step's output field."""
        validator = WorkflowValidator()

        # Reference to a step that does not exist -> VARIABLE error.
        workflow = create_basic_workflow()
        workflow.inputs = ["input"]
        workflow.outputs = {"output": "nonexistent_step.field"}
        assert not validator.validate(workflow)
        assert len(validator.errors) == 1
        assert validator.errors[0].error_type == ValidationErrorType.VARIABLE

        # Reference to a real output field -> validation passes cleanly.
        step = create_basic_step()
        step.output_fields = [OutputField(name="field", description="test", type="str")]
        workflow = create_basic_workflow([step])
        workflow.inputs = ["input"]
        workflow.outputs = {"output": "step1.field"}
        assert validator.validate(workflow)
        assert len(validator.errors) == 0
|
343 |
+
|
344 |
+
|
345 |
+
# DAG Validation Tests
|
346 |
+
class TestDAGValidation:
    """Checks that the workflow step graph is a well-formed DAG."""

    def test_cycle_detection(self):
        """A three-step cycle (step1 -> step2 -> step3 -> step1) yields a DAG error."""
        validator = WorkflowValidator()

        # Each step consumes the previous step's output; step1 closes the loop
        # by reading step3's output.
        step1 = create_step_with_fields(
            "step1",
            [InputField(name="input", description="test", variable="step3.output")],
            [OutputField(name="output", description="test", type="str")],
        )
        step2 = create_step_with_fields(
            "step2",
            [InputField(name="input", description="test", variable="step1.output")],
            [OutputField(name="output", description="test", type="str")],
        )
        step3 = create_step_with_fields(
            "step3",
            [InputField(name="input", description="test", variable="step2.output")],
            [OutputField(name="output", description="test", type="str")],
        )

        workflow = create_basic_workflow([step1, step2, step3])
        workflow.inputs = ["input"]
        workflow.outputs = {"output": "step3.output"}
        assert not validator.validate(workflow)
        assert len(validator.errors) == 1
        assert validator.errors[0].error_type == ValidationErrorType.DAG

    def test_orphaned_steps(self):
        """A step disconnected from the rest of the graph yields a DAG error."""
        validator = WorkflowValidator()

        # step1 and step2 reference each other; step3 is isolated but named
        # as the workflow output.
        step1 = create_step_with_fields(
            "step1",
            [InputField(name="input", description="test", variable="step2.output")],
            [OutputField(name="output", description="test", type="str")],
        )
        step2 = create_step_with_fields(
            "step2",
            [InputField(name="input", description="test", variable="step1.output")],
            [OutputField(name="output", description="test", type="str")],
        )
        step3 = create_step_with_fields(
            "step3",
            [],
            [OutputField(name="output", description="test", type="str")],
        )

        workflow = create_basic_workflow([step1, step2, step3])
        workflow.inputs = ["input"]
        workflow.outputs = {"output": "step3.output"}
        assert not validator.validate(workflow)
        assert len(validator.errors) == 1
        assert validator.errors[0].error_type == ValidationErrorType.DAG
|
402 |
+
|
403 |
+
|
404 |
+
# Variable Dependency Tests
|
405 |
+
class TestVariableDependencies:
    """Checks variable-level dependency resolution between steps."""

    def test_circular_dependencies(self):
        """Two steps consuming each other's outputs yield a DAG error."""
        validator = WorkflowValidator()

        step1 = create_step_with_fields(
            "step1",
            [InputField(name="input", description="test", variable="step2.output")],
            [OutputField(name="output", description="test", type="str")],
        )
        step2 = create_step_with_fields(
            "step2",
            [InputField(name="input", description="test", variable="step1.output")],
            [OutputField(name="output", description="test", type="str")],
        )

        workflow = create_basic_workflow([step1, step2])
        workflow.inputs = ["input"]
        workflow.outputs = {"output": "step2.output"}
        assert not validator.validate(workflow)
        assert len(validator.errors) == 1
        assert validator.errors[0].error_type == ValidationErrorType.DAG

    def test_valid_dependencies(self):
        """A linear chain external_input -> step1 -> step2 -> step3 validates cleanly."""
        validator = WorkflowValidator()

        step1 = create_step_with_fields(
            "step1",
            [InputField(name="input", description="test", variable="external_input")],
            [OutputField(name="output", description="test", type="str")],
        )
        step2 = create_step_with_fields(
            "step2",
            [InputField(name="input", description="test", variable="step1.output")],
            [OutputField(name="output", description="test", type="str")],
        )
        step3 = create_step_with_fields(
            "step3",
            [InputField(name="input", description="test", variable="step2.output")],
            [OutputField(name="output", description="test", type="str")],
        )

        workflow = create_basic_workflow([step1, step2, step3])
        workflow.inputs = ["external_input"]
        workflow.outputs = {"output": "step3.output"}
        assert validator.validate(workflow)
        assert len(validator.errors) == 0
|
455 |
+
|
456 |
+
|
457 |
+
# Type Compatibility Tests
class TestTypeCompatibility:
    def test_basic_type_compatibility(self):
        """Basic type compatibility between a producer and its consumer."""
        validator = WorkflowValidator()

        # step1 produces an int while step2's downstream output is typed str.
        # NOTE(review): the assertion below expects validation to *succeed*,
        # i.e. the validator currently tolerates this mismatch — confirm
        # against WorkflowValidator's type-checking behavior.
        producer = create_step_with_fields(
            "step1",
            [InputField(name="input", description="test", variable="external_input")],
            [OutputField(name="output", description="test", type="int")],
        )
        consumer = create_step_with_fields(
            "step2",
            [InputField(name="input", description="test", variable="step1.output")],
            [OutputField(name="output", description="test", type="str")],
        )

        workflow = create_basic_workflow([producer, consumer])
        workflow.inputs = ["external_input"]
        workflow.outputs = {"output": "step2.output"}
        assert validator.validate(workflow)

    # TODO(review): re-enable once list-type compatibility checking is implemented.
    # def test_list_type_compatibility(self):
    #     """Test validation of list type compatibility"""
    #     validator = WorkflowValidator()

    #     # Test compatible list types
    #     step1 = create_step_with_fields(
    #         "step1", [], [OutputField(name="output", description="test", type="list[str]")]
    #     )
    #     step2 = create_step_with_fields(
    #         "step2", [InputField(name="input", description="test", variable="step1.output")], []
    #     )

    #     workflow = create_basic_workflow([step1, step2])
    #     workflow.inputs = ["input"]
    #     workflow.outputs = {"output": "step2.output"}
    #     assert validator.validate(workflow)
    #     assert len(validator.errors) == 0

    #     # Test incompatible list types
    #     step1 = create_step_with_fields(
    #         "step1", [], [OutputField(name="output", description="test", type="list[int]")]
    #     )
    #     step2 = create_step_with_fields(
    #         "step2", [InputField(name="input", description="test", variable="step1.output")], []
    #     )

    #     workflow = create_basic_workflow([step1, step2])
    #     workflow.inputs = ["input"]
    #     workflow.outputs = {"output": "step2.output"}
    #     assert not validator.validate(workflow)
    #     assert len(validator.errors) == 1
    #     assert validator.errors[0].error_type == ValidationErrorType.TYPE
|
514 |
+
# Complex Workflow Tests
class TestComplexWorkflows:
    def test_multi_output_workflow(self):
        """A workflow may expose several step outputs through its outputs mapping."""
        validator = WorkflowValidator()

        # One producer exposing two outputs, plus a downstream consumer of the first.
        producer = create_step_with_fields(
            "step1",
            [],
            [
                OutputField(name="output1", description="test", type="str"),
                OutputField(name="output2", description="test", type="int"),
            ],
        )
        consumer = create_step_with_fields(
            "step2",
            [InputField(name="input", description="test", variable="step1.output1")],
            [OutputField(name="output", description="test", type="str")],
        )

        workflow = create_basic_workflow([producer, consumer])
        workflow.inputs = ["input"]
        # All three references resolve to declared step outputs.
        workflow.outputs = {
            "output1": "step1.output1",
            "output2": "step1.output2",
            "output3": "step2.output",
        }
        assert validator.validate(workflow)
        assert len(validator.errors) == 0

    def test_complex_dependencies(self):
        """A step consuming outputs from two upstream steps still forms a valid DAG."""
        validator = WorkflowValidator()

        # Fan-in shape: step3 reads from both step1 and step2.
        source = create_step_with_fields(
            "step1",
            [InputField(name="input", description="test", variable="external_input")],
            [OutputField(name="output", description="test", type="str")],
        )
        middle = create_step_with_fields(
            "step2",
            [InputField(name="input", description="test", variable="step1.output")],
            [OutputField(name="output", description="test", type="str")],
        )
        sink = create_step_with_fields(
            "step3",
            [
                InputField(name="input1", description="test", variable="step1.output"),
                InputField(name="input2", description="test", variable="step2.output"),
            ],
            [OutputField(name="output", description="test", type="str")],
        )

        workflow = create_basic_workflow([source, middle, sink])
        workflow.inputs = ["external_input"]
        workflow.outputs = {"output": "step3.output"}
        assert validator.validate(workflow)
        assert len(validator.errors) == 0
|
572 |
+
# External Input Tests
class TestExternalInputs:
    def test_external_input_existence(self):
        """Referencing an undeclared external input is a VARIABLE error."""
        validator = WorkflowValidator()

        # The step reads "missing_input", but only "valid_input" is declared.
        step = create_step_with_fields(
            "step1", [InputField(name="input", description="test", variable="missing_input")], []
        )
        workflow = create_basic_workflow([step])
        workflow.inputs = ["valid_input"]
        workflow.outputs = {"output": "step1.output"}
        assert not validator.validate(workflow)
        assert len(validator.errors) == 1
        assert validator.errors[0].error_type == ValidationErrorType.VARIABLE

    def test_external_input_naming_conflicts(self):
        """An external input named like a step output reference is a VARIABLE error."""
        validator = WorkflowValidator()

        # Declaring an input literally named "step1.output" collides with the
        # step's own output reference and must be rejected.
        step = create_step_with_fields("step1", [], [OutputField(name="output", description="test", type="str")])
        workflow = create_basic_workflow([step])
        workflow.inputs = ["step1.output"]  # Conflict with step output
        workflow.outputs = {"output": "step1.output"}
        assert not validator.validate(workflow)
        assert len(validator.errors) == 1
        assert validator.errors[0].error_type == ValidationErrorType.VARIABLE
603 |
+
# Edge Cases
class TestEdgeCases:
    def test_empty_workflow_with_inputs(self):
        """A workflow with inputs but no steps and no outputs is invalid (GENERAL)."""
        validator = WorkflowValidator()
        workflow = Workflow(inputs=["input"], outputs={}, steps={})
        assert not validator.validate(workflow)
        assert len(validator.errors) == 1
        assert validator.errors[0].error_type == ValidationErrorType.GENERAL

    def test_workflow_with_empty_outputs(self):
        """An otherwise valid workflow with an empty outputs mapping is invalid (GENERAL)."""
        validator = WorkflowValidator()
        workflow = create_valid_workflow()
        workflow.outputs = {}  # No outputs declared at all
        assert not validator.validate(workflow)
        assert len(validator.errors) == 1
        assert validator.errors[0].error_type == ValidationErrorType.GENERAL

    def test_workflow_with_none_outputs(self):
        """An output entry mapped to None is invalid (GENERAL)."""
        validator = WorkflowValidator()
        workflow = create_valid_workflow()
        workflow.outputs = {"output": None}  # Output declared but left unmapped
        assert not validator.validate(workflow)
        assert len(validator.errors) == 1
        assert validator.errors[0].error_type == ValidationErrorType.GENERAL

    def test_workflow_with_duplicate_output_names(self):
        """Two output fields sharing one name on a single step is a STEP error."""
        validator = WorkflowValidator()
        # Both OutputFields below are named "output".
        step = create_step_with_fields(
            "step1",
            [],
            [
                OutputField(name="output", description="test", type="str"),
                OutputField(name="output", description="test", type="str"),
            ],
        )
        workflow = create_basic_workflow([step])
        workflow.inputs = ["input"]
        workflow.outputs = {"output": "step1.output"}
        assert not validator.validate(workflow)
        assert len(validator.errors) == 1
        assert validator.errors[0].error_type == ValidationErrorType.STEP
|