"""Gradio demo: evaluate an LLM output against a custom rubric with FlowJudge.

The UI collects task inputs/output, evaluation criteria, and a scoring rubric
in editable dataframes, then runs a FlowJudge ``CustomMetric`` evaluation and
displays the resulting feedback and score.
"""

import gradio as gr
import pandas as pd
from flow_judge import EvalInput, FlowJudge, Vllm
from flow_judge.metrics import CustomMetric, RubricItem

# Initialize the judge model once at import time; fail fast with context if
# the vLLM backend cannot be created (e.g. no GPU / missing weights).
try:
    model = Vllm(quantized=False)
except Exception as e:
    raise RuntimeError(f"Failed to initialize Vllm: {e}") from e

# Pre-baked examples used to populate the UI via the example buttons.
EXAMPLES = [
    {
        "example_description": "Faithfulness of an answer",
        "emoji": "🏈",
        "task_inputs": [
            {"name": "Question", "value": "What is the capital of France?"},
            {"name": "Context", "value": "Paris is the capital of Spain."},
        ],
        "task_output": {
            "name": "Answer",
            "value": "The capital of France is Paris.",
        },
        "evaluation_criteria": (
            "Based on the provided context, does the response contain only "
            "information that is supported by or directly inferable from the "
            "context?"
        ),
        "rubric": [
            "The response contains statements or claims that cannot be "
            "directly found in or logically inferred from the provided "
            "context. There is hallucinated or fabricated information "
            "present in the response that does not have support in the "
            "given context.",
            "The response contains only statements and claims that are "
            "directly stated in or logically inferable from the provided "
            "context. There is no hallucinated or fabricated information "
            "present in the response that cannot be traced back to or "
            "deduced from the context.",
        ],
    }
]


def populate_fields(example_index: int):
    """Return UI values (inputs, output, criteria, rubric) for an example.

    Args:
        example_index: Index into ``EXAMPLES``.

    Returns:
        A 4-tuple of values matching the dataframe/textbox components:
        task inputs rows, task output row, criteria string, rubric rows.
    """
    example = EXAMPLES[example_index]
    return (
        # NOTE: avoid shadowing the builtin ``input`` in the comprehension.
        [[item["name"], item["value"]] for item in example["task_inputs"]],
        [[example["task_output"]["name"], example["task_output"]["value"]]],
        example["evaluation_criteria"],
        # Rubric scores are the 0-based position of each description.
        [[str(i), description] for i, description in enumerate(example["rubric"])],
    )


def evaluate(
    task_inputs: pd.DataFrame,
    task_output: pd.DataFrame,
    evaluation_criteria: str,
    rubric: pd.DataFrame,
) -> tuple:
    """Run a FlowJudge evaluation on the current UI state.

    Args:
        task_inputs: Two-column (Name, Value) dataframe of task inputs.
        task_output: Two-column (Name, Value) dataframe with one output row.
        evaluation_criteria: Free-text criteria for the custom metric.
        rubric: Two-column (Score, Description) dataframe; scores must parse
            as integers.

    Returns:
        ``(feedback, score)`` extracted from the FlowJudge result.

    Raises:
        RuntimeError: If the judge fails during evaluation.
    """
    # Convert the UI dataframes to the structure EvalInput expects.
    eval_input = EvalInput(
        inputs=[{row["Name"]: row["Value"]} for _, row in task_inputs.iterrows()],
        output={row["Name"]: row["Value"] for _, row in task_output.iterrows()},
    )

    # Parse the rubric rows into RubricItems (Score column is text in the UI).
    rubric_items = [
        RubricItem(score=int(row["Score"]), description=row["Description"])
        for _, row in rubric.iterrows()
    ]

    # Build the metric from the UI state; required names mirror the tables.
    custom_metric = CustomMetric(
        name="custom-metric",
        criteria=evaluation_criteria,
        rubric=rubric_items,
        required_inputs=[input_row["Name"] for _, input_row in task_inputs.iterrows()],
        required_output=task_output.iloc[0]["Name"],
    )

    judge = FlowJudge(model=model, metric=custom_metric)

    try:
        result = judge.evaluate(eval_input)
    except Exception as e:
        raise RuntimeError(f"Failed to evaluate: {e}") from e

    return result.feedback, result.score


def reset_fields():
    """Return blank values for every editable component on the page."""
    return (
        [["", ""]],  # task_inputs
        [["", ""]],  # task_output
        "",          # evaluation_criteria
        [["", ""]],  # rubric
        "",          # feedback
        "",          # score
    )


def reset_task():
    """Return blank values for the task input/output dataframes."""
    return (
        [["", ""]],  # task_inputs
        [["", ""]],  # task_output
    )


def reset_evaluation_criteria():
    """Return blank values for the criteria textbox and rubric dataframe."""
    return (
        "",          # evaluation_criteria
        [["", ""]],  # rubric
    )


with gr.Blocks() as demo:
    # One button per canned example; clicking it fills in all fields.
    with gr.Row():
        example_buttons = [
            gr.Button(f"{example['emoji']} Example {i + 1}")
            for i, example in enumerate(EXAMPLES)
        ]

    with gr.Row(equal_height=False):
        with gr.Column(scale=1):
            gr.Markdown("**Inputs**")
            task_inputs = gr.Dataframe(
                headers=["Name", "Value"],
                col_count=(2, "fixed"),
                datatype=["str", "str"],
                row_count=1,
                column_widths=["30%", "70%"],
            )
            add_input_btn = gr.Button("Add Input")

            gr.Markdown("**Output**")
            task_output = gr.Dataframe(
                headers=["Name", "Value"],
                col_count=(2, "fixed"),
                datatype=["str", "str"],
                row_count=1,
                column_widths=["30%", "70%"],
            )
            reset_task_btn = gr.Button("Clear Inputs and Output")

        with gr.Column(scale=1):
            gr.Markdown("**Evaluation criteria and rubric**")
            evaluation_criteria = gr.Textbox(label="Evaluation criteria")
            rubric = gr.Dataframe(
                headers=["Score", "Description"],
                col_count=(2, "fixed"),
                datatype=["str", "str"],
                row_count=1,
                column_widths=["10%", "90%"],
            )
            add_score_btn = gr.Button("Add Score")
            reset_criteria_btn = gr.Button("Clear Evaluation Criteria")

    with gr.Row():
        with gr.Column(scale=1, variant="compact"):
            gr.Markdown("**Evaluation**")
            feedback = gr.Textbox(label="Feedback")
            score = gr.Textbox(label="Score")
            evaluate_btn = gr.Button("Evaluate")

    with gr.Row():
        reset_all_btn = gr.Button("Clear All")

    # Append an empty row to the inputs table.
    add_input_btn.click(
        lambda df: gr.Dataframe(
            value=df.values.tolist() + [["", ""]],
            headers=["Name", "Value"],
            col_count=(2, "fixed"),
            datatype=["str", "str"],
            row_count=1,
            column_widths=["30%", "70%"],
        ),
        inputs=task_inputs,
        outputs=task_inputs,
    )

    # Append an empty row to the rubric table.
    add_score_btn.click(
        lambda df: gr.Dataframe(
            value=df.values.tolist() + [["", ""]],
            headers=["Score", "Description"],
            col_count=(2, "fixed"),
            datatype=["str", "str"],
            row_count=1,
            column_widths=["10%", "90%"],
        ),
        inputs=rubric,
        outputs=rubric,
    )

    # Wire each example button; gr.State(i) captures the index eagerly,
    # avoiding the late-binding-closure pitfall.
    for i, button in enumerate(example_buttons):
        button.click(
            populate_fields,
            inputs=[gr.State(i)],
            outputs=[task_inputs, task_output, evaluation_criteria, rubric],
        )

    evaluate_btn.click(
        evaluate,
        inputs=[task_inputs, task_output, evaluation_criteria, rubric],
        outputs=[feedback, score],
    )

    reset_task_btn.click(
        reset_task,
        inputs=[],
        outputs=[task_inputs, task_output],
    )

    reset_criteria_btn.click(
        reset_evaluation_criteria,
        inputs=[],
        outputs=[evaluation_criteria, rubric],
    )

    reset_all_btn.click(
        reset_fields,
        inputs=[],
        outputs=[task_inputs, task_output, evaluation_criteria, rubric, feedback, score],
    )


if __name__ == "__main__":
    demo.launch(debug=True)