Spaces:

flowaicom
/

Flow-Judge-v0.1

Running

App Files Files Community

bergr7f committed on Oct 10, 2024

Commit

31fda98

1 Parent(s): e871e90

Add WIP application file and dependencies

Browse files

Files changed (2) hide show

app.py +204 -0
requirements.txt +3 -0

app.py ADDED Viewed

	@@ -0,0 +1,204 @@

+import gradio as gr
+import pandas as pd
+from typing import List, Dict
+from flow_judge import Vllm, FlowJudge, EvalInput
+from flow_judge.metrics import CustomMetric, RubricItem
+try:
+    model = Vllm(quantized=False)
+except Exception as e:
+    raise RuntimeError(f"Failed to initialize Vllm: {e}")
+EXAMPLES = [
+    {
+        "example_description": "Faithfulness of a answer",
+        "emoji": "🏈",
+        "task_inputs": [{"name": "Question", "value": "What is the capital of France?"}, {"name": "Context", "value": "Paris is the capital of Spain."}],
+        "task_output": {"name": "Answer", "value": "The capital of France is Paris."},
+        "evaluation_criteria": "Based on the provided context, does the response contain only information that is supported by or \
+directly inferable from the context?",
+        "rubric": ['The response contains statements or claims that cannot be directly found in or logically inferred \
+from the provided context. There is hallucinated or fabricated information present in the response \
+that does not have support in the given context.', 'The response contains only statements and claims that are directly stated in or logically \
+inferable from the provided context. There is no hallucinated or fabricated information present in \
+the response that cannot be traced back to or deduced from the context.']
+    }
+]
+def populate_fields(example_index: int):
+    example = EXAMPLES[example_index]
+    return (
+        [[input["name"], input["value"]] for input in example["task_inputs"]],
+        [[example["task_output"]["name"], example["task_output"]["value"]]],
+        example["evaluation_criteria"],
+        [[str(i), description] for i, description in enumerate(example["rubric"])]
+    )
+def evaluate(task_inputs: pd.DataFrame, task_output: pd.DataFrame, evaluation_criteria: str, rubric: pd.DataFrame) -> tuple:
+    # Convert inputs to the expected format
+    eval_input = EvalInput(
+        inputs=[{row['Name']: row['Value']} for _, row in task_inputs.iterrows()],
+        output={row['Name']: row['Value'] for _, row in task_output.iterrows()}
+    )
+    # Parse the rubric into RubricItems
+    rubric_items = [
+        RubricItem(score=int(row['Score']), description=row['Description'])
+        for _, row in rubric.iterrows()
+    ]
+    # Create the CustomMetric
+    custom_metric = CustomMetric(
+        name="custom-metric",
+        criteria=evaluation_criteria,
+        rubric=rubric_items,
+        required_inputs=[input_row['Name'] for _, input_row in task_inputs.iterrows()],
+        required_output=task_output.iloc[0]['Name']
+    )
+    # Create a FlowJudge instance
+    judge = FlowJudge(model=model, metric=custom_metric)
+    # Evaluate using FlowJudge
+    try:
+        result = judge.evaluate(eval_input)
+    except Exception as e:
+        raise RuntimeError(f"Failed to evaluate: {e}")
+    # Extract feedback and score from the result
+    feedback = result.feedback
+    score = result.score
+    return feedback, score
+def reset_fields():
+    return (
+        [["", ""]],  # task_inputs
+        [["", ""]],  # task_output
+        "",          # evaluation_criteria
+        [["", ""]],  # rubric
+        "",          # feedback
+        ""           # score
+    )
+def reset_task():
+    return (
+        [["", ""]],  # task_inputs
+        [["", ""]]   # task_output
+    )
+def reset_evaluation_criteria():
+    return (
+        "",          # evaluation_criteria
+        [["", ""]]   # rubric
+    )
+# Build the UI: a row of example buttons, editable tables for the task and the
+# rubric, text boxes for the evaluation result, and the click handlers that
+# connect the buttons to the functions defined above.
+with gr.Blocks() as demo:
+    with gr.Row():
+        # One button per entry in EXAMPLES, labelled with its emoji and a
+        # 1-based number.
+        example_buttons = [gr.Button(f"{example['emoji']} Example {i+1}") for i, example in enumerate(EXAMPLES)]
+    with gr.Row(equal_height=False):
+        # Left column: the task being judged (inputs and output).
+        with gr.Column(scale=1):
+            gr.Markdown("**Inputs**")
+            task_inputs = gr.Dataframe(
+                headers=["Name", "Value"],
+                col_count=(2, "fixed"),
+                datatype=["str", "str"],
+                row_count=1,
+                column_widths=["30%", "70%"]
+            )
+            add_input_btn = gr.Button("Add Input")
+            gr.Markdown("**Output**")
+            task_output = gr.Dataframe(
+                headers=["Name", "Value"],
+                col_count=(2, "fixed"),
+                datatype=["str", "str"],
+                row_count=1,
+                column_widths=["30%", "70%"]
+            )
+            reset_task_btn = gr.Button("Clear Inputs and Output")
+        # Right column: how the output should be judged (criteria and rubric).
+        with gr.Column(scale=1):
+            gr.Markdown("**Evaluation criteria and rubric**")
+            evaluation_criteria = gr.Textbox(label="Evaluation criteria")
+            rubric = gr.Dataframe(
+                headers=["Score", "Description"],
+                col_count=(2, "fixed"),
+                datatype=["str", "str"],
+                row_count=1,
+                column_widths=["10%", "90%"]
+            )
+            add_score_btn = gr.Button("Add Score")
+            reset_criteria_btn = gr.Button("Clear Evaluation Criteria")
+    with gr.Row():
+        # Result area: filled by the evaluate() handler below.
+        with gr.Column(scale=1, variant="compact"):
+            gr.Markdown("**Evaluation**")
+            feedback = gr.Textbox(label="Feedback")
+            score = gr.Textbox(label="Score")
+            evaluate_btn = gr.Button("Evaluate")
+    with gr.Row():
+        # Add the reset buttons
+        reset_all_btn = gr.Button("Clear All")
+    # Event handlers
+    # Append one blank row to the inputs table; the table settings are passed
+    # again together with the new value.
+    add_input_btn.click(
+        lambda df: gr.Dataframe(value=df.values.tolist() + [["", ""]],
+                                headers=["Name", "Value"],
+                                col_count=(2, "fixed"),
+                                datatype=["str", "str"],
+                                row_count=1,
+                                column_widths=["30%", "70%"]),
+        inputs=task_inputs,
+        outputs=task_inputs
+    )
+    # Same as above, for the rubric table.
+    add_score_btn.click(
+        lambda df: gr.Dataframe(value=df.values.tolist() + [["", ""]],
+                                headers=["Score", "Description"],
+                                col_count=(2, "fixed"),
+                                datatype=["str", "str"],
+                                row_count=1,
+                                column_widths=["10%", "90%"]),
+        inputs=rubric,
+        outputs=rubric
+    )
+    # Each example button fills the four editable components from its example.
+    for i, button in enumerate(example_buttons):
+        button.click(
+            populate_fields,
+            inputs=[gr.State(i)],  # Pass the example index as a state
+            outputs=[task_inputs, task_output, evaluation_criteria, rubric]
+        )
+    # Run the judge on the current form contents and show feedback and score.
+    evaluate_btn.click(
+        evaluate,
+        inputs=[task_inputs, task_output, evaluation_criteria, rubric],
+        outputs=[feedback, score]
+    )
+    # Clear handlers: the order of each outputs list matches the order of the
+    # tuple returned by the corresponding reset function.
+    reset_task_btn.click(
+        reset_task,
+        inputs=[],
+        outputs=[task_inputs, task_output]
+    )
+    reset_criteria_btn.click(
+        reset_evaluation_criteria,
+        inputs=[],
+        outputs=[evaluation_criteria, rubric]
+    )
+    reset_all_btn.click(
+        reset_fields,
+        inputs=[],
+        outputs=[task_inputs, task_output, evaluation_criteria, rubric, feedback, score]
+    )
+# Launch the app only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    demo.launch(debug=True)

requirements.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+vllm-flash-attn==2.6.2
+flow-judge[vllm]==0.1.0
+flash_attn>=2.6.3