# Hugging Face Spaces page header captured with this file (Space status: Sleeping).
import gradio as gr | |
import spaces | |
import pandas as pd | |
from typing import List, Dict, Tuple | |
from flow_judge import Hf, FlowJudge, EvalInput | |
from flow_judge.metrics import CustomMetric, RubricItem | |
from huggingface_hub import snapshot_download | |
from flow_judge.models.huggingface import Hf | |
from examples import get_examples | |
# Hugging Face Hub repository id of the judge model used by this demo.
MODEL_NAME = "flowaicom/Flow-Judge-v0.1"


def download_model():
    """Download the ``MODEL_NAME`` snapshot from the Hugging Face Hub.

    Progress is reported on stdout before and after the download.

    Returns:
        True when ``snapshot_download`` completes without raising.

    Raises:
        RuntimeError: If ``snapshot_download`` raises; the original exception
            is attached as the cause.
    """
    print(f"Downloading model {MODEL_NAME}...")
    try:
        # Only the download itself is guarded so unrelated failures are not
        # reported as download errors.
        snapshot_download(repo_id=MODEL_NAME)
    except Exception as e:
        # Chain explicitly so the traceback shows the underlying hub error.
        raise RuntimeError(f"Failed to download model {MODEL_NAME}: {e}") from e
    print(f"Model {MODEL_NAME} downloaded to default Hugging Face cache")
    return True
def evaluate(
    inputs_task: List[Dict[str, str]],
    output_name: str,
    output_value: str,
    evaluation_criteria: str,
    rubric_items: List[Dict[str, str]]
) -> Tuple[str, int]:
    """Judge one task output against user-defined criteria and rubric.

    Args:
        inputs_task: Task inputs, each a dict shaped like
            ``{'name': 'a', 'value': 'a'}``. May be empty.
        output_name: Name of the output being evaluated.
        output_value: Text of the output being evaluated.
        evaluation_criteria: Criteria text passed to the custom metric.
        rubric_items: Rubric rows shaped like ``{'name': <score>, 'value':
            <description>}``; ``name`` must be convertible with ``int()``.

    Returns:
        ``(feedback, score)`` taken from the result of ``FlowJudge.evaluate``.

    Raises:
        RuntimeError: If the ``Hf`` model cannot be initialised or the
            evaluation call fails (original exception attached as cause).
        ValueError: If a rubric score is not an integer.
    """
    # NOTE(review): this file imports `spaces` but never uses it; if the Space
    # runs on ZeroGPU hardware this function presumably needs the
    # `@spaces.GPU` decorator -- confirm against the deployment.

    # A new Hf model wrapper is created on every call.
    try:
        model = Hf(flash_attn=False)
    except Exception as e:
        raise RuntimeError(f"Failed to initialize Hf Model: {e}") from e

    # EvalInput takes one single-key dict per input: {name: value}.
    eval_input = EvalInput(
        inputs=[
            {task_input['name']: task_input['value']}
            for task_input in inputs_task
        ],
        output={output_name: output_value},
    )

    score_rubric_items = []
    for rubric_item in rubric_items:
        # Scores arrive as free text, so report a non-numeric value with a
        # message that names the offending entry.
        try:
            rubric_score = int(rubric_item['name'])
        except ValueError as e:
            raise ValueError(
                f"Rubric score must be an integer, got {rubric_item['name']!r}"
            ) from e
        score_rubric_items.append(
            RubricItem(score=rubric_score, description=rubric_item['value'])
        )

    custom_metric = CustomMetric(
        name="custom-metric",
        criteria=evaluation_criteria,
        rubric=score_rubric_items,
        required_inputs=[task_input['name'] for task_input in inputs_task],
        required_output=output_name,
    )

    judge = FlowJudge(model=model, metric=custom_metric)
    try:
        result = judge.evaluate(eval_input)
    except Exception as e:
        raise RuntimeError(f"Failed to evaluate: {e}") from e

    return result.feedback, result.score
def reset_all():
    """Return the cleared values for every field wired to "Clear All".

    The tuple has eleven entries: a fresh empty list plus two empty strings
    for the inputs section, the same triple for the rubric section, and five
    empty strings for the remaining text fields.
    """
    inputs_section = ([], "", "")   # state list, name box, value box
    rubric_section = ([], "", "")   # state list, score box, description box
    other_fields = ("",) * 5        # additional text fields
    return inputs_section + rubric_section + other_fields
# Preset examples; each one backs a quickstart button in the UI.
EXAMPLES = get_examples()

# Banner image shown beside the header.
IMAGE_PATH = "./img/flow_judge_banner.png"

# HTML header: title, project links and a one-line description of the library.
HEADER = """<h1 align="center" style="font-family: 'Courier New', Courier, monospace;">Flow Judge Demo</h1>
<p align="center" style="font-family: 'Courier New', Courier, monospace;">
<strong>
<a href="https://www.flow-ai.com/judge">Technical Report</a>
<a href="https://huggingface.co/collections/flowaicom/flow-judge-v01-66e6af5fc3b3a128bde07dec">Model Weights</a>
<a href="https://github.com/flowaicom/lm-evaluation-harness/tree/Flow-Judge-v0.1_evals/lm_eval/tasks/flow_judge_evals">Evaluation Code</a>
<a href="https://github.com/flowaicom/flow-judge/tree/main/examples">Tutorials</a>
</strong>
</p>
<p align="center" style="font-family: 'Courier New', Courier, monospace;">
<code>flow-judge</code> is a lightweight library for evaluating LLM applications with <code>Flow-Judge-v0.1</code>.
</p>"""
# Build the Gradio UI and wire up its event handlers.
# NOTE(review): the nesting below was reconstructed from a copy of this file
# whose indentation had been stripped -- confirm the layout against the app.
with gr.Blocks() as demo:
    # download_model() raises RuntimeError on failure, so a failed download
    # stops the app while the UI is still being constructed.
    model_downloaded = download_model()

    # --- Banner and header -------------------------------------------------
    with gr.Row(equal_height=False):
        with gr.Column(scale=2):
            gr.Image(
                IMAGE_PATH,
                show_label=False,
                interactive=False,
                show_share_button=False,
                show_fullscreen_button=False,
                show_download_button=False,
            )
        with gr.Column(scale=3):
            gr.HTML(HEADER)

    # --- Quickstart presets, split across three columns --------------------
    gr.Markdown("# ⚡ **Quickstart Examples**")
    with gr.Row():
        with gr.Column(scale=1):
            preset_buttons = [gr.Button(example["description"]) for example in EXAMPLES[:len(EXAMPLES)//3]]
        with gr.Column(scale=1):
            preset_buttons += [gr.Button(example["description"]) for example in EXAMPLES[len(EXAMPLES)//3:2*len(EXAMPLES)//3]]
        with gr.Column(scale=1):
            preset_buttons += [gr.Button(example["description"]) for example in EXAMPLES[2*len(EXAMPLES)//3:]]

    with gr.Row(equal_height=False):
        # --- Task inputs ---------------------------------------------------
        with gr.Column(scale=1):
            gr.Markdown("## **Evaluation task inputs**")
            gr.Markdown("*<span style='color: gray;'>Define the input names and values. Inputs are optional if evaluation depends on the output only.</span>*")
            with gr.Group():
                inputs_task = gr.State([])
                new_input_name = gr.Textbox(label="Name")
                new_input_value = gr.Textbox(label="Value")

                def add_input(inputs_task, new_input_name, new_input_value):
                    """Append the new input to the list and clear both textboxes."""
                    return inputs_task + [{"name": new_input_name, "value": new_input_value}], "", ""

                # You have to pass the state here: registering the function
                # with gr.render makes Gradio re-run it when the state changes.
                # Without the decorator the function is never invoked and the
                # added inputs are never displayed.
                @gr.render(inputs=inputs_task)
                def render_inputs(inputs_list):  # Use different name than the state variable
                    """Draw one read-only row with a delete button per stored input."""
                    for task_input in inputs_list:
                        with gr.Group():
                            with gr.Row(equal_height=True):
                                with gr.Column(min_width=60, scale=2):
                                    gr.Textbox(task_input['name'], label="Name", show_label=True, interactive=False, autoscroll=False, max_lines=1)
                                with gr.Column(scale=8):
                                    gr.Textbox(task_input['value'], label="Value", show_label=True, interactive=False, autoscroll=False, max_lines=3)
                                with gr.Column(min_width=15, scale=1):
                                    delete_btn = gr.Button("X", size="lg", variant="secondary")

                                    # Bind the current item as a default so each
                                    # button removes its own row (no late binding).
                                    def delete(task_input=task_input):
                                        inputs_list.remove(task_input)
                                        return inputs_list

                                    delete_btn.click(delete, None, [inputs_task])  # This is the state variable

                gr.Button("Add Input").click(
                    add_input,
                    [inputs_task, new_input_name, new_input_value],
                    [inputs_task, new_input_name, new_input_value]
                )

        # --- Task output ---------------------------------------------------
        with gr.Column(scale=1):
            gr.Markdown("## **Evaluation task output**")
            gr.Markdown("*<span style='color: gray;'>Define the output name and value. Output is always required.</span>*")
            with gr.Group():
                with gr.Row(equal_height=True):
                    with gr.Column(min_width=60, scale=2):
                        output_name = gr.Textbox(label="Name", show_label=True, interactive=True, autoscroll=False, max_lines=1)
                    with gr.Column(scale=9):
                        output_value = gr.Textbox(label="Value", show_label=True, interactive=True, autoscroll=False, max_lines=3)

        # --- Criteria and rubric -------------------------------------------
        with gr.Column(scale=1):
            gr.Markdown("## **Evaluation criteria and rubric**")
            gr.Markdown("*<span style='color: gray;'>Define the evaluation criteria and rubric for the evaluation task. Supported scoring scales: Binary (0 and 1), 3-Likert and 5-Likert.</span>*\n\n*<span style='color: gray;'>❗You can experiment with other scoring scales. However, performance may vary.</span>*")
            with gr.Row():
                with gr.Column(scale=1):
                    rubric_items = gr.State([])
                    new_rubric_name = gr.Textbox(label="Score", show_label=True, interactive=True, autoscroll=False, max_lines=1)
                    new_rubric_value = gr.Textbox(label="Description", show_label=True, interactive=True, autoscroll=False, max_lines=3)

                    def add_rubric_item(rubric_items, new_rubric_name, new_rubric_value):
                        """Append the new rubric row to the list and clear both textboxes."""
                        return rubric_items + [{"name": new_rubric_name, "value": new_rubric_value}], "", ""

                    # You have to pass the state here (see render_inputs).
                    @gr.render(inputs=rubric_items)
                    def render_rubrics(rubric_items_list):  # Use different name than the state variable
                        """Draw one read-only row with a delete button per rubric item."""
                        for rubric_item in rubric_items_list:
                            with gr.Group():
                                with gr.Row(equal_height=True):
                                    with gr.Column(min_width=30, scale=1):
                                        gr.Textbox(rubric_item['name'], label="Score", show_label=True, interactive=False)
                                    with gr.Column(scale=8):
                                        gr.Textbox(rubric_item['value'], label="Description", show_label=True, interactive=False)
                                    with gr.Column(min_width=15, scale=1):
                                        delete_btn = gr.Button("X", size="lg", variant="secondary")

                                        # Default argument pins this row's item.
                                        def delete(rubric_item=rubric_item):
                                            rubric_items_list.remove(rubric_item)
                                            return rubric_items_list

                                        delete_btn.click(delete, None, [rubric_items])  # This is the state variable

                    gr.Button("Add Rubric Item").click(
                        add_rubric_item,
                        [rubric_items, new_rubric_name, new_rubric_value],
                        [rubric_items, new_rubric_name, new_rubric_value]
                    )
                with gr.Column(scale=1):
                    evaluation_criteria = gr.Textbox(label="Evaluation criteria")

    # --- Evaluation result and actions --------------------------------------
    with gr.Row():
        with gr.Column(scale=1, variant="panel"):
            gr.Markdown("# **Evaluation**")
            with gr.Group():
                with gr.Row(equal_height=True):
                    with gr.Column(min_width=15, scale=1):
                        score = gr.Textbox(label="Score", interactive=False, autoscroll=False, max_lines=1)
                    with gr.Column(scale=5):
                        feedback = gr.Textbox(label="Feedback", interactive=False, autoscroll=False, max_lines=6)
                    with gr.Column(min_width=15, scale=1):
                        evaluate_btn = gr.Button("Evaluate", variant="primary")
                        reset_all_btn = gr.Button("Clear All", variant="stop")  # Add Reset All button

    # The output order must match the order of the values reset_all() returns.
    reset_all_btn.click(
        reset_all,
        inputs=[],
        outputs=[
            inputs_task,
            new_input_name,
            new_input_value,
            rubric_items,
            new_rubric_name,
            new_rubric_value,
            evaluation_criteria,  # Reset evaluation criteria
            output_name,  # Reset output name
            output_value,  # Reset output value
            feedback,  # Reset feedback
            score  # Reset score
        ]
    )

    # evaluate() returns (feedback, score), hence this output order.
    evaluate_btn.click(
        evaluate,
        inputs=[inputs_task, output_name, output_value, evaluation_criteria, rubric_items],
        outputs=[feedback, score]
    )

    for i, button in enumerate(preset_buttons):
        # Default argument captures the index of this button's example.
        def populate_preset(ex_i=i):
            return populate_fields(ex_i)

        button.click(
            populate_preset,  # Use the closure to pass the current index
            inputs=[],  # No direct inputs needed
            outputs=[
                inputs_task,
                output_name,
                output_value,
                evaluation_criteria,
                rubric_items,
                feedback,  # Add feedback to be reset
                score  # Add score to be reset
            ]
        )
def populate_fields(example_index: int):
    """Return the UI field values for the preset at ``example_index``.

    The tuple holds, in order: the preset's task inputs, output name, output
    value, evaluation criteria and rubric, followed by two empty strings that
    clear the feedback and score fields.
    """
    preset = EXAMPLES[example_index]
    task_inputs = preset["inputs_task"]
    preset_output = preset["output"]
    out_name = preset_output["name"]
    out_value = preset_output["value"]
    criteria = preset["evaluation_criteria"]
    rubric = preset["rubric"]
    cleared_feedback = ""
    cleared_score = ""
    return (
        task_inputs,
        out_name,
        out_value,
        criteria,
        rubric,
        cleared_feedback,
        cleared_score,
    )
# Start the Gradio server only when this file is executed as a script, so that
# importing the module does not launch the app as a side effect.
if __name__ == "__main__":
    demo.launch(debug=True)