import gradio as gr
import pandas as pd
import json
import os
from lm_eval import tasks, evaluator
from datetime import datetime
from huggingface_hub import HfApi
import plotly.express as px


class LeaderboardSpace:
    """Manages benchmark results stored in a Hugging Face Space."""

    def __init__(self, space_name="ozayezerceli/PoCLeaderboard"):
        self.space_name = space_name
        self.results_dir = "benchmark_results"
        self.leaderboard_file = os.path.join(self.results_dir, "leaderboard.json")
        os.makedirs(self.results_dir, exist_ok=True)
        self.load_leaderboard()
        self.api = HfApi()

    def load_leaderboard(self):
        if os.path.exists(self.leaderboard_file):
            with open(self.leaderboard_file, "r") as f:
                self.leaderboard = json.load(f)
        else:
            self.leaderboard = {"models": [], "results": {}}

    def save_leaderboard(self):
        with open(self.leaderboard_file, "w") as f:
            json.dump(self.leaderboard, f, indent=2)
        # Push updated leaderboard to Space
        self.api.upload_file(
            path_or_fileobj=self.leaderboard_file,
            path_in_repo=self.leaderboard_file,
            repo_id=self.space_name,
            repo_type="space",
        )
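
    # Assumed on-disk layout of leaderboard.json (inferred from the reader/writer
    # above and from get_leaderboard_df below):
    #   {
    #     "models": ["model-a", ...],
    #     "results": {
    #       "model-a": {
    #         "timestamp": "2024-01-01T00:00:00",
    #         "model_id": "org/model-a",
    #         "scores": {"task_name": {"metric": 0.123, ...}, ...}
    #       }
    #     }
    #   }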

    def get_leaderboard_df(self):
        if not self.leaderboard["models"]:
            return pd.DataFrame()
        data = []
        for model in self.leaderboard["models"]:
            result = self.leaderboard["results"][model]
            row = {"Model": model, "Timestamp": result["timestamp"]}
            for task, scores in result["scores"].items():
                for metric, value in scores.items():
                    # Skip non-numeric entries (some lm-eval versions include
                    # aliases or other strings alongside the scores)
                    if isinstance(value, (int, float)):
                        row[f"{task}_{metric}"] = round(value * 100, 2)
            data.append(row)
        return pd.DataFrame(data)

    def create_leaderboard_plot(self):
        df = self.get_leaderboard_df()
        if df.empty:
            return None
        # Melt the DataFrame to create a format suitable for plotting
        metrics_cols = [col for col in df.columns if col not in ["Model", "Timestamp"]]
        df_melted = df.melt(
            id_vars=["Model"],
            value_vars=metrics_cols,
            var_name="Metric",
            value_name="Score",
        )
        # Create a grouped bar plot
        fig = px.bar(
            df_melted,
            x="Model",
            y="Score",
            color="Metric",
            title="Model Performance Across Tasks",
            barmode="group",
        )
        fig.update_layout(
            yaxis_title="Score (%)",
            xaxis_title="Model",
            legend_title="Metric",
        )
        return fig


def create_interface():
    space = LeaderboardSpace()

    with gr.Blocks() as demo:
        gr.Markdown("# 🏆 Model Evaluation Leaderboard")

        with gr.Tab("Leaderboard"):
            with gr.Row():
                leaderboard_plot = gr.Plot()
            with gr.Row():
                leaderboard_table = gr.DataFrame()

        with gr.Tab("Submit Evaluation"):
            with gr.Row():
                with gr.Column():
                    model_name = gr.Textbox(label="Model Name")
                    model_id = gr.Textbox(label="Hugging Face Model ID")
                    # Task selection (registry attribute assumes lm-eval 0.3.x;
                    # newer versions expose the task list via TaskManager instead)
                    available_tasks = tasks.ALL_TASKS
                    task_selection = gr.CheckboxGroup(
                        choices=available_tasks,
                        label="Select Tasks",
                    )
                    submit_btn = gr.Button("Submit Evaluation")
            with gr.Row():
                evaluation_status = gr.Textbox(
                    label="Evaluation Status",
                    interactive=False,
                )

        with gr.Tab("Custom Tasks"):
            with gr.Row():
                with gr.Column():
                    task_name = gr.Textbox(label="Task Name")
                    task_description = gr.Textbox(
                        label="Task Description",
                        lines=3,
                    )
                    example_file = gr.File(
                        label="Upload Examples (JSON)",
                        file_types=[".json"],
                    )
                    submit_task_btn = gr.Button("Submit Custom Task")
            with gr.Row():
                task_status = gr.Textbox(
                    label="Task Status",
                    interactive=False,
                )

        # Define update functions
        def update_leaderboard():
            df = space.get_leaderboard_df()
            plot = space.create_leaderboard_plot()
            return df, plot

        def submit_evaluation(model_name, model_id, selected_tasks):
            try:
                # Run the evaluation; lm-eval expects a model type plus model_args
                # ("hf-causal" assumes lm-eval 0.3.x — adjust for the installed version)
                results = evaluator.simple_evaluate(
                    model="hf-causal",
                    model_args=f"pretrained={model_id}",
                    tasks=selected_tasks,
                    num_fewshot=0,
                    batch_size=1,
                )
                # Update leaderboard with the per-task metrics returned under "results"
                if model_name not in space.leaderboard["models"]:
                    space.leaderboard["models"].append(model_name)
                space.leaderboard["results"][model_name] = {
                    "timestamp": datetime.now().isoformat(),
                    "model_id": model_id,
                    "scores": results["results"],
                }
                space.save_leaderboard()
                return "Evaluation completed successfully!", *update_leaderboard()
            except Exception as e:
                return f"Error during evaluation: {str(e)}", None, None

        def submit_custom_task(task_name, description, file):
            try:
                # Load and validate task data
                with open(file.name) as f:
                    task_data = json.load(f)
                # Save task configuration
                task_config = {
                    "name": task_name,
                    "description": description,
                    "data": task_data,
                }
                task_file = os.path.join(space.results_dir, f"task_{task_name}.json")
                with open(task_file, "w") as f:
                    json.dump(task_config, f, indent=2)
                # Upload to Space
                space.api.upload_file(
                    path_or_fileobj=task_file,
                    path_in_repo=task_file,
                    repo_id=space.space_name,
                    repo_type="space",
                )
                return "Custom task added successfully!"
            except Exception as e:
                return f"Error adding custom task: {str(e)}"
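
        # Example of an uploaded examples file (hypothetical format — the handler
        # above stores the JSON verbatim and does not enforce a schema):
        #   [
        #     {"input": "Translate 'hello' to French.", "target": "bonjour"},
        #     {"input": "2 + 2 = ?", "target": "4"}
        #   ]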

        # Connect components
        submit_btn.click(
            submit_evaluation,
            inputs=[model_name, model_id, task_selection],
            outputs=[evaluation_status, leaderboard_table, leaderboard_plot],
        )
        submit_task_btn.click(
            submit_custom_task,
            inputs=[task_name, task_description, example_file],
            outputs=[task_status],
        )

        # Initial loading of leaderboard
        demo.load(
            update_leaderboard,
            outputs=[leaderboard_table, leaderboard_plot],
        )

    return demo


# Launch the interface
if __name__ == "__main__":
    demo = create_interface()
    demo.launch()