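"""Gradio Space for PoCLeaderboard: runs lm-eval benchmarks on submitted
Hugging Face models and keeps a JSON-backed leaderboard with a Plotly chart."""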
import gradio as gr
import pandas as pd
import yaml
import json
import os
from lm_eval import tasks, evaluator
from datetime import datetime
from huggingface_hub import HfApi
import plotly.express as px
class LeaderboardSpace:
def __init__(self, space_name="ozayezerceli/PoCLeaderboard"):
self.space_name = space_name
self.results_dir = "benchmark_results"
self.leaderboard_file = os.path.join(self.results_dir, "leaderboard.json")
os.makedirs(self.results_dir, exist_ok=True)
self.load_leaderboard()
self.api = HfApi()
def load_leaderboard(self):
if os.path.exists(self.leaderboard_file):
with open(self.leaderboard_file, 'r') as f:
self.leaderboard = json.load(f)
else:
self.leaderboard = {"models": [], "results": {}}
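    # leaderboard.json layout, as read/written in this class:
    # {
    #   "models": ["<display name>", ...],
    #   "results": {
    #     "<display name>": {
    #       "timestamp": "<ISO 8601>",
    #       "model_id": "<hf repo id>",
    #       "scores": {"<task>": {"<metric>": <float>}}
    #     }
    #   }
    # }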
def save_leaderboard(self):
with open(self.leaderboard_file, 'w') as f:
json.dump(self.leaderboard, f, indent=2)
        # Push the updated leaderboard back to the Space repo
        # (needs a token with write access, e.g. HF_TOKEN set on the Space).
self.api.upload_file(
path_or_fileobj=self.leaderboard_file,
path_in_repo=self.leaderboard_file,
repo_id=self.space_name,
repo_type="space"
)
def get_leaderboard_df(self):
if not self.leaderboard["models"]:
return pd.DataFrame()
data = []
for model in self.leaderboard["models"]:
result = self.leaderboard["results"][model]
row = {"Model": model, "Timestamp": result["timestamp"]}
            for task, scores in result["scores"].items():
                for metric, value in scores.items():
                    # Keep numeric metrics only; newer lm-eval results also
                    # carry string fields (e.g. "alias") we don't want to plot.
                    if isinstance(value, (int, float)):
                        row[f"{task}_{metric}"] = round(value * 100, 2)
            data.append(row)
return pd.DataFrame(data)
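    # Each entry flattens to one row per model, e.g.
    # {"Model": "my-model", "Timestamp": "2024-01-01T00:00:00", "hellaswag_acc": 57.23}
    # (column names depend on which tasks/metrics were actually run).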
def create_leaderboard_plot(self):
df = self.get_leaderboard_df()
if df.empty:
return None
# Melt the DataFrame to create a format suitable for plotting
metrics_cols = [col for col in df.columns if col not in ["Model", "Timestamp"]]
df_melted = df.melt(
id_vars=["Model"],
value_vars=metrics_cols,
var_name="Metric",
value_name="Score"
)
# Create a grouped bar plot
fig = px.bar(
df_melted,
x="Model",
y="Score",
color="Metric",
title="Model Performance Across Tasks",
barmode="group"
)
fig.update_layout(
yaxis_title="Score (%)",
xaxis_title="Model",
legend_title="Metric"
)
return fig
def create_interface():
space = LeaderboardSpace()
with gr.Blocks() as demo:
        gr.Markdown("# 🏆 Model Evaluation Leaderboard")
with gr.Tab("Leaderboard"):
with gr.Row():
leaderboard_plot = gr.Plot()
with gr.Row():
leaderboard_table = gr.DataFrame()
with gr.Tab("Submit Evaluation"):
with gr.Row():
with gr.Column():
model_name = gr.Textbox(label="Model Name")
model_id = gr.Textbox(label="Hugging Face Model ID")
                    # Task selection: the task-list API differs across lm-eval
                    # releases (tasks.ALL_TASKS on 0.3.x, TaskManager on 0.4+).
                    try:
                        available_tasks = sorted(tasks.ALL_TASKS)
                    except AttributeError:
                        available_tasks = sorted(tasks.TaskManager().all_tasks)
                    task_selection = gr.CheckboxGroup(
                        choices=available_tasks,
                        label="Select Tasks"
                    )
submit_btn = gr.Button("Submit Evaluation")
with gr.Row():
evaluation_status = gr.Textbox(
label="Evaluation Status",
interactive=False
)
with gr.Tab("Custom Tasks"):
with gr.Row():
with gr.Column():
task_name = gr.Textbox(label="Task Name")
task_description = gr.Textbox(
label="Task Description",
lines=3
)
example_file = gr.File(
label="Upload Examples (JSON)",
file_types=[".json"]
)
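                    # Uploaded examples are stored verbatim under "data" and not
                    # validated; e.g. a list of {"input": ..., "target": ...}
                    # records is one workable shape.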
submit_task_btn = gr.Button("Submit Custom Task")
with gr.Row():
task_status = gr.Textbox(
label="Task Status",
interactive=False
)
# Define update functions
def update_leaderboard():
df = space.get_leaderboard_df()
plot = space.create_leaderboard_plot()
return df, plot
def submit_evaluation(model_name, model_id, selected_tasks):
try:
                # Run the harness. simple_evaluate expects a model *type* plus
                # model_args rather than a bare checkpoint id (on lm-eval 0.4:
                # model="hf", model_args="pretrained=<repo id>").
                results = evaluator.simple_evaluate(
                    model="hf",
                    model_args=f"pretrained={model_id}",
                    tasks=selected_tasks,
                    num_fewshot=0,
                    batch_size=1
                )
# Update leaderboard
if model_name not in space.leaderboard["models"]:
space.leaderboard["models"].append(model_name)
space.leaderboard["results"][model_name] = {
"timestamp": datetime.now().isoformat(),
"model_id": model_id,
"scores": results
}
space.save_leaderboard()
return "Evaluation completed successfully!", *update_leaderboard()
except Exception as e:
return f"Error during evaluation: {str(e)}", None, None
def submit_custom_task(task_name, description, file):
try:
                if file is None:
                    return "Please upload a JSON file of examples."
                # Load the uploaded examples (stored as-is; no schema check here)
                with open(file.name) as f:
                    task_data = json.load(f)
# Save task configuration
task_config = {
"name": task_name,
"description": description,
"data": task_data
}
task_file = os.path.join(space.results_dir, f"task_{task_name}.json")
with open(task_file, 'w') as f:
json.dump(task_config, f, indent=2)
# Upload to Space
space.api.upload_file(
path_or_fileobj=task_file,
path_in_repo=task_file,
repo_id=space.space_name,
repo_type="space"
)
return "Custom task added successfully!"
except Exception as e:
return f"Error adding custom task: {str(e)}"
# Connect components
submit_btn.click(
submit_evaluation,
inputs=[model_name, model_id, task_selection],
outputs=[evaluation_status, leaderboard_table, leaderboard_plot]
)
submit_task_btn.click(
submit_custom_task,
inputs=[task_name, task_description, example_file],
outputs=[task_status]
)
# Initial loading of leaderboard
demo.load(
update_leaderboard,
outputs=[leaderboard_table, leaderboard_plot]
)
return demo
# Launch the interface
if __name__ == "__main__":
demo = create_interface()
demo.launch()