import gradio as gr
import pandas as pd
import json
import os
from lm_eval import tasks, evaluator
from datetime import datetime
from huggingface_hub import HfApi
import plotly.express as px


class LeaderboardSpace:
    def __init__(self, space_name="ozayezerceli/PoCLeaderboard"):
        self.space_name = space_name
        self.results_dir = "benchmark_results"
        self.leaderboard_file = os.path.join(self.results_dir, "leaderboard.json")
        os.makedirs(self.results_dir, exist_ok=True)
        self.load_leaderboard()
        self.api = HfApi()

    def load_leaderboard(self):
        """Load the leaderboard from disk, or start with an empty one."""
        if os.path.exists(self.leaderboard_file):
            with open(self.leaderboard_file, "r") as f:
                self.leaderboard = json.load(f)
        else:
            self.leaderboard = {"models": [], "results": {}}

    def save_leaderboard(self):
        """Persist the leaderboard locally and push it to the Space repo."""
        with open(self.leaderboard_file, "w") as f:
            json.dump(self.leaderboard, f, indent=2)

        # Push the updated leaderboard to the Space
        self.api.upload_file(
            path_or_fileobj=self.leaderboard_file,
            path_in_repo=self.leaderboard_file,
            repo_id=self.space_name,
            repo_type="space",
        )

    def get_leaderboard_df(self):
        """Flatten stored results into one row per model, one column per task metric."""
        if not self.leaderboard["models"]:
            return pd.DataFrame()

        data = []
        for model in self.leaderboard["models"]:
            result = self.leaderboard["results"][model]
            row = {"Model": model, "Timestamp": result["timestamp"]}
            for task, scores in result["scores"].items():
                for metric, value in scores.items():
                    # lm-eval result dicts also contain non-numeric entries
                    # (e.g. task aliases); only tabulate numeric metrics.
                    if isinstance(value, (int, float)):
                        row[f"{task}_{metric}"] = round(value * 100, 2)
            data.append(row)

        return pd.DataFrame(data)

    def create_leaderboard_plot(self):
        df = self.get_leaderboard_df()
        if df.empty:
            return None

        # Melt the DataFrame into long format, suitable for plotting
        metrics_cols = [col for col in df.columns if col not in ["Model", "Timestamp"]]
        df_melted = df.melt(
            id_vars=["Model"],
            value_vars=metrics_cols,
            var_name="Metric",
            value_name="Score",
        )

        # Create a grouped bar plot
        fig = px.bar(
            df_melted,
            x="Model",
            y="Score",
            color="Metric",
            title="Model Performance Across Tasks",
            barmode="group",
        )
        fig.update_layout(
            yaxis_title="Score (%)",
            xaxis_title="Model",
            legend_title="Metric",
        )
        return fig


def create_interface():
    space = LeaderboardSpace()

    with gr.Blocks() as demo:
        gr.Markdown("# 🏆 Model Evaluation Leaderboard")

        with gr.Tab("Leaderboard"):
            with gr.Row():
                leaderboard_plot = gr.Plot()
            with gr.Row():
                leaderboard_table = gr.DataFrame()

        with gr.Tab("Submit Evaluation"):
            with gr.Row():
                with gr.Column():
                    model_name = gr.Textbox(label="Model Name")
                    model_id = gr.Textbox(label="Hugging Face Model ID")

                    # Task selection: lm-eval v0.4+ exposes registered task
                    # names through TaskManager
                    available_tasks = tasks.TaskManager().all_tasks
                    task_selection = gr.CheckboxGroup(
                        choices=available_tasks,
                        label="Select Tasks",
                    )
                    submit_btn = gr.Button("Submit Evaluation")
            with gr.Row():
                evaluation_status = gr.Textbox(
                    label="Evaluation Status",
                    interactive=False,
                )

        with gr.Tab("Custom Tasks"):
            with gr.Row():
                with gr.Column():
                    task_name = gr.Textbox(label="Task Name")
                    task_description = gr.Textbox(
                        label="Task Description",
                        lines=3,
                    )
                    example_file = gr.File(
                        label="Upload Examples (JSON)",
                        file_types=[".json"],
                    )
                    submit_task_btn = gr.Button("Submit Custom Task")
            with gr.Row():
                task_status = gr.Textbox(
                    label="Task Status",
                    interactive=False,
                )

        # Define update functions
        def update_leaderboard():
            df = space.get_leaderboard_df()
            plot = space.create_leaderboard_plot()
            return df, plot

        def submit_evaluation(model_name, model_id, selected_tasks):
            try:
                # Run the evaluation; "hf" selects lm-eval's Hugging Face
                # backend, and the checkpoint is passed through model_args
                results = evaluator.simple_evaluate(
                    model="hf",
                    model_args=f"pretrained={model_id}",
                    tasks=selected_tasks,
                    num_fewshot=0,
                    batch_size=1,
                )

                # Update the leaderboard with the per-task metric dicts
                if model_name not in space.leaderboard["models"]:
                    space.leaderboard["models"].append(model_name)
                space.leaderboard["results"][model_name] = {
                    "timestamp": datetime.now().isoformat(),
                    "model_id": model_id,
                    "scores": results["results"],
                }
                space.save_leaderboard()
                return "Evaluation completed successfully!", *update_leaderboard()
            except Exception as e:
                return f"Error during evaluation: {str(e)}", None, None

        def submit_custom_task(task_name, description, file):
            try:
                if file is None:
                    return "Please upload a JSON file of examples."

                # Load and validate the task data
                with open(file.name, "r") as f:
                    task_data = json.load(f)

                # Save the task configuration
                task_config = {
                    "name": task_name,
                    "description": description,
                    "data": task_data,
                }
                task_file = os.path.join(space.results_dir, f"task_{task_name}.json")
                with open(task_file, "w") as f:
                    json.dump(task_config, f, indent=2)

                # Upload to the Space
                space.api.upload_file(
                    path_or_fileobj=task_file,
                    path_in_repo=task_file,
                    repo_id=space.space_name,
                    repo_type="space",
                )
                return "Custom task added successfully!"
            except Exception as e:
                return f"Error adding custom task: {str(e)}"

        # Connect components
        submit_btn.click(
            submit_evaluation,
            inputs=[model_name, model_id, task_selection],
            outputs=[evaluation_status, leaderboard_table, leaderboard_plot],
        )
        submit_task_btn.click(
            submit_custom_task,
            inputs=[task_name, task_description, example_file],
            outputs=[task_status],
        )

        # Initial loading of the leaderboard
        demo.load(
            update_leaderboard,
            outputs=[leaderboard_table, leaderboard_plot],
        )

    return demo


# Launch the interface
if __name__ == "__main__":
    demo = create_interface()
    demo.launch()