Spaces:
Runtime error
Runtime error
ozayezerceli
committed on
Update app.py
Browse files
app.py
CHANGED
@@ -0,0 +1,226 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import pandas as pd
|
3 |
+
import yaml
|
4 |
+
import json
|
5 |
+
import os
|
6 |
+
from lm_eval import tasks, evaluator
|
7 |
+
from datetime import datetime
|
8 |
+
from huggingface_hub import HfApi
|
9 |
+
import plotly.express as px
|
10 |
+
|
11 |
+
class LeaderboardSpace:
    """Persists model benchmark results for a Hugging Face Space and renders
    them as a pandas DataFrame and a Plotly bar chart."""

    def __init__(self, space_name="ozayezerceli/PoCLeaderboard"):
        """Create the local results directory, load any existing leaderboard
        and open a Hub API client.

        Args:
            space_name: repo id of the Space the leaderboard file is synced to.
        """
        self.space_name = space_name
        self.results_dir = "benchmark_results"
        self.leaderboard_file = os.path.join(self.results_dir, "leaderboard.json")
        os.makedirs(self.results_dir, exist_ok=True)
        self.load_leaderboard()
        self.api = HfApi()

    def load_leaderboard(self):
        """Load leaderboard state from disk into ``self.leaderboard``.

        Starts from an empty leaderboard when the file is missing, unreadable
        or corrupt — previously a truncated JSON file crashed the whole Space
        on startup.
        """
        self.leaderboard = {"models": [], "results": {}}
        if os.path.exists(self.leaderboard_file):
            try:
                with open(self.leaderboard_file, 'r') as f:
                    self.leaderboard = json.load(f)
            except (json.JSONDecodeError, OSError):
                # Corrupt or unreadable file: keep the empty default rather
                # than taking the app down.
                self.leaderboard = {"models": [], "results": {}}

    def save_leaderboard(self):
        """Write the leaderboard to disk and mirror it into the Space repo so
        the data survives Space restarts."""
        with open(self.leaderboard_file, 'w') as f:
            json.dump(self.leaderboard, f, indent=2)

        # Push updated leaderboard to the Space.
        self.api.upload_file(
            path_or_fileobj=self.leaderboard_file,
            path_in_repo=self.leaderboard_file,
            repo_id=self.space_name,
            repo_type="space",
        )

    def get_leaderboard_df(self):
        """Flatten stored results into one row per model.

        Returns:
            DataFrame with "Model", "Timestamp" and one "<task>_<metric>"
            column per recorded numeric score, scaled to percent and rounded
            to 2 decimals. Empty DataFrame when no models are recorded.
        """
        if not self.leaderboard["models"]:
            return pd.DataFrame()

        data = []
        for model in self.leaderboard["models"]:
            result = self.leaderboard["results"][model]
            # .get() guards against older/partial records missing keys.
            row = {"Model": model, "Timestamp": result.get("timestamp", "")}

            for task, scores in result.get("scores", {}).items():
                # Skip malformed entries (e.g. metadata keys stored alongside
                # real per-task dicts) so one bad record cannot break the table.
                if not isinstance(scores, dict):
                    continue
                for metric, value in scores.items():
                    if isinstance(value, (int, float)):
                        row[f"{task}_{metric}"] = round(value * 100, 2)

            data.append(row)

        return pd.DataFrame(data)

    def create_leaderboard_plot(self):
        """Build a grouped bar chart of every metric per model.

        Returns:
            A Plotly figure, or None when there is nothing to plot.
        """
        df = self.get_leaderboard_df()
        if df.empty:
            return None

        # Melt to long format: one (Model, Metric, Score) row per bar.
        metrics_cols = [col for col in df.columns if col not in ["Model", "Timestamp"]]
        df_melted = df.melt(
            id_vars=["Model"],
            value_vars=metrics_cols,
            var_name="Metric",
            value_name="Score",
        )

        # Grouped bars: models on x, one colored bar per metric.
        fig = px.bar(
            df_melted,
            x="Model",
            y="Score",
            color="Metric",
            title="Model Performance Across Tasks",
            barmode="group",
        )

        fig.update_layout(
            yaxis_title="Score (%)",
            xaxis_title="Model",
            legend_title="Metric",
        )

        return fig
|
87 |
+
|
88 |
+
def create_interface():
    """Build the Gradio Blocks UI for the leaderboard Space.

    Three tabs: the leaderboard (table + plot), an evaluation-submission form
    and a custom-task upload form.

    Returns:
        The assembled ``gr.Blocks`` demo (not yet launched).
    """
    space = LeaderboardSpace()

    with gr.Blocks() as demo:
        gr.Markdown("# 🏆 Model Evaluation Leaderboard")

        with gr.Tab("Leaderboard"):
            with gr.Row():
                leaderboard_plot = gr.Plot()
            with gr.Row():
                leaderboard_table = gr.DataFrame()

        with gr.Tab("Submit Evaluation"):
            with gr.Row():
                with gr.Column():
                    model_name = gr.Textbox(label="Model Name")
                    model_id = gr.Textbox(label="Hugging Face Model ID")

                    # Task selection.
                    # NOTE(review): LIST_OF_PUBLIC_TASKS exists only in some
                    # lm-eval versions — confirm against the pinned version.
                    available_tasks = tasks.LIST_OF_PUBLIC_TASKS
                    # BUG FIX: the component is gr.CheckboxGroup;
                    # gr.Checkboxgroup raised AttributeError at startup.
                    task_selection = gr.CheckboxGroup(
                        choices=available_tasks,
                        label="Select Tasks"
                    )

                    submit_btn = gr.Button("Submit Evaluation")

            with gr.Row():
                evaluation_status = gr.Textbox(
                    label="Evaluation Status",
                    interactive=False
                )

        with gr.Tab("Custom Tasks"):
            with gr.Row():
                with gr.Column():
                    task_name = gr.Textbox(label="Task Name")
                    task_description = gr.Textbox(
                        label="Task Description",
                        lines=3
                    )
                    example_file = gr.File(
                        label="Upload Examples (JSON)",
                        file_types=[".json"]
                    )
                    submit_task_btn = gr.Button("Submit Custom Task")

            with gr.Row():
                task_status = gr.Textbox(
                    label="Task Status",
                    interactive=False
                )

        # Define update functions.
        def update_leaderboard():
            """Return (table DataFrame, plot figure) for the leaderboard tab."""
            df = space.get_leaderboard_df()
            plot = space.create_leaderboard_plot()
            return df, plot

        def submit_evaluation(model_name, model_id, selected_tasks):
            """Run lm-eval on the given model/tasks and record the scores."""
            try:
                # Run the harness (0-shot, batch size 1).
                results = evaluator.simple_evaluate(
                    model=model_id,
                    tasks=selected_tasks,
                    num_fewshot=0,
                    batch_size=1
                )

                # Register the model if it is new.
                if model_name not in space.leaderboard["models"]:
                    space.leaderboard["models"].append(model_name)

                # simple_evaluate returns a dict whose per-task metric dicts
                # live under the "results" key; storing the raw top-level dict
                # broke get_leaderboard_df. Fall back to the raw dict for
                # lm-eval versions without that key.
                space.leaderboard["results"][model_name] = {
                    "timestamp": datetime.now().isoformat(),
                    "model_id": model_id,
                    "scores": results.get("results", results)
                }

                space.save_leaderboard()

                return "Evaluation completed successfully!", *update_leaderboard()
            except Exception as e:
                # Surface the failure in the status box; keep the current
                # table/plot untouched (None outputs).
                return f"Error during evaluation: {str(e)}", None, None

        def submit_custom_task(task_name, description, file):
            """Store an uploaded custom-task definition and push it to the Space."""
            try:
                # Load the uploaded examples; `with` closes the handle
                # (the original open() leaked it).
                with open(file.name) as f:
                    task_data = json.load(f)

                # Save task configuration.
                task_config = {
                    "name": task_name,
                    "description": description,
                    "data": task_data
                }

                task_file = os.path.join(space.results_dir, f"task_{task_name}.json")
                with open(task_file, 'w') as f:
                    json.dump(task_config, f, indent=2)

                # Upload to the Space so it survives restarts.
                space.api.upload_file(
                    path_or_fileobj=task_file,
                    path_in_repo=task_file,
                    repo_id=space.space_name,
                    repo_type="space"
                )

                return "Custom task added successfully!"
            except Exception as e:
                return f"Error adding custom task: {str(e)}"

        # Connect components.
        submit_btn.click(
            submit_evaluation,
            inputs=[model_name, model_id, task_selection],
            outputs=[evaluation_status, leaderboard_table, leaderboard_plot]
        )

        submit_task_btn.click(
            submit_custom_task,
            inputs=[task_name, task_description, example_file],
            outputs=[task_status]
        )

        # Initial loading of the leaderboard on page load.
        demo.load(
            update_leaderboard,
            outputs=[leaderboard_table, leaderboard_plot]
        )

    return demo
|
222 |
+
|
223 |
+
# Script entry point: build the Gradio UI and serve it.
if __name__ == "__main__":
    demo = create_interface()
    demo.launch()
|