ozayezerceli committed on
Commit
af35bc9
·
verified ·
1 Parent(s): a6e8493

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +226 -0
app.py CHANGED
@@ -0,0 +1,226 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pandas as pd
3
+ import yaml
4
+ import json
5
+ import os
6
+ from lm_eval import tasks, evaluator
7
+ from datetime import datetime
8
+ from huggingface_hub import HfApi
9
+ import plotly.express as px
10
+
11
+ class LeaderboardSpace:
12
+ def __init__(self, space_name="ozayezerceli/PoCLeaderboard"):
13
+ self.space_name = space_name
14
+ self.results_dir = "benchmark_results"
15
+ self.leaderboard_file = os.path.join(self.results_dir, "leaderboard.json")
16
+ os.makedirs(self.results_dir, exist_ok=True)
17
+ self.load_leaderboard()
18
+ self.api = HfApi()
19
+
20
+ def load_leaderboard(self):
21
+ if os.path.exists(self.leaderboard_file):
22
+ with open(self.leaderboard_file, 'r') as f:
23
+ self.leaderboard = json.load(f)
24
+ else:
25
+ self.leaderboard = {"models": [], "results": {}}
26
+
27
+ def save_leaderboard(self):
28
+ with open(self.leaderboard_file, 'w') as f:
29
+ json.dump(self.leaderboard, f, indent=2)
30
+
31
+ # Push updated leaderboard to Space
32
+ self.api.upload_file(
33
+ path_or_fileobj=self.leaderboard_file,
34
+ path_in_repo=self.leaderboard_file,
35
+ repo_id=self.space_name,
36
+ repo_type="space"
37
+ )
38
+
39
+ def get_leaderboard_df(self):
40
+ if not self.leaderboard["models"]:
41
+ return pd.DataFrame()
42
+
43
+ data = []
44
+ for model in self.leaderboard["models"]:
45
+ result = self.leaderboard["results"][model]
46
+ row = {"Model": model, "Timestamp": result["timestamp"]}
47
+
48
+ for task, scores in result["scores"].items():
49
+ for metric, value in scores.items():
50
+ row[f"{task}_{metric}"] = round(value * 100, 2)
51
+
52
+ data.append(row)
53
+
54
+ return pd.DataFrame(data)
55
+
56
+ def create_leaderboard_plot(self):
57
+ df = self.get_leaderboard_df()
58
+ if df.empty:
59
+ return None
60
+
61
+ # Melt the DataFrame to create a format suitable for plotting
62
+ metrics_cols = [col for col in df.columns if col not in ["Model", "Timestamp"]]
63
+ df_melted = df.melt(
64
+ id_vars=["Model"],
65
+ value_vars=metrics_cols,
66
+ var_name="Metric",
67
+ value_name="Score"
68
+ )
69
+
70
+ # Create a grouped bar plot
71
+ fig = px.bar(
72
+ df_melted,
73
+ x="Model",
74
+ y="Score",
75
+ color="Metric",
76
+ title="Model Performance Across Tasks",
77
+ barmode="group"
78
+ )
79
+
80
+ fig.update_layout(
81
+ yaxis_title="Score (%)",
82
+ xaxis_title="Model",
83
+ legend_title="Metric"
84
+ )
85
+
86
+ return fig
87
+
88
+ def create_interface():
89
+ space = LeaderboardSpace()
90
+
91
+ with gr.Blocks() as demo:
92
+ gr.Markdown("# 🏆 Model Evaluation Leaderboard")
93
+
94
+ with gr.Tab("Leaderboard"):
95
+ with gr.Row():
96
+ leaderboard_plot = gr.Plot()
97
+
98
+ with gr.Row():
99
+ leaderboard_table = gr.DataFrame()
100
+
101
+ with gr.Tab("Submit Evaluation"):
102
+ with gr.Row():
103
+ with gr.Column():
104
+ model_name = gr.Textbox(label="Model Name")
105
+ model_id = gr.Textbox(label="Hugging Face Model ID")
106
+
107
+ # Task selection
108
+ available_tasks = tasks.LIST_OF_PUBLIC_TASKS
109
+ task_selection = gr.Checkboxgroup(
110
+ choices=available_tasks,
111
+ label="Select Tasks"
112
+ )
113
+
114
+ submit_btn = gr.Button("Submit Evaluation")
115
+
116
+ with gr.Row():
117
+ evaluation_status = gr.Textbox(
118
+ label="Evaluation Status",
119
+ interactive=False
120
+ )
121
+
122
+ with gr.Tab("Custom Tasks"):
123
+ with gr.Row():
124
+ with gr.Column():
125
+ task_name = gr.Textbox(label="Task Name")
126
+ task_description = gr.Textbox(
127
+ label="Task Description",
128
+ lines=3
129
+ )
130
+ example_file = gr.File(
131
+ label="Upload Examples (JSON)",
132
+ file_types=[".json"]
133
+ )
134
+ submit_task_btn = gr.Button("Submit Custom Task")
135
+
136
+ with gr.Row():
137
+ task_status = gr.Textbox(
138
+ label="Task Status",
139
+ interactive=False
140
+ )
141
+
142
+ # Define update functions
143
+ def update_leaderboard():
144
+ df = space.get_leaderboard_df()
145
+ plot = space.create_leaderboard_plot()
146
+ return df, plot
147
+
148
+ def submit_evaluation(model_name, model_id, selected_tasks):
149
+ try:
150
+ # Initialize evaluation
151
+ results = evaluator.simple_evaluate(
152
+ model=model_id,
153
+ tasks=selected_tasks,
154
+ num_fewshot=0,
155
+ batch_size=1
156
+ )
157
+
158
+ # Update leaderboard
159
+ if model_name not in space.leaderboard["models"]:
160
+ space.leaderboard["models"].append(model_name)
161
+
162
+ space.leaderboard["results"][model_name] = {
163
+ "timestamp": datetime.now().isoformat(),
164
+ "model_id": model_id,
165
+ "scores": results
166
+ }
167
+
168
+ space.save_leaderboard()
169
+
170
+ return "Evaluation completed successfully!", *update_leaderboard()
171
+ except Exception as e:
172
+ return f"Error during evaluation: {str(e)}", None, None
173
+
174
+ def submit_custom_task(task_name, description, file):
175
+ try:
176
+ # Load and validate task data
177
+ task_data = json.load(open(file.name))
178
+
179
+ # Save task configuration
180
+ task_config = {
181
+ "name": task_name,
182
+ "description": description,
183
+ "data": task_data
184
+ }
185
+
186
+ task_file = os.path.join(space.results_dir, f"task_{task_name}.json")
187
+ with open(task_file, 'w') as f:
188
+ json.dump(task_config, f, indent=2)
189
+
190
+ # Upload to Space
191
+ space.api.upload_file(
192
+ path_or_fileobj=task_file,
193
+ path_in_repo=task_file,
194
+ repo_id=space.space_name,
195
+ repo_type="space"
196
+ )
197
+
198
+ return "Custom task added successfully!"
199
+ except Exception as e:
200
+ return f"Error adding custom task: {str(e)}"
201
+
202
+ # Connect components
203
+ submit_btn.click(
204
+ submit_evaluation,
205
+ inputs=[model_name, model_id, task_selection],
206
+ outputs=[evaluation_status, leaderboard_table, leaderboard_plot]
207
+ )
208
+
209
+ submit_task_btn.click(
210
+ submit_custom_task,
211
+ inputs=[task_name, task_description, example_file],
212
+ outputs=[task_status]
213
+ )
214
+
215
+ # Initial loading of leaderboard
216
+ demo.load(
217
+ update_leaderboard,
218
+ outputs=[leaderboard_table, leaderboard_plot]
219
+ )
220
+
221
+ return demo
222
+
223
# Entry point: build the Gradio app and serve it.
if __name__ == "__main__":
    create_interface().launch()