Terry Zhuo committed on
Commit
7a7f67a
·
1 Parent(s): f614612

big update

Browse files
app.py CHANGED
@@ -1,296 +1,521 @@
1
- # some code blocks are taken from https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/tree/main
2
- import json
3
  import os
4
- from datetime import datetime, timezone
5
-
 
 
6
  import gradio as gr
7
- import pandas as pd
8
- import requests
9
- from huggingface_hub import HfApi
10
-
11
- from src.css_html import custom_css
12
- from src.text_content import ABOUT_TEXT, SUBMISSION_TEXT_3, CITATION_BUTTON_TEXT, CITATION_BUTTON_LABEL
13
- from src.utils import (
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  AutoEvalColumn,
15
  fields,
16
- is_model_on_hub,
17
- make_clickable_names,
18
- plot_elo_mle,
19
- plot_solve_rate,
20
- styled_error,
21
- styled_message,
22
  )
23
- from datasets import load_dataset
24
- TOKEN = os.environ.get("TOKEN", None)
25
- api = HfApi(TOKEN)
26
- df = load_dataset("bigcode/bigcodebench-results", split="train").to_pandas().sort_values(["complete", "instruct"], ascending=False)
27
- task_elo_mle_df = load_dataset("bigcode/bigcodebench-elo", split="task_no_tie").to_pandas()
28
- bench_elo_mle_df = load_dataset("bigcode/bigcodebench-elo", split="benchmark_tie").to_pandas()
29
- complete_solve_rate = load_dataset("bigcode/bigcodebench-solve-rate", split="complete").to_pandas()
30
- instruct_solve_rate = load_dataset("bigcode/bigcodebench-solve-rate", split="instruct").to_pandas()
31
-
32
- QUEUE_REPO = "bigcode/bigcodebench-requests"
33
- EVAL_REQUESTS_PATH = "eval-queue"
34
- COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
35
- TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
36
- COLS_LITE = [
37
- c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden
38
- ]
39
- TYPES_LITE = [
40
- c.type for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden
41
- ]
42
-
43
-
44
- def add_new_eval(
45
- model: str,
46
- revision: str,
47
- model_type: str,
48
- ):
49
- current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
50
-
51
- if model_type is None or model_type == "":
52
- return styled_error("Please select a model type.")
53
-
54
- # check the model actually exists before adding the eval
55
- if revision == "":
56
- revision = "main"
57
-
58
- model_on_hub, error = is_model_on_hub(model, revision)
59
- if not model_on_hub:
60
- return styled_error(f'Model "{model}" {error}')
61
-
62
- print("adding new eval")
63
-
64
- eval_entry = {
65
- "model": model,
66
- "revision": revision,
67
- "status": "PENDING",
68
- "submitted_time": current_time,
69
- "model_type": model_type.split(" ")[1],
70
- }
71
-
72
- user_name = ""
73
- model_path = model
74
- if "/" in model:
75
- user_name = model.split("/")[0]
76
- model_path = model.split("/")[1]
77
-
78
- OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
79
- os.makedirs(OUT_DIR, exist_ok=True)
80
- out_path = f"{OUT_DIR}/{model_path}_eval_request.json"
81
- print(f"Saving eval request to {out_path}")
82
-
83
- with open(out_path, "w") as f:
84
- f.write(json.dumps(eval_entry))
85
-
86
- api.upload_file(
87
- path_or_fileobj=out_path,
88
- path_in_repo=out_path.split("eval-queue/")[1],
89
- repo_id=QUEUE_REPO,
90
- repo_type="dataset",
91
- commit_message=f"Add {model} to eval queue",
92
- )
93
-
94
- # remove the local file
95
- os.remove(out_path)
96
-
97
- return styled_message("Your request has been submitted to the evaluation queue!\n")
98
-
99
-
100
- def select_columns(df, columns):
101
- always_here_cols = [
102
- AutoEvalColumn.model_type_symbol.name,
103
- AutoEvalColumn.model.name,
104
- ]
105
- # We use COLS to maintain sorting
106
- filtered_df = df[
107
- always_here_cols + [c for c in COLS if c in df.columns and c in columns]
108
- ]
109
- return filtered_df
110
-
111
-
112
- def filter_types(df, leaderboard_table, query):
113
- if query == "all":
114
- return df[leaderboard_table.columns]
115
- else:
116
- query = query[0]
117
- filtered_df = df[df["type"].str.contains(query, na=False)]
118
- return filtered_df[leaderboard_table.columns]
119
-
120
-
121
- def filter_direct_complete(df, leaderboard_table, query):
122
- if query == "all":
123
- return df[leaderboard_table.columns]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
124
 
125
- if query == "chat template":
126
- return df[~df["direct_complete"]][leaderboard_table.columns]
127
  else:
128
- return df[df["direct_complete"]][leaderboard_table.columns]
129
-
130
-
131
- def search_table(df, leaderboard_table, query):
132
- filtered_df = df[(df["model"].str.contains("|".join(q.strip() for q in query.split("|")), case=False))]
133
- return filtered_df[leaderboard_table.columns]
134
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
135
 
136
- df = make_clickable_names(df)
137
 
138
- demo = gr.Blocks(css=custom_css)
139
- with demo:
140
- with gr.Row():
141
- gr.Markdown(
142
- """<div style="text-align: center;"><h1> 🌸<span style='color: #A74E95;'>Big</span><span style='color: #C867B5;'>Code</span><span style='color: #DD71C8;'>Bench</span> Leaderboard🌸</h1></div>\
143
- <br>\
144
- <p>Inspired from the <a href="https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard">🤗 Open LLM Leaderboard</a> and <a href="https://huggingface.co/spaces/bigcode/bigcode-models-leaderboard">⭐ Big Code Models Leaderboard</a>, we compare performance of LLMs on <a href="https://huggingface.co/datasets/bigcode/bigcodebench">BigCodeBench</a> benchmark.</p>
145
- <p>To get started, please check out <a href="https://github.com/bigcode-project/bigcodebench">our GitHub repository</a>.</p>
146
- """,
147
- elem_classes="markdown-text",
148
- )
149
 
 
 
 
 
 
 
150
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
151
- with gr.Column():
152
- with gr.Tabs(elem_classes="A100-tabs") as A100_tabs:
153
- with gr.TabItem("🔍 Evaluation Table", id=0):
154
- with gr.Column():
155
- with gr.Accordion("➡️ See All Columns", open=False):
156
- shown_columns = gr.CheckboxGroup(
157
- choices=[
158
- c
159
- for c in COLS
160
- if c
161
- not in [
162
- AutoEvalColumn.dummy.name,
163
- AutoEvalColumn.model.name,
164
- AutoEvalColumn.model_type_symbol.name,
165
- ]
166
- ],
167
- value=[
168
- c
169
- for c in COLS_LITE
170
- if c
171
- not in [
172
- AutoEvalColumn.dummy.name,
173
- AutoEvalColumn.model.name,
174
- AutoEvalColumn.model_type_symbol.name,
175
- ]
176
- ],
177
- label="",
178
- elem_id="column-select",
179
- interactive=True,
180
- )
181
- # with gr.Column(min_width=780):
182
- with gr.Row():
183
- search_bar = gr.Textbox(
184
- placeholder="🔍 Separate multiple queries with '|'",
185
- show_label=False,
186
- elem_id="search-bar",
187
- )
188
- filter_types_columns = gr.Radio(
189
- label="⏚ Filter model types",
190
- choices=["all", "🟢 base", "🔶 instruction-tuned"], #, "EXT external-evaluation"],
191
- value="all",
192
- elem_id="filter-columns",
193
- )
194
- filter_prompting_columns = gr.Radio(
195
- label="⏚ Filter prompting",
196
- choices=["all", "chat template", "direct complete"],
197
- value="all",
198
- elem_id="filter-direct-complete",
199
- )
200
- leaderboard_df = gr.components.Dataframe(
201
- value=df[
202
- [
203
- AutoEvalColumn.model_type_symbol.name,
204
- AutoEvalColumn.model.name,
205
- ]
206
- + shown_columns.value
207
- ],
208
- headers=[
209
- AutoEvalColumn.model_type_symbol.name,
210
- AutoEvalColumn.model.name,
211
- ]
212
- + shown_columns.value,
213
- datatype=TYPES,
214
- elem_id="leaderboard-table",
215
- interactive=False,
216
- )
217
-
218
- hidden_leaderboard_df = gr.components.Dataframe(
219
- value=df,
220
- headers=COLS,
221
- datatype=["str" for _ in range(len(COLS))],
222
- visible=False,
223
- )
224
- search_bar.submit(
225
- search_table,
226
- [hidden_leaderboard_df, leaderboard_df, search_bar],
227
- leaderboard_df,
228
- )
229
- filter_types_columns.change(
230
- filter_types,
231
- [hidden_leaderboard_df, leaderboard_df, filter_types_columns],
232
- leaderboard_df,
233
- )
234
- filter_prompting_columns.change(
235
- filter_direct_complete,
236
- [hidden_leaderboard_df, leaderboard_df, filter_prompting_columns],
237
- leaderboard_df,
238
- )
239
- shown_columns.change(
240
- select_columns,
241
- [hidden_leaderboard_df, shown_columns],
242
- leaderboard_df,
243
- )
244
- gr.Markdown(
245
- """
246
- **Notes:**
247
- - _Complete_ vs _Instruct_:
248
- - <u>Complete</u>: Code Completion based on the (verbose) structured docstring. This variant tests if the models are good at coding.
249
- - <u>Instruct</u> (🔥Vibe Check🔥): Code Generation based on the (less verbose) NL-oriented instructions. This variant tests if the models are really capable enough to understand human intents to code.
250
- - `complete` and `instruct` represent the calibrated Pass@1 score on the BigCodeBench benchmark variants.
251
- - `elo_mle` represents the task-level Bootstrap of Maximum Likelihood Elo rating on `BigCodeBench-Complete`, which starts from 1000 and is boostrapped 500 times.
252
- - `size` is the amount of activated model weight during inference.
253
- - Model providers have the responsibility to avoid data contamination. Models trained on close data can be affected by contamination.
254
- - For more details check the 📝 About section.
255
- """,
256
- elem_classes="markdown-text",
257
- )
258
-
259
- with gr.TabItem("📊 Elo Rating", id=1):
260
- with gr.Column():
261
- with gr.Group():
262
- gr.Markdown("## (Task-level, No Tie, BigCodeBench-Complete) -- _Recommended_")
263
- task_elo_map = gr.Plot()
264
- demo.load(plot_elo_mle, [gr.Dataframe(task_elo_mle_df, visible=False)], task_elo_map)
265
- with gr.Group():
266
- gr.Markdown("## (Benchmark-level, BigCodeBench-Complete)")
267
- model_elo_map = gr.Plot()
268
- demo.load(plot_elo_mle, [gr.Dataframe(bench_elo_mle_df, visible=False)], model_elo_map)
269
-
270
- with gr.TabItem("🧩 Solve Rate", id=2):
271
- with gr.Column():
272
- complete_map = gr.Plot()
273
- demo.load(plot_solve_rate, [gr.Dataframe(complete_solve_rate, visible=False),
274
- gr.Textbox("Complete", visible=False),
275
- ], complete_map)
276
- instruct_map = gr.Plot()
277
- demo.load(plot_solve_rate, [gr.Dataframe(instruct_solve_rate, visible=False),
278
- gr.Textbox("Instruct", visible=False),
279
- ], instruct_map)
280
-
281
- with gr.TabItem("📝 About", id=3):
282
- gr.Markdown(ABOUT_TEXT, elem_classes="markdown-text")
283
- with gr.TabItem("Submit/Request Results 🚀", id=4):
284
- gr.Markdown(SUBMISSION_TEXT_3)
285
 
286
- with gr.Row():
287
- with gr.Accordion("📙 Citation", open=False):
288
- citation_button = gr.Textbox(
289
- value=CITATION_BUTTON_TEXT,
290
- label=CITATION_BUTTON_LABEL,
291
- lines=20,
292
- elem_id="citation-button",
293
- show_copy_button=True,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
294
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
295
 
296
- demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
+ import logging
3
+ import time
4
+ import schedule
5
+ import datetime
6
  import gradio as gr
7
+ from threading import Thread
8
+ import datasets
9
+ from huggingface_hub import snapshot_download, WebhooksServer, WebhookPayload, RepoCard
10
+ from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
11
+ from apscheduler.schedulers.background import BackgroundScheduler
12
+
13
+ # Start ephemeral Spaces on PRs (see config in README.md)
14
+ from gradio_space_ci.webhook import IS_EPHEMERAL_SPACE, SPACE_ID, configure_space_ci
15
+
16
+ from src.display.about import (
17
+ CITATION_BUTTON_LABEL,
18
+ CITATION_BUTTON_TEXT,
19
+ # INTRODUCTION_TEXT,
20
+ TITLE,
21
+ ABOUT_TEXT,
22
+ SUBMISSION_TEXT_3,
23
+ )
24
+ from src.display.css_html_js import custom_css
25
+ from src.display.utils import (
26
+ COLS,
27
+ EVAL_COLS,
28
+ EVAL_TYPES,
29
  AutoEvalColumn,
30
  fields,
31
+ EvalQueueColumn
 
 
 
 
 
32
  )
33
+ from src.envs import (
34
+ API,
35
+ EVAL_REQUESTS_PATH,
36
+ RESULT_REPO,
37
+ HARD_RESULT_REPO,
38
+ ELO_REPO,
39
+ HARD_ELO_REPO,
40
+ SOLVE_REPO,
41
+ HARD_SOLVE_REPO,
42
+ HF_TOKEN,
43
+ QUEUE_REPO,
44
+ REPO_ID,
45
+ VOTES_REPO,
46
+ VOTES_PATH,
47
+ HF_HOME,
48
+ )
49
+ from src.populate import get_evaluation_queue_df, get_leaderboard_df
50
+ from src.tools.plots import plot_elo_mle, plot_solve_rate
51
+ # from src.voting.vote_system import VoteManager, run_scheduler
52
+
53
+ # Configure logging
54
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
55
+
56
+ # Start ephemeral Spaces on PRs (see config in README.md)
57
+ from gradio_space_ci.webhook import IS_EPHEMERAL_SPACE, SPACE_ID, configure_space_ci
58
+
59
+ # DO_FULL_INIT is currently hard-coded to True. The commented-out expression on the assignment below shows how to
60
+ # derive it from the "LEADERBOARD_FULL_INIT" environment variable instead (defaulting to True when the variable is not set).
61
+ DO_FULL_INIT = True # os.getenv("LEADERBOARD_FULL_INIT", "True") == "True"
62
+ NEW_DATA_ON_LEADERBOARD = True
63
+ LEADERBOARD_DF = None
64
+ HARD_LEADERBOARD_DF = None
65
+ ELO_TASK_DF = None
66
+ ELO_BENCH_DF = None
67
+ HARD_ELO_TASK_DF = None
68
+ HARD_ELO_BENCH_DF = None
69
+ COMPLETE_SOLVE_DF = None
70
+ INSTRUCT_SOLVE_DF = None
71
+ HARD_COMPLETE_SOLVE_DF = None
72
+ HARD_INSTRUCT_SOLVE_DF = None
73
+
74
def restart_space():
    """Request a restart of this Space through the Hub API client.

    Uses the module-level ``API`` client, targeting ``REPO_ID`` and
    authenticating with ``HF_TOKEN``.
    """
    API.restart_space(token=HF_TOKEN, repo_id=REPO_ID)
76
+
77
+
78
def time_diff_wrapper(func):
    """Decorate ``func`` so that every successful call logs its duration.

    Args:
        func: The callable to time.

    Returns:
        A wrapper that forwards all positional and keyword arguments to
        ``func``, logs the elapsed time at INFO level, and returns ``func``'s
        result unchanged. The wrapper keeps ``func``'s ``__name__``,
        ``__doc__`` and other metadata.
    """
    import functools  # local import: keeps this block self-contained

    @functools.wraps(func)  # preserve the wrapped function's identity for logs/introspection
    def wrapper(*args, **kwargs):
        # perf_counter is monotonic, so the measurement is immune to system clock adjustments.
        start_time = time.perf_counter()
        result = func(*args, **kwargs)
        diff = time.perf_counter() - start_time
        logging.info(f"Time taken for {func.__name__}: {diff} seconds")
        return result

    return wrapper
88
+
89
+
90
@time_diff_wrapper
def download_dataset(repo_id, local_dir, repo_type="dataset", max_attempts=3, backoff_factor=1.5):
    """Download a Hub repository snapshot, retrying with exponential backoff.

    Args:
        repo_id: Identifier of the repository to download.
        local_dir: Directory the snapshot is written to.
        repo_type: Repository type passed to ``snapshot_download``.
        max_attempts: Maximum number of download attempts.
        backoff_factor: Base of the exponential wait; the wait before retry
            ``k`` (0-based) is ``backoff_factor ** k`` seconds.

    Raises:
        Exception: If every attempt fails. The last underlying error is
            chained as ``__cause__``.
    """
    last_error = None
    for attempt in range(max_attempts):
        try:
            logging.info(f"Downloading {repo_id} to {local_dir}")
            snapshot_download(
                repo_id=repo_id,
                local_dir=local_dir,
                repo_type=repo_type,
                tqdm_class=None,
                etag_timeout=30,
                max_workers=8,
            )
        except Exception as e:
            last_error = e
            if attempt + 1 >= max_attempts:
                # No retry will follow, so do not sleep or claim that one is coming.
                logging.error(f"Error downloading {repo_id}: {e}, giving up")
                break
            wait_time = backoff_factor**attempt
            logging.error(f"Error downloading {repo_id}: {e}, retrying in {wait_time}s")
            time.sleep(wait_time)
        else:
            logging.info("Download successful")
            return
    raise Exception(f"Failed to download {repo_id} after {max_attempts} attempts") from last_error
113
+
114
def _load_cached_split(repo, split):
    """Load one split of a Hub dataset's "default" config, reusing the local cache if it exists."""
    return datasets.load_dataset(
        repo,
        "default",
        split=split,
        cache_dir=HF_HOME,
        download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS,  # Uses the cached dataset
        verification_mode="no_checks",
    )


def get_latest_data_leaderboard(
    leaderboard_initial_df=None,
    hard_leaderboard_initial_df=None,
    elo_task_df=None,
    elo_bench_df=None,
    hard_elo_task_df=None,
    hard_elo_bench_df=None,
    complete_solve_df=None,
    instruct_solve_df=None,
    hard_complete_solve_df=None,
    hard_instruct_solve_df=None,
):
    """Refresh the module-level leaderboard tables and return them.

    When ``NEW_DATA_ON_LEADERBOARD`` is set, every table is reloaded from the
    (cached) Hub datasets and the flag is cleared. Otherwise the values passed
    in by the caller are stored in the module-level globals as-is.

    Args:
        leaderboard_initial_df: Current full-set leaderboard table.
        hard_leaderboard_initial_df: Current hard-set leaderboard table.
        elo_task_df: Current full-set task-level Elo table.
        elo_bench_df: Current full-set benchmark-level Elo table.
        hard_elo_task_df: Current hard-set task-level Elo table.
        hard_elo_bench_df: Current hard-set benchmark-level Elo table.
        complete_solve_df: Current full-set "complete" solve-rate table.
        instruct_solve_df: Current full-set "instruct" solve-rate table.
        hard_complete_solve_df: Current hard-set "complete" solve-rate table.
        hard_instruct_solve_df: Current hard-set "instruct" solve-rate table.

    Returns:
        A 10-tuple ``(LEADERBOARD_DF, HARD_LEADERBOARD_DF, ELO_TASK_DF,
        ELO_BENCH_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, COMPLETE_SOLVE_DF,
        INSTRUCT_SOLVE_DF, HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF)``.
    """
    global NEW_DATA_ON_LEADERBOARD
    global LEADERBOARD_DF
    global HARD_LEADERBOARD_DF
    global ELO_TASK_DF
    global ELO_BENCH_DF
    global HARD_ELO_TASK_DF
    global HARD_ELO_BENCH_DF
    global COMPLETE_SOLVE_DF
    global INSTRUCT_SOLVE_DF
    global HARD_COMPLETE_SOLVE_DF
    global HARD_INSTRUCT_SOLVE_DF

    if NEW_DATA_ON_LEADERBOARD:
        print("Leaderboard updated at reload!")

        # Leaderboard tables go through get_leaderboard_df, which receives the raw dataset.
        LEADERBOARD_DF = get_leaderboard_df(
            leaderboard_dataset=_load_cached_split(RESULT_REPO, "train"),
            cols=COLS,
        )
        HARD_LEADERBOARD_DF = get_leaderboard_df(
            leaderboard_dataset=_load_cached_split(HARD_RESULT_REPO, "train"),
            cols=COLS,
        )

        # Elo and solve-rate tables are used directly as pandas DataFrames.
        ELO_TASK_DF = _load_cached_split(ELO_REPO, "task_no_tie").to_pandas()
        ELO_BENCH_DF = _load_cached_split(ELO_REPO, "benchmark_tie").to_pandas()

        HARD_ELO_TASK_DF = _load_cached_split(HARD_ELO_REPO, "task_no_tie").to_pandas()
        HARD_ELO_BENCH_DF = _load_cached_split(HARD_ELO_REPO, "benchmark_tie").to_pandas()

        COMPLETE_SOLVE_DF = _load_cached_split(SOLVE_REPO, "complete").to_pandas()
        INSTRUCT_SOLVE_DF = _load_cached_split(SOLVE_REPO, "instruct").to_pandas()

        HARD_COMPLETE_SOLVE_DF = _load_cached_split(HARD_SOLVE_REPO, "complete").to_pandas()
        HARD_INSTRUCT_SOLVE_DF = _load_cached_split(HARD_SOLVE_REPO, "instruct").to_pandas()

        # Only cleared once every table has loaded, so a failure above leaves the flag set.
        NEW_DATA_ON_LEADERBOARD = False

    else:
        # NOTE(review): a call with the default (None) arguments while the flag is
        # cleared overwrites the globals with None - confirm callers always pass values here.
        LEADERBOARD_DF = leaderboard_initial_df
        HARD_LEADERBOARD_DF = hard_leaderboard_initial_df
        ELO_TASK_DF = elo_task_df
        ELO_BENCH_DF = elo_bench_df
        HARD_ELO_TASK_DF = hard_elo_task_df
        HARD_ELO_BENCH_DF = hard_elo_bench_df
        COMPLETE_SOLVE_DF = complete_solve_df
        INSTRUCT_SOLVE_DF = instruct_solve_df
        HARD_COMPLETE_SOLVE_DF = hard_complete_solve_df
        HARD_INSTRUCT_SOLVE_DF = hard_instruct_solve_df

    return (
        LEADERBOARD_DF,
        HARD_LEADERBOARD_DF,
        ELO_TASK_DF,
        ELO_BENCH_DF,
        HARD_ELO_TASK_DF,
        HARD_ELO_BENCH_DF,
        COMPLETE_SOLVE_DF,
        INSTRUCT_SOLVE_DF,
        HARD_COMPLETE_SOLVE_DF,
        HARD_INSTRUCT_SOLVE_DF,
    )
257
+
258
+
259
def init_space():
    """Populate the module-level leaderboard tables and return them.

    Calls ``get_latest_data_leaderboard()`` with no arguments, stores the ten
    resulting tables in the corresponding module-level globals, and returns
    them as a tuple in the same order.
    """
    global LEADERBOARD_DF, HARD_LEADERBOARD_DF
    global ELO_TASK_DF, ELO_BENCH_DF
    global HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF
    global COMPLETE_SOLVE_DF, INSTRUCT_SOLVE_DF
    global HARD_COMPLETE_SOLVE_DF, HARD_INSTRUCT_SOLVE_DF

    latest_tables = get_latest_data_leaderboard()
    (
        LEADERBOARD_DF,
        HARD_LEADERBOARD_DF,
        ELO_TASK_DF,
        ELO_BENCH_DF,
        HARD_ELO_TASK_DF,
        HARD_ELO_BENCH_DF,
        COMPLETE_SOLVE_DF,
        INSTRUCT_SOLVE_DF,
        HARD_COMPLETE_SOLVE_DF,
        HARD_INSTRUCT_SOLVE_DF,
    ) = latest_tables

    # The evaluation-queue tables are not loaded here (that retrieval is currently disabled).

    return (
        LEADERBOARD_DF,
        HARD_LEADERBOARD_DF,
        ELO_TASK_DF,
        ELO_BENCH_DF,
        HARD_ELO_TASK_DF,
        HARD_ELO_BENCH_DF,
        COMPLETE_SOLVE_DF,
        INSTRUCT_SOLVE_DF,
        HARD_COMPLETE_SOLVE_DF,
        HARD_INSTRUCT_SOLVE_DF,
    )
280
+
281
+ # Initialize VoteManager
282
+ # vote_manager = VoteManager(VOTES_PATH, EVAL_REQUESTS_PATH, VOTES_REPO)
283
+
284
+
285
+ # Schedule the upload_votes method to run every 15 minutes
286
+ # schedule.every(15).minutes.do(vote_manager.upload_votes)
287
+
288
+ # Start the scheduler in a separate thread
289
+ # scheduler_thread = Thread(target=run_scheduler, args=(vote_manager,), daemon=True)
290
+ # scheduler_thread.start()
291
+
292
+ # Call init_space() (it takes no arguments) to populate the module-level DataFrames used throughout the application.
293
+ # NOTE: DO_FULL_INIT is defined above but is not passed to or consulted by init_space() in this file.
294
+ LEADERBOARD_DF, HARD_LEADERBOARD_DF, ELO_TASK_DF, \
295
+ ELO_BENCH_DF, HARD_ELO_TASK_DF, HARD_ELO_BENCH_DF, \
296
+ COMPLETE_SOLVE_DF, INSTRUCT_SOLVE_DF, HARD_COMPLETE_SOLVE_DF, \
297
+ HARD_INSTRUCT_SOLVE_DF = init_space()
298
+
299
+
300
+ # Data processing for plots now only on demand in the respective Gradio tab
301
+ # def load_and_create_plots():
302
+ # plot_df = create_plot_df(create_scores_df(LEADERBOARD_DF))
303
+ # return plot_df
304
+
305
+ # Function to check if a user is logged in
306
def check_login(profile: gr.OAuthProfile | None) -> bool:
    """Return True when an OAuth profile is present, False when it is None."""
    return profile is not None
310
+
311
def init_leaderboard(dataframe):
    """Build the ``Leaderboard`` component for a results table.

    Column types, default/locked/hidden columns and the search column are all
    derived from the fields of ``AutoEvalColumn``.

    Args:
        dataframe: Table to display; must be non-empty.

    Returns:
        A non-interactive ``Leaderboard`` component.

    Raises:
        ValueError: If ``dataframe`` is ``None`` or empty.
    """
    if dataframe is None or dataframe.empty:
        raise ValueError("Leaderboard DataFrame is empty or None.")

    columns = fields(AutoEvalColumn)

    column_picker = SelectColumns(
        default_selection=[col.name for col in columns if col.displayed_by_default],
        cant_deselect=[col.name for col in columns if col.never_hidden or col.dummy],
        label="Select Columns to Display:",
    )
    filters = [
        ColumnFilter(AutoEvalColumn.type.name, type="checkboxgroup", label="Model Types"),
        ColumnFilter(AutoEvalColumn.openness.name, type="checkboxgroup", label="Openness"),
        ColumnFilter(AutoEvalColumn.size_range.name, type="dropdown", label="Model Size"),
        ColumnFilter(AutoEvalColumn.moe.name, type="checkboxgroup", label="Model Architecture"),
    ]

    return Leaderboard(
        value=dataframe,
        datatype=[col.type for col in columns],
        select_columns=column_picker,
        search_columns=[AutoEvalColumn.model.name],
        hide_columns=[col.name for col in columns if col.hidden],
        filter_columns=filters,
        bool_checkboxgroup_label="Hide models",
        interactive=False,
    )
333
 
 
334
 
335
def init_others(dataframe):
    """Wrap a table in an invisible ``gr.Dataframe`` component.

    Args:
        dataframe: Table to wrap; must be non-empty.

    Returns:
        A ``gr.Dataframe`` created with ``visible=False``.

    Raises:
        ValueError: If ``dataframe`` is ``None`` or empty.
    """
    has_rows = dataframe is not None and not dataframe.empty
    if not has_rows:
        raise ValueError("Gradio DataFrame is empty or None.")
    hidden_table = gr.Dataframe(dataframe, visible=False)
    return hidden_table
 
 
 
 
 
 
 
339
 
340
+ main_block = gr.Blocks(css=custom_css)
341
+ with main_block as demo:
342
+ with gr.Row(elem_id="header-row"):
343
+ gr.HTML(TITLE)
344
+
345
+ # gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
346
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
347
+ with gr.Tab("💎 Hard Set") as hard_tabs:
348
+ with gr.TabItem("🏅 Benchmark", elem_id="llm-benchmark-tab-table", id="hard_bench"):
349
+ hard_leaderboard = init_leaderboard(HARD_LEADERBOARD_DF)
350
+ gr.Markdown(
351
+ """
352
+ **Notes:**
353
+ - _Hard_ vs _Full_:
354
+ - <u>Hard</u>: A subset of ~150 BigCodeBench tasks which is more user-facing and challenging.
355
+ - <u>Full</u>: The full set of 1140 BigCodeBench tasks.
356
+ - _Complete_ vs _Instruct_:
357
+ - <u>Complete</u>: Code Completion based on the (verbose) structured docstring. This split tests if the models are good at coding.
358
+ - <u>Instruct</u> (🔥Vibe Check🔥): Code Generation based on the (less verbose) NL-oriented instructions. This split tests if the models are really capable enough to understand human intents to code.
359
+ - `Complete` and `Instruct` represent the calibrated Pass@1 score on the BigCodeBench benchmark splits.
360
+ - `Average` is the average of `Complete` and `Instruct` when both are available.
361
+ - `Elo Rating` represents the task-level Bootstrap of Maximum Likelihood Elo rating on the BigCodeBench-Complete split. The rating starts from 1000 and is bootstrapped 500 times.
362
+ - `#Act Params (B)` is the number of activated model parameters during inference.
363
+ - Model providers have the responsibility to avoid data contamination. Models trained on close data can be affected by contamination.
364
+ - For more details check the 📝 About section.
365
+ """,
366
+ elem_classes="markdown-text",
367
+ )
368
+
369
+ with gr.TabItem("📊 Elo Rating", id="hard_elo"):
370
+ with gr.Column():
371
+ with gr.Group():
372
+ gr.Markdown("## (Task-level, No Tie, BigCodeBench-Complete) -- _Recommended_")
373
+ hard_task_elo_map = gr.Plot()
374
+ hard_elo_task_gr = init_others(HARD_ELO_TASK_DF)
375
+ demo.load(plot_elo_mle, [hard_elo_task_gr],
376
+ hard_task_elo_map)
377
+ with gr.Group():
378
+ gr.Markdown("## (Benchmark-level, BigCodeBench-Complete)")
379
+ hard_bench_elo_map = gr.Plot()
380
+ hard_elo_bench_gr = init_others(HARD_ELO_BENCH_DF)
381
+ demo.load(plot_elo_mle, [hard_elo_bench_gr],
382
+ hard_bench_elo_map)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
383
 
384
+ with gr.TabItem("🧩 Solve Rate", id="hard_solve"):
385
+ with gr.Column():
386
+ hard_complete_map = gr.Plot()
387
+ hard_complete_solve_gr = init_others(HARD_COMPLETE_SOLVE_DF)
388
+ demo.load(plot_solve_rate, [hard_complete_solve_gr,
389
+ gr.Textbox("Complete", visible=False),
390
+ gr.Number(10, visible=False),
391
+ gr.Number(16, visible=False),
392
+ ], hard_complete_map)
393
+ hard_instruct_map = gr.Plot()
394
+ hard_instruct_solve_gr = init_others(HARD_INSTRUCT_SOLVE_DF)
395
+ demo.load(plot_solve_rate, [hard_instruct_solve_gr,
396
+ gr.Textbox("Instruct", visible=False),
397
+ gr.Number(10, visible=False),
398
+ gr.Number(16, visible=False),
399
+ ], hard_instruct_map)
400
+ with gr.Tab("🎯 Full Set") as full_tabs:
401
+ with gr.TabItem("🏅 Benchmark", elem_id="llm-benchmark-tab-table", id="full_bench"):
402
+ leaderboard = init_leaderboard(LEADERBOARD_DF)
403
+ gr.Markdown(
404
+ """
405
+ **Notes:**
406
+ - _Complete_ vs _Instruct_:
407
+ - <u>Complete</u>: Code Completion based on the (verbose) structured docstring. This variant tests if the models are good at coding.
408
+ - <u>Instruct</u> (🔥Vibe Check🔥): Code Generation based on the (less verbose) NL-oriented instructions. This variant tests if the models are really capable enough to understand human intents to code.
409
+ - `complete` and `instruct` represent the calibrated Pass@1 score on the BigCodeBench benchmark variants.
410
+ - `elo_mle` represents the task-level Bootstrap of Maximum Likelihood Elo rating on the BigCodeBench-Complete split. The rating starts from 1000 and is bootstrapped 500 times.
411
+ - `size` is the amount of activated model weight during inference.
412
+ - Model providers have the responsibility to avoid data contamination. Models trained on close data can be affected by contamination.
413
+ - For more details check the 📝 About section.
414
+ """,
415
+ elem_classes="markdown-text",
416
  )
417
+
418
+ with gr.TabItem("📊 Elo Rating", id="full_elo"):
419
+ with gr.Column():
420
+ with gr.Group():
421
+
422
+ gr.Markdown("## (Task-level, No Tie, BigCodeBench-Complete) -- _Recommended_")
423
+ task_elo_map = gr.Plot()
424
+ elo_task_gr = init_others(ELO_TASK_DF)
425
+ demo.load(plot_elo_mle, [elo_task_gr], task_elo_map)
426
+ with gr.Group():
427
+ gr.Markdown("## (Benchmark-level, BigCodeBench-Complete)")
428
+ bench_elo_map = gr.Plot()
429
+ elo_bench_gr = init_others(ELO_BENCH_DF)
430
+ demo.load(plot_elo_mle, [elo_bench_gr], bench_elo_map)
431
+
432
+ with gr.TabItem("🧩 Solve Rate", id="full_solve"):
433
+ with gr.Column():
434
+ complete_map = gr.Plot()
435
+ complete_solve_gr = init_others(COMPLETE_SOLVE_DF)
436
+ demo.load(plot_solve_rate, [complete_solve_gr,
437
+ gr.Textbox("Complete", visible=False),
438
+ ], complete_map)
439
+ instruct_map = gr.Plot()
440
+ instruct_solve_gr = init_others(INSTRUCT_SOLVE_DF)
441
+ demo.load(plot_solve_rate, [instruct_solve_gr,
442
+ gr.Textbox("Instruct", visible=False),
443
+ ], instruct_map)
444
+
445
+ with gr.TabItem("📝 About", id=3):
446
+ gr.Markdown(ABOUT_TEXT, elem_classes="markdown-text")
447
+ with gr.TabItem("Request 🚀", id=4):
448
+ gr.Markdown(SUBMISSION_TEXT_3)
449
+
450
+ with gr.Row():
451
+ with gr.Accordion("📙 Citation", open=False):
452
+ citation_button = gr.Textbox(
453
+ value=CITATION_BUTTON_TEXT,
454
+ label=CITATION_BUTTON_LABEL,
455
+ lines=20,
456
+ elem_id="citation-button",
457
+ show_copy_button=True,
458
+ )
459
+
460
+ main_block.load(fn=get_latest_data_leaderboard, inputs=[leaderboard, hard_leaderboard, elo_task_gr, elo_bench_gr, hard_elo_task_gr, hard_elo_bench_gr, complete_solve_gr, instruct_solve_gr, hard_complete_solve_gr, hard_instruct_solve_gr], outputs=[leaderboard, hard_leaderboard, elo_task_gr, elo_bench_gr, hard_elo_task_gr, hard_elo_bench_gr, complete_solve_gr, instruct_solve_gr, hard_complete_solve_gr, hard_instruct_solve_gr])
461
+ # leaderboard.change(fn=get_latest_data_queue, inputs=None, outputs=[finished_eval_table, running_eval_table, pending_eval_table])
462
+ # pending_eval_table.change(fn=vote_manager.create_request_vote_df, inputs=[pending_eval_table], outputs=[pending_eval_table_votes])
463
+
464
+ main_block.queue(default_concurrency_limit=40)
465
+
466
+
467
def enable_space_ci_and_return_server(ui: gr.Blocks) -> WebhooksServer:
    """Return a webhooks server for ``ui``, enabling Space CI when applicable.

    Space CI is skipped when ``SPACE_ID`` is ``None`` or when
    ``IS_EPHEMERAL_SPACE`` is true; in both cases a plain ``WebhooksServer``
    wrapping ``ui`` is returned. Otherwise the ``space_ci`` section of the
    Space's README card is read and passed to ``configure_space_ci``.

    Args:
        ui: The Gradio Blocks application to serve.

    Returns:
        The server object that the caller registers webhooks on and launches.
    """
    # Taken from https://huggingface.co/spaces/Wauplin/gradio-space-ci/blob/075119aee75ab5e7150bf0814eec91c83482e790/src/gradio_space_ci/webhook.py#L61
    # Compared to the original, this one does not monkeypatch Gradio, which allows us to define more webhooks.
    # ht to Lucain!
    if SPACE_ID is None:
        print("Not in a Space: Space CI disabled.")
        # Use the `ui` argument rather than a module-level global so the function honours its parameter.
        return WebhooksServer(ui=ui)

    if IS_EPHEMERAL_SPACE:
        print("In an ephemeral Space: Space CI disabled.")
        return WebhooksServer(ui=ui)

    card = RepoCard.load(repo_id_or_path=SPACE_ID, repo_type="space")
    config = card.data.get("space_ci", {})
    print(f"Enabling Space CI with config from README: {config}")

    return configure_space_ci(
        blocks=ui,
        trusted_authors=config.get("trusted_authors"),
        private=config.get("private", "auto"),
        variables=config.get("variables", "auto"),
        secrets=config.get("secrets"),
        hardware=config.get("hardware"),
        storage=config.get("storage"),
    )
492
 
493
+ # Create webhooks server (with CI url if in Space and not ephemeral)
494
+ webhooks_server = enable_space_ci_and_return_server(ui=main_block)
495
+
496
+ # Add webhooks
497
@webhooks_server.add_webhook
def update_leaderboard(payload: WebhookPayload) -> None:
    """Redownloads the leaderboard datasets when a dataset-update webhook arrives.

    Events that are not a dataset "update" are ignored, as are events that
    arrive while ``NEW_DATA_ON_LEADERBOARD`` is already set. Otherwise the
    flag is set and every result/Elo/solve-rate dataset is force-redownloaded
    into the local cache.
    """
    global NEW_DATA_ON_LEADERBOARD

    is_dataset_update = payload.repo.type == "dataset" and payload.event.action == "update"
    if not is_dataset_update:
        return

    # A refresh is already pending; nothing more to do.
    if NEW_DATA_ON_LEADERBOARD:
        return
    NEW_DATA_ON_LEADERBOARD = True

    repos_to_refresh = [RESULT_REPO, HARD_RESULT_REPO, ELO_REPO, HARD_ELO_REPO, SOLVE_REPO, HARD_SOLVE_REPO]
    for repo in repos_to_refresh:
        datasets.load_dataset(
            repo,
            "default",
            cache_dir=HF_HOME,
            download_mode=datasets.DownloadMode.FORCE_REDOWNLOAD,
            verification_mode="no_checks",
        )
514
+
515
+
516
+
517
# Start the backup restart scheduler BEFORE launching the server:
# `webhooks_server.launch()` blocks the main thread by default, so anything
# placed after it would not run while the app is being served.
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", hours=3)  # restarted every 3h as backup in case automatic updates are not working
scheduler.start()

webhooks_server.launch()
requirements.txt CHANGED
@@ -1,19 +1,23 @@
1
- APScheduler
2
- black
3
- click
4
- datasets
5
- gradio
6
- gradio_client
7
  huggingface-hub>=0.18.0
8
- matplotlib
9
- numpy
10
- pandas
11
- python-dateutil
12
- requests
13
- tqdm
14
- transformers
15
- tokenizers>=0.15.0
16
- git+https://github.com/EleutherAI/lm-evaluation-harness.git@b281b0921b636bc36ad05c0b0b0763bd6dd43463#egg=lm-eval
17
- accelerate
18
  sentencepiece
19
- plotly
 
 
 
 
 
 
 
 
 
 
 
 
1
+ APScheduler==3.10.1
2
+ black==23.11.0
3
+ click==8.1.3
4
+ datasets==2.14.5
 
 
5
  huggingface-hub>=0.18.0
6
+ matplotlib==3.8.4
7
+ numpy==1.26.0
8
+ pandas==2.2.2
9
+ plotly==5.14.1
10
+ python-dateutil==2.8.2
 
 
 
 
 
11
  sentencepiece
12
+ tqdm==4.65.0
13
+ transformers==4.41.1
14
+ tokenizers>=0.15.0
15
+ gradio-space-ci @ git+https://huggingface.co/spaces/Wauplin/gradio-space-ci@0.2.3 # CI !!!
16
+ isort
17
+ ruff
18
+ gradio==4.31.0
19
+ gradio[oauth]
20
+ gradio_leaderboard==0.0.11
21
+ requests==2.31.0
22
+ requests-oauthlib== 1.3.1
23
+ schedule == 1.2.2
src/{text_content.py → display/about.py} RENAMED
@@ -1,3 +1,11 @@
 
 
 
 
 
 
 
 
1
  ABOUT_TEXT = """# Context
2
  We believe that there are three main expectations of a good execution-based programming benchmark:
3
  1. The benchmark should be easy to use and efficient in evaluating the fundamental capabilities of LLMs. Repo-level and agent-centric benchmarks (e.g., SWE-bench) are not suitable for this purpose.
@@ -135,5 +143,6 @@ CITATION_BUTTON_TEXT = r"""
135
  """
136
 
137
  SUBMISSION_TEXT_3="""
138
- We welcome the community to request for new models to be added to the leaderboard. Please [submit an issue here](https://github.com/bigcode-project/bigcodebench/issues/new/choose) to add the model to the leaderboard 🤗
 
139
  """
 
1
+ TITLE = """<div style="text-align: center;"><h1> 🌸<span style='color: #C867B5;'>BigCodeBench</span> Leaderboard🌸</h1></div>\
2
+ <br>\
3
+ <p>Inspired from the <a href="https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard">🤗 Open LLM Leaderboard</a> and <a href="https://huggingface.co/spaces/bigcode/bigcode-models-leaderboard">⭐ Big Code Models Leaderboard</a>, we compare performance of LLMs on <a href="https://huggingface.co/datasets/bigcode/bigcodebench">BigCodeBench</a> benchmark.</p>
4
+ <p>To get started, please check out <a href="https://github.com/bigcode-project/bigcodebench">our GitHub repository</a>.
5
+ <br>\
6
+ For more details, please check our <a href="https://huggingface.co/blog/leaderboard-bigcodebench-hard">blog on the Hard Set</a>, <a href="https://huggingface.co/blog/leaderboard-bigcodebench">blog on the Full Set</a> and <a href="https://arxiv.org/abs/2406.15877">paper</a>.</p>
7
+ """
8
+
9
  ABOUT_TEXT = """# Context
10
  We believe that there are three main expectations of a good execution-based programming benchmark:
11
  1. The benchmark should be easy to use and efficient in evaluating the fundamental capabilities of LLMs. Repo-level and agent-centric benchmarks (e.g., SWE-bench) are not suitable for this purpose.
 
143
  """
144
 
145
  SUBMISSION_TEXT_3="""
146
+ ## We welcome the community to request for new models to be added to the leaderboard.
147
+ ## Please [file an issue](https://github.com/bigcode-project/bigcodebench/issues/new/choose) to add the model to the leaderboard or [start a discussion](https://huggingface.co/spaces/bigcode/bigcodebench-leaderboard/discussions/new) in the community🤗
148
  """
src/{css_html.py → display/css_html_js.py} RENAMED
@@ -1,13 +1,18 @@
1
- # source: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/blob/main/src/assets/css_html_js.py
2
  custom_css = """
3
- #changelog-text {
4
- font-size: 16px !important;
 
 
 
 
5
  }
6
 
7
- #changelog-text h2 {
8
- font-size: 18px !important;
 
9
  }
10
 
 
11
  .markdown-text {
12
  font-size: 16px !important;
13
  }
@@ -29,51 +34,82 @@ custom_css = """
29
  transform: scale(1.3);
30
  }
31
 
32
- #leaderboard-table {
33
- margin-top: 15px
34
- }
35
-
36
- #leaderboard-table-lite {
37
- margin-top: 15px
38
- }
39
-
40
  #search-bar-table-box > div:first-child {
41
  background: none;
42
  border: none;
43
  }
44
-
45
  #search-bar {
46
  padding: 0px;
47
  }
48
 
49
- /* Hides the final AutoEvalColumn */
50
- #llm-benchmark-tab-table table td:last-child,
51
- #llm-benchmark-tab-table table th:last-child {
52
- display: none;
53
  }
54
 
55
- /* Limit the width of the first AutoEvalColumn so that names don't expand too much */
56
- table td:first-child,
57
- table th:first-child {
58
- max-width: 400px;
59
- overflow: auto;
60
- white-space: nowrap;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
  }
62
 
63
- .tab-buttons button {
64
- font-size: 20px;
 
 
65
  }
66
 
67
- #scale-logo {
68
- border-style: none !important;
69
- box-shadow: none;
70
- display: block;
71
- margin-left: auto;
72
- margin-right: auto;
73
- max-width: 600px;
74
  }
75
 
76
- #scale-logo .download {
77
- display: none;
78
  }
79
- """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  custom_css = """
2
+ /* Limit the width of the first AutoEvalColumn so that names don't expand too much */
3
+ table td:first-child,
4
+ table th:first-child {
5
+ max-width: 400px;
6
+ overflow: auto;
7
+ white-space: nowrap;
8
  }
9
 
10
+ /* Full width space */
11
+ .gradio-container {
12
+ max-width: 95% !important;
13
  }
14
 
15
+ /* Text style and margins */
16
  .markdown-text {
17
  font-size: 16px !important;
18
  }
 
34
  transform: scale(1.3);
35
  }
36
 
 
 
 
 
 
 
 
 
37
  #search-bar-table-box > div:first-child {
38
  background: none;
39
  border: none;
40
  }
41
+
42
  #search-bar {
43
  padding: 0px;
44
  }
45
 
46
+ .tab-buttons button {
47
+ font-size: 20px;
 
 
48
  }
49
 
50
+ /* Filters style */
51
+ #filter_type {
52
+ border: 0;
53
+ padding-left: 0;
54
+ padding-top: 0;
55
+ }
56
+ #filter_type label {
57
+ display: flex;
58
+ }
59
+ #filter_type label > span {
60
+ margin-top: var(--spacing-lg);
61
+ margin-right: 0.5em;
62
+ }
63
+ #filter_type label > .wrap {
64
+ width: 103px;
65
+ }
66
+ #filter_type label > .wrap .wrap-inner {
67
+ padding: 2px;
68
+ }
69
+ #filter_type label > .wrap .wrap-inner input {
70
+ width: 1px;
71
+ }
72
+ #filter-columns-type {
73
+ border: 0;
74
+ padding: 0.5;
75
+ }
76
+ #filter-columns-size {
77
+ border: 0;
78
+ padding: 0.5;
79
+ }
80
+ #box-filter > .form {
81
+ border: 0;
82
  }
83
 
84
+ /* Header styles */
85
+ #header-title {
86
+ text-align: left;
87
+ display: inline-block;
88
  }
89
 
90
+ #header-row {
91
+ display: flex;
92
+ justify-content: space-between;
93
+ align-items: center;
 
 
 
94
  }
95
 
96
+ #header-row .gradio-html {
97
+ flex-grow: 1;
98
  }
99
+
100
+ #oauth-button {
101
+ height: auto;
102
+ min-width: max-content;
103
+ white-space: nowrap;
104
+ padding: 10px 20px;
105
+ border-radius: 4px;
106
+ }
107
+ """
108
+
109
+ get_window_url_params = """
110
+ function(url_params) {
111
+ const params = new URLSearchParams(window.location.search);
112
+ url_params = Object.fromEntries(params);
113
+ return url_params;
114
+ }
115
+ """
src/display/formatting.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from huggingface_hub import HfApi
2
+
3
+ API = HfApi()
4
+
5
+
6
def model_hyperlink(link, model_name):
    """Return an HTML anchor that opens *link* in a new tab and displays *model_name*."""
    return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'


def make_clickable_model(df, model_col, link_col):
    """Turn model names into hyperlinks and add an "Openness" column.

    The frame is modified in place and also returned.

    Args:
        df: Frame containing ``model_col`` and ``link_col``.
        model_col: Column holding the model display names; it is overwritten
            with HTML anchors built by ``model_hyperlink``.
        link_col: Column holding each model's URL.

    Returns:
        The same ``df`` with ``model_col`` rewritten and an ``"Openness"``
        column set to ``"Open"`` when the link contains ``huggingface.co`` and
        ``"Closed"`` otherwise.
    """
    # Plain lists are used instead of a row-wise `DataFrame.apply`: on an empty
    # frame `apply(axis=1)` returns a DataFrame, which cannot be assigned to a
    # single column.
    links = df[link_col].tolist()
    names = df[model_col].tolist()

    # A missing link (NaN/None) is not a string: keep the plain name rather than
    # linking to "nan", and treat the model as not hosted on the Hub.
    df[model_col] = [
        model_hyperlink(link, name) if isinstance(link, str) else name
        for link, name in zip(links, names)
    ]
    df["Openness"] = [
        "Open" if isinstance(link, str) and "huggingface.co" in link else "Closed"
        for link in links
    ]
    return df
18
+
19
+
20
def styled_error(error):
    """Wrap *error* in a centered, red, 20px HTML paragraph."""
    template = "<p style='color: red; font-size: 20px; text-align: center;'>{error}</p>"
    return template.format(error=error)
22
+
23
+
24
def styled_warning(warn):
    """Wrap *warn* in a centered, orange, 20px HTML paragraph."""
    template = "<p style='color: orange; font-size: 20px; text-align: center;'>{warn}</p>"
    return template.format(warn=warn)
26
+
27
+
28
def styled_message(message):
    """Wrap *message* in a centered, green, 20px HTML paragraph."""
    template = "<p style='color: green; font-size: 20px; text-align: center;'>{message}</p>"
    return template.format(message=message)
30
+
31
+
32
def has_no_nan_values(df, columns):
    """Return a boolean Series that is True for rows with no missing value in *columns*."""
    present = df[columns].notna()
    return present.all(axis=1)
34
+
35
+
36
def has_nan_values(df, columns):
    """Return a boolean Series that is True for rows with at least one missing value in *columns*."""
    missing = df[columns].isna()
    return missing.any(axis=1)
src/display/utils.py ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass, make_dataclass
2
+ from enum import Enum
3
+ import json
4
+ import logging
5
+ from datetime import datetime
6
+ import pandas as pd
7
+
8
+
9
+ # Configure logging
10
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
11
+
12
+ # Convert ISO 8601 dates to datetime objects for comparison
13
def parse_iso8601_datetime(date_str):
    """Parse an ISO 8601 string into a ``datetime``, treating a trailing ``Z`` as ``+00:00``."""
    # Rewrite the "Z" suffix as an explicit UTC offset before parsing.
    normalized = date_str[:-1] + '+00:00' if date_str.endswith('Z') else date_str
    return datetime.fromisoformat(normalized)
17
+
18
def parse_datetime(datetime_str):
    """Parse a timestamp whose time fields are separated by dashes, colons or spaces.

    Each known layout is tried in turn. When none matches, an error is logged
    and the Unix epoch (1970-01-01) is returned instead of raising.
    """
    candidate_formats = (
        "%Y-%m-%dT%H-%M-%S.%f",  # Format with dashes
        "%Y-%m-%dT%H:%M:%S.%f",  # Standard format with colons
        "%Y-%m-%dT%H %M %S.%f",  # Spaces as separator
    )

    parsed = None
    for candidate in candidate_formats:
        try:
            parsed = datetime.strptime(datetime_str, candidate)
        except ValueError:
            continue
        break

    if parsed is not None:
        return parsed

    # Fall back to the Unix start time for files with an incorrect time (legacy files).
    logging.error(f"No valid date format found for: {datetime_str}")
    return datetime(1970, 1, 1)
33
+
34
+
35
def load_json_data(file_path):
    """Safely load JSON data from a file.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The decoded JSON value, or ``None`` when the file content is not valid
        JSON (a message is printed in that case).

    Raises:
        OSError: If the file cannot be opened (for example, it does not exist).
    """
    try:
        # Read as UTF-8 explicitly so decoding does not depend on the
        # platform's default locale encoding.
        with open(file_path, "r", encoding="utf-8") as file:
            return json.load(file)
    except json.JSONDecodeError:
        print(f"Error reading JSON from {file_path}")
        return None  # Or raise an exception
43
+
44
+
45
def fields(raw_class):
    """Return the values in ``raw_class.__dict__`` whose names neither start nor end with ``__``."""
    collected = []
    for attr_name, attr_value in raw_class.__dict__.items():
        if attr_name.startswith("__") or attr_name.endswith("__"):
            continue
        collected.append(attr_value)
    return collected
47
+
48
+
49
+ column_map = {
50
+ "T": "T",
51
+ "model": "Model",
52
+ "type": "Model Type",
53
+ "size_range": "Size Range",
54
+ "complete": "Complete",
55
+ "instruct": "Instruct",
56
+ "average": "Average",
57
+ "elo_mle": "Elo Rating",
58
+ "link": "Link",
59
+ "act_param": "#Act Params (B)",
60
+ "size": "#Params (B)",
61
+ "moe": "MoE",
62
+ "lazy": "Lazy",
63
+ "openness": "Openness",
64
+ "direct_complete": "Direct Completion",
65
+ }
66
+
67
+ type_map = {
68
+ "🔶": "🔶 Chat Models (RLHF, DPO, IFT, ...)",
69
+ "🟢": "🟢 Base Models"
70
+ }
71
+
72
+ moe_map = {
73
+ True: "MoE",
74
+ False: "Dense"
75
+ }
76
+ # These classes are for user facing column names,
77
+ # to avoid having to change them all around the code
78
+ # when a modif is needed
79
@dataclass(frozen=True)
class ColumnContent:
    """Immutable description of one leaderboard column."""

    # User-facing column header
    name: str
    # Column datatype label; values used in this file are "str", "number", "markdown" and "bool"
    type: str
    # Whether the column is displayed by default
    displayed_by_default: bool
    # Optional flags, all off unless set explicitly
    hidden: bool = False
    never_hidden: bool = False
    dummy: bool = False
87
+
88
+
89
+ auto_eval_column_dict = []
90
+ # Init
91
+ auto_eval_column_dict.append(["T", ColumnContent, ColumnContent(column_map["T"], "str", True, never_hidden=True)])
92
+ auto_eval_column_dict.append(["model", ColumnContent, ColumnContent(column_map["model"], "markdown", True, never_hidden=True)])
93
+ auto_eval_column_dict.append(["type", ColumnContent, ColumnContent(column_map["type"], "str", False, True)])
94
+ auto_eval_column_dict.append(["size_range", ColumnContent, ColumnContent(column_map["size_range"], "str", False, True)])
95
+ # Scores
96
+ auto_eval_column_dict.append(["complete", ColumnContent, ColumnContent(column_map["complete"], "number", True)])
97
+ auto_eval_column_dict.append(["instruct", ColumnContent, ColumnContent(column_map["instruct"], "number", True)])
98
+ auto_eval_column_dict.append(["average", ColumnContent, ColumnContent(column_map["average"], "number", True)])
99
+ auto_eval_column_dict.append(["elo_mle", ColumnContent, ColumnContent(column_map["elo_mle"], "number", True)])
100
+
101
+ # Model information
102
+ auto_eval_column_dict.append(["act_param", ColumnContent, ColumnContent(column_map["act_param"], "number", True)])
103
+ auto_eval_column_dict.append(["link", ColumnContent, ColumnContent(column_map["link"], "str", False, True)])
104
+ auto_eval_column_dict.append(["size", ColumnContent, ColumnContent(column_map["size"], "number", False)])
105
+ auto_eval_column_dict.append(["lazy", ColumnContent, ColumnContent(column_map["lazy"], "bool", False, True)])
106
+ auto_eval_column_dict.append(["moe", ColumnContent, ColumnContent(column_map["moe"], "str", False, True)])
107
+ auto_eval_column_dict.append(["openness", ColumnContent, ColumnContent(column_map["openness"], "str", False, True)])
108
+ auto_eval_column_dict.append(["direct_complete", ColumnContent, ColumnContent(column_map["direct_complete"], "bool", False)])
109
+
110
+ # We use make dataclass to dynamically fill the scores from Tasks
111
+ AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
112
+
113
+
114
@dataclass(frozen=True)
class EvalQueueColumn:  # Queue column
    """Column descriptors for the evaluation-queue table.

    The attributes carry no annotations, so they are plain class attributes
    rather than dataclass fields.
    """

    model_link = ColumnContent("link", "markdown", True)
    model_name = ColumnContent("model", "str", True)
118
+
119
@dataclass
class ModelDetails:
    """A model name with an optional symbol."""

    name: str
    # emoji, only for the model type
    symbol: str = ""
123
+
124
+
125
+ # Column selection
126
+ COLS = [c.name for c in fields(AutoEvalColumn)]
127
+ TYPES = [c.type for c in fields(AutoEvalColumn)]
128
+
129
+ EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
130
+ EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
131
+
132
+
133
+ NUMERIC_INTERVALS = {
134
+ "?": pd.Interval(-1, 0, closed="right"),
135
+ "~1.5": pd.Interval(0, 2, closed="right"),
136
+ "~3": pd.Interval(2, 4, closed="right"),
137
+ "~7": pd.Interval(4, 9, closed="right"),
138
+ "~13": pd.Interval(9, 20, closed="right"),
139
+ "~35": pd.Interval(20, 45, closed="right"),
140
+ "~60": pd.Interval(45, 70, closed="right"),
141
+ "70+": pd.Interval(70, 10000, closed="right"),
142
+ }
src/envs.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from huggingface_hub import HfApi
3
+
4
+ # clone / pull the lmeh eval data
5
+ HF_TOKEN = os.environ.get("HF_TOKEN", None)
6
+
7
+ REPO_ID = "bigcode/bigcodebench-leaderboard"
8
+ QUEUE_REPO = "bigcode/bigcodebench-requests"
9
+ RESULT_REPO = "bigcode/bigcodebench-results"
10
+ HARD_RESULT_REPO = "bigcode/bigcodebench-hard-results"
11
+
12
+ ELO_REPO = "bigcode/bigcodebench-elo"
13
+ HARD_ELO_REPO = "bigcode/bigcodebench-hard-elo"
14
+ SOLVE_REPO = "bigcode/bigcodebench-solve-rate"
15
+ HARD_SOLVE_REPO = "bigcode/bigcodebench-hard-solve-rate"
16
+
17
+ VOTES_REPO = "bigcode/bigcodebench-votes"
18
+
19
+ HF_HOME = os.getenv("HF_HOME", ".")
20
+
21
+ # Check HF_HOME write access
22
+ print(f"Initial HF_HOME set to: {HF_HOME}")
23
+
24
+ if not os.access(HF_HOME, os.W_OK):
25
+ print(f"No write access to HF_HOME: {HF_HOME}. Resetting to current directory.")
26
+ HF_HOME = "."
27
+ os.environ["HF_HOME"] = HF_HOME
28
+ else:
29
+ print("Write access confirmed for HF_HOME")
30
+
31
+ VOTES_PATH = os.path.join(HF_HOME, "model-votes")
32
+ EVAL_REQUESTS_PATH = os.path.join(HF_HOME, "eval-queue")
33
+
34
+ # Rate limit variables
35
+ RATE_LIMIT_PERIOD = 7
36
+ RATE_LIMIT_QUOTA = 5
37
+ HAS_HIGHER_RATE_LIMIT = []
38
+
39
+ API = HfApi(token=HF_TOKEN)
src/populate.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pathlib
2
+ import pandas as pd
3
+ from datasets import Dataset
4
+ from src.display.formatting import has_no_nan_values, make_clickable_model
5
+ from src.display.utils import AutoEvalColumn, EvalQueueColumn
6
+ from src.display.utils import load_json_data, column_map, type_map, moe_map, NUMERIC_INTERVALS
7
+
8
+
9
+
10
def get_evaluation_queue_df(save_path, cols):
    """Generate dataframes for finished, running, and pending evaluation entries.

    Every ``*.json`` file found recursively under ``save_path`` is loaded and
    bucketed by its ``"status"`` value.

    Args:
        save_path: Directory containing the evaluation request files.
        cols: Column names used for each returned dataframe.

    Returns:
        A 3-tuple of dataframes in the order ``(finished, running, pending)``.
    """
    save_path = pathlib.Path(save_path)

    all_evals = []
    for path in save_path.rglob("*.json"):
        data = load_json_data(path)
        # Previously the loaded entry was discarded, so every returned frame
        # was always empty. Unreadable files (None) and non-object JSON are
        # skipped.
        if isinstance(data, dict):
            all_evals.append(data)

    # Organizing data by status
    status_map = {
        "PENDING": ["PENDING", "RERUN"],
        "RUNNING": ["RUNNING"],
        "FINISHED": ["FINISHED", "PENDING_NEW_EVAL"],
    }
    status_dfs = {status: [] for status in status_map}
    for eval_data in all_evals:
        # Entries without a status match no bucket and are left out.
        eval_status = eval_data.get("status")
        for status, extra_statuses in status_map.items():
            if eval_status in extra_statuses:
                status_dfs[status].append(eval_data)

    return tuple(pd.DataFrame(status_dfs[status], columns=cols) for status in ["FINISHED", "RUNNING", "PENDING"])
30
+
31
+
32
def get_leaderboard_df(leaderboard_dataset: Dataset, cols: list):
    """Build the display dataframe for the leaderboard from the results dataset.

    The raw columns are renamed through ``column_map``, derived columns are
    added, model names become hyperlinks, rows are sorted by the "complete"
    score in descending order, and the result is restricted to ``cols`` and
    rounded to two decimals.
    """
    # Pivot the column-oriented dict into one record per row.
    columns_by_name = leaderboard_dataset.to_dict()
    row_count = leaderboard_dataset.num_rows
    records = [
        {key: columns_by_name[key][ix] for key in columns_by_name.keys()}
        for ix in range(row_count)
    ]

    # map column names
    df = pd.DataFrame.from_records(records).rename(columns=column_map)

    moe_col = AutoEvalColumn.moe.name
    type_col = AutoEvalColumn.type.name
    complete_col = AutoEvalColumn.complete.name
    instruct_col = AutoEvalColumn.instruct.name

    df[moe_col] = df[moe_col].map(moe_map)
    # Copy the raw type value into the "T" column before it is mapped to its long label.
    df[AutoEvalColumn.T.name] = df[type_col]
    df[type_col] = df[type_col].map(type_map)

    def _average(row):
        # The average is only defined when both scores are present.
        if pd.isna(row[complete_col]) or pd.isna(row[instruct_col]):
            return None
        return (row[complete_col] + row[instruct_col]) / 2

    def _size_bucket(size):
        # Label of the first interval containing the size, "?" when none does.
        for label, interval in NUMERIC_INTERVALS.items():
            if size in interval:
                return label
        return "?"

    df[AutoEvalColumn.average.name] = df.apply(_average, axis=1)
    df[AutoEvalColumn.size_range.name] = df[AutoEvalColumn.size.name].apply(_size_bucket)

    df = make_clickable_model(df, AutoEvalColumn.model.name, AutoEvalColumn.link.name)
    df = df.sort_values(by=[complete_col], ascending=False)
    return df[cols].round(decimals=2)
src/{utils.py → tools/plots.py} RENAMED
@@ -1,45 +1,6 @@
1
- # source: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/blob/main/src/utils_display.py
2
- from dataclasses import dataclass
3
  import plotly.graph_objects as go
4
- from transformers import AutoConfig
5
  import plotly.express as px
6
  import numpy as np
7
- # These classes are for user facing column names, to avoid having to change them
8
- # all around the code when a modif is needed
9
- @dataclass
10
- class ColumnContent:
11
- name: str
12
- type: str
13
- displayed_by_default: bool
14
- hidden: bool = False
15
-
16
-
17
- def fields(raw_class):
18
- return [
19
- v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"
20
- ]
21
-
22
-
23
- @dataclass(frozen=True)
24
- class AutoEvalColumn: # Auto evals column
25
- model_type_symbol = ColumnContent("type", "str", True)
26
- model = ColumnContent("model", "markdown", True)
27
- complete_score = ColumnContent("complete", "number", True)
28
- instruct_score = ColumnContent("instruct", "number", True)
29
- elo_mle = ColumnContent("elo_mle", "number", True)
30
- dummy = ColumnContent("model", "str", True)
31
- size = ColumnContent("size", "number", True)
32
-
33
-
34
- def model_hyperlink(link, model_name):
35
- return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
36
-
37
-
38
- def make_clickable_names(df):
39
- df["model"] = df.apply(
40
- lambda row: model_hyperlink(row["link"], row["model"]), axis=1
41
- )
42
- return df
43
 
44
 
45
  def plot_elo_mle(df):
@@ -63,13 +24,6 @@ def plot_solve_rate(df, task, rows=30, cols=38):
63
  values = np.array(values)
64
 
65
  n = len(values)
66
- if rows is None or cols is None:
67
- cols = int(math.sqrt(n))
68
- rows = cols if cols * cols >= n else cols + 1
69
-
70
- while rows * cols < n:
71
- cols += 1
72
-
73
  values = np.pad(values, (0, rows * cols - n), 'constant', constant_values=np.nan).reshape((rows, cols))
74
  keys = np.pad(keys, (0, rows * cols - n), 'constant', constant_values='').reshape((rows, cols))
75
 
@@ -102,40 +56,4 @@ def plot_solve_rate(df, task, rows=30, cols=38):
102
  # height=600,
103
  )
104
 
105
- return fig
106
-
107
-
108
- def styled_error(error):
109
- return f"<p style='color: red; font-size: 20px; text-align: center;'>{error}</p>"
110
-
111
-
112
- def styled_warning(warn):
113
- return f"<p style='color: orange; font-size: 20px; text-align: center;'>{warn}</p>"
114
-
115
-
116
- def styled_message(message):
117
- return f"<p style='color: green; font-size: 20px; text-align: center;'>{message}</p>"
118
-
119
-
120
- def has_no_nan_values(df, columns):
121
- return df[columns].notna().all(axis=1)
122
-
123
-
124
- def has_nan_values(df, columns):
125
- return df[columns].isna().any(axis=1)
126
-
127
-
128
- def is_model_on_hub(model_name: str, revision: str) -> bool:
129
- try:
130
- AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=False)
131
- return True, None
132
-
133
- except ValueError:
134
- return (
135
- False,
136
- "needs to be launched with `trust_remote_code=True`. For safety reason, we do not allow these models to be automatically submitted to the leaderboard.",
137
- )
138
-
139
- except Exception as e:
140
- print(f"Could not get the model config from the hub.: {e}")
141
- return False, "was not found on hub!"
 
 
 
1
  import plotly.graph_objects as go
 
2
  import plotly.express as px
3
  import numpy as np
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
 
5
 
6
  def plot_elo_mle(df):
 
24
  values = np.array(values)
25
 
26
  n = len(values)
 
 
 
 
 
 
 
27
  values = np.pad(values, (0, rows * cols - n), 'constant', constant_values=np.nan).reshape((rows, cols))
28
  keys = np.pad(keys, (0, rows * cols - n), 'constant', constant_values='').reshape((rows, cols))
29
 
 
56
  # height=600,
57
  )
58
 
59
+ return fig
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/voting/vote_system.py ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import logging
3
+ import pathlib
4
+ import pandas as pd
5
+ import gradio as gr
6
+ import schedule
7
+ import time
8
+ from datetime import datetime, timezone
9
+
10
+ from src.envs import API
11
+
12
+ # Set up logging
13
+ logging.basicConfig(level=logging.INFO)
14
+ logger = logging.getLogger(__name__)
15
+
16
class VoteManager:
    """Record votes for models in a local JSONL file and upload that file to a dataset repo.

    Votes are appended to ``<votes_path>/votes_data.jsonl`` (one JSON object
    per line). ``upload_votes`` pushes the whole file to ``repo_id`` when there
    are votes that have not been uploaded yet.
    """

    # Name of the JSONL file, inside ``votes_path``, holding one vote per line.
    VOTES_FILENAME = "votes_data.jsonl"
    # Keys written for each vote; gives an empty vote dataset a usable schema.
    VOTE_COLUMNS = ["model", "revision", "username", "timestamp"]

    def __init__(self, votes_path, eval_requests_path, repo_id):
        """Load the existing votes and prepare the duplicate-vote lookup set.

        Args:
            votes_path: Directory containing (or that will contain) the votes file.
            eval_requests_path: Directory of per-user folders with request JSON files.
            repo_id: Dataset repository the votes file is uploaded to.
        """
        self.votes_path = votes_path
        self.eval_requests_path = eval_requests_path
        self.repo_id = repo_id
        self.vote_dataset = self.read_vote_dataset()
        self.vote_check_set = self.make_check_set(self.vote_dataset)
        self.votes_to_upload = []

    def _votes_file(self):
        """Return the path of the local votes JSONL file."""
        return pathlib.Path(self.votes_path) / self.VOTES_FILENAME

    def init_vote_dataset(self):
        """Reload the votes from disk and rebuild the duplicate-vote lookup set."""
        self.vote_dataset = self.read_vote_dataset()
        self.vote_check_set = self.make_check_set(self.vote_dataset)

    def read_vote_dataset(self):
        """Read the local votes file into a dataframe.

        Returns:
            One row per recorded vote. When the file is missing or holds no
            votes, an empty dataframe with the ``VOTE_COLUMNS`` columns is
            returned so that callers can still group and merge on them.
        """
        votes = []
        votes_file = self._votes_file()
        if votes_file.exists():
            with open(votes_file, "r", encoding="utf-8") as f:
                for line in f:
                    line = line.strip()
                    # Tolerate blank lines (e.g. a trailing newline left by manual edits).
                    if not line:
                        continue
                    votes.append(json.loads(line))
        if not votes:
            return pd.DataFrame(columns=self.VOTE_COLUMNS)
        return pd.DataFrame(votes)

    def make_check_set(self, vote_dataset: pd.DataFrame):
        """Return the set of ``(model, revision, username)`` tuples present in *vote_dataset*."""
        return {
            (row.model, row.revision, row.username)
            for row in vote_dataset.itertuples(index=False, name='vote')
        }

    def get_model_revision(self, selected_model: str) -> str:
        """Fetch the revision for the given model from the request files.

        Returns ``"main"`` when no request file mentions the model, when the
        matching request has no ``"revision"`` key, or when the request
        directory does not exist.
        """
        requests_dir = pathlib.Path(self.eval_requests_path)
        if not requests_dir.is_dir():
            return "main"

        for user_folder in requests_dir.iterdir():
            if not user_folder.is_dir():
                continue
            for file in user_folder.glob("*.json"):
                try:
                    with open(file, "r", encoding="utf-8") as f:
                        data = json.load(f)
                except (OSError, ValueError) as e:
                    # A single unreadable or malformed request must not block voting.
                    logger.warning("Skipping unreadable request file %s: %s", file, e)
                    continue
                if isinstance(data, dict) and data.get("model") == selected_model:
                    return data.get("revision", "main")
        return "main"

    def create_request_vote_df(self, pending_models_df: gr.Dataframe):
        """Attach vote counts to the pending-models table and sort by popularity.

        Args:
            pending_models_df: Table of pending models. It is returned
                unchanged when it is empty or has no ``"model_name"`` column;
                otherwise it must also provide a ``"revision"`` column.

        Returns:
            The table with a ``"vote_count"`` column (0 for models without
            votes), sorted by vote count (descending) then model name, with
            the ``"model_name"`` and ``"model"`` columns removed.
        """
        if pending_models_df.empty or "model_name" not in pending_models_df.columns:
            return pending_models_df
        self.vote_dataset = self.read_vote_dataset()
        vote_counts = self.vote_dataset.groupby(['model', 'revision']).size().reset_index(name='vote_count')

        pending_models_df_votes = pd.merge(
            pending_models_df,
            vote_counts,
            left_on=["model_name", 'revision'],
            right_on=['model', 'revision'],
            how='left'
        )
        # Filling empty votes
        pending_models_df_votes['vote_count'] = pending_models_df_votes['vote_count'].fillna(0)
        pending_models_df_votes = pending_models_df_votes.sort_values(by=["vote_count", "model_name"], ascending=[False, True])
        # Removing useless columns
        pending_models_df_votes = pending_models_df_votes.drop(["model_name", "model"], axis=1)
        return pending_models_df_votes

    # Function to be called when a user votes for a model
    def add_vote(
        self,
        selected_model: str,
        pending_models_df: gr.Dataframe,
        profile: gr.OAuthProfile | None
    ):
        """Record one vote by the logged-in user for *selected_model*.

        A warning is shown and ``None`` is returned when no model is selected,
        when the user is not logged in, when the user already voted for this
        model revision, or when the vote cannot be written to disk.

        Returns:
            The refreshed pending-models table from ``create_request_vote_df``
            on success, otherwise ``None``.
        """
        # model_name, revision, user_id, timestamp
        if selected_model in ["str", ""]:
            gr.Warning("No model selected")
            return

        if profile is None:
            gr.Warning("Hub Login required")
            return

        vote_username = profile.username
        model_revision = self.get_model_revision(selected_model)

        # Tuple (immutable) used to check whether the user already voted for this model.
        check_tuple = (selected_model, model_revision, vote_username)
        if check_tuple in self.vote_check_set:
            gr.Warning("Already voted for this model")
            return

        current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

        vote_obj = {
            "model": selected_model,
            "revision": model_revision,
            "username": vote_username,
            "timestamp": current_time
        }

        # Append the vote to the JSONL file
        try:
            votes_file = self._votes_file()
            # The votes directory may not exist yet on a fresh start.
            votes_file.parent.mkdir(parents=True, exist_ok=True)
            with open(votes_file, "a", encoding="utf-8") as f:
                f.write(json.dumps(vote_obj) + "\n")
            logger.info(f"Vote added locally: {vote_obj}")

            self.votes_to_upload.append(vote_obj)
        except (OSError, TypeError, ValueError) as e:
            logger.error(f"Failed to write vote to file: {e}")
            gr.Warning("Failed to record vote. Please try again")
            return

        self.vote_check_set.add(check_tuple)
        gr.Info(f"Voted for {selected_model}")

        return self.create_request_vote_df(pending_models_df)

    def upload_votes(self):
        """Upload the local votes file to the dataset repo if there are new votes.

        Failures are logged and the pending list is kept, so a later call
        tries again.
        """
        if not self.votes_to_upload:
            return

        votes_file = self._votes_file()
        try:
            with open(votes_file, "rb") as f:
                API.upload_file(
                    path_or_fileobj=f,
                    path_in_repo="votes_data.jsonl",
                    repo_id=self.repo_id,
                    repo_type="dataset",
                    commit_message="Updating votes_data.jsonl with new votes",
                )
            logger.info("Votes uploaded to votes repository")
            self.votes_to_upload.clear()
        except Exception as e:
            # Boundary of a background task: log any failure instead of crashing.
            logger.error(f"Failed to upload votes to repository: {e}")
146
+
147
def run_scheduler(vote_manager):
    """Run pending ``schedule`` jobs forever, polling once per second.

    ``vote_manager`` is accepted but not used in this function. The loop never
    returns.
    """
    poll_interval_seconds = 1
    while True:
        schedule.run_pending()
        time.sleep(poll_interval_seconds)