sheonhan committed on
Commit
1363c8a
·
1 Parent(s): f742519

do not display incomplete models for now

Browse files
Files changed (1) hide show
  1. app.py +86 -16
app.py CHANGED
@@ -93,6 +93,21 @@ if not IS_PUBLIC:
93
  EVAL_COLS = ["model", "revision", "private", "8bit_eval", "is_delta_weight", "status"]
94
  EVAL_TYPES = ["markdown", "str", "bool", "bool", "bool", "str"]
95
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96
 
97
  def get_leaderboard():
98
  if repo:
@@ -125,11 +140,22 @@ def get_leaderboard():
125
  }
126
  all_data.append(gpt35_values)
127
 
128
- dataframe = pd.DataFrame.from_records(all_data)
129
- dataframe = dataframe.sort_values(by=["Average ⬆️"], ascending=False)
130
- # print(dataframe)
131
- dataframe = dataframe[COLS]
132
- return dataframe
 
 
 
 
 
 
 
 
 
 
 
133
 
134
 
135
  def get_eval_table():
@@ -144,7 +170,7 @@ def get_eval_table():
144
  all_evals = []
145
 
146
  for entry in entries:
147
- print(entry)
148
  if ".json" in entry:
149
  file_path = os.path.join("evals/eval_requests", entry)
150
  with open(file_path) as fp:
@@ -171,12 +197,17 @@ def get_eval_table():
171
  data["model"] = make_clickable_model(data["model"])
172
  all_evals.append(data)
173
 
174
- dataframe = pd.DataFrame.from_records(all_evals)
175
- return dataframe[EVAL_COLS]
 
 
 
 
 
176
 
177
 
178
  leaderboard = get_leaderboard()
179
- eval_queue = get_eval_table()
180
 
181
 
182
  def is_model_on_hub(model_name, revision) -> bool:
@@ -237,7 +268,7 @@ def add_new_eval(
237
  if out_path.lower() in requested_models:
238
  duplicate_request_message = "This model has been already submitted."
239
  return f"<p style='color: orange; font-size: 20px; text-align: center;'>{duplicate_request_message}</p>"
240
-
241
  with open(out_path, "w") as f:
242
  f.write(json.dumps(eval_entry))
243
  LMEH_REPO = "HuggingFaceH4/lmeh_evaluations"
@@ -256,7 +287,10 @@ def add_new_eval(
256
 
257
 
258
  def refresh():
259
- return get_leaderboard(), get_eval_table()
 
 
 
260
 
261
 
262
  block = gr.Blocks()
@@ -289,16 +323,43 @@ We chose these benchmarks as they test a variety of reasoning and general knowle
289
 
290
  """
291
  )
292
- with gr.Accordion("Evaluation Queue", open=False):
 
 
 
 
 
 
 
 
293
  with gr.Row():
294
- eval_table = gr.components.Dataframe(
295
- value=eval_queue, headers=EVAL_COLS, datatype=EVAL_TYPES, max_rows=5
 
 
 
 
 
 
 
 
 
 
 
 
296
  )
297
 
298
  with gr.Row():
299
  refresh_button = gr.Button("Refresh")
300
  refresh_button.click(
301
- refresh, inputs=[], outputs=[leaderboard_table, eval_table]
 
 
 
 
 
 
 
302
  )
303
 
304
  with gr.Accordion("Submit a new model for evaluation"):
@@ -332,5 +393,14 @@ We chose these benchmarks as they test a variety of reasoning and general knowle
332
  submission_result,
333
  )
334
 
335
- block.load(refresh, inputs=[], outputs=[leaderboard_table, eval_table])
 
 
 
 
 
 
 
 
 
336
  block.launch()
 
93
  EVAL_COLS = ["model", "revision", "private", "8bit_eval", "is_delta_weight", "status"]
94
  EVAL_TYPES = ["markdown", "str", "bool", "bool", "bool", "str"]
95
 
96
+ BENCHMARK_COLS = [
97
+ "ARC (25-shot) ⬆️",
98
+ "HellaSwag (10-shot) ⬆️",
99
+ "MMLU (5-shot) ⬆️",
100
+ "TruthfulQA (0-shot) ⬆️",
101
+ ]
102
+
103
+
104
+ def has_no_nan_values(df, columns):
105
+ return df[columns].notna().all(axis=1)
106
+
107
+
108
+ def has_nan_values(df, columns):
109
+ return df[columns].isna().any(axis=1)
110
+
111
 
112
  def get_leaderboard():
113
  if repo:
 
140
  }
141
  all_data.append(gpt35_values)
142
 
143
+ df = pd.DataFrame.from_records(all_data)
144
+ df = df.sort_values(by=["Average ⬆️"], ascending=False)
145
+ df = df[COLS]
146
+
147
+ # get incomplete models
148
+ incomplete_models = df[has_nan_values(df, BENCHMARK_COLS)]["Model"].tolist()
149
+ print(
150
+ [
151
+ model.split(" style")[0].split("https://huggingface.co/")[1]
152
+ for model in incomplete_models
153
+ ]
154
+ )
155
+
156
+ # filter out if any of the benchmarks have not been produced
157
+ df = df[has_no_nan_values(df, BENCHMARK_COLS)]
158
+ return df
159
 
160
 
161
  def get_eval_table():
 
170
  all_evals = []
171
 
172
  for entry in entries:
173
+ # print(entry)
174
  if ".json" in entry:
175
  file_path = os.path.join("evals/eval_requests", entry)
176
  with open(file_path) as fp:
 
197
  data["model"] = make_clickable_model(data["model"])
198
  all_evals.append(data)
199
 
200
+ pending_list = [e for e in all_evals if e["status"] == "PENDING"]
201
+ running_list = [e for e in all_evals if e["status"] == "RUNNING"]
202
+ finished_list = [e for e in all_evals if e["status"] == "FINISHED"]
203
+ df_pending = pd.DataFrame.from_records(pending_list)
204
+ df_running = pd.DataFrame.from_records(running_list)
205
+ df_finished = pd.DataFrame.from_records(finished_list)
206
+ return df_finished[EVAL_COLS], df_running[EVAL_COLS], df_pending[EVAL_COLS]
207
 
208
 
209
  leaderboard = get_leaderboard()
210
+ finished_eval_queue, running_eval_queue, pending_eval_queue = get_eval_table()
211
 
212
 
213
  def is_model_on_hub(model_name, revision) -> bool:
 
268
  if out_path.lower() in requested_models:
269
  duplicate_request_message = "This model has been already submitted."
270
  return f"<p style='color: orange; font-size: 20px; text-align: center;'>{duplicate_request_message}</p>"
271
+
272
  with open(out_path, "w") as f:
273
  f.write(json.dumps(eval_entry))
274
  LMEH_REPO = "HuggingFaceH4/lmeh_evaluations"
 
287
 
288
 
289
  def refresh():
290
+ leaderboard = get_leaderboard()
291
+ finished_eval_queue, running_eval_queue, pending_eval_queue = get_eval_table()
292
+ get_leaderboard(), get_eval_table()
293
+ return leaderboard, finished_eval_queue, running_eval_queue, pending_eval_queue
294
 
295
 
296
  block = gr.Blocks()
 
323
 
324
  """
325
  )
326
+ with gr.Accordion("Finished Evaluations", open=False):
327
+ with gr.Row():
328
+ finished_eval_table = gr.components.Dataframe(
329
+ value=finished_eval_queue,
330
+ headers=EVAL_COLS,
331
+ datatype=EVAL_TYPES,
332
+ max_rows=5,
333
+ )
334
+ with gr.Accordion("Running Evaluation Queue", open=False):
335
  with gr.Row():
336
+ running_eval_table = gr.components.Dataframe(
337
+ value=running_eval_queue,
338
+ headers=EVAL_COLS,
339
+ datatype=EVAL_TYPES,
340
+ max_rows=5,
341
+ )
342
+
343
+ with gr.Accordion("Running & Pending Evaluation Queue", open=False):
344
+ with gr.Row():
345
+ pending_eval_table = gr.components.Dataframe(
346
+ value=pending_eval_queue,
347
+ headers=EVAL_COLS,
348
+ datatype=EVAL_TYPES,
349
+ max_rows=5,
350
  )
351
 
352
  with gr.Row():
353
  refresh_button = gr.Button("Refresh")
354
  refresh_button.click(
355
+ refresh,
356
+ inputs=[],
357
+ outputs=[
358
+ leaderboard_table,
359
+ finished_eval_table,
360
+ running_eval_table,
361
+ pending_eval_table,
362
+ ],
363
  )
364
 
365
  with gr.Accordion("Submit a new model for evaluation"):
 
393
  submission_result,
394
  )
395
 
396
+ block.load(
397
+ refresh,
398
+ inputs=[],
399
+ outputs=[
400
+ leaderboard_table,
401
+ finished_eval_table,
402
+ running_eval_table,
403
+ pending_eval_table,
404
+ ],
405
+ )
406
  block.launch()