chivier committed on
Commit
94797da
·
1 Parent(s): f745515

sync from github

Browse files
Files changed (4) hide show
  1. README.md +1 -1
  2. app.py +18 -17
  3. requirements.txt +1 -1
  4. src/leaderboard/read_evals.py +16 -9
README.md CHANGED
@@ -4,7 +4,7 @@ emoji: 🔥
4
  colorFrom: green
5
  colorTo: indigo
6
  sdk: gradio
7
- sdk_version: 4.9.0
8
  app_file: app.py
9
  pinned: true
10
  license: apache-2.0
 
4
  colorFrom: green
5
  colorTo: indigo
6
  sdk: gradio
7
+ sdk_version: 4.36.1
8
  app_file: app.py
9
  pinned: true
10
  license: apache-2.0
app.py CHANGED
@@ -75,7 +75,7 @@ def restart_space():
75
 
76
 
77
  def init_space():
78
- dataset_df = get_dataset_summary_table(file_path="blog/Hallucination-Leaderboard-Summary.csv")
79
 
80
  if socket.gethostname() not in {"neuromancer"}:
81
  # sync model_type with open-llm-leaderboard
@@ -90,7 +90,8 @@ def init_space():
90
  finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = get_evaluation_queue_df(
91
  EVAL_REQUESTS_PATH, EVAL_COLS
92
  )
93
- return dataset_df, original_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df
 
94
 
95
 
96
  def add_benchmark_columns(shown_columns):
@@ -353,21 +354,21 @@ with demo:
353
  queue=True,
354
  )
355
 
356
- with gr.TabItem("About", elem_id="llm-benchmark-tab-table", id=2):
357
- gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
358
-
359
- dataset_table = gr.components.Dataframe(
360
- value=dataset_df,
361
- headers=list(dataset_df.columns),
362
- datatype=["str", "markdown", "str", "str", "str"],
363
- elem_id="dataset-table",
364
- interactive=False,
365
- visible=True,
366
- column_widths=["15%", "20%"],
367
- )
368
-
369
- gr.Markdown(LLM_BENCHMARKS_DETAILS, elem_classes="markdown-text")
370
- gr.Markdown(FAQ_TEXT, elem_classes="markdown-text")
371
 
372
  with gr.TabItem("Submit a model ", elem_id="llm-benchmark-tab-table", id=3):
373
  with gr.Column():
 
75
 
76
 
77
  def init_space():
78
+ # dataset_df = get_dataset_summary_table(file_path="blog/Hallucination-Leaderboard-Summary.csv")
79
 
80
  if socket.gethostname() not in {"neuromancer"}:
81
  # sync model_type with open-llm-leaderboard
 
90
  finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = get_evaluation_queue_df(
91
  EVAL_REQUESTS_PATH, EVAL_COLS
92
  )
93
+ # return dataset_df, original_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df
94
+ return None, original_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df
95
 
96
 
97
  def add_benchmark_columns(shown_columns):
 
354
  queue=True,
355
  )
356
 
357
+ # with gr.TabItem("About", elem_id="llm-benchmark-tab-table", id=2):
358
+ # gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
359
+
360
+ # dataset_table = gr.components.Dataframe(
361
+ # value=dataset_df,
362
+ # headers=list(dataset_df.columns),
363
+ # datatype=["str", "markdown", "str", "str", "str"],
364
+ # elem_id="dataset-table",
365
+ # interactive=False,
366
+ # visible=True,
367
+ # column_widths=["15%", "20%"],
368
+ # )
369
+
370
+ # gr.Markdown(LLM_BENCHMARKS_DETAILS, elem_classes="markdown-text")
371
+ # gr.Markdown(FAQ_TEXT, elem_classes="markdown-text")
372
 
373
  with gr.TabItem("Submit a model ", elem_id="llm-benchmark-tab-table", id=3):
374
  with gr.Column():
requirements.txt CHANGED
@@ -4,7 +4,7 @@ APScheduler
4
  black
5
  click
6
  datasets
7
- gradio
8
  gradio_client
9
  huggingface-hub
10
  matplotlib
 
4
  black
5
  click
6
  datasets
7
+ gradio==4.36.1
8
  gradio_client
9
  huggingface-hub
10
  matplotlib
src/leaderboard/read_evals.py CHANGED
@@ -277,15 +277,22 @@ def get_raw_eval_results(results_path: str, requests_path: str, is_backend: bool
277
 
278
  eval_results = {}
279
  for model_result_filepath in tqdm(model_result_filepaths, desc="reading model_result_filepaths"):
280
- # Creation of result
281
- eval_result = EvalResult.init_from_json_file(model_result_filepath, is_backend=is_backend)
282
- eval_result.update_with_request_file(requests_path)
283
- # Store results of same eval together
284
- eval_name = eval_result.eval_name
285
- if eval_name in eval_results.keys():
286
- eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
287
- else:
288
- eval_results[eval_name] = eval_result
 
 
 
 
 
 
 
289
 
290
  results = []
291
  for v in eval_results.values():
 
277
 
278
  eval_results = {}
279
  for model_result_filepath in tqdm(model_result_filepaths, desc="reading model_result_filepaths"):
280
+ try:
281
+ # Creation of result
282
+ eval_result = EvalResult.init_from_json_file(model_result_filepath, is_backend=is_backend)
283
+ eval_result.update_with_request_file(requests_path)
284
+
285
+ # Store results of same eval together
286
+ eval_name = eval_result.eval_name
287
+ if eval_name in eval_results.keys():
288
+ eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
289
+ else:
290
+ eval_results[eval_name] = eval_result
291
+
292
+ except (FileNotFoundError, ValueError, KeyError, json.JSONDecodeError) as e:
293
+ # Log the error and continue with the next file
294
+ print(f"Error processing file {model_result_filepath}: {e}")
295
+ continue
296
 
297
  results = []
298
  for v in eval_results.values():