djstrong committed on
Commit 3c5ea13 • 1 Parent(s): 1889818
Files changed (3)
  1. app.py +86 -86
  2. src/about.py +20 -1
  3. src/leaderboard/read_evals.py +2 -0
app.py CHANGED
@@ -267,92 +267,92 @@ with demo:
         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

-        with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
-            with gr.Column():
-                with gr.Row():
-                    gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
-
-                with gr.Column():
-                    with gr.Accordion(
-                        f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            finished_eval_table = gr.components.Dataframe(
-                                value=finished_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
-                    with gr.Accordion(
-                        f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            running_eval_table = gr.components.Dataframe(
-                                value=running_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
-
-                    with gr.Accordion(
-                        f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            pending_eval_table = gr.components.Dataframe(
-                                value=pending_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
-            with gr.Row():
-                gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
-
-            with gr.Row():
-                with gr.Column():
-                    model_name_textbox = gr.Textbox(label="Model name")
-                    revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
-                    model_type = gr.Dropdown(
-                        choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
-                        label="Model type",
-                        multiselect=False,
-                        value=None,
-                        interactive=True,
-                    )
-
-                with gr.Column():
-                    precision = gr.Dropdown(
-                        choices=[i.value.name for i in Precision if i != Precision.Unknown],
-                        label="Precision",
-                        multiselect=False,
-                        value="float16" if DEVICE != "cpu" else "float32",
-                        interactive=True,
-                    )
-                    weight_type = gr.Dropdown(
-                        choices=[i.value.name for i in WeightType],
-                        label="Weights type",
-                        multiselect=False,
-                        value="Original",
-                        interactive=True,
-                    )
-                    base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
-
-            submit_button = gr.Button("Submit Eval")
-            submission_result = gr.Markdown()
-            submit_button.click(
-                add_new_eval,
-                [
-                    model_name_textbox,
-                    base_model_name_textbox,
-                    revision_name_textbox,
-                    precision,
-                    weight_type,
-                    model_type,
-                ],
-                submission_result,
-            )
+        # with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
+        #     with gr.Column():
+        #         with gr.Row():
+        #             gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
+        #
+        #         with gr.Column():
+        #             with gr.Accordion(
+        #                 f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
+        #                 open=False,
+        #             ):
+        #                 with gr.Row():
+        #                     finished_eval_table = gr.components.Dataframe(
+        #                         value=finished_eval_queue_df,
+        #                         headers=EVAL_COLS,
+        #                         datatype=EVAL_TYPES,
+        #                         row_count=5,
+        #                     )
+        #             with gr.Accordion(
+        #                 f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
+        #                 open=False,
+        #             ):
+        #                 with gr.Row():
+        #                     running_eval_table = gr.components.Dataframe(
+        #                         value=running_eval_queue_df,
+        #                         headers=EVAL_COLS,
+        #                         datatype=EVAL_TYPES,
+        #                         row_count=5,
+        #                     )
+        #
+        #             with gr.Accordion(
+        #                 f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
+        #                 open=False,
+        #             ):
+        #                 with gr.Row():
+        #                     pending_eval_table = gr.components.Dataframe(
+        #                         value=pending_eval_queue_df,
+        #                         headers=EVAL_COLS,
+        #                         datatype=EVAL_TYPES,
+        #                         row_count=5,
+        #                     )
+        #     with gr.Row():
+        #         gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
+        #
+        #     with gr.Row():
+        #         with gr.Column():
+        #             model_name_textbox = gr.Textbox(label="Model name")
+        #             revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
+        #             model_type = gr.Dropdown(
+        #                 choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
+        #                 label="Model type",
+        #                 multiselect=False,
+        #                 value=None,
+        #                 interactive=True,
+        #             )
+        #
+        #         with gr.Column():
+        #             precision = gr.Dropdown(
+        #                 choices=[i.value.name for i in Precision if i != Precision.Unknown],
+        #                 label="Precision",
+        #                 multiselect=False,
+        #                 value="float16" if DEVICE != "cpu" else "float32",
+        #                 interactive=True,
+        #             )
+        #             weight_type = gr.Dropdown(
+        #                 choices=[i.value.name for i in WeightType],
+        #                 label="Weights type",
+        #                 multiselect=False,
+        #                 value="Original",
+        #                 interactive=True,
+        #             )
+        #             base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
+        #
+        #     submit_button = gr.Button("Submit Eval")
+        #     submission_result = gr.Markdown()
+        #     submit_button.click(
+        #         add_new_eval,
+        #         [
+        #             model_name_textbox,
+        #             base_model_name_textbox,
+        #             revision_name_textbox,
+        #             precision,
+        #             weight_type,
+        #             model_type,
+        #         ],
+        #         submission_result,
+        #     )

     with gr.Row():
         with gr.Accordion("📙 Citation", open=False):
src/about.py CHANGED
@@ -48,10 +48,29 @@ _mc suffix means that a model is scored against every possible class (suitable a

 # Which evaluations are you running? how can people reproduce what you have?
 LLM_BENCHMARKS_TEXT = f"""
+## Do you want to add your model to the leaderboard?
+
+Contact me: [LinkedIn](https://www.linkedin.com/in/wrobelkrzysztof/)
+
+or join our [Discord SpeakLeash](https://discord.gg/3G9DVM39)
+
 ## How it works

 ## Reproducibility
-To reproduce our results, here is the commands you can run:
+To reproduce our results, you need to clone the repository:
+
+```
+git clone https://github.com/speakleash/lm-evaluation-harness.git
+cd lm-evaluation-harness
+pip install -e .
+```
+
+and run the benchmark for 0-shot and 5-shot:
+
+```
+lm_eval --model hf --model_args pretrained=Azurro/APT3-1B-Base --tasks polish --num_fewshot 0 --device cuda:0 --batch_size 16 --verbosity DEBUG --output_path results/ --log_samples
+lm_eval --model hf --model_args pretrained=Azurro/APT3-1B-Base --tasks polish --num_fewshot 5 --device cuda:0 --batch_size 16 --verbosity DEBUG --output_path results/ --log_samples
+```

 """

src/leaderboard/read_evals.py CHANGED
@@ -268,6 +268,7 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
         eval_name = f"{eval_result.eval_name}_{n_shot}-shot"
         if eval_name in eval_results.keys():
             eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
+            #TODO: log updated
         else:
             eval_results[eval_name] = eval_result

@@ -276,6 +277,7 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
         try:
             print(v)
             v.to_dict() # we test if the dict version is complete
+            #if v.results:
             results.append(v)
         except KeyError: # not all eval values present
             print(f"not all eval values present {v.eval_name} {v.full_model}")