per committed on
Commit
81311d9
·
verified ·
2 Parent(s): 38f0641 b5e722a

Merge branch 'per/benchbench' into 'ibm/benchbench'

Browse files
Files changed (1) hide show
  1. app.py +69 -21
app.py CHANGED
@@ -51,16 +51,17 @@ st.divider()
51
 
52
  st.markdown(
53
  """
54
- The BenchBench leaderboard ranks benchmarks based on their agreement with the *Aggregate Benchmark* – a comprehensive, combined measure of existing benchmark results.
55
- \n
56
- To achive this, we scraped results from multiple benchmarks (citations below) to allow for obtaining benchmark agreement results with a wide range of benchmark using a large set of models.
57
- \n
58
- BenchBench is for you if:
59
  """
60
  )
61
 
62
  st.markdown(
63
  """
 
 
64
  - **You have a new benchmark**: Show that it agrees/disagrees with known benchmarks.
65
  - **You are looking for a benchmark to run/trust**: Find an efficient/private/preferble alternative.
66
  """
@@ -68,11 +69,10 @@ st.markdown(
68
 
69
  st.markdown(
70
  """
71
- In our work - [Benchmark Agreement Testing Done Right](https://arxiv.org/abs/2407.13696) and [opensource repo](https://github.com/IBM/benchbench),
72
- we standardize BAT and show the importance of its configurations, notably,
73
- the benchmarks we compare to, and the models we use to compare with (see sidebar).
74
  \n
75
- We also show that agreements are best represented with the relative agreement (Z Score) of each benchmark to the Aggragate benchmark, as presented below in the leaderboard.
76
  """
77
  )
78
 
@@ -340,7 +340,8 @@ z_scores["date"] = z_scores["source"].apply(
340
 
341
  z_scores["date"] = pd.to_datetime("20" + z_scores["date"]).dt.date
342
 
343
- z_score_name = "Relative agreement (Z Score)"
 
344
 
345
  data = (
346
  z_scores.rename(
@@ -348,7 +349,7 @@ data = (
348
  "scenario": "Benchmark",
349
  "z_score": z_score_name,
350
  "corr_with_agg": corr_name,
351
- "p_value_of_corr_with_agg": "p-value of Corr.",
352
  # "n_models_of_corr_with_agg": "# Models Used",
353
  "source": "Source",
354
  "date": "Snapshot Date",
@@ -376,12 +377,12 @@ styled_data = (
376
  )
377
  .apply(highlight_uploaded_benchmark, axis=1)
378
  .background_gradient(
379
- subset=["p-value of Corr."],
380
  cmap="Reds",
381
  vmin=0.1,
382
  vmax=1,
383
  )
384
- .format(subset=[z_score_name, corr_name, "p-value of Corr."], formatter="{:.2}")
385
  .set_properties(**{"text-align": "center"})
386
  )
387
 
@@ -389,7 +390,7 @@ cols_used = [
389
  "Benchmark",
390
  z_score_name,
391
  corr_name,
392
- "p-value of Corr.",
393
  "Snapshot Date",
394
  ]
395
 
@@ -399,7 +400,7 @@ st.dataframe(
399
  column_order=cols_used,
400
  hide_index=True,
401
  use_container_width=True,
402
- height=500,
403
  column_config={col: {"alignment": "center"} for col in cols_used},
404
  )
405
 
@@ -420,9 +421,41 @@ with st.expander(label="Aggragate Benchmark scores"):
420
  use_container_width=True,
421
  )
422
 
423
- with st.expander(label="Citations"):
424
- st.code(
425
- r"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
426
 
427
  @misc{berkeley-function-calling-leaderboard,
428
  title={Berkeley Function Calling Leaderboard},
@@ -694,7 +727,7 @@ with st.expander(label="Citations"):
694
  }
695
 
696
  """
697
- )
698
 
699
 
700
  st.subheader("Benchmark Report Card")
@@ -714,9 +747,9 @@ plotted_scenario = st.selectbox(
714
 
715
  col1, col2, col3 = st.columns(3)
716
  cur_data = data.query(f"Benchmark=='{plotted_scenario}'")
717
- col1.metric("Relative agreement", cur_data["Relative agreement (Z Score)"])
718
  col2.metric(corr_name, cur_data[corr_name])
719
- col3.metric("p-value of Corr.", cur_data["p-value of Corr."])
720
 
721
  cur_df = allbench.df.query(f'scenario=="aggregate" or scenario=="{plotted_scenario}"')
722
 
@@ -837,3 +870,18 @@ st.image(
837
  caption="**BenchBench's Standardized Approach Reduces Variance.** This ablation study demonstrates that following the best practices implemented in BenchBench significantly reduces the variance of BAT results, leading to more robust and reliable conclusions.",
838
  use_column_width=True,
839
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
 
52
  st.markdown(
53
  """
54
+ BenchBench rates benchmarks according to their agreement with the defined *Aggregate Benchmark* –
55
+ an enhanced representation of the benchmarks that are out there (see config in sidebar to modify).
56
+
57
+
 
58
  """
59
  )
60
 
61
  st.markdown(
62
  """
63
+ BenchBench is for you if:
64
+ \n
65
  - **You have a new benchmark**: Show that it agrees/disagrees with known benchmarks.
66
  - **You are looking for a benchmark to run/trust**: Find an efficient/private/preferble alternative.
67
  """
 
69
 
70
  st.markdown(
71
  """
72
+ We also show that agreements are best represented with the the BenchBench Score,
73
+ the relative agreement (Z Score) of each benchmark to the Aggragate benchmark.
 
74
  \n
75
+ Read more in our work [Benchmark Agreement Testing Done Right](https://arxiv.org/abs/2407.13696) and the [BenchBench repo](https://github.com/IBM/benchbench)
76
  """
77
  )
78
 
 
340
 
341
  z_scores["date"] = pd.to_datetime("20" + z_scores["date"]).dt.date
342
 
343
+ z_score_name = "BenchBench Score"
344
+ p_val_name = "p val"
345
 
346
  data = (
347
  z_scores.rename(
 
349
  "scenario": "Benchmark",
350
  "z_score": z_score_name,
351
  "corr_with_agg": corr_name,
352
+ "p_value_of_corr_with_agg": p_val_name,
353
  # "n_models_of_corr_with_agg": "# Models Used",
354
  "source": "Source",
355
  "date": "Snapshot Date",
 
377
  )
378
  .apply(highlight_uploaded_benchmark, axis=1)
379
  .background_gradient(
380
+ subset=[p_val_name],
381
  cmap="Reds",
382
  vmin=0.1,
383
  vmax=1,
384
  )
385
+ .format(subset=[z_score_name, corr_name, p_val_name], formatter="{:.2}")
386
  .set_properties(**{"text-align": "center"})
387
  )
388
 
 
390
  "Benchmark",
391
  z_score_name,
392
  corr_name,
393
+ p_val_name,
394
  "Snapshot Date",
395
  ]
396
 
 
400
  column_order=cols_used,
401
  hide_index=True,
402
  use_container_width=True,
403
+ height=300,
404
  column_config={col: {"alignment": "center"} for col in cols_used},
405
  )
406
 
 
421
  use_container_width=True,
422
  )
423
 
424
+ left, right = st.columns([1, 1])
425
+
426
+ with left:
427
+ with st.expander(label="Cite Us!"):
428
+ st.code(
429
+ r"""
430
+
431
+ @misc{perlitz2024llmbenchmarksagreefixing,
432
+ title={Do These LLM Benchmarks Agree? Fixing Benchmark Evaluation with BenchBench},
433
+ author={Yotam Perlitz and Ariel Gera and Ofir Arviv and Asaf Yehudai and Elron Bandel and Eyal Shnarch and Michal Shmueli-Scheuer and Leshem Choshen},
434
+ year={2024},
435
+ eprint={2407.13696},
436
+ archivePrefix={arXiv},
437
+ primaryClass={cs.CL},
438
+ url={https://arxiv.org/abs/2407.13696},
439
+ }
440
+
441
+ """
442
+ )
443
+
444
+ with right:
445
+ with st.expander(label="Cite Everyone Else!"):
446
+ st.code(
447
+ r"""
448
+
449
+ @misc{perlitz2024llmbenchmarksagreefixing,
450
+ title={Do These LLM Benchmarks Agree? Fixing Benchmark Evaluation with BenchBench},
451
+ author={Yotam Perlitz and Ariel Gera and Ofir Arviv and Asaf Yehudai and Elron Bandel and Eyal Shnarch and Michal Shmueli-Scheuer and Leshem Choshen},
452
+ year={2024},
453
+ eprint={2407.13696},
454
+ archivePrefix={arXiv},
455
+ primaryClass={cs.CL},
456
+ url={https://arxiv.org/abs/2407.13696},
457
+ }
458
+
459
 
460
  @misc{berkeley-function-calling-leaderboard,
461
  title={Berkeley Function Calling Leaderboard},
 
727
  }
728
 
729
  """
730
+ )
731
 
732
 
733
  st.subheader("Benchmark Report Card")
 
747
 
748
  col1, col2, col3 = st.columns(3)
749
  cur_data = data.query(f"Benchmark=='{plotted_scenario}'")
750
+ col1.metric("Relative agreement", cur_data[z_score_name])
751
  col2.metric(corr_name, cur_data[corr_name])
752
+ col3.metric("p-value of Corr.", cur_data[p_val_name])
753
 
754
  cur_df = allbench.df.query(f'scenario=="aggregate" or scenario=="{plotted_scenario}"')
755
 
 
870
  caption="**BenchBench's Standardized Approach Reduces Variance.** This ablation study demonstrates that following the best practices implemented in BenchBench significantly reduces the variance of BAT results, leading to more robust and reliable conclusions.",
871
  use_column_width=True,
872
  )
873
+
874
+
875
+ st.code(
876
+ r"""
877
+ @misc{perlitz2024llmbenchmarksagreefixing,
878
+ title={Do These LLM Benchmarks Agree? Fixing Benchmark Evaluation with BenchBench},
879
+ author={Yotam Perlitz and Ariel Gera and Ofir Arviv and Asaf Yehudai and Elron Bandel and Eyal Shnarch and Michal Shmueli-Scheuer and Leshem Choshen},
880
+ year={2024},
881
+ eprint={2407.13696},
882
+ archivePrefix={arXiv},
883
+ primaryClass={cs.CL},
884
+ url={https://arxiv.org/abs/2407.13696},
885
+ }
886
+ """
887
+ )