Spaces:
Running
Running
Yotam-Perlitz
committed on
Commit
·
b5e722a
1
Parent(s):
5c9c592
improve writings
Browse files
Signed-off-by: Yotam-Perlitz <[email protected]>
app.py
CHANGED
@@ -51,16 +51,17 @@ st.divider()
|
|
51 |
|
52 |
st.markdown(
|
53 |
"""
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
BenchBench is for you if:
|
59 |
"""
|
60 |
)
|
61 |
|
62 |
st.markdown(
|
63 |
"""
|
|
|
|
|
64 |
- **You have a new benchmark**: Show that it agrees/disagrees with known benchmarks.
|
65 |
- **You are looking for a benchmark to run/trust**: Find an efficient/private/preferable alternative.
|
66 |
"""
|
@@ -68,11 +69,10 @@ st.markdown(
|
|
68 |
|
69 |
st.markdown(
|
70 |
"""
|
71 |
-
|
72 |
-
|
73 |
-
the benchmarks we compare to, and the models we use to compare with (see sidebar).
|
74 |
\n
|
75 |
-
|
76 |
"""
|
77 |
)
|
78 |
|
@@ -340,7 +340,8 @@ z_scores["date"] = z_scores["source"].apply(
|
|
340 |
|
341 |
z_scores["date"] = pd.to_datetime("20" + z_scores["date"]).dt.date
|
342 |
|
343 |
-
z_score_name = "
|
|
|
344 |
|
345 |
data = (
|
346 |
z_scores.rename(
|
@@ -348,7 +349,7 @@ data = (
|
|
348 |
"scenario": "Benchmark",
|
349 |
"z_score": z_score_name,
|
350 |
"corr_with_agg": corr_name,
|
351 |
-
"p_value_of_corr_with_agg":
|
352 |
# "n_models_of_corr_with_agg": "# Models Used",
|
353 |
"source": "Source",
|
354 |
"date": "Snapshot Date",
|
@@ -376,12 +377,12 @@ styled_data = (
|
|
376 |
)
|
377 |
.apply(highlight_uploaded_benchmark, axis=1)
|
378 |
.background_gradient(
|
379 |
-
subset=[
|
380 |
cmap="Reds",
|
381 |
vmin=0.1,
|
382 |
vmax=1,
|
383 |
)
|
384 |
-
.format(subset=[z_score_name, corr_name,
|
385 |
.set_properties(**{"text-align": "center"})
|
386 |
)
|
387 |
|
@@ -389,7 +390,7 @@ cols_used = [
|
|
389 |
"Benchmark",
|
390 |
z_score_name,
|
391 |
corr_name,
|
392 |
-
|
393 |
"Snapshot Date",
|
394 |
]
|
395 |
|
@@ -399,7 +400,7 @@ st.dataframe(
|
|
399 |
column_order=cols_used,
|
400 |
hide_index=True,
|
401 |
use_container_width=True,
|
402 |
-
height=
|
403 |
column_config={col: {"alignment": "center"} for col in cols_used},
|
404 |
)
|
405 |
|
@@ -420,9 +421,41 @@ with st.expander(label="Aggragate Benchmark scores"):
|
|
420 |
use_container_width=True,
|
421 |
)
|
422 |
|
423 |
-
|
424 |
-
|
425 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
426 |
|
427 |
@misc{berkeley-function-calling-leaderboard,
|
428 |
title={Berkeley Function Calling Leaderboard},
|
@@ -694,7 +727,7 @@ with st.expander(label="Citations"):
|
|
694 |
}
|
695 |
|
696 |
"""
|
697 |
-
|
698 |
|
699 |
|
700 |
st.subheader("Benchmark Report Card")
|
@@ -714,9 +747,9 @@ plotted_scenario = st.selectbox(
|
|
714 |
|
715 |
col1, col2, col3 = st.columns(3)
|
716 |
cur_data = data.query(f"Benchmark=='{plotted_scenario}'")
|
717 |
-
col1.metric("Relative agreement", cur_data[
|
718 |
col2.metric(corr_name, cur_data[corr_name])
|
719 |
-
col3.metric("p-value of Corr.", cur_data[
|
720 |
|
721 |
cur_df = allbench.df.query(f'scenario=="aggregate" or scenario=="{plotted_scenario}"')
|
722 |
|
@@ -837,3 +870,18 @@ st.image(
|
|
837 |
caption="**BenchBench's Standardized Approach Reduces Variance.** This ablation study demonstrates that following the best practices implemented in BenchBench significantly reduces the variance of BAT results, leading to more robust and reliable conclusions.",
|
838 |
use_column_width=True,
|
839 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
51 |
|
52 |
st.markdown(
|
53 |
"""
|
54 |
+
BenchBench rates benchmarks according to their agreement with the defined *Aggregate Benchmark* –
|
55 |
+
an enhanced representation of the benchmarks that are out there (see config in sidebar to modify).
|
56 |
+
|
57 |
+
|
|
|
58 |
"""
|
59 |
)
|
60 |
|
61 |
st.markdown(
|
62 |
"""
|
63 |
+
BenchBench is for you if:
|
64 |
+
\n
|
65 |
- **You have a new benchmark**: Show that it agrees/disagrees with known benchmarks.
|
66 |
- **You are looking for a benchmark to run/trust**: Find an efficient/private/preferable alternative.
|
67 |
"""
|
|
|
69 |
|
70 |
st.markdown(
|
71 |
"""
|
72 |
+
We also show that agreements are best represented with the BenchBench Score,
|
73 |
+
the relative agreement (Z Score) of each benchmark to the Aggregate benchmark.
|
|
|
74 |
\n
|
75 |
+
Read more in our work [Benchmark Agreement Testing Done Right](https://arxiv.org/abs/2407.13696) and the [BenchBench repo](https://github.com/IBM/benchbench)
|
76 |
"""
|
77 |
)
|
78 |
|
|
|
340 |
|
341 |
z_scores["date"] = pd.to_datetime("20" + z_scores["date"]).dt.date
|
342 |
|
343 |
+
z_score_name = "BenchBench Score"
|
344 |
+
p_val_name = "p val"
|
345 |
|
346 |
data = (
|
347 |
z_scores.rename(
|
|
|
349 |
"scenario": "Benchmark",
|
350 |
"z_score": z_score_name,
|
351 |
"corr_with_agg": corr_name,
|
352 |
+
"p_value_of_corr_with_agg": p_val_name,
|
353 |
# "n_models_of_corr_with_agg": "# Models Used",
|
354 |
"source": "Source",
|
355 |
"date": "Snapshot Date",
|
|
|
377 |
)
|
378 |
.apply(highlight_uploaded_benchmark, axis=1)
|
379 |
.background_gradient(
|
380 |
+
subset=[p_val_name],
|
381 |
cmap="Reds",
|
382 |
vmin=0.1,
|
383 |
vmax=1,
|
384 |
)
|
385 |
+
.format(subset=[z_score_name, corr_name, p_val_name], formatter="{:.2}")
|
386 |
.set_properties(**{"text-align": "center"})
|
387 |
)
|
388 |
|
|
|
390 |
"Benchmark",
|
391 |
z_score_name,
|
392 |
corr_name,
|
393 |
+
p_val_name,
|
394 |
"Snapshot Date",
|
395 |
]
|
396 |
|
|
|
400 |
column_order=cols_used,
|
401 |
hide_index=True,
|
402 |
use_container_width=True,
|
403 |
+
height=300,
|
404 |
column_config={col: {"alignment": "center"} for col in cols_used},
|
405 |
)
|
406 |
|
|
|
421 |
use_container_width=True,
|
422 |
)
|
423 |
|
424 |
+
left, right = st.columns([1, 1])
|
425 |
+
|
426 |
+
with left:
|
427 |
+
with st.expander(label="Cite Us!"):
|
428 |
+
st.code(
|
429 |
+
r"""
|
430 |
+
|
431 |
+
@misc{perlitz2024llmbenchmarksagreefixing,
|
432 |
+
title={Do These LLM Benchmarks Agree? Fixing Benchmark Evaluation with BenchBench},
|
433 |
+
author={Yotam Perlitz and Ariel Gera and Ofir Arviv and Asaf Yehudai and Elron Bandel and Eyal Shnarch and Michal Shmueli-Scheuer and Leshem Choshen},
|
434 |
+
year={2024},
|
435 |
+
eprint={2407.13696},
|
436 |
+
archivePrefix={arXiv},
|
437 |
+
primaryClass={cs.CL},
|
438 |
+
url={https://arxiv.org/abs/2407.13696},
|
439 |
+
}
|
440 |
+
|
441 |
+
"""
|
442 |
+
)
|
443 |
+
|
444 |
+
with right:
|
445 |
+
with st.expander(label="Cite Everyone Else!"):
|
446 |
+
st.code(
|
447 |
+
r"""
|
448 |
+
|
449 |
+
@misc{perlitz2024llmbenchmarksagreefixing,
|
450 |
+
title={Do These LLM Benchmarks Agree? Fixing Benchmark Evaluation with BenchBench},
|
451 |
+
author={Yotam Perlitz and Ariel Gera and Ofir Arviv and Asaf Yehudai and Elron Bandel and Eyal Shnarch and Michal Shmueli-Scheuer and Leshem Choshen},
|
452 |
+
year={2024},
|
453 |
+
eprint={2407.13696},
|
454 |
+
archivePrefix={arXiv},
|
455 |
+
primaryClass={cs.CL},
|
456 |
+
url={https://arxiv.org/abs/2407.13696},
|
457 |
+
}
|
458 |
+
|
459 |
|
460 |
@misc{berkeley-function-calling-leaderboard,
|
461 |
title={Berkeley Function Calling Leaderboard},
|
|
|
727 |
}
|
728 |
|
729 |
"""
|
730 |
+
)
|
731 |
|
732 |
|
733 |
st.subheader("Benchmark Report Card")
|
|
|
747 |
|
748 |
col1, col2, col3 = st.columns(3)
|
749 |
cur_data = data.query(f"Benchmark=='{plotted_scenario}'")
|
750 |
+
col1.metric("Relative agreement", cur_data[z_score_name])
|
751 |
col2.metric(corr_name, cur_data[corr_name])
|
752 |
+
col3.metric("p-value of Corr.", cur_data[p_val_name])
|
753 |
|
754 |
cur_df = allbench.df.query(f'scenario=="aggregate" or scenario=="{plotted_scenario}"')
|
755 |
|
|
|
870 |
caption="**BenchBench's Standardized Approach Reduces Variance.** This ablation study demonstrates that following the best practices implemented in BenchBench significantly reduces the variance of BAT results, leading to more robust and reliable conclusions.",
|
871 |
use_column_width=True,
|
872 |
)
|
873 |
+
|
874 |
+
|
875 |
+
st.code(
|
876 |
+
r"""
|
877 |
+
@misc{perlitz2024llmbenchmarksagreefixing,
|
878 |
+
title={Do These LLM Benchmarks Agree? Fixing Benchmark Evaluation with BenchBench},
|
879 |
+
author={Yotam Perlitz and Ariel Gera and Ofir Arviv and Asaf Yehudai and Elron Bandel and Eyal Shnarch and Michal Shmueli-Scheuer and Leshem Choshen},
|
880 |
+
year={2024},
|
881 |
+
eprint={2407.13696},
|
882 |
+
archivePrefix={arXiv},
|
883 |
+
primaryClass={cs.CL},
|
884 |
+
url={https://arxiv.org/abs/2407.13696},
|
885 |
+
}
|
886 |
+
"""
|
887 |
+
)
|