Spaces:
Running
Running
Yotam-Perlitz
committed on
Commit
•
363d8ae
1
Parent(s):
566ad63
improve bench upload code
Browse files · Signed-off-by: Yotam-Perlitz <[email protected]>
app.py
CHANGED
@@ -5,6 +5,7 @@ import pandas as pd
|
|
5 |
import plotly.express as px
|
6 |
import streamlit as st
|
7 |
from bat import Benchmark, Config, Reporter, Tester
|
|
|
8 |
|
9 |
|
10 |
holistic_scenarios = [
|
@@ -58,7 +59,7 @@ with st.expander("Leaderboard configurations (defaults are great BTW)", icon="
|
|
58 |
label="Select Correlation type", options=["kendall", "pearson"], index=0
|
59 |
)
|
60 |
|
61 |
-
|
62 |
# [
|
63 |
# scen
|
64 |
# for scen in all_scenarios_for_aggragate
|
@@ -98,7 +99,38 @@ with st.expander("Add your benchmarks here!", icon="🔥"):
|
|
98 |
my_benchmark = Benchmark()
|
99 |
if uploaded_file is not None:
|
100 |
df = pd.read_csv(uploaded_file)
|
101 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
102 |
|
103 |
|
104 |
def run_load(
|
@@ -108,7 +140,7 @@ def run_load(
|
|
108 |
corr_types=["kendall"],
|
109 |
n_exps=10,
|
110 |
my_benchmark=Benchmark(),
|
111 |
-
use_caching=
|
112 |
):
|
113 |
# Create a hash of the inputs to generate a unique cache file for each set of inputs
|
114 |
input_str = (
|
@@ -147,24 +179,6 @@ def run_load(
|
|
147 |
else:
|
148 |
print("Cached results not found, calculating")
|
149 |
|
150 |
-
cfg = Config(
|
151 |
-
exp_to_run="example",
|
152 |
-
n_models_taken_list=n_models_taken_list,
|
153 |
-
model_select_strategy_list=model_select_strategy_list,
|
154 |
-
corr_types=corr_types,
|
155 |
-
n_exps=n_exps if n_models_taken_list != [0] else 1,
|
156 |
-
)
|
157 |
-
|
158 |
-
# holistic = Benchmark()
|
159 |
-
# holistic.load_local_catalog()
|
160 |
-
# holistic.df = holistic.df.query("scenario in @holistic_scenarios")
|
161 |
-
|
162 |
-
# holistic.clear_repeated_scenarios()
|
163 |
-
|
164 |
-
# aggragate_scores = holistic.df.query('scenario=="aggregate"')[
|
165 |
-
# ["model", "score"]
|
166 |
-
# ].sort_values(by="score", ascending=False)
|
167 |
-
|
168 |
allbench = Benchmark()
|
169 |
allbench.load_local_catalog()
|
170 |
|
@@ -172,22 +186,43 @@ def run_load(
|
|
172 |
new_col_name="aggregate",
|
173 |
agg_source_name="aggregate",
|
174 |
scenario_whitelist=aggregate_scenario_whitelist,
|
175 |
-
min_scenario_for_models_to_appear_in_agg=1
|
|
|
|
|
176 |
)
|
177 |
|
|
|
|
|
|
|
178 |
aggragate_scores = allbench.df.query('scenario=="aggregate"')[
|
179 |
["model", "score"]
|
180 |
].sort_values(by="score", ascending=False)
|
181 |
|
182 |
-
|
|
|
|
|
|
|
183 |
|
184 |
-
|
185 |
-
|
186 |
-
|
|
|
|
|
|
|
|
|
187 |
|
188 |
-
|
189 |
-
|
190 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
191 |
|
192 |
tester = Tester(cfg=cfg)
|
193 |
|
@@ -205,7 +240,7 @@ def run_load(
|
|
205 |
|
206 |
|
207 |
agreements, aggragare_score_df = run_load(
|
208 |
-
aggregate_scenario_whitelist=
|
209 |
n_models_taken_list=n_models_taken_list,
|
210 |
model_select_strategy_list=[model_select_strategy],
|
211 |
corr_types=[corr_type],
|
@@ -255,7 +290,7 @@ data = (
|
|
255 |
|
256 |
# Apply a background color to rows based on their 'Source' value
|
257 |
def highlight_uploaded_benchmark(row):
|
258 |
-
if row["Source"]
|
259 |
return ["background-color: rgba(100,100,100,0.1)"] * len(row)
|
260 |
else:
|
261 |
return [""] * len(row)
|
@@ -314,6 +349,16 @@ with st.expander(label="Model scored by the aggragate"):
|
|
314 |
with st.expander(label="Citations"):
|
315 |
st.code(
|
316 |
r"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
317 |
@misc{liu2023agentbenchevaluatingllmsagents,
|
318 |
title={AgentBench: Evaluating LLMs as Agents},
|
319 |
author={Xiao Liu and Hao Yu and Hanchen Zhang and Yifan Xu and Xuanyu Lei and Hanyu Lai and Yu Gu and Hangliang Ding and Kaiwen Men and Kejuan Yang and Shudan Zhang and Xiang Deng and Aohan Zeng and Zhengxiao Du and Chenhui Zhang and Sheng Shen and Tianjun Zhang and Yu Su and Huan Sun and Minlie Huang and Yuxiao Dong and Jie Tang},
|
|
|
5 |
import plotly.express as px
|
6 |
import streamlit as st
|
7 |
from bat import Benchmark, Config, Reporter, Tester
|
8 |
+
from datetime import datetime
|
9 |
|
10 |
|
11 |
holistic_scenarios = [
|
|
|
59 |
label="Select Correlation type", options=["kendall", "pearson"], index=0
|
60 |
)
|
61 |
|
62 |
+
aggregate_scenario_whitelist = aggragate_scenarios
|
63 |
# [
|
64 |
# scen
|
65 |
# for scen in all_scenarios_for_aggragate
|
|
|
99 |
my_benchmark = Benchmark()
|
100 |
if uploaded_file is not None:
|
101 |
df = pd.read_csv(uploaded_file)
|
102 |
+
|
103 |
+
my_benchmark.assign_df(
|
104 |
+
df,
|
105 |
+
data_source=f"uploaded_benchmark_{datetime.now().strftime('%y%m%d')}.csv",
|
106 |
+
)
|
107 |
+
|
108 |
+
allbench = Benchmark()
|
109 |
+
allbench.load_local_catalog()
|
110 |
+
|
111 |
+
allbench.add_aggregate(
|
112 |
+
new_col_name="aggregate",
|
113 |
+
agg_source_name="aggregate",
|
114 |
+
scenario_whitelist=aggregate_scenario_whitelist,
|
115 |
+
min_scenario_for_models_to_appear_in_agg=1
|
116 |
+
if len(aggregate_scenario_whitelist) == 1
|
117 |
+
else 3,
|
118 |
+
)
|
119 |
+
|
120 |
+
uploaded_models = my_benchmark.df[
|
121 |
+
my_benchmark.df["source"].str.contains("uploaded")
|
122 |
+
]["model"].unique()
|
123 |
+
aggregate_models = allbench.df[allbench.df["source"].str.contains("aggregate")][
|
124 |
+
"model"
|
125 |
+
].unique()
|
126 |
+
|
127 |
+
# Find the intersection (overlap) of models
|
128 |
+
overlap_models = set(aggregate_models).intersection(uploaded_models)
|
129 |
+
if len(overlap_models) < n_models_taken_list[0]:
|
130 |
+
st.warning(
|
131 |
+
f"You have just {len(overlap_models)} models intersecting with the aggregate!"
|
132 |
+
f"Here are some models you should run your benchmark over:{aggregate_models}"
|
133 |
+
)
|
134 |
|
135 |
|
136 |
def run_load(
|
|
|
140 |
corr_types=["kendall"],
|
141 |
n_exps=10,
|
142 |
my_benchmark=Benchmark(),
|
143 |
+
use_caching=False,
|
144 |
):
|
145 |
# Create a hash of the inputs to generate a unique cache file for each set of inputs
|
146 |
input_str = (
|
|
|
179 |
else:
|
180 |
print("Cached results not found, calculating")
|
181 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
182 |
allbench = Benchmark()
|
183 |
allbench.load_local_catalog()
|
184 |
|
|
|
186 |
new_col_name="aggregate",
|
187 |
agg_source_name="aggregate",
|
188 |
scenario_whitelist=aggregate_scenario_whitelist,
|
189 |
+
min_scenario_for_models_to_appear_in_agg=1
|
190 |
+
if len(aggregate_scenario_whitelist) == 1
|
191 |
+
else 2,
|
192 |
)
|
193 |
|
194 |
+
allbench.extend(my_benchmark)
|
195 |
+
allbench.clear_repeated_scenarios()
|
196 |
+
|
197 |
aggragate_scores = allbench.df.query('scenario=="aggregate"')[
|
198 |
["model", "score"]
|
199 |
].sort_values(by="score", ascending=False)
|
200 |
|
201 |
+
if not my_benchmark.is_empty:
|
202 |
+
aggragate_scores["in_uploaded"] = aggragate_scores["model"].apply(
|
203 |
+
lambda x: x in my_benchmark.df["model"].unique()
|
204 |
+
)
|
205 |
|
206 |
+
# Get unique models for each scenario
|
207 |
+
uploaded_models = allbench.df[
|
208 |
+
allbench.df["source"].str.contains("uploaded")
|
209 |
+
]["model"].unique()
|
210 |
+
aggregate_models = allbench.df[
|
211 |
+
allbench.df["source"].str.contains("aggregate")
|
212 |
+
]["model"].unique()
|
213 |
|
214 |
+
# Find the intersection (overlap) of models
|
215 |
+
n_overlap_models = len(set(aggregate_models).intersection(uploaded_models))
|
216 |
+
# cap the requested number of models at the number of models shared by the uploaded benchmark and the aggregate
|
217 |
+
n_models_taken_list = [min(n_models_taken_list[0], n_overlap_models)]
|
218 |
+
|
219 |
+
cfg = Config(
|
220 |
+
exp_to_run="example",
|
221 |
+
n_models_taken_list=n_models_taken_list,
|
222 |
+
model_select_strategy_list=model_select_strategy_list,
|
223 |
+
corr_types=corr_types,
|
224 |
+
n_exps=n_exps if n_models_taken_list != [0] else 1,
|
225 |
+
)
|
226 |
|
227 |
tester = Tester(cfg=cfg)
|
228 |
|
|
|
240 |
|
241 |
|
242 |
agreements, aggragare_score_df = run_load(
|
243 |
+
aggregate_scenario_whitelist=aggregate_scenario_whitelist,
|
244 |
n_models_taken_list=n_models_taken_list,
|
245 |
model_select_strategy_list=[model_select_strategy],
|
246 |
corr_types=[corr_type],
|
|
|
290 |
|
291 |
# Apply a background color to rows based on their 'Source' value
|
292 |
def highlight_uploaded_benchmark(row):
|
293 |
+
if "uploaded_benchmark" in row["Source"]:
|
294 |
return ["background-color: rgba(100,100,100,0.1)"] * len(row)
|
295 |
else:
|
296 |
return [""] * len(row)
|
|
|
349 |
with st.expander(label="Citations"):
|
350 |
st.code(
|
351 |
r"""
|
352 |
+
|
353 |
+
@misc{berkeley-function-calling-leaderboard,
|
354 |
+
title={Berkeley Function Calling Leaderboard},
|
355 |
+
author={Fanjia Yan and Huanzhi Mao and Charlie Cheng-Jie Ji
|
356 |
+
and Tianjun Zhang and Shishir G. Patil and Ion Stoica and Joseph E.
|
357 |
+
Gonzalez},
|
358 |
+
howpublished={\url{https://gorilla.cs.berkeley.edu/blogs/8_berkeley_function_calling_leaderboard.html}},
|
359 |
+
year={2024},
|
360 |
+
}
|
361 |
+
|
362 |
@misc{liu2023agentbenchevaluatingllmsagents,
|
363 |
title={AgentBench: Evaluating LLMs as Agents},
|
364 |
author={Xiao Liu and Hao Yu and Hanchen Zhang and Yifan Xu and Xuanyu Lei and Hanyu Lai and Yu Gu and Hangliang Ding and Kaiwen Men and Kejuan Yang and Shudan Zhang and Xiang Deng and Aohan Zeng and Zhengxiao Du and Chenhui Zhang and Sheng Shen and Tianjun Zhang and Yu Su and Huan Sun and Minlie Huang and Yuxiao Dong and Jie Tang},
|