Spaces:
Runtime error
Runtime error
File size: 11,485 Bytes
25557b5 8a91492 25557b5 30a0c61 611a3ed bea7063 611a3ed 608184c 8a91492 523fad9 611a3ed b1b50fb 611a3ed b1b50fb 729af67 bea7063 07db628 611a3ed 07448fb e611814 bea7063 e611814 bea7063 19a6010 7a0e5b8 e611814 729af67 e611814 523fad9 e611814 05c90f4 9c39267 30a0c61 3caeacd bf6ab81 3caeacd ca2b34f 585c3fa a56da8a 9c39267 bea7063 b1b50fb 9c39267 30a0c61 9c39267 ca2b34f f12aa56 9c39267 7379857 3caeacd 26ef426 3caeacd 71dfe85 3caeacd ca2b34f 26ef426 c8b695a 07448fb bd858f5 611a3ed bd858f5 6cf57e4 7379857 bea7063 608184c 25557b5 729af67 523fad9 1f43e72 bea7063 608184c 3caeacd 523fad9 9c39267 608184c 8f7c83f 608184c 8f7c83f bea7063 523fad9 966ae7b 5b4c5f8 3caeacd 9c39267 ddc25db 9c39267 ca2b34f 8f68cc2 f12aa56 bea7063 f12aa56 585c3fa f12aa56 6679087 bea7063 608184c 07db628 bea7063 a56da8a 608184c b1b50fb 7379857 9c39267 608184c 54202cb 611a3ed bea7063 611a3ed 608184c 611a3ed 523fad9 b1b50fb 54202cb 7379857 8f7c83f 3caeacd ca2b34f 3caeacd 26ef426 3caeacd c8b695a bea7063 c8b695a bea7063 c8b695a 7379857 8f7c83f bea7063 523fad9 966ae7b bd858f5 bea7063 bd858f5 7379857 0d84f54 6cf57e4 bea7063 6cf57e4 7379857 bea7063 7379857 e611814 07448fb 611a3ed bea7063 611a3ed 07448fb e611814 523fad9 8a91492 e611814 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 |
import gradio as gr
from apscheduler.schedulers.background import BackgroundScheduler
import src.constants as constants
from src.details import (
clear_details,
display_details,
display_loading_message_for_details,
load_details,
update_load_details_component,
update_sample_idx_component,
update_subtasks_component,
update_task_description_component,
)
from src.env_impact import plot_env_impact
from src.hub import restart_space
from src.model_tree import load_model_tree
from src.results import (
clear_results,
clear_results_file,
display_loading_message_for_results,
display_results,
download_results,
load_result_paths_per_model,
load_results,
plot_results,
update_tasks_component,
)
# if __name__ == "__main__":
# Comparator UI: every component is created inside a layout context, so the
# order of the statements below is also the order of the widgets on the page.
# NOTE(review): the nesting of the layout contexts was reconstructed from a copy
# of this file whose indentation had been lost -- confirm against the live page.
with gr.Blocks(fill_height=True, fill_width=True) as demo:
    # ---------------------------------------------------------------- header
    gr.HTML("<h1 style='text-align: center;'>Compare Results of the 🤗 Open LLM Leaderboard</h1>")
    gr.HTML("<h3 style='text-align: center;'>Select models to load and compare their results</h3>")
    gr.HTML(
        "<p style='text-align: center; color:orange;'>"
        "⚠ This demo is a beta version, and we're actively working on it, "
        "so you might find some tiny bugs! "
        "Please report any issues you have in the Community tab to help us make it better for all.</p>"
    )
    gr.Markdown(
        "Compare Results of the 🤗 [Open LLM Leaderboard](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard). "
        "Check out the [documentation](https://huggingface.co/docs/leaderboards/open_llm_leaderboard/about) 📄 "
        "to find explanations on the evaluations used, their configuration parameters "
        "and details on the input/outputs for the models."
    )

    # ------------------------------------------------------- model selection
    with gr.Row():
        model_ids = gr.Dropdown(label="Models", multiselect=True)
        # Hidden per-session state; filled by load_result_paths_per_model on page load.
        result_paths_per_model = gr.State()

    # ------------------------------------------------------------ model tree
    with gr.Accordion("Model tree: Compare base and derived models", open=False):
        load_model_tree_btn = gr.Button("Load Model Tree", interactive=False)
        # One label for the base model type, then one per derived model type.
        model_tree_labels = [
            constants.BASE_MODEL_TYPE[0],
            *(derived_model_type[0] for derived_model_type in constants.DERIVED_MODEL_TYPES),
        ]
        # The base-model dropdown sits directly in the accordion ...
        base_and_derived_models = [gr.Dropdown(label=model_tree_labels[0], multiselect=True)]
        # ... and the derived-model dropdowns share a single row below it.
        with gr.Row():
            base_and_derived_models.extend(
                gr.Dropdown(label=label, multiselect=True, interactive=False)
                for label in model_tree_labels[1:]
            )

    # ------------------------------------------------------------------ tabs
    with gr.Row():
        with gr.Tab("Results"):
            load_results_btn = gr.Button("Load", interactive=False)
            clear_results_btn = gr.Button("Clear")
            results_task = gr.Radio(
                ["All", *constants.TASKS.values()],
                label="Tasks",
                info="Evaluation tasks to be displayed",
                value="All",
                visible=False,
            )
            results_task_description = gr.Textbox(label="Task Description", lines=3, visible=False)
            hide_std_errors = gr.Checkbox(label="Hide Standard Errors", value=True, info="Options")
            with gr.Row():
                results_plot_1 = gr.Plot(visible=True)
                results_plot_2 = gr.Plot(visible=True)
            results = gr.HTML()
            # Hidden state written by load_results; its changes drive the display chain below.
            results_dataframe = gr.State()
            download_results_btn = gr.Button("Download")
            results_file = gr.File(visible=False)

        with gr.Tab("Configs"):
            load_configs_btn = gr.Button("Load", interactive=False)
            clear_configs_btn = gr.Button("Clear")
            configs_task = gr.Radio(
                ["All", *constants.TASKS.values()],
                label="Tasks",
                info="Evaluation tasks to be displayed",
                value="All",
                visible=False,
            )
            configs_task_description = gr.Textbox(label="Task Description", lines=3, visible=False)
            show_only_differences = gr.Checkbox(label="Show Only Differences", value=False, info="Options")
            configs = gr.HTML()

        with gr.Tab("Details"):
            details_task = gr.Radio(
                list(constants.TASKS.values()),
                label="Tasks",
                info="Evaluation tasks to be loaded",
                interactive=True,
            )
            details_task_description = gr.Textbox(label="Task Description", lines=3)
            with gr.Row():
                login_btn = gr.LoginButton(size="sm", visible=False)
                subtask = gr.Radio(
                    # No choices up front; update_subtasks_component writes to this
                    # component whenever details_task changes (wired below).
                    choices=None,
                    label="Subtasks",
                    info="Evaluation subtasks to be loaded (choose one of the Tasks above)",
                )
            load_details_btn = gr.Button("Load Details", interactive=False)
            clear_details_btn = gr.Button("Clear Details")
            sample_idx = gr.Number(
                label="Sample Index",
                info="Index of the sample to be displayed",
                value=0,
                minimum=0,
                visible=False,
            )
            details_show_only_differences = gr.Checkbox(
                label="Show Only Differences", value=False, info="Options"
            )
            details = gr.HTML()
            details_dataframe = gr.State()

        with gr.Tab("Environmental impact"):
            gr.Markdown(
                "The environmental impact calculations we display are derived from the specific inference setup used "
                "for evaluation. We leverage 🤗 [Accelerate](https://huggingface.co/docs/accelerate) to efficiently "
                "parallelize the model across 8 Nvidia H100 SXM GPUs in a compute cluster located in Northern Virginia. "
                "These results reflect the energy consumption and associated emissions of this configuration, "
                "providing transparency and insight into the resource requirements of large language model evaluations. "
                "You can find more details in our documentation about the [environmental impact](https://huggingface.co/docs/leaderboards/open_llm_leaderboard/emissions)."
            )
            load_env_impact_btn = gr.Button("Load", interactive=False)
            clear_env_impact_btn = gr.Button("Clear")
            with gr.Row():
                env_impact_plot_1 = gr.Plot(visible=True)
                env_impact_plot_2 = gr.Plot(visible=True)
            env_impact = gr.HTML()

    # ========================================================== event wiring
    # NOTE: the callbacks stay as lambdas with their original parameter names on
    # purpose; swapping them for named functions could alter the auto-generated
    # endpoint names -- TODO confirm before refactoring.

    # Page load: fetch the result paths, then offer their keys as model choices.
    demo.load(
        fn=load_result_paths_per_model,
        outputs=result_paths_per_model,
    ).then(
        fn=lambda x: gr.Dropdown(choices=list(x.keys())),
        inputs=result_paths_per_model,
        outputs=model_ids,
    )

    # Any edit of the model selection makes the four "load" buttons clickable.
    gr.on(
        triggers=[model_ids.input],
        fn=lambda: (gr.Button(interactive=True),) * 4,
        outputs=[load_model_tree_btn, load_results_btn, load_configs_btn, load_env_impact_btn],
    )

    # ---- Results / Configs / Environmental impact ----
    # The three "Load" buttons share one pipeline: loading message -> load -> task radios.
    gr.on(
        triggers=[load_results_btn.click, load_configs_btn.click, load_env_impact_btn.click],
        fn=display_loading_message_for_results,
        outputs=[results, configs, env_impact],
    ).then(
        fn=load_results,
        inputs=[result_paths_per_model, model_ids, *base_and_derived_models],
        outputs=[results_dataframe, results],
    ).then(
        fn=update_tasks_component,
        outputs=[results_task, configs_task],
    )

    # Mirror the selected task between the Results and Configs radios.
    results_task.input(fn=lambda task: task, inputs=results_task, outputs=configs_task)
    configs_task.input(fn=lambda task: task, inputs=configs_task, outputs=results_task)

    # Both description boxes are refreshed from results_task.
    results_task.change(
        fn=update_task_description_component,
        inputs=results_task,
        outputs=results_task_description,
    ).then(
        fn=update_task_description_component,
        inputs=results_task,
        outputs=configs_task_description,
    )

    # Re-render tables, then plots, then reset the download file whenever the
    # data, the selected task or one of the option checkboxes changes.
    gr.on(
        triggers=[
            results_dataframe.change,
            results_task.change,
            hide_std_errors.change,
            show_only_differences.change,
        ],
        fn=display_results,
        inputs=[results_dataframe, results_task, hide_std_errors, show_only_differences],
        outputs=[results, configs, env_impact],
    ).then(
        fn=plot_results,
        inputs=[results_dataframe, results_task],
        outputs=[results_plot_1, results_plot_2],
    ).then(
        fn=plot_env_impact,
        inputs=[results_dataframe],
        outputs=[env_impact_plot_1, env_impact_plot_2],
    ).then(
        fn=clear_results_file,
        outputs=results_file,
    )

    download_results_btn.click(
        fn=download_results,
        inputs=results,
        outputs=results_file,
    )

    # The three "Clear" buttons share one reset pipeline.
    gr.on(
        triggers=[clear_results_btn.click, clear_configs_btn.click, clear_env_impact_btn.click],
        fn=clear_results,
        outputs=[
            model_ids,
            results_dataframe,
            load_results_btn,
            load_configs_btn,
            load_env_impact_btn,
            results_task,
            configs_task,
        ],
    ).then(
        fn=lambda: gr.Button(interactive=False),
        outputs=load_model_tree_btn,
    ).then(
        # Rebuild every model-tree dropdown (base one included) as non-interactive.
        fn=lambda: [gr.Dropdown(label=label, multiselect=True, interactive=False) for label in model_tree_labels],
        outputs=list(base_and_derived_models),
    ).then(
        fn=clear_results_file,
        outputs=results_file,
    )

    # ---- Details ----
    # Choosing a task refreshes its description, then the login button / subtask radio.
    details_task.change(
        fn=update_task_description_component,
        inputs=details_task,
        outputs=details_task_description,
    ).then(
        fn=update_subtasks_component,
        inputs=details_task,
        outputs=[login_btn, subtask],
    )

    # Re-evaluate the "Load Details" button after any relevant user input.
    gr.on(
        triggers=[model_ids.input, subtask.input, details_task.input],
        fn=update_load_details_component,
        inputs=[model_ids, subtask],
        outputs=load_details_btn,
    )

    load_details_btn.click(
        fn=display_loading_message_for_details,
        outputs=details,
    ).then(
        fn=load_details,
        inputs=[subtask, model_ids, *base_and_derived_models],
        outputs=[details_dataframe, details],
    ).then(
        fn=update_sample_idx_component,
        inputs=[details_dataframe],
        outputs=sample_idx,
    )

    gr.on(
        triggers=[
            details_dataframe.change,
            sample_idx.change,
            details_show_only_differences.change,
        ],
        fn=display_details,
        inputs=[details_dataframe, sample_idx, details_show_only_differences],
        outputs=details,
    )

    clear_details_btn.click(
        fn=clear_details,
        outputs=[
            model_ids,
            details_dataframe,
            details_task,
            subtask,
            load_details_btn,
            sample_idx,
        ],
    )

    # ---- Model tree ----
    # load_model_tree writes to every dropdown of the model tree.
    load_model_tree_btn.click(
        fn=load_model_tree,
        inputs=[result_paths_per_model, model_ids],
        outputs=list(base_and_derived_models),
    )
# Schedule `restart_space` (imported from src.hub) to run once per hour on a
# background scheduler, then serve the app. The scheduler is started before
# `demo.launch()` is called.
scheduler = BackgroundScheduler()
scheduler.add_job(
    restart_space,
    trigger="interval",
    hours=1,  # restart period: 1h
)
scheduler.start()

demo.launch()
|