Spaces:
Running
Running
Jae-Won Chung
committed on
Commit
•
81672d7
1
Parent(s):
36058af
More info in app and about page (#14)
Browse files
- LEADERBOARD.md +15 -1
- app.py +112 -84
LEADERBOARD.md
CHANGED
@@ -65,6 +65,20 @@ Find our benchmark script for one model [here](https://github.com/ml-energy/lead
|
|
65 |
We randomly sampled around 3000 prompts from the [cleaned ShareGPT dataset](https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered).
|
66 |
See [here](https://github.com/ml-energy/leaderboard/tree/master/sharegpt) for more detail on how we created the benchmark dataset.
|
67 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
68 |
## Contributing
|
69 |
|
70 |
Any kind of contribution is more than welcome!
|
@@ -84,7 +98,7 @@ Hence, absolute latency, throughput, and energy numbers should not be used to es
|
|
84 |
|
85 |
Batch size 1, in some sense, is the lowest possible hardware utilization.
|
86 |
We'll soon benchmark batch sizes larger than 1 without continuous batching for comparison.
|
87 |
-
This would show what happens in the case of very high hardware utilization (
|
88 |
By doing this, we can provide numbers for reasonable comparison without being tied to any existing generative model serving system.
|
89 |
|
90 |
## Upcoming
|
|
|
65 |
We randomly sampled around 3000 prompts from the [cleaned ShareGPT dataset](https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered).
|
66 |
See [here](https://github.com/ml-energy/leaderboard/tree/master/sharegpt) for more detail on how we created the benchmark dataset.
|
67 |
|
68 |
+
## FAQ
|
69 |
+
|
70 |
+
### So who's the winner?
|
71 |
+
|
72 |
+
It depends on which metric you value most.
|
73 |
+
Some may be tightly constrained by electricity consumption, in which case energy would have higher weight.
|
74 |
+
Some may just want better model quality, in which case the NLP dataset results will be important.
|
75 |
+
Others might want something balanced.
|
76 |
+
This is why we support adding custom columns to the table, and let you choose your own winner!
|
77 |
+
|
78 |
+
### Where can I find more ML energy-related resources?
|
79 |
+
|
80 |
+
Meet us at the [ML.ENERGY initiative](https://ml.energy) homepage!
|
81 |
+
|
82 |
## Contributing
|
83 |
|
84 |
Any kind of contribution is more than welcome!
|
|
|
98 |
|
99 |
Batch size 1, in some sense, is the lowest possible hardware utilization.
|
100 |
We'll soon benchmark batch sizes larger than 1 without continuous batching for comparison.
|
101 |
+
This would show what happens in the case of very high hardware utilization (although it's with PyTorch), assuming an ideal case where all sequences in each batch generate the same number of output tokens.
|
102 |
By doing this, we can provide numbers for reasonable comparison without being tied to any existing generative model serving system.
|
103 |
|
104 |
## Upcoming
|
app.py
CHANGED
@@ -160,8 +160,8 @@ class TableManager:
|
|
160 |
def get_dropdown(self):
|
161 |
columns = self.full_df.columns.tolist()[1:]
|
162 |
return [
|
163 |
-
gr.Dropdown(choices=columns, label="X"),
|
164 |
-
gr.Dropdown(choices=columns, label="Y"),
|
165 |
gr.Dropdown(choices=["None", *columns], label="Z (optional)"),
|
166 |
]
|
167 |
|
@@ -306,16 +306,36 @@ table th:first-child {
|
|
306 |
overflow: auto;
|
307 |
white-space: nowrap;
|
308 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
309 |
"""
|
310 |
|
311 |
block = gr.Blocks(css=css)
|
312 |
with block:
|
313 |
tbm = gr.State(global_tbm) # type: ignore
|
314 |
-
gr.
|
|
|
315 |
|
316 |
with gr.Tabs():
|
317 |
# Tab 1: Leaderboard.
|
318 |
-
with gr.
|
|
|
|
|
|
|
319 |
# Block 1: Checkboxes to select benchmarking parameters.
|
320 |
with gr.Row():
|
321 |
with gr.Box():
|
@@ -335,96 +355,104 @@ with block:
|
|
335 |
checkbox.change(TableManager.set_filter_get_df, inputs=[tbm, *checkboxes], outputs=dataframe)
|
336 |
|
337 |
# Block 3: Allow users to add new columns.
|
338 |
-
gr.
|
339 |
-
|
340 |
-
with gr.
|
341 |
-
with gr.
|
342 |
-
|
343 |
-
|
344 |
-
|
345 |
-
with gr.
|
346 |
-
|
347 |
-
|
348 |
-
|
349 |
-
|
350 |
-
|
351 |
-
|
352 |
-
|
353 |
-
[
|
354 |
-
|
355 |
-
|
356 |
-
|
357 |
-
|
358 |
-
|
359 |
-
|
360 |
-
|
361 |
-
|
362 |
-
|
363 |
-
|
364 |
-
|
365 |
-
|
366 |
-
|
367 |
-
|
368 |
-
|
369 |
-
|
370 |
-
|
371 |
-
|
372 |
-
|
373 |
-
|
374 |
-
|
375 |
-
|
376 |
-
|
377 |
-
|
378 |
-
|
|
|
379 |
|
380 |
# Block 4: Allow users to plot 2D and 3D scatter plots.
|
381 |
-
gr.
|
382 |
-
|
383 |
-
with gr.
|
384 |
-
with gr.
|
385 |
-
|
386 |
-
|
387 |
-
|
388 |
-
with gr.
|
389 |
-
|
|
|
|
|
|
|
|
|
390 |
with gr.Row():
|
391 |
-
|
392 |
-
|
393 |
with gr.Row():
|
394 |
-
|
395 |
-
|
396 |
-
|
397 |
-
|
398 |
-
|
399 |
-
|
400 |
-
|
401 |
-
|
402 |
-
|
403 |
-
inputs=
|
404 |
-
|
405 |
-
|
406 |
-
|
407 |
-
|
408 |
-
|
409 |
-
|
410 |
-
|
411 |
-
|
412 |
-
|
413 |
-
|
414 |
-
|
415 |
-
|
416 |
-
|
417 |
-
|
418 |
-
|
419 |
-
|
420 |
-
|
|
|
|
|
|
|
421 |
|
422 |
# Block 5: Leaderboard date.
|
423 |
with gr.Row():
|
424 |
gr.HTML(f"<h3 style='color: gray'>Last updated: {current_date}</h3>")
|
425 |
|
426 |
# Tab 2: About page.
|
427 |
-
with gr.
|
428 |
# Read in LEADERBOARD.md
|
429 |
gr.Markdown(open("LEADERBOARD.md").read())
|
430 |
|
|
|
160 |
def get_dropdown(self):
|
161 |
columns = self.full_df.columns.tolist()[1:]
|
162 |
return [
|
163 |
+
gr.Dropdown(choices=columns, value="parameters", label="X"),
|
164 |
+
gr.Dropdown(choices=columns, value="energy", label="Y"),
|
165 |
gr.Dropdown(choices=["None", *columns], label="Z (optional)"),
|
166 |
]
|
167 |
|
|
|
306 |
overflow: auto;
|
307 |
white-space: nowrap;
|
308 |
}
|
309 |
+
|
310 |
+
/* Make tab buttons larger */
|
311 |
+
.tab-nav > button {
|
312 |
+
font-size: 18px !important;
|
313 |
+
}
|
314 |
+
"""
|
315 |
+
|
316 |
+
intro_text = """
|
317 |
+
<h2>How much energy do modern Large Language Models (LLMs) consume for inference?</h2>
|
318 |
+
|
319 |
+
<p style="font-size: 16px">We used <a href="https://ml.energy/zeus">Zeus</a> to benchmark various open source LLMs in terms of how much time and energy they consume for inference.
|
320 |
+
Time and energy are of course not the only things we care about -- so we also benchmarked all of the models on a variety of NLP datasets,
|
321 |
+
including the ARC Challenge (reasoning), HellaSwag (common sense), and TruthfulQA (truthfulness).</p>
|
322 |
+
|
323 |
+
<p style="font-size: 16px">For more detailed information, please take a look at the <b>About</b> tab.
|
324 |
+
Every benchmark is limited in some sense -- Before you interpret the results, please take a look at the *Limitations* section there, too.</p>
|
325 |
"""
|
326 |
|
327 |
block = gr.Blocks(css=css)
|
328 |
with block:
|
329 |
tbm = gr.State(global_tbm) # type: ignore
|
330 |
+
with gr.Box():
|
331 |
+
gr.HTML("<h1><a href='https://ml.energy' class='text-logo'>ML.ENERGY</a> Leaderboard</h1>")
|
332 |
|
333 |
with gr.Tabs():
|
334 |
# Tab 1: Leaderboard.
|
335 |
+
with gr.Tab("Leaderboard"):
|
336 |
+
with gr.Box():
|
337 |
+
gr.HTML(intro_text)
|
338 |
+
|
339 |
# Block 1: Checkboxes to select benchmarking parameters.
|
340 |
with gr.Row():
|
341 |
with gr.Box():
|
|
|
355 |
checkbox.change(TableManager.set_filter_get_df, inputs=[tbm, *checkboxes], outputs=dataframe)
|
356 |
|
357 |
# Block 3: Allow users to add new columns.
|
358 |
+
with gr.Box():
|
359 |
+
gr.Markdown("### Add custom columns to the table")
|
360 |
+
with gr.Row():
|
361 |
+
with gr.Column(scale=3):
|
362 |
+
with gr.Row():
|
363 |
+
colname_input = gr.Textbox(lines=1, label="Custom column name")
|
364 |
+
formula_input = gr.Textbox(lines=1, label="Formula (@sum, @len, @max, and @min are supported)")
|
365 |
+
with gr.Column(scale=1):
|
366 |
+
with gr.Row():
|
367 |
+
add_col_btn = gr.Button("Add to table (⏎)", elem_classes=["btn-submit"])
|
368 |
+
with gr.Row():
|
369 |
+
clear_input_btn = gr.Button("Clear")
|
370 |
+
with gr.Row():
|
371 |
+
add_col_message = gr.HTML("")
|
372 |
+
gr.Examples(
|
373 |
+
examples=[
|
374 |
+
["power", "energy / latency"],
|
375 |
+
["token_per_joule", "response_length / energy"],
|
376 |
+
["verbose", "response_length > @sum(response_length) / @len(response_length)"],
|
377 |
+
],
|
378 |
+
inputs=[colname_input, formula_input],
|
379 |
+
)
|
380 |
+
colname_input.submit(
|
381 |
+
TableManager.add_column,
|
382 |
+
inputs=[tbm, colname_input, formula_input],
|
383 |
+
outputs=[dataframe, add_col_message],
|
384 |
+
)
|
385 |
+
formula_input.submit(
|
386 |
+
TableManager.add_column,
|
387 |
+
inputs=[tbm, colname_input, formula_input],
|
388 |
+
outputs=[dataframe, add_col_message],
|
389 |
+
)
|
390 |
+
add_col_btn.click(
|
391 |
+
TableManager.add_column,
|
392 |
+
inputs=[tbm, colname_input, formula_input],
|
393 |
+
outputs=[dataframe, add_col_message],
|
394 |
+
)
|
395 |
+
clear_input_btn.click(
|
396 |
+
lambda: (None, None, None),
|
397 |
+
inputs=None,
|
398 |
+
outputs=[colname_input, formula_input, add_col_message],
|
399 |
+
)
|
400 |
|
401 |
# Block 4: Allow users to plot 2D and 3D scatter plots.
|
402 |
+
with gr.Box():
|
403 |
+
gr.Markdown("### Scatter plot (Hover over marker to show model name)")
|
404 |
+
with gr.Row():
|
405 |
+
with gr.Column(scale=3):
|
406 |
+
with gr.Row():
|
407 |
+
# Initialize the dropdown choices with the global TableManager with just the original columns.
|
408 |
+
axis_dropdowns = global_tbm.get_dropdown()
|
409 |
+
with gr.Column(scale=1):
|
410 |
+
with gr.Row():
|
411 |
+
plot_btn = gr.Button("Plot", elem_classes=["btn-submit"])
|
412 |
+
with gr.Row():
|
413 |
+
clear_plot_btn = gr.Button("Clear")
|
414 |
+
with gr.Accordion("Plot size (600 x 600 by default)", open=False):
|
415 |
with gr.Row():
|
416 |
+
plot_width_input = gr.Textbox("600", lines=1, label="Width (px)")
|
417 |
+
plot_height_input = gr.Textbox("600", lines=1, label="Height (px)")
|
418 |
with gr.Row():
|
419 |
+
plot = gr.Plot(value=global_tbm.plot_scatter(
|
420 |
+
plot_width_input.value,
|
421 |
+
plot_height_input.value,
|
422 |
+
x=axis_dropdowns[0].value,
|
423 |
+
y=axis_dropdowns[1].value,
|
424 |
+
z=axis_dropdowns[2].value,
|
425 |
+
)[0]) # type: ignore
|
426 |
+
with gr.Row():
|
427 |
+
plot_message = gr.HTML("")
|
428 |
+
add_col_btn.click(TableManager.update_dropdown, inputs=tbm, outputs=axis_dropdowns) # type: ignore
|
429 |
+
plot_width_input.submit(
|
430 |
+
TableManager.plot_scatter,
|
431 |
+
inputs=[tbm, plot_width_input, plot_height_input, *axis_dropdowns],
|
432 |
+
outputs=[plot, plot_width_input, plot_height_input, plot_message],
|
433 |
+
)
|
434 |
+
plot_height_input.submit(
|
435 |
+
TableManager.plot_scatter,
|
436 |
+
inputs=[tbm, plot_width_input, plot_height_input, *axis_dropdowns],
|
437 |
+
outputs=[plot, plot_width_input, plot_height_input, plot_message],
|
438 |
+
)
|
439 |
+
plot_btn.click(
|
440 |
+
TableManager.plot_scatter,
|
441 |
+
inputs=[tbm, plot_width_input, plot_height_input, *axis_dropdowns],
|
442 |
+
outputs=[plot, plot_width_input, plot_height_input, plot_message],
|
443 |
+
)
|
444 |
+
clear_plot_btn.click(
|
445 |
+
lambda: (None,) * 7,
|
446 |
+
None,
|
447 |
+
outputs=[*axis_dropdowns, plot, plot_width_input, plot_height_input, plot_message],
|
448 |
+
)
|
449 |
|
450 |
# Block 5: Leaderboard date.
|
451 |
with gr.Row():
|
452 |
gr.HTML(f"<h3 style='color: gray'>Last updated: {current_date}</h3>")
|
453 |
|
454 |
# Tab 2: About page.
|
455 |
+
with gr.Tab("About"):
|
456 |
# Read in LEADERBOARD.md
|
457 |
gr.Markdown(open("LEADERBOARD.md").read())
|
458 |
|