Spaces:
Running
Running
Commit
·
ab5f5f1
1
Parent(s):
988dbd8
update
Browse files- app.py +71 -344
- huggy_bench.png → logo.png +0 -0
- pyproject.toml +21 -0
- script.py +0 -14
- src/{assets/css_html_js.py → assets.py} +3 -3
- src/bettertransformer.py +148 -0
- src/control_panel.py +168 -0
- src/flashattentionv2.py +148 -0
- src/latency_score_memory.py +67 -0
- src/leaderboard.py +60 -0
- src/llm_perf.py +127 -0
- src/{assets/text_content.py → text.py} +32 -18
- src/utils.py +21 -28
app.py
CHANGED
@@ -1,371 +1,98 @@
|
|
1 |
import os
|
2 |
|
3 |
import gradio as gr
|
4 |
-
import pandas as pd
|
5 |
-
import plotly.express as px
|
6 |
-
from huggingface_hub.file_download import hf_hub_download
|
7 |
|
8 |
-
|
9 |
-
from src.
|
10 |
-
from src.
|
11 |
-
from src.
|
|
|
|
|
|
|
|
|
12 |
TITLE,
|
13 |
-
|
14 |
-
|
15 |
-
|
|
|
16 |
CITATION_BUTTON_LABEL,
|
17 |
-
CITATION_BUTTON_TEXT,
|
18 |
)
|
19 |
|
20 |
-
HF_TOKEN = os.environ.get("HF_TOKEN", None)
|
21 |
-
LOGO_URL = "https://huggingface.co/spaces/optimum/llm-perf-leaderboard/resolve/main/huggy_bench.png"
|
22 |
-
LLM_PERF_DATASET_REPO = "optimum/llm-perf-dataset"
|
23 |
-
ALL_COLUMNS_MAPPING = {
|
24 |
-
"Model": "Model π€",
|
25 |
-
"Arch": "Arch ποΈ",
|
26 |
-
"Size": "Params (B) π",
|
27 |
-
# deployment settings
|
28 |
-
"backend.name": "Backend π",
|
29 |
-
"backend.torch_dtype": "Dtype π₯",
|
30 |
-
"optimization": "Optimization π οΈ",
|
31 |
-
"quantization": "Quantization ποΈ",
|
32 |
-
# measurements
|
33 |
-
"Score": "Open LLM Score (%) β¬οΈ",
|
34 |
-
"decode.throughput(tokens/s)": "Decode Throughput (tokens/s) β¬οΈ",
|
35 |
-
"generate.throughput(tokens/s)": "E2E Throughput (tokens/s) β¬οΈ",
|
36 |
-
"forward.latency(s)": "Prefill Latency (s) β¬οΈ",
|
37 |
-
"generate.latency(s)": "E2E Latency (s) β¬οΈ",
|
38 |
-
"generate.max_memory_allocated(MB)": "Allocated Memory (MB) β¬οΈ",
|
39 |
-
"generate.max_memory_reserved(MB)": "Reserved Memory (MB) β¬οΈ",
|
40 |
-
"generate.max_memory_used(MB)": "Used Memory (MB) β¬οΈ",
|
41 |
-
"generate.energy_consumption(tokens/kWh)": "Energy (tokens/kWh) β¬οΈ",
|
42 |
-
}
|
43 |
-
SORTING_COLUMN = ["Score", "generate.throughput(tokens/s)"]
|
44 |
-
SORTING_ASCENDING = [False, False]
|
45 |
-
ALL_COLUMNS_DATATYPES = [
|
46 |
-
# open llm
|
47 |
-
"markdown",
|
48 |
-
"markdown",
|
49 |
-
"number",
|
50 |
-
# deployment settings
|
51 |
-
"str",
|
52 |
-
"str",
|
53 |
-
"str",
|
54 |
-
"str",
|
55 |
-
# measurements
|
56 |
-
"number",
|
57 |
-
"number",
|
58 |
-
"number",
|
59 |
-
"number",
|
60 |
-
"number",
|
61 |
-
"number",
|
62 |
-
"number",
|
63 |
-
"number",
|
64 |
-
"number",
|
65 |
-
"number",
|
66 |
-
]
|
67 |
-
# download data
|
68 |
-
hf_hub_download(
|
69 |
-
repo_id="optimum/llm-perf-dataset",
|
70 |
-
filename="open-llm.csv",
|
71 |
-
local_dir="dataset",
|
72 |
-
repo_type="dataset",
|
73 |
-
token=HF_TOKEN,
|
74 |
-
)
|
75 |
-
OPEN_LLM_DF = pd.read_csv("dataset/open-llm.csv")
|
76 |
|
|
|
77 |
MACHINE_TO_HARDWARE = {"hf-dgx-01": "A100-80GB π₯οΈ"}
|
78 |
-
|
79 |
-
for machine in MACHINE_TO_HARDWARE:
|
80 |
-
hf_hub_download(
|
81 |
-
repo_id="optimum/llm-perf-dataset",
|
82 |
-
filename=f"{machine}/perf-report.csv",
|
83 |
-
local_dir="dataset",
|
84 |
-
repo_type="dataset",
|
85 |
-
token=HF_TOKEN,
|
86 |
-
)
|
87 |
-
MACHINE_TO_PERF[machine] = pd.read_csv(f"dataset/{machine}/perf-report.csv")
|
88 |
-
|
89 |
-
|
90 |
-
def get_benchmark_df(machine="hf-dgx-01"):
    """Build the display dataframe for one benchmark machine.

    Joins the Open LLM table with the machine's performance report, derives
    the energy, optimization, quantization and decode-throughput columns,
    sorts the rows, then keeps and renames the columns listed in
    ``ALL_COLUMNS_MAPPING``.

    Args:
        machine: Key into ``MACHINE_TO_PERF`` selecting the report to use.

    Returns:
        A dataframe whose columns are the values of ``ALL_COLUMNS_MAPPING``.
    """

    def _optimization_label(row):
        # BetterTransformer takes precedence over FlashAttentionV2.
        if row["backend.to_bettertransformer"]:
            return "BetterTransformer"
        if row["backend.use_flash_attention_2"]:
            return "FlashAttentionV2"
        return "None"

    def _quantization_label(row):
        scheme = row["backend.quantization_scheme"]
        exllama_version = row["backend.quantization_config.exllama_config.version"]
        if scheme == "bnb":
            return "BnB.4bit"
        if scheme == "gptq" and exllama_version == 1:
            return "GPTQ.4bit+ExllamaV1"
        if scheme == "gptq" and exllama_version == 2:
            return "GPTQ.4bit+ExllamaV2"
        return "None"

    energy_column = "generate.energy_consumption(tokens/kWh)"

    # join the two sources on the model identifier
    perf_report = MACHINE_TO_PERF[machine].copy()
    table = OPEN_LLM_DF.merge(perf_report, left_on="Model", right_on="model")

    # invert kWh/token into tokens/kWh; missing values are filled with 1 so
    # the integer cast succeeds, then every resulting 1 is blanked again
    kwh_per_token = table["generate.energy_consumption(kWh/token)"].fillna(1)
    table[energy_column] = (1 / kwh_per_token).astype(int)
    table.loc[table[energy_column] == 1, energy_column] = pd.NA

    # human-readable optimization label
    optimization_flags = table[["backend.to_bettertransformer", "backend.use_flash_attention_2"]]
    table["optimization"] = optimization_flags.apply(_optimization_label, axis=1)

    # human-readable quantization label
    quantization_settings = table[
        ["backend.quantization_scheme", "backend.quantization_config.exllama_config.version"]
    ]
    table["quantization"] = quantization_settings.apply(_quantization_label, axis=1)

    # decode throughput: 1000 divided by (generate latency - forward latency)
    decode_latency = table["generate.latency(s)"] - table["forward.latency(s)"]
    table["decode.throughput(tokens/s)"] = (1000 / decode_latency).round(2)

    # order rows, then keep and rename only the displayed columns
    table = table.sort_values(by=SORTING_COLUMN, ascending=SORTING_ASCENDING)
    table = table[list(ALL_COLUMNS_MAPPING.keys())]
    return table.rename(columns=ALL_COLUMNS_MAPPING)
|
143 |
-
|
144 |
-
|
145 |
-
def get_benchmark_table(bench_df):
    """Return a copy of ``bench_df`` formatted for the leaderboard table.

    The model and architecture columns are passed through
    ``process_model_name`` and ``process_model_arch``. Scores of quantized
    runs get a ``**`` suffix.

    Args:
        bench_df: Dataframe with the renamed (display) column names.

    Returns:
        A new dataframe; ``bench_df`` itself is not modified.
    """
    score_column = "Open LLM Score (%) β¬οΈ"
    quantization_column = "Quantization ποΈ"
    # Match on prefixes: GPTQ runs are labelled "GPTQ.4bit+ExllamaV1" or
    # "GPTQ.4bit+ExllamaV2", so an exact match on "GPTQ.4bit" never fires.
    quantized_prefixes = ("BnB.4bit", "GPTQ.4bit")

    def _mark_quantized(row):
        if str(row[quantization_column]).startswith(quantized_prefixes):
            return f"{row[score_column]}**"
        return row[score_column]

    copy_df = bench_df.copy()
    # transform
    copy_df["Model π€"] = copy_df["Model π€"].apply(process_model_name)
    copy_df["Arch ποΈ"] = copy_df["Arch ποΈ"].apply(process_model_arch)
    # A row-wise apply on an empty frame yields a DataFrame, which cannot be
    # assigned to a single column; there is nothing to mark in that case.
    if copy_df.empty:
        return copy_df
    # process quantization
    copy_df[score_column] = copy_df.apply(_mark_quantized, axis=1)
    return copy_df
|
158 |
-
|
159 |
-
|
160 |
-
def get_benchmark_chart(bench_df):
    """Build the "Latency vs. Score vs. Memory" scatter figure.

    Each point is one row of ``bench_df``: x is the end-to-end latency, y is
    the Open LLM score, the marker size is the allocated memory and the color
    is the architecture after ``process_model_arch``. Hovering shows every
    display column.

    Args:
        bench_df: Dataframe with the renamed (display) column names.

    Returns:
        The configured plotly figure.
    """
    display_columns = list(ALL_COLUMNS_MAPPING.values())

    chart_df = bench_df.copy()
    chart_df["Arch ποΈ"] = chart_df["Arch ποΈ"].apply(process_model_arch)

    fig = px.scatter(
        chart_df,
        x="E2E Latency (s) β¬οΈ",
        y="Open LLM Score (%) β¬οΈ",
        size="Allocated Memory (MB) β¬οΈ",
        color="Arch ποΈ",
        custom_data=display_columns,
        color_discrete_sequence=px.colors.qualitative.Light24,
    )

    chart_title = {
        "text": "Latency vs. Score vs. Memory",
        "y": 0.95,
        "x": 0.5,
        "xanchor": "center",
        "yanchor": "top",
    }
    fig.update_layout(
        title=chart_title,
        xaxis_title="Per 1000 Tokens Latency (s)",
        yaxis_title="Open LLM Score (%)",
        legend_title="LLM Architecture",
        width=1200,
        height=600,
    )

    # one hover line per display column, indexed into the custom data
    hover_lines = []
    for position, column in enumerate(ALL_COLUMNS_MAPPING.values()):
        hover_lines.append(f"<b>{column}:</b> %{{customdata[{position}]}}")
    fig.update_traces(hovertemplate="<br>".join(hover_lines))

    return fig
|
197 |
-
|
198 |
-
|
199 |
-
def filter_query(
    text,
    backends,
    datatypes,
    optimizations,
    quantizations,
    score,
    memory,
    machine,
):
    """Filter the benchmark data and rebuild the table and chart.

    Args:
        text: Substring to look for in the model name (case-insensitive).
        backends: Backend names to keep.
        datatypes: Dtype names to keep.
        optimizations: Optimization labels to keep.
        quantizations: Quantization labels to keep.
        score: Minimum Open LLM score.
        memory: Maximum allocated memory in MB.
        machine: Machine whose benchmark data is filtered.

    Returns:
        A ``(table, chart)`` pair built from the rows matching every filter.
    """
    raw_df = get_benchmark_df(machine=machine)
    # The search text is matched literally (regex=False) so characters such as
    # "(" or "+" cannot raise a regex error; rows without a model name simply
    # do not match (na=False) instead of producing a non-boolean mask.
    name_matches = raw_df["Model π€"].str.contains(text, case=False, regex=False, na=False)
    filtered_df = raw_df[
        name_matches
        & raw_df["Backend π­"].isin(backends)
        & raw_df["Dtype π₯"].isin(datatypes)
        & raw_df["Optimization π οΈ"].isin(optimizations)
        & raw_df["Quantization ποΈ"].isin(quantizations)
        & (raw_df["Open LLM Score (%) β¬οΈ"] >= score)
        & (raw_df["Allocated Memory (MB) β¬οΈ"] <= memory)
    ]
    filtered_table = get_benchmark_table(filtered_df)
    filtered_chart = get_benchmark_chart(filtered_df)
    return filtered_table, filtered_chart
|
222 |
|
223 |
|
224 |
-
# Demo interface
|
225 |
demo = gr.Blocks(css=custom_css)
|
226 |
with demo:
|
227 |
-
|
228 |
gr.HTML(f'<img src="{LOGO_URL}">', elem_classes="logo")
|
229 |
-
|
230 |
-
|
231 |
-
|
232 |
-
|
233 |
-
|
234 |
-
|
235 |
-
|
236 |
-
|
237 |
-
|
238 |
-
|
239 |
-
|
240 |
-
|
241 |
-
|
242 |
-
|
243 |
-
|
244 |
-
|
245 |
-
|
246 |
-
|
|
|
|
|
|
|
247 |
with gr.TabItem("Leaderboard π
", id=0):
|
248 |
-
|
249 |
-
|
250 |
-
|
251 |
-
)
|
252 |
-
|
253 |
-
|
254 |
-
|
255 |
-
|
256 |
-
|
257 |
-
|
258 |
-
|
259 |
-
|
260 |
-
|
261 |
-
|
262 |
-
|
263 |
-
|
264 |
-
|
265 |
-
|
266 |
-
|
267 |
-
|
268 |
-
|
269 |
-
|
270 |
-
|
271 |
-
|
272 |
-
|
273 |
-
|
274 |
-
|
275 |
-
|
276 |
-
)
|
277 |
-
with gr.Row():
|
278 |
-
with gr.Column():
|
279 |
-
search_bar = gr.Textbox(
|
280 |
-
label="Model π€",
|
281 |
-
info="π Search for a model name",
|
282 |
-
elem_id="search-bar",
|
283 |
-
)
|
284 |
-
with gr.Row():
|
285 |
-
with gr.Column(scale=1):
|
286 |
-
score_slider = gr.Slider(
|
287 |
-
label="Open LLM Score (%) π",
|
288 |
-
info="ποΈ Slide to minimum Open LLM score",
|
289 |
-
value=0,
|
290 |
-
elem_id="threshold-slider",
|
291 |
-
)
|
292 |
-
with gr.Column(scale=1):
|
293 |
-
memory_slider = gr.Slider(
|
294 |
-
label="Peak Memory (MB) π",
|
295 |
-
info="ποΈ Slide to maximum Peak Memory",
|
296 |
-
minimum=0,
|
297 |
-
maximum=80 * 1024,
|
298 |
-
value=80 * 1024,
|
299 |
-
elem_id="memory-slider",
|
300 |
-
)
|
301 |
-
with gr.Column(scale=1):
|
302 |
-
backend_checkboxes = gr.CheckboxGroup(
|
303 |
-
label="Backends π",
|
304 |
-
choices=["pytorch", "onnxruntime"],
|
305 |
-
value=["pytorch", "onnxruntime"],
|
306 |
-
info="βοΈ Select the backends",
|
307 |
-
elem_id="backend-checkboxes",
|
308 |
-
)
|
309 |
-
with gr.Row():
|
310 |
-
with gr.Column(scale=1):
|
311 |
-
datatype_checkboxes = gr.CheckboxGroup(
|
312 |
-
label="Load Dtypes π₯",
|
313 |
-
choices=["float32", "float16"],
|
314 |
-
value=["float32", "float16"],
|
315 |
-
info="βοΈ Select the load dtypes",
|
316 |
-
elem_id="dtype-checkboxes",
|
317 |
-
)
|
318 |
-
with gr.Column(scale=1):
|
319 |
-
optimization_checkboxes = gr.CheckboxGroup(
|
320 |
-
label="Optimizations π οΈ",
|
321 |
-
choices=["None", "BetterTransformer", "FlashAttentionV2"],
|
322 |
-
value=["None", "BetterTransformer", "FlashAttentionV2"],
|
323 |
-
info="βοΈ Select the optimization",
|
324 |
-
elem_id="optimization-checkboxes",
|
325 |
-
)
|
326 |
-
with gr.Column(scale=1):
|
327 |
-
quantization_checkboxes = gr.CheckboxGroup(
|
328 |
-
label="Quantizations ποΈ",
|
329 |
-
choices=["None", "BnB.4bit", "GPTQ.4bit"],
|
330 |
-
value=["None", "BnB.4bit", "GPTQ.4bit"],
|
331 |
-
info="βοΈ Select the quantization schemes",
|
332 |
-
elem_id="quantization-checkboxes",
|
333 |
-
)
|
334 |
-
with gr.Row():
|
335 |
-
filter_button = gr.Button(
|
336 |
-
value="Filter π",
|
337 |
-
elem_id="filter-button",
|
338 |
-
)
|
339 |
-
for machine in MACHINE_TO_HARDWARE:
|
340 |
-
filter_button.click(
|
341 |
-
filter_query,
|
342 |
-
[
|
343 |
-
search_bar,
|
344 |
-
backend_checkboxes,
|
345 |
-
datatype_checkboxes,
|
346 |
-
optimization_checkboxes,
|
347 |
-
quantization_checkboxes,
|
348 |
-
score_slider,
|
349 |
-
memory_slider,
|
350 |
-
machine_placeholders[machine],
|
351 |
-
],
|
352 |
-
[machine_tables[machine], machine_plots[machine]],
|
353 |
)
|
354 |
-
|
355 |
####################### ABOUT TAB #######################
|
356 |
with gr.TabItem("About π", id=3):
|
357 |
-
gr.HTML(
|
358 |
-
gr.Markdown(
|
359 |
-
|
360 |
-
####################### CITATION #######################
|
361 |
with gr.Row():
|
362 |
with gr.Accordion("π Citation", open=False):
|
363 |
citation_button = gr.Textbox(
|
364 |
-
value=
|
365 |
label=CITATION_BUTTON_LABEL,
|
366 |
elem_id="citation-button",
|
367 |
show_copy_button=True,
|
368 |
)
|
369 |
|
370 |
-
|
371 |
-
demo
|
|
|
|
1 |
import os
|
2 |
|
3 |
import gradio as gr
|
|
|
|
|
|
|
4 |
|
5 |
+
from src.control_panel import create_control_panel, create_control_callback
|
6 |
+
from src.latency_score_memory import create_lat_score_mem_plot
|
7 |
+
from src.leaderboard import create_leaderboard_table
|
8 |
+
from src.flashattentionv2 import create_fa2_plots
|
9 |
+
from src.bettertransformer import create_bt_plots
|
10 |
+
from src.llm_perf import get_llm_perf_df
|
11 |
+
from src.assets import custom_css
|
12 |
+
from src.text import (
|
13 |
TITLE,
|
14 |
+
ABOUT,
|
15 |
+
INTRODUCTION,
|
16 |
+
EXAMPLE_CONFIG,
|
17 |
+
CITATION_BUTTON,
|
18 |
CITATION_BUTTON_LABEL,
|
|
|
19 |
)
|
20 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
21 |
|
22 |
+
# Logo displayed under the page title.
LOGO_URL = "https://huggingface.co/spaces/optimum/llm-perf-leaderboard/resolve/main/logo.png"
# Benchmark machine name -> hardware label used as the tab title.
MACHINE_TO_HARDWARE = {"hf-dgx-01": "A100-80GB π₯οΈ"}
HF_TOKEN = os.environ.get("HF_TOKEN", None)


# Page layout: title, logo and introduction, then one tab per machine (each
# with its own control panel, result sub-tabs and filter callback), an About
# tab, and a collapsible citation box.
demo = gr.Blocks(css=custom_css)
with demo:
    gr.HTML(TITLE, elem_classes="title")
    gr.HTML(f'<img src="{LOGO_URL}">', elem_classes="logo")
    gr.Markdown(INTRODUCTION, elem_classes="descriptive-text")
    ####################### HARDWARE TABS #######################
    with gr.Tabs(elem_classes="tabs"):
        # `tab_id` rather than `id`, to avoid shadowing the builtin
        for tab_id, (machine, hardware) in enumerate(MACHINE_TO_HARDWARE.items()):
            with gr.TabItem(hardware, id=tab_id):
                ####################### CONTROL PANEL #######################
                # Pass the tab's machine explicitly: the filter callback reads
                # it back from the hidden textbox, so relying on the default
                # would make every tab filter the default machine's data.
                (
                    filter_button,
                    machine_textbox,
                    search_bar,
                    score_slider,
                    memory_slider,
                    backend_checkboxes,
                    datatype_checkboxes,
                    optimization_checkboxes,
                    quantization_checkboxes,
                ) = create_control_panel(machine=machine)
                ####################### HARDWARE SUBTABS #######################
                with gr.Tabs(elem_classes="subtabs"):
                    llm_perf_df = get_llm_perf_df(machine=machine)
                    ####################### LEADERBOARD TAB #######################
                    with gr.TabItem("Leaderboard 🏅", id=0):
                        leaderboard_table = create_leaderboard_table(llm_perf_df)
                    ####################### LAT. vs. SCORE vs. MEM. TAB #######################
                    with gr.TabItem("Latency vs. Score vs. Memory π", id=1):
                        lat_score_mem_plot = create_lat_score_mem_plot(llm_perf_df)
                    ####################### BETTERTRANSFORMER SPEEDUP TAB #######################
                    with gr.TabItem("BetterTransformer Speedup π", id=2):
                        bt_prefill_plot, bt_decode_plot = create_bt_plots(llm_perf_df)
                    ####################### FLASHATTENTIONV2 SPEEDUP TAB #######################
                    with gr.TabItem("FlashAttentionV2 Speedup π", id=3):
                        fa2_prefill_plot, fa2_decode_plot = create_fa2_plots(llm_perf_df)
                ####################### CONTROL CALLBACK #######################
                create_control_callback(
                    filter_button,
                    # inputs
                    machine_textbox,
                    search_bar,
                    score_slider,
                    memory_slider,
                    backend_checkboxes,
                    datatype_checkboxes,
                    optimization_checkboxes,
                    quantization_checkboxes,
                    # outputs
                    leaderboard_table,
                    lat_score_mem_plot,
                    bt_prefill_plot,
                    bt_decode_plot,
                    fa2_prefill_plot,
                    fa2_decode_plot,
                )
        ####################### ABOUT TAB #######################
        with gr.TabItem("About π", id=3):
            gr.HTML(ABOUT, elem_classes="descriptive-text")
            gr.Markdown(EXAMPLE_CONFIG, elem_classes="descriptive-text")
    ####################### CITATION #######################
    with gr.Row():
        with gr.Accordion("π Citation", open=False):
            citation_button = gr.Textbox(
                value=CITATION_BUTTON,
                label=CITATION_BUTTON_LABEL,
                elem_id="citation-button",
                show_copy_button=True,
            )

if __name__ == "__main__":
    # Launch demo
    demo.queue().launch()
|
huggy_bench.png → logo.png
RENAMED
File without changes
|
pyproject.toml
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright 2021 The HuggingFace Team. All rights reserved.
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
|
15 |
+
[tool.black]
|
16 |
+
line-length = 119
|
17 |
+
target-version = ['py37']
|
18 |
+
|
19 |
+
[tool.ruff]
|
20 |
+
ignore = ["E501", "C901"]
|
21 |
+
select = ["C", "E", "F", "I", "W"]
|
script.py
DELETED
@@ -1,14 +0,0 @@
|
|
1 |
-
from huggingface_hub import hf_hub_download
import pandas as pd


# Fetch the Open LLM table from the dataset repo into ./dataset.
hf_hub_download(
    filename="open-llm.csv",
    repo_id="optimum/llm-perf-dataset",
    repo_type="dataset",
    local_dir="dataset",
)

open_llm = pd.read_csv("dataset/open-llm.csv")

# Print every distinct architecture, then the models tagged "rwkv".
architectures = open_llm["Arch"].unique()
print(architectures)
rwkv_rows = open_llm[open_llm["Arch"] == "rwkv"]
print(rwkv_rows["Model"].unique())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/{assets/css_html_js.py → assets.py}
RENAMED
@@ -6,14 +6,14 @@ custom_css = """
|
|
6 |
max-width: 100%
|
7 |
object-fit: contain;
|
8 |
}
|
9 |
-
.
|
10 |
font-size: 16px !important;
|
11 |
}
|
12 |
|
13 |
-
.
|
14 |
font-size: 20px;
|
15 |
}
|
16 |
-
.
|
17 |
font-size: 20px;
|
18 |
}
|
19 |
|
|
|
6 |
max-width: 100%
|
7 |
object-fit: contain;
|
8 |
}
|
9 |
+
.text {
|
10 |
font-size: 16px !important;
|
11 |
}
|
12 |
|
13 |
+
.tabs button {
|
14 |
font-size: 20px;
|
15 |
}
|
16 |
+
.subtabs button {
|
17 |
font-size: 20px;
|
18 |
}
|
19 |
|
src/bettertransformer.py
ADDED
@@ -0,0 +1,148 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import pandas as pd
|
3 |
+
import plotly.express as px
|
4 |
+
|
5 |
+
|
6 |
+
from src.utils import process_arch
|
7 |
+
|
8 |
+
|
9 |
+
# Columns attached to every point of the BetterTransformer plots and shown,
# in this order, in the hover tooltip. Each column appears exactly once so
# the tooltip does not repeat a line.
BETTERTRANSFORMER_DATA = [
    # open llm
    "Model π€",
    "Arch ποΈ",
    "Params (B)",
    "Open LLM Score (%)",
    # deployment settings
    "DType π₯",
    "Backend π­",
    "Quantization ποΈ",
    # primary measurements
    "Prefill Latency (s)",
    "Prefill Latency (s) BetterTransformer",
    "Decode Throughput (tokens/s)",
    "Decode Throughput (tokens/s) BetterTransformer",
    "E2E Throughput (tokens/s)",
    "E2E Throughput (tokens/s) BetterTransformer",
    # speedups
    "Prefill Latency Speedup (%)",
    "Decode Throughput Speedup (%)",
]
|
32 |
+
|
33 |
+
|
34 |
+
def get_bt_df(llm_perf_df):
    """Pair baseline runs with their BetterTransformer runs and compute speedups.

    Rows whose optimization is "None" are merged with rows whose optimization
    is "BetterTransformer" on model and quantization. Columns coming from the
    BetterTransformer side carry the " BetterTransformer" suffix. Two
    percentage columns are added, and rows where either one is not below 1000
    are dropped.

    Args:
        llm_perf_df: Performance dataframe with display column names.

    Returns:
        The merged dataframe with the two speedup columns.
    """
    prefill_speedup = "Prefill Latency Speedup (%)"
    decode_speedup = "Decode Throughput Speedup (%)"

    perf_df = llm_perf_df.copy()
    # process
    perf_df["Arch ποΈ"] = perf_df["Arch ποΈ"].apply(process_arch)

    # separate original model experiments from BetterTransformer experiments
    optimization = perf_df["Optimization π οΈ"]
    baseline_df = perf_df[optimization == "None"]
    optimized_df = perf_df[optimization == "BetterTransformer"]

    # merge the two dataframes
    paired_df = pd.merge(
        baseline_df,
        optimized_df,
        on=["Model π€", "Quantization ποΈ"],
        suffixes=["", " BetterTransformer"],
    )

    # compute speedups: baseline/optimized for latency, optimized/baseline for throughput
    prefill_ratio = paired_df["Prefill Latency (s)"] / paired_df["Prefill Latency (s) BetterTransformer"]
    paired_df[prefill_speedup] = (prefill_ratio * 100).round(2)
    decode_ratio = (
        paired_df["Decode Throughput (tokens/s) BetterTransformer"] / paired_df["Decode Throughput (tokens/s)"]
    )
    paired_df[decode_speedup] = (decode_ratio * 100).round(2)

    # filter speedups > 1000%
    plausible = (paired_df[prefill_speedup] < 1000) & (paired_df[decode_speedup] < 1000)
    return paired_df[plausible]
|
61 |
+
|
62 |
+
|
63 |
+
def get_bt_decode_fig(llm_perf_df):
    """Build the box plot of BetterTransformer decode-throughput speedup.

    One box per architecture, colored by quantization scheme, with all points
    drawn. Hovering a point lists the ``BETTERTRANSFORMER_DATA`` columns.

    Args:
        llm_perf_df: Performance dataframe with display column names.

    Returns:
        The configured plotly figure.
    """
    speedup_df = get_bt_df(llm_perf_df)

    # one hover line per attached column, indexed into the custom data
    hover_lines = []
    for position, column in enumerate(BETTERTRANSFORMER_DATA):
        hover_lines.append(f"<b>{column}:</b> %{{customdata[{position}]}}")

    # plot
    decode_fig = px.box(
        speedup_df,
        x="Arch ποΈ",
        y="Decode Throughput Speedup (%)",
        color="Quantization ποΈ",
        points="all",
        custom_data=BETTERTRANSFORMER_DATA,
        color_discrete_sequence=px.colors.qualitative.Light24,
    )
    # add hover data
    decode_fig.update_traces(hovertemplate="<br>".join(hover_lines))
    # add layout
    centered_title = {
        "text": "Decode Throughput Speedup per Architecture",
        "y": 0.95,
        "x": 0.5,
        "xanchor": "center",
        "yanchor": "top",
    }
    decode_fig.update_layout(
        title=centered_title,
        xaxis_title="LLM Architecture",
        yaxis_title="Decode Speedup (%)",
        legend_title="Quantization Scheme",
        width=1200,
        height=600,
    )

    return decode_fig
|
98 |
+
|
99 |
+
|
100 |
+
def get_bt_prefill_fig(llm_perf_df):
    """Build the box plot of BetterTransformer prefill-latency speedup.

    One box per architecture, colored by quantization scheme, with all points
    drawn. Hovering a point lists the ``BETTERTRANSFORMER_DATA`` columns.

    Args:
        llm_perf_df: Performance dataframe with display column names.

    Returns:
        The configured plotly figure.
    """
    speedup_df = get_bt_df(llm_perf_df)

    # one hover line per attached column, indexed into the custom data
    hover_lines = []
    for position, column in enumerate(BETTERTRANSFORMER_DATA):
        hover_lines.append(f"<b>{column}:</b> %{{customdata[{position}]}}")

    # plot
    prefill_fig = px.box(
        speedup_df,
        x="Arch ποΈ",
        y="Prefill Latency Speedup (%)",
        color="Quantization ποΈ",
        points="all",
        custom_data=BETTERTRANSFORMER_DATA,
        color_discrete_sequence=px.colors.qualitative.Light24,
    )
    # add hover data
    prefill_fig.update_traces(hovertemplate="<br>".join(hover_lines))
    # add layout
    centered_title = {
        "text": "Prefill Latency Speedup per Architecture",
        "y": 0.95,
        "x": 0.5,
        "xanchor": "center",
        "yanchor": "top",
    }
    prefill_fig.update_layout(
        title=centered_title,
        xaxis_title="LLM Architecture",
        yaxis_title="Prefill Speedup (%)",
        legend_title="Quantization Scheme",
        width=1200,
        height=600,
    )

    return prefill_fig
|
135 |
+
|
136 |
+
|
137 |
+
def create_bt_plots(llm_perf_df):
    """Render the BetterTransformer speedup plots in the current Gradio context.

    Emits a short hint, then one plot component for the prefill-latency
    speedup and one for the decode-throughput speedup.

    Args:
        llm_perf_df: Performance dataframe with display column names.

    Returns:
        A ``(prefill_plot, decode_plot)`` pair of Gradio plot components.
    """
    # descriptive text; the CSS class is set in addition to the id so a
    # class-based `.text` stylesheet rule applies to this element
    gr.HTML(
        "π Hover over the points π for additional information.",
        elem_id="text",
        elem_classes="text",
    )
    # get figures
    prefill_fig = get_bt_prefill_fig(llm_perf_df)
    decode_fig = get_bt_decode_fig(llm_perf_df)

    # create plots
    prefill_plot = gr.components.Plot(value=prefill_fig, elem_id="plot", show_label=False)
    decode_plot = gr.components.Plot(value=decode_fig, elem_id="plot", show_label=False)

    return prefill_plot, decode_plot
|
src/control_panel.py
ADDED
@@ -0,0 +1,168 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
|
3 |
+
from src.llm_perf import get_llm_perf_df
|
4 |
+
from src.leaderboard import get_leaderboard_df
|
5 |
+
from src.latency_score_memory import get_lat_score_mem_fig
|
6 |
+
from src.bettertransformer import get_bt_prefill_fig, get_bt_decode_fig
|
7 |
+
from src.flashattentionv2 import get_fa2_prefill_fig, get_fa2_decode_fig
|
8 |
+
|
9 |
+
|
10 |
+
def create_control_panel(machine: str = "hf-dgx-01"):
    """Create the filter controls in the current Gradio context.

    Args:
        machine: Machine name stored in a hidden textbox so it can be passed
            to the filter callback as an input.

    Returns:
        A tuple ``(filter_button, machine_textbox, search_bar, score_slider,
        memory_slider, backend_checkboxes, datatype_checkboxes,
        optimization_checkboxes, quantization_checkboxes)``.
    """
    # Every default selection must also be an available choice, so the
    # quantization options are defined once and used for both.
    quantization_choices = ["None", "BnB.4bit", "GPTQ.4bit+ExllamaV1", "GPTQ.4bit+ExllamaV2"]

    # descriptive text
    gr.HTML("Use this control panel to filter this leaderboard.", elem_id="text")
    # controls
    machine_textbox = gr.Textbox(value=machine, visible=False)
    with gr.Row():
        with gr.Column():
            search_bar = gr.Textbox(
                label="Model π€",
                info="π Search for a model name",
                elem_id="search-bar",
            )
    with gr.Row():
        with gr.Column(scale=1):
            score_slider = gr.Slider(
                label="Open LLM Score (%) π",
                info="ποΈ Slide to minimum Open LLM score",
                value=0,
                elem_id="threshold-slider",
            )
        with gr.Column(scale=1):
            memory_slider = gr.Slider(
                label="Peak Memory (MB) π",
                info="ποΈ Slide to maximum Peak Memory",
                minimum=0,
                maximum=80 * 1024,
                value=80 * 1024,
                elem_id="memory-slider",
            )
        with gr.Column(scale=1):
            backend_checkboxes = gr.CheckboxGroup(
                label="Backends π­",
                choices=["pytorch", "onnxruntime"],
                value=["pytorch", "onnxruntime"],
                info="βοΈ Select the backends",
                elem_id="backend-checkboxes",
            )
    with gr.Row():
        with gr.Column(scale=1):
            datatype_checkboxes = gr.CheckboxGroup(
                label="DTypes π₯",
                choices=["float32", "float16"],
                value=["float32", "float16"],
                info="βοΈ Select the load data types",
                elem_id="dtype-checkboxes",
            )
        with gr.Column(scale=1):
            optimization_checkboxes = gr.CheckboxGroup(
                label="Optimizations π οΈ",
                choices=["None", "BetterTransformer", "FlashAttentionV2"],
                value=["None", "BetterTransformer", "FlashAttentionV2"],
                info="βοΈ Select the optimization",
                elem_id="optimization-checkboxes",
            )
        with gr.Column(scale=1):
            quantization_checkboxes = gr.CheckboxGroup(
                label="Quantizations ποΈ",
                choices=quantization_choices,
                value=list(quantization_choices),
                info="βοΈ Select the quantization schemes",
                elem_id="quantization-checkboxes",
            )
    with gr.Row():
        filter_button = gr.Button(
            value="Filter π",
            elem_id="filter-button",
        )

    return (
        filter_button,
        machine_textbox,
        search_bar,
        score_slider,
        memory_slider,
        backend_checkboxes,
        datatype_checkboxes,
        optimization_checkboxes,
        quantization_checkboxes,
    )
|
89 |
+
|
90 |
+
|
91 |
+
def filter_fn(
    machine,
    model,
    backends,
    datatypes,
    optimizations,
    quantizations,
    score,
    memory,
):
    """Filter the machine's performance data and rebuild every output.

    Args:
        machine: Machine whose performance data is loaded.
        model: Substring to look for in the model name (case-insensitive).
        backends: Backend names to keep.
        datatypes: Data type names to keep.
        optimizations: Optimization labels to keep.
        quantizations: Quantization labels to keep.
        score: Minimum Open LLM score.
        memory: Maximum allocated memory in MB.

    Returns:
        A list with the leaderboard dataframe, the latency/score/memory
        figure, the BetterTransformer prefill and decode figures, and the
        FlashAttentionV2 prefill and decode figures, in that order.
    """
    raw_df = get_llm_perf_df(machine=machine)
    # The search text is matched literally (regex=False) so characters such as
    # "(" or "+" cannot raise a regex error; rows without a model name simply
    # do not match (na=False) instead of producing a non-boolean mask.
    name_matches = raw_df["Model π€"].str.contains(model, case=False, regex=False, na=False)
    filtered_df = raw_df[
        name_matches
        & raw_df["Backend π­"].isin(backends)
        & raw_df["DType π₯"].isin(datatypes)
        & raw_df["Optimization π οΈ"].isin(optimizations)
        & raw_df["Quantization ποΈ"].isin(quantizations)
        & (raw_df["Open LLM Score (%)"] >= score)
        & (raw_df["Allocated Memory (MB)"] <= memory)
    ]
    filtered_leaderboard_df = get_leaderboard_df(filtered_df)
    filtered_lat_score_mem_fig = get_lat_score_mem_fig(filtered_df)
    filtered_bt_prefill_fig = get_bt_prefill_fig(filtered_df)
    filtered_bt_decode_fig = get_bt_decode_fig(filtered_df)
    filtered_fa2_prefill_fig = get_fa2_prefill_fig(filtered_df)
    filtered_fa2_decode_fig = get_fa2_decode_fig(filtered_df)

    return [
        filtered_leaderboard_df,
        filtered_lat_score_mem_fig,
        filtered_bt_prefill_fig,
        filtered_bt_decode_fig,
        filtered_fa2_prefill_fig,
        filtered_fa2_decode_fig,
    ]
|
126 |
+
|
127 |
+
|
128 |
+
def create_control_callback(
    # button
    filter_button,
    # inputs
    machine_textbox,
    search_bar,
    score_slider,
    memory_slider,
    backend_checkboxes,
    datatype_checkboxes,
    optimization_checkboxes,
    quantization_checkboxes,
    # outputs
    leaderboard_table,
    lat_score_mem_plot,
    bt_prefill_plot,
    bt_decode_plot,
    fa2_prefill_plot,
    fa2_decode_plot,
):
    """Register ``filter_fn`` as the click handler of the filter button.

    The control components are passed to ``filter_fn`` as inputs and the table
    and plot components receive its return values.
    """
    # Listed in the positional order of filter_fn's parameters, which differs
    # from the order of this function's own parameters.
    callback_inputs = [
        machine_textbox,
        search_bar,
        backend_checkboxes,
        datatype_checkboxes,
        optimization_checkboxes,
        quantization_checkboxes,
        score_slider,
        memory_slider,
    ]
    # Listed in the order of the list returned by filter_fn.
    callback_outputs = [
        leaderboard_table,
        lat_score_mem_plot,
        bt_prefill_plot,
        bt_decode_plot,
        fa2_prefill_plot,
        fa2_decode_plot,
    ]
    filter_button.click(fn=filter_fn, inputs=callback_inputs, outputs=callback_outputs)
|
src/flashattentionv2.py
ADDED
@@ -0,0 +1,148 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import pandas as pd
|
3 |
+
import plotly.express as px
|
4 |
+
|
5 |
+
|
6 |
+
from src.utils import process_arch
|
7 |
+
|
8 |
+
|
9 |
+
# Columns attached to every point of the FlashAttentionV2 box plots as
# ``custom_data``; the hover template is generated from this list by index,
# so each column must appear exactly once (duplicates repeat hover rows).
FLASHATTENTIONV2_DATA = [
    # open llm
    "Model π€",
    "Arch ποΈ",
    "Params (B)",
    "Open LLM Score (%)",
    # deployment settings
    "DType π₯",
    "Backend π­",
    "Quantization ποΈ",
    # primary measurements (baseline next to its FlashAttentionV2 counterpart)
    "Prefill Latency (s)",
    "Prefill Latency (s) FlashAttentionV2",
    "Decode Throughput (tokens/s)",
    "Decode Throughput (tokens/s) FlashAttentionV2",
    "E2E Throughput (tokens/s)",
    "E2E Throughput (tokens/s) FlashAttentionV2",
    # speedups
    "Prefill Latency Speedup (%)",
    "Decode Throughput Speedup (%)",
]
|
32 |
+
|
33 |
+
|
34 |
+
def get_fa2_df(llm_perf_df):
    """Pair baseline runs with FlashAttentionV2 runs and compute speedups.

    Works on a copy of ``llm_perf_df``. Rows whose optimization is "None" are
    merged with rows whose optimization is "FlashAttentionV2" on model and
    quantization; columns coming from the FlashAttentionV2 side carry the
    " FlashAttentionV2" suffix. The two speedup columns are ratios scaled by
    100 (so 100 means parity) rounded to two decimals, and only rows where
    both are below 1000 are returned.
    """
    perf = llm_perf_df.copy()
    perf["Arch ποΈ"] = perf["Arch ποΈ"].apply(process_arch)

    # separate the un-optimized experiments from the FlashAttentionV2 ones
    optimization = perf["Optimization π οΈ"]
    baseline_runs = perf[optimization == "None"]
    flash_runs = perf[optimization == "FlashAttentionV2"]

    paired = pd.merge(
        baseline_runs,
        flash_runs,
        on=["Model π€", "Quantization ποΈ"],
        suffixes=["", " FlashAttentionV2"],
    )

    # latency: lower is better, so baseline / optimized
    prefill_ratio = paired["Prefill Latency (s)"] / paired["Prefill Latency (s) FlashAttentionV2"]
    paired["Prefill Latency Speedup (%)"] = (prefill_ratio * 100).round(2)
    # throughput: higher is better, so optimized / baseline
    decode_ratio = paired["Decode Throughput (tokens/s) FlashAttentionV2"] / paired["Decode Throughput (tokens/s)"]
    paired["Decode Throughput Speedup (%)"] = (decode_ratio * 100).round(2)

    # drop implausible outliers (1000% and above) on either metric
    plausible = (paired["Prefill Latency Speedup (%)"] < 1000) & (paired["Decode Throughput Speedup (%)"] < 1000)
    return paired[plausible]
|
61 |
+
|
62 |
+
|
63 |
+
def get_fa2_decode_fig(llm_perf_df):
    """Build the box plot of decode-throughput speedup per architecture.

    One box per architecture, coloured by quantization scheme, with every
    underlying point drawn. Hovering a point lists all columns named in
    ``FLASHATTENTIONV2_DATA``.
    """
    speedups = get_fa2_df(llm_perf_df)

    # one "<b>column:</b> value" hover row per custom_data column, by index
    hover_rows = []
    for position, column in enumerate(FLASHATTENTIONV2_DATA):
        hover_rows.append(f"<b>{column}:</b> %{{customdata[{position}]}}")

    figure = px.box(
        speedups,
        x="Arch ποΈ",
        y="Decode Throughput Speedup (%)",
        color_discrete_sequence=px.colors.qualitative.Light24,
        custom_data=FLASHATTENTIONV2_DATA,
        color="Quantization ποΈ",
        points="all",
    )
    figure.update_traces(hovertemplate="<br>".join(hover_rows))

    centered_title = dict(
        text="Decode Throughput Speedup per Architecture",
        y=0.95,
        x=0.5,
        xanchor="center",
        yanchor="top",
    )
    figure.update_layout(
        title=centered_title,
        xaxis_title="LLM Architecture",
        yaxis_title="Decode Speedup (%)",
        legend_title="Quantization Scheme",
        width=1200,
        height=600,
    )
    return figure
|
98 |
+
|
99 |
+
|
100 |
+
def get_fa2_prefill_fig(llm_perf_df):
    """Build the box plot of prefill-latency speedup per architecture.

    One box per architecture, coloured by quantization scheme, with every
    underlying point drawn. Hovering a point lists all columns named in
    ``FLASHATTENTIONV2_DATA``.
    """
    speedups = get_fa2_df(llm_perf_df)

    # one "<b>column:</b> value" hover row per custom_data column, by index
    hover_rows = []
    for position, column in enumerate(FLASHATTENTIONV2_DATA):
        hover_rows.append(f"<b>{column}:</b> %{{customdata[{position}]}}")

    figure = px.box(
        speedups,
        x="Arch ποΈ",
        y="Prefill Latency Speedup (%)",
        color_discrete_sequence=px.colors.qualitative.Light24,
        custom_data=FLASHATTENTIONV2_DATA,
        color="Quantization ποΈ",
        points="all",
    )
    figure.update_traces(hovertemplate="<br>".join(hover_rows))

    centered_title = dict(
        text="Prefill Latency Speedup per Architecture",
        y=0.95,
        x=0.5,
        xanchor="center",
        yanchor="top",
    )
    figure.update_layout(
        title=centered_title,
        xaxis_title="LLM Architecture",
        yaxis_title="Prefill Speedup (%)",
        legend_title="Quantization Scheme",
        width=1200,
        height=600,
    )
    return figure
|
135 |
+
|
136 |
+
|
137 |
+
def create_fa2_plots(llm_perf_df):
    """Emit the hover hint and the two FlashAttentionV2 plot components.

    Returns a ``(prefill_plot, decode_plot)`` tuple of ``gr.components.Plot``.
    """
    # hint shown above the plots
    gr.HTML("π Hover over the points π for additional information.", elem_id="text")

    # figures are built first (prefill, then decode), components afterwards
    figures = (get_fa2_prefill_fig(llm_perf_df), get_fa2_decode_fig(llm_perf_df))
    prefill_plot, decode_plot = (
        gr.components.Plot(value=figure, elem_id="plot", show_label=False) for figure in figures
    )
    return prefill_plot, decode_plot
|
src/latency_score_memory.py
ADDED
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import plotly.express as px
|
3 |
+
|
4 |
+
|
5 |
+
# Columns attached to each scatter point as ``custom_data``. The hover
# template in ``get_lat_score_mem_fig`` addresses them by position, so the
# order here is the order of the rows in the hover tooltip.
SCORE_MEMORY_LATENCY_DATA = [
    # model identity and deployment settings
    "Model π€",
    "Arch ποΈ",
    "Params (B)",
    "DType π₯",
    "Backend π­",
    # quality and performance measurements
    "Open LLM Score (%)",
    "Prefill Latency (s)",
    "Decode Throughput (tokens/s)",
    "Allocated Memory (MB)",
    "E2E Latency (s)",
    "E2E Throughput (tokens/s)",
]
|
18 |
+
|
19 |
+
|
20 |
+
def get_lat_score_mem_fig(llm_perf_df):
    """Build the latency / score / memory scatter plot.

    End-to-end latency is on the x axis, Open LLM score on the y axis, marker
    size follows allocated memory and colour follows architecture. The plot
    is drawn from a copy of ``llm_perf_df``. Hovering a point lists all
    columns named in ``SCORE_MEMORY_LATENCY_DATA``.
    """
    points = llm_perf_df.copy()

    # one "<b>column:</b> value" hover row per custom_data column, by index
    hover_rows = []
    for position, column in enumerate(SCORE_MEMORY_LATENCY_DATA):
        hover_rows.append(f"<b>{column}:</b> %{{customdata[{position}]}}")

    scatter = px.scatter(
        points,
        x="E2E Latency (s)",
        y="Open LLM Score (%)",
        size="Allocated Memory (MB)",
        color="Arch ποΈ",
        custom_data=SCORE_MEMORY_LATENCY_DATA,
        color_discrete_sequence=px.colors.qualitative.Light24,
    )
    scatter.update_traces(hovertemplate="<br>".join(hover_rows))

    centered_title = dict(
        text="Latency vs. Score vs. Memory",
        y=0.95,
        x=0.5,
        xanchor="center",
        yanchor="top",
    )
    scatter.update_layout(
        title=centered_title,
        xaxis_title="Per 1000 Tokens Latency (s)",
        yaxis_title="Open LLM Score (%)",
        legend_title="LLM Architecture",
        width=1200,
        height=600,
    )
    return scatter
|
53 |
+
|
54 |
+
|
55 |
+
def create_lat_score_mem_plot(llm_perf_df):
    """Emit the hover hint and the latency/score/memory plot component.

    Returns the ``gr.components.Plot`` wrapping the scatter figure.
    """
    # hint shown above the plot
    gr.HTML("π Hover over the points π for additional information. ",elem_id="text")
    scatter = get_lat_score_mem_fig(llm_perf_df)
    return gr.components.Plot(value=scatter, elem_id="plot", show_label=False)
|
src/leaderboard.py
ADDED
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
|
3 |
+
from src.utils import model_hyperlink, process_score
|
4 |
+
|
5 |
+
|
6 |
+
# Leaderboard column header -> datatype string handed to the gradio Dataframe.
# Insertion order is significant: headers and datatypes are both derived from
# this mapping in ``create_leaderboard_table``.
LEADERBOARD_COLUMN_TO_DATATYPE = {
    # open llm
    "Model π€": "markdown",
    "Arch ποΈ": "markdown",
    "Params (B)": "number",
    "Open LLM Score (%)": "number",
    # deployment settings
    "DType π₯": "str",
    "Backend π­": "str",
    "Optimization π οΈ": "str",
    "Quantization ποΈ": "str",
    # primary measurements
    "Prefill Latency (s)": "number",
    "Decode Throughput (tokens/s)": "number",
    "Allocated Memory (MB)": "number",
    "Energy (tokens/kWh)": "number",
    # additional measurements
    "E2E Latency (s)": "number",
    "E2E Throughput (tokens/s)": "number",
    "Reserved Memory (MB)": "number",
    "Used Memory (MB)": "number",
}
|
28 |
+
|
29 |
+
|
30 |
+
def process_model(model_name):
    """Return the hyperlink markup for ``model_name`` pointing at its Hub page."""
    return model_hyperlink(f"https://huggingface.co/{model_name}", model_name)
|
33 |
+
|
34 |
+
|
35 |
+
def get_leaderboard_df(llm_perf_df):
    """Return a copy of ``llm_perf_df`` formatted for the leaderboard table.

    Model names are turned into hyperlinks and each score is rewritten by
    ``process_score`` using the row's quantization scheme.
    """

    def _score_cell(row):
        # the score text depends on the row's quantization, hence row-wise
        return process_score(row["Open LLM Score (%)"], row["Quantization ποΈ"])

    table = llm_perf_df.copy()
    table["Model π€"] = table["Model π€"].apply(process_model)
    table["Open LLM Score (%)"] = table.apply(_score_cell, axis=1)
    return table
|
45 |
+
|
46 |
+
|
47 |
+
def create_leaderboard_table(llm_perf_df):
    """Emit the scroll hint and the leaderboard table component.

    Returns the ``gr.components.Dataframe`` holding the formatted leaderboard.
    """
    # hint shown above the table
    gr.HTML("π Scroll to the right π for additional columns.", elem_id="text")

    formatted = get_leaderboard_df(llm_perf_df)
    # headers and datatypes come from the same mapping, in the same order
    column_names = list(LEADERBOARD_COLUMN_TO_DATATYPE)
    column_types = [LEADERBOARD_COLUMN_TO_DATATYPE[name] for name in column_names]

    return gr.components.Dataframe(
        value=formatted,
        datatype=column_types,
        headers=column_names,
        elem_id="table",
    )
|
src/llm_perf.py
ADDED
@@ -0,0 +1,127 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
|
3 |
+
import pandas as pd
|
4 |
+
from huggingface_hub import hf_hub_download
|
5 |
+
|
6 |
+
# Hub dataset repository that stores the benchmark reports.
LLM_PERF_DATASET_REPO = "optimum/llm-perf-dataset"
# Access token read from the environment; None when the variable is unset.
HF_TOKEN = os.getenv("HF_TOKEN")

# Raw report column -> display name. Only these columns are kept, in this
# order, by ``get_llm_perf_df``.
COLUMNS_MAPPING = {
    "Model": "Model π€",
    "Arch": "Arch ποΈ",
    "Size": "Params (B)",
    "Score": "Open LLM Score (%)",
    # deployment settings
    "backend.name": "Backend π­",
    "backend.torch_dtype": "DType π₯",
    "optimization": "Optimization π οΈ",
    "quantization": "Quantization ποΈ",
    # primary measurements
    "forward.latency(s)": "Prefill Latency (s)",
    "decode.throughput(tokens/s)": "Decode Throughput (tokens/s)",
    "generate.max_memory_allocated(MB)": "Allocated Memory (MB)",
    "generate.energy_consumption(tokens/kWh)": "Energy (tokens/kWh)",
    # additional measurements
    "generate.latency(s)": "E2E Latency (s)",
    "generate.throughput(tokens/s)": "E2E Throughput (tokens/s)",
    "generate.max_memory_reserved(MB)": "Reserved Memory (MB)",
    "generate.max_memory_used(MB)": "Used Memory (MB)",
}
# Sort keys, in priority order, paired element-wise with SORTING_ASCENDING:
# score descending, prefill latency ascending, decode throughput descending.
SORTING_COLUMNS = [
    "Open LLM Score (%)",
    "Prefill Latency (s)",
    "Decode Throughput (tokens/s)",
]
SORTING_ASCENDING = [False, True, False]
|
36 |
+
|
37 |
+
|
38 |
+
def get_llm_df():
    """Download ``open-llm.csv`` from the dataset repo and load it.

    The file is fetched into the local ``dataset`` directory and read from
    the path reported by ``hf_hub_download``, so the on-disk location is
    stated in one place only.

    Returns:
        pandas.DataFrame parsed from the downloaded CSV.
    """
    csv_path = hf_hub_download(
        repo_id=LLM_PERF_DATASET_REPO,
        filename="open-llm.csv",
        local_dir="dataset",
        repo_type="dataset",
        token=HF_TOKEN,
    )
    return pd.read_csv(csv_path)
|
49 |
+
|
50 |
+
|
51 |
+
def get_perf_df(machine: str = "hf-dgx-01"):
    """Download the performance report of ``machine`` and load it.

    ``{machine}/perf-report.csv`` is fetched from the dataset repo into the
    local ``dataset`` directory and read from the path reported by
    ``hf_hub_download``, so the on-disk location is stated in one place only.

    Args:
        machine: name of the machine sub-directory in the dataset repo.

    Returns:
        pandas.DataFrame parsed from the downloaded CSV.
    """
    csv_path = hf_hub_download(
        repo_id=LLM_PERF_DATASET_REPO,
        filename=f"{machine}/perf-report.csv",
        local_dir="dataset",
        repo_type="dataset",
        token=HF_TOKEN,
    )
    return pd.read_csv(csv_path)
|
62 |
+
|
63 |
+
|
64 |
+
def _optimization_label(row):
    """Name the attention optimization of one report row (BetterTransformer wins)."""
    if row["backend.to_bettertransformer"]:
        return "BetterTransformer"
    if row["backend.use_flash_attention_2"]:
        return "FlashAttentionV2"
    return "None"


def _quantization_label(row):
    """Name the quantization scheme of one report row, "None" when unrecognized."""
    scheme = row["backend.quantization_scheme"]
    if scheme == "bnb":
        return "BnB.4bit"
    if scheme == "gptq":
        exllama_version = row["backend.quantization_config.exllama_config.version"]
        if exllama_version == 1:
            return "GPTQ.4bit+ExllamaV1"
        if exllama_version == 2:
            return "GPTQ.4bit+ExllamaV2"
    return "None"


def get_llm_perf_df(machine: str = "hf-dgx-01"):
    """Join the Open LLM table with a machine's report and derive display columns.

    Args:
        machine: name of the machine whose performance report is loaded.

    Returns:
        pandas.DataFrame restricted to the columns of ``COLUMNS_MAPPING``
        (renamed to their display names) and sorted by ``SORTING_COLUMNS``.

    Raises:
        AssertionError: if the report mixes several batch sizes, sequence
            lengths or new-token counts.
    """
    # get dataframes
    llm_df = get_llm_df()
    perf_df = get_perf_df(machine=machine)
    llm_perf_df = pd.merge(llm_df, perf_df, left_on="Model", right_on="model")

    # all experiments must share one benchmark setup to be comparable
    assert llm_perf_df["benchmark.input_shapes.batch_size"].nunique() == 1
    assert llm_perf_df["benchmark.input_shapes.sequence_length"].nunique() == 1
    assert llm_perf_df["benchmark.new_tokens"].nunique() == 1

    # invert energy consumption (kWh/token -> tokens/kWh); missing values are
    # filled with 1 so that their inverse is exactly 1 ...
    llm_perf_df["generate.energy_consumption(tokens/kWh)"] = (
        1 / llm_perf_df["generate.energy_consumption(kWh/token)"].fillna(1)
    ).astype(int)
    # ... which is then used as the marker to put the missing value back
    llm_perf_df.loc[
        llm_perf_df["generate.energy_consumption(tokens/kWh)"] == 1,
        "generate.energy_consumption(tokens/kWh)",
    ] = pd.NA

    # add optimization column
    llm_perf_df["optimization"] = llm_perf_df[
        ["backend.to_bettertransformer", "backend.use_flash_attention_2"]
    ].apply(_optimization_label, axis=1)
    # add quantization scheme
    llm_perf_df["quantization"] = llm_perf_df[
        [
            "backend.quantization_scheme",
            "backend.quantization_config.exllama_config.version",
        ]
    ].apply(_quantization_label, axis=1)

    # add decode throughput: tokens generated over the time spent after the
    # first forward pass.
    # NOTE(review): 1000 presumably mirrors benchmark.new_tokens - confirm.
    llm_perf_df["decode.throughput(tokens/s)"] = (
        1000 / (llm_perf_df["generate.latency(s)"] - llm_perf_df["forward.latency(s)"])
    ).round(2)

    # Select, rename and sort by producing new frames: mutating a column
    # subset in place is chained assignment and raises SettingWithCopyWarning.
    llm_perf_df = llm_perf_df[list(COLUMNS_MAPPING)].rename(columns=COLUMNS_MAPPING)
    llm_perf_df = llm_perf_df.sort_values(by=SORTING_COLUMNS, ascending=SORTING_ASCENDING)

    return llm_perf_df
|
src/{assets/text_content.py β text.py}
RENAMED
@@ -1,6 +1,6 @@
|
|
1 |
TITLE = """<h1 align="center" id="space-title">π€ LLM-Perf Leaderboard ποΈ</h1>"""
|
2 |
|
3 |
-
|
4 |
The π€ LLM-Perf Leaderboard ποΈ aims to benchmark the performance (latency, throughput, memory & energy) of Large Language Models (LLMs) with different hardwares, backends and optimizations using [Optimum-Benchmark](https://github.com/huggingface/optimum-benchmark) and [Optimum](https://github.com/huggingface/optimum) flavors.
|
5 |
|
6 |
Anyone from the community can request a model or a hardware/backend/optimization configuration for automated benchmarking:
|
@@ -8,7 +8,7 @@ Anyone from the community can request a model or a hardware/backend/optimization
|
|
8 |
- Hardware/Backend/Optimization performance requests should be made in the [community discussions](https://huggingface.co/spaces/optimum/llm-perf-leaderboard/discussions) to assess their relevance and feasibility.
|
9 |
"""
|
10 |
|
11 |
-
|
12 |
<ul>
|
13 |
<li>To avoid communication-dependent results, only one GPU is used.</li>
|
14 |
<li>Score is the average evaluation score obtained from the <a href="https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard">π€ Open LLM Leaderboard</a>.</li>
|
@@ -18,11 +18,26 @@ ABOUT_TEXT = """<h3>About the π€ LLM-Perf Leaderboard ποΈ</h3>
|
|
18 |
</ul>
|
19 |
"""
|
20 |
|
21 |
-
|
22 |
Here's an example of the configuration file used to benchmark the models with Optimum-Benchmark:
|
23 |
```yaml
|
24 |
defaults:
|
25 |
-
- backend: pytorch
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
26 |
- benchmark: inference # default benchmark
|
27 |
- experiment # inheriting from experiment config
|
28 |
- _self_ # for hydra 1.1 compatibility
|
@@ -31,39 +46,38 @@ defaults:
|
|
31 |
|
32 |
hydra:
|
33 |
run:
|
34 |
-
dir:
|
35 |
job:
|
36 |
chdir: true
|
|
|
|
|
|
|
37 |
|
38 |
-
|
39 |
-
|
40 |
-
model: {model}
|
41 |
-
|
42 |
-
device: cuda
|
43 |
|
44 |
backend:
|
45 |
-
|
46 |
-
|
47 |
-
bettertransformer: true
|
48 |
-
quantization_scheme: gptq
|
49 |
-
|
50 |
|
51 |
benchmark:
|
|
|
52 |
memory: true
|
53 |
energy: true
|
54 |
-
|
55 |
new_tokens: 1000
|
56 |
input_shapes:
|
57 |
batch_size: 1
|
58 |
sequence_length: 256
|
59 |
|
60 |
-
|
|
|
61 |
```
|
62 |
"""
|
63 |
|
64 |
|
65 |
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results."
|
66 |
-
|
67 |
author = {Ilyas Moutawwakil, RΓ©gis Pierrard},
|
68 |
title = {LLM-Perf Leaderboard},
|
69 |
year = {2023},
|
|
|
1 |
TITLE = """<h1 align="center" id="space-title">π€ LLM-Perf Leaderboard ποΈ</h1>"""
|
2 |
|
3 |
+
INTRODUCTION = """
|
4 |
The π€ LLM-Perf Leaderboard ποΈ aims to benchmark the performance (latency, throughput, memory & energy) of Large Language Models (LLMs) with different hardwares, backends and optimizations using [Optimum-Benchmark](https://github.com/huggingface/optimum-benchmark) and [Optimum](https://github.com/huggingface/optimum) flavors.
|
5 |
|
6 |
Anyone from the community can request a model or a hardware/backend/optimization configuration for automated benchmarking:
|
|
|
8 |
- Hardware/Backend/Optimization performance requests should be made in the [community discussions](https://huggingface.co/spaces/optimum/llm-perf-leaderboard/discussions) to assess their relevance and feasibility.
|
9 |
"""
|
10 |
|
11 |
+
ABOUT = """<h3>About the π€ LLM-Perf Leaderboard ποΈ</h3>
|
12 |
<ul>
|
13 |
<li>To avoid communication-dependent results, only one GPU is used.</li>
|
14 |
<li>Score is the average evaluation score obtained from the <a href="https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard">π€ Open LLM Leaderboard</a>.</li>
|
|
|
18 |
</ul>
|
19 |
"""
|
20 |
|
21 |
+
EXAMPLE_CONFIG = """
|
22 |
Here's an example of the configuration file used to benchmark the models with Optimum-Benchmark:
|
23 |
```yaml
|
24 |
defaults:
|
25 |
+
- backend: pytorch
|
26 |
+
- _base_ # inheriting from base config
|
27 |
+
- _self_ # for hydra 1.1 compatibility
|
28 |
+
|
29 |
+
experiment_name: pytorch+cuda+float16+bettertransformer
|
30 |
+
device: cuda
|
31 |
+
|
32 |
+
backend:
|
33 |
+
no_weights: true
|
34 |
+
torch_dtype: float16
|
35 |
+
to_bettertransformer: true
|
36 |
+
```
|
37 |
+
|
38 |
+
Where the base config is:
|
39 |
+
```yaml
|
40 |
+
defaults:
|
41 |
- benchmark: inference # default benchmark
|
42 |
- experiment # inheriting from experiment config
|
43 |
- _self_ # for hydra 1.1 compatibility
|
|
|
46 |
|
47 |
hydra:
|
48 |
run:
|
49 |
+
dir: ???
|
50 |
job:
|
51 |
chdir: true
|
52 |
+
env_set:
|
53 |
+
CUDA_VISIBLE_DEVICES: 0
|
54 |
+
CUDA_DEVICE_ORDER: PCI_BUS_ID
|
55 |
|
56 |
+
model: ???
|
57 |
+
experiment_name: ???
|
|
|
|
|
|
|
58 |
|
59 |
backend:
|
60 |
+
initial_isolation_check: true
|
61 |
+
continous_isolation_check: true
|
|
|
|
|
|
|
62 |
|
63 |
benchmark:
|
64 |
+
duration: 10
|
65 |
memory: true
|
66 |
energy: true
|
67 |
+
|
68 |
new_tokens: 1000
|
69 |
input_shapes:
|
70 |
batch_size: 1
|
71 |
sequence_length: 256
|
72 |
|
73 |
+
hub_kwargs:
|
74 |
+
trust_remote_code: true
|
75 |
```
|
76 |
"""
|
77 |
|
78 |
|
79 |
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results."
|
80 |
+
CITATION_BUTTON = r"""@misc{llm-perf-leaderboard,
|
81 |
author = {Ilyas Moutawwakil, RΓ©gis Pierrard},
|
82 |
title = {LLM-Perf Leaderboard},
|
83 |
year = {2023},
|
src/utils.py
CHANGED
@@ -1,22 +1,3 @@
|
|
1 |
-
from huggingface_hub import HfApi, Repository
|
2 |
-
import gradio as gr
|
3 |
-
import json
|
4 |
-
|
5 |
-
|
6 |
-
def change_tab(query_param):
|
7 |
-
query_param = query_param.replace("'", '"')
|
8 |
-
query_param = json.loads(query_param)
|
9 |
-
|
10 |
-
if (
|
11 |
-
isinstance(query_param, dict)
|
12 |
-
and "tab" in query_param
|
13 |
-
and query_param["tab"] == "plot"
|
14 |
-
):
|
15 |
-
return gr.Tabs.update(selected=1)
|
16 |
-
else:
|
17 |
-
return gr.Tabs.update(selected=0)
|
18 |
-
|
19 |
-
|
20 |
LLM_MODEL_ARCHS = {
|
21 |
"stablelm_epoch": "π΄ StableLM-Epoch",
|
22 |
"stablelm_alpha": "π΄ StableLM-Alpha",
|
@@ -24,8 +5,8 @@ LLM_MODEL_ARCHS = {
|
|
24 |
"RefinedWebModel": "π¦
Falcon",
|
25 |
"gpt_bigcode": "β StarCoder",
|
26 |
"RefinedWeb": "π¦
Falcon",
|
27 |
-
"baichuan": "π Baichuan ηΎε·",
|
28 |
-
"internlm": "π§βπ InternLM δΉ¦η",
|
29 |
"mistral": "βοΈ Mistral",
|
30 |
"codegen": "βΎοΈ CodeGen",
|
31 |
"chatglm": "π¬ ChatGLM",
|
@@ -34,7 +15,7 @@ LLM_MODEL_ARCHS = {
|
|
34 |
"llama": "π¦ LLaMA",
|
35 |
"rwkv": "π¦ββ¬ RWKV",
|
36 |
"mpt": "𧱠MPT",
|
37 |
-
"Yi": "π« Yi δΊΊ", # people
|
38 |
# suggest something
|
39 |
"gpt_neox": "GPT-NeoX",
|
40 |
"gpt_neo": "GPT-Neo",
|
@@ -50,13 +31,25 @@ def model_hyperlink(link, model_name):
|
|
50 |
return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
|
51 |
|
52 |
|
53 |
-
def
|
54 |
-
link = f"https://huggingface.co/{model_name}"
|
55 |
-
return model_hyperlink(link, model_name)
|
56 |
-
|
57 |
-
|
58 |
-
def process_model_arch(model_arch):
|
59 |
if model_arch in LLM_MODEL_ARCHS:
|
60 |
return LLM_MODEL_ARCHS[model_arch]
|
61 |
else:
|
62 |
return model_arch
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
LLM_MODEL_ARCHS = {
|
2 |
"stablelm_epoch": "π΄ StableLM-Epoch",
|
3 |
"stablelm_alpha": "π΄ StableLM-Alpha",
|
|
|
5 |
"RefinedWebModel": "π¦
Falcon",
|
6 |
"gpt_bigcode": "β StarCoder",
|
7 |
"RefinedWeb": "π¦
Falcon",
|
8 |
+
"baichuan": "π Baichuan ηΎε·", # river
|
9 |
+
"internlm": "π§βπ InternLM δΉ¦η", # scholar
|
10 |
"mistral": "βοΈ Mistral",
|
11 |
"codegen": "βΎοΈ CodeGen",
|
12 |
"chatglm": "π¬ ChatGLM",
|
|
|
15 |
"llama": "π¦ LLaMA",
|
16 |
"rwkv": "π¦ββ¬ RWKV",
|
17 |
"mpt": "𧱠MPT",
|
18 |
+
"Yi": "π« Yi δΊΊ" , # people
|
19 |
# suggest something
|
20 |
"gpt_neox": "GPT-NeoX",
|
21 |
"gpt_neo": "GPT-Neo",
|
|
|
31 |
return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
|
32 |
|
33 |
|
34 |
+
def process_arch(model_arch):
    """Return the display label for ``model_arch``, or the value itself when unmapped."""
    return LLM_MODEL_ARCHS.get(model_arch, model_arch)
|
39 |
+
|
40 |
+
|
41 |
+
def process_score(score, quantization):
    """Format ``score`` with two decimals and a one-character quantization marker.

    The marker is "*" when ``quantization`` is anything other than the string
    "None", and a single space otherwise.
    """
    marker = " " if quantization == "None" else "*"
    return f"{score:.2f}{marker}"
|
46 |
+
|
47 |
+
|
48 |
+
# def change_tab(query_param):
|
49 |
+
# query_param = query_param.replace("'", '"')
|
50 |
+
# query_param = json.loads(query_param)
|
51 |
+
|
52 |
+
# if isinstance(query_param, dict) and "tab" in query_param and query_param["tab"] == "plot":
|
53 |
+
# return gr.Tabs.update(selected=1)
|
54 |
+
# else:
|
55 |
+
# return gr.Tabs.update(selected=0)
|