IlyasMoutawwakil HF staff committed on
Commit
ab5f5f1
·
1 Parent(s): 988dbd8
app.py CHANGED
@@ -1,371 +1,98 @@
1
  import os
2
 
3
  import gradio as gr
4
- import pandas as pd
5
- import plotly.express as px
6
- from huggingface_hub.file_download import hf_hub_download
7
 
8
-
9
- from src.utils import process_model_name, process_model_arch
10
- from src.assets.css_html_js import custom_css
11
- from src.assets.text_content import (
 
 
 
 
12
  TITLE,
13
- ABOUT_TEXT,
14
- INTRODUCTION_TEXT,
15
- EXAMPLE_CONFIG_TEXT,
 
16
  CITATION_BUTTON_LABEL,
17
- CITATION_BUTTON_TEXT,
18
  )
19
 
20
- HF_TOKEN = os.environ.get("HF_TOKEN", None)
21
- LOGO_URL = "https://huggingface.co/spaces/optimum/llm-perf-leaderboard/resolve/main/huggy_bench.png"
22
- LLM_PERF_DATASET_REPO = "optimum/llm-perf-dataset"
23
- ALL_COLUMNS_MAPPING = {
24
- "Model": "Model πŸ€—",
25
- "Arch": "Arch πŸ›οΈ",
26
- "Size": "Params (B) πŸ“",
27
- # deployment settings
28
- "backend.name": "Backend 🏭",
29
- "backend.torch_dtype": "Dtype πŸ“₯",
30
- "optimization": "Optimization πŸ› οΈ",
31
- "quantization": "Quantization πŸ—œοΈ",
32
- # measurements
33
- "Score": "Open LLM Score (%) ⬆️",
34
- "decode.throughput(tokens/s)": "Decode Throughput (tokens/s) ⬆️",
35
- "generate.throughput(tokens/s)": "E2E Throughput (tokens/s) ⬆️",
36
- "forward.latency(s)": "Prefill Latency (s) ⬇️",
37
- "generate.latency(s)": "E2E Latency (s) ⬇️",
38
- "generate.max_memory_allocated(MB)": "Allocated Memory (MB) ⬇️",
39
- "generate.max_memory_reserved(MB)": "Reserved Memory (MB) ⬇️",
40
- "generate.max_memory_used(MB)": "Used Memory (MB) ⬇️",
41
- "generate.energy_consumption(tokens/kWh)": "Energy (tokens/kWh) ⬇️",
42
- }
43
- SORTING_COLUMN = ["Score", "generate.throughput(tokens/s)"]
44
- SORTING_ASCENDING = [False, False]
45
- ALL_COLUMNS_DATATYPES = [
46
- # open llm
47
- "markdown",
48
- "markdown",
49
- "number",
50
- # deployment settings
51
- "str",
52
- "str",
53
- "str",
54
- "str",
55
- # measurements
56
- "number",
57
- "number",
58
- "number",
59
- "number",
60
- "number",
61
- "number",
62
- "number",
63
- "number",
64
- "number",
65
- "number",
66
- ]
67
- # download data
68
- hf_hub_download(
69
- repo_id="optimum/llm-perf-dataset",
70
- filename="open-llm.csv",
71
- local_dir="dataset",
72
- repo_type="dataset",
73
- token=HF_TOKEN,
74
- )
75
- OPEN_LLM_DF = pd.read_csv("dataset/open-llm.csv")
76
 
 
77
  MACHINE_TO_HARDWARE = {"hf-dgx-01": "A100-80GB πŸ–₯️"}
78
- MACHINE_TO_PERF = {}
79
- for machine in MACHINE_TO_HARDWARE:
80
- hf_hub_download(
81
- repo_id="optimum/llm-perf-dataset",
82
- filename=f"{machine}/perf-report.csv",
83
- local_dir="dataset",
84
- repo_type="dataset",
85
- token=HF_TOKEN,
86
- )
87
- MACHINE_TO_PERF[machine] = pd.read_csv(f"dataset/{machine}/perf-report.csv")
88
-
89
-
90
- def get_benchmark_df(machine="hf-dgx-01"):
91
- # merge on model
92
- machine_perf_df = MACHINE_TO_PERF[machine].copy()
93
- merged_df = OPEN_LLM_DF.merge(machine_perf_df, left_on="Model", right_on="model")
94
- # transpose energy consumption
95
- merged_df["generate.energy_consumption(tokens/kWh)"] = (
96
- 1 / merged_df["generate.energy_consumption(kWh/token)"].fillna(1)
97
- ).astype(int)
98
- # fix nan values
99
- merged_df.loc[
100
- merged_df["generate.energy_consumption(tokens/kWh)"] == 1,
101
- "generate.energy_consumption(tokens/kWh)",
102
- ] = pd.NA
103
- # add optimization column
104
- merged_df["optimization"] = merged_df[
105
- ["backend.to_bettertransformer", "backend.use_flash_attention_2"]
106
- ].apply(
107
- lambda x: "BetterTransformer"
108
- if x["backend.to_bettertransformer"]
109
- else ("FlashAttentionV2" if x["backend.use_flash_attention_2"] else "None"),
110
- axis=1,
111
- )
112
- # add quantization scheme
113
- merged_df["quantization"] = merged_df[
114
- ["backend.quantization_scheme", "backend.quantization_config.exllama_config.version"]
115
- ].apply(
116
- lambda x: "BnB.4bit"
117
- if x["backend.quantization_scheme"] == "bnb"
118
- else (
119
- "GPTQ.4bit+ExllamaV1"
120
- if (x["backend.quantization_scheme"] == "gptq")
121
- and (x["backend.quantization_config.exllama_config.version"] == 1)
122
- else (
123
- "GPTQ.4bit+ExllamaV2"
124
- if (x["backend.quantization_scheme"] == "gptq")
125
- and (x["backend.quantization_config.exllama_config.version"] == 2)
126
- else "None"
127
- )
128
- ),
129
- axis=1,
130
- )
131
- # add decode throughput
132
- merged_df["decode.throughput(tokens/s)"] = (
133
- 1000 / (merged_df["generate.latency(s)"] - merged_df["forward.latency(s)"])
134
- ).round(2)
135
- # sort by metric
136
- merged_df.sort_values(by=SORTING_COLUMN, ascending=SORTING_ASCENDING, inplace=True)
137
- # filter columns
138
- merged_df = merged_df[list(ALL_COLUMNS_MAPPING.keys())]
139
- # rename columns
140
- merged_df.rename(columns=ALL_COLUMNS_MAPPING, inplace=True)
141
-
142
- return merged_df
143
-
144
-
145
- def get_benchmark_table(bench_df):
146
- copy_df = bench_df.copy()
147
- # transform
148
- copy_df["Model πŸ€—"] = copy_df["Model πŸ€—"].apply(process_model_name)
149
- copy_df["Arch πŸ›οΈ"] = copy_df["Arch πŸ›οΈ"].apply(process_model_arch)
150
- # process quantization
151
- copy_df["Open LLM Score (%) ⬆️"] = copy_df.apply(
152
- lambda x: f"{x['Open LLM Score (%) ⬆️']}**"
153
- if x["Quantization πŸ—œοΈ"] in ["BnB.4bit", "GPTQ.4bit"]
154
- else x["Open LLM Score (%) ⬆️"],
155
- axis=1,
156
- )
157
- return copy_df
158
-
159
-
160
- def get_benchmark_chart(bench_df):
161
- copy_df = bench_df.copy()
162
- # transform
163
- copy_df["Arch πŸ›οΈ"] = copy_df["Arch πŸ›οΈ"].apply(process_model_arch)
164
- # plot
165
- fig = px.scatter(
166
- copy_df,
167
- y="Open LLM Score (%) ⬆️",
168
- x="E2E Latency (s) ⬇️",
169
- size="Allocated Memory (MB) ⬇️",
170
- color="Arch πŸ›οΈ",
171
- custom_data=list(ALL_COLUMNS_MAPPING.values()),
172
- color_discrete_sequence=px.colors.qualitative.Light24,
173
- )
174
- fig.update_layout(
175
- title={
176
- "text": "Latency vs. Score vs. Memory",
177
- "y": 0.95,
178
- "x": 0.5,
179
- "xanchor": "center",
180
- "yanchor": "top",
181
- },
182
- xaxis_title="Per 1000 Tokens Latency (s)",
183
- yaxis_title="Open LLM Score (%)",
184
- legend_title="LLM Architecture",
185
- width=1200,
186
- height=600,
187
- )
188
- fig.update_traces(
189
- hovertemplate="<br>".join(
190
- [
191
- f"<b>{column}:</b> %{{customdata[{i}]}}"
192
- for i, column in enumerate(ALL_COLUMNS_MAPPING.values())
193
- ]
194
- )
195
- )
196
- return fig
197
-
198
-
199
- def filter_query(
200
- text,
201
- backends,
202
- datatypes,
203
- optimizations,
204
- quantizations,
205
- score,
206
- memory,
207
- machine,
208
- ):
209
- raw_df = get_benchmark_df(machine=machine)
210
- filtered_df = raw_df[
211
- raw_df["Model πŸ€—"].str.contains(text, case=False)
212
- & raw_df["Backend 🏭"].isin(backends)
213
- & raw_df["Dtype πŸ“₯"].isin(datatypes)
214
- & raw_df["Optimization πŸ› οΈ"].isin(optimizations)
215
- & raw_df["Quantization πŸ—œοΈ"].isin(quantizations)
216
- & (raw_df["Open LLM Score (%) ⬆️"] >= score)
217
- & (raw_df["Allocated Memory (MB) ⬇️"] <= memory)
218
- ]
219
- filtered_table = get_benchmark_table(filtered_df)
220
- filtered_chart = get_benchmark_chart(filtered_df)
221
- return filtered_table, filtered_chart
222
 
223
 
224
- # Demo interface
225
  demo = gr.Blocks(css=custom_css)
226
  with demo:
227
- # logo
228
  gr.HTML(f'<img src="{LOGO_URL}">', elem_classes="logo")
229
- # leaderboard title
230
- gr.HTML(TITLE)
231
- # introduction text
232
- gr.Markdown(INTRODUCTION_TEXT, elem_classes="descriptive-text")
233
-
234
- with gr.Tabs(elem_classes="leaderboard-tabs"):
235
- machine_placeholders = {}
236
- machine_tables = {}
237
- machine_plots = {}
238
- ####################### HARDWARE TABS #######################
239
- for i, (machine, hardware) in enumerate(MACHINE_TO_HARDWARE.items()):
240
- # dummy placeholder of the machine name
241
- machine_placeholders[machine] = gr.Textbox(value=machine, visible=False)
242
-
243
- with gr.TabItem(hardware, id=i):
244
- with gr.Tabs(elem_classes="machine-tabs"):
245
- # placeholder for full dataframe
246
- machine_df = get_benchmark_df(machine=machine)
 
 
 
247
  with gr.TabItem("Leaderboard πŸ…", id=0):
248
- gr.HTML(
249
- "πŸ‘‰ Scroll to the right πŸ‘‰ for additional columns.",
250
- elem_id="descriptive-text",
251
- )
252
- # Original leaderboard table
253
- machine_tables[machine] = gr.components.Dataframe(
254
- value=get_benchmark_table(machine_df),
255
- headers=list(ALL_COLUMNS_MAPPING.values()),
256
- datatype=ALL_COLUMNS_DATATYPES,
257
- elem_id="machine-table",
258
- )
259
- with gr.TabItem("Plot πŸ“Š", id=1):
260
- gr.HTML(
261
- "πŸ‘† Hover over the points πŸ‘† for additional information.",
262
- elem_id="descriptive-text",
263
- )
264
- # Original leaderboard plot
265
- machine_plots[machine] = gr.components.Plot(
266
- value=get_benchmark_chart(machine_df),
267
- elem_id="machine-plot",
268
- show_label=False,
269
- )
270
-
271
- ###################### CONTROL PANEL #######################
272
- with gr.TabItem("Control Panel πŸŽ›οΈ", id=2):
273
- gr.HTML(
274
- "Use this control panel to filter the leaderboard's table and plot.", # noqa: E501
275
- elem_id="descriptive-text",
276
- )
277
- with gr.Row():
278
- with gr.Column():
279
- search_bar = gr.Textbox(
280
- label="Model πŸ€—",
281
- info="πŸ” Search for a model name",
282
- elem_id="search-bar",
283
- )
284
- with gr.Row():
285
- with gr.Column(scale=1):
286
- score_slider = gr.Slider(
287
- label="Open LLM Score (%) πŸ“ˆ",
288
- info="🎚️ Slide to minimum Open LLM score",
289
- value=0,
290
- elem_id="threshold-slider",
291
- )
292
- with gr.Column(scale=1):
293
- memory_slider = gr.Slider(
294
- label="Peak Memory (MB) πŸ“ˆ",
295
- info="🎚️ Slide to maximum Peak Memory",
296
- minimum=0,
297
- maximum=80 * 1024,
298
- value=80 * 1024,
299
- elem_id="memory-slider",
300
- )
301
- with gr.Column(scale=1):
302
- backend_checkboxes = gr.CheckboxGroup(
303
- label="Backends 🏭",
304
- choices=["pytorch", "onnxruntime"],
305
- value=["pytorch", "onnxruntime"],
306
- info="β˜‘οΈ Select the backends",
307
- elem_id="backend-checkboxes",
308
- )
309
- with gr.Row():
310
- with gr.Column(scale=1):
311
- datatype_checkboxes = gr.CheckboxGroup(
312
- label="Load Dtypes πŸ“₯",
313
- choices=["float32", "float16"],
314
- value=["float32", "float16"],
315
- info="β˜‘οΈ Select the load dtypes",
316
- elem_id="dtype-checkboxes",
317
- )
318
- with gr.Column(scale=1):
319
- optimization_checkboxes = gr.CheckboxGroup(
320
- label="Optimizations πŸ› οΈ",
321
- choices=["None", "BetterTransformer", "FlashAttentionV2"],
322
- value=["None", "BetterTransformer", "FlashAttentionV2"],
323
- info="β˜‘οΈ Select the optimization",
324
- elem_id="optimization-checkboxes",
325
- )
326
- with gr.Column(scale=1):
327
- quantization_checkboxes = gr.CheckboxGroup(
328
- label="Quantizations πŸ—œοΈ",
329
- choices=["None", "BnB.4bit", "GPTQ.4bit"],
330
- value=["None", "BnB.4bit", "GPTQ.4bit"],
331
- info="β˜‘οΈ Select the quantization schemes",
332
- elem_id="quantization-checkboxes",
333
- )
334
- with gr.Row():
335
- filter_button = gr.Button(
336
- value="Filter πŸš€",
337
- elem_id="filter-button",
338
- )
339
- for machine in MACHINE_TO_HARDWARE:
340
- filter_button.click(
341
- filter_query,
342
- [
343
- search_bar,
344
- backend_checkboxes,
345
- datatype_checkboxes,
346
- optimization_checkboxes,
347
- quantization_checkboxes,
348
- score_slider,
349
- memory_slider,
350
- machine_placeholders[machine],
351
- ],
352
- [machine_tables[machine], machine_plots[machine]],
353
  )
354
-
355
  ####################### ABOUT TAB #######################
356
  with gr.TabItem("About πŸ“–", id=3):
357
- gr.HTML(ABOUT_TEXT, elem_classes="descriptive-text")
358
- gr.Markdown(EXAMPLE_CONFIG_TEXT, elem_classes="descriptive-text")
359
-
360
- ####################### CITATION #######################
361
  with gr.Row():
362
  with gr.Accordion("πŸ“™ Citation", open=False):
363
  citation_button = gr.Textbox(
364
- value=CITATION_BUTTON_TEXT,
365
  label=CITATION_BUTTON_LABEL,
366
  elem_id="citation-button",
367
  show_copy_button=True,
368
  )
369
 
370
- # Launch demo
371
- demo.queue().launch()
 
 
1
  import os
2
 
3
  import gradio as gr
 
 
 
4
 
5
+ from src.control_panel import create_control_panel, create_control_callback
6
+ from src.latency_score_memory import create_lat_score_mem_plot
7
+ from src.leaderboard import create_leaderboard_table
8
+ from src.flashattentionv2 import create_fa2_plots
9
+ from src.bettertransformer import create_bt_plots
10
+ from src.llm_perf import get_llm_perf_df
11
+ from src.assets import custom_css
12
+ from src.text import (
13
  TITLE,
14
+ ABOUT,
15
+ INTRODUCTION,
16
+ EXAMPLE_CONFIG,
17
+ CITATION_BUTTON,
18
  CITATION_BUTTON_LABEL,
 
19
  )
20
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
 
22
+ LOGO_URL = "https://huggingface.co/spaces/optimum/llm-perf-leaderboard/resolve/main/logo.png"
23
  MACHINE_TO_HARDWARE = {"hf-dgx-01": "A100-80GB πŸ–₯️"}
24
+ HF_TOKEN = os.environ.get("HF_TOKEN", None)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
 
26
 
 
27
  demo = gr.Blocks(css=custom_css)
28
  with demo:
29
+ gr.HTML(TITLE, elem_classes="title")
30
  gr.HTML(f'<img src="{LOGO_URL}">', elem_classes="logo")
31
+ gr.Markdown(INTRODUCTION, elem_classes="descriptive-text")
32
+ ####################### HARDWARE TABS #######################
33
+ with gr.Tabs(elem_classes="tabs"):
34
+ for id, (machine, hardware) in enumerate(MACHINE_TO_HARDWARE.items()):
35
+ with gr.TabItem(hardware, id=id):
36
+ ####################### CONTROL PANEL #######################
37
+ (
38
+ filter_button,
39
+ machine_textbox,
40
+ search_bar,
41
+ score_slider,
42
+ memory_slider,
43
+ backend_checkboxes,
44
+ datatype_checkboxes,
45
+ optimization_checkboxes,
46
+ quantization_checkboxes,
47
+ ) = create_control_panel()
48
+ ####################### HARDWARE SUBTABS #######################
49
+ with gr.Tabs(elem_classes="subtabs"):
50
+ llm_perf_df = get_llm_perf_df(machine=machine)
51
+ ####################### LEADERBOARD TAB #######################
52
  with gr.TabItem("Leaderboard πŸ…", id=0):
53
+ leaderboard_table = create_leaderboard_table(llm_perf_df)
54
+ ####################### LAT. vs. SCORE vs. MEM. TAB #######################
55
+ with gr.TabItem("Latency vs. Score vs. Memory πŸ“Š", id=1):
56
+ lat_score_mem_plot = create_lat_score_mem_plot(llm_perf_df)
57
+ ####################### BETTERTRANSFORMER SPEEDUP TAB #######################
58
+ with gr.TabItem("BetterTransformer Speedup πŸ“ˆ", id=2):
59
+ bt_prefill_plot, bt_decode_plot = create_bt_plots(llm_perf_df)
60
+ with gr.TabItem("FlashAttentionV2 Speedup πŸ“ˆ", id=3):
61
+ fa2_prefill_plot, fa2_decode_plot = create_fa2_plots(llm_perf_df)
62
+ ####################### CONTROL CALLBACK #######################
63
+ create_control_callback(
64
+ filter_button,
65
+ # inputs
66
+ machine_textbox,
67
+ search_bar,
68
+ score_slider,
69
+ memory_slider,
70
+ backend_checkboxes,
71
+ datatype_checkboxes,
72
+ optimization_checkboxes,
73
+ quantization_checkboxes,
74
+ # outputs
75
+ leaderboard_table,
76
+ lat_score_mem_plot,
77
+ bt_prefill_plot,
78
+ bt_decode_plot,
79
+ fa2_prefill_plot,
80
+ fa2_decode_plot,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81
  )
 
82
  ####################### ABOUT TAB #######################
83
  with gr.TabItem("About πŸ“–", id=3):
84
+ gr.HTML(ABOUT, elem_classes="descriptive-text")
85
+ gr.Markdown(EXAMPLE_CONFIG, elem_classes="descriptive-text")
86
+ ####################### CITATION
 
87
  with gr.Row():
88
  with gr.Accordion("πŸ“™ Citation", open=False):
89
  citation_button = gr.Textbox(
90
+ value=CITATION_BUTTON,
91
  label=CITATION_BUTTON_LABEL,
92
  elem_id="citation-button",
93
  show_copy_button=True,
94
  )
95
 
96
+ if __name__ == "__main__":
97
+ # Launch demo
98
+ demo.queue().launch()
huggy_bench.png → logo.png RENAMED
File without changes
pyproject.toml ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2021 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ [tool.black]
16
+ line-length = 119
17
+ target-version = ['py37']
18
+
19
+ [tool.ruff]
20
+ ignore = ["E501", "C901"]
21
+ select = ["C", "E", "F", "I", "W"]
script.py DELETED
@@ -1,14 +0,0 @@
1
- from huggingface_hub import hf_hub_download
2
- import pandas as pd
3
-
4
-
5
- hf_hub_download(
6
- repo_id="optimum/llm-perf-dataset",
7
- filename="open-llm.csv",
8
- local_dir="dataset",
9
- repo_type="dataset",
10
- )
11
-
12
- open_llm = pd.read_csv("dataset/open-llm.csv")
13
- print(open_llm["Arch"].unique())
14
- print(open_llm[open_llm["Arch"] == "rwkv"]["Model"].unique())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/{assets/css_html_js.py → assets.py} RENAMED
@@ -6,14 +6,14 @@ custom_css = """
6
  max-width: 100%
7
  object-fit: contain;
8
  }
9
- .descriptive-text {
10
  font-size: 16px !important;
11
  }
12
 
13
- .leaderboard-tabs button {
14
  font-size: 20px;
15
  }
16
- .hardware-tabs button {
17
  font-size: 20px;
18
  }
19
 
 
6
  max-width: 100%
7
  object-fit: contain;
8
  }
9
+ .text {
10
  font-size: 16px !important;
11
  }
12
 
13
+ .tabs button {
14
  font-size: 20px;
15
  }
16
+ .subtabs button {
17
  font-size: 20px;
18
  }
19
 
src/bettertransformer.py ADDED
@@ -0,0 +1,148 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pandas as pd
3
+ import plotly.express as px
4
+
5
+
6
+ from src.utils import process_arch
7
+
8
+
9
+ BETTERTRANSFORMER_DATA = [
10
+ # open llm
11
+ "Model πŸ€—",
12
+ "Arch πŸ›οΈ",
13
+ "DType πŸ“₯",
14
+ "Backend 🏭",
15
+ "Params (B)",
16
+ "Open LLM Score (%)",
17
+ # deployment settings
18
+ "DType πŸ“₯",
19
+ "Backend 🏭",
20
+ "Quantization πŸ—œοΈ",
21
+ # primary measurements
22
+ "Prefill Latency (s)",
23
+ "Prefill Latency (s) BetterTransformer",
24
+ "Decode Throughput (tokens/s)",
25
+ "Decode Throughput (tokens/s) BetterTransformer",
26
+ "E2E Throughput (tokens/s)",
27
+ "E2E Throughput (tokens/s) BetterTransformer",
28
+ # speedups
29
+ "Prefill Latency Speedup (%)",
30
+ "Decode Throughput Speedup (%)",
31
+ ]
32
+
33
+
34
+ def get_bt_df(llm_perf_df):
35
+ bt_df = llm_perf_df.copy()
36
+ # process
37
+ bt_df["Arch πŸ›οΈ"] = bt_df["Arch πŸ›οΈ"].apply(process_arch)
38
+ # separate original model experiments from BetterTransformer experiments
39
+ original_df = bt_df[bt_df["Optimization πŸ› οΈ"] == "None"]
40
+ bt_df = bt_df[bt_df["Optimization πŸ› οΈ"] == "BetterTransformer"]
41
+ # merge the two dataframes
42
+ bt_df = pd.merge(
43
+ original_df,
44
+ bt_df,
45
+ on=["Model πŸ€—", "Quantization πŸ—œοΈ"],
46
+ suffixes=["", " BetterTransformer"],
47
+ )
48
+ # compute speedups
49
+ bt_df["Prefill Latency Speedup (%)"] = (
50
+ (bt_df["Prefill Latency (s)"] / bt_df["Prefill Latency (s) BetterTransformer"]) * 100
51
+ ).round(2)
52
+ bt_df["Decode Throughput Speedup (%)"] = (
53
+ (bt_df["Decode Throughput (tokens/s) BetterTransformer"] / bt_df["Decode Throughput (tokens/s)"]) * 100
54
+ ).round(2)
55
+
56
+ # filter speedups > 1000%
57
+ bt_df = bt_df[bt_df["Prefill Latency Speedup (%)"] < 1000]
58
+ bt_df = bt_df[bt_df["Decode Throughput Speedup (%)"] < 1000]
59
+
60
+ return bt_df
61
+
62
+
63
+ def get_bt_decode_fig(llm_perf_df):
64
+ bt_df = get_bt_df(llm_perf_df)
65
+ # plot
66
+ decode_fig = px.box(
67
+ bt_df,
68
+ x="Arch πŸ›οΈ",
69
+ y="Decode Throughput Speedup (%)",
70
+ color_discrete_sequence=px.colors.qualitative.Light24,
71
+ custom_data=BETTERTRANSFORMER_DATA,
72
+ color="Quantization πŸ—œοΈ",
73
+ points="all",
74
+ )
75
+ # add hover data
76
+ decode_fig.update_traces(
77
+ hovertemplate="<br>".join(
78
+ [f"<b>{column}:</b> %{{customdata[{i}]}}" for i, column in enumerate(BETTERTRANSFORMER_DATA)]
79
+ )
80
+ )
81
+ # add layout
82
+ decode_fig.update_layout(
83
+ title={
84
+ "text": "Decode Throughput Speedup per Architecture",
85
+ "y": 0.95,
86
+ "x": 0.5,
87
+ "xanchor": "center",
88
+ "yanchor": "top",
89
+ },
90
+ xaxis_title="LLM Architecture",
91
+ yaxis_title="Decode Speedup (%)",
92
+ legend_title="Quantization Scheme",
93
+ width=1200,
94
+ height=600,
95
+ )
96
+
97
+ return decode_fig
98
+
99
+
100
+ def get_bt_prefill_fig(llm_perf_df):
101
+ bt_df = get_bt_df(llm_perf_df)
102
+ # plot
103
+ prefill_fig = px.box(
104
+ bt_df,
105
+ x="Arch πŸ›οΈ",
106
+ y="Prefill Latency Speedup (%)",
107
+ color_discrete_sequence=px.colors.qualitative.Light24,
108
+ custom_data=BETTERTRANSFORMER_DATA,
109
+ color="Quantization πŸ—œοΈ",
110
+ points="all",
111
+ )
112
+ # add hover data
113
+ prefill_fig.update_traces(
114
+ hovertemplate="<br>".join(
115
+ [f"<b>{column}:</b> %{{customdata[{i}]}}" for i, column in enumerate(BETTERTRANSFORMER_DATA)]
116
+ )
117
+ )
118
+ # add layout
119
+ prefill_fig.update_layout(
120
+ title={
121
+ "text": "Prefill Latency Speedup per Architecture",
122
+ "y": 0.95,
123
+ "x": 0.5,
124
+ "xanchor": "center",
125
+ "yanchor": "top",
126
+ },
127
+ xaxis_title="LLM Architecture",
128
+ yaxis_title="Prefill Speedup (%)",
129
+ legend_title="Quantization Scheme",
130
+ width=1200,
131
+ height=600,
132
+ )
133
+
134
+ return prefill_fig
135
+
136
+
137
+ def create_bt_plots(llm_perf_df):
138
+ # descriptive text
139
+ gr.HTML("πŸ‘† Hover over the points πŸ‘† for additional information.", elem_id="text")
140
+ # get figures
141
+ prefill_fig = get_bt_prefill_fig(llm_perf_df)
142
+ decode_fig = get_bt_decode_fig(llm_perf_df)
143
+
144
+ # create plots
145
+ prefill_plot = gr.components.Plot(value=prefill_fig, elem_id="plot", show_label=False)
146
+ decode_plot = gr.components.Plot(value=decode_fig, elem_id="plot", show_label=False)
147
+
148
+ return prefill_plot, decode_plot
src/control_panel.py ADDED
@@ -0,0 +1,168 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+
3
+ from src.llm_perf import get_llm_perf_df
4
+ from src.leaderboard import get_leaderboard_df
5
+ from src.latency_score_memory import get_lat_score_mem_fig
6
+ from src.bettertransformer import get_bt_prefill_fig, get_bt_decode_fig
7
+ from src.flashattentionv2 import get_fa2_prefill_fig, get_fa2_decode_fig
8
+
9
+
10
+ def create_control_panel(machine: str = "hf-dgx-01"):
11
+ # descriptive text
12
+ gr.HTML("Use this control panel to filter this leaderboard.", elem_id="text")
13
+ # controls
14
+ machine_textbox = gr.Textbox(value=machine, visible=False)
15
+ with gr.Row():
16
+ with gr.Column():
17
+ search_bar = gr.Textbox(
18
+ label="Model πŸ€—",
19
+ info="πŸ” Search for a model name",
20
+ elem_id="search-bar",
21
+ )
22
+ with gr.Row():
23
+ with gr.Column(scale=1):
24
+ score_slider = gr.Slider(
25
+ label="Open LLM Score (%) πŸ“ˆ",
26
+ info="🎚️ Slide to minimum Open LLM score",
27
+ value=0,
28
+ elem_id="threshold-slider",
29
+ )
30
+ with gr.Column(scale=1):
31
+ memory_slider = gr.Slider(
32
+ label="Peak Memory (MB) πŸ“ˆ",
33
+ info="🎚️ Slide to maximum Peak Memory",
34
+ minimum=0,
35
+ maximum=80 * 1024,
36
+ value=80 * 1024,
37
+ elem_id="memory-slider",
38
+ )
39
+ with gr.Column(scale=1):
40
+ backend_checkboxes = gr.CheckboxGroup(
41
+ label="Backends 🏭",
42
+ choices=["pytorch", "onnxruntime"],
43
+ value=["pytorch", "onnxruntime"],
44
+ info="β˜‘οΈ Select the backends",
45
+ elem_id="backend-checkboxes",
46
+ )
47
+ with gr.Row():
48
+ with gr.Column(scale=1):
49
+ datatype_checkboxes = gr.CheckboxGroup(
50
+ label="DTypes πŸ“₯",
51
+ choices=["float32", "float16"],
52
+ value=["float32", "float16"],
53
+ info="β˜‘οΈ Select the load data types",
54
+ elem_id="dtype-checkboxes",
55
+ )
56
+ with gr.Column(scale=1):
57
+ optimization_checkboxes = gr.CheckboxGroup(
58
+ label="Optimizations πŸ› οΈ",
59
+ choices=["None", "BetterTransformer", "FlashAttentionV2"],
60
+ value=["None", "BetterTransformer", "FlashAttentionV2"],
61
+ info="β˜‘οΈ Select the optimization",
62
+ elem_id="optimization-checkboxes",
63
+ )
64
+ with gr.Column(scale=1):
65
+ quantization_checkboxes = gr.CheckboxGroup(
66
+ label="Quantizations πŸ—œοΈ",
67
+ choices=["None", "BnB.4bit", "GPTQ.4bit+ExllamaV1", "GPTQ.4bit+ExllamaV2"],
68
+ value=["None", "BnB.4bit", "GPTQ.4bit", "GPTQ.4bit+ExllamaV1", "GPTQ.4bit+ExllamaV2"],
69
+ info="β˜‘οΈ Select the quantization schemes",
70
+ elem_id="quantization-checkboxes",
71
+ )
72
+ with gr.Row():
73
+ filter_button = gr.Button(
74
+ value="Filter πŸš€",
75
+ elem_id="filter-button",
76
+ )
77
+
78
+ return (
79
+ filter_button,
80
+ machine_textbox,
81
+ search_bar,
82
+ score_slider,
83
+ memory_slider,
84
+ backend_checkboxes,
85
+ datatype_checkboxes,
86
+ optimization_checkboxes,
87
+ quantization_checkboxes,
88
+ )
89
+
90
+
91
+ def filter_fn(
92
+ machine,
93
+ model,
94
+ backends,
95
+ datatypes,
96
+ optimizations,
97
+ quantizations,
98
+ score,
99
+ memory,
100
+ ):
101
+ raw_df = get_llm_perf_df(machine=machine)
102
+ filtered_df = raw_df[
103
+ raw_df["Model πŸ€—"].str.contains(model, case=False)
104
+ & raw_df["Backend 🏭"].isin(backends)
105
+ & raw_df["DType πŸ“₯"].isin(datatypes)
106
+ & raw_df["Optimization πŸ› οΈ"].isin(optimizations)
107
+ & raw_df["Quantization πŸ—œοΈ"].isin(quantizations)
108
+ & (raw_df["Open LLM Score (%)"] >= score)
109
+ & (raw_df["Allocated Memory (MB)"] <= memory)
110
+ ]
111
+ filtered_leaderboard_df = get_leaderboard_df(filtered_df)
112
+ filtered_lat_score_mem_fig = get_lat_score_mem_fig(filtered_df)
113
+ filtered_bt_prefill_fig = get_bt_prefill_fig(filtered_df)
114
+ filtered_bt_decode_fig = get_bt_decode_fig(filtered_df)
115
+ filtered_fa2_prefill_fig = get_fa2_prefill_fig(filtered_df)
116
+ filtered_fa2_decode_fig = get_fa2_decode_fig(filtered_df)
117
+
118
+ return [
119
+ filtered_leaderboard_df,
120
+ filtered_lat_score_mem_fig,
121
+ filtered_bt_prefill_fig,
122
+ filtered_bt_decode_fig,
123
+ filtered_fa2_prefill_fig,
124
+ filtered_fa2_decode_fig,
125
+ ]
126
+
127
+
128
+ def create_control_callback(
129
+ # button
130
+ filter_button,
131
+ # inputs
132
+ machine_textbox,
133
+ search_bar,
134
+ score_slider,
135
+ memory_slider,
136
+ backend_checkboxes,
137
+ datatype_checkboxes,
138
+ optimization_checkboxes,
139
+ quantization_checkboxes,
140
+ # outputs
141
+ leaderboard_table,
142
+ lat_score_mem_plot,
143
+ bt_prefill_plot,
144
+ bt_decode_plot,
145
+ fa2_prefill_plot,
146
+ fa2_decode_plot,
147
+ ):
148
+ filter_button.click(
149
+ fn=filter_fn,
150
+ inputs=[
151
+ machine_textbox,
152
+ search_bar,
153
+ backend_checkboxes,
154
+ datatype_checkboxes,
155
+ optimization_checkboxes,
156
+ quantization_checkboxes,
157
+ score_slider,
158
+ memory_slider,
159
+ ],
160
+ outputs=[
161
+ leaderboard_table,
162
+ lat_score_mem_plot,
163
+ bt_prefill_plot,
164
+ bt_decode_plot,
165
+ fa2_prefill_plot,
166
+ fa2_decode_plot,
167
+ ],
168
+ )
src/flashattentionv2.py ADDED
@@ -0,0 +1,148 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pandas as pd
3
+ import plotly.express as px
4
+
5
+
6
+ from src.utils import process_arch
7
+
8
+
9
+ FLASHATTENTIONV2_DATA = [
10
+ # open llm
11
+ "Model πŸ€—",
12
+ "Arch πŸ›οΈ",
13
+ "DType πŸ“₯",
14
+ "Backend 🏭",
15
+ "Params (B)",
16
+ "Open LLM Score (%)",
17
+ # deployment settings
18
+ "DType πŸ“₯",
19
+ "Backend 🏭",
20
+ "Quantization πŸ—œοΈ",
21
+ # primary measurements
22
+ "Prefill Latency (s)",
23
+ "Prefill Latency (s) FlashAttentionV2",
24
+ "Decode Throughput (tokens/s)",
25
+ "Decode Throughput (tokens/s) FlashAttentionV2",
26
+ "E2E Throughput (tokens/s)",
27
+ "E2E Throughput (tokens/s) FlashAttentionV2",
28
+ # speedups
29
+ "Prefill Latency Speedup (%)",
30
+ "Decode Throughput Speedup (%)",
31
+ ]
32
+
33
+
34
+ def get_fa2_df(llm_perf_df):
35
+ fa2_df = llm_perf_df.copy()
36
+ # process
37
+ fa2_df["Arch πŸ›οΈ"] = fa2_df["Arch πŸ›οΈ"].apply(process_arch)
38
+ # separate original model experiments from FlashAttentionV2 experiments
39
+ original_df = fa2_df[fa2_df["Optimization πŸ› οΈ"] == "None"]
40
+ fa2_df = fa2_df[fa2_df["Optimization πŸ› οΈ"] == "FlashAttentionV2"]
41
+ # merge the two dataframes
42
+ fa2_df = pd.merge(
43
+ original_df,
44
+ fa2_df,
45
+ on=["Model πŸ€—", "Quantization πŸ—œοΈ"],
46
+ suffixes=["", " FlashAttentionV2"],
47
+ )
48
+ # compute speedups
49
+ fa2_df["Prefill Latency Speedup (%)"] = (
50
+ (fa2_df["Prefill Latency (s)"] / fa2_df["Prefill Latency (s) FlashAttentionV2"]) * 100
51
+ ).round(2)
52
+ fa2_df["Decode Throughput Speedup (%)"] = (
53
+ (fa2_df["Decode Throughput (tokens/s) FlashAttentionV2"] / fa2_df["Decode Throughput (tokens/s)"]) * 100
54
+ ).round(2)
55
+
56
+ # filter speedups > 1000%
57
+ fa2_df = fa2_df[fa2_df["Prefill Latency Speedup (%)"] < 1000]
58
+ fa2_df = fa2_df[fa2_df["Decode Throughput Speedup (%)"] < 1000]
59
+
60
+ return fa2_df
61
+
62
+
63
+ def get_fa2_decode_fig(llm_perf_df):
64
+ fa2_df = get_fa2_df(llm_perf_df)
65
+ # plot
66
+ decode_fig = px.box(
67
+ fa2_df,
68
+ x="Arch πŸ›οΈ",
69
+ y="Decode Throughput Speedup (%)",
70
+ color_discrete_sequence=px.colors.qualitative.Light24,
71
+ custom_data=FLASHATTENTIONV2_DATA,
72
+ color="Quantization πŸ—œοΈ",
73
+ points="all",
74
+ )
75
+ # add hover data
76
+ decode_fig.update_traces(
77
+ hovertemplate="<br>".join(
78
+ [f"<b>{column}:</b> %{{customdata[{i}]}}" for i, column in enumerate(FLASHATTENTIONV2_DATA)]
79
+ )
80
+ )
81
+ # add layout
82
+ decode_fig.update_layout(
83
+ title={
84
+ "text": "Decode Throughput Speedup per Architecture",
85
+ "y": 0.95,
86
+ "x": 0.5,
87
+ "xanchor": "center",
88
+ "yanchor": "top",
89
+ },
90
+ xaxis_title="LLM Architecture",
91
+ yaxis_title="Decode Speedup (%)",
92
+ legend_title="Quantization Scheme",
93
+ width=1200,
94
+ height=600,
95
+ )
96
+
97
+ return decode_fig
98
+
99
+
100
+ def get_fa2_prefill_fig(llm_perf_df):
101
+ fa2_df = get_fa2_df(llm_perf_df)
102
+ # plot
103
+ prefill_fig = px.box(
104
+ fa2_df,
105
+ x="Arch πŸ›οΈ",
106
+ y="Prefill Latency Speedup (%)",
107
+ color_discrete_sequence=px.colors.qualitative.Light24,
108
+ custom_data=FLASHATTENTIONV2_DATA,
109
+ color="Quantization πŸ—œοΈ",
110
+ points="all",
111
+ )
112
+ # add hover data
113
+ prefill_fig.update_traces(
114
+ hovertemplate="<br>".join(
115
+ [f"<b>{column}:</b> %{{customdata[{i}]}}" for i, column in enumerate(FLASHATTENTIONV2_DATA)]
116
+ )
117
+ )
118
+ # add layout
119
+ prefill_fig.update_layout(
120
+ title={
121
+ "text": "Prefill Latency Speedup per Architecture",
122
+ "y": 0.95,
123
+ "x": 0.5,
124
+ "xanchor": "center",
125
+ "yanchor": "top",
126
+ },
127
+ xaxis_title="LLM Architecture",
128
+ yaxis_title="Prefill Speedup (%)",
129
+ legend_title="Quantization Scheme",
130
+ width=1200,
131
+ height=600,
132
+ )
133
+
134
+ return prefill_fig
135
+
136
+
137
+ def create_fa2_plots(llm_perf_df):
138
+ # descriptive text
139
+ gr.HTML("πŸ‘† Hover over the points πŸ‘† for additional information.", elem_id="text")
140
+ # get figures
141
+ prefill_fig = get_fa2_prefill_fig(llm_perf_df)
142
+ decode_fig = get_fa2_decode_fig(llm_perf_df)
143
+
144
+ # create plots
145
+ prefill_plot = gr.components.Plot(value=prefill_fig, elem_id="plot", show_label=False)
146
+ decode_plot = gr.components.Plot(value=decode_fig, elem_id="plot", show_label=False)
147
+
148
+ return prefill_plot, decode_plot
src/latency_score_memory.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import plotly.express as px
3
+
4
+
5
# Columns attached to every scatter point as custom data and listed, in this
# order, in the hover tooltip of the latency/score/memory plot.
SCORE_MEMORY_LATENCY_DATA = [
    # model identity
    "Model πŸ€—",
    "Arch πŸ›οΈ",
    "Params (B)",
    # deployment settings
    "DType πŸ“₯",
    "Backend 🏭",
    # measurements
    "Open LLM Score (%)",
    "Prefill Latency (s)",
    "Decode Throughput (tokens/s)",
    "Allocated Memory (MB)",
    "E2E Latency (s)",
    "E2E Throughput (tokens/s)",
]
18
+
19
+
20
def get_lat_score_mem_fig(llm_perf_df):
    """Build the latency vs. score vs. memory scatter figure.

    Works on a copy of ``llm_perf_df``. Points are placed by
    "E2E Latency (s)" (x) and "Open LLM Score (%)" (y), sized by
    "Allocated Memory (MB)" and coloured by architecture. The columns in
    ``SCORE_MEMORY_LATENCY_DATA`` are attached as custom data and shown in
    the hover tooltip.
    """
    plot_df = llm_perf_df.copy()

    # One "<b>column:</b> value" tooltip line per custom-data column.
    hover_lines = [
        f"<b>{label}:</b> %{{customdata[{idx}]}}"
        for idx, label in enumerate(SCORE_MEMORY_LATENCY_DATA)
    ]
    title_spec = {
        "text": "Latency vs. Score vs. Memory",
        "y": 0.95,
        "x": 0.5,
        "xanchor": "center",
        "yanchor": "top",
    }

    scatter = px.scatter(
        plot_df,
        x="E2E Latency (s)",
        y="Open LLM Score (%)",
        size="Allocated Memory (MB)",
        color="Arch πŸ›οΈ",
        custom_data=SCORE_MEMORY_LATENCY_DATA,
        color_discrete_sequence=px.colors.qualitative.Light24,
    )
    scatter.update_traces(hovertemplate="<br>".join(hover_lines))
    scatter.update_layout(
        title=title_spec,
        xaxis_title="Per 1000 Tokens Latency (s)",
        yaxis_title="Open LLM Score (%)",
        legend_title="LLM Architecture",
        width=1200,
        height=600,
    )

    return scatter
53
+
54
+
55
def create_lat_score_mem_plot(llm_perf_df):
    """Render the hint text and the latency/score/memory plot component.

    Returns:
        The ``gr.components.Plot`` wrapping the figure built by
        ``get_lat_score_mem_fig``.
    """
    # Hint shown above the plot.
    gr.HTML("πŸ‘† Hover over the points πŸ‘† for additional information. ", elem_id="text")

    figure = get_lat_score_mem_fig(llm_perf_df)
    return gr.components.Plot(value=figure, elem_id="plot", show_label=False)
src/leaderboard.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+
3
+ from src.utils import model_hyperlink, process_score
4
+
5
+
6
# Leaderboard column header -> Gradio Dataframe datatype. Insertion order is
# significant: the keys are used as the table headers and the values as the
# matching per-column datatypes.
LEADERBOARD_COLUMN_TO_DATATYPE = {
    # open llm
    "Model πŸ€—": "markdown",
    "Arch πŸ›οΈ": "markdown",
    "Params (B)": "number",
    "Open LLM Score (%)": "number",
    # deployment settings
    "DType πŸ“₯": "str",
    "Backend 🏭": "str",
    "Optimization πŸ› οΈ": "str",
    "Quantization πŸ—œοΈ": "str",
    # primary measurements
    "Prefill Latency (s)": "number",
    "Decode Throughput (tokens/s)": "number",
    "Allocated Memory (MB)": "number",
    "Energy (tokens/kWh)": "number",
    # additional measurements
    "E2E Latency (s)": "number",
    "E2E Throughput (tokens/s)": "number",
    "Reserved Memory (MB)": "number",
    "Used Memory (MB)": "number",
}
28
+
29
+
30
def process_model(model_name):
    """Return the hyperlink markup for ``model_name`` pointing at its Hub page."""
    return model_hyperlink(f"https://huggingface.co/{model_name}", model_name)
33
+
34
+
35
def get_leaderboard_df(llm_perf_df):
    """Return a copy of ``llm_perf_df`` prepared for the leaderboard table.

    The model column is turned into hyperlink markup and the score column is
    replaced by the string produced by ``process_score`` from each row's
    score and quantization label. The input frame is left untouched.
    """

    def _format_row_score(row):
        # The displayed score depends on the row's quantization label.
        return process_score(row["Open LLM Score (%)"], row["Quantization πŸ—œοΈ"])

    leaderboard_df = llm_perf_df.copy()
    # Model names become links.
    leaderboard_df["Model πŸ€—"] = leaderboard_df["Model πŸ€—"].apply(process_model)
    # Scores become formatted strings.
    leaderboard_df["Open LLM Score (%)"] = leaderboard_df.apply(_format_row_score, axis=1)
    return leaderboard_df
45
+
46
+
47
def create_leaderboard_table(llm_perf_df):
    """Render the hint text and the leaderboard table component.

    Returns:
        The ``gr.components.Dataframe`` showing the frame produced by
        ``get_leaderboard_df``, with headers and datatypes taken from
        ``LEADERBOARD_COLUMN_TO_DATATYPE``.
    """
    # Hint shown above the table.
    gr.HTML("πŸ‘‰ Scroll to the right πŸ‘‰ for additional columns.", elem_id="text")

    table_df = get_leaderboard_df(llm_perf_df)
    column_datatypes = list(LEADERBOARD_COLUMN_TO_DATATYPE.values())
    column_headers = list(LEADERBOARD_COLUMN_TO_DATATYPE.keys())

    return gr.components.Dataframe(
        value=table_df,
        datatype=column_datatypes,
        headers=column_headers,
        elem_id="table",
    )
src/llm_perf.py ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ import pandas as pd
4
+ from huggingface_hub import hf_hub_download
5
+
6
# Hub dataset repository holding the score and benchmark CSV files.
LLM_PERF_DATASET_REPO = "optimum/llm-perf-dataset"
# Optional access token read from the environment; None when unset.
HF_TOKEN = os.environ.get("HF_TOKEN")

# Raw dataset column -> display label. The keys also select which columns are
# kept, and their order fixes the column order of the final frame.
COLUMNS_MAPPING = {
    "Model": "Model πŸ€—",
    "Arch": "Arch πŸ›οΈ",
    "Size": "Params (B)",
    "Score": "Open LLM Score (%)",
    # deployment settings
    "backend.name": "Backend 🏭",
    "backend.torch_dtype": "DType πŸ“₯",
    "optimization": "Optimization πŸ› οΈ",
    "quantization": "Quantization πŸ—œοΈ",
    # primary measurements
    "forward.latency(s)": "Prefill Latency (s)",
    "decode.throughput(tokens/s)": "Decode Throughput (tokens/s)",
    "generate.max_memory_allocated(MB)": "Allocated Memory (MB)",
    "generate.energy_consumption(tokens/kWh)": "Energy (tokens/kWh)",
    # additional measurements
    "generate.latency(s)": "E2E Latency (s)",
    "generate.throughput(tokens/s)": "E2E Throughput (tokens/s)",
    "generate.max_memory_reserved(MB)": "Reserved Memory (MB)",
    "generate.max_memory_used(MB)": "Used Memory (MB)",
}

# Sort keys (display labels) and their directions, paired by position:
# score descending, prefill latency ascending, decode throughput descending.
SORTING_COLUMNS = [
    "Open LLM Score (%)",
    "Prefill Latency (s)",
    "Decode Throughput (tokens/s)",
]
SORTING_ASCENDING = [False, True, False]
36
+
37
+
38
def get_llm_df():
    """Download ``open-llm.csv`` from the dataset repo and load it as a DataFrame."""
    download_options = {
        "repo_id": LLM_PERF_DATASET_REPO,
        "filename": "open-llm.csv",
        "local_dir": "dataset",
        "repo_type": "dataset",
        "token": HF_TOKEN,
    }
    # The download places the file under the local "dataset" directory, which
    # is where it is read back from.
    hf_hub_download(**download_options)

    return pd.read_csv("dataset/open-llm.csv")
49
+
50
+
51
def get_perf_df(machine: str = "hf-dgx-01"):
    """Download ``<machine>/perf-report.csv`` from the dataset repo and load it.

    Args:
        machine: Name of the repo sub-directory holding the report.

    Returns:
        The benchmark report as a DataFrame.
    """
    report_file = f"{machine}/perf-report.csv"
    hf_hub_download(
        repo_id=LLM_PERF_DATASET_REPO,
        filename=report_file,
        local_dir="dataset",
        repo_type="dataset",
        token=HF_TOKEN,
    )

    # Read the copy that the download placed under the local "dataset" directory.
    return pd.read_csv(f"dataset/{machine}/perf-report.csv")
62
+
63
+
64
def _optimization_label(row):
    """Return the attention-optimization label for one benchmark row."""
    if row["backend.to_bettertransformer"]:
        return "BetterTransformer"
    if row["backend.use_flash_attention_2"]:
        return "FlashAttentionV2"
    return "None"


def _quantization_label(row):
    """Return the quantization-scheme label for one benchmark row."""
    scheme = row["backend.quantization_scheme"]
    if scheme == "bnb":
        return "BnB.4bit"
    if scheme == "gptq":
        exllama_version = row["backend.quantization_config.exllama_config.version"]
        if exllama_version == 1:
            return "GPTQ.4bit+ExllamaV1"
        if exllama_version == 2:
            return "GPTQ.4bit+ExllamaV2"
    return "None"


def get_llm_perf_df(machine: str = "hf-dgx-01"):
    """Merge Open LLM scores with a machine's benchmark report for display.

    Args:
        machine: Dataset sub-directory whose ``perf-report.csv`` is used.

    Returns:
        A DataFrame restricted to the ``COLUMNS_MAPPING`` keys, renamed to
        their display labels and sorted by ``SORTING_COLUMNS`` /
        ``SORTING_ASCENDING``.

    Raises:
        AssertionError: If the merged report mixes several batch sizes,
            sequence lengths or new-token counts.
    """
    # get dataframes
    llm_df = get_llm_df()
    perf_df = get_perf_df(machine=machine)
    llm_perf_df = pd.merge(llm_df, perf_df, left_on="Model", right_on="model")

    # All rows must come from the same benchmark settings to be comparable.
    assert (
        llm_perf_df["benchmark.input_shapes.batch_size"].nunique() == 1
    ), "benchmark report mixes several batch sizes"
    assert (
        llm_perf_df["benchmark.input_shapes.sequence_length"].nunique() == 1
    ), "benchmark report mixes several sequence lengths"
    assert (
        llm_perf_df["benchmark.new_tokens"].nunique() == 1
    ), "benchmark report mixes several new-token counts"

    # Invert kWh/token into whole tokens/kWh. Missing or non-positive readings
    # stay missing (<NA>) rather than going through a sentinel value, and the
    # nullable Int64 dtype keeps the column integer-typed either way.
    energy_per_token = llm_perf_df["generate.energy_consumption(kWh/token)"]
    tokens_per_kwh = 1 / energy_per_token.where(energy_per_token > 0)
    llm_perf_df["generate.energy_consumption(tokens/kWh)"] = (tokens_per_kwh // 1).astype("Int64")

    # add optimization column
    llm_perf_df["optimization"] = llm_perf_df[
        ["backend.to_bettertransformer", "backend.use_flash_attention_2"]
    ].apply(_optimization_label, axis=1)

    # add quantization scheme
    llm_perf_df["quantization"] = llm_perf_df[
        [
            "backend.quantization_scheme",
            "backend.quantization_config.exllama_config.version",
        ]
    ].apply(_quantization_label, axis=1)

    # add decode throughput: tokens per second spent after the prefill pass
    # NOTE(review): 1000 presumably mirrors benchmark.new_tokens - confirm.
    decode_seconds = llm_perf_df["generate.latency(s)"] - llm_perf_df["forward.latency(s)"]
    llm_perf_df["decode.throughput(tokens/s)"] = (1000 / decode_seconds).round(2)

    # Filter, rename and sort as one chain that returns a new frame; in-place
    # rename/sort on a column-selected slice triggers SettingWithCopy.
    llm_perf_df = (
        llm_perf_df[list(COLUMNS_MAPPING)]
        .rename(columns=COLUMNS_MAPPING)
        .sort_values(by=SORTING_COLUMNS, ascending=SORTING_ASCENDING)
    )

    return llm_perf_df
src/{assets/text_content.py β†’ text.py} RENAMED
@@ -1,6 +1,6 @@
1
  TITLE = """<h1 align="center" id="space-title">πŸ€— LLM-Perf Leaderboard πŸ‹οΈ</h1>"""
2
 
3
- INTRODUCTION_TEXT = f"""
4
  The πŸ€— LLM-Perf Leaderboard πŸ‹οΈ aims to benchmark the performance (latency, throughput, memory & energy) of Large Language Models (LLMs) with different hardwares, backends and optimizations using [Optimum-Benchmark](https://github.com/huggingface/optimum-benchmark) and [Optimum](https://github.com/huggingface/optimum) flavors.
5
 
6
  Anyone from the community can request a model or a hardware/backend/optimization configuration for automated benchmarking:
@@ -8,7 +8,7 @@ Anyone from the community can request a model or a hardware/backend/optimization
8
  - Hardware/Backend/Optimization performance requests should be made in the [community discussions](https://huggingface.co/spaces/optimum/llm-perf-leaderboard/discussions) to assess their relevance and feasibility.
9
  """
10
 
11
- ABOUT_TEXT = """<h3>About the πŸ€— LLM-Perf Leaderboard πŸ‹οΈ</h3>
12
  <ul>
13
  <li>To avoid communication-dependent results, only one GPU is used.</li>
14
  <li>Score is the average evaluation score obtained from the <a href="https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard">πŸ€— Open LLM Leaderboard</a>.</li>
@@ -18,11 +18,26 @@ ABOUT_TEXT = """<h3>About the πŸ€— LLM-Perf Leaderboard πŸ‹οΈ</h3>
18
  </ul>
19
  """
20
 
21
- EXAMPLE_CONFIG_TEXT = """
22
  Here's an example of the configuration file used to benchmark the models with Optimum-Benchmark:
23
  ```yaml
24
  defaults:
25
- - backend: pytorch # default backend
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  - benchmark: inference # default benchmark
27
  - experiment # inheriting from experiment config
28
  - _self_ # for hydra 1.1 compatibility
@@ -31,39 +46,38 @@ defaults:
31
 
32
  hydra:
33
  run:
34
- dir: llm-experiments/{experiment_name}
35
  job:
36
  chdir: true
 
 
 
37
 
38
- experiment_name: {experiment_name}
39
-
40
- model: {model}
41
-
42
- device: cuda
43
 
44
  backend:
45
- no_weights: true
46
- torch_dtype: float16
47
- bettertransformer: true
48
- quantization_scheme: gptq
49
-
50
 
51
  benchmark:
 
52
  memory: true
53
  energy: true
54
-
55
  new_tokens: 1000
56
  input_shapes:
57
  batch_size: 1
58
  sequence_length: 256
59
 
60
-
 
61
  ```
62
  """
63
 
64
 
65
  CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results."
66
- CITATION_BUTTON_TEXT = r"""@misc{llm-perf-leaderboard,
67
  author = {Ilyas Moutawwakil, RΓ©gis Pierrard},
68
  title = {LLM-Perf Leaderboard},
69
  year = {2023},
 
1
  TITLE = """<h1 align="center" id="space-title">πŸ€— LLM-Perf Leaderboard πŸ‹οΈ</h1>"""
2
 
3
+ INTRODUCTION = """
4
  The πŸ€— LLM-Perf Leaderboard πŸ‹οΈ aims to benchmark the performance (latency, throughput, memory & energy) of Large Language Models (LLMs) with different hardwares, backends and optimizations using [Optimum-Benchmark](https://github.com/huggingface/optimum-benchmark) and [Optimum](https://github.com/huggingface/optimum) flavors.
5
 
6
  Anyone from the community can request a model or a hardware/backend/optimization configuration for automated benchmarking:
 
8
  - Hardware/Backend/Optimization performance requests should be made in the [community discussions](https://huggingface.co/spaces/optimum/llm-perf-leaderboard/discussions) to assess their relevance and feasibility.
9
  """
10
 
11
+ ABOUT = """<h3>About the πŸ€— LLM-Perf Leaderboard πŸ‹οΈ</h3>
12
  <ul>
13
  <li>To avoid communication-dependent results, only one GPU is used.</li>
14
  <li>Score is the average evaluation score obtained from the <a href="https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard">πŸ€— Open LLM Leaderboard</a>.</li>
 
18
  </ul>
19
  """
20
 
21
+ EXAMPLE_CONFIG = """
22
  Here's an example of the configuration file used to benchmark the models with Optimum-Benchmark:
23
  ```yaml
24
  defaults:
25
+ - backend: pytorch
26
+ - _base_ # inheriting from base config
27
+ - _self_ # for hydra 1.1 compatibility
28
+
29
+ experiment_name: pytorch+cuda+float16+bettertransformer
30
+ device: cuda
31
+
32
+ backend:
33
+ no_weights: true
34
+ torch_dtype: float16
35
+ to_bettertransformer: true
36
+ ```
37
+
38
+ Where the base config is:
39
+ ```yaml
40
+ defaults:
41
  - benchmark: inference # default benchmark
42
  - experiment # inheriting from experiment config
43
  - _self_ # for hydra 1.1 compatibility
 
46
 
47
  hydra:
48
  run:
49
+ dir: ???
50
  job:
51
  chdir: true
52
+ env_set:
53
+ CUDA_VISIBLE_DEVICES: 0
54
+ CUDA_DEVICE_ORDER: PCI_BUS_ID
55
 
56
+ model: ???
57
+ experiment_name: ???
 
 
 
58
 
59
  backend:
60
+ initial_isolation_check: true
61
+ continous_isolation_check: true
 
 
 
62
 
63
  benchmark:
64
+ duration: 10
65
  memory: true
66
  energy: true
67
+
68
  new_tokens: 1000
69
  input_shapes:
70
  batch_size: 1
71
  sequence_length: 256
72
 
73
+ hub_kwargs:
74
+ trust_remote_code: true
75
  ```
76
  """
77
 
78
 
79
  CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results."
80
+ CITATION_BUTTON = r"""@misc{llm-perf-leaderboard,
81
  author = {Ilyas Moutawwakil, RΓ©gis Pierrard},
82
  title = {LLM-Perf Leaderboard},
83
  year = {2023},
src/utils.py CHANGED
@@ -1,22 +1,3 @@
1
- from huggingface_hub import HfApi, Repository
2
- import gradio as gr
3
- import json
4
-
5
-
6
- def change_tab(query_param):
7
- query_param = query_param.replace("'", '"')
8
- query_param = json.loads(query_param)
9
-
10
- if (
11
- isinstance(query_param, dict)
12
- and "tab" in query_param
13
- and query_param["tab"] == "plot"
14
- ):
15
- return gr.Tabs.update(selected=1)
16
- else:
17
- return gr.Tabs.update(selected=0)
18
-
19
-
20
  LLM_MODEL_ARCHS = {
21
  "stablelm_epoch": "πŸ”΄ StableLM-Epoch",
22
  "stablelm_alpha": "πŸ”΄ StableLM-Alpha",
@@ -24,8 +5,8 @@ LLM_MODEL_ARCHS = {
24
  "RefinedWebModel": "πŸ¦… Falcon",
25
  "gpt_bigcode": "⭐ StarCoder",
26
  "RefinedWeb": "πŸ¦… Falcon",
27
- "baichuan": "🌊 Baichuan 百川", # river
28
- "internlm": "πŸ§‘β€πŸŽ“ InternLM δΉ¦η”Ÿ", # scholar
29
  "mistral": "Ⓜ️ Mistral",
30
  "codegen": "♾️ CodeGen",
31
  "chatglm": "πŸ’¬ ChatGLM",
@@ -34,7 +15,7 @@ LLM_MODEL_ARCHS = {
34
  "llama": "πŸ¦™ LLaMA",
35
  "rwkv": "πŸ¦β€β¬› RWKV",
36
  "mpt": "🧱 MPT",
37
- "Yi": "πŸ«‚ Yi δΊΊ", # people
38
  # suggest something
39
  "gpt_neox": "GPT-NeoX",
40
  "gpt_neo": "GPT-Neo",
@@ -50,13 +31,25 @@ def model_hyperlink(link, model_name):
50
  return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
51
 
52
 
53
- def process_model_name(model_name):
54
- link = f"https://huggingface.co/{model_name}"
55
- return model_hyperlink(link, model_name)
56
-
57
-
58
- def process_model_arch(model_arch):
59
  if model_arch in LLM_MODEL_ARCHS:
60
  return LLM_MODEL_ARCHS[model_arch]
61
  else:
62
  return model_arch
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  LLM_MODEL_ARCHS = {
2
  "stablelm_epoch": "πŸ”΄ StableLM-Epoch",
3
  "stablelm_alpha": "πŸ”΄ StableLM-Alpha",
 
5
  "RefinedWebModel": "πŸ¦… Falcon",
6
  "gpt_bigcode": "⭐ StarCoder",
7
  "RefinedWeb": "πŸ¦… Falcon",
8
+ "baichuan": "🌊 Baichuan 百川", # river
9
+ "internlm": "πŸ§‘β€πŸŽ“ InternLM δΉ¦η”Ÿ", # scholar
10
  "mistral": "Ⓜ️ Mistral",
11
  "codegen": "♾️ CodeGen",
12
  "chatglm": "πŸ’¬ ChatGLM",
 
15
  "llama": "πŸ¦™ LLaMA",
16
  "rwkv": "πŸ¦β€β¬› RWKV",
17
  "mpt": "🧱 MPT",
18
+ "Yi": "πŸ«‚ Yi δΊΊ" , # people
19
  # suggest something
20
  "gpt_neox": "GPT-NeoX",
21
  "gpt_neo": "GPT-Neo",
 
31
  return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
32
 
33
 
34
def process_arch(model_arch):
    """Return the display label for ``model_arch``, or the input itself when unmapped."""
    return LLM_MODEL_ARCHS.get(model_arch, model_arch)
39
+
40
+
41
def process_score(score, quantization):
    """Format ``score`` with two decimals and a one-character quantization marker.

    The marker is ``"*"`` when ``quantization`` is anything other than the
    string ``"None"``, and a single space otherwise, so both forms have the
    same width.
    """
    marker = " " if quantization == "None" else "*"
    return f"{score:.2f}{marker}"
46
+
47
+
48
+ # def change_tab(query_param):
49
+ # query_param = query_param.replace("'", '"')
50
+ # query_param = json.loads(query_param)
51
+
52
+ # if isinstance(query_param, dict) and "tab" in query_param and query_param["tab"] == "plot":
53
+ # return gr.Tabs.update(selected=1)
54
+ # else:
55
+ # return gr.Tabs.update(selected=0)