Spaces:
Running
Running
BenchmarkBot
committed on
Commit
·
0f1bf97
1
Parent(s):
5f0b430
added bettertransformer and LLM.int8
Browse files
app.py
CHANGED
@@ -4,8 +4,20 @@ import pandas as pd
|
|
4 |
import plotly.express as px
|
5 |
from apscheduler.schedulers.background import BackgroundScheduler
|
6 |
|
7 |
-
from src.assets.text_content import
|
8 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
9 |
from src.assets.css_html_js import custom_css
|
10 |
|
11 |
|
@@ -17,13 +29,28 @@ COLUMNS_MAPPING = {
|
|
17 |
"model": "Model π€",
|
18 |
"backend.name": "Backend π",
|
19 |
"backend.torch_dtype": "Load Dtype π₯",
|
20 |
-
"num_parameters": "
|
|
|
21 |
"forward.peak_memory(MB)": "Peak Memory (MB) β¬οΈ",
|
22 |
"generate.throughput(tokens/s)": "Throughput (tokens/s) β¬οΈ",
|
23 |
"average": "Average Open LLM Score β¬οΈ",
|
|
|
|
|
|
|
24 |
}
|
25 |
-
COLUMNS_DATATYPES = [
|
26 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
27 |
SORTING_COLUMN = ["Throughput (tokens/s) β¬οΈ"]
|
28 |
|
29 |
|
@@ -35,17 +62,14 @@ def get_benchmark_df(benchmark="1xA100-80GB"):
|
|
35 |
llm_perf_dataset_repo.git_pull()
|
36 |
|
37 |
# load
|
38 |
-
bench_df = pd.read_csv(
|
39 |
-
|
40 |
-
scores_df = pd.read_csv(
|
41 |
-
f"./llm-perf-dataset/reports/additional_data.csv")
|
42 |
bench_df = bench_df.merge(scores_df, on="model", how="left")
|
43 |
|
44 |
return bench_df
|
45 |
|
46 |
|
47 |
def get_benchmark_table(bench_df):
|
48 |
-
|
49 |
# filter
|
50 |
bench_df = bench_df[list(COLUMNS_MAPPING.keys())]
|
51 |
# rename
|
@@ -54,73 +78,76 @@ def get_benchmark_table(bench_df):
|
|
54 |
bench_df.sort_values(by=SORTING_COLUMN, ascending=False, inplace=True)
|
55 |
# transform
|
56 |
bench_df["Model π€"] = bench_df["Model π€"].apply(make_clickable_model)
|
|
|
57 |
bench_df["Average Open LLM Score β¬οΈ"] = bench_df["Average Open LLM Score β¬οΈ"].apply(
|
58 |
-
make_clickable_score
|
59 |
-
|
60 |
-
|
61 |
return bench_df
|
62 |
|
63 |
|
64 |
def get_benchmark_plot(bench_df):
|
65 |
-
|
66 |
# until falcon gets fixed / natively supported
|
67 |
bench_df = bench_df[bench_df["generate.latency(s)"] < 100]
|
68 |
|
69 |
fig = px.scatter(
|
70 |
-
bench_df,
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
75 |
# as many distinct colors as there are model_type,backend.name couples
|
76 |
color_discrete_sequence=px.colors.qualitative.Light24,
|
77 |
)
|
78 |
|
79 |
fig.update_layout(
|
80 |
title={
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
|
|
85 |
},
|
86 |
xaxis_title="Per 1000 Tokens Latency (s)",
|
87 |
yaxis_title="Average Open LLM Score",
|
88 |
legend_title="Model Type and Backend",
|
89 |
width=1200,
|
90 |
height=600,
|
91 |
-
# legend=dict(
|
92 |
-
# orientation="h",
|
93 |
-
# yanchor="bottom",
|
94 |
-
# y=-0.35,
|
95 |
-
# xanchor="center",
|
96 |
-
# x=0.5
|
97 |
-
# )
|
98 |
)
|
99 |
|
100 |
fig.update_traces(
|
101 |
-
hovertemplate="<br>".join(
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
|
|
|
|
110 |
)
|
111 |
|
112 |
return fig
|
113 |
|
114 |
|
115 |
def filter_query(text, backends, datatypes, threshold, benchmark="1xA100-80GB"):
|
116 |
-
|
117 |
raw_df = get_benchmark_df(benchmark=benchmark)
|
118 |
|
119 |
filtered_df = raw_df[
|
120 |
-
raw_df["model"].str.lower().str.contains(text.lower())
|
121 |
-
raw_df["backend.name"].isin(backends)
|
122 |
-
raw_df["backend.torch_dtype"].isin(datatypes)
|
123 |
-
(raw_df["average"] >= threshold)
|
124 |
]
|
125 |
|
126 |
filtered_table = get_benchmark_table(filtered_df)
|
@@ -221,8 +248,12 @@ with demo:
|
|
221 |
|
222 |
# Restart space every hour
|
223 |
scheduler = BackgroundScheduler()
|
224 |
-
scheduler.add_job(
|
225 |
-
|
|
|
|
|
|
|
|
|
226 |
scheduler.start()
|
227 |
|
228 |
# Launch demo
|
|
|
4 |
import plotly.express as px
|
5 |
from apscheduler.schedulers.background import BackgroundScheduler
|
6 |
|
7 |
+
from src.assets.text_content import (
|
8 |
+
TITLE,
|
9 |
+
INTRODUCTION_TEXT,
|
10 |
+
SINGLE_A100_TEXT,
|
11 |
+
CITATION_BUTTON_LABEL,
|
12 |
+
CITATION_BUTTON_TEXT,
|
13 |
+
)
|
14 |
+
from src.utils import (
|
15 |
+
restart_space,
|
16 |
+
load_dataset_repo,
|
17 |
+
make_clickable_model,
|
18 |
+
make_clickable_score,
|
19 |
+
num_to_str,
|
20 |
+
)
|
21 |
from src.assets.css_html_js import custom_css
|
22 |
|
23 |
|
|
|
29 |
"model": "Model π€",
|
30 |
"backend.name": "Backend π",
|
31 |
"backend.torch_dtype": "Load Dtype π₯",
|
32 |
+
"num_parameters": "#οΈβ£ Parameters π",
|
33 |
+
#
|
34 |
"forward.peak_memory(MB)": "Peak Memory (MB) β¬οΈ",
|
35 |
"generate.throughput(tokens/s)": "Throughput (tokens/s) β¬οΈ",
|
36 |
"average": "Average Open LLM Score β¬οΈ",
|
37 |
+
#
|
38 |
+
"backend.bettertransformer": "BetterTransformer π€",
|
39 |
+
"backend.load_in_8bit": "LLM.int8 ποΈ",
|
40 |
}
|
41 |
+
COLUMNS_DATATYPES = [
|
42 |
+
"markdown",
|
43 |
+
"str",
|
44 |
+
"str",
|
45 |
+
"str",
|
46 |
+
#
|
47 |
+
"number",
|
48 |
+
"number",
|
49 |
+
"markdown",
|
50 |
+
#
|
51 |
+
"str",
|
52 |
+
"str",
|
53 |
+
]
|
54 |
SORTING_COLUMN = ["Throughput (tokens/s) β¬οΈ"]
|
55 |
|
56 |
|
|
|
62 |
llm_perf_dataset_repo.git_pull()
|
63 |
|
64 |
# load
|
65 |
+
bench_df = pd.read_csv(f"./llm-perf-dataset/reports/{benchmark}.csv")
|
66 |
+
scores_df = pd.read_csv(f"./llm-perf-dataset/reports/additional_data.csv")
|
|
|
|
|
67 |
bench_df = bench_df.merge(scores_df, on="model", how="left")
|
68 |
|
69 |
return bench_df
|
70 |
|
71 |
|
72 |
def get_benchmark_table(bench_df):
|
|
|
73 |
# filter
|
74 |
bench_df = bench_df[list(COLUMNS_MAPPING.keys())]
|
75 |
# rename
|
|
|
78 |
bench_df.sort_values(by=SORTING_COLUMN, ascending=False, inplace=True)
|
79 |
# transform
|
80 |
bench_df["Model π€"] = bench_df["Model π€"].apply(make_clickable_model)
|
81 |
+
bench_df["#οΈβ£ Parameters π"] = bench_df["#οΈβ£ Parameters π"].apply(num_to_str)
|
82 |
bench_df["Average Open LLM Score β¬οΈ"] = bench_df["Average Open LLM Score β¬οΈ"].apply(
|
83 |
+
make_clickable_score
|
84 |
+
)
|
|
|
85 |
return bench_df
|
86 |
|
87 |
|
88 |
def get_benchmark_plot(bench_df):
|
|
|
89 |
# until falcon gets fixed / natively supported
|
90 |
bench_df = bench_df[bench_df["generate.latency(s)"] < 100]
|
91 |
|
92 |
fig = px.scatter(
|
93 |
+
bench_df,
|
94 |
+
x="generate.latency(s)",
|
95 |
+
y="average",
|
96 |
+
color="model_type",
|
97 |
+
symbol="backend.name",
|
98 |
+
size="forward.peak_memory(MB)",
|
99 |
+
custom_data=[
|
100 |
+
"model",
|
101 |
+
"backend.name",
|
102 |
+
"backend.torch_dtype",
|
103 |
+
"forward.peak_memory(MB)",
|
104 |
+
"generate.throughput(tokens/s)",
|
105 |
+
],
|
106 |
+
symbol_sequence=["triangle-up", "circle"],
|
107 |
# as many distinct colors as there are model_type,backend.name couples
|
108 |
color_discrete_sequence=px.colors.qualitative.Light24,
|
109 |
)
|
110 |
|
111 |
fig.update_layout(
|
112 |
title={
|
113 |
+
"text": "Model Score vs. Latency vs. Memory",
|
114 |
+
"y": 0.95,
|
115 |
+
"x": 0.5,
|
116 |
+
"xanchor": "center",
|
117 |
+
"yanchor": "top",
|
118 |
},
|
119 |
xaxis_title="Per 1000 Tokens Latency (s)",
|
120 |
yaxis_title="Average Open LLM Score",
|
121 |
legend_title="Model Type and Backend",
|
122 |
width=1200,
|
123 |
height=600,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
124 |
)
|
125 |
|
126 |
fig.update_traces(
|
127 |
+
hovertemplate="<br>".join(
|
128 |
+
[
|
129 |
+
"Model: %{customdata[0]}",
|
130 |
+
"Backend: %{customdata[1]}",
|
131 |
+
"Datatype: %{customdata[2]}",
|
132 |
+
"Peak Memory (MB): %{customdata[3]}",
|
133 |
+
"Throughput (tokens/s): %{customdata[4]}",
|
134 |
+
"Average Open LLM Score: %{y}",
|
135 |
+
"Per 1000 Tokens Latency (s): %{x}",
|
136 |
+
]
|
137 |
+
)
|
138 |
)
|
139 |
|
140 |
return fig
|
141 |
|
142 |
|
143 |
def filter_query(text, backends, datatypes, threshold, benchmark="1xA100-80GB"):
|
|
|
144 |
raw_df = get_benchmark_df(benchmark=benchmark)
|
145 |
|
146 |
filtered_df = raw_df[
|
147 |
+
raw_df["model"].str.lower().str.contains(text.lower())
|
148 |
+
& raw_df["backend.name"].isin(backends)
|
149 |
+
& raw_df["backend.torch_dtype"].isin(datatypes)
|
150 |
+
& (raw_df["average"] >= threshold)
|
151 |
]
|
152 |
|
153 |
filtered_table = get_benchmark_table(filtered_df)
|
|
|
248 |
|
249 |
# Restart space every hour
|
250 |
scheduler = BackgroundScheduler()
|
251 |
+
scheduler.add_job(
|
252 |
+
restart_space,
|
253 |
+
"interval",
|
254 |
+
seconds=3600,
|
255 |
+
args=[LLM_PERF_LEADERBOARD_REPO, OPTIMUM_TOKEN],
|
256 |
+
)
|
257 |
scheduler.start()
|
258 |
|
259 |
# Launch demo
|