BenchmarkBot committed on
Commit
e747f4e
·
1 Parent(s): 570bffa

added peak memory and made scores clickable

Browse files
Files changed (3) hide show
  1. app.py +57 -84
  2. src/assets/text_content.py +8 -0
  3. src/utils.py +9 -0
app.py CHANGED
@@ -4,9 +4,9 @@ import gradio as gr
4
  import pandas as pd
5
  from apscheduler.schedulers.background import BackgroundScheduler
6
 
7
- from src.assets.text_content import TITLE, INTRODUCTION_TEXT, CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT
8
- from src.utils import restart_space, load_dataset_repo, make_clickable_model
9
- from src.assets.css_html_js import custom_css, get_window_url_params
10
 
11
 
12
  LLM_PERF_LEADERBOARD_REPO = "optimum/llm-perf-leaderboard"
@@ -18,9 +18,10 @@ COLUMNS_MAPPING = {
18
  "backend.name": "Backend 🏭",
19
  "backend.torch_dtype": "Datatype πŸ“₯",
20
  "average": "Average H4 Score ⬆️",
 
21
  "generate.throughput(tokens/s)": "Throughput (tokens/s) ⬆️",
22
  }
23
- COLUMNS_DATATYPES = ["markdown", "str", "str", "number", "number", "number"]
24
  SORTING_COLUMN = ["Throughput (tokens/s) ⬆️"]
25
 
26
 
@@ -34,17 +35,15 @@ def get_benchmark_df(benchmark):
34
  # load
35
  bench_df = pd.read_csv(
36
  f"./llm-perf-dataset/reports/{benchmark}/inference_report.csv")
 
37
  scores_df = pd.read_csv(
38
  f"./llm-perf-dataset/reports/average_scores.csv")
39
- # merge on model
40
- bench_df = bench_df.merge(
41
- scores_df, how="left", left_on="model", right_on="model")
42
 
43
  # preprocess
44
  bench_df["model"] = bench_df["model"].apply(make_clickable_model)
45
- # set none datatype to float32
46
- bench_df["backend.torch_dtype"] = bench_df["backend.torch_dtype"].fillna(
47
- "float32")
48
  # filter
49
  bench_df = bench_df[list(COLUMNS_MAPPING.keys())]
50
  # rename
@@ -55,37 +54,38 @@ def get_benchmark_df(benchmark):
55
  return bench_df
56
 
57
 
58
- def change_tab(query_param):
59
- query_param = query_param.replace("'", '"')
60
- query_param = json.loads(query_param)
61
-
62
- if (
63
- isinstance(query_param, dict)
64
- and "tab" in query_param
65
- and query_param["tab"] == "evaluation"
66
- ):
67
- return gr.Tabs.update(selected=1)
68
- else:
69
- return gr.Tabs.update(selected=0)
70
 
 
 
 
 
 
 
 
 
71
 
72
- def submit_query(single_df, multi_df, text, backends, datatypes, threshold):
73
 
74
- filtered_single = single_df[
75
- single_df["Model 🤗"].str.contains(text) &
76
- single_df["Backend 🏭"].isin(backends) &
77
- single_df["Datatype 📥"].isin(datatypes) &
78
- (single_df["Average H4 Score ⬆️"] >= threshold)
79
- ]
 
 
 
 
 
 
 
 
80
 
81
- filtered_multi = multi_df[
82
- multi_df["Model 🤗"].str.contains(text) &
83
- multi_df["Backend 🏭"].isin(backends) &
84
- multi_df["Datatype 📥"].isin(datatypes) &
85
- (multi_df["Average H4 Score ⬆️"] >= threshold)
86
- ]
87
 
88
- return filtered_single, filtered_multi
89
 
90
 
91
  # Define demo interface
@@ -96,29 +96,29 @@ with demo:
96
 
97
  with gr.Row():
98
  search_bar = gr.Textbox(
99
- label="Search 🔎",
100
- info="Search for a model and press Submit 🚀",
101
  elem_id="search-bar",
102
  )
103
  backend_checkboxes = gr.CheckboxGroup(
 
104
  choices=["pytorch", "onnxruntime"],
105
  value=["pytorch", "onnxruntime"],
106
- label="Backends 🏭",
107
  info="Select the backends",
108
  elem_id="backend-checkboxes",
109
  )
110
  datatype_checkboxes = gr.CheckboxGroup(
 
111
  choices=["float32", "float16"],
112
  value=["float32", "float16"],
113
- label="Datatypes 📥",
114
  info="Select the load datatypes",
115
  elem_id="datatype-checkboxes",
116
  )
117
 
118
  with gr.Row():
119
  threshold_slider = gr.Slider(
120
- label="H4 Threshold 📈",
121
- info="Filter by average H4 score",
122
  value=0.0,
123
  elem_id="threshold-slider",
124
  )
@@ -132,13 +132,6 @@ with demo:
132
 
133
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
134
  with gr.TabItem("🖥️ A100-80GB Benchmark 🏋️", elem_id="A100-benchmark", id=0):
135
-
136
- SINGLE_A100_TEXT = """<h3>Single-GPU (1xA100):</h3>
137
- <ul>
138
- <li>Singleton Batch (1)</li>
139
- <li>Thousand Tokens (1000)</li>
140
- </ul>
141
- """
142
  gr.HTML(SINGLE_A100_TEXT)
143
 
144
  single_A100_df = get_benchmark_df(benchmark="1xA100-80GB")
@@ -158,35 +151,15 @@ with demo:
158
  visible=False,
159
  )
160
 
161
- with gr.TabItem("🖥️ 4xA100-80GB Benchmark 🏋️", elem_id="4xA100-benchmark", id=1):
162
- MULTI_A100_TEXT = """<h3>Multi-GPU (4xA100):</h3>
163
- <ul>
164
- <li>Singleton Batch (1)</li>
165
- <li>Thousand Tokens (1000)</li>
166
- <li>Using <a href="https://huggingface.co/docs/accelerate" target="_blank">Accelerate</a>'s Auto Device Map</li>
167
- </ul>"""
168
- gr.HTML(MULTI_A100_TEXT)
169
- multi_A100_df = get_benchmark_df(benchmark="4xA100-80GB")
170
- multi_A100_leaderboard = gr.components.Dataframe(
171
- value=multi_A100_df,
172
- datatype=COLUMNS_DATATYPES,
173
- headers=list(COLUMNS_MAPPING.values()),
174
- elem_id="4xA100-table",
175
- )
176
- # Dummy Leaderboard table for handling the case when the user uses backspace key
177
- multi_A100_for_search = gr.components.Dataframe(
178
- value=multi_A100_df,
179
- datatype=COLUMNS_DATATYPES,
180
- headers=list(COLUMNS_MAPPING.values()),
181
- max_rows=None,
182
- visible=False,
183
- )
184
-
185
  # Callbacks
186
- submit_button.click(submit_query,
187
- [single_A100_for_search, multi_A100_for_search, search_bar,
188
- backend_checkboxes, datatype_checkboxes, threshold_slider],
189
- [single_A100_leaderboard, multi_A100_leaderboard])
 
 
 
 
190
 
191
  with gr.Row():
192
  with gr.Accordion("📙 Citation", open=False):
@@ -196,13 +169,13 @@ with demo:
196
  elem_id="citation-button",
197
  ).style(show_copy_button=True)
198
 
199
- dummy = gr.Textbox(visible=False)
200
- demo.load(
201
- change_tab,
202
- dummy,
203
- tabs,
204
- _js=get_window_url_params,
205
- )
206
 
207
  # Restart space every hour
208
  scheduler = BackgroundScheduler()
 
4
  import pandas as pd
5
  from apscheduler.schedulers.background import BackgroundScheduler
6
 
7
+ from src.assets.text_content import TITLE, INTRODUCTION_TEXT, SINGLE_A100_TEXT, CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT
8
+ from src.utils import restart_space, load_dataset_repo, make_clickable_model, make_clickable_score, extract_score_from_clickable
9
+ from src.assets.css_html_js import custom_css
10
 
11
 
12
  LLM_PERF_LEADERBOARD_REPO = "optimum/llm-perf-leaderboard"
 
18
  "backend.name": "Backend 🏭",
19
  "backend.torch_dtype": "Datatype πŸ“₯",
20
  "average": "Average H4 Score ⬆️",
21
+ "forward.peak_memory(MB)": "Peak Memory (MB) ⬇️",
22
  "generate.throughput(tokens/s)": "Throughput (tokens/s) ⬆️",
23
  }
24
+ COLUMNS_DATATYPES = ["markdown", "str", "str", "markdown", "number", "number"]
25
  SORTING_COLUMN = ["Throughput (tokens/s) ⬆️"]
26
 
27
 
 
35
  # load
36
  bench_df = pd.read_csv(
37
  f"./llm-perf-dataset/reports/{benchmark}/inference_report.csv")
38
+
39
  scores_df = pd.read_csv(
40
  f"./llm-perf-dataset/reports/average_scores.csv")
41
+ bench_df = bench_df.merge(scores_df, on="model", how="left")
42
+ bench_df["average"] = bench_df["average"].apply(
43
+ make_clickable_score)
44
 
45
  # preprocess
46
  bench_df["model"] = bench_df["model"].apply(make_clickable_model)
 
 
 
47
  # filter
48
  bench_df = bench_df[list(COLUMNS_MAPPING.keys())]
49
  # rename
 
54
  return bench_df
55
 
56
 
57
+ # def change_tab(query_param):
58
+ # query_param = query_param.replace("'", '"')
59
+ # query_param = json.loads(query_param)
 
 
 
 
 
 
 
 
 
60
 
61
+ # if (
62
+ # isinstance(query_param, dict)
63
+ # and "tab" in query_param
64
+ # and query_param["tab"] == "evaluation"
65
+ # ):
66
+ # return gr.Tabs.update(selected=1)
67
+ # else:
68
+ # return gr.Tabs.update(selected=0)
69
 
 
70
 
71
+ def submit_query(text, backends, datatypes, threshold, raw_dfs):
72
+ filtered_dfs = []
73
+ for raw_df in raw_dfs:
74
+ # extract the average score (float) from the clickable score (clickable markdown)
75
+ raw_df["Average H4 Score ⬆️"] = raw_df["Average H4 Score ⬆️"].apply(
76
+ extract_score_from_clickable)
77
+ filtered_df = raw_df[
78
+ raw_df["Model 🤗"].str.contains(text) &
79
+ raw_df["Backend 🏭"].isin(backends) &
80
+ raw_df["Datatype 📥"].isin(datatypes) &
81
+ (raw_df["Average H4 Score ⬆️"] >= threshold)
82
+ ]
83
+ filtered_df["Average H4 Score ⬆️"] = filtered_df["Average H4 Score ⬆️"].apply(
84
+ make_clickable_score)
85
 
86
+ filtered_dfs.append(filtered_df)
 
 
 
 
 
87
 
88
+ return filtered_dfs
89
 
90
 
91
  # Define demo interface
 
96
 
97
  with gr.Row():
98
  search_bar = gr.Textbox(
99
+ label="Model 🤗",
100
+ info="Search for a model name",
101
  elem_id="search-bar",
102
  )
103
  backend_checkboxes = gr.CheckboxGroup(
104
+ label="Backends 🏭",
105
  choices=["pytorch", "onnxruntime"],
106
  value=["pytorch", "onnxruntime"],
 
107
  info="Select the backends",
108
  elem_id="backend-checkboxes",
109
  )
110
  datatype_checkboxes = gr.CheckboxGroup(
111
+ label="Datatypes 📥",
112
  choices=["float32", "float16"],
113
  value=["float32", "float16"],
 
114
  info="Select the load datatypes",
115
  elem_id="datatype-checkboxes",
116
  )
117
 
118
  with gr.Row():
119
  threshold_slider = gr.Slider(
120
+ label="Average H4 Score 📈",
121
+ info="Filter by minimum average H4 score",
122
  value=0.0,
123
  elem_id="threshold-slider",
124
  )
 
132
 
133
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
134
  with gr.TabItem("🖥️ A100-80GB Benchmark 🏋️", elem_id="A100-benchmark", id=0):
 
 
 
 
 
 
 
135
  gr.HTML(SINGLE_A100_TEXT)
136
 
137
  single_A100_df = get_benchmark_df(benchmark="1xA100-80GB")
 
151
  visible=False,
152
  )
153
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
154
  # Callbacks
155
+ submit_button.click(
156
+ submit_query,
157
+ [
158
+ search_bar, backend_checkboxes, datatype_checkboxes, threshold_slider,
159
+ single_A100_for_search
160
+ ],
161
+ [single_A100_leaderboard]
162
+ )
163
 
164
  with gr.Row():
165
  with gr.Accordion("📙 Citation", open=False):
 
169
  elem_id="citation-button",
170
  ).style(show_copy_button=True)
171
 
172
+ # dummy = gr.Textbox(visible=False)
173
+ # demo.load(
174
+ # change_tab,
175
+ # dummy,
176
+ # tabs,
177
+ # _js=get_window_url_params,
178
+ # )
179
 
180
  # Restart space every hour
181
  scheduler = BackgroundScheduler()
src/assets/text_content.py CHANGED
@@ -9,6 +9,14 @@ Anyone from the community can request a model or a hardware+backend configuratio
9
  [Config files](https://github.com/huggingface/optimum-benchmark/blob/main/examples/bert.yaml) (which can be used with Optimum-Benchmark) will be available soon for reproduction, questioning and correction of our results.
10
  """
11
 
 
 
 
 
 
 
 
 
12
  CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results."
13
  CITATION_BUTTON_TEXT = r"""@misc{open-llm-perf-leaderboard,
14
  author = {Ilyas Moutawwakil},
 
9
  [Config files](https://github.com/huggingface/optimum-benchmark/blob/main/examples/bert.yaml) (which can be used with Optimum-Benchmark) will be available soon for reproduction, questioning and correction of our results.
10
  """
11
 
12
+ SINGLE_A100_TEXT = """<h3>Single-GPU (1xA100):</h3>
13
+ <ul>
14
+ <li>Singleton Batch (1)</li>
15
+ <li>Thousand Tokens (1000)</li>
16
+ </ul>
17
+ """
18
+
19
+
20
  CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results."
21
  CITATION_BUTTON_TEXT = r"""@misc{open-llm-perf-leaderboard,
22
  author = {Ilyas Moutawwakil},
src/utils.py CHANGED
@@ -60,3 +60,12 @@ def make_clickable_model(model_name):
60
  link = OASST_LINK
61
 
62
  return model_hyperlink(link, model_name)
 
 
 
 
 
 
 
 
 
 
60
  link = OASST_LINK
61
 
62
  return model_hyperlink(link, model_name)
63
+
64
+
65
+ def make_clickable_score(score):
66
+ link = f"https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard"
67
+ return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{score}</a>'
68
+
69
+
70
+ def extract_score_from_clickable(clickable_score) -> float:
71
+ return float(clickable_score.split(">")[1].split("<")[0])