Julien Simon committed on
Commit 7200b01 · 1 Parent(s): 48660ea

Initial version

Files changed (2)
  1. app.py +181 -0
  2. results.py +559 -0
app.py ADDED
@@ -0,0 +1,181 @@
import logging
import re

import gradio as gr
import pandas as pd

from results import results

logging.basicConfig(level=logging.DEBUG)


def get_model_names():
    """
    Retrieve a sorted list of model names from the results data.

    Returns:
        list: Sorted list of model names.
    """
    return sorted([model['name'] for model in results['models']])


def get_models_by_architecture(model_name):
    """
    Retrieve models with the same architecture as the specified model.

    Args:
        model_name (str): Name of the model to match architecture.

    Returns:
        list: List of models with the same architecture.
    """
    selected_model = next((m for m in results['models'] if m['name'] == model_name), None)
    if not selected_model:
        return []

    model_type = selected_model.get('modelType', '')
    return [m for m in results['models'] if m.get('modelType', '') == model_type]


def custom_sort_key(instance_type):
    """
    Generate a custom sorting key for instance types.

    Args:
        instance_type (str): The instance type to generate a key for.

    Returns:
        tuple: A tuple used for sorting, containing (family, size_index).
    """
    size_order = ['xlarge', '2xlarge', '4xlarge', '8xlarge', '12xlarge', '16xlarge', '24xlarge', '48xlarge']

    match = re.match(r'([a-z]+\d+)\.(\w+)', instance_type)
    if match:
        family, size = match.groups()
        return (family, size_order.index(size) if size in size_order else len(size_order))
    return (instance_type, 0)  # Fallback for non-standard instance types


def display_results(model_name):
    """
    Process and display results for a given model.

    This function retrieves model data, processes it, and formats it for display.
    It handles nested configurations, merges data from multiple models if necessary,
    and sorts the results by instance type.

    Args:
        model_name (str): Name of the model to display results for.

    Returns:
        tuple: A tuple containing:
            - str: Markdown formatted string with model information.
            - pandas.DataFrame: Styled DataFrame with the results.
    """
    try:
        models = get_models_by_architecture(model_name)
        if not models:
            logging.warning(f"No models found for {model_name}")
            return f"No results found for the selected model: {model_name}", pd.DataFrame()

        model_type = models[0].get('modelType', 'N/A')
        data = {}
        merged_models = set()

        for model in models:
            merged_models.add(model.get('name', 'Unknown'))
            for config in model.get('configurations', []):
                try:
                    instance_type = config['instanceType']
                    cloud = config.get('cloud', 'N/A')
                    key = (instance_type, cloud)

                    if 'configurations' in config:
                        for nested_config in config['configurations']:
                            nested_key = key + (nested_config.get('quantization', 'N/A'),)
                            data[nested_key] = {
                                "Cloud": cloud,
                                "Instance Type": instance_type,
                                "GPU": config.get('gpu', 'N/A'),
                                "GPU RAM": config.get('gpuRAM', 'N/A'),
                                "Status": nested_config.get('status', 'N/A'),
                                "Quantization": nested_config.get('quantization', 'N/A'),
                                "TGI": nested_config.get('tgi', 'N/A'),
                                "Tokens per Second": nested_config.get('tokensPerSecond', 'N/A'),
                                "Notes": nested_config.get('notes', '')
                            }
                    else:
                        data[key] = {
                            "Cloud": cloud,
                            "Instance Type": instance_type,
                            "GPU": config.get('gpu', 'N/A'),
                            "GPU RAM": config.get('gpuRAM', 'N/A'),
                            "Status": config.get('status', 'N/A'),
                            "Quantization": config.get('quantization', 'N/A'),
                            "TGI": config.get('tgi', 'N/A'),
                            "Tokens per Second": config.get('tokensPerSecond', 'N/A'),
                            "Notes": config.get('notes', '')
                        }
                except KeyError as e:
                    logging.error(f"KeyError in config: {e}")
                    continue

        if not data:
            logging.warning(f"No data extracted for {model_name}")
            return f"No data could be extracted for the selected model: {model_name}", pd.DataFrame()

        # Merge data if there are conflicts
        for key, value in data.items():
            for field in value:
                if value[field] == 'N/A':
                    for other_key, other_value in data.items():
                        if other_key[0] == key[0] and other_value[field] != 'N/A':
                            value[field] = other_value[field]
                            break

        # Filter out rows where Status is 'N/A'
        data = {k: v for k, v in data.items() if v['Status'] != 'N/A'}

        merged_models_message = f"Note: Results merged from models: {', '.join(merged_models)}" if len(merged_models) > 1 else None

        # Sort the data by instance type
        sorted_data = sorted(data.values(), key=lambda x: custom_sort_key(x['Instance Type']))

        results = f"## Results for {model_name}\n\nModel Type: {model_type}"
        if merged_models_message:
            results += f"\n\n{merged_models_message}"

        df = pd.DataFrame(sorted_data)

        def color_status(val):
            if val == 'OK':
                return 'background-color: green; color: white'
            elif val == 'KO':
                return 'background-color: red; color: white'
            else:
                return ''

        styled_df = df.style.applymap(color_status, subset=['Status'])

        return results, styled_df

    except Exception as e:
        logging.exception(f"Error in display_results: {e}")
        return f"An error occurred while processing results for {model_name}: {str(e)}", pd.DataFrame()


with gr.Blocks() as demo:
    gr.Markdown("# Model Benchmark Results")
    gr.Markdown("This table shows the benchmark results for each model. [TGI](https://huggingface.co/docs/text-generation-inference/reference/launcher) and [vLLM](https://docs.djl.ai/master/docs/serving/serving/docs/lmi/user_guides/vllm_user_guide.html) settings are default unless noted.")
    model_dropdown = gr.Dropdown(choices=get_model_names(), label="Select Model")

    results_text = gr.Markdown()
    results_output = gr.DataFrame(label="Results")

    model_dropdown.change(
        display_results,
        inputs=[model_dropdown],
        outputs=[results_text, results_output]
    )

if __name__ == "__main__":
    demo.launch()
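
A quick way to exercise this code outside the Gradio UI is sketched below. It is not part of the commit, and it assumes gradio and pandas are installed and that the script is run from the repository root so that results.py is importable.

from app import display_results

# display_results returns a Markdown string plus a pandas Styler
markdown, styled_df = display_results("Arcee-Nova")
print(markdown)        # "## Results for Arcee-Nova" plus the model type line
print(styled_df.data)  # the underlying DataFrame behind the Styler
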
results.py ADDED
@@ -0,0 +1,559 @@
results = {
    "models": [
        {
            "name": "Arcee-Meraj",
            "modelType": "Qwen2 72B"
        },
        {
            "name": "Arcee-Nova",
            "modelType": "Qwen2 72B",
            "notes": "",
            "configurations": [
                {
                    "region": "us-west-2",
                    "instanceType": "g4dn.12xlarge",
                    "cloud": "AWS",
                    "gpu": "4xNVIDIA T4",
                    "gpuRAM": "64 GB",
                    "quantization": "bitsandbytes-nf4",
                    "tgi": "TGI 2.2.0",
                    "status": "KO",
                    "tokensPerSecond": "-",
                    "notes": "Flash Attention requires Ampere GPUs or newer"
                },
                {
                    "region": "us-west-2",
                    "instanceType": "g5.12xlarge",
                    "cloud": "AWS",
                    "gpu": "4xNVIDIA A10G",
                    "gpuRAM": "96 GB",
                    "configurations": [
                        {
                            "quantization": "bitsandbytes-nf4",
                            "tgi": "TGI 2.2.0",
                            "status": "OK",
                            "tokensPerSecond": "12"
                        },
                        {
                            "quantization": "bitsandbytes-fp4",
                            "tgi": "TGI 2.2.0",
                            "status": "OK",
                            "tokensPerSecond": "12"
                        },
                        {
                            "quantization": "bitsandbytes (int8)",
                            "tgi": "TGI 2.2.0",
                            "status": "KO",
                            "tokensPerSecond": "-",
                            "notes": "CUDA OOM"
                        },
                        {
                            "quantization": "eetq (int8)",
                            "tgi": "TGI 2.2.0",
                            "status": "KO",
                            "tokensPerSecond": "-",
                            "notes": "[FT Error] Heurisitc failed to find a valid config."
                        }
                    ]
                },
                {
                    "region": "us-west-2",
                    "instanceType": "g5.48xlarge",
                    "cloud": "AWS",
                    "gpu": "8xNVIDIA A10G",
                    "gpuRAM": "192 GB",
                    "configurations": [
                        {
                            "quantization": "none",
                            "tgi": "TGI 2.2.0",
                            "status": "KO",
                            "tokensPerSecond": "-",
                            "notes": "CUDA OOM (but g6.48xlarge works!)"
                        },
                        {
                            "quantization": "bitsandbytes-nf4",
                            "tgi": "TGI 2.2.0",
                            "status": "OK",
                            "tokensPerSecond": "12.3"
                        },
                        {
                            "quantization": "bitsandbytes-fp4",
                            "tgi": "TGI 2.2.0",
                            "status": "OK",
                            "tokensPerSecond": "12.5"
                        },
                        {
                            "quantization": "bitsandbytes (int8)",
                            "tgi": "TGI 2.2.0",
                            "status": "KO",
                            "tokensPerSecond": "-",
                            "notes": "The model deploys, but inference times out."
                        }
                    ]
                },
                {
                    "region": "us-west-2",
                    "instanceType": "g6.12xlarge",
                    "cloud": "AWS",
                    "gpu": "4xNVIDIA L4",
                    "gpuRAM": "96 GB",
                    "configurations": [
                        {
                            "quantization": "bitsandbytes-nf4",
                            "tgi": "TGI 2.2.0",
                            "status": "OK",
                            "tokensPerSecond": "1.5-2",
                            "notes": "Too slow, timeouts are likely"
                        },
                        {
                            "quantization": "bitsandbytes-fp4",
                            "tgi": "TGI 2.2.0",
                            "status": "OK",
                            "tokensPerSecond": "2",
                            "notes": "Too slow, timeouts are likely"
                        },
                        {
                            "quantization": "bitsandbytes (int8)",
                            "tgi": "TGI 2.2.0",
                            "status": "KO",
                            "tokensPerSecond": "-",
                            "notes": "CUDA OOM"
                        }
                    ]
                },
                {
                    "region": "us-west-2",
                    "instanceType": "g6.48xlarge",
                    "cloud": "AWS",
                    "gpu": "8xNVIDIA L4",
                    "gpuRAM": "192 GB",
                    "quantization": "none",
                    "tgi": "TGI 2.2.0",
                    "status": "OK",
                    "tokensPerSecond": "12"
                },
                {
                    "region": "us-west-2",
                    "instanceType": "p4d.24xlarge",
                    "cloud": "AWS",
                    "gpu": "8xNVIDIA A100",
                    "gpuRAM": "320 GB",
                    "quantization": "none",
                    "tgi": "TGI 2.2.0",
                    "status": "OK",
                    "tokensPerSecond": "40",
                    "notes": "\"MAX_INPUT_LENGTH\": \"16384\", \"MAX_TOTAL_TOKENS\": \"32768\","
                },
                {
                    "region": "us-west-2",
                    "instanceType": "p4de.24xlarge",
                    "cloud": "AWS",
                    "gpu": "8xNVIDIA A100",
                    "gpuRAM": "320 GB",
                    "quantization": "none",
                    "tgi": "TGI 2.2.0",
                    "status": "waiting for quota"
                },
                {
                    "region": "us-west-2",
                    "instanceType": "p5.48xlarge",
                    "cloud": "AWS",
                    "gpu": "8xNVIDIA H100",
                    "gpuRAM": "640 GB",
                    "quantization": "none",
                    "tgi": "TGI 2.2.0",
                    "status": "OK",
                    "tokensPerSecond": "58",
                    "notes": "\"MAX_INPUT_LENGTH\": \"16384\", \"MAX_TOTAL_TOKENS\": \"32768\","
                },
                {
                    "region": "us-west-2",
                    "instanceType": "inf2.*",
                    "cloud": "AWS",
                    "gpu": "-",
                    "tgi": "TGI 2.2.0",
                    "status": "not supported",
                    "tokensPerSecond": "-",
                    "notes": "Qwen2: TGI OK, Neuron SDK KO, optimum-neuron KO"
                }
            ]
        },
        {
            "name": "Llama-Spark",
            "modelType": "Llama 3.1 8B",
            "configurations": [
                {
                    "region": "us-west-2",
                    "instanceType": "g5.2xlarge",
                    "cloud": "AWS",
                    "gpu": "1xNVIDIA A10G",
                    "gpuRAM": "24 GB",
                    "quantization": "none",
                    "tgi": "TGI 2.2.0",
                    "status": "OK",
                    "tokensPerSecond": "29",
                    "notes": "4K/8K fails"
                },
                {
                    "region": "us-west-2",
                    "instanceType": "g5.12xlarge",
                    "cloud": "AWS",
                    "gpu": "4xNVIDIA A10G",
                    "gpuRAM": "96 GB",
                    "quantization": "none",
                    "tgi": "TGI 2.2.0",
                    "status": "OK",
                    "tokensPerSecond": "85",
                    "notes": "\"MAX_INPUT_TOKENS\": \"16384\", \"MAX_TOTAL_TOKENS\": \"32768\","
                },
                {
                    "region": "us-west-2",
                    "instanceType": "g5.48xlarge",
                    "cloud": "AWS",
                    "gpu": "8xNVIDIA A10G",
                    "gpuRAM": "192 GB",
                    "quantization": "none",
                    "tgi": "TGI 2.2.0",
                    "status": "OK",
                    "tokensPerSecond": "105",
                    "notes": "\"MAX_INPUT_TOKENS\": \"20480\", \"MAX_TOTAL_TOKENS\": \"40960\"\n\n32K/64K fails"
                },
                {
                    "region": "us-west-2",
                    "instanceType": "g6.2xlarge",
                    "cloud": "AWS",
                    "gpu": "1xNVIDIA L4",
                    "gpuRAM": "24 GB",
                    "configurations": [
                        {
                            "quantization": "none",
                            "tgi": "TGI 2.2.0",
                            "status": "OK",
                            "tokensPerSecond": "15"
                        },
                        {
                            "quantization": "fp8",
                            "tgi": "TGI 2.2.0"
                        }
                    ]
                },
                {
                    "region": "us-west-2",
                    "instanceType": "g6.12xlarge",
                    "cloud": "AWS",
                    "gpu": "4xNVIDIA L4",
                    "gpuRAM": "96 GB",
                    "quantization": "none",
                    "tgi": "TGI 2.2.0",
                    "status": "OK",
                    "tokensPerSecond": "51",
                    "notes": "same as g5?"
                },
                {
                    "region": "us-west-2",
                    "instanceType": "g6.48xlarge",
                    "cloud": "AWS",
                    "gpu": "8xNVIDIA L4",
                    "gpuRAM": "192 GB",
                    "quantization": "none",
                    "tgi": "TGI 2.2.0",
                    "status": "OK",
                    "tokensPerSecond": "81",
                    "notes": "same as g5?"
                },
                {
                    "region": "us-west-2",
                    "instanceType": "g6e.2xlarge",
                    "cloud": "AWS",
                    "gpu": "1xNVIDIA L40S",
                    "gpuRAM": "48 GB",
                    "quantization": "none",
                    "tgi": "TGI 2.2.0",
                    "status": "OK",
                    "tokensPerSecond": "42"
                },
                {
                    "region": "us-west-2",
                    "instanceType": "p4d.24xlarge",
                    "cloud": "AWS",
                    "gpu": "4xNVIDIA A100",
                    "gpuRAM": "320 GB",
                    "quantization": "none",
                    "tgi": "TGI 2.2.0",
                    "status": "OK",
                    "tokensPerSecond": "145",
                    "notes": "\"MAX_INPUT_TOKENS\": \"40960\", \"MAX_TOTAL_TOKENS\": \"81920\"\n\n64K/128K fails (even with 4-bit)"
                },
                {
                    "region": "us-west-2",
                    "instanceType": "inf2.*",
                    "cloud": "AWS",
                    "gpu": "-",
                    "status": "not supported",
                    "tokensPerSecond": "-",
                    "notes": "Llama-3.1: TGI OK, Neuron SDK OK, optimum-neuron KO"
                }
            ]
        },
        {
            "name": "Arcee-Agent",
            "modelType": "Qwen2 7B",
            "notes": "",
            "configurations": [
                {
                    "region": "us-west-2",
                    "instanceType": "g5.2xlarge",
                    "cloud": "AWS",
                    "gpu": "1xNVIDIA A10G",
                    "gpuRAM": "24 GB",
                    "quantization": "none",
                    "tgi": "TGI 2.2.0",
                    "status": "OK",
                    "tokensPerSecond": "30"
                },
                {
                    "region": "us-west-2",
                    "instanceType": "g5.12xlarge",
                    "cloud": "AWS",
                    "gpu": "4xNVIDIA A10G",
                    "gpuRAM": "96 GB",
                    "quantization": "none",
                    "tgi": "TGI 2.2.0",
                    "status": "OK",
                    "tokensPerSecond": "83"
                },
                {
                    "region": "us-west-2",
                    "instanceType": "g5.48xlarge",
                    "cloud": "AWS",
                    "gpu": "8xNVIDIA A10G",
                    "gpuRAM": "192 GB",
                    "quantization": "none",
                    "tgi": "TGI 2.2.0",
                    "status": "KO",
                    "tokensPerSecond": "-",
                    "notes": "ValueError: `num_heads` must be divisible by `num_shards` (got `num_heads`: 28 and `num_shards`: 8)\n\nSM_NUM_GPUS=7 doesn't work either because tensor sizes are not a multiple of 7 (e.g., 512)"
                },
                {
                    "region": "us-west-2",
                    "instanceType": "g6.2xlarge",
                    "cloud": "AWS",
                    "gpu": "1xNVIDIA L4",
                    "gpuRAM": "24 GB",
                    "quantization": "none",
                    "tgi": "TGI 2.2.0",
                    "status": "OK",
                    "tokensPerSecond": "16.3"
                },
                {
                    "region": "us-west-2",
                    "instanceType": "g6.12xlarge",
                    "cloud": "AWS",
                    "gpu": "4xNVIDIA L4",
                    "gpuRAM": "96 GB",
                    "quantization": "none",
                    "tgi": "TGI 2.2.0",
                    "status": "OK",
                    "tokensPerSecond": "54.2"
                },
                {
                    "region": "us-west-2",
                    "instanceType": "inf2.*",
                    "cloud": "AWS",
                    "gpu": "-",
                    "tgi": "TGI 2.2.0",
                    "status": "not supported",
                    "tokensPerSecond": "-",
                    "notes": "Qwen2: TGI OK, Neuron SDK KO, optimum-neuron KO"
                }
            ]
        },
        {
            "name": "Arcee-Spark",
            "modelType": "Qwen2 7B"
        },
        {
            "name": "Arcee-Lite",
            "modelType": "Qwen2 1.5B distilled from phi-3-medium 14B",
            "configurations": [
                {
                    "region": "us-west-2",
                    "instanceType": "c6i.xlarge",
                    "cloud": "AWS",
                    "gpu": "-",
                    "gpuRAM": "-",
                    "quantization": "bitsandbytes-nf4",
                    "tgi": "TGI 2.2.0",
                    "status": "KO",
                    "tokensPerSecond": "-",
                    "notes": "OOM, might work with a prequantized model"
                },
                {
                    "region": "us-west-2",
                    "instanceType": "c6i.2xlarge",
                    "cloud": "AWS",
                    "gpu": "-",
                    "gpuRAM": "-",
                    "quantization": "bitsandbytes-nf4",
                    "tgi": "TGI 2.2.0",
                    "status": "KO",
                    "tokensPerSecond": "-",
                    "notes": "OOM, might work with a prequantized model"
                },
                {
                    "region": "us-west-2",
                    "instanceType": "c6i.4xlarge",
                    "cloud": "AWS",
                    "gpu": "-",
                    "gpuRAM": "-",
                    "configurations": [
                        {
                            "quantization": "none",
                            "tgi": "TGI 2.2.0",
                            "status": "OK",
                            "tokensPerSecond": "10.7"
                        },
                        {
                            "quantization": "bitsandbytes (int8)",
                            "tgi": "TGI 2.2.0",
                            "status": "OK",
                            "tokensPerSecond": "10.5"
                        },
                        {
                            "quantization": "bitsandbytes-nf4",
                            "tgi": "TGI 2.2.0",
                            "status": "OK",
                            "tokensPerSecond": "10.6"
                        }
                    ]
                },
                {
                    "region": "us-west-2",
                    "instanceType": "c7i.4xlarge",
                    "cloud": "AWS",
                    "gpu": "-",
                    "gpuRAM": "-",
                    "quantization": "none",
                    "tgi": "TGI 2.2.0",
                    "status": "waiting for quota",
                    "tokensPerSecond": "-"
                },
                {
                    "region": "us-west-2",
                    "instanceType": "g5.xlarge",
                    "cloud": "AWS",
                    "gpu": "1xNVIDIA A10G",
                    "gpuRAM": "24 GB",
                    "configurations": [
                        {
                            "quantization": "none",
                            "tgi": "TGI 2.2.0",
                            "status": "OK",
                            "tokensPerSecond": "110"
                        },
                        {
                            "quantization": "none",
                            "tgi": "DJL 0.28 vLLM",
                            "status": "OK",
                            "tokensPerSecond": "105",
                            "notes": "\"OPTION_MAX_MODEL_LEN\": \"32768\","
                        }
                    ]
                },
                {
                    "region": "us-west-2",
                    "instanceType": "g6e.2xlarge",
                    "cloud": "AWS",
                    "gpu": "1xNVIDIA L40S",
                    "gpuRAM": "48 GB",
                    "quantization": "none",
                    "tgi": "TGI 2.2.0",
                    "status": "OK",
                    "tokensPerSecond": "160"
                }
            ]
        },
475
+ {
476
+ "name": "Arcee-Scribe",
477
+ "modelType": "InternLM2.5 8B",
478
+ "configurations": [
479
+ {
480
+ "cloud": "us-west-2",
481
+ "instanceType": "g5.2xlarge",
482
+ "gpu": "1xNVIDIA A10G",
483
+ "gpuRAM": "24 GB",
484
+ "quantization": "none",
485
+ "tgi": "DJL 0.28 vLLM",
486
+ "status": "OK",
487
+ "tokensPerSecond": 29,
488
+ "notes": '"OPTION_MAX_MODEL_LEN": "32768",\n"TENSOR_PARALLEL_DEGREE": "max",'
489
+ },
490
+ {
491
+ "cloud": "us-west-2",
492
+ "instanceType": "g5.12xlarge",
493
+ "gpu": "4xNVIDIA A10G",
494
+ "gpuRAM": "96 GB",
495
+ "quantization": "none",
496
+ "tgi": "DJL 0.28 vLLM",
497
+ "status": "OK",
498
+ "tokensPerSecond": 65,
499
+ "notes": '"OPTION_MAX_MODEL_LEN": "32768",\n"TENSOR_PARALLEL_DEGREE": "max",\nNot supported by AutoAWQ and AutoGPTQ'
500
+ },
501
+ {
502
+ "cloud": "us-west-2",
503
+ "instanceType": "g5.48xlarge",
504
+ "gpu": "8xNVIDIA A10G",
505
+ "gpuRAM": "192 GB",
506
+ "quantization": "none",
507
+ "tgi": "DJL 0.28 vLLM",
508
+ "status": "OK",
509
+ "tokensPerSecond": 80,
510
+ "notes": '"OPTION_MAX_MODEL_LEN": "32768",\n"TENSOR_PARALLEL_DEGREE": "max",'
511
+ },
512
+ {
513
+ "cloud": "us-west-2",
514
+ "instanceType": "g6.2xlarge",
515
+ "gpu": "1xNVIDIA L4",
516
+ "gpuRAM": "24 GB",
517
+ "quantization": "none",
518
+ "tgi": "DJL 0.28 vLLM",
519
+ "status": "OK",
520
+ "tokensPerSecond": 16,
521
+ "notes": '"OPTION_MAX_MODEL_LEN": "4096"'
522
+ },
523
+ {
524
+ "cloud": "us-west-2",
525
+ "instanceType": "g6.12xlarge",
526
+ "gpu": "4xNVIDIA L4",
527
+ "gpuRAM": "96 GB",
528
+ "quantization": "none",
529
+ "tgi": "DJL 0.28 vLLM",
530
+ "status": "OK",
531
+ "tokensPerSecond": 50,
532
+ "notes": '"OPTION_MAX_MODEL_LEN": "32768",\n"TENSOR_PARALLEL_DEGREE": "max",'
533
+ },
534
+ {
535
+ "cloud": "us-west-2",
536
+ "instanceType": "g6.48xlarge",
537
+ "gpu": "8xNVIDIA L4",
538
+ "gpuRAM": "192 GB",
539
+ "quantization": "none",
540
+ "tgi": "DJL 0.28 vLLM",
541
+ "status": "OK",
542
+ "tokensPerSecond": 69,
543
+ "notes": '"OPTION_MAX_MODEL_LEN": "32768",\n"TENSOR_PARALLEL_DEGREE": "max",'
544
+ },
545
+ {
546
+ "cloud": "us-west-2",
547
+ "instanceType": "p4d.24xlarge",
548
+ "gpu": "4xNVIDIA A100",
549
+ "gpuRAM": "320 GB",
550
+ "quantization": "none",
551
+ "tgi": "DJL 0.28 vLLM",
552
+ "status": "OK",
553
+ "tokensPerSecond": 82,
554
+ "notes": '"OPTION_MAX_MODEL_LEN": "32768",\n"TENSOR_PARALLEL_DEGREE": "max",'
555
+ }
556
+ ]
557
+ }
558
+ ]
559
+ }
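
Not part of the commit: a small sketch of how this nested structure can be walked, mirroring the flattening logic in app.py. Each configuration either carries its measurements directly, or nests several quantization variants under its own "configurations" list.

from results import results

for model in results["models"]:
    rows = 0
    for config in model.get("configurations", []):
        # nested shape: one row per quantization variant; flat shape: one row
        rows += len(config["configurations"]) if "configurations" in config else 1
    print(f"{model['name']}: {rows} benchmark row(s)")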