Julien Simon committed
Commit 75e81c7
1 Parent(s): 6f23d6c
Files changed (4)
  1. .gitignore +70 -0
  2. app.py +13 -15
  3. requirements.txt +1 -0
  4. results.py +101 -56
.gitignore ADDED
@@ -0,0 +1,70 @@
+ # Python
+ __pycache__/
+ *.py[cod]
+ *$py.class
+ *.so
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ .python-version
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pytest
+ .pytest_cache/
+
+ # Coverage reports
+ htmlcov/
+ .coverage
+ .coverage.*
+ coverage.xml
+ *.cover
+
+ # VS Code
+ .vscode/
+
+ # PyCharm
+ .idea/
+
+ # Jupyter Book
+ _build/
+
+ # macOS
+ .DS_Store
+
+ # Windows
+ Thumbs.db
app.py CHANGED
@@ -60,10 +60,6 @@ def display_results(model_name):
      """
      Process and display results for a given model.

-     This function retrieves model data, processes it, and formats it for display.
-     It handles nested configurations, merges data from multiple models if necessary,
-     and sorts the results by instance type.
-
      Args:
          model_name (str): Name of the model to display results for.

@@ -86,10 +82,10 @@ def display_results(model_name):
          merged_models.add(model.get('name', 'Unknown'))
          for config in model.get('configurations', []):
              try:
-                 instance_type = config['instanceType']
                  cloud = config.get('cloud', 'N/A')
-                 key = (instance_type, cloud)
-
+                 instance_type = config.get('instanceType', 'N/A')
+                 key = (cloud, instance_type)
+
                  if 'configurations' in config:
                      for nested_config in config['configurations']:
                          nested_key = key + (nested_config.get('quantization', 'N/A'),)
@@ -100,24 +96,26 @@ def display_results(model_name):
                              "GPU RAM": config.get('gpuRAM', 'N/A'),
                              "Status": nested_config.get('status', 'N/A'),
                              "Quantization": nested_config.get('quantization', 'N/A'),
-                             "TGI": nested_config.get('tgi', 'N/A'),
+                             "Container": nested_config.get('container', nested_config.get('tgi', 'N/A')),
                              "Tokens per Second": nested_config.get('tokensPerSecond', 'N/A'),
-                             "Notes": nested_config.get('notes', '')
+                             "Notes": nested_config.get('notes', ''),
                          }
                  else:
-                     data[key] = {
+                     # Generate a unique key for each configuration
+                     unique_key = key + (config.get('quantization', 'N/A'), len(data))
+                     data[unique_key] = {
                          "Cloud": cloud,
                          "Instance Type": instance_type,
                          "GPU": config.get('gpu', 'N/A'),
                          "GPU RAM": config.get('gpuRAM', 'N/A'),
                          "Status": config.get('status', 'N/A'),
                          "Quantization": config.get('quantization', 'N/A'),
-                         "TGI": config.get('tgi', 'N/A'),
+                         "Container": config.get('container', config.get('tgi', 'N/A')),
                          "Tokens per Second": config.get('tokensPerSecond', 'N/A'),
-                         "Notes": config.get('notes', '')
+                         "Notes": config.get('notes', ''),
                      }
-             except KeyError as e:
-                 logging.error(f"KeyError in config: {e}")
+             except Exception as e:
+                 print(f"Error processing configuration: {e}")
                  continue

      if not data:
@@ -165,7 +163,7 @@ def display_results(model_name):

  with gr.Blocks() as demo:
      gr.Markdown("# Model Benchmark Results")
-     gr.Markdown("This table shows the benchmark results for each model. [TGI](https://huggingface.co/docs/text-generation-inference/reference/launcher) and [vLLM](https://docs.djl.ai/master/docs/serving/serving/docs/lmi/user_guides/vllm_user_guide.html) settings are default unless noted.")
+     gr.Markdown("This table shows the benchmark results for each model. Container settings ([TGI](https://huggingface.co/docs/text-generation-inference/reference/launcher), [vLLM](https://docs.djl.ai/master/docs/serving/serving/docs/lmi/user_guides/vllm_user_guide.html), etc.) are default unless noted.")
      model_dropdown = gr.Dropdown(choices=get_model_names(), label="Select Model")

      results_text = gr.Markdown()
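
For readers skimming the app.py change above: two behaviors are introduced there. Each row now reads the container field and falls back to the legacy tgi field, and flat (non-nested) configurations get a uniqueness suffix so two benchmarks on the same cloud and instance type no longer overwrite each other. Below is a minimal, self-contained sketch of that logic; build_rows, sample_model, and the sample values are illustrative only and are not part of the commit.

# Minimal sketch (hypothetical helper, not from the commit) of the new row-building logic.
def build_rows(model):
    data = {}
    for config in model.get("configurations", []):
        cloud = config.get("cloud", "N/A")
        instance_type = config.get("instanceType", "N/A")
        # New behavior: prefer "container", fall back to the legacy "tgi" field.
        container = config.get("container", config.get("tgi", "N/A"))
        # New behavior: add quantization and a running index so identical
        # (cloud, instance type) rows do not collide.
        unique_key = (cloud, instance_type, config.get("quantization", "N/A"), len(data))
        data[unique_key] = {
            "Cloud": cloud,
            "Instance Type": instance_type,
            "Container": container,
            "Tokens per Second": config.get("tokensPerSecond", "N/A"),
        }
    return data

# Illustrative input: two runs on the same instance type, one still using the old "tgi" key.
sample_model = {
    "configurations": [
        {"cloud": "AWS", "instanceType": "g5.2xlarge", "tgi": "TGI 2.2.0", "tokensPerSecond": "29"},
        {"cloud": "AWS", "instanceType": "g5.2xlarge", "container": "vLLM 0.5.5", "tokensPerSecond": "31"},
    ]
}
print(build_rows(sample_model))  # both rows survive, and the first still reports its container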
requirements.txt ADDED
@@ -0,0 +1 @@
+ gradio
results.py CHANGED
@@ -11,7 +11,7 @@ results = {
  "gpu": "4xNVIDIA A10G",
  "gpuRAM": "96 GB",
  "quantization": "awq",
- "tgi": "TGI 2.2.0",
+ "container": "TGI 2.2.0",
  "status": "OK",
  "tokensPerSecond": "33",
  "notes": "",
@@ -23,7 +23,7 @@ results = {
  "gpu": "4xNVIDIA A100",
  "gpuRAM": "320 GB",
  "quantization": "none",
- "tgi": "TGI 2.2.0",
+ "container": "TGI 2.2.0",
  "status": "OK",
  "tokensPerSecond": "38",
  "notes": "",
@@ -41,7 +41,7 @@ results = {
  "gpu": "4xNVIDIA A10G",
  "gpuRAM": "96 GB",
  "quantization": "awq",
- "tgi": "TGI 2.2.0",
+ "container": "TGI 2.2.0",
  "status": "OK",
  "tokensPerSecond": "33",
  "notes": "",
@@ -53,7 +53,7 @@ results = {
  "gpu": "4xNVIDIA A100",
  "gpuRAM": "320 GB",
  "quantization": "none",
- "tgi": "TGI 2.2.0",
+ "container": "TGI 2.2.0",
  "status": "OK",
  "tokensPerSecond": "38",
  "notes": "",
@@ -72,7 +72,7 @@ results = {
  "gpu": "4xNVIDIA T4",
  "gpuRAM": "64 GB",
  "quantization": "bitsandbytes-nf4",
- "tgi": "TGI 2.2.0",
+ "container": "TGI 2.2.0",
  "status": "KO",
  "tokensPerSecond": "-",
  "notes": "Flash Attention requires Ampere GPUs or newer",
@@ -86,26 +86,26 @@ results = {
  "configurations": [
  {
  "quantization": "bitsandbytes-nf4",
- "tgi": "TGI 2.2.0",
+ "container": "TGI 2.2.0",
  "status": "OK",
  "tokensPerSecond": "12",
  },
  {
  "quantization": "bitsandbytes-fp4",
- "tgi": "TGI 2.2.0",
+ "container": "TGI 2.2.0",
  "status": "OK",
  "tokensPerSecond": "12",
  },
  {
  "quantization": "bitsandbytes (int8)",
- "tgi": "TGI 2.2.0",
+ "container": "TGI 2.2.0",
  "status": "KO",
  "tokensPerSecond": "-",
  "notes": "CUDA OOM",
  },
  {
  "quantization": "eetq (int8)",
- "tgi": "TGI 2.2.0",
+ "container": "TGI 2.2.0",
  "status": "KO",
  "tokensPerSecond": "-",
  "notes": "[FT Error] Heurisitc failed to find a valid config.",
@@ -121,26 +121,26 @@ results = {
  "configurations": [
  {
  "quantization": "none",
- "tgi": "TGI 2.2.0",
+ "container": "TGI 2.2.0",
  "status": "KO",
  "tokensPerSecond": "-",
  "notes": "CUDA OOM (but g6.48xlarge works!)",
  },
  {
  "quantization": "bitsandbytes-nf4",
- "tgi": "TGI 2.2.0",
+ "container": "TGI 2.2.0",
  "status": "OK",
  "tokensPerSecond": "12.3",
  },
  {
  "quantization": "bitsandbytes-fp4",
- "tgi": "TGI 2.2.0",
+ "container": "TGI 2.2.0",
  "status": "OK",
  "tokensPerSecond": "12.5",
  },
  {
  "quantization": "bitsandbytes (int8)",
- "tgi": "TGI 2.2.0",
+ "container": "TGI 2.2.0",
  "status": "KO",
  "tokensPerSecond": "-",
  "notes": "The model deploys, but inference times out.",
@@ -156,21 +156,21 @@ results = {
  "configurations": [
  {
  "quantization": "bitsandbytes-nf4",
- "tgi": "TGI 2.2.0",
+ "container": "TGI 2.2.0",
  "status": "OK",
  "tokensPerSecond": "1.5-2",
  "notes": "Too slow, timeouts are likely",
  },
  {
  "quantization": "bitsandbytes-fp4",
- "tgi": "TGI 2.2.0",
+ "container": "TGI 2.2.0",
  "status": "OK",
  "tokensPerSecond": "2",
  "notes": "Too slow, timeouts are likely",
  },
  {
  "quantization": "bitsandbytes (int8)",
- "tgi": "TGI 2.2.0",
+ "container": "TGI 2.2.0",
  "status": "KO",
  "tokensPerSecond": "-",
  "notes": "CUDA OOM",
@@ -184,7 +184,7 @@ results = {
  "gpu": "8xNVIDIA L4",
  "gpuRAM": "192 GB",
  "quantization": "none",
- "tgi": "TGI 2.2.0",
+ "container": "TGI 2.2.0",
  "status": "OK",
  "tokensPerSecond": "12",
  },
@@ -195,7 +195,7 @@ results = {
  "gpu": "8xNVIDIA A100",
  "gpuRAM": "320 GB",
  "quantization": "none",
- "tgi": "TGI 2.2.0",
+ "container": "TGI 2.2.0",
  "status": "OK",
  "tokensPerSecond": "40",
  "notes": '"MAX_INPUT_LENGTH": "16384", "MAX_TOTAL_TOKENS": "32768",',
@@ -207,7 +207,7 @@ results = {
  "gpu": "8xNVIDIA A100",
  "gpuRAM": "320 GB",
  "quantization": "none",
- "tgi": "TGI 2.2.0",
+ "container": "TGI 2.2.0",
  "status": "waiting for quota",
  },
  {
@@ -217,7 +217,7 @@ results = {
  "gpu": "8xNVIDIA H100",
  "gpuRAM": "640GB",
  "quantization": "none",
- "tgi": "TGI 2.2.0",
+ "container": "TGI 2.2.0",
  "status": "OK",
  "tokensPerSecond": "58",
  "notes": '"MAX_INPUT_LENGTH": "16384", "MAX_TOTAL_TOKENS": "32768",',
@@ -227,7 +227,7 @@ results = {
  "instanceType": "inf2.*",
  "cloud": "AWS",
  "gpu": "-",
- "tgi": "TGI 2.2.0",
+ "container": "TGI 2.2.0",
  "status": "not supported",
  "tokensPerSecond": "-",
  "notes": "Qwen2: TGI OK, Neuron SDK KO, optimum-neuron KO",
@@ -245,7 +245,7 @@ results = {
  "gpu": "1xNVIDIA A10G",
  "gpuRAM": "24 GB",
  "quantization": "none",
- "tgi": "TGI 2.2.0",
+ "container": "TGI 2.2.0",
  "status": "OK",
  "tokensPerSecond": "29",
  "notes": "4K/8K fails",
@@ -257,7 +257,7 @@ results = {
  "gpu": "4xNVIDIA A10G",
  "gpuRAM": "96 GB",
  "quantization": "none",
- "tgi": "TGI 2.2.0",
+ "container": "TGI 2.2.0",
  "status": "OK",
  "tokensPerSecond": "85",
  "notes": '"MAX_INPUT_TOKENS": "16384", "MAX_TOTAL_TOKENS": "32768",',
@@ -269,7 +269,7 @@ results = {
  "gpu": "8xNVIDIA A10G",
  "gpuRAM": "192 GB",
  "quantization": "none",
- "tgi": "TGI 2.2.0",
+ "container": "TGI 2.2.0",
  "status": "OK",
  "tokensPerSecond": "105",
  "notes": '"MAX_INPUT_TOKENS": "20480", "MAX_TOTAL_TOKENS": "40960"\n\n32K/64K fails',
@@ -283,11 +283,11 @@ results = {
  "configurations": [
  {
  "quantization": "none",
- "tgi": "TGI 2.2.0",
+ "container": "TGI 2.2.0",
  "status": "OK",
  "tokensPerSecond": "15",
  },
- {"quantization": "fp8", "tgi": "TGI 2.2.0"},
+ {"quantization": "fp8", "container": "TGI 2.2.0"},
  ],
  },
  {
@@ -297,7 +297,7 @@ results = {
  "gpu": "4xNVIDIA L4",
  "gpuRAM": "96 GB",
  "quantization": "none",
- "tgi": "TGI 2.2.0",
+ "container": "TGI 2.2.0",
  "status": "OK",
  "tokensPerSecond": "51",
  "notes": "same as g5?",
@@ -309,7 +309,7 @@ results = {
  "gpu": "8xNVIDIA L4",
  "gpuRAM": "192 GB",
  "quantization": "none",
- "tgi": "TGI 2.2.0",
+ "container": "TGI 2.2.0",
  "status": "OK",
  "tokensPerSecond": "81",
  "notes": "same as g5?",
@@ -321,7 +321,7 @@ results = {
  "gpu": "1xNVIDIA L40S",
  "gpuRAM": "48 GB",
  "quantization": "none",
- "tgi": "TGI 2.2.0",
+ "container": "TGI 2.2.0",
  "status": "OK",
  "tokensPerSecond": "42.1",
  },
@@ -332,9 +332,20 @@ results = {
  "gpu": "1xNVIDIA L40S",
  "gpuRAM": "48 GB",
  "quantization": "none",
- "tgi": "SGLang 0.2.13",
+ "container": "SGLang 0.2.13",
  "status": "OK",
- "tokensPerSecond": "45",
+ "tokensPerSecond": "45"
+ },
+ {
+ "region": "AWS",
+ "instanceType": "g6e.2xlarge",
+ "cloud": "AWS",
+ "gpu": "1xNVIDIA L40S",
+ "gpuRAM": "48 GB",
+ "quantization": "none",
+ "container": "vLLM 0.5.5",
+ "status": "OK",
+ "tokensPerSecond": "43.4"
  },
  {
  "region": "AWS",
@@ -343,7 +354,7 @@ results = {
  "gpu": "4xNVIDIA A100",
  "gpuRAM": "320 GB",
  "quantization": "none",
- "tgi": "TGI 2.2.0",
+ "container": "TGI 2.2.0",
  "status": "OK",
  "tokensPerSecond": "145",
  "notes": '"MAX_INPUT_TOKENS": "40960", "MAX_TOTAL_TOKENS": "81920"\n\n64K/128K fails (even with 4-bit)',
@@ -371,7 +382,7 @@ results = {
  "gpu": "1xNVIDIA A10G",
  "gpuRAM": "24 GB",
  "quantization": "none",
- "tgi": "TGI 2.2.0",
+ "container": "TGI 2.2.0",
  "status": "OK",
  "tokensPerSecond": "30",
  },
@@ -382,7 +393,7 @@ results = {
  "gpu": "4xNVIDIA A10G",
  "gpuRAM": "96 GB",
  "quantization": "none",
- "tgi": "TGI 2.2.0",
+ "container": "TGI 2.2.0",
  "status": "OK",
  "tokensPerSecond": "83",
  },
@@ -393,7 +404,7 @@ results = {
  "gpu": "8xNVIDIA A10G",
  "gpuRAM": "192 GB",
  "quantization": "none",
- "tgi": "TGI 2.2.0",
+ "container": "TGI 2.2.0",
  "status": "KO",
  "tokensPerSecond": "-",
  "notes": "ValueError: `num_heads` must be divisible by `num_shards` (got `num_heads`: 28 and `num_shards`: 8\n\nSM_NUM_GPUS=7 doesn't work either because tensor size ares not a multiple of 7 (e.g., 512)",
@@ -405,7 +416,7 @@ results = {
  "gpu": "1xNVIDIA L4",
  "gpuRAM": "24 GB",
  "quantization": "none",
- "tgi": "TGI 2.2.0",
+ "container": "TGI 2.2.0",
  "status": "OK",
  "tokensPerSecond": "16.3",
  },
@@ -416,7 +427,7 @@ results = {
  "gpu": "4xNVIDIA L4",
  "gpuRAM": "96 GB",
  "quantization": "none",
- "tgi": "TGI 2.2.0",
+ "container": "TGI 2.2.0",
  "status": "OK",
  "tokensPerSecond": "54.2",
  },
@@ -425,14 +436,48 @@ results = {
  "instanceType": "inf2.*",
  "cloud": "AWS",
  "gpu": "-",
- "tgi": "TGI 2.2.0",
+ "container": "TGI 2.2.0",
  "status": "not supported",
  "tokensPerSecond": "-",
  "notes": "Qwen2: TGI OK, Neuron SDK KO, optimum-neuron KO",
  },
+ {
+ "region": "us-west-2",
+ "instanceType": "g6e.2xlarge",
+ "cloud": "AWS",
+ "gpu": "1xNVIDIA L40S",
+ "gpuRAM": "48 GB",
+ "quantization": "none",
+ "container": "TGI 2.2.0",
+ "status": "OK",
+ "tokensPerSecond": "45",
+ },
+ {
+ "region": "us-west-2",
+ "instanceType": "g6e.2xlarge",
+ "cloud": "AWS",
+ "gpu": "1xNVIDIA L40S",
+ "gpuRAM": "48 GB",
+ "quantization": "none",
+ "container": "SGLang 0.2.13",
+ "status": "OK",
+ "tokensPerSecond": "48",
+ },
+ {
+ "region": "us-west-2",
+ "instanceType": "g6e.2xlarge",
+ "cloud": "AWS",
+ "gpu": "1xNVIDIA L40S",
+ "gpuRAM": "48 GB",
+ "quantization": "none",
+ "container": "vLLM 0.5.5",
+ "status": "OK",
+ "tokensPerSecond": "45.7",
+ },
  ],
  },
- {"name": "Arcee-Spark", "modelType": "Qwen2 7B"},
+ {"name": "Arcee-Spark",
+ "modelType": "Qwen2 7B"},
  {
  "name": "Arcee-Lite",
  "modelType": "Qwen2 1.5B distilled from phi-3-medium 14B",
@@ -444,7 +489,7 @@ results = {
  "gpu": "-",
  "gpuRAM": "-",
  "quantization": "bitsandbytes-nf4",
- "tgi": "TGI 2.2.0",
+ "container": "TGI 2.2.0",
  "status": "KO",
  "tokensPerSecond": "-",
  "notes": "OOM, might work with a prequantized model",
@@ -456,7 +501,7 @@ results = {
  "gpu": "-",
  "gpuRAM": "-",
  "quantization": "bitsandbytes-nf4",
- "tgi": "TGI 2.2.0",
+ "container": "TGI 2.2.0",
  "status": "KO",
  "tokensPerSecond": "-",
  "notes": "OOM, might work with a prequantized model",
@@ -470,19 +515,19 @@ results = {
  "configurations": [
  {
  "quantization": "none",
- "tgi": "TGI 2.2.0",
+ "container": "TGI 2.2.0",
  "status": "OK",
  "tokensPerSecond": "10.7",
  },
  {
  "quantization": "bitsandbytes (int8)",
- "tgi": "TGI 2.2.0",
+ "container": "TGI 2.2.0",
  "status": "OK",
  "tokensPerSecond": "10.5",
  },
  {
  "quantization": "bitsandbytes-nf4",
- "tgi": "TGI 2.2.0",
+ "container": "TGI 2.2.0",
  "status": "OK",
  "tokensPerSecond": "10.6",
  },
@@ -495,7 +540,7 @@ results = {
  "gpu": "-",
  "gpuRAM": "-",
  "quantization": "none",
- "tgi": "TGI 2.2.0",
+ "container": "TGI 2.2.0",
  "status": "waiting for quota",
  "tokensPerSecond": "-",
  },
@@ -508,13 +553,13 @@ results = {
  "configurations": [
  {
  "quantization": "none",
- "tgi": "TGI 2.2.0",
+ "container": "TGI 2.2.0",
  "status": "OK",
  "tokensPerSecond": "110",
  },
  {
  "quantization": "none",
- "tgi": "DJL 0.28 vLLM",
+ "container": "DJL 0.28 vLLM",
  "status": "OK",
  "tokensPerSecond": "105",
  "notes": '"OPTION_MAX_MODEL_LEN": "32768",',
@@ -528,7 +573,7 @@ results = {
  "gpu": "1xNVIDIA L40S",
  "gpuRAM": "48 GB",
  "quantization": "none",
- "tgi": "TGI 2.2.0",
+ "container": "TGI 2.2.0",
  "status": "OK",
  "tokensPerSecond": "160",
  },
@@ -544,7 +589,7 @@ results = {
  "gpu": "1xNVIDIA A10G",
  "gpuRAM": "24 GB",
  "quantization": "none",
- "tgi": "DJL 0.28 vLLM",
+ "container": "DJL 0.28 vLLM",
  "status": "OK",
  "tokensPerSecond": 29,
  "notes": '"OPTION_MAX_MODEL_LEN": "32768",\n"TENSOR_PARALLEL_DEGREE": "max",',
@@ -555,7 +600,7 @@ results = {
  "gpu": "4xNVIDIA A10G",
  "gpuRAM": "96 GB",
  "quantization": "none",
- "tgi": "DJL 0.28 vLLM",
+ "container": "DJL 0.28 vLLM",
  "status": "OK",
  "tokensPerSecond": 65,
  "notes": '"OPTION_MAX_MODEL_LEN": "32768",\n"TENSOR_PARALLEL_DEGREE": "max",\nNot supported by AutoAWQ and AutoGPTQ',
@@ -566,7 +611,7 @@ results = {
  "gpu": "8xNVIDIA A10G",
  "gpuRAM": "192 GB",
  "quantization": "none",
- "tgi": "DJL 0.28 vLLM",
+ "container": "DJL 0.28 vLLM",
  "status": "OK",
  "tokensPerSecond": 80,
  "notes": '"OPTION_MAX_MODEL_LEN": "32768",\n"TENSOR_PARALLEL_DEGREE": "max",',
@@ -577,7 +622,7 @@ results = {
  "gpu": "1xNVIDIA L4",
  "gpuRAM": "24 GB",
  "quantization": "none",
- "tgi": "DJL 0.28 vLLM",
+ "container": "DJL 0.28 vLLM",
  "status": "OK",
  "tokensPerSecond": 16,
  "notes": '"OPTION_MAX_MODEL_LEN": "4096"',
@@ -588,7 +633,7 @@ results = {
  "gpu": "4xNVIDIA L4",
  "gpuRAM": "96 GB",
  "quantization": "none",
- "tgi": "DJL 0.28 vLLM",
+ "container": "DJL 0.28 vLLM",
  "status": "OK",
  "tokensPerSecond": 50,
  "notes": '"OPTION_MAX_MODEL_LEN": "32768",\n"TENSOR_PARALLEL_DEGREE": "max",',
@@ -599,7 +644,7 @@ results = {
  "gpu": "8xNVIDIA L4",
  "gpuRAM": "192 GB",
  "quantization": "none",
- "tgi": "DJL 0.28 vLLM",
+ "container": "DJL 0.28 vLLM",
  "status": "OK",
  "tokensPerSecond": 69,
  "notes": '"OPTION_MAX_MODEL_LEN": "32768",\n"TENSOR_PARALLEL_DEGREE": "max",',
@@ -610,7 +655,7 @@ results = {
  "gpu": "1xNVIDIA L40S",
  "gpuRAM": "48 GB",
  "quantization": "none",
- "tgi": "SGLang 0.2.13",
+ "container": "SGLang 0.2.13",
  "status": "OK",
  "tokensPerSecond": 46,
  },
@@ -620,7 +665,7 @@ results = {
  "gpu": "4xNVIDIA A100",
  "gpuRAM": "320 GB",
  "quantization": "none",
- "tgi": "DJL 0.28 vLLM",
+ "container": "DJL 0.28 vLLM",
  "status": "OK",
  "tokensPerSecond": 82,
  "notes": '"OPTION_MAX_MODEL_LEN": "32768",\n"TENSOR_PARALLEL_DEGREE": "max",',