Julien Simon committed
Commit 75e81c7
1 Parent(s): 6f23d6c
Files changed (4)
  1. .gitignore +70 -0
  2. app.py +13 -15
  3. requirements.txt +1 -0
  4. results.py +101 -56
.gitignore ADDED
@@ -0,0 +1,70 @@
+ # Python
+ __pycache__/
+ *.py[cod]
+ *$py.class
+ *.so
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ .python-version
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pytest
+ .pytest_cache/
+
+ # Coverage reports
+ htmlcov/
+ .coverage
+ .coverage.*
+ coverage.xml
+ *.cover
+
+ # VS Code
+ .vscode/
+
+ # PyCharm
+ .idea/
+
+ # Jupyter Book
+ _build/
+
+ # macOS
+ .DS_Store
+
+ # Windows
+ Thumbs.db
app.py CHANGED
@@ -60,10 +60,6 @@ def display_results(model_name):
      """
      Process and display results for a given model.

-     This function retrieves model data, processes it, and formats it for display.
-     It handles nested configurations, merges data from multiple models if necessary,
-     and sorts the results by instance type.
-
      Args:
          model_name (str): Name of the model to display results for.

@@ -86,10 +82,10 @@ def display_results(model_name):
          merged_models.add(model.get('name', 'Unknown'))
          for config in model.get('configurations', []):
              try:
-                 instance_type = config['instanceType']
                  cloud = config.get('cloud', 'N/A')
-                 key = (instance_type, cloud)
-
+                 instance_type = config.get('instanceType', 'N/A')
+                 key = (cloud, instance_type)
+
                  if 'configurations' in config:
                      for nested_config in config['configurations']:
                          nested_key = key + (nested_config.get('quantization', 'N/A'),)
@@ -100,24 +96,26 @@ def display_results(model_name):
                              "GPU RAM": config.get('gpuRAM', 'N/A'),
                              "Status": nested_config.get('status', 'N/A'),
                              "Quantization": nested_config.get('quantization', 'N/A'),
-                             "TGI": nested_config.get('tgi', 'N/A'),
+                             "Container": nested_config.get('container', nested_config.get('tgi', 'N/A')),
                              "Tokens per Second": nested_config.get('tokensPerSecond', 'N/A'),
-                             "Notes": nested_config.get('notes', '')
+                             "Notes": nested_config.get('notes', ''),
                          }
                  else:
-                     data[key] = {
+                     # Generate a unique key for each configuration
+                     unique_key = key + (config.get('quantization', 'N/A'), len(data))
+                     data[unique_key] = {
                          "Cloud": cloud,
                          "Instance Type": instance_type,
                          "GPU": config.get('gpu', 'N/A'),
                          "GPU RAM": config.get('gpuRAM', 'N/A'),
                          "Status": config.get('status', 'N/A'),
                          "Quantization": config.get('quantization', 'N/A'),
-                         "TGI": config.get('tgi', 'N/A'),
+                         "Container": config.get('container', config.get('tgi', 'N/A')),
                          "Tokens per Second": config.get('tokensPerSecond', 'N/A'),
-                         "Notes": config.get('notes', '')
+                         "Notes": config.get('notes', ''),
                      }
-             except KeyError as e:
-                 logging.error(f"KeyError in config: {e}")
+             except Exception as e:
+                 print(f"Error processing configuration: {e}")
                  continue

      if not data:
@@ -165,7 +163,7 @@ def display_results(model_name):

  with gr.Blocks() as demo:
      gr.Markdown("# Model Benchmark Results")
-     gr.Markdown("This table shows the benchmark results for each model. [TGI](https://huggingface.co/docs/text-generation-inference/reference/launcher) and [vLLM](https://docs.djl.ai/master/docs/serving/serving/docs/lmi/user_guides/vllm_user_guide.html) settings are default unless noted.")
+     gr.Markdown("This table shows the benchmark results for each model. Container settings ([TGI](https://huggingface.co/docs/text-generation-inference/reference/launcher), [vLLM](https://docs.djl.ai/master/docs/serving/serving/docs/lmi/user_guides/vllm_user_guide.html), etc.) are default unless noted.")
      model_dropdown = gr.Dropdown(choices=get_model_names(), label="Select Model")

      results_text = gr.Markdown()
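
For readers skimming the app.py change above: two behaviors are introduced there. Each row now reads the container field and falls back to the legacy tgi field, and flat (non-nested) configurations get a uniqueness suffix so two benchmarks on the same cloud and instance type no longer overwrite each other. Below is a minimal, self-contained sketch of that logic; build_rows, sample_model, and the sample values are illustrative only and are not part of the commit.

# Minimal sketch (hypothetical helper, not from the commit) of the new row-building logic.
def build_rows(model):
    data = {}
    for config in model.get("configurations", []):
        cloud = config.get("cloud", "N/A")
        instance_type = config.get("instanceType", "N/A")
        # New behavior: prefer "container", fall back to the legacy "tgi" field.
        container = config.get("container", config.get("tgi", "N/A"))
        # New behavior: add quantization and a running index so identical
        # (cloud, instance type) rows do not collide.
        unique_key = (cloud, instance_type, config.get("quantization", "N/A"), len(data))
        data[unique_key] = {
            "Cloud": cloud,
            "Instance Type": instance_type,
            "Container": container,
            "Tokens per Second": config.get("tokensPerSecond", "N/A"),
        }
    return data

# Illustrative input: two runs on the same instance type, one still using the old "tgi" key.
sample_model = {
    "configurations": [
        {"cloud": "AWS", "instanceType": "g5.2xlarge", "tgi": "TGI 2.2.0", "tokensPerSecond": "29"},
        {"cloud": "AWS", "instanceType": "g5.2xlarge", "container": "vLLM 0.5.5", "tokensPerSecond": "31"},
    ]
}
print(build_rows(sample_model))  # both rows survive, and the first still reports its container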
requirements.txt ADDED
@@ -0,0 +1 @@
+ gradio
results.py CHANGED
@@ -11,7 +11,7 @@ results = {
  "gpu": "4xNVIDIA A10G",
  "gpuRAM": "96 GB",
  "quantization": "awq",
- "tgi": "TGI 2.2.0",
+ "container": "TGI 2.2.0",
  "status": "OK",
  "tokensPerSecond": "33",
  "notes": "",
@@ -23,7 +23,7 @@ results = {
  "gpu": "4xNVIDIA A100",
  "gpuRAM": "320 GB",
  "quantization": "none",
- "tgi": "TGI 2.2.0",
+ "container": "TGI 2.2.0",
  "status": "OK",
  "tokensPerSecond": "38",
  "notes": "",
@@ -41,7 +41,7 @@ results = {
  "gpu": "4xNVIDIA A10G",
  "gpuRAM": "96 GB",
  "quantization": "awq",
- "tgi": "TGI 2.2.0",
+ "container": "TGI 2.2.0",
  "status": "OK",
  "tokensPerSecond": "33",
  "notes": "",
@@ -53,7 +53,7 @@ results = {
  "gpu": "4xNVIDIA A100",
  "gpuRAM": "320 GB",
  "quantization": "none",
- "tgi": "TGI 2.2.0",
+ "container": "TGI 2.2.0",
  "status": "OK",
  "tokensPerSecond": "38",
  "notes": "",
@@ -72,7 +72,7 @@ results = {
  "gpu": "4xNVIDIA T4",
  "gpuRAM": "64 GB",
  "quantization": "bitsandbytes-nf4",
- "tgi": "TGI 2.2.0",
+ "container": "TGI 2.2.0",
  "status": "KO",
  "tokensPerSecond": "-",
  "notes": "Flash Attention requires Ampere GPUs or newer",
@@ -86,26 +86,26 @@ results = {
  "configurations": [
  {
  "quantization": "bitsandbytes-nf4",
- "tgi": "TGI 2.2.0",
+ "container": "TGI 2.2.0",
  "status": "OK",
  "tokensPerSecond": "12",
  },
  {
  "quantization": "bitsandbytes-fp4",
- "tgi": "TGI 2.2.0",
+ "container": "TGI 2.2.0",
  "status": "OK",
  "tokensPerSecond": "12",
  },
  {
  "quantization": "bitsandbytes (int8)",
- "tgi": "TGI 2.2.0",
+ "container": "TGI 2.2.0",
  "status": "KO",
  "tokensPerSecond": "-",
  "notes": "CUDA OOM",
  },
  {
  "quantization": "eetq (int8)",
- "tgi": "TGI 2.2.0",
+ "container": "TGI 2.2.0",
  "status": "KO",
  "tokensPerSecond": "-",
  "notes": "[FT Error] Heurisitc failed to find a valid config.",
@@ -121,26 +121,26 @@ results = {
  "configurations": [
  {
  "quantization": "none",
- "tgi": "TGI 2.2.0",
+ "container": "TGI 2.2.0",
  "status": "KO",
  "tokensPerSecond": "-",
  "notes": "CUDA OOM (but g6.48xlarge works!)",
  },
  {
  "quantization": "bitsandbytes-nf4",
- "tgi": "TGI 2.2.0",
+ "container": "TGI 2.2.0",
  "status": "OK",
  "tokensPerSecond": "12.3",
  },
  {
  "quantization": "bitsandbytes-fp4",
- "tgi": "TGI 2.2.0",
+ "container": "TGI 2.2.0",
  "status": "OK",
  "tokensPerSecond": "12.5",
  },
  {
  "quantization": "bitsandbytes (int8)",
- "tgi": "TGI 2.2.0",
+ "container": "TGI 2.2.0",
  "status": "KO",
  "tokensPerSecond": "-",
  "notes": "The model deploys, but inference times out.",
@@ -156,21 +156,21 @@ results = {
  "configurations": [
  {
  "quantization": "bitsandbytes-nf4",
- "tgi": "TGI 2.2.0",
+ "container": "TGI 2.2.0",
  "status": "OK",
  "tokensPerSecond": "1.5-2",
  "notes": "Too slow, timeouts are likely",
  },
  {
  "quantization": "bitsandbytes-fp4",
- "tgi": "TGI 2.2.0",
+ "container": "TGI 2.2.0",
  "status": "OK",
  "tokensPerSecond": "2",
  "notes": "Too slow, timeouts are likely",
  },
  {
  "quantization": "bitsandbytes (int8)",
- "tgi": "TGI 2.2.0",
+ "container": "TGI 2.2.0",
  "status": "KO",
  "tokensPerSecond": "-",
  "notes": "CUDA OOM",
@@ -184,7 +184,7 @@ results = {
  "gpu": "8xNVIDIA L4",
  "gpuRAM": "192 GB",
  "quantization": "none",
- "tgi": "TGI 2.2.0",
+ "container": "TGI 2.2.0",
  "status": "OK",
  "tokensPerSecond": "12",
  },
@@ -195,7 +195,7 @@ results = {
  "gpu": "8xNVIDIA A100",
  "gpuRAM": "320 GB",
  "quantization": "none",
- "tgi": "TGI 2.2.0",
+ "container": "TGI 2.2.0",
  "status": "OK",
  "tokensPerSecond": "40",
  "notes": '"MAX_INPUT_LENGTH": "16384", "MAX_TOTAL_TOKENS": "32768",',
@@ -207,7 +207,7 @@ results = {
  "gpu": "8xNVIDIA A100",
  "gpuRAM": "320 GB",
  "quantization": "none",
- "tgi": "TGI 2.2.0",
+ "container": "TGI 2.2.0",
  "status": "waiting for quota",
  },
  {
@@ -217,7 +217,7 @@ results = {
  "gpu": "8xNVIDIA H100",
  "gpuRAM": "640GB",
  "quantization": "none",
- "tgi": "TGI 2.2.0",
+ "container": "TGI 2.2.0",
  "status": "OK",
  "tokensPerSecond": "58",
  "notes": '"MAX_INPUT_LENGTH": "16384", "MAX_TOTAL_TOKENS": "32768",',
@@ -227,7 +227,7 @@ results = {
  "instanceType": "inf2.*",
  "cloud": "AWS",
  "gpu": "-",
- "tgi": "TGI 2.2.0",
+ "container": "TGI 2.2.0",
  "status": "not supported",
  "tokensPerSecond": "-",
  "notes": "Qwen2: TGI OK, Neuron SDK KO, optimum-neuron KO",
@@ -245,7 +245,7 @@ results = {
  "gpu": "1xNVIDIA A10G",
  "gpuRAM": "24 GB",
  "quantization": "none",
- "tgi": "TGI 2.2.0",
+ "container": "TGI 2.2.0",
  "status": "OK",
  "tokensPerSecond": "29",
  "notes": "4K/8K fails",
@@ -257,7 +257,7 @@ results = {
  "gpu": "4xNVIDIA A10G",
  "gpuRAM": "96 GB",
  "quantization": "none",
- "tgi": "TGI 2.2.0",
+ "container": "TGI 2.2.0",
  "status": "OK",
  "tokensPerSecond": "85",
  "notes": '"MAX_INPUT_TOKENS": "16384", "MAX_TOTAL_TOKENS": "32768",',
@@ -269,7 +269,7 @@ results = {
  "gpu": "8xNVIDIA A10G",
  "gpuRAM": "192 GB",
  "quantization": "none",
- "tgi": "TGI 2.2.0",
+ "container": "TGI 2.2.0",
  "status": "OK",
  "tokensPerSecond": "105",
  "notes": '"MAX_INPUT_TOKENS": "20480", "MAX_TOTAL_TOKENS": "40960"\n\n32K/64K fails',
@@ -283,11 +283,11 @@ results = {
  "configurations": [
  {
  "quantization": "none",
- "tgi": "TGI 2.2.0",
+ "container": "TGI 2.2.0",
  "status": "OK",
  "tokensPerSecond": "15",
  },
- {"quantization": "fp8", "tgi": "TGI 2.2.0"},
+ {"quantization": "fp8", "container": "TGI 2.2.0"},
  ],
  },
  {
@@ -297,7 +297,7 @@ results = {
  "gpu": "4xNVIDIA L4",
  "gpuRAM": "96 GB",
  "quantization": "none",
- "tgi": "TGI 2.2.0",
+ "container": "TGI 2.2.0",
  "status": "OK",
  "tokensPerSecond": "51",
  "notes": "same as g5?",
@@ -309,7 +309,7 @@ results = {
  "gpu": "8xNVIDIA L4",
  "gpuRAM": "192 GB",
  "quantization": "none",
- "tgi": "TGI 2.2.0",
+ "container": "TGI 2.2.0",
  "status": "OK",
  "tokensPerSecond": "81",
  "notes": "same as g5?",
@@ -321,7 +321,7 @@ results = {
  "gpu": "1xNVIDIA L40S",
  "gpuRAM": "48 GB",
  "quantization": "none",
- "tgi": "TGI 2.2.0",
+ "container": "TGI 2.2.0",
  "status": "OK",
  "tokensPerSecond": "42.1",
  },
@@ -332,9 +332,20 @@ results = {
  "gpu": "1xNVIDIA L40S",
  "gpuRAM": "48 GB",
  "quantization": "none",
- "tgi": "SGLang 0.2.13",
+ "container": "SGLang 0.2.13",
  "status": "OK",
- "tokensPerSecond": "45",
+ "tokensPerSecond": "45"
+ },
+ {
+ "region": "AWS",
+ "instanceType": "g6e.2xlarge",
+ "cloud": "AWS",
+ "gpu": "1xNVIDIA L40S",
+ "gpuRAM": "48 GB",
+ "quantization": "none",
+ "container": "vLLM 0.5.5",
+ "status": "OK",
+ "tokensPerSecond": "43.4"
  },
  {
  "region": "AWS",
@@ -343,7 +354,7 @@ results = {
  "gpu": "4xNVIDIA A100",
  "gpuRAM": "320 GB",
  "quantization": "none",
- "tgi": "TGI 2.2.0",
+ "container": "TGI 2.2.0",
  "status": "OK",
  "tokensPerSecond": "145",
  "notes": '"MAX_INPUT_TOKENS": "40960", "MAX_TOTAL_TOKENS": "81920"\n\n64K/128K fails (even with 4-bit)',
@@ -371,7 +382,7 @@ results = {
  "gpu": "1xNVIDIA A10G",
  "gpuRAM": "24 GB",
  "quantization": "none",
- "tgi": "TGI 2.2.0",
+ "container": "TGI 2.2.0",
  "status": "OK",
  "tokensPerSecond": "30",
  },
@@ -382,7 +393,7 @@ results = {
  "gpu": "4xNVIDIA A10G",
  "gpuRAM": "96 GB",
  "quantization": "none",
- "tgi": "TGI 2.2.0",
+ "container": "TGI 2.2.0",
  "status": "OK",
  "tokensPerSecond": "83",
  },
@@ -393,7 +404,7 @@ results = {
  "gpu": "8xNVIDIA A10G",
  "gpuRAM": "192 GB",
  "quantization": "none",
- "tgi": "TGI 2.2.0",
+ "container": "TGI 2.2.0",
  "status": "KO",
  "tokensPerSecond": "-",
  "notes": "ValueError: `num_heads` must be divisible by `num_shards` (got `num_heads`: 28 and `num_shards`: 8\n\nSM_NUM_GPUS=7 doesn't work either because tensor size ares not a multiple of 7 (e.g., 512)",
@@ -405,7 +416,7 @@ results = {
  "gpu": "1xNVIDIA L4",
  "gpuRAM": "24 GB",
  "quantization": "none",
- "tgi": "TGI 2.2.0",
+ "container": "TGI 2.2.0",
  "status": "OK",
  "tokensPerSecond": "16.3",
  },
@@ -416,7 +427,7 @@ results = {
  "gpu": "4xNVIDIA L4",
  "gpuRAM": "96 GB",
  "quantization": "none",
- "tgi": "TGI 2.2.0",
+ "container": "TGI 2.2.0",
  "status": "OK",
  "tokensPerSecond": "54.2",
  },
@@ -425,14 +436,48 @@ results = {
  "instanceType": "inf2.*",
  "cloud": "AWS",
  "gpu": "-",
- "tgi": "TGI 2.2.0",
+ "container": "TGI 2.2.0",
  "status": "not supported",
  "tokensPerSecond": "-",
  "notes": "Qwen2: TGI OK, Neuron SDK KO, optimum-neuron KO",
  },
+ {
+ "region": "us-west-2",
+ "instanceType": "g6e.2xlarge",
+ "cloud": "AWS",
+ "gpu": "1xNVIDIA L40S",
+ "gpuRAM": "48 GB",
+ "quantization": "none",
+ "container": "TGI 2.2.0",
+ "status": "OK",
+ "tokensPerSecond": "45",
+ },
+ {
+ "region": "us-west-2",
+ "instanceType": "g6e.2xlarge",
+ "cloud": "AWS",
+ "gpu": "1xNVIDIA L40S",
+ "gpuRAM": "48 GB",
+ "quantization": "none",
+ "container": "SGLang 0.2.13",
+ "status": "OK",
+ "tokensPerSecond": "48",
+ },
+ {
+ "region": "us-west-2",
+ "instanceType": "g6e.2xlarge",
+ "cloud": "AWS",
+ "gpu": "1xNVIDIA L40S",
+ "gpuRAM": "48 GB",
+ "quantization": "none",
+ "container": "vLLM 0.5.5",
+ "status": "OK",
+ "tokensPerSecond": "45.7",
+ },
  ],
  },
- {"name": "Arcee-Spark", "modelType": "Qwen2 7B"},
+ {"name": "Arcee-Spark",
+ "modelType": "Qwen2 7B"},
  {
  "name": "Arcee-Lite",
  "modelType": "Qwen2 1.5B distilled from phi-3-medium 14B",
@@ -444,7 +489,7 @@ results = {
  "gpu": "-",
  "gpuRAM": "-",
  "quantization": "bitsandbytes-nf4",
- "tgi": "TGI 2.2.0",
+ "container": "TGI 2.2.0",
  "status": "KO",
  "tokensPerSecond": "-",
  "notes": "OOM, might work with a prequantized model",
@@ -456,7 +501,7 @@ results = {
  "gpu": "-",
  "gpuRAM": "-",
  "quantization": "bitsandbytes-nf4",
- "tgi": "TGI 2.2.0",
+ "container": "TGI 2.2.0",
  "status": "KO",
  "tokensPerSecond": "-",
  "notes": "OOM, might work with a prequantized model",
@@ -470,19 +515,19 @@ results = {
  "configurations": [
  {
  "quantization": "none",
- "tgi": "TGI 2.2.0",
+ "container": "TGI 2.2.0",
  "status": "OK",
  "tokensPerSecond": "10.7",
  },
  {
  "quantization": "bitsandbytes (int8)",
- "tgi": "TGI 2.2.0",
+ "container": "TGI 2.2.0",
  "status": "OK",
  "tokensPerSecond": "10.5",
  },
  {
  "quantization": "bitsandbytes-nf4",
- "tgi": "TGI 2.2.0",
+ "container": "TGI 2.2.0",
  "status": "OK",
  "tokensPerSecond": "10.6",
  },
@@ -495,7 +540,7 @@ results = {
  "gpu": "-",
  "gpuRAM": "-",
  "quantization": "none",
- "tgi": "TGI 2.2.0",
+ "container": "TGI 2.2.0",
  "status": "waiting for quota",
  "tokensPerSecond": "-",
  },
@@ -508,13 +553,13 @@ results = {
  "configurations": [
  {
  "quantization": "none",
- "tgi": "TGI 2.2.0",
+ "container": "TGI 2.2.0",
  "status": "OK",
  "tokensPerSecond": "110",
  },
  {
  "quantization": "none",
- "tgi": "DJL 0.28 vLLM",
+ "container": "DJL 0.28 vLLM",
  "status": "OK",
  "tokensPerSecond": "105",
  "notes": '"OPTION_MAX_MODEL_LEN": "32768",',
@@ -528,7 +573,7 @@ results = {
  "gpu": "1xNVIDIA L40S",
  "gpuRAM": "48 GB",
  "quantization": "none",
- "tgi": "TGI 2.2.0",
+ "container": "TGI 2.2.0",
  "status": "OK",
  "tokensPerSecond": "160",
  },
@@ -544,7 +589,7 @@ results = {
  "gpu": "1xNVIDIA A10G",
  "gpuRAM": "24 GB",
  "quantization": "none",
- "tgi": "DJL 0.28 vLLM",
+ "container": "DJL 0.28 vLLM",
  "status": "OK",
  "tokensPerSecond": 29,
  "notes": '"OPTION_MAX_MODEL_LEN": "32768",\n"TENSOR_PARALLEL_DEGREE": "max",',
@@ -555,7 +600,7 @@ results = {
  "gpu": "4xNVIDIA A10G",
  "gpuRAM": "96 GB",
  "quantization": "none",
- "tgi": "DJL 0.28 vLLM",
+ "container": "DJL 0.28 vLLM",
  "status": "OK",
  "tokensPerSecond": 65,
  "notes": '"OPTION_MAX_MODEL_LEN": "32768",\n"TENSOR_PARALLEL_DEGREE": "max",\nNot supported by AutoAWQ and AutoGPTQ',
@@ -566,7 +611,7 @@ results = {
  "gpu": "8xNVIDIA A10G",
  "gpuRAM": "192 GB",
  "quantization": "none",
- "tgi": "DJL 0.28 vLLM",
+ "container": "DJL 0.28 vLLM",
  "status": "OK",
  "tokensPerSecond": 80,
  "notes": '"OPTION_MAX_MODEL_LEN": "32768",\n"TENSOR_PARALLEL_DEGREE": "max",',
@@ -577,7 +622,7 @@ results = {
  "gpu": "1xNVIDIA L4",
  "gpuRAM": "24 GB",
  "quantization": "none",
- "tgi": "DJL 0.28 vLLM",
+ "container": "DJL 0.28 vLLM",
  "status": "OK",
  "tokensPerSecond": 16,
  "notes": '"OPTION_MAX_MODEL_LEN": "4096"',
@@ -588,7 +633,7 @@ results = {
  "gpu": "4xNVIDIA L4",
  "gpuRAM": "96 GB",
  "quantization": "none",
- "tgi": "DJL 0.28 vLLM",
+ "container": "DJL 0.28 vLLM",
  "status": "OK",
  "tokensPerSecond": 50,
  "notes": '"OPTION_MAX_MODEL_LEN": "32768",\n"TENSOR_PARALLEL_DEGREE": "max",',
@@ -599,7 +644,7 @@ results = {
  "gpu": "8xNVIDIA L4",
  "gpuRAM": "192 GB",
  "quantization": "none",
- "tgi": "DJL 0.28 vLLM",
+ "container": "DJL 0.28 vLLM",
  "status": "OK",
  "tokensPerSecond": 69,
  "notes": '"OPTION_MAX_MODEL_LEN": "32768",\n"TENSOR_PARALLEL_DEGREE": "max",',
@@ -610,7 +655,7 @@ results = {
  "gpu": "1xNVIDIA L40S",
  "gpuRAM": "48 GB",
  "quantization": "none",
- "tgi": "SGLang 0.2.13",
+ "container": "SGLang 0.2.13",
  "status": "OK",
  "tokensPerSecond": 46,
  },
@@ -620,7 +665,7 @@ results = {
  "gpu": "4xNVIDIA A100",
  "gpuRAM": "320 GB",
  "quantization": "none",
- "tgi": "DJL 0.28 vLLM",
+ "container": "DJL 0.28 vLLM",
  "status": "OK",
  "tokensPerSecond": 82,
  "notes": '"OPTION_MAX_MODEL_LEN": "32768",\n"TENSOR_PARALLEL_DEGREE": "max",',