Julien Simon committed on
Commit 7200b01 · 1 Parent(s): 48660ea

Initial version

Files changed (2)
  1. app.py +181 -0
  2. results.py +559 -0
app.py ADDED
@@ -0,0 +1,181 @@
import logging
import re

import gradio as gr
import pandas as pd

from results import results

logging.basicConfig(level=logging.DEBUG)


def get_model_names():
    """
    Retrieve a sorted list of model names from the results data.

    Returns:
        list: Sorted list of model names.
    """
    return sorted([model['name'] for model in results['models']])


def get_models_by_architecture(model_name):
    """
    Retrieve models with the same architecture as the specified model.

    Args:
        model_name (str): Name of the model to match architecture.

    Returns:
        list: List of models with the same architecture.
    """
    selected_model = next((m for m in results['models'] if m['name'] == model_name), None)
    if not selected_model:
        return []

    model_type = selected_model.get('modelType', '')
    return [m for m in results['models'] if m.get('modelType', '') == model_type]


def custom_sort_key(instance_type):
    """
    Generate a custom sorting key for instance types.

    Args:
        instance_type (str): The instance type to generate a key for.

    Returns:
        tuple: A tuple used for sorting, containing (family, size_index).
    """
    size_order = ['xlarge', '2xlarge', '4xlarge', '8xlarge', '12xlarge', '16xlarge', '24xlarge', '48xlarge']

    match = re.match(r'([a-z]+\d+)\.(\w+)', instance_type)
    if match:
        family, size = match.groups()
        return (family, size_order.index(size) if size in size_order else len(size_order))
    return (instance_type, 0)  # Fallback for non-standard instance types


def display_results(model_name):
    """
    Process and display results for a given model.

    This function retrieves model data, processes it, and formats it for display.
    It handles nested configurations, merges data from multiple models if necessary,
    and sorts the results by instance type.

    Args:
        model_name (str): Name of the model to display results for.

    Returns:
        tuple: A tuple containing:
            - str: Markdown formatted string with model information.
            - pandas.DataFrame: Styled DataFrame with the results.
    """
    try:
        models = get_models_by_architecture(model_name)
        if not models:
            logging.warning(f"No models found for {model_name}")
            return f"No results found for the selected model: {model_name}", pd.DataFrame()

        model_type = models[0].get('modelType', 'N/A')
        data = {}
        merged_models = set()

        for model in models:
            merged_models.add(model.get('name', 'Unknown'))
            for config in model.get('configurations', []):
                try:
                    instance_type = config['instanceType']
                    cloud = config.get('cloud', 'N/A')
                    key = (instance_type, cloud)

                    if 'configurations' in config:
                        for nested_config in config['configurations']:
                            nested_key = key + (nested_config.get('quantization', 'N/A'),)
                            data[nested_key] = {
                                "Cloud": cloud,
                                "Instance Type": instance_type,
                                "GPU": config.get('gpu', 'N/A'),
                                "GPU RAM": config.get('gpuRAM', 'N/A'),
                                "Status": nested_config.get('status', 'N/A'),
                                "Quantization": nested_config.get('quantization', 'N/A'),
                                "TGI": nested_config.get('tgi', 'N/A'),
                                "Tokens per Second": nested_config.get('tokensPerSecond', 'N/A'),
                                "Notes": nested_config.get('notes', '')
                            }
                    else:
                        data[key] = {
                            "Cloud": cloud,
                            "Instance Type": instance_type,
                            "GPU": config.get('gpu', 'N/A'),
                            "GPU RAM": config.get('gpuRAM', 'N/A'),
                            "Status": config.get('status', 'N/A'),
                            "Quantization": config.get('quantization', 'N/A'),
                            "TGI": config.get('tgi', 'N/A'),
                            "Tokens per Second": config.get('tokensPerSecond', 'N/A'),
                            "Notes": config.get('notes', '')
                        }
                except KeyError as e:
                    logging.error(f"KeyError in config: {e}")
                    continue

        if not data:
            logging.warning(f"No data extracted for {model_name}")
            return f"No data could be extracted for the selected model: {model_name}", pd.DataFrame()

        # Merge data if there are conflicts
        for key, value in data.items():
            for field in value:
                if value[field] == 'N/A':
                    for other_key, other_value in data.items():
                        if other_key[0] == key[0] and other_value[field] != 'N/A':
                            value[field] = other_value[field]
                            break

        # Filter out rows where Status is 'N/A'
        data = {k: v for k, v in data.items() if v['Status'] != 'N/A'}

        merged_models_message = f"Note: Results merged from models: {', '.join(merged_models)}" if len(merged_models) > 1 else None

        # Sort the data by instance type
        sorted_data = sorted(data.values(), key=lambda x: custom_sort_key(x['Instance Type']))

        results = f"## Results for {model_name}\n\nModel Type: {model_type}"
        if merged_models_message:
            results += f"\n\n{merged_models_message}"

        df = pd.DataFrame(sorted_data)

        def color_status(val):
            if val == 'OK':
                return 'background-color: green; color: white'
            elif val == 'KO':
                return 'background-color: red; color: white'
            else:
                return ''

        styled_df = df.style.applymap(color_status, subset=['Status'])

        return results, styled_df

    except Exception as e:
        logging.exception(f"Error in display_results: {e}")
        return f"An error occurred while processing results for {model_name}: {str(e)}", pd.DataFrame()


with gr.Blocks() as demo:
    gr.Markdown("# Model Benchmark Results")
    gr.Markdown("This table shows the benchmark results for each model. [TGI](https://huggingface.co/docs/text-generation-inference/reference/launcher) and [vLLM](https://docs.djl.ai/master/docs/serving/serving/docs/lmi/user_guides/vllm_user_guide.html) settings are default unless noted.")
    model_dropdown = gr.Dropdown(choices=get_model_names(), label="Select Model")

    results_text = gr.Markdown()
    results_output = gr.DataFrame(label="Results")

    model_dropdown.change(
        display_results,
        inputs=[model_dropdown],
        outputs=[results_text, results_output]
    )

if __name__ == "__main__":
    demo.launch()
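
A quick way to exercise this code outside the Gradio UI is sketched below. It is not part of the commit, and it assumes gradio and pandas are installed and that the script is run from the repository root so that results.py is importable.

from app import display_results

# display_results returns a Markdown string plus a pandas Styler
markdown, styled_df = display_results("Arcee-Nova")
print(markdown)        # "## Results for Arcee-Nova" plus the model type line
print(styled_df.data)  # the underlying DataFrame behind the Styler
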
results.py ADDED
@@ -0,0 +1,559 @@
results = {
    "models": [
        {
            "name": "Arcee-Meraj",
            "modelType": "Qwen2 72B"
        },
        {
            "name": "Arcee-Nova",
            "modelType": "Qwen2 72B",
            "notes": "",
            "configurations": [
                {
                    "region": "us-west-2",
                    "instanceType": "g4dn.12xlarge",
                    "cloud": "AWS",
                    "gpu": "4xNVIDIA T4",
                    "gpuRAM": "64 GB",
                    "quantization": "bitsandbytes-nf4",
                    "tgi": "TGI 2.2.0",
                    "status": "KO",
                    "tokensPerSecond": "-",
                    "notes": "Flash Attention requires Ampere GPUs or newer"
                },
                {
                    "region": "us-west-2",
                    "instanceType": "g5.12xlarge",
                    "cloud": "AWS",
                    "gpu": "4xNVIDIA A10G",
                    "gpuRAM": "96 GB",
                    "configurations": [
                        {
                            "quantization": "bitsandbytes-nf4",
                            "tgi": "TGI 2.2.0",
                            "status": "OK",
                            "tokensPerSecond": "12"
                        },
                        {
                            "quantization": "bitsandbytes-fp4",
                            "tgi": "TGI 2.2.0",
                            "status": "OK",
                            "tokensPerSecond": "12"
                        },
                        {
                            "quantization": "bitsandbytes (int8)",
                            "tgi": "TGI 2.2.0",
                            "status": "KO",
                            "tokensPerSecond": "-",
                            "notes": "CUDA OOM"
                        },
                        {
                            "quantization": "eetq (int8)",
                            "tgi": "TGI 2.2.0",
                            "status": "KO",
                            "tokensPerSecond": "-",
                            "notes": "[FT Error] Heurisitc failed to find a valid config."
                        }
                    ]
                },
                {
                    "region": "us-west-2",
                    "instanceType": "g5.48xlarge",
                    "cloud": "AWS",
                    "gpu": "8xNVIDIA A10G",
                    "gpuRAM": "192 GB",
                    "configurations": [
                        {
                            "quantization": "none",
                            "tgi": "TGI 2.2.0",
                            "status": "KO",
                            "tokensPerSecond": "-",
                            "notes": "CUDA OOM (but g6.48xlarge works!)"
                        },
                        {
                            "quantization": "bitsandbytes-nf4",
                            "tgi": "TGI 2.2.0",
                            "status": "OK",
                            "tokensPerSecond": "12.3"
                        },
                        {
                            "quantization": "bitsandbytes-fp4",
                            "tgi": "TGI 2.2.0",
                            "status": "OK",
                            "tokensPerSecond": "12.5"
                        },
                        {
                            "quantization": "bitsandbytes (int8)",
                            "tgi": "TGI 2.2.0",
                            "status": "KO",
                            "tokensPerSecond": "-",
                            "notes": "The model deploys, but inference times out."
                        }
                    ]
                },
                {
                    "region": "us-west-2",
                    "instanceType": "g6.12xlarge",
                    "cloud": "AWS",
                    "gpu": "4xNVIDIA L4",
                    "gpuRAM": "96 GB",
                    "configurations": [
                        {
                            "quantization": "bitsandbytes-nf4",
                            "tgi": "TGI 2.2.0",
                            "status": "OK",
                            "tokensPerSecond": "1.5-2",
                            "notes": "Too slow, timeouts are likely"
                        },
                        {
                            "quantization": "bitsandbytes-fp4",
                            "tgi": "TGI 2.2.0",
                            "status": "OK",
                            "tokensPerSecond": "2",
                            "notes": "Too slow, timeouts are likely"
                        },
                        {
                            "quantization": "bitsandbytes (int8)",
                            "tgi": "TGI 2.2.0",
                            "status": "KO",
                            "tokensPerSecond": "-",
                            "notes": "CUDA OOM"
                        }
                    ]
                },
                {
                    "region": "us-west-2",
                    "instanceType": "g6.48xlarge",
                    "cloud": "AWS",
                    "gpu": "8xNVIDIA L4",
                    "gpuRAM": "192 GB",
                    "quantization": "none",
                    "tgi": "TGI 2.2.0",
                    "status": "OK",
                    "tokensPerSecond": "12"
                },
                {
                    "region": "us-west-2",
                    "instanceType": "p4d.24xlarge",
                    "cloud": "AWS",
                    "gpu": "8xNVIDIA A100",
                    "gpuRAM": "320 GB",
                    "quantization": "none",
                    "tgi": "TGI 2.2.0",
                    "status": "OK",
                    "tokensPerSecond": "40",
                    "notes": "\"MAX_INPUT_LENGTH\": \"16384\", \"MAX_TOTAL_TOKENS\": \"32768\","
                },
                {
                    "region": "us-west-2",
                    "instanceType": "p4de.24xlarge",
                    "cloud": "AWS",
                    "gpu": "8xNVIDIA A100",
                    "gpuRAM": "320 GB",
                    "quantization": "none",
                    "tgi": "TGI 2.2.0",
                    "status": "waiting for quota"
                },
                {
                    "region": "us-west-2",
                    "instanceType": "p5.48xlarge",
                    "cloud": "AWS",
                    "gpu": "8xNVIDIA H100",
                    "gpuRAM": "640 GB",
                    "quantization": "none",
                    "tgi": "TGI 2.2.0",
                    "status": "OK",
                    "tokensPerSecond": "58",
                    "notes": "\"MAX_INPUT_LENGTH\": \"16384\", \"MAX_TOTAL_TOKENS\": \"32768\","
                },
                {
                    "region": "us-west-2",
                    "instanceType": "inf2.*",
                    "cloud": "AWS",
                    "gpu": "-",
                    "tgi": "TGI 2.2.0",
                    "status": "not supported",
                    "tokensPerSecond": "-",
                    "notes": "Qwen2: TGI OK, Neuron SDK KO, optimum-neuron KO"
                }
            ]
        },
        {
            "name": "Llama-Spark",
            "modelType": "Llama 3.1 8B",
            "configurations": [
                {
                    "region": "us-west-2",
                    "instanceType": "g5.2xlarge",
                    "cloud": "AWS",
                    "gpu": "1xNVIDIA A10G",
                    "gpuRAM": "24 GB",
                    "quantization": "none",
                    "tgi": "TGI 2.2.0",
                    "status": "OK",
                    "tokensPerSecond": "29",
                    "notes": "4K/8K fails"
                },
                {
                    "region": "us-west-2",
                    "instanceType": "g5.12xlarge",
                    "cloud": "AWS",
                    "gpu": "4xNVIDIA A10G",
                    "gpuRAM": "96 GB",
                    "quantization": "none",
                    "tgi": "TGI 2.2.0",
                    "status": "OK",
                    "tokensPerSecond": "85",
                    "notes": "\"MAX_INPUT_TOKENS\": \"16384\", \"MAX_TOTAL_TOKENS\": \"32768\","
                },
                {
                    "region": "us-west-2",
                    "instanceType": "g5.48xlarge",
                    "cloud": "AWS",
                    "gpu": "8xNVIDIA A10G",
                    "gpuRAM": "192 GB",
                    "quantization": "none",
                    "tgi": "TGI 2.2.0",
                    "status": "OK",
                    "tokensPerSecond": "105",
                    "notes": "\"MAX_INPUT_TOKENS\": \"20480\", \"MAX_TOTAL_TOKENS\": \"40960\"\n\n32K/64K fails"
                },
                {
                    "region": "us-west-2",
                    "instanceType": "g6.2xlarge",
                    "cloud": "AWS",
                    "gpu": "1xNVIDIA L4",
                    "gpuRAM": "24 GB",
                    "configurations": [
                        {
                            "quantization": "none",
                            "tgi": "TGI 2.2.0",
                            "status": "OK",
                            "tokensPerSecond": "15"
                        },
                        {
                            "quantization": "fp8",
                            "tgi": "TGI 2.2.0"
                        }
                    ]
                },
                {
                    "region": "us-west-2",
                    "instanceType": "g6.12xlarge",
                    "cloud": "AWS",
                    "gpu": "4xNVIDIA L4",
                    "gpuRAM": "96 GB",
                    "quantization": "none",
                    "tgi": "TGI 2.2.0",
                    "status": "OK",
                    "tokensPerSecond": "51",
                    "notes": "same as g5?"
                },
                {
                    "region": "us-west-2",
                    "instanceType": "g6.48xlarge",
                    "cloud": "AWS",
                    "gpu": "8xNVIDIA L4",
                    "gpuRAM": "192 GB",
                    "quantization": "none",
                    "tgi": "TGI 2.2.0",
                    "status": "OK",
                    "tokensPerSecond": "81",
                    "notes": "same as g5?"
                },
                {
                    "region": "us-west-2",
                    "instanceType": "g6e.2xlarge",
                    "cloud": "AWS",
                    "gpu": "1xNVIDIA L40S",
                    "gpuRAM": "48 GB",
                    "quantization": "none",
                    "tgi": "TGI 2.2.0",
                    "status": "OK",
                    "tokensPerSecond": "42"
                },
                {
                    "region": "us-west-2",
                    "instanceType": "p4d.24xlarge",
                    "cloud": "AWS",
                    "gpu": "4xNVIDIA A100",
                    "gpuRAM": "320 GB",
                    "quantization": "none",
                    "tgi": "TGI 2.2.0",
                    "status": "OK",
                    "tokensPerSecond": "145",
                    "notes": "\"MAX_INPUT_TOKENS\": \"40960\", \"MAX_TOTAL_TOKENS\": \"81920\"\n\n64K/128K fails (even with 4-bit)"
                },
                {
                    "region": "us-west-2",
                    "instanceType": "inf2.*",
                    "cloud": "AWS",
                    "gpu": "-",
                    "status": "not supported",
                    "tokensPerSecond": "-",
                    "notes": "Llama-3.1: TGI OK, Neuron SDK OK, optimum-neuron KO"
                }
            ]
        },
        {
            "name": "Arcee-Agent",
            "modelType": "Qwen2 7B",
            "notes": "",
            "configurations": [
                {
                    "region": "us-west-2",
                    "instanceType": "g5.2xlarge",
                    "cloud": "AWS",
                    "gpu": "1xNVIDIA A10G",
                    "gpuRAM": "24 GB",
                    "quantization": "none",
                    "tgi": "TGI 2.2.0",
                    "status": "OK",
                    "tokensPerSecond": "30"
                },
                {
                    "region": "us-west-2",
                    "instanceType": "g5.12xlarge",
                    "cloud": "AWS",
                    "gpu": "4xNVIDIA A10G",
                    "gpuRAM": "96 GB",
                    "quantization": "none",
                    "tgi": "TGI 2.2.0",
                    "status": "OK",
                    "tokensPerSecond": "83"
                },
                {
                    "region": "us-west-2",
                    "instanceType": "g5.48xlarge",
                    "cloud": "AWS",
                    "gpu": "8xNVIDIA A10G",
                    "gpuRAM": "192 GB",
                    "quantization": "none",
                    "tgi": "TGI 2.2.0",
                    "status": "KO",
                    "tokensPerSecond": "-",
                    "notes": "ValueError: `num_heads` must be divisible by `num_shards` (got `num_heads`: 28 and `num_shards`: 8)\n\nSM_NUM_GPUS=7 doesn't work either because tensor sizes are not a multiple of 7 (e.g., 512)"
                },
                {
                    "region": "us-west-2",
                    "instanceType": "g6.2xlarge",
                    "cloud": "AWS",
                    "gpu": "1xNVIDIA L4",
                    "gpuRAM": "24 GB",
                    "quantization": "none",
                    "tgi": "TGI 2.2.0",
                    "status": "OK",
                    "tokensPerSecond": "16.3"
                },
                {
                    "region": "us-west-2",
                    "instanceType": "g6.12xlarge",
                    "cloud": "AWS",
                    "gpu": "4xNVIDIA L4",
                    "gpuRAM": "96 GB",
                    "quantization": "none",
                    "tgi": "TGI 2.2.0",
                    "status": "OK",
                    "tokensPerSecond": "54.2"
                },
                {
                    "region": "us-west-2",
                    "instanceType": "inf2.*",
                    "cloud": "AWS",
                    "gpu": "-",
                    "tgi": "TGI 2.2.0",
                    "status": "not supported",
                    "tokensPerSecond": "-",
                    "notes": "Qwen2: TGI OK, Neuron SDK KO, optimum-neuron KO"
                }
            ]
        },
        {
            "name": "Arcee-Spark",
            "modelType": "Qwen2 7B"
        },
        {
            "name": "Arcee-Lite",
            "modelType": "Qwen2 1.5B distilled from phi-3-medium 14B",
            "configurations": [
                {
                    "region": "us-west-2",
                    "instanceType": "c6i.xlarge",
                    "cloud": "AWS",
                    "gpu": "-",
                    "gpuRAM": "-",
                    "quantization": "bitsandbytes-nf4",
                    "tgi": "TGI 2.2.0",
                    "status": "KO",
                    "tokensPerSecond": "-",
                    "notes": "OOM, might work with a prequantized model"
                },
                {
                    "region": "us-west-2",
                    "instanceType": "c6i.2xlarge",
                    "cloud": "AWS",
                    "gpu": "-",
                    "gpuRAM": "-",
                    "quantization": "bitsandbytes-nf4",
                    "tgi": "TGI 2.2.0",
                    "status": "KO",
                    "tokensPerSecond": "-",
                    "notes": "OOM, might work with a prequantized model"
                },
                {
                    "region": "us-west-2",
                    "instanceType": "c6i.4xlarge",
                    "cloud": "AWS",
                    "gpu": "-",
                    "gpuRAM": "-",
                    "configurations": [
                        {
                            "quantization": "none",
                            "tgi": "TGI 2.2.0",
                            "status": "OK",
                            "tokensPerSecond": "10.7"
                        },
                        {
                            "quantization": "bitsandbytes (int8)",
                            "tgi": "TGI 2.2.0",
                            "status": "OK",
                            "tokensPerSecond": "10.5"
                        },
                        {
                            "quantization": "bitsandbytes-nf4",
                            "tgi": "TGI 2.2.0",
                            "status": "OK",
                            "tokensPerSecond": "10.6"
                        }
                    ]
                },
                {
                    "region": "us-west-2",
                    "instanceType": "c7i.4xlarge",
                    "cloud": "AWS",
                    "gpu": "-",
                    "gpuRAM": "-",
                    "quantization": "none",
                    "tgi": "TGI 2.2.0",
                    "status": "waiting for quota",
                    "tokensPerSecond": "-"
                },
                {
                    "region": "us-west-2",
                    "instanceType": "g5.xlarge",
                    "cloud": "AWS",
                    "gpu": "1xNVIDIA A10G",
                    "gpuRAM": "24 GB",
                    "configurations": [
                        {
                            "quantization": "none",
                            "tgi": "TGI 2.2.0",
                            "status": "OK",
                            "tokensPerSecond": "110"
                        },
                        {
                            "quantization": "none",
                            "tgi": "DJL 0.28 vLLM",
                            "status": "OK",
                            "tokensPerSecond": "105",
                            "notes": "\"OPTION_MAX_MODEL_LEN\": \"32768\","
                        }
                    ]
                },
                {
                    "region": "us-west-2",
                    "instanceType": "g6e.2xlarge",
                    "cloud": "AWS",
                    "gpu": "1xNVIDIA L40S",
                    "gpuRAM": "48 GB",
                    "quantization": "none",
                    "tgi": "TGI 2.2.0",
                    "status": "OK",
                    "tokensPerSecond": "160"
                }
            ]
        },
475
+ {
476
+ "name": "Arcee-Scribe",
477
+ "modelType": "InternLM2.5 8B",
478
+ "configurations": [
479
+ {
480
+ "cloud": "us-west-2",
481
+ "instanceType": "g5.2xlarge",
482
+ "gpu": "1xNVIDIA A10G",
483
+ "gpuRAM": "24 GB",
484
+ "quantization": "none",
485
+ "tgi": "DJL 0.28 vLLM",
486
+ "status": "OK",
487
+ "tokensPerSecond": 29,
488
+ "notes": '"OPTION_MAX_MODEL_LEN": "32768",\n"TENSOR_PARALLEL_DEGREE": "max",'
489
+ },
490
+ {
491
+ "cloud": "us-west-2",
492
+ "instanceType": "g5.12xlarge",
493
+ "gpu": "4xNVIDIA A10G",
494
+ "gpuRAM": "96 GB",
495
+ "quantization": "none",
496
+ "tgi": "DJL 0.28 vLLM",
497
+ "status": "OK",
498
+ "tokensPerSecond": 65,
499
+ "notes": '"OPTION_MAX_MODEL_LEN": "32768",\n"TENSOR_PARALLEL_DEGREE": "max",\nNot supported by AutoAWQ and AutoGPTQ'
500
+ },
501
+ {
502
+ "cloud": "us-west-2",
503
+ "instanceType": "g5.48xlarge",
504
+ "gpu": "8xNVIDIA A10G",
505
+ "gpuRAM": "192 GB",
506
+ "quantization": "none",
507
+ "tgi": "DJL 0.28 vLLM",
508
+ "status": "OK",
509
+ "tokensPerSecond": 80,
510
+ "notes": '"OPTION_MAX_MODEL_LEN": "32768",\n"TENSOR_PARALLEL_DEGREE": "max",'
511
+ },
512
+ {
513
+ "cloud": "us-west-2",
514
+ "instanceType": "g6.2xlarge",
515
+ "gpu": "1xNVIDIA L4",
516
+ "gpuRAM": "24 GB",
517
+ "quantization": "none",
518
+ "tgi": "DJL 0.28 vLLM",
519
+ "status": "OK",
520
+ "tokensPerSecond": 16,
521
+ "notes": '"OPTION_MAX_MODEL_LEN": "4096"'
522
+ },
523
+ {
524
+ "cloud": "us-west-2",
525
+ "instanceType": "g6.12xlarge",
526
+ "gpu": "4xNVIDIA L4",
527
+ "gpuRAM": "96 GB",
528
+ "quantization": "none",
529
+ "tgi": "DJL 0.28 vLLM",
530
+ "status": "OK",
531
+ "tokensPerSecond": 50,
532
+ "notes": '"OPTION_MAX_MODEL_LEN": "32768",\n"TENSOR_PARALLEL_DEGREE": "max",'
533
+ },
534
+ {
535
+ "cloud": "us-west-2",
536
+ "instanceType": "g6.48xlarge",
537
+ "gpu": "8xNVIDIA L4",
538
+ "gpuRAM": "192 GB",
539
+ "quantization": "none",
540
+ "tgi": "DJL 0.28 vLLM",
541
+ "status": "OK",
542
+ "tokensPerSecond": 69,
543
+ "notes": '"OPTION_MAX_MODEL_LEN": "32768",\n"TENSOR_PARALLEL_DEGREE": "max",'
544
+ },
545
+ {
546
+ "cloud": "us-west-2",
547
+ "instanceType": "p4d.24xlarge",
548
+ "gpu": "4xNVIDIA A100",
549
+ "gpuRAM": "320 GB",
550
+ "quantization": "none",
551
+ "tgi": "DJL 0.28 vLLM",
552
+ "status": "OK",
553
+ "tokensPerSecond": 82,
554
+ "notes": '"OPTION_MAX_MODEL_LEN": "32768",\n"TENSOR_PARALLEL_DEGREE": "max",'
555
+ }
556
+ ]
557
+ }
558
+ ]
559
+ }
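
Not part of the commit: a small sketch of how this nested structure can be walked, mirroring the flattening logic in app.py. Each configuration either carries its measurements directly, or nests several quantization variants under its own "configurations" list.

from results import results

for model in results["models"]:
    rows = 0
    for config in model.get("configurations", []):
        # nested shape: one row per quantization variant; flat shape: one row
        rows += len(config["configurations"]) if "configurations" in config else 1
    print(f"{model['name']}: {rows} benchmark row(s)")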