# benchmarks/results.py
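# Benchmark results for Arcee model deployments. Informal schema note (added
# for readability): "models" is a list of entries, each with a "name", a
# "modelType", and a list of "configurations". A configuration records the
# region, instance type, GPU setup, quantization, serving stack (the "tgi"
# field also holds vLLM and SGLang versions), a "status" of OK/KO, and the
# measured "tokensPerSecond". Some configurations share instance details and
# nest one run per quantization setting under an inner "configurations" list.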
results = {
"models": [
{
"name": "Arcee-Meraj",
"modelType": "Qwen2 72B",
"configurations": [
{
"region": "us-west-2",
"instanceType": "g5.12xlarge",
"cloud": "AWS",
"gpu": "4xNVIDIA A10G",
"gpuRAM": "96 GB",
"quantization": "awq",
"tgi": "TGI 2.2.0",
"status": "OK",
"tokensPerSecond": "33",
"notes": "",
},
{
"region": "us-west-2",
"instanceType": "p4d.24xlarge",
"cloud": "AWS",
"gpu": "4xNVIDIA A100",
"gpuRAM": "320 GB",
"quantization": "none",
"tgi": "TGI 2.2.0",
"status": "OK",
"tokensPerSecond": "38",
"notes": "",
}
],
},
{
"name": "Arcee-SuperNova",
"modelType": "Llama 3.1 70B",
"configurations": [
{
"region": "us-west-2",
"instanceType": "g5.12xlarge",
"cloud": "AWS",
"gpu": "4xNVIDIA A10G",
"gpuRAM": "96 GB",
"quantization": "awq",
"tgi": "TGI 2.2.0",
"status": "OK",
"tokensPerSecond": "33",
"notes": "",
},
{
"region": "us-west-2",
"instanceType": "p4d.24xlarge",
"cloud": "AWS",
"gpu": "4xNVIDIA A100",
"gpuRAM": "320 GB",
"quantization": "none",
"tgi": "TGI 2.2.0",
"status": "OK",
"tokensPerSecond": "38",
"notes": "",
}
],
},
{
"name": "Arcee-Nova",
"modelType": "Qwen2 72B",
"notes": "",
"configurations": [
{
"region": "us-west-2",
"instanceType": "g4dn.12xlarge",
"cloud": "AWS",
"gpu": "4xNVIDIA T4",
"gpuRAM": "64 GB",
"quantization": "bitsandbytes-nf4",
"tgi": "TGI 2.2.0",
"status": "KO",
"tokensPerSecond": "-",
"notes": "Flash Attention requires Ampere GPUs or newer",
},
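# The next entries share instance details and nest one run per
# quantization setting under an inner "configurations" list.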
{
"region": "us-west-2",
"instanceType": "g5.12xlarge",
"cloud": "AWS",
"gpu": "4xNVIDIA A10G",
"gpuRAM": "96 GB",
"configurations": [
{
"quantization": "bitsandbytes-nf4",
"tgi": "TGI 2.2.0",
"status": "OK",
"tokensPerSecond": "12",
},
{
"quantization": "bitsandbytes-fp4",
"tgi": "TGI 2.2.0",
"status": "OK",
"tokensPerSecond": "12",
},
{
"quantization": "bitsandbytes (int8)",
"tgi": "TGI 2.2.0",
"status": "KO",
"tokensPerSecond": "-",
"notes": "CUDA OOM",
},
{
"quantization": "eetq (int8)",
"tgi": "TGI 2.2.0",
"status": "KO",
"tokensPerSecond": "-",
"notes": "[FT Error] Heurisitc failed to find a valid config.",
},
],
},
{
"region": "us-west-2",
"instanceType": "g5.48xlarge",
"cloud": "AWS",
"gpu": "8xNVIDIA A10G",
"gpuRAM": "192 GB",
"configurations": [
{
"quantization": "none",
"tgi": "TGI 2.2.0",
"status": "KO",
"tokensPerSecond": "-",
"notes": "CUDA OOM (but g6.48xlarge works!)",
},
{
"quantization": "bitsandbytes-nf4",
"tgi": "TGI 2.2.0",
"status": "OK",
"tokensPerSecond": "12.3",
},
{
"quantization": "bitsandbytes-fp4",
"tgi": "TGI 2.2.0",
"status": "OK",
"tokensPerSecond": "12.5",
},
{
"quantization": "bitsandbytes (int8)",
"tgi": "TGI 2.2.0",
"status": "KO",
"tokensPerSecond": "-",
"notes": "The model deploys, but inference times out.",
},
],
},
{
"region": "us-west-2",
"instanceType": "g6.12xlarge",
"cloud": "AWS",
"gpu": "4xNVIDIA L4",
"gpuRAM": "96 GB",
"configurations": [
{
"quantization": "bitsandbytes-nf4",
"tgi": "TGI 2.2.0",
"status": "OK",
"tokensPerSecond": "1.5-2",
"notes": "Too slow, timeouts are likely",
},
{
"quantization": "bitsandbytes-fp4",
"tgi": "TGI 2.2.0",
"status": "OK",
"tokensPerSecond": "2",
"notes": "Too slow, timeouts are likely",
},
{
"quantization": "bitsandbytes (int8)",
"tgi": "TGI 2.2.0",
"status": "KO",
"tokensPerSecond": "-",
"notes": "CUDA OOM",
},
],
},
{
"region": "us-west-2",
"instanceType": "g6.48xlarge",
"cloud": "AWS",
"gpu": "8xNVIDIA L4",
"gpuRAM": "192 GB",
"quantization": "none",
"tgi": "TGI 2.2.0",
"status": "OK",
"tokensPerSecond": "12",
},
{
"region": "us-west-2",
"instanceType": "p4d.24xlarge",
"cloud": "AWS",
"gpu": "8xNVIDIA A100",
"gpuRAM": "320 GB",
"quantization": "none",
"tgi": "TGI 2.2.0",
"status": "OK",
"tokensPerSecond": "40",
"notes": '"MAX_INPUT_LENGTH": "16384", "MAX_TOTAL_TOKENS": "32768",',
},
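# Notes quoting '"MAX_INPUT_LENGTH"' / '"MAX_TOTAL_TOKENS"' (or the
# "MAX_INPUT_TOKENS" variant below) record the container environment
# variables used to set the context length for that run.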
{
"region": "us-west-2",
"instanceType": "p4de.24xlarge",
"cloud": "AWS",
"gpu": "8xNVIDIA A100",
"gpuRAM": "320 GB",
"quantization": "none",
"tgi": "TGI 2.2.0",
"status": "waiting for quota",
},
{
"region": "us-west-2",
"instanceType": "p5.48xlarge",
"cloud": "AWS",
"gpu": "8xNVIDIA H100",
"gpuRAM": "640GB",
"quantization": "none",
"tgi": "TGI 2.2.0",
"status": "OK",
"tokensPerSecond": "58",
"notes": '"MAX_INPUT_LENGTH": "16384", "MAX_TOTAL_TOKENS": "32768",',
},
{
"region": "us-west-2",
"instanceType": "inf2.*",
"cloud": "AWS",
"gpu": "-",
"tgi": "TGI 2.2.0",
"status": "not supported",
"tokensPerSecond": "-",
"notes": "Qwen2: TGI OK, Neuron SDK KO, optimum-neuron KO",
},
],
},
{
"name": "Llama-Spark",
"modelType": "Llama 3.1 8B",
"configurations": [
{
"region": "AWS",
"instanceType": "g5.2xlarge",
"cloud": "AWS",
"gpu": "1xNVIDIA A10G",
"gpuRAM": "24 GB",
"quantization": "none",
"tgi": "TGI 2.2.0",
"status": "OK",
"tokensPerSecond": "29",
"notes": "4K/8K fails",
},
{
"region": "AWS",
"instanceType": "g5.12xlarge",
"cloud": "AWS",
"gpu": "4xNVIDIA A10G",
"gpuRAM": "96 GB",
"quantization": "none",
"tgi": "TGI 2.2.0",
"status": "OK",
"tokensPerSecond": "85",
"notes": '"MAX_INPUT_TOKENS": "16384", "MAX_TOTAL_TOKENS": "32768",',
},
{
"region": "AWS",
"instanceType": "g5.48xlarge",
"cloud": "AWS",
"gpu": "8xNVIDIA A10G",
"gpuRAM": "192 GB",
"quantization": "none",
"tgi": "TGI 2.2.0",
"status": "OK",
"tokensPerSecond": "105",
"notes": '"MAX_INPUT_TOKENS": "20480", "MAX_TOTAL_TOKENS": "40960"\n\n32K/64K fails',
},
{
"region": "AWS",
"instanceType": "g6.2xlarge",
"cloud": "AWS",
"gpu": "1xNVIDIA L4",
"gpuRAM": "24 GB",
"configurations": [
{
"quantization": "none",
"tgi": "TGI 2.2.0",
"status": "OK",
"tokensPerSecond": "15",
},
{"quantization": "fp8", "tgi": "TGI 2.2.0"},
],
},
{
"region": "AWS",
"instanceType": "g6.12xlarge",
"cloud": "AWS",
"gpu": "4xNVIDIA L4",
"gpuRAM": "96 GB",
"quantization": "none",
"tgi": "TGI 2.2.0",
"status": "OK",
"tokensPerSecond": "51",
"notes": "same as g5?",
},
{
"region": "AWS",
"instanceType": "g6.48xlarge",
"cloud": "AWS",
"gpu": "8xNVIDIA L4",
"gpuRAM": "192 GB",
"quantization": "none",
"tgi": "TGI 2.2.0",
"status": "OK",
"tokensPerSecond": "81",
"notes": "same as g5?",
},
{
"region": "AWS",
"instanceType": "g6e.2xlarge",
"cloud": "AWS",
"gpu": "1xNVIDIA L40S",
"gpuRAM": "48 GB",
"quantization": "none",
"tgi": "TGI 2.2.0",
"status": "OK",
"tokensPerSecond": "42.1",
},
{
"region": "AWS",
"instanceType": "g6e.2xlarge",
"cloud": "AWS",
"gpu": "1xNVIDIA L40S",
"gpuRAM": "48 GB",
"quantization": "none",
"tgi": "SGLang 0.2.13",
"status": "OK",
"tokensPerSecond": "45",
},
{
"region": "AWS",
"instanceType": "p4d.24xlarge",
"cloud": "AWS",
"gpu": "4xNVIDIA A100",
"gpuRAM": "320 GB",
"quantization": "none",
"tgi": "TGI 2.2.0",
"status": "OK",
"tokensPerSecond": "145",
"notes": '"MAX_INPUT_TOKENS": "40960", "MAX_TOTAL_TOKENS": "81920"\n\n64K/128K fails (even with 4-bit)',
},
{
"region": "AWS",
"instanceType": "inf2.*",
"cloud": "AWS",
"gpu": "-",
"status": "not supported",
"tokensPerSecond": "-",
"notes": "Llama-3.1: TGI OK, Neuron SDK OK, optimum-neuron KO",
},
],
},
{
"name": "Arcee-Agent",
"modelType": "Qwen2 7B",
"notes": "",
"configurations": [
{
"region": "us-west-2",
"instanceType": "g5.2xlarge",
"cloud": "AWS",
"gpu": "1xNVIDIA A10G",
"gpuRAM": "24 GB",
"quantization": "none",
"tgi": "TGI 2.2.0",
"status": "OK",
"tokensPerSecond": "30",
},
{
"region": "us-west-2",
"instanceType": "g5.12xlarge",
"cloud": "AWS",
"gpu": "4xNVIDIA A10G",
"gpuRAM": "96 GB",
"quantization": "none",
"tgi": "TGI 2.2.0",
"status": "OK",
"tokensPerSecond": "83",
},
{
"region": "us-west-2",
"instanceType": "g5.48xlarge",
"cloud": "AWS",
"gpu": "8xNVIDIA A10G",
"gpuRAM": "192 GB",
"quantization": "none",
"tgi": "TGI 2.2.0",
"status": "KO",
"tokensPerSecond": "-",
"notes": "ValueError: `num_heads` must be divisible by `num_shards` (got `num_heads`: 28 and `num_shards`: 8\n\nSM_NUM_GPUS=7 doesn't work either because tensor size ares not a multiple of 7 (e.g., 512)",
},
{
"region": "us-west-2",
"instanceType": "g6.2xlarge",
"cloud": "AWS",
"gpu": "1xNVIDIA L4",
"gpuRAM": "24 GB",
"quantization": "none",
"tgi": "TGI 2.2.0",
"status": "OK",
"tokensPerSecond": "16.3",
},
{
"region": "us-west-2",
"instanceType": "g6.12xlarge",
"cloud": "AWS",
"gpu": "4xNVIDIA L4",
"gpuRAM": "96 GB",
"quantization": "none",
"tgi": "TGI 2.2.0",
"status": "OK",
"tokensPerSecond": "54.2",
},
{
"region": "us-west-2",
"instanceType": "inf2.*",
"cloud": "AWS",
"gpu": "-",
"tgi": "TGI 2.2.0",
"status": "not supported",
"tokensPerSecond": "-",
"notes": "Qwen2: TGI OK, Neuron SDK KO, optimum-neuron KO",
},
],
},
{"name": "Arcee-Spark", "modelType": "Qwen2 7B"},
{
"name": "Arcee-Lite",
"modelType": "Qwen2 1.5B distilled from phi-3-medium 14B",
"configurations": [
{
"region": "us-west-2",
"instanceType": "c6i.xlarge",
"cloud": "AWS",
"gpu": "-",
"gpuRAM": "-",
"quantization": "bitsandbytes-nf4",
"tgi": "TGI 2.2.0",
"status": "KO",
"tokensPerSecond": "-",
"notes": "OOM, might work with a prequantized model",
},
{
"region": "us-west-2",
"instanceType": "c6i.2xlarge",
"cloud": "AWS",
"gpu": "-",
"gpuRAM": "-",
"quantization": "bitsandbytes-nf4",
"tgi": "TGI 2.2.0",
"status": "KO",
"tokensPerSecond": "-",
"notes": "OOM, might work with a prequantized model",
},
{
"region": "us-west-2",
"instanceType": "c6i.4xlarge",
"cloud": "AWS",
"gpu": "-",
"gpuRAM": "-",
"configurations": [
{
"quantization": "none",
"tgi": "TGI 2.2.0",
"status": "OK",
"tokensPerSecond": "10.7",
},
{
"quantization": "bitsandbytes (int8)",
"tgi": "TGI 2.2.0",
"status": "OK",
"tokensPerSecond": "10.5",
},
{
"quantization": "bitsandbytes-nf4",
"tgi": "TGI 2.2.0",
"status": "OK",
"tokensPerSecond": "10.6",
},
],
},
{
"region": "us-west-2",
"instanceType": "c7i.4xlarge",
"cloud": "AWS",
"gpu": "-",
"gpuRAM": "-",
"quantization": "none",
"tgi": "TGI 2.2.0",
"status": "waiting for quota",
"tokensPerSecond": "-",
},
{
"region": "us-west-2",
"instanceType": "g5.xlarge",
"cloud": "AWS",
"gpu": "1xNVIDIA A10G",
"gpuRAM": "24 GB",
"configurations": [
{
"quantization": "none",
"tgi": "TGI 2.2.0",
"status": "OK",
"tokensPerSecond": "110",
},
{
"quantization": "none",
"tgi": "DJL 0.28 vLLM",
"status": "OK",
"tokensPerSecond": "105",
"notes": '"OPTION_MAX_MODEL_LEN": "32768",',
},
],
},
{
"region": "us-west-2",
"instanceType": "g6e.2xlarge",
"cloud": "AWS",
"gpu": "1xNVIDIA L40S",
"gpuRAM": "48 GB",
"quantization": "none",
"tgi": "TGI 2.2.0",
"status": "OK",
"tokensPerSecond": "160",
},
],
},
{
"name": "Arcee-Scribe",
"modelType": "InternLM2.5 8B",
"configurations": [
{
"cloud": "AWS",
"instanceType": "g5.2xlarge",
"gpu": "1xNVIDIA A10G",
"gpuRAM": "24 GB",
"quantization": "none",
"tgi": "DJL 0.28 vLLM",
"status": "OK",
"tokensPerSecond": 29,
"notes": '"OPTION_MAX_MODEL_LEN": "32768",\n"TENSOR_PARALLEL_DEGREE": "max",',
},
{
"cloud": "AWS",
"instanceType": "g5.12xlarge",
"gpu": "4xNVIDIA A10G",
"gpuRAM": "96 GB",
"quantization": "none",
"tgi": "DJL 0.28 vLLM",
"status": "OK",
"tokensPerSecond": 65,
"notes": '"OPTION_MAX_MODEL_LEN": "32768",\n"TENSOR_PARALLEL_DEGREE": "max",\nNot supported by AutoAWQ and AutoGPTQ',
},
{
"cloud": "AWS",
"instanceType": "g5.48xlarge",
"gpu": "8xNVIDIA A10G",
"gpuRAM": "192 GB",
"quantization": "none",
"tgi": "DJL 0.28 vLLM",
"status": "OK",
"tokensPerSecond": 80,
"notes": '"OPTION_MAX_MODEL_LEN": "32768",\n"TENSOR_PARALLEL_DEGREE": "max",',
},
{
"cloud": "AWS",
"instanceType": "g6.2xlarge",
"gpu": "1xNVIDIA L4",
"gpuRAM": "24 GB",
"quantization": "none",
"tgi": "DJL 0.28 vLLM",
"status": "OK",
"tokensPerSecond": 16,
"notes": '"OPTION_MAX_MODEL_LEN": "4096"',
},
{
"cloud": "AWS",
"instanceType": "g6.12xlarge",
"gpu": "4xNVIDIA L4",
"gpuRAM": "96 GB",
"quantization": "none",
"tgi": "DJL 0.28 vLLM",
"status": "OK",
"tokensPerSecond": 50,
"notes": '"OPTION_MAX_MODEL_LEN": "32768",\n"TENSOR_PARALLEL_DEGREE": "max",',
},
{
"cloud": "AWS",
"instanceType": "g6.48xlarge",
"gpu": "8xNVIDIA L4",
"gpuRAM": "192 GB",
"quantization": "none",
"tgi": "DJL 0.28 vLLM",
"status": "OK",
"tokensPerSecond": 69,
"notes": '"OPTION_MAX_MODEL_LEN": "32768",\n"TENSOR_PARALLEL_DEGREE": "max",',
},
{
"cloud": "AWS",
"instanceType": "g6e.2xlarge",
"gpu": "1xNVIDIA L40S",
"gpuRAM": "48 GB",
"quantization": "none",
"tgi": "SGLang 0.2.13",
"status": "OK",
"tokensPerSecond": 46,
},
{
"cloud": "AWS",
"instanceType": "p4d.24xlarge",
"gpu": "4xNVIDIA A100",
"gpuRAM": "320 GB",
"quantization": "none",
"tgi": "DJL 0.28 vLLM",
"status": "OK",
"tokensPerSecond": 82,
"notes": '"OPTION_MAX_MODEL_LEN": "32768",\n"TENSOR_PARALLEL_DEGREE": "max",',
},
],
},
]
}
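

# --- Usage sketch (not part of the recorded results) ---
# A minimal example of walking the structure above: it flattens the nested
# "configurations" entries, parses the mixed tokensPerSecond formats
# (strings, ranges such as "1.5-2", and "-" for failed runs), and prints the
# best measured throughput per model. Helper names are illustrative.


def _parse_tps(value):
    """Parse a tokensPerSecond field into a float, or None if unusable."""
    if isinstance(value, (int, float)):
        return float(value)
    text = str(value).strip()
    if not text or text == "-":
        return None
    # For a range such as "1.5-2", keep the upper bound.
    if "-" in text:
        text = text.split("-")[-1]
    try:
        return float(text)
    except ValueError:
        return None


def best_throughput_per_model(data):
    """Return {model name: (tokens/s, instance type)} over successful runs."""
    best = {}
    for model in data["models"]:
        for config in model.get("configurations", []):
            # Nested per-quantization runs inherit the outer instance type.
            for run in config.get("configurations", [config]):
                if run.get("status") != "OK":
                    continue
                tps = _parse_tps(run.get("tokensPerSecond", "-"))
                if tps is None:
                    continue
                current = best.get(model["name"])
                if current is None or tps > current[0]:
                    best[model["name"]] = (tps, config["instanceType"])
    return best


if __name__ == "__main__":
    for name, (tps, instance) in sorted(best_throughput_per_model(results).items()):
        print(f"{name}: {tps:g} tokens/s on {instance}")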