"""Module containing performance results for the Arcee-Nova model."""
results_arcee_nova = {
    "name": "Arcee-Nova",
    "modelType": "Qwen2 72B",
    "notes": "",
    "configurations": [
        {
            "instanceType": "g4dn.12xlarge",
            "quantization": "bitsandbytes-nf4",
            "container": "TGI 2.2.0",
            "status": "KO",
            "tokensPerSecond": "-",
            "notes": "Flash Attention requires Ampere GPUs or newer",
        },
        {
            "instanceType": "g5.12xlarge",
            "configurations": [
                {
                    "quantization": "bitsandbytes-nf4",
                    "container": "TGI 2.2.0",
                    "status": "OK",
                    "tokensPerSecond": "12",
                },
                {
                    "quantization": "bitsandbytes-fp4",
                    "container": "TGI 2.2.0",
                    "status": "OK",
                    "tokensPerSecond": "12",
                },
                {
                    "quantization": "bitsandbytes (int8)",
                    "container": "TGI 2.2.0",
                    "status": "KO",
                    "tokensPerSecond": "-",
                    "notes": "CUDA OOM",
                },
                {
                    "quantization": "eetq (int8)",
                    "container": "TGI 2.2.0",
                    "status": "KO",
                    "tokensPerSecond": "-",
"notes": "[FT Error] Heurisitc failed to find a valid config.",
                },
            ],
        },
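        # Rough arithmetic for the g5.12xlarge results above: 72B parameters
        # take ~36 GB of weights at 4-bit (nf4/fp4), which shards comfortably
        # across 4x24 GB A10G GPUs, while the ~72 GB needed at int8 leaves too
        # little headroom for the KV cache, hence the CUDA OOM.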
        {
            "instanceType": "g5.48xlarge",
            "configurations": [
                {
                    "quantization": "none",
                    "container": "TGI 2.2.0",
                    "status": "KO",
                    "tokensPerSecond": "-",
                    "notes": "CUDA OOM (but g6.48xlarge works!)",
                },
                {
                    "quantization": "bitsandbytes-nf4",
                    "container": "TGI 2.2.0",
                    "status": "OK",
                    "tokensPerSecond": "12.3",
                },
                {
                    "quantization": "bitsandbytes-fp4",
                    "container": "TGI 2.2.0",
                    "status": "OK",
                    "tokensPerSecond": "12.5",
                },
                {
                    "quantization": "bitsandbytes (int8)",
                    "container": "TGI 2.2.0",
                    "status": "KO",
                    "tokensPerSecond": "-",
                    "notes": "The model deploys, but inference times out.",
                },
            ],
        },
        {
            "instanceType": "g6.12xlarge",
            "configurations": [
                {
                    "quantization": "bitsandbytes-nf4",
                    "container": "TGI 2.2.0",
                    "status": "OK",
                    "tokensPerSecond": "1.5-2",
                    "notes": "Too slow, timeouts are likely",
                },
                {
                    "quantization": "bitsandbytes-fp4",
                    "container": "TGI 2.2.0",
                    "status": "OK",
                    "tokensPerSecond": "2",
                    "notes": "Too slow, timeouts are likely",
                },
                {
                    "quantization": "bitsandbytes (int8)",
                    "container": "TGI 2.2.0",
                    "status": "KO",
                    "tokensPerSecond": "-",
                    "notes": "CUDA OOM",
                },
            ],
        },
        {
            "instanceType": "g6.48xlarge",
            "quantization": "none",
            "container": "TGI 2.2.0",
            "status": "OK",
            "tokensPerSecond": "12",
        },
        {
            "instanceType": "g6e.12xlarge",
            "configurations": [
                {
                    "quantization": "none",
                    "container": "TGI 2.2.0",
                    "status": "OK",
                    "tokensPerSecond": "17",
                },
                {
                    "quantization": "none",
                    "container": "vLLM 0.5.5",
                    "status": "OK",
                    "tokensPerSecond": "17.8",
                },
                {
                    "quantization": "none",
                    "container": "SGLang 0.2.13",
                    "status": "OK",
                    "tokensPerSecond": "18.2",
                },
            ],
        },
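        # In the notes below, the TGI entries quote launcher settings passed
        # as environment variables (MAX_INPUT_LENGTH, MAX_TOTAL_TOKENS), while
        # the vLLM entries quote the server's command-line flags.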
        {
            "instanceType": "p4d.24xlarge",
            "quantization": "none",
            "container": "TGI 2.2.0",
            "status": "OK",
            "tokensPerSecond": "40",
            "notes": '"MAX_INPUT_LENGTH": "16384", "MAX_TOTAL_TOKENS": "32768",',
        },
        {
            "instanceType": "p4de.24xlarge",
            "quantization": "none",
            "container": "TGI 2.2.0",
            "status": "waiting for quota",
        },
        {
            "instanceType": "p5.48xlarge",
            "quantization": "none",
            "container": "TGI 2.2.0",
            "status": "OK",
            "tokensPerSecond": "58",
            "notes": '"MAX_INPUT_LENGTH": "16384", "MAX_TOTAL_TOKENS": "32768",',
        },
        {
            "instanceType": "p5.48xlarge",
            "quantization": "none",
            "container": "vLLM 0.6.4.post1",
            "status": "OK",
            "tokensPerSecond": "76",
            "notes": "--tensor-parallel-size 8",
        },
        {
            "instanceType": "p5.48xlarge (4 GPUs)",
            "quantization": "none",
            "container": "vLLM 0.6.4.post1",
            "status": "OK",
            "tokensPerSecond": "51",
            "notes": "--tensor-parallel-size 4",
        },
        {
            "instanceType": "p5.48xlarge (2 GPUs)",
            "quantization": "none",
            "container": "vLLM 0.6.4.post1",
            "status": "OK",
            "tokensPerSecond": "32",
"notes": "--tensor-parallel-size 2 --max-model-len 16384 --gpu_memory-utilization 0.95",
        },
    ],
}
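
# For context, the vLLM numbers on p5.48xlarge above were measured against the
# vLLM server, launched with the flags quoted in the notes. The sketch below is
# a hypothetical equivalent using vLLM's offline Python API; it assumes the
# Hugging Face model id "arcee-ai/Arcee-Nova", and the prompt and sampling
# settings are illustrative, not the benchmark workload.


def run_vllm_example():
    """Load Arcee-Nova with vLLM's offline API and generate one completion."""
    from vllm import LLM, SamplingParams  # imported lazily: vLLM is optional here

    llm = LLM(
        model="arcee-ai/Arcee-Nova",
        tensor_parallel_size=8,  # one rank per GPU on p5.48xlarge
        max_model_len=16384,
        gpu_memory_utilization=0.95,
    )
    params = SamplingParams(max_tokens=256, temperature=0.7)
    outputs = llm.generate(["Say hello in one sentence."], params)
    print(outputs[0].outputs[0].text)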
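
# The entries above come in two shapes: a flat record for an instance type
# tested once, and a nested "configurations" list for instance types tested
# several times. A small helper along these lines (a sketch, not part of the
# original benchmark harness) flattens both shapes into uniform rows.


def iter_configurations(results):
    """Yield (instance_type, configuration) pairs, flattening nested entries."""
    for entry in results["configurations"]:
        nested = entry.get("configurations")
        if nested:
            for config in nested:
                yield entry["instanceType"], config
        else:
            yield entry["instanceType"], entry


if __name__ == "__main__":
    # Example: print the throughput of every configuration that ran successfully.
    for instance_type, config in iter_configurations(results_arcee_nova):
        if config["status"] == "OK":
            print(
                f"{instance_type} / {config['container']} / "
                f"{config.get('quantization', 'none')}: "
                f"{config['tokensPerSecond']} tokens/s"
            )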