"""Module containing performance results for the Arcee-Nova model.

The single public name, ``results_arcee_nova``, is a plain dict with the
model's ``name``, ``modelType``, free-form ``notes`` and a ``configurations``
list. Entries of that list come in two shapes:

* flat: ``instanceType`` together with ``quantization``, ``container``,
  ``status`` and, usually, ``tokensPerSecond`` / ``notes``;
* grouped: ``instanceType`` together with a nested ``configurations`` list
  whose items carry the per-run fields listed above.

Every value is a string, including throughput figures; ``"-"`` is used as
the ``tokensPerSecond`` placeholder on entries whose ``status`` is ``"KO"``.
"""

results_arcee_nova = {
    "name": "Arcee-Nova",
    "modelType": "Qwen2 72B",
    "notes": "",
    "configurations": [
        # Flat entry: a single run recorded directly on the instance type.
        {
            "instanceType": "g4dn.12xlarge",
            "quantization": "bitsandbytes-nf4",
            "container": "TGI 2.2.0",
            "status": "KO",
            "tokensPerSecond": "-",
            "notes": "Flash Attention requires Ampere GPUs or newer",
        },
        # Grouped entry: several runs share one instance type.
        {
            "instanceType": "g5.12xlarge",
            "configurations": [
                {
                    "quantization": "bitsandbytes-nf4",
                    "container": "TGI 2.2.0",
                    "status": "OK",
                    "tokensPerSecond": "12",
                },
                {
                    "quantization": "bitsandbytes-fp4",
                    "container": "TGI 2.2.0",
                    "status": "OK",
                    "tokensPerSecond": "12",
                },
                {
                    "quantization": "bitsandbytes (int8)",
                    "container": "TGI 2.2.0",
                    "status": "KO",
                    "tokensPerSecond": "-",
                    "notes": "CUDA OOM",
                },
                {
                    "quantization": "eetq (int8)",
                    "container": "TGI 2.2.0",
                    "status": "KO",
                    "tokensPerSecond": "-",
                    # Recorded error text, kept exactly as stored.
                    "notes": "[FT Error] Heurisitc failed to find a valid config.",
                },
            ],
        },
        {
            "instanceType": "g5.48xlarge",
            "configurations": [
                {
                    "quantization": "none",
                    "container": "TGI 2.2.0",
                    "status": "KO",
                    "tokensPerSecond": "-",
                    "notes": "CUDA OOM (but g6.48xlarge works!)",
                },
                {
                    "quantization": "bitsandbytes-nf4",
                    "container": "TGI 2.2.0",
                    "status": "OK",
                    "tokensPerSecond": "12.3",
                },
                {
                    "quantization": "bitsandbytes-fp4",
                    "container": "TGI 2.2.0",
                    "status": "OK",
                    "tokensPerSecond": "12.5",
                },
                {
                    "quantization": "bitsandbytes (int8)",
                    "container": "TGI 2.2.0",
                    "status": "KO",
                    "tokensPerSecond": "-",
                    "notes": "The model deploys, but inference times out.",
                },
            ],
        },
        {
            "instanceType": "g6.12xlarge",
            "configurations": [
                {
                    "quantization": "bitsandbytes-nf4",
                    "container": "TGI 2.2.0",
                    "status": "OK",
                    # A range, not a single number: consumers cannot parse
                    # this field as a float unconditionally.
                    "tokensPerSecond": "1.5-2",
                    "notes": "Too slow, timeouts are likely",
                },
                {
                    "quantization": "bitsandbytes-fp4",
                    "container": "TGI 2.2.0",
                    "status": "OK",
                    "tokensPerSecond": "2",
                    "notes": "Too slow, timeouts are likely",
                },
                {
                    "quantization": "bitsandbytes (int8)",
                    "container": "TGI 2.2.0",
                    "status": "KO",
                    "tokensPerSecond": "-",
                    "notes": "CUDA OOM",
                },
            ],
        },
        {
            "instanceType": "g6.48xlarge",
            "quantization": "none",
            "container": "TGI 2.2.0",
            "status": "OK",
            "tokensPerSecond": "12",
        },
        {
            "instanceType": "g6e.12xlarge",
            "configurations": [
                {
                    "quantization": "none",
                    "container": "TGI 2.2.0",
                    "status": "OK",
                    "tokensPerSecond": "17",
                },
                {
                    "quantization": "none",
                    "container": "vLLM 0.5.5",
                    "status": "OK",
                    "tokensPerSecond": "17.8",
                },
                {
                    "quantization": "none",
                    "container": "SGLang 0.2.13",
                    "status": "OK",
                    "tokensPerSecond": "18.2",
                },
            ],
        },
        {
            "instanceType": "p4d.24xlarge",
            "quantization": "none",
            "container": "TGI 2.2.0",
            "status": "OK",
            "tokensPerSecond": "40",
            "notes": '"MAX_INPUT_LENGTH": "16384", "MAX_TOTAL_TOKENS": "32768",',
        },
        # No "tokensPerSecond" key on this entry, and "status" is neither
        # "OK" nor "KO"; readers should not index that key unconditionally.
        {
            "instanceType": "p4de.24xlarge",
            "quantization": "none",
            "container": "TGI 2.2.0",
            "status": "waiting for quota",
        },
        # "p5.48xlarge" appears as several flat entries (one per container /
        # GPU count) rather than as one grouped entry.
        {
            "instanceType": "p5.48xlarge",
            "quantization": "none",
            "container": "TGI 2.2.0",
            "status": "OK",
            "tokensPerSecond": "58",
            "notes": '"MAX_INPUT_LENGTH": "16384", "MAX_TOTAL_TOKENS": "32768",',
        },
        {
            "instanceType": "p5.48xlarge",
            "quantization": "none",
            "container": "vLLM 0.6.4.post1",
            "status": "OK",
            "tokensPerSecond": "76",
            "notes": "--tensor-parallel-size 8",
        },
        {
            "instanceType": "p5.48xlarge (4 GPUs)",
            "quantization": "none",
            "container": "vLLM 0.6.4.post1",
            "status": "OK",
            "tokensPerSecond": "51",
            "notes": "--tensor-parallel-size 4",
        },
        {
            "instanceType": "p5.48xlarge (2 GPUs)",
            "quantization": "none",
            "container": "vLLM 0.6.4.post1",
            "status": "OK",
            "tokensPerSecond": "32",
            "notes": "--tensor-parallel-size 2 --max-model-len 16384 --gpu_memory-utilization 0.95",
        },
    ],
}