"""Module containing performance results for the Arcee-Nova model.""" | |
results_arcee_nova = { | |
"name": "Arcee-Nova", | |
"modelType": "Qwen2 72B", | |
"notes": "", | |
"configurations": [ | |
{ | |
"instanceType": "g4dn.12xlarge", | |
"quantization": "bitsandbytes-nf4", | |
"container": "TGI 2.2.0", | |
"status": "KO", | |
"tokensPerSecond": "-", | |
"notes": "Flash Attention requires Ampere GPUs or newer", | |
}, | |
{ | |
"instanceType": "g5.12xlarge", | |
"configurations": [ | |
{ | |
"quantization": "bitsandbytes-nf4", | |
"container": "TGI 2.2.0", | |
"status": "OK", | |
"tokensPerSecond": "12", | |
}, | |
{ | |
"quantization": "bitsandbytes-fp4", | |
"container": "TGI 2.2.0", | |
"status": "OK", | |
"tokensPerSecond": "12", | |
}, | |
{ | |
"quantization": "bitsandbytes (int8)", | |
"container": "TGI 2.2.0", | |
"status": "KO", | |
"tokensPerSecond": "-", | |
"notes": "CUDA OOM", | |
}, | |
{ | |
"quantization": "eetq (int8)", | |
"container": "TGI 2.2.0", | |
"status": "KO", | |
"tokensPerSecond": "-", | |
"notes": "[FT Error] Heurisitc failed to find a valid config.", | |
}, | |
], | |
}, | |
{ | |
"instanceType": "g5.48xlarge", | |
"configurations": [ | |
{ | |
"quantization": "none", | |
"container": "TGI 2.2.0", | |
"status": "KO", | |
"tokensPerSecond": "-", | |
"notes": "CUDA OOM (but g6.48xlarge works!)", | |
}, | |
{ | |
"quantization": "bitsandbytes-nf4", | |
"container": "TGI 2.2.0", | |
"status": "OK", | |
"tokensPerSecond": "12.3", | |
}, | |
{ | |
"quantization": "bitsandbytes-fp4", | |
"container": "TGI 2.2.0", | |
"status": "OK", | |
"tokensPerSecond": "12.5", | |
}, | |
{ | |
"quantization": "bitsandbytes (int8)", | |
"container": "TGI 2.2.0", | |
"status": "KO", | |
"tokensPerSecond": "-", | |
"notes": "The model deploys, but inference times out.", | |
}, | |
], | |
}, | |
{ | |
"instanceType": "g6.12xlarge", | |
"configurations": [ | |
{ | |
"quantization": "bitsandbytes-nf4", | |
"container": "TGI 2.2.0", | |
"status": "OK", | |
"tokensPerSecond": "1.5-2", | |
"notes": "Too slow, timeouts are likely", | |
}, | |
{ | |
"quantization": "bitsandbytes-fp4", | |
"container": "TGI 2.2.0", | |
"status": "OK", | |
"tokensPerSecond": "2", | |
"notes": "Too slow, timeouts are likely", | |
}, | |
{ | |
"quantization": "bitsandbytes (int8)", | |
"container": "TGI 2.2.0", | |
"status": "KO", | |
"tokensPerSecond": "-", | |
"notes": "CUDA OOM", | |
}, | |
], | |
}, | |
{ | |
"instanceType": "g6.48xlarge", | |
"quantization": "none", | |
"container": "TGI 2.2.0", | |
"status": "OK", | |
"tokensPerSecond": "12", | |
}, | |
{ | |
"instanceType": "g6e.12xlarge", | |
"configurations": [ | |
{ | |
"quantization": "none", | |
"container": "TGI 2.2.0", | |
"status": "OK", | |
"tokensPerSecond": "17", | |
}, | |
{ | |
"quantization": "none", | |
"container": "vLLM 0.5.5", | |
"status": "OK", | |
"tokensPerSecond": "17.8", | |
}, | |
{ | |
"quantization": "none", | |
"container": "SGLang 0.2.13", | |
"status": "OK", | |
"tokensPerSecond": "18.2", | |
}, | |
], | |
}, | |
{ | |
"instanceType": "p4d.24xlarge", | |
"quantization": "none", | |
"container": "TGI 2.2.0", | |
"status": "OK", | |
"tokensPerSecond": "40", | |
"notes": '"MAX_INPUT_LENGTH": "16384", "MAX_TOTAL_TOKENS": "32768",', | |
}, | |
{ | |
"instanceType": "p4de.24xlarge", | |
"quantization": "none", | |
"container": "TGI 2.2.0", | |
"status": "waiting for quota", | |
}, | |
{ | |
"instanceType": "p5.48xlarge", | |
"quantization": "none", | |
"container": "TGI 2.2.0", | |
"status": "OK", | |
"tokensPerSecond": "58", | |
"notes": '"MAX_INPUT_LENGTH": "16384", "MAX_TOTAL_TOKENS": "32768",', | |
}, | |
{ | |
"instanceType": "p5.48xlarge", | |
"quantization": "none", | |
"container": "vLLM 0.6.4.post1", | |
"status": "OK", | |
"tokensPerSecond": "76", | |
"notes": "--tensor-parallel-size 8", | |
}, | |
{ | |
"instanceType": "p5.48xlarge (4 GPUs)", | |
"quantization": "none", | |
"container": "vLLM 0.6.4.post1", | |
"status": "OK", | |
"tokensPerSecond": "51", | |
"notes": "--tensor-parallel-size 4", | |
}, | |
{ | |
"instanceType": "p5.48xlarge (2 GPUs)", | |
"quantization": "none", | |
"container": "vLLM 0.6.4.post1", | |
"status": "OK", | |
"tokensPerSecond": "32", | |
"notes": "--tensor-parallel-size 2 --max-model-len 16384 --gpu_memory-utilization 0.95", | |
        },
    ],
}
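

# --- Usage sketch (not part of the original results data) --------------------
# The entries above mix two shapes: most instances nest per-quantization runs
# under a "configurations" list, while single-run instances keep the fields at
# the top level. The hypothetical helper below flattens both shapes into one
# row per (instance, quantization, container) so the results can be printed or
# exported as a table. The function name and output layout are illustrative
# assumptions, not something defined elsewhere in this repository.
def iter_rows(results):
    """Yield one flat dict per benchmarked configuration."""
    for entry in results["configurations"]:
        # A flat entry acts as its own single-item configuration list.
        for cfg in entry.get("configurations", [entry]):
            yield {
                "instanceType": entry["instanceType"],
                "quantization": cfg.get("quantization", "none"),
                "container": cfg.get("container", ""),
                "status": cfg.get("status", ""),
                "tokensPerSecond": cfg.get("tokensPerSecond", "-"),
                "notes": cfg.get("notes", ""),
            }


if __name__ == "__main__":
    # Quick look at the flattened results, e.g. to spot the fastest setups.
    for row in iter_rows(results_arcee_nova):
        print(
            f"{row['instanceType']:<24} {row['quantization']:<22} "
            f"{row['container']:<18} {row['status']:<18} {row['tokensPerSecond']}"
        )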