# Spaces: Running  (page-header residue from the web view this file was copied from; not part of the module)
"""Module containing performance results for the Arcee-SuperNova model.""" | |
# Benchmark results for the Arcee-SuperNova model.
#
# Top-level keys:
#   "name"           - display name of the model.
#   "modelType"      - base architecture string.
#   "configurations" - list of benchmark entries, in two shapes:
#
#   * flat entry: "instanceType" plus the fields of a single run
#     ("quantization", "container", "status", "tokensPerSecond", "notes").
#   * grouped entry: "instanceType" plus a nested "configurations" list whose
#     items carry the same run fields (without "instanceType").
#
# Every value is a string, including "tokensPerSecond". Runs with status "KO"
# use "-" for "tokensPerSecond"; one run uses "???" for both "status" and
# "tokensPerSecond" (presumably not measured yet - TODO confirm).
# The same instance type may appear in more than one entry.
results_arcee_supernova = {
    "name": "Arcee-SuperNova",
    "modelType": "Llama 3.1 70B",
    "configurations": [
        # --- Flat entries: one run per entry ---
        {
            "instanceType": "c7g.16xlarge",
            "quantization": "Q4_0_8_8",
            "container": "llama.cpp 9/19/24",
            "status": "OK",
            "tokensPerSecond": "6.5",
            "notes": "",
        },
        {
            "instanceType": "r8g.16xlarge",
            "quantization": "Q4_0_4_8",
            "container": "llama.cpp 9/19/24",
            "status": "OK",
            "tokensPerSecond": "9.8",
            "notes": "With Flash Attention",
        },
        {
            "instanceType": "g5.12xlarge",
            "quantization": "awq",
            "container": "TGI 2.2.0",
            "status": "OK",
            "tokensPerSecond": "33",
            "notes": "MAX_INPUT_TOKENS: 8192, MAX_TOTAL_TOKENS: 16384",
        },
        {
            "instanceType": "g6e.2xlarge",
            "quantization": "awq (w4 g128)",
            "container": "vLLM 0.6.2",
            "status": "OK",
            "tokensPerSecond": "18",
            "notes": "--max-model-len 10000 --max-num-seqs 16 --enforce-eager",
        },
        {
            "instanceType": "g6e.2xlarge",
            "quantization": "Q4_K_M",
            "container": "llama.cpp 10/2/24",
            "status": "OK",
            "tokensPerSecond": "16",
            "notes": "-ngl 81 -c 13000 -fa -t 8",
        },
        {
            "instanceType": "g6e.12xlarge",
            "quantization": "none",
            "container": "vLLM 0.6.3",
            "status": "OK",
            "tokensPerSecond": "18.6",
            "notes": "--max-model-len 16384",
        },
        {
            "instanceType": "p4d.24xlarge",
            "quantization": "awq",
            "container": "TGI 2.2.0",
            "status": "OK",
            "tokensPerSecond": "58",
            "notes": "MAX_INPUT_TOKENS: 16384, MAX_TOTAL_TOKENS: 32768",
        },
        {
            "instanceType": "p5.48xlarge",
            "quantization": "awq",
            "container": "TGI 2.2.0",
            "status": "OK",
            "tokensPerSecond": "73",
            "notes": "MAX_INPUT_TOKENS: 16384, MAX_TOTAL_TOKENS: 32768",
        },
        # --- Grouped entries: several runs under one instance type ---
        {
            "instanceType": "inf2.24xlarge",
            "configurations": [
                {
                    "quantization": "none",
                    "container": "LMI 0.29+transformers-neuronx 0.11.351",
                    "status": "KO",
                    "tokensPerSecond": "-",
                    "notes": "OOM bs=2,seqlen=4096 - SDK 2.19.1",
                },
                {
                    "quantization": "none",
                    "container": "LMI 0.29+transformers-neuronx 0.11.351",
                    "status": "KO",
                    "tokensPerSecond": "-",
                    "notes": "OOM bs=2,seqlen=2048 - SDK 2.19.1",
                },
                {
                    # Placeholder values kept verbatim from the original data.
                    "quantization": "8-bit",
                    "container": "LMI 0.29+transformers-neuronx 0.11.351",
                    "status": "???",
                    "tokensPerSecond": "???",
                    "notes": "bs=2,seqlen=8192 - SDK 2.19.1 - OPTION_LOAD_IN_8BIT=True",
                },
            ],
        },
        {
            "instanceType": "inf2.48xlarge",
            "configurations": [
                {
                    "quantization": "none",
                    "container": "LMI 0.29+transformers-neuronx 0.11.351",
                    "status": "OK",
                    "tokensPerSecond": "28",
                    "notes": "bs=4,seqlen=4096 - SDK 2.19.1",
                },
                {
                    "quantization": "none",
                    "container": "LMI 0.29+transformers-neuronx 0.11.351",
                    "status": "OK",
                    "tokensPerSecond": "24",
                    "notes": "bs=2,seqlen=8192 - SDK 2.19.1",
                },
                {
                    "quantization": "none",
                    "container": "LMI 0.29+transformers-neuronx 0.11.351",
                    "status": "KO",
                    "tokensPerSecond": "-",
                    "notes": "OOM bs=2,seqlen=16384 - SDK 2.19.1",
                },
            ],
        },
        {
            "instanceType": "trn1.32xlarge",
            "configurations": [
                {
                    "quantization": "none",
                    "container": "LMI 0.29+transformers-neuronx 0.11.351",
                    "status": "OK",
                    "tokensPerSecond": "32",
                    "notes": "bs=2,seqlen=8192 - SDK 2.19.1",
                },
                {
                    "quantization": "none",
                    "container": "LMI 0.30rc1",
                    "status": "OK",
                    "tokensPerSecond": "34",
                    "notes": "bs=2,seqlen=8192 - SDK 2.20",
                },
            ],
        },
        {
            "instanceType": "p4d.24xlarge",
            "configurations": [
                {
                    "quantization": "none",
                    "container": "TGI 2.2.0",
                    "status": "OK",
                    "tokensPerSecond": "30",
                    "notes": "",
                },
                {
                    "quantization": "none",
                    "container": "LMI 0.29+vLLM 0.5.5",
                    "status": "OK",
                    "tokensPerSecond": "45",
                    "notes": "OPTION_MAX_MODEL_LEN 64k",
                },
            ],
        },
        {
            "instanceType": "p5.48xlarge",
            "configurations": [
                {
                    "quantization": "none",
                    "container": "TGI 2.2.0",
                    "status": "OK",
                    "tokensPerSecond": "58",
                    "notes": "MAX_INPUT_TOKENS: 16384, MAX_TOTAL_TOKENS: 32768",
                },
                {
                    "quantization": "none",
                    "container": "LMI 0.29+vLLM 0.5.5",
                    "status": "OK",
                    "tokensPerSecond": "70",
                    "notes": "OPTION_MAX_MODEL_LEN 128k",
                },
                {
                    "quantization": "none",
                    "container": "LMI 0.29+vLLM 0.5.5",
                    "status": "OK",
                    "tokensPerSecond": "70",
                    "notes": "OPTION_ENFORCE_EAGER=True",
                },
            ],
        },
        # --- Flat entries: p5.48xlarge runs at different tensor-parallel sizes ---
        {
            "instanceType": "p5.48xlarge",
            "quantization": "none",
            "container": "vLLM 0.6.4.post1",
            "status": "OK",
            "tokensPerSecond": "77",
            "notes": "--tensor-parallel-size 8",
        },
        {
            "instanceType": "p5.48xlarge (4 GPUs)",
            "quantization": "none",
            "container": "vLLM 0.6.4.post1",
            "status": "OK",
            "tokensPerSecond": "53",
            "notes": "--tensor-parallel-size 4",
        },
        {
            "instanceType": "p5.48xlarge (2 GPUs)",
            "quantization": "none",
            "container": "vLLM 0.6.4.post1",
            "status": "OK",
            "tokensPerSecond": "33",
            "notes": "--tensor-parallel-size 2",
        },
    ],
}