# Spaces: arcee-ai / Benchmarks (Running) - File size: 6,213 Bytes

"""Module containing performance results for the Arcee-Nova model."""

# Benchmark results for the Arcee-Nova model.
#
# Each element of the top-level "configurations" list takes one of two shapes:
#   * a flat result: "instanceType" plus "quantization", "container", "status"
#     and, usually, "tokensPerSecond" / "notes";
#   * a group: "instanceType" plus a nested "configurations" list whose items
#     carry the per-run fields (they do not repeat "instanceType").
#
# "tokensPerSecond" is always stored as a string (it can hold a range such as
# "1.5-2"); "-" is used when the run's status is "KO".
results_arcee_nova = {
    "name": "Arcee-Nova",
    "modelType": "Qwen2 72B",
    "notes": "",
    "configurations": [
        # Flat entry: single run on this instance type.
        {
            "instanceType": "g4dn.12xlarge",
            "quantization": "bitsandbytes-nf4",
            "container": "TGI 2.2.0",
            "status": "KO",
            "tokensPerSecond": "-",
            "notes": "Flash Attention requires Ampere GPUs or newer",
        },
        # Grouped entry: several runs sharing the same instance type.
        {
            "instanceType": "g5.12xlarge",
            "configurations": [
                {
                    "quantization": "bitsandbytes-nf4",
                    "container": "TGI 2.2.0",
                    "status": "OK",
                    "tokensPerSecond": "12",
                },
                {
                    "quantization": "bitsandbytes-fp4",
                    "container": "TGI 2.2.0",
                    "status": "OK",
                    "tokensPerSecond": "12",
                },
                {
                    "quantization": "bitsandbytes (int8)",
                    "container": "TGI 2.2.0",
                    "status": "KO",
                    "tokensPerSecond": "-",
                    "notes": "CUDA OOM",
                },
                {
                    "quantization": "eetq (int8)",
                    "container": "TGI 2.2.0",
                    "status": "KO",
                    "tokensPerSecond": "-",
                    # NOTE(review): "Heurisitc" is presumably the error text
                    # exactly as emitted by the runtime - confirm before
                    # "correcting" the spelling.
                    "notes": "[FT Error] Heurisitc failed to find a valid config.",
                },
            ],
        },
        {
            "instanceType": "g5.48xlarge",
            "configurations": [
                {
                    "quantization": "none",
                    "container": "TGI 2.2.0",
                    "status": "KO",
                    "tokensPerSecond": "-",
                    "notes": "CUDA OOM (but g6.48xlarge works!)",
                },
                {
                    "quantization": "bitsandbytes-nf4",
                    "container": "TGI 2.2.0",
                    "status": "OK",
                    "tokensPerSecond": "12.3",
                },
                {
                    "quantization": "bitsandbytes-fp4",
                    "container": "TGI 2.2.0",
                    "status": "OK",
                    "tokensPerSecond": "12.5",
                },
                {
                    "quantization": "bitsandbytes (int8)",
                    "container": "TGI 2.2.0",
                    "status": "KO",
                    "tokensPerSecond": "-",
                    "notes": "The model deploys, but inference times out.",
                },
            ],
        },
        {
            "instanceType": "g6.12xlarge",
            "configurations": [
                {
                    "quantization": "bitsandbytes-nf4",
                    "container": "TGI 2.2.0",
                    "status": "OK",
                    "tokensPerSecond": "1.5-2",
                    "notes": "Too slow, timeouts are likely",
                },
                {
                    "quantization": "bitsandbytes-fp4",
                    "container": "TGI 2.2.0",
                    "status": "OK",
                    "tokensPerSecond": "2",
                    "notes": "Too slow, timeouts are likely",
                },
                {
                    "quantization": "bitsandbytes (int8)",
                    "container": "TGI 2.2.0",
                    "status": "KO",
                    "tokensPerSecond": "-",
                    "notes": "CUDA OOM",
                },
            ],
        },
        {
            "instanceType": "g6.48xlarge",
            "quantization": "none",
            "container": "TGI 2.2.0",
            "status": "OK",
            "tokensPerSecond": "12",
        },
        # Same model and instance type, compared across three containers.
        {
            "instanceType": "g6e.12xlarge",
            "configurations": [
                {
                    "quantization": "none",
                    "container": "TGI 2.2.0",
                    "status": "OK",
                    "tokensPerSecond": "17",
                },
                {
                    "quantization": "none",
                    "container": "vLLM 0.5.5",
                    "status": "OK",
                    "tokensPerSecond": "17.8",
                },
                {
                    "quantization": "none",
                    "container": "SGLang 0.2.13",
                    "status": "OK",
                    "tokensPerSecond": "18.2",
                },
            ],
        },
        {
            "instanceType": "p4d.24xlarge",
            "quantization": "none",
            "container": "TGI 2.2.0",
            "status": "OK",
            "tokensPerSecond": "40",
            "notes": '"MAX_INPUT_LENGTH": "16384", "MAX_TOTAL_TOKENS": "32768",',
        },
        # NOTE(review): this entry has no "tokensPerSecond" (or "notes") key;
        # consumers presumably need to tolerate its absence - confirm.
        {
            "instanceType": "p4de.24xlarge",
            "quantization": "none",
            "container": "TGI 2.2.0",
            "status": "waiting for quota",
        },
        # The p5.48xlarge results below are separate flat entries (not a
        # group); "instanceType" doubles as a label for the GPU count used.
        {
            "instanceType": "p5.48xlarge",
            "quantization": "none",
            "container": "TGI 2.2.0",
            "status": "OK",
            "tokensPerSecond": "58",
            "notes": '"MAX_INPUT_LENGTH": "16384", "MAX_TOTAL_TOKENS": "32768",',
        },
        {
            "instanceType": "p5.48xlarge",
            "quantization": "none",
            "container": "vLLM 0.6.4.post1",
            "status": "OK",
            "tokensPerSecond": "76",
            "notes": "--tensor-parallel-size 8",
        },
        {
            "instanceType": "p5.48xlarge (4 GPUs)",
            "quantization": "none",
            "container": "vLLM 0.6.4.post1",
            "status": "OK",
            "tokensPerSecond": "51",
            "notes": "--tensor-parallel-size 4",
        },
        {
            "instanceType": "p5.48xlarge (2 GPUs)",
            "quantization": "none",
            "container": "vLLM 0.6.4.post1",
            "status": "OK",
            "tokensPerSecond": "32",
            # Flag spelled with hyphens throughout, as in the vLLM CLI docs
            # (was "--gpu_memory-utilization").
            "notes": "--tensor-parallel-size 2 --max-model-len 16384 --gpu-memory-utilization 0.95",
        },
    ],
}