# benchmarks/results.py
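# Benchmark results for Arcee model deployments. Informal schema note (added
# for readability): "models" is a list of entries, each with a "name", a
# "modelType", and a list of "configurations". A configuration records the
# region, instance type, GPU setup, quantization, serving stack (the "tgi"
# field also holds vLLM and SGLang versions), a "status" of OK/KO, and the
# measured "tokensPerSecond". Some configurations share instance details and
# nest one run per quantization setting under an inner "configurations" list.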
results = {
"models": [
{
"name": "Arcee-Meraj",
"modelType": "Qwen2 72B",
"configurations": [
{
"region": "us-west-2",
"instanceType": "g5.12xlarge",
"cloud": "AWS",
"gpu": "4xNVIDIA A10G",
"gpuRAM": "96 GB",
"quantization": "awq",
"tgi": "TGI 2.2.0",
"status": "OK",
"tokensPerSecond": "33",
"notes": "",
},
{
"region": "us-west-2",
"instanceType": "p4d.24xlarge",
"cloud": "AWS",
"gpu": "4xNVIDIA A100",
"gpuRAM": "320 GB",
"quantization": "none",
"tgi": "TGI 2.2.0",
"status": "OK",
"tokensPerSecond": "38",
"notes": "",
}
],
},
{
"name": "Arcee-SuperNova",
"modelType": "Llama 3.1 70B",
"configurations": [
{
"region": "us-west-2",
"instanceType": "g5.12xlarge",
"cloud": "AWS",
"gpu": "4xNVIDIA A10G",
"gpuRAM": "96 GB",
"quantization": "awq",
"tgi": "TGI 2.2.0",
"status": "OK",
"tokensPerSecond": "33",
"notes": "",
},
{
"region": "us-west-2",
"instanceType": "p4d.24xlarge",
"cloud": "AWS",
"gpu": "4xNVIDIA A100",
"gpuRAM": "320 GB",
"quantization": "none",
"tgi": "TGI 2.2.0",
"status": "OK",
"tokensPerSecond": "38",
"notes": "",
}
],
},
{
"name": "Arcee-Nova",
"modelType": "Qwen2 72B",
"notes": "",
"configurations": [
{
"region": "us-west-2",
"instanceType": "g4dn.12xlarge",
"cloud": "AWS",
"gpu": "4xNVIDIA T4",
"gpuRAM": "64 GB",
"quantization": "bitsandbytes-nf4",
"tgi": "TGI 2.2.0",
"status": "KO",
"tokensPerSecond": "-",
"notes": "Flash Attention requires Ampere GPUs or newer",
},
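# The next entries share instance details and nest one run per
# quantization setting under an inner "configurations" list.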
{
"region": "us-west-2",
"instanceType": "g5.12xlarge",
"cloud": "AWS",
"gpu": "4xNVIDIA A10G",
"gpuRAM": "96 GB",
"configurations": [
{
"quantization": "bitsandbytes-nf4",
"tgi": "TGI 2.2.0",
"status": "OK",
"tokensPerSecond": "12",
},
{
"quantization": "bitsandbytes-fp4",
"tgi": "TGI 2.2.0",
"status": "OK",
"tokensPerSecond": "12",
},
{
"quantization": "bitsandbytes (int8)",
"tgi": "TGI 2.2.0",
"status": "KO",
"tokensPerSecond": "-",
"notes": "CUDA OOM",
},
{
"quantization": "eetq (int8)",
"tgi": "TGI 2.2.0",
"status": "KO",
"tokensPerSecond": "-",
"notes": "[FT Error] Heurisitc failed to find a valid config.",
},
],
},
{
"region": "us-west-2",
"instanceType": "g5.48xlarge",
"cloud": "AWS",
"gpu": "8xNVIDIA A10G",
"gpuRAM": "192 GB",
"configurations": [
{
"quantization": "none",
"tgi": "TGI 2.2.0",
"status": "KO",
"tokensPerSecond": "-",
"notes": "CUDA OOM (but g6.48xlarge works!)",
},
{
"quantization": "bitsandbytes-nf4",
"tgi": "TGI 2.2.0",
"status": "OK",
"tokensPerSecond": "12.3",
},
{
"quantization": "bitsandbytes-fp4",
"tgi": "TGI 2.2.0",
"status": "OK",
"tokensPerSecond": "12.5",
},
{
"quantization": "bitsandbytes (int8)",
"tgi": "TGI 2.2.0",
"status": "KO",
"tokensPerSecond": "-",
"notes": "The model deploys, but inference times out.",
},
],
},
{
"region": "us-west-2",
"instanceType": "g6.12xlarge",
"cloud": "AWS",
"gpu": "4xNVIDIA L4",
"gpuRAM": "96 GB",
"configurations": [
{
"quantization": "bitsandbytes-nf4",
"tgi": "TGI 2.2.0",
"status": "OK",
"tokensPerSecond": "1.5-2",
"notes": "Too slow, timeouts are likely",
},
{
"quantization": "bitsandbytes-fp4",
"tgi": "TGI 2.2.0",
"status": "OK",
"tokensPerSecond": "2",
"notes": "Too slow, timeouts are likely",
},
{
"quantization": "bitsandbytes (int8)",
"tgi": "TGI 2.2.0",
"status": "KO",
"tokensPerSecond": "-",
"notes": "CUDA OOM",
},
],
},
{
"region": "us-west-2",
"instanceType": "g6.48xlarge",
"cloud": "AWS",
"gpu": "8xNVIDIA L4",
"gpuRAM": "192 GB",
"quantization": "none",
"tgi": "TGI 2.2.0",
"status": "OK",
"tokensPerSecond": "12",
},
{
"region": "us-west-2",
"instanceType": "p4d.24xlarge",
"cloud": "AWS",
"gpu": "8xNVIDIA A100",
"gpuRAM": "320 GB",
"quantization": "none",
"tgi": "TGI 2.2.0",
"status": "OK",
"tokensPerSecond": "40",
"notes": '"MAX_INPUT_LENGTH": "16384", "MAX_TOTAL_TOKENS": "32768",',
},
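# Notes quoting '"MAX_INPUT_LENGTH"' / '"MAX_TOTAL_TOKENS"' (or the
# "MAX_INPUT_TOKENS" variant below) record the container environment
# variables used to set the context length for that run.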
{
"region": "us-west-2",
"instanceType": "p4de.24xlarge",
"cloud": "AWS",
"gpu": "8xNVIDIA A100",
"gpuRAM": "320 GB",
"quantization": "none",
"tgi": "TGI 2.2.0",
"status": "waiting for quota",
},
{
"region": "us-west-2",
"instanceType": "p5.48xlarge",
"cloud": "AWS",
"gpu": "8xNVIDIA H100",
"gpuRAM": "640GB",
"quantization": "none",
"tgi": "TGI 2.2.0",
"status": "OK",
"tokensPerSecond": "58",
"notes": '"MAX_INPUT_LENGTH": "16384", "MAX_TOTAL_TOKENS": "32768",',
},
{
"region": "us-west-2",
"instanceType": "inf2.*",
"cloud": "AWS",
"gpu": "-",
"tgi": "TGI 2.2.0",
"status": "not supported",
"tokensPerSecond": "-",
"notes": "Qwen2: TGI OK, Neuron SDK KO, optimum-neuron KO",
},
],
},
{
"name": "Llama-Spark",
"modelType": "Llama 3.1 8B",
"configurations": [
{
"region": "AWS",
"instanceType": "g5.2xlarge",
"cloud": "AWS",
"gpu": "1xNVIDIA A10G",
"gpuRAM": "24 GB",
"quantization": "none",
"tgi": "TGI 2.2.0",
"status": "OK",
"tokensPerSecond": "29",
"notes": "4K/8K fails",
},
{
"region": "AWS",
"instanceType": "g5.12xlarge",
"cloud": "AWS",
"gpu": "4xNVIDIA A10G",
"gpuRAM": "96 GB",
"quantization": "none",
"tgi": "TGI 2.2.0",
"status": "OK",
"tokensPerSecond": "85",
"notes": '"MAX_INPUT_TOKENS": "16384", "MAX_TOTAL_TOKENS": "32768",',
},
{
"region": "AWS",
"instanceType": "g5.48xlarge",
"cloud": "AWS",
"gpu": "8xNVIDIA A10G",
"gpuRAM": "192 GB",
"quantization": "none",
"tgi": "TGI 2.2.0",
"status": "OK",
"tokensPerSecond": "105",
"notes": '"MAX_INPUT_TOKENS": "20480", "MAX_TOTAL_TOKENS": "40960"\n\n32K/64K fails',
},
{
"region": "AWS",
"instanceType": "g6.2xlarge",
"cloud": "AWS",
"gpu": "1xNVIDIA L4",
"gpuRAM": "24 GB",
"configurations": [
{
"quantization": "none",
"tgi": "TGI 2.2.0",
"status": "OK",
"tokensPerSecond": "15",
},
{"quantization": "fp8", "tgi": "TGI 2.2.0"},
],
},
{
"region": "AWS",
"instanceType": "g6.12xlarge",
"cloud": "AWS",
"gpu": "4xNVIDIA L4",
"gpuRAM": "96 GB",
"quantization": "none",
"tgi": "TGI 2.2.0",
"status": "OK",
"tokensPerSecond": "51",
"notes": "same as g5?",
},
{
"region": "AWS",
"instanceType": "g6.48xlarge",
"cloud": "AWS",
"gpu": "8xNVIDIA L4",
"gpuRAM": "192 GB",
"quantization": "none",
"tgi": "TGI 2.2.0",
"status": "OK",
"tokensPerSecond": "81",
"notes": "same as g5?",
},
{
"region": "AWS",
"instanceType": "g6e.2xlarge",
"cloud": "AWS",
"gpu": "1xNVIDIA L40S",
"gpuRAM": "48 GB",
"quantization": "none",
"tgi": "TGI 2.2.0",
"status": "OK",
"tokensPerSecond": "42.1",
},
{
"region": "AWS",
"instanceType": "g6e.2xlarge",
"cloud": "AWS",
"gpu": "1xNVIDIA L40S",
"gpuRAM": "48 GB",
"quantization": "none",
"tgi": "SGLang 0.2.13",
"status": "OK",
"tokensPerSecond": "45",
},
{
"region": "AWS",
"instanceType": "p4d.24xlarge",
"cloud": "AWS",
"gpu": "4xNVIDIA A100",
"gpuRAM": "320 GB",
"quantization": "none",
"tgi": "TGI 2.2.0",
"status": "OK",
"tokensPerSecond": "145",
"notes": '"MAX_INPUT_TOKENS": "40960", "MAX_TOTAL_TOKENS": "81920"\n\n64K/128K fails (even with 4-bit)',
},
{
"region": "AWS",
"instanceType": "inf2.*",
"cloud": "AWS",
"gpu": "-",
"status": "not supported",
"tokensPerSecond": "-",
"notes": "Llama-3.1: TGI OK, Neuron SDK OK, optimum-neuron KO",
},
],
},
{
"name": "Arcee-Agent",
"modelType": "Qwen2 7B",
"notes": "",
"configurations": [
{
"region": "us-west-2",
"instanceType": "g5.2xlarge",
"cloud": "AWS",
"gpu": "1xNVIDIA A10G",
"gpuRAM": "24 GB",
"quantization": "none",
"tgi": "TGI 2.2.0",
"status": "OK",
"tokensPerSecond": "30",
},
{
"region": "us-west-2",
"instanceType": "g5.12xlarge",
"cloud": "AWS",
"gpu": "4xNVIDIA A10G",
"gpuRAM": "96 GB",
"quantization": "none",
"tgi": "TGI 2.2.0",
"status": "OK",
"tokensPerSecond": "83",
},
{
"region": "us-west-2",
"instanceType": "g5.48xlarge",
"cloud": "AWS",
"gpu": "8xNVIDIA A10G",
"gpuRAM": "192 GB",
"quantization": "none",
"tgi": "TGI 2.2.0",
"status": "KO",
"tokensPerSecond": "-",
"notes": "ValueError: `num_heads` must be divisible by `num_shards` (got `num_heads`: 28 and `num_shards`: 8\n\nSM_NUM_GPUS=7 doesn't work either because tensor size ares not a multiple of 7 (e.g., 512)",
},
{
"region": "us-west-2",
"instanceType": "g6.2xlarge",
"cloud": "AWS",
"gpu": "1xNVIDIA L4",
"gpuRAM": "24 GB",
"quantization": "none",
"tgi": "TGI 2.2.0",
"status": "OK",
"tokensPerSecond": "16.3",
},
{
"region": "us-west-2",
"instanceType": "g6.12xlarge",
"cloud": "AWS",
"gpu": "4xNVIDIA L4",
"gpuRAM": "96 GB",
"quantization": "none",
"tgi": "TGI 2.2.0",
"status": "OK",
"tokensPerSecond": "54.2",
},
{
"region": "us-west-2",
"instanceType": "inf2.*",
"cloud": "AWS",
"gpu": "-",
"tgi": "TGI 2.2.0",
"status": "not supported",
"tokensPerSecond": "-",
"notes": "Qwen2: TGI OK, Neuron SDK KO, optimum-neuron KO",
},
],
},
{"name": "Arcee-Spark", "modelType": "Qwen2 7B"},
{
"name": "Arcee-Lite",
"modelType": "Qwen2 1.5B distilled from phi-3-medium 14B",
"configurations": [
{
"region": "us-west-2",
"instanceType": "c6i.xlarge",
"cloud": "AWS",
"gpu": "-",
"gpuRAM": "-",
"quantization": "bitsandbytes-nf4",
"tgi": "TGI 2.2.0",
"status": "KO",
"tokensPerSecond": "-",
"notes": "OOM, might work with a prequantized model",
},
{
"region": "us-west-2",
"instanceType": "c6i.2xlarge",
"cloud": "AWS",
"gpu": "-",
"gpuRAM": "-",
"quantization": "bitsandbytes-nf4",
"tgi": "TGI 2.2.0",
"status": "KO",
"tokensPerSecond": "-",
"notes": "OOM, might work with a prequantized model",
},
{
"region": "us-west-2",
"instanceType": "c6i.4xlarge",
"cloud": "AWS",
"gpu": "-",
"gpuRAM": "-",
"configurations": [
{
"quantization": "none",
"tgi": "TGI 2.2.0",
"status": "OK",
"tokensPerSecond": "10.7",
},
{
"quantization": "bitsandbytes (int8)",
"tgi": "TGI 2.2.0",
"status": "OK",
"tokensPerSecond": "10.5",
},
{
"quantization": "bitsandbytes-nf4",
"tgi": "TGI 2.2.0",
"status": "OK",
"tokensPerSecond": "10.6",
},
],
},
{
"region": "us-west-2",
"instanceType": "c7i.4xlarge",
"cloud": "AWS",
"gpu": "-",
"gpuRAM": "-",
"quantization": "none",
"tgi": "TGI 2.2.0",
"status": "waiting for quota",
"tokensPerSecond": "-",
},
{
"region": "us-west-2",
"instanceType": "g5.xlarge",
"cloud": "AWS",
"gpu": "1xNVIDIA A10G",
"gpuRAM": "24 GB",
"configurations": [
{
"quantization": "none",
"tgi": "TGI 2.2.0",
"status": "OK",
"tokensPerSecond": "110",
},
{
"quantization": "none",
"tgi": "DJL 0.28 vLLM",
"status": "OK",
"tokensPerSecond": "105",
"notes": '"OPTION_MAX_MODEL_LEN": "32768",',
},
],
},
{
"region": "us-west-2",
"instanceType": "g6e.2xlarge",
"cloud": "AWS",
"gpu": "1xNVIDIA L40S",
"gpuRAM": "48 GB",
"quantization": "none",
"tgi": "TGI 2.2.0",
"status": "OK",
"tokensPerSecond": "160",
},
],
},
{
"name": "Arcee-Scribe",
"modelType": "InternLM2.5 8B",
"configurations": [
{
"cloud": "AWS",
"instanceType": "g5.2xlarge",
"gpu": "1xNVIDIA A10G",
"gpuRAM": "24 GB",
"quantization": "none",
"tgi": "DJL 0.28 vLLM",
"status": "OK",
"tokensPerSecond": 29,
"notes": '"OPTION_MAX_MODEL_LEN": "32768",\n"TENSOR_PARALLEL_DEGREE": "max",',
},
{
"cloud": "AWS",
"instanceType": "g5.12xlarge",
"gpu": "4xNVIDIA A10G",
"gpuRAM": "96 GB",
"quantization": "none",
"tgi": "DJL 0.28 vLLM",
"status": "OK",
"tokensPerSecond": 65,
"notes": '"OPTION_MAX_MODEL_LEN": "32768",\n"TENSOR_PARALLEL_DEGREE": "max",\nNot supported by AutoAWQ and AutoGPTQ',
},
{
"cloud": "AWS",
"instanceType": "g5.48xlarge",
"gpu": "8xNVIDIA A10G",
"gpuRAM": "192 GB",
"quantization": "none",
"tgi": "DJL 0.28 vLLM",
"status": "OK",
"tokensPerSecond": 80,
"notes": '"OPTION_MAX_MODEL_LEN": "32768",\n"TENSOR_PARALLEL_DEGREE": "max",',
},
{
"cloud": "AWS",
"instanceType": "g6.2xlarge",
"gpu": "1xNVIDIA L4",
"gpuRAM": "24 GB",
"quantization": "none",
"tgi": "DJL 0.28 vLLM",
"status": "OK",
"tokensPerSecond": 16,
"notes": '"OPTION_MAX_MODEL_LEN": "4096"',
},
{
"cloud": "AWS",
"instanceType": "g6.12xlarge",
"gpu": "4xNVIDIA L4",
"gpuRAM": "96 GB",
"quantization": "none",
"tgi": "DJL 0.28 vLLM",
"status": "OK",
"tokensPerSecond": 50,
"notes": '"OPTION_MAX_MODEL_LEN": "32768",\n"TENSOR_PARALLEL_DEGREE": "max",',
},
{
"cloud": "AWS",
"instanceType": "g6.48xlarge",
"gpu": "8xNVIDIA L4",
"gpuRAM": "192 GB",
"quantization": "none",
"tgi": "DJL 0.28 vLLM",
"status": "OK",
"tokensPerSecond": 69,
"notes": '"OPTION_MAX_MODEL_LEN": "32768",\n"TENSOR_PARALLEL_DEGREE": "max",',
},
{
"cloud": "AWS",
"instanceType": "g6e.2xlarge",
"gpu": "1xNVIDIA L40S",
"gpuRAM": "48 GB",
"quantization": "none",
"tgi": "SGLang 0.2.13",
"status": "OK",
"tokensPerSecond": 46,
},
{
"cloud": "AWS",
"instanceType": "p4d.24xlarge",
"gpu": "4xNVIDIA A100",
"gpuRAM": "320 GB",
"quantization": "none",
"tgi": "DJL 0.28 vLLM",
"status": "OK",
"tokensPerSecond": 82,
"notes": '"OPTION_MAX_MODEL_LEN": "32768",\n"TENSOR_PARALLEL_DEGREE": "max",',
},
],
},
]
}
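

# --- Usage sketch (not part of the recorded results) ---
# A minimal example of walking the structure above: it flattens the nested
# "configurations" entries, parses the mixed tokensPerSecond formats
# (strings, ranges such as "1.5-2", and "-" for failed runs), and prints the
# best measured throughput per model. Helper names are illustrative.


def _parse_tps(value):
    """Parse a tokensPerSecond field into a float, or None if unusable."""
    if isinstance(value, (int, float)):
        return float(value)
    text = str(value).strip()
    if not text or text == "-":
        return None
    # For a range such as "1.5-2", keep the upper bound.
    if "-" in text:
        text = text.split("-")[-1]
    try:
        return float(text)
    except ValueError:
        return None


def best_throughput_per_model(data):
    """Return {model name: (tokens/s, instance type)} over successful runs."""
    best = {}
    for model in data["models"]:
        for config in model.get("configurations", []):
            # Nested per-quantization runs inherit the outer instance type.
            for run in config.get("configurations", [config]):
                if run.get("status") != "OK":
                    continue
                tps = _parse_tps(run.get("tokensPerSecond", "-"))
                if tps is None:
                    continue
                current = best.get(model["name"])
                if current is None or tps > current[0]:
                    best[model["name"]] = (tps, config["instanceType"])
    return best


if __name__ == "__main__":
    for name, (tps, instance) in sorted(best_throughput_per_model(results).items()):
        print(f"{name}: {tps:g} tokens/s on {instance}")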