"""Performance results for the Llama-3.1-SuperNova-Lite model."""

# Benchmark results keyed by EC2 instance type. The c7i.4xlarge entry
# groups several quantization runs under a nested "configurations" list;
# every other entry records a single run inline. "tokensPerSecond" values
# of "xxx" are placeholders for measurements not yet taken.
results_llama_supernova_lite = {
    "name": "Llama-3.1-SuperNova-Lite",
    "modelType": "Llama 3.1 8B",
    "configurations": [
        {
            # Intel instance — multiple quantization levels benchmarked.
            "instanceType": "c7i.4xlarge",
            "configurations": [
                {
                    "quantization": "Q6_K",
                    "container": "llama.cpp 10/18/24",
                    "status": "OK",
                    "tokensPerSecond": "xxx",  # measurement pending
                    "notes": "AMX enabled, Flash Attention enabled",
                },
                {
                    "quantization": "Q5_K",
                    "container": "llama.cpp 10/18/24",
                    "status": "OK",
                    "tokensPerSecond": "xxx",  # measurement pending
                    "notes": "AMX enabled, Flash Attention enabled",
                },
                {
                    "quantization": "Q4_K",
                    "container": "llama.cpp 10/18/24",
                    "status": "OK",
                    "tokensPerSecond": "xxx",  # measurement pending
                    "notes": "AMX enabled, Flash Attention enabled",
                },
                {
                    "quantization": "IQ4_XS",
                    "container": "llama.cpp 10/18/24",
                    "status": "OK",
                    "tokensPerSecond": "xxx",  # measurement pending
                    "notes": "AMX enabled, Flash Attention enabled",
                },
            ],
        },
        # Graviton (ARM) instances — one quantization per entry.
        {
            "instanceType": "c7g.8xlarge",
            "quantization": "Q4_0_8_8",
            "container": "llama.cpp 9/18/24",
            "status": "OK",
            "tokensPerSecond": "39.7",
            "notes": "requantized from Q4_K_S",
        },
        {
            "instanceType": "c7g.16xlarge",
            "quantization": "Q4_0_8_8",
            "container": "llama.cpp 9/18/24",
            "status": "OK",
            "tokensPerSecond": "45.5",
            "notes": "",
        },
        {
            "instanceType": "c8g.4xlarge",
            "quantization": "Q4_0_4_8",
            "container": "llama.cpp 11/05/24",
            "status": "OK",
            "tokensPerSecond": "34",
            "notes": "with Flash Attention",
        },
        {
            "instanceType": "r8g.4xlarge",
            "quantization": "Q4_0_4_8",
            "container": "llama.cpp 9/11/24",
            "status": "OK",
            "tokensPerSecond": "40",
            "notes": "with Flash Attention",
        },
        {
            "instanceType": "r8g.8xlarge",
            "quantization": "Q4_0_4_8",
            "container": "llama.cpp 9/11/24",
            "status": "OK",
            "tokensPerSecond": "63",
            "notes": "with Flash Attention",
        },
        {
            "instanceType": "r8g.16xlarge",
            "quantization": "Q4_0_4_8",
            "container": "llama.cpp 9/11/24",
            "status": "OK",
            "tokensPerSecond": "70",
            "notes": "with Flash Attention",
        },
    ],
}