#!/usr/bin/env bash

set -eux

cd "$(dirname "$0")"

MODEL_DIR="bench-TriLMs-models"
LLAMA_CPP_PATH="."

# Model sizes (in billions of parameters) and quantization types to benchmark.
sizes=("1.5" "2.4" "3.9")
types=("TQ1_0" "TQ2_0" "Q4_K_M" "Q8_0" "F16" "BF16")
gputypes=("TQ2_0" "Q4_K_M" "Q8_0" "F16")

# Download the F16 TriLM models (converted from TQ1_0) if not already present.
function gather_models() {
    echo "Gather the models"
    if [ ! -d "$MODEL_DIR" ]; then
        mkdir -p -- "$MODEL_DIR"
    fi
    (
        cd "$MODEL_DIR"
        for sz in "${sizes[@]}"; do
            filename="TriLM_${sz}B_Unpacked-TQ1_0-F16.gguf"
            if [ ! -f "$filename" ]; then
                wget "https://huggingface.co/compilade/quant-tests/resolve/main/${filename}"
            fi
        done
    )
}

# Build llama-bench and llama-quantize from scratch,
# using the cmake flags passed as arguments.
function build_llama_cpp() {
    echo "Build llama.cpp"
    (
        cd -- "$LLAMA_CPP_PATH"
        if [ -d build ]; then
            pwd
            rm -rf build
        fi
        mkdir build
        cd build
        cmake .. "$@"
        make -j llama-bench llama-quantize
    )
}

function quantize() {
    echo "Make all model types we'll test"
    (
        for sz in "${sizes[@]}"; do
            for ty in "${types[@]}"; do
                # Brace expansion yields two paths:
                # the TQ1_0-F16 source and the $ty destination.
                filenames=("$MODEL_DIR"/TriLM_"${sz}"B_Unpacked-{TQ1_0-F16,"$ty"}.gguf)
                if [ ! -f "${filenames[1]}" ]; then
                    "$LLAMA_CPP_PATH"/build/bin/llama-quantize --allow-requantize "${filenames[@]}" "$ty"
                fi
            done
        done
    )
}

function bench() {
    echo "Test each model one by one for different numbers of threads"
    for sz in "${sizes[@]}"; do
        for ty in "$@"; do
            for th in 1 2 4 8; do
                # A bare comma is printed after each JSON result
                # so that multiple runs can later be merged.
                {
                    "$LLAMA_CPP_PATH"/build/bin/llama-bench -v \
                        -m "${MODEL_DIR}/TriLM_${sz}B_Unpacked-${ty}.gguf" \
                        -t "${th}" -p 512 -n 128 -r 4 -o json
                    printf "%s\n" ","
                }
            done
        done
    done
}

function bench_cpu() {
    bench "${types[@]}" >> "$1"
}

function bench_gpu() {
    bench "${gputypes[@]}" >> "$1"
}

currentTime="$(date +'%s')"
resultFile="results-${currentTime}.json"
infoFile="results-${currentTime}-info.txt"

lscpu > "$infoFile"

gather_models
build_llama_cpp -DGGML_NATIVE=ON -DGGML_CPU=ON
quantize

# Record the on-disk model sizes alongside the CPU info.
echo "---" >> "$infoFile"
ls -go "$MODEL_DIR" >> "$infoFile"

bench_cpu "$resultFile"

if [ -x "$(command -v nvidia-smi)" ]; then
    echo "GPU detected, benchmark with that too."
    build_llama_cpp -DGGML_NATIVE=ON -DGGML_CUDA=ON -DGGML_CUDA_F16=ON
    bench_gpu "$resultFile"
fi
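
# Post-processing note: each llama-bench invocation appends its own JSON
# document followed by a bare "," line, so the result file is not valid JSON
# on its own. A minimal sketch of one way to merge the runs, assuming jq is
# available (the output filename "combined.json" is illustrative):
#
#   grep -v '^,$' "$resultFile" | jq -s 'flatten' > combined.json
#
# grep drops the separator commas, `jq -s` slurps the remaining JSON
# documents into a single array, and flatten merges them into one flat list.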