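"""Benchmark ternary TriLM models with llama.cpp.

Example invocation (script name and llama.cpp checkout location assumed):
    python3 bench-TriLMs.py --cpu --out result.json
"""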
from __future__ import annotations

import argparse
import json
import logging
import os
import shlex
import subprocess
import sys
from pathlib import Path
from typing import Any, Sequence
from urllib import request

curdir = Path(os.path.dirname(__file__))

logger = logging.getLogger("bench")

MODEL_DIR = curdir / "bench-TriLMs-models"
LLAMA_CPP_PATH = curdir / "."
MODEL_SIZES = ("1.5", "2.4", "3.9")
ALL_TYPES = ("TQ1_0", "TQ2_0", "Q4_K_M", "Q8_0", "F16", "BF16")
GPU_TYPES = ("TQ2_0", "Q4_K_M", "Q8_0", "F16")


def gather_models(sizes: Sequence[str] = MODEL_SIZES):
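    """Download the F16 TriLM source models from Hugging Face into MODEL_DIR if missing."""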
    logger.info("Gathering models")
    MODEL_DIR.mkdir(parents=True, exist_ok=True)
    for size in sizes:
        filename = f"TriLM_{size}B_Unpacked-TQ1_0-F16.gguf"
        file = MODEL_DIR / filename
        if not file.exists():
            url = (
                f"https://huggingface.co/compilade/quant-tests/resolve/main/{filename}"
            )
            logger.info(f"Fetching {filename} from {url}")
            request.urlretrieve(url, file)


def build_llama_cpp(options: Sequence[str]):
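    """Configure llama.cpp with CMake and build llama-bench, llama-quantize and test-backend-ops, recreating the build directory."""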
    logger.info("Building llama.cpp")
    os.chdir(LLAMA_CPP_PATH)
    builddir = LLAMA_CPP_PATH / "build"
    if builddir.exists():
        os.system("pwd")
        os.system("rm -Ir build")
    builddir.mkdir()
    os.chdir(builddir)
    os.system(shlex.join(("cmake", "..", *options)))
    os.system("make -j llama-bench llama-quantize test-backend-ops")


def quantize(types: Sequence[str] = ALL_TYPES, sizes: Sequence[str] = MODEL_SIZES):
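    """Quantize (or requantize) the source models into every tested type, skipping files that already exist."""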
    logger.info("Make all model types we'll test")
    for size in sizes:
        source = MODEL_DIR / f"TriLM_{size}B_Unpacked-TQ1_0-F16.gguf"
        for ty in types:
            target = MODEL_DIR / f"TriLM_{size}B_Unpacked-{ty}.gguf"
            if not target.exists():
                command = shlex.join(
                    (
                        str(LLAMA_CPP_PATH / "build" / "bin" / "llama-quantize"),
                        "--allow-requantize",
                        str(source),
                        str(target),
                        ty,
                    )
                )
                logger.info("Running: %s", command)
                os.system(command)


def llama_bench(
    repetitions: int = 5,
    types: Sequence[str] = ALL_TYPES,
    sizes: Sequence[str] = MODEL_SIZES,
) -> list[dict[str, Any]]:
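    """Benchmark each model with llama-bench over power-of-two thread counts and return the parsed JSON results."""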
    logger.info("Test each model one by one for different numbers of threads")

    threads = [2**i for i in range(5) if 2**i <= os.cpu_count()]
    logger.info(f"Numbers of threads to be tested: {threads}")

    out = []

    for size in sizes:
        for ty in types:
            for th in threads:
                model_path = MODEL_DIR / f"TriLM_{size}B_Unpacked-{ty}.gguf"
                args = [
                    "-v",
                    "-m",
                    str(model_path),
                    "-t",
                    str(th),
                    "-r",
                    str(repetitions),
                    "-p",
                    "512",
                    "-n",
                    "128",
                    "-o",
                    "json",
                ]
                result = subprocess.run(
                    [str(LLAMA_CPP_PATH / "build" / "bin" / "llama-bench")] + args,
                    capture_output=True,
                )
                logger.debug(result.stderr)

                new_output = json.loads(result.stdout)
                logger.info(json.dumps(new_output, indent=4))
                out.extend(new_output)
    return out


def test_backend_perf() -> str:
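    """Measure MUL_MAT performance with test-backend-ops and return its raw text output."""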
    result = subprocess.run(
        [
            str(LLAMA_CPP_PATH / "build" / "bin" / "test-backend-ops"),
            "perf",
            "-o",
            "MUL_MAT",
        ],
        capture_output=True,
    )
    return result.stdout.decode(encoding="utf-8")


def parse_args(args: Sequence[str]):
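    """Parse command-line arguments; expects the full argv, including the program name."""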
    parser = argparse.ArgumentParser(
        prog=args[0], description="Benchmark ternary models"
    )
    parser.add_argument("--gpu", action="store_true", help="Run benchmarks on GPU")
    parser.add_argument("--cpu", action="store_true", help="Run benchmarks on CPU")
    parser.add_argument(
        "--llama-cpp-path",
        type=Path,
        default=LLAMA_CPP_PATH,
        help="Path to a llama.cpp checkout",
    )
    parser.add_argument(
        "--model-dir",
        type=Path,
        default=MODEL_DIR,
        help="Where the tested models will be stored",
    )
    parser.add_argument(
        "--repetitions",
        type=int,
        default=5,
        help="How many repetitions are run for each test",
    )
    parser.add_argument(
        "--out",
        type=Path,
        default=Path(os.path.curdir) / "result.json",
        help="Path where the benchmark results will be written",
    )
    return parser.parse_args(args[1:])


if __name__ == "__main__":
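    # Configure logging so the INFO progress messages from the helpers above are visible.
    logging.basicConfig(level=logging.INFO)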
    args = parse_args(sys.argv)

    # Resolve to absolute paths, since build_llama_cpp() changes the working directory.
    LLAMA_CPP_PATH = args.llama_cpp_path.resolve()
    MODEL_DIR = args.model_dir.resolve()
    args.out = args.out.resolve()

    results = []
    repetitions: int = args.repetitions

    if args.cpu:
        gather_models()
        build_llama_cpp(["-DGGML_NATIVE=ON", "-DGGML_CPU=ON"])
        quantize()
        results.extend(llama_bench(repetitions=repetitions))

    if args.gpu:
        gather_models()
        build_llama_cpp(["-DGGML_NATIVE=ON", "-DGGML_CUDA=ON", "-DGGML_CUDA_F16=ON"])
        quantize()
        results.extend(llama_bench(repetitions=repetitions, types=GPU_TYPES))
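
    # Record CPU information (lscpu) and raw MUL_MAT performance alongside the benchmark results.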
    cpuinfo = subprocess.run(["lscpu"], capture_output=True).stdout.decode(
        encoding="utf-8"
    )
    mulmat_perf = test_backend_perf()

    final_result = {
        "cpuinfo": cpuinfo,
        "mulmat_perf": mulmat_perf,
        "results": results,
    }

    with open(args.out, "w") as f:
        json.dump(final_result, f, indent=4)