compilade
/

quant-tests

GGUF

Inference Endpoints

Model card Files Files and versions Community

compilade committed 16 days ago

Commit

7b12d2f

1 Parent(s): eac53d7

Add a Python-based benchmarking script

Browse files

Files changed (2) hide show

bench-TriLMs.py +197 -0
bench-TriLMs.sh +1 -1

bench-TriLMs.py ADDED Viewed

	@@ -0,0 +1,197 @@

+#!/usr/bin/env python3
+from __future__ import annotations
+from pathlib import Path
+from urllib import request
+import os
+import shlex
+import subprocess
+import sys
+from typing import Any, Sequence
+import logging
+import json
+import argparse
+# Directory containing this script; the default paths below are derived from it.
+curdir = Path(os.path.dirname(__file__))
+logger = logging.getLogger("bench")
+# Default directory holding the downloaded and quantized models
+# (replaced by --model-dir when the script is run directly).
+MODEL_DIR = curdir / "bench-TriLMs-models"
+# Default llama.cpp checkout (replaced by --llama-cpp-path when run directly).
+LLAMA_CPP_PATH = curdir / "."
+# Size tags substituted into the "TriLM_{size}B_Unpacked-..." model file names.
+MODEL_SIZES = ("1.5", "2.4", "3.9")
+# Type names passed to llama-quantize; the default set for quantize() and llama_bench().
+ALL_TYPES = ("TQ1_0", "TQ2_0", "Q4_K_M", "Q8_0", "F16", "BF16")
+# Types benchmarked by the --gpu run.
+GPU_TYPES = ("TQ2_0", "Q4_K_M", "Q8_0", "F16")
+def gather_models(sizes: Sequence[str] = MODEL_SIZES):
+    """Download the source TriLM models that are not already in MODEL_DIR.
+    Args:
+        sizes: size tags to fetch; each maps to the file
+            ``TriLM_{size}B_Unpacked-TQ1_0-F16.gguf``.
+    Raises:
+        Whatever ``urllib.request.urlretrieve`` raises when a download fails.
+    """
+    logger.info("Gathering models")
+    MODEL_DIR.mkdir(parents=True, exist_ok=True)
+    for size in sizes:
+        filename = f"TriLM_{size}B_Unpacked-TQ1_0-F16.gguf"
+        file = MODEL_DIR / filename
+        if file.exists():
+            continue
+        url = f"https://huggingface.co/compilade/quant-tests/resolve/main/{filename}"
+        logger.info("Fetching %s from %s", filename, url)
+        # Download next to the final location and rename only on success, so an
+        # interrupted transfer never leaves a truncated file that a later run
+        # would mistake for a complete model.
+        partial = file.with_name(filename + ".part")
+        try:
+            request.urlretrieve(url, partial)
+        except BaseException:
+            # Also covers Ctrl-C; the exception is always re-raised.
+            partial.unlink(missing_ok=True)
+            raise
+        partial.replace(file)
+def build_llama_cpp(options: Sequence[str]):
+    """Configure and build the llama.cpp tools used by the benchmark.
+    Builds ``llama-bench``, ``llama-quantize`` and ``test-backend-ops`` in
+    ``LLAMA_CPP_PATH / "build"``.
+    Args:
+        options: extra arguments appended to the ``cmake ..`` invocation.
+    Raises:
+        subprocess.CalledProcessError: if ``cmake`` or ``make`` fails.
+    """
+    logger.info("Building llama.cpp")
+    # Resolve once and pass cwd= to every command instead of calling os.chdir():
+    # changing the process-wide working directory would re-anchor every relative
+    # path used afterwards (LLAMA_CPP_PATH, MODEL_DIR, --out) and break a
+    # second build in the same run.
+    source_dir = Path(LLAMA_CPP_PATH).resolve()
+    builddir = source_dir / "build"
+    if builddir.exists():
+        # Show where the removal happens; "rm -I" asks for confirmation once.
+        print(source_dir, flush=True)
+        subprocess.run(["rm", "-Ir", "build"], cwd=source_dir)
+    # exist_ok: if the user declined the removal, reuse the existing directory
+    # rather than crashing with FileExistsError.
+    builddir.mkdir(exist_ok=True)
+    subprocess.run(["cmake", "..", *options], cwd=builddir, check=True)
+    subprocess.run(
+        ["make", "-j", "llama-bench", "llama-quantize", "test-backend-ops"],
+        cwd=builddir,
+        check=True,
+    )
+def quantize(types: Sequence[str] = ALL_TYPES, sizes: Sequence[str] = MODEL_SIZES):
+    """Create every missing ``TriLM_{size}B_Unpacked-{type}.gguf`` in MODEL_DIR.
+    Each target is produced by running ``llama-quantize --allow-requantize``
+    on the ``TQ1_0-F16`` source model of the same size; existing targets are
+    left untouched.
+    Args:
+        types: quantization type names passed to ``llama-quantize``.
+        sizes: model size tags to process.
+    Raises:
+        subprocess.CalledProcessError: if ``llama-quantize`` exits non-zero.
+    """
+    logger.info("Make all model types we'll test")
+    quantize_bin = str(LLAMA_CPP_PATH / "build" / "bin" / "llama-quantize")
+    for size in sizes:
+        source = MODEL_DIR / f"TriLM_{size}B_Unpacked-TQ1_0-F16.gguf"
+        for ty in types:
+            target = MODEL_DIR / f"TriLM_{size}B_Unpacked-{ty}.gguf"
+            if target.exists():
+                continue
+            command = [quantize_bin, "--allow-requantize", str(source), str(target), ty]
+            logger.info("Running: %s", shlex.join(command))
+            try:
+                # Argument list (no shell) and check=True: a failed quantization
+                # stops the run here instead of surfacing later as a benchmark
+                # of a missing or broken model.
+                subprocess.run(command, check=True)
+            except BaseException:
+                # Drop a partially written target so the next run retries it.
+                target.unlink(missing_ok=True)
+                raise
+def llama_bench(
+    repetitions: int = 5,
+    types: Sequence[str] = ALL_TYPES,
+    sizes: Sequence[str] = MODEL_SIZES,
+) -> list[dict[str, Any]]:
+    """Run ``llama-bench`` on every (size, type, thread count) combination.
+    Thread counts are the powers of two from 1 to 16 that do not exceed the
+    number of CPUs. Each run uses ``-p 512 -n 128`` and JSON output.
+    Args:
+        repetitions: value passed to ``llama-bench -r``.
+        types: model type names to benchmark.
+        sizes: model size tags to benchmark.
+    Returns:
+        The concatenation of the JSON lists printed by every run.
+    Raises:
+        subprocess.CalledProcessError: if ``llama-bench`` exits non-zero.
+    """
+    logger.info("Test each model one by one for different numbers of threads")
+    # os.cpu_count() may return None when the count cannot be determined;
+    # fall back to a single thread instead of failing on the comparison.
+    cpu_count = os.cpu_count() or 1
+    threads = [2**i for i in range(5) if 2**i <= cpu_count]
+    logger.info(f"Numbers of threads to be tested: {threads}")
+    bench_bin = str(LLAMA_CPP_PATH / "build" / "bin" / "llama-bench")
+    out = []
+    for size in sizes:
+        for ty in types:
+            model_path = MODEL_DIR / f"TriLM_{size}B_Unpacked-{ty}.gguf"
+            for th in threads:
+                args = [
+                    "-v",
+                    "-m",
+                    str(model_path),
+                    "-t",
+                    str(th),
+                    "-r",
+                    str(repetitions),
+                    "-p",
+                    "512",
+                    "-n",
+                    "128",
+                    "-o",
+                    "json",
+                ]
+                result = subprocess.run([bench_bin, *args], capture_output=True)
+                logger.debug(result.stderr.decode("utf-8", errors="replace"))
+                # Report a failed run as such (with its exit status) rather than
+                # as a JSON decoding error on empty or partial output.
+                result.check_returncode()
+                new_output = json.loads(result.stdout)
+                logger.info(json.dumps(new_output, indent=4))
+                out.extend(new_output)
+    return out
+def test_backend_perf() -> str:
+    """Run ``test-backend-ops perf -o MUL_MAT`` from the build tree.
+    Returns:
+        The captured standard output of the command, decoded as UTF-8.
+    """
+    binary = LLAMA_CPP_PATH / "build" / "bin" / "test-backend-ops"
+    command = [str(binary), "perf", "-o", "MUL_MAT"]
+    completed = subprocess.run(command, capture_output=True)
+    raw_output = completed.stdout
+    return raw_output.decode(encoding="utf-8")
+def parse_args(args: Sequence[str]):
+    """Parse the benchmark script's command line.
+    Args:
+        args: the full argument vector; ``args[0]`` is used as the program
+            name in usage/help output and the rest is parsed as options.
+    Returns:
+        The ``argparse.Namespace`` with ``gpu``, ``cpu``, ``llama_cpp_path``,
+        ``model_dir``, ``repetitions`` and ``out``.
+    """
+    program_name = args[0]
+    cli = argparse.ArgumentParser(
+        prog=program_name,
+        description="Benchmark ternary models",
+    )
+    # Boolean switches selecting which benchmark runs are performed.
+    for flag, help_text in (
+        ("--gpu", "Run benchmarks on GPU"),
+        ("--cpu", "Run benchmarks on CPU"),
+    ):
+        cli.add_argument(flag, action="store_true", help=help_text)
+    # Path-valued options, defaulting to the module-level locations.
+    for flag, default_path, help_text in (
+        ("--llama-cpp-path", LLAMA_CPP_PATH, "Path to a llama.cpp checkout"),
+        ("--model-dir", MODEL_DIR, "Where the tested models will be stored"),
+    ):
+        cli.add_argument(flag, type=Path, default=default_path, help=help_text)
+    cli.add_argument(
+        "--repetitions",
+        type=int,
+        default=5,
+        help="How many repetitions are run for each test",
+    )
+    default_out = Path(os.path.curdir) / "result.json"
+    cli.add_argument(
+        "--out",
+        type=Path,
+        default=default_out,
+        help="Path of the benchmark results to be written",
+    )
+    options = args[1:]
+    return cli.parse_args(options)
+if __name__ == "__main__":
+    # Entry point: optionally run the CPU and/or GPU benchmark pass, then write
+    # the collected results together with machine information to --out.
+    args = parse_args(sys.argv)
+    # The functions above read these module-level names, so the command-line
+    # values take effect by rebinding them here.
+    LLAMA_CPP_PATH = args.llama_cpp_path
+    MODEL_DIR = args.model_dir
+    results = []
+    repetitions: int = args.repetitions
+    if args.cpu:
+        gather_models()
+        build_llama_cpp(["-DGGML_NATIVE=ON", "-DGGML_CPU=ON"])
+        quantize()
+        results.extend(llama_bench(repetitions=repetitions))
+    if args.gpu:
+        gather_models()
+        build_llama_cpp(["-DGGML_NATIVE=ON", "-DGGML_CUDA=ON", "-DGGML_CUDA_F16=ON"])
+        quantize()
+        results.extend(llama_bench(repetitions=repetitions, types=GPU_TYPES))
+    try:
+        cpuinfo = subprocess.run(["lscpu"], capture_output=True).stdout.decode(
+            encoding="utf-8"
+        )
+    except FileNotFoundError:
+        # lscpu is not available everywhere; do not lose the benchmark results
+        # that were just collected because of missing machine information.
+        logger.warning("lscpu not found; cpuinfo will be empty")
+        cpuinfo = ""
+    mulmat_perf = test_backend_perf()
+    final_result = {
+        "cpuinfo": cpuinfo,
+        "mulmat_perf": mulmat_perf,
+        "results": results,
+    }
+    with open(args.out, "w", encoding="utf-8") as f:
+        # Write the full report; dumping only `results` discarded the
+        # cpuinfo and mulmat_perf gathered above.
+        json.dump(final_result, f, indent=4)

bench-TriLMs.sh CHANGED Viewed

@@ -7,7 +7,7 @@ MODEL_DIR="bench-TriLMs-models"
 LLAMA_CPP_PATH="."
 sizes=("1.5" "2.4" "3.9")
 types=("TQ1_0" "TQ2_0" "Q4_K_M" "Q8_0" "F16" "BF16")
-gputypes=("Q4_K_M" "Q8_0" "F16" "BF16")
 function gather_models() {
   echo Gather the models

 LLAMA_CPP_PATH="."
 sizes=("1.5" "2.4" "3.9")
 types=("TQ1_0" "TQ2_0" "Q4_K_M" "Q8_0" "F16" "BF16")
+gputypes=("TQ2_0" "Q4_K_M" "Q8_0" "F16")
 function gather_models() {
   echo Gather the models