compilade committed on
Commit
7b12d2f
·
1 Parent(s): eac53d7

Add a Python-based benchmarking script

Browse files
Files changed (2) hide show
  1. bench-TriLMs.py +197 -0
  2. bench-TriLMs.sh +1 -1
bench-TriLMs.py ADDED
@@ -0,0 +1,197 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+ from urllib import request
7
+ import os
8
+ import shlex
9
+ import subprocess
10
+ import sys
11
+ from typing import Any, Sequence
12
+ import logging
13
+ import json
14
+ import argparse
15
+
16
# Directory that contains this script; the default paths below hang off it.
curdir = Path(__file__).parent

logger = logging.getLogger("bench")

# Where the downloaded and quantized model files are kept by default.
MODEL_DIR = curdir / "bench-TriLMs-models"
# Default llama.cpp checkout: the directory this script lives in.
LLAMA_CPP_PATH = curdir / "."
# Model sizes as they appear in the model file names ("TriLM_<size>B_...").
MODEL_SIZES = ("1.5", "2.4", "3.9")
# Quantization types benchmarked by default.
ALL_TYPES = (
    "TQ1_0",
    "TQ2_0",
    "Q4_K_M",
    "Q8_0",
    "F16",
    "BF16",
)
# Quantization types benchmarked when --gpu is given.
GPU_TYPES = (
    "TQ2_0",
    "Q4_K_M",
    "Q8_0",
    "F16",
)
25
+
26
+
27
def gather_models(sizes: Sequence[str] = MODEL_SIZES):
    """Download the base ``TQ1_0-F16`` model file for each size, if missing.

    Files are stored in ``MODEL_DIR`` (created when needed). A file that is
    already present is not downloaded again.

    Args:
        sizes: Model sizes to fetch, as used in the file names
            (``TriLM_<size>B_Unpacked-TQ1_0-F16.gguf``).

    Raises:
        urllib.error.URLError: If a download fails (propagated from
            ``urlretrieve``).
    """
    logger.info("Gathering models")
    MODEL_DIR.mkdir(parents=True, exist_ok=True)
    for size in sizes:
        filename = f"TriLM_{size}B_Unpacked-TQ1_0-F16.gguf"
        file = MODEL_DIR / filename
        if file.exists():
            continue
        url = f"https://huggingface.co/compilade/quant-tests/resolve/main/{filename}"
        logger.info(f"Fetching {filename} from {url}")
        # Download next to the final location and rename only on success, so
        # an interrupted transfer never leaves a truncated file that the
        # exists() check above would mistake for a complete model.
        partial = file.with_name(filename + ".part")
        try:
            request.urlretrieve(url, partial)
        except BaseException:
            partial.unlink(missing_ok=True)
            raise
        partial.replace(file)
40
+
41
+
42
def build_llama_cpp(options: Sequence[str]):
    """Configure and build the llama.cpp tools used by the benchmark.

    Runs ``cmake .. <options>`` followed by ``make`` for ``llama-bench``,
    ``llama-quantize`` and ``test-backend-ops`` inside
    ``LLAMA_CPP_PATH/build``. An existing build directory is removed through
    ``rm -Ir``, which asks for confirmation first; if it is still there
    afterwards it is reused.

    The working directory of the calling process is left untouched, so
    relative paths used elsewhere keep pointing at the same files.

    Args:
        options: Extra arguments passed to ``cmake``.

    Raises:
        subprocess.CalledProcessError: If ``rm``, ``cmake`` or ``make`` exits
            with a non-zero status.
    """
    logger.info("Building llama.cpp")
    root = Path(LLAMA_CPP_PATH).resolve()
    builddir = root / "build"
    if builddir.exists():
        # Show where the removal is about to happen before rm prompts.
        print(root, flush=True)
        subprocess.run(["rm", "-Ir", "build"], cwd=root, check=True)
    # exist_ok: the directory is still present when the prompt was declined.
    builddir.mkdir(exist_ok=True)
    subprocess.run(["cmake", "..", *options], cwd=builddir, check=True)
    subprocess.run(
        ["make", "-j", "llama-bench", "llama-quantize", "test-backend-ops"],
        cwd=builddir,
        check=True,
    )
53
+
54
+
55
def quantize(types: Sequence[str] = ALL_TYPES, sizes: Sequence[str] = MODEL_SIZES):
    """Create every missing quantized model variant with ``llama-quantize``.

    For each size, the ``TQ1_0-F16`` base file in ``MODEL_DIR`` is converted
    to ``TriLM_<size>B_Unpacked-<type>.gguf`` for each requested type.
    Targets that already exist are left alone.

    Args:
        types: Quantization types to produce.
        sizes: Model sizes to process.

    Raises:
        subprocess.CalledProcessError: If ``llama-quantize`` exits with a
            non-zero status.
    """
    logger.info("Make all model types we'll test")
    quantize_bin = str(LLAMA_CPP_PATH / "build" / "bin" / "llama-quantize")
    for size in sizes:
        source = MODEL_DIR / f"TriLM_{size}B_Unpacked-TQ1_0-F16.gguf"
        for ty in types:
            target = MODEL_DIR / f"TriLM_{size}B_Unpacked-{ty}.gguf"
            if target.exists():
                continue
            command = [
                quantize_bin,
                "--allow-requantize",
                str(source),
                str(target),
                ty,
            ]
            logger.info("Running: %s", shlex.join(command))
            try:
                subprocess.run(command, check=True)
            except BaseException:
                # The target did not exist before this call; drop whatever
                # was partially written so a later run does not skip it.
                target.unlink(missing_ok=True)
                raise
73
+
74
+
75
def llama_bench(
    repetitions: int = 5,
    types: Sequence[str] = ALL_TYPES,
    sizes: Sequence[str] = MODEL_SIZES,
) -> list[dict[str, Any]]:
    """Run ``llama-bench`` for every size/type/thread-count combination.

    Thread counts are the powers of two from 1 to 16 that do not exceed the
    number of CPUs reported by ``os.cpu_count()``. Each run uses ``-p 512``
    and ``-n 128`` with JSON output.

    Args:
        repetitions: Value passed to ``llama-bench -r``.
        types: Quantization types to benchmark.
        sizes: Model sizes to benchmark.

    Returns:
        The concatenation of the JSON lists printed by every run.

    Raises:
        subprocess.CalledProcessError: If ``llama-bench`` exits with a
            non-zero status.
    """
    logger.info("Test each model one by one for different numbers of threads")

    # os.cpu_count() may return None when the count cannot be determined.
    cpu_count = os.cpu_count() or 1
    threads = [2**i for i in range(5) if 2**i <= cpu_count]
    logger.info(f"Numbers of threads to be tested: {threads}")

    bench_bin = str(LLAMA_CPP_PATH / "build" / "bin" / "llama-bench")
    out: list[dict[str, Any]] = []

    for size in sizes:
        for ty in types:
            model_path = MODEL_DIR / f"TriLM_{size}B_Unpacked-{ty}.gguf"
            for th in threads:
                args = [
                    "-v",
                    "-m",
                    str(model_path),
                    "-t",
                    str(th),
                    "-r",
                    str(repetitions),
                    "-p",
                    "512",
                    "-n",
                    "128",
                    "-o",
                    "json",
                ]
                result = subprocess.run([bench_bin, *args], capture_output=True)
                stderr_text = result.stderr.decode("utf-8", errors="replace")
                logger.debug(stderr_text)
                if result.returncode != 0:
                    # Fail here with the tool's own diagnostics instead of
                    # letting json.loads choke on empty or partial output.
                    logger.error(stderr_text)
                    raise subprocess.CalledProcessError(
                        result.returncode,
                        result.args,
                        output=result.stdout,
                        stderr=result.stderr,
                    )

                new_output = json.loads(result.stdout)
                logger.info(json.dumps(new_output, indent=4))
                out.extend(new_output)
    return out
116
+
117
+
118
def test_backend_perf() -> str:
    """Run ``test-backend-ops perf -o MUL_MAT`` and return its stdout as text.

    The binary is looked up under ``LLAMA_CPP_PATH/build/bin``. The exit
    status is not checked and stderr is captured but discarded.
    """
    binary = LLAMA_CPP_PATH / "build" / "bin" / "test-backend-ops"
    command = [str(binary), "perf", "-o", "MUL_MAT"]
    completed = subprocess.run(command, capture_output=True)
    return completed.stdout.decode(encoding="utf-8")
129
+
130
+
131
def parse_args(args: Sequence[str]):
    """Parse an argv-style sequence into the benchmark options.

    ``args[0]`` is used as the program name shown in usage messages and the
    remaining elements are parsed as command-line options.

    Returns:
        The ``argparse.Namespace`` with ``gpu``, ``cpu``, ``llama_cpp_path``,
        ``model_dir``, ``repetitions`` and ``out``.
    """
    cli = argparse.ArgumentParser(
        prog=args[0],
        description="Benchmark ternary models",
    )
    add_option = cli.add_argument

    # Which benchmark passes to run.
    add_option("--gpu", action="store_true", help="Run benchmarks on GPU")
    add_option("--cpu", action="store_true", help="Run benchmarks on CPU")

    # Where things live on disk.
    add_option(
        "--llama-cpp-path",
        type=Path,
        default=LLAMA_CPP_PATH,
        help="Path to a llama.cpp checkout",
    )
    add_option(
        "--model-dir",
        type=Path,
        default=MODEL_DIR,
        help="Where the tested models will be stored",
    )

    add_option(
        "--repetitions",
        type=int,
        default=5,
        help="How many repetitions are run for each test",
    )
    add_option(
        "--out",
        type=Path,
        default=Path(os.path.curdir) / "result.json",
        help="Path of the benchmark results to be written",
    )

    return cli.parse_args(args[1:])
163
+
164
+
165
if __name__ == "__main__":
    args = parse_args(sys.argv)

    # No handler is configured anywhere else, so without this every
    # logger.info() progress message in this script is silently dropped.
    logging.basicConfig(level=logging.INFO)

    # Resolve user-supplied paths up front: the build step may change the
    # working directory, which would otherwise re-anchor relative paths.
    LLAMA_CPP_PATH = args.llama_cpp_path.resolve()
    MODEL_DIR = args.model_dir.resolve()
    out_path = args.out.resolve()

    results = []
    repetitions: int = args.repetitions

    if args.cpu:
        gather_models()
        build_llama_cpp(["-DGGML_NATIVE=ON", "-DGGML_CPU=ON"])
        quantize()
        results.extend(llama_bench(repetitions=repetitions))

    if args.gpu:
        gather_models()
        build_llama_cpp(["-DGGML_NATIVE=ON", "-DGGML_CUDA=ON", "-DGGML_CUDA_F16=ON"])
        quantize()
        results.extend(llama_bench(repetitions=repetitions, types=GPU_TYPES))

    try:
        cpuinfo = subprocess.run(["lscpu"], capture_output=True).stdout.decode(
            encoding="utf-8"
        )
    except OSError:
        # lscpu is not available everywhere; do not throw away finished
        # benchmark results just because the CPU description is missing.
        logger.warning("lscpu could not be run; cpuinfo left empty")
        cpuinfo = ""
    mulmat_perf = test_backend_perf()

    final_result = {
        "cpuinfo": cpuinfo,
        "mulmat_perf": mulmat_perf,
        "results": results,
    }
    # Write the full report (not only the raw llama-bench entries) so the
    # CPU description and MUL_MAT numbers end up in the output file.
    with open(out_path, "w") as f:
        json.dump(final_result, f, indent=4)
bench-TriLMs.sh CHANGED
@@ -7,7 +7,7 @@ MODEL_DIR="bench-TriLMs-models"
7
  LLAMA_CPP_PATH="."
8
  sizes=("1.5" "2.4" "3.9")
9
  types=("TQ1_0" "TQ2_0" "Q4_K_M" "Q8_0" "F16" "BF16")
10
- gputypes=("Q4_K_M" "Q8_0" "F16" "BF16")
11
 
12
  function gather_models() {
13
  echo Gather the models
 
7
  LLAMA_CPP_PATH="."
8
  sizes=("1.5" "2.4" "3.9")
9
  types=("TQ1_0" "TQ2_0" "Q4_K_M" "Q8_0" "F16" "BF16")
10
+ gputypes=("TQ2_0" "Q4_K_M" "Q8_0" "F16")
11
 
12
  function gather_models() {
13
  echo Gather the models