Spaces:
Build error
Build error
Illumotion
commited on
Commit
·
b439a8f
1
Parent(s):
93fbabe
Upload folder using huggingface_hub
Browse files- .gitignore +1 -0
- Dockerfile +1 -1
- Makefile +8 -0
- Remote-Link.cmd +2 -0
- convert.py +71 -24
- examples/CMakeLists.txt +1 -0
- examples/common.cpp +15 -10
- examples/common.h +2 -1
- examples/embedding/embedding.cpp +4 -2
- examples/main/main.cpp +7 -3
- examples/metal/metal.cpp +4 -2
- examples/perplexity/perplexity.cpp +4 -2
- examples/quantize-stats/quantize-stats.cpp +13 -2
- examples/save-load-state/save-load-state.cpp +25 -4
- examples/server/README.md +11 -2
- examples/server/server.cpp +50 -3
- examples/simple/simple.cpp +5 -3
- examples/train-text-from-scratch/train-text-from-scratch.cpp +4 -1
- expose.h +1 -1
- ggml-cuda.cu +214 -131
- ggml-metal.h +4 -1
- ggml-metal.m +88 -23
- ggml-opencl.cpp +357 -176
- ggml.c +828 -139
- ggml.h +141 -9
- gpttype_adapter.cpp +48 -24
- klite.embd +0 -0
- koboldcpp.py +30 -13
- llama.cpp +171 -95
- llama.h +39 -13
- model_adapter.cpp +1 -1
- otherarch/gpt2_v3.cpp +78 -20
- otherarch/gptj_v3.cpp +68 -13
- otherarch/llama_v2.cpp +2 -2
- otherarch/mpt_v3.cpp +65 -16
- otherarch/neox_v3.cpp +72 -41
- otherarch/otherarch.h +0 -1
- otherarch/utils.cpp +45 -23
- otherarch/utils.h +6 -0
- spm-headers/ggml.h +141 -9
.gitignore
CHANGED
@@ -31,6 +31,7 @@ out/
|
|
31 |
/perplexity
|
32 |
/embedding
|
33 |
/train-text-from-scratch
|
|
|
34 |
/benchmark-matmult
|
35 |
/vdot
|
36 |
/server
|
|
|
31 |
/perplexity
|
32 |
/embedding
|
33 |
/train-text-from-scratch
|
34 |
+
/simple
|
35 |
/benchmark-matmult
|
36 |
/vdot
|
37 |
/server
|
Dockerfile
CHANGED
@@ -3,7 +3,7 @@ WORKDIR /app
|
|
3 |
COPY . .
|
4 |
RUN apt update \
|
5 |
&& apt install build-essential wget libopenblas-dev make -y \
|
6 |
-
&& make \
|
7 |
&& wget https://huggingface.co/Yoshiii/pygmalion-7b-ggml/resolve/main/pygmalion-7b-q5_K_M.bin\
|
8 |
&& apt remove build-essential wget make -y
|
9 |
|
|
|
3 |
COPY . .
|
4 |
RUN apt update \
|
5 |
&& apt install build-essential wget libopenblas-dev make -y \
|
6 |
+
&& make LLAMA_OPENBLAS=1 \
|
7 |
&& wget https://huggingface.co/Yoshiii/pygmalion-7b-ggml/resolve/main/pygmalion-7b-q5_K_M.bin\
|
8 |
&& apt remove build-essential wget make -y
|
9 |
|
Makefile
CHANGED
@@ -149,6 +149,14 @@ ifdef LLAMA_CUDA_DMMV_Y
|
|
149 |
else
|
150 |
NVCCFLAGS += -DGGML_CUDA_DMMV_Y=1
|
151 |
endif # LLAMA_CUDA_DMMV_Y
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
152 |
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
|
153 |
$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) $(CUBLAS_CXXFLAGS) -Wno-pedantic -c $< -o $@
|
154 |
ggml_v2-cuda.o: otherarch/ggml_v2-cuda.cu otherarch/ggml_v2-cuda.h
|
|
|
149 |
else
|
150 |
NVCCFLAGS += -DGGML_CUDA_DMMV_Y=1
|
151 |
endif # LLAMA_CUDA_DMMV_Y
|
152 |
+
ifdef LLAMA_CUDA_DMMV_F16
|
153 |
+
NVCCFLAGS += -DGGML_CUDA_DMMV_F16
|
154 |
+
endif # LLAMA_CUDA_DMMV_F16
|
155 |
+
ifdef LLAMA_CUDA_KQUANTS_ITER
|
156 |
+
NVCCFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER)
|
157 |
+
else
|
158 |
+
NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2
|
159 |
+
endif
|
160 |
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
|
161 |
$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) $(CUBLAS_CXXFLAGS) -Wno-pedantic -c $< -o $@
|
162 |
ggml_v2-cuda.o: otherarch/ggml_v2-cuda.cu otherarch/ggml_v2-cuda.h
|
Remote-Link.cmd
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
curl -L https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-windows-amd64.exe -o cloudflared.exe
|
2 |
+
cloudflared.exe tunnel --url localhost:5001
|
convert.py
CHANGED
@@ -130,6 +130,14 @@ TENSORS_LIST = make_tensors_list()
|
|
130 |
TENSORS_SET = set(TENSORS_LIST)
|
131 |
|
132 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
133 |
@dataclass
|
134 |
class Params:
|
135 |
n_vocab: int
|
@@ -137,21 +145,61 @@ class Params:
|
|
137 |
n_mult: int
|
138 |
n_head: int
|
139 |
n_layer: int
|
140 |
-
file_type: GGMLFileType
|
141 |
|
142 |
@staticmethod
|
143 |
-
def guessed(model: 'LazyModel'
|
144 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
145 |
|
146 |
return Params(
|
147 |
n_vocab=n_vocab,
|
148 |
n_embd=n_embd,
|
149 |
n_mult=256,
|
150 |
-
n_head=
|
151 |
-
n_layer=
|
152 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
153 |
)
|
154 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
155 |
|
156 |
class SentencePieceVocab:
|
157 |
def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path]) -> None:
|
@@ -595,18 +643,17 @@ def permute_lazy(lazy_tensor: LazyTensor, n_head: int) -> LazyTensor:
|
|
595 |
return LazyTensor(load, lazy_tensor.shape, lazy_tensor.data_type, f'permute({n_head}) ' + lazy_tensor.description)
|
596 |
|
597 |
|
598 |
-
def convert_transformers_to_orig(model: LazyModel) -> LazyModel:
|
599 |
out: LazyModel = {}
|
600 |
out["tok_embeddings.weight"] = model["model.embed_tokens.weight"]
|
601 |
out["norm.weight"] = model["model.norm.weight"]
|
602 |
out["output.weight"] = model["lm_head.weight"]
|
603 |
|
604 |
-
n_head = model["model.layers.0.self_attn.q_proj.weight"].shape[1] // 128
|
605 |
for i in itertools.count():
|
606 |
if f"model.layers.{i}.self_attn.q_proj.weight" not in model:
|
607 |
break
|
608 |
-
out[f"layers.{i}.attention.wq.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], n_head)
|
609 |
-
out[f"layers.{i}.attention.wk.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], n_head)
|
610 |
out[f"layers.{i}.attention.wv.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"]
|
611 |
out[f"layers.{i}.attention.wo.weight"] = model[f"model.layers.{i}.self_attn.o_proj.weight"]
|
612 |
|
@@ -920,7 +967,7 @@ class OutputFile:
|
|
920 |
def __init__(self, fname_out: Path) -> None:
|
921 |
self.fout = open(fname_out, "wb")
|
922 |
|
923 |
-
def write_file_header(self, params: Params) -> None:
|
924 |
self.fout.write(b"ggjt"[::-1]) # magic
|
925 |
values = [
|
926 |
1, # file version
|
@@ -930,7 +977,7 @@ class OutputFile:
|
|
930 |
params.n_head,
|
931 |
params.n_layer,
|
932 |
params.n_embd // params.n_head, # rot (obsolete)
|
933 |
-
|
934 |
]
|
935 |
self.fout.write(struct.pack("i" * len(values), *values))
|
936 |
|
@@ -951,17 +998,17 @@ class OutputFile:
|
|
951 |
def write_vocab_only(fname_out: Path, vocab: Vocab) -> None:
|
952 |
of = OutputFile(fname_out)
|
953 |
params = Params(n_vocab=vocab.vocab_size, n_embd=0, n_mult=0,
|
954 |
-
n_head=1, n_layer=0
|
955 |
of = OutputFile(fname_out)
|
956 |
-
of.write_file_header(params)
|
957 |
of.write_vocab(vocab)
|
958 |
of.fout.close()
|
959 |
|
960 |
@staticmethod
|
961 |
-
def write_all(fname_out: Path, params: Params, model: LazyModel, vocab: Vocab) -> None:
|
962 |
check_vocab_size(params, vocab)
|
963 |
of = OutputFile(fname_out)
|
964 |
-
of.write_file_header(params)
|
965 |
print("Writing vocab...")
|
966 |
of.write_vocab(vocab)
|
967 |
|
@@ -997,11 +1044,11 @@ def pick_output_type(model: LazyModel, output_type_str: Optional[str]) -> GGMLFi
|
|
997 |
raise Exception(f"Unexpected combination of types: {name_to_type}")
|
998 |
|
999 |
|
1000 |
-
def do_necessary_conversions(model: LazyModel) -> LazyModel:
|
1001 |
model = handle_quantization(model)
|
1002 |
|
1003 |
if "lm_head.weight" in model:
|
1004 |
-
model = convert_transformers_to_orig(model)
|
1005 |
model = filter_and_sort_tensors(model)
|
1006 |
|
1007 |
return model
|
@@ -1107,14 +1154,14 @@ def load_vocab(path: Path) -> SentencePieceVocab:
|
|
1107 |
return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None)
|
1108 |
|
1109 |
|
1110 |
-
def default_outfile(model_paths: List[Path],
|
1111 |
namestr = {
|
1112 |
GGMLFileType.AllF32: "f32",
|
1113 |
GGMLFileType.MostlyF16: "f16",
|
1114 |
GGMLFileType.MostlyQ4_0: "q4_0",
|
1115 |
GGMLFileType.MostlyQ4_1: "q4_1",
|
1116 |
GGMLFileType.PerLayerIsQ4_1: "q4_1",
|
1117 |
-
}[
|
1118 |
ret = model_paths[0].parent / f"ggml-model-{namestr}.bin"
|
1119 |
if ret in model_paths:
|
1120 |
sys.stderr.write(
|
@@ -1164,13 +1211,13 @@ def main(args_in: Optional[List[str]] = None) -> None:
|
|
1164 |
else:
|
1165 |
vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[0].parent
|
1166 |
vocab = load_vocab(vocab_dir)
|
|
|
1167 |
model = model_plus.model
|
1168 |
-
model = do_necessary_conversions(model)
|
1169 |
output_type = pick_output_type(model, args.outtype)
|
1170 |
model = convert_to_output_type(model, output_type)
|
1171 |
-
|
1172 |
-
|
1173 |
-
OutputFile.write_all(outfile, params, model, vocab)
|
1174 |
print(f"Wrote {outfile}")
|
1175 |
|
1176 |
|
|
|
130 |
TENSORS_SET = set(TENSORS_LIST)
|
131 |
|
132 |
|
133 |
+
def find_n_mult(n_ff: int, n_embd: int) -> int:
|
134 |
+
# hardcoded magic range
|
135 |
+
for n_mult in range(256, 1, -1):
|
136 |
+
calc_ff = (((8*n_embd) // 3 + n_mult - 1) // n_mult)*n_mult
|
137 |
+
if calc_ff == n_ff:
|
138 |
+
return n_mult
|
139 |
+
return 1
|
140 |
+
|
141 |
@dataclass
|
142 |
class Params:
|
143 |
n_vocab: int
|
|
|
145 |
n_mult: int
|
146 |
n_head: int
|
147 |
n_layer: int
|
|
|
148 |
|
149 |
@staticmethod
|
150 |
+
def guessed(model: 'LazyModel') -> 'Params':
|
151 |
+
# try transformer naming first
|
152 |
+
n_vocab, n_embd = model["model.embed_tokens.weight"].shape if "model.embed_tokens.weight" in model else model["tok_embeddings.weight"].shape
|
153 |
+
|
154 |
+
# try transformer naming first
|
155 |
+
if "model.layers.0.self_attn.q_proj.weight" in model:
|
156 |
+
n_layer=next(i for i in itertools.count() if f"model.layers.{i}.self_attn.q_proj.weight" not in model)
|
157 |
+
else:
|
158 |
+
n_layer=next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model)
|
159 |
+
|
160 |
+
n_head=n_embd // 128 # guessed
|
161 |
|
162 |
return Params(
|
163 |
n_vocab=n_vocab,
|
164 |
n_embd=n_embd,
|
165 |
n_mult=256,
|
166 |
+
n_head=n_head,
|
167 |
+
n_layer=n_layer,
|
168 |
+
)
|
169 |
+
|
170 |
+
@staticmethod
|
171 |
+
def loadHFTransformerJson(model: 'LazyModel', config_path: 'Path') -> 'Params':
|
172 |
+
config = json.load(open(config_path))
|
173 |
+
|
174 |
+
n_vocab = config["vocab_size"];
|
175 |
+
n_embd = config["hidden_size"];
|
176 |
+
n_head = config["num_attention_heads"];
|
177 |
+
n_layer = config["num_hidden_layers"];
|
178 |
+
n_ff = config["intermediate_size"];
|
179 |
+
|
180 |
+
n_mult = find_n_mult(n_ff, n_embd);
|
181 |
+
|
182 |
+
return Params(
|
183 |
+
n_vocab=n_vocab,
|
184 |
+
n_embd=n_embd,
|
185 |
+
n_mult=n_mult,
|
186 |
+
n_head=n_head,
|
187 |
+
n_layer=n_layer,
|
188 |
)
|
189 |
|
190 |
+
@staticmethod
|
191 |
+
def load(model_plus: 'ModelPlus') -> 'Params':
|
192 |
+
orig_config_path = model_plus.paths[0].parent / "params.json"
|
193 |
+
hf_transformer_config_path = model_plus.paths[0].parent / "config.json"
|
194 |
+
|
195 |
+
if hf_transformer_config_path.exists():
|
196 |
+
params = Params.loadHFTransformerJson(model_plus.model, hf_transformer_config_path)
|
197 |
+
else:
|
198 |
+
params = Params.guessed(model_plus.model)
|
199 |
+
|
200 |
+
print(f'params: n_vocab:{params.n_vocab} n_embd:{params.n_embd} n_mult:{params.n_mult} n_head:{params.n_head} n_layer:{params.n_layer}')
|
201 |
+
return params
|
202 |
+
|
203 |
|
204 |
class SentencePieceVocab:
|
205 |
def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path]) -> None:
|
|
|
643 |
return LazyTensor(load, lazy_tensor.shape, lazy_tensor.data_type, f'permute({n_head}) ' + lazy_tensor.description)
|
644 |
|
645 |
|
646 |
+
def convert_transformers_to_orig(model: LazyModel, params: Params) -> LazyModel:
|
647 |
out: LazyModel = {}
|
648 |
out["tok_embeddings.weight"] = model["model.embed_tokens.weight"]
|
649 |
out["norm.weight"] = model["model.norm.weight"]
|
650 |
out["output.weight"] = model["lm_head.weight"]
|
651 |
|
|
|
652 |
for i in itertools.count():
|
653 |
if f"model.layers.{i}.self_attn.q_proj.weight" not in model:
|
654 |
break
|
655 |
+
out[f"layers.{i}.attention.wq.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head)
|
656 |
+
out[f"layers.{i}.attention.wk.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head)
|
657 |
out[f"layers.{i}.attention.wv.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"]
|
658 |
out[f"layers.{i}.attention.wo.weight"] = model[f"model.layers.{i}.self_attn.o_proj.weight"]
|
659 |
|
|
|
967 |
def __init__(self, fname_out: Path) -> None:
|
968 |
self.fout = open(fname_out, "wb")
|
969 |
|
970 |
+
def write_file_header(self, params: Params, file_type: GGMLFileType) -> None:
|
971 |
self.fout.write(b"ggjt"[::-1]) # magic
|
972 |
values = [
|
973 |
1, # file version
|
|
|
977 |
params.n_head,
|
978 |
params.n_layer,
|
979 |
params.n_embd // params.n_head, # rot (obsolete)
|
980 |
+
file_type.value,
|
981 |
]
|
982 |
self.fout.write(struct.pack("i" * len(values), *values))
|
983 |
|
|
|
998 |
def write_vocab_only(fname_out: Path, vocab: Vocab) -> None:
|
999 |
of = OutputFile(fname_out)
|
1000 |
params = Params(n_vocab=vocab.vocab_size, n_embd=0, n_mult=0,
|
1001 |
+
n_head=1, n_layer=0)
|
1002 |
of = OutputFile(fname_out)
|
1003 |
+
of.write_file_header(params, file_type=GGMLFileType.AllF32)
|
1004 |
of.write_vocab(vocab)
|
1005 |
of.fout.close()
|
1006 |
|
1007 |
@staticmethod
|
1008 |
+
def write_all(fname_out: Path, params: Params, file_type: GGMLFileType, model: LazyModel, vocab: Vocab) -> None:
|
1009 |
check_vocab_size(params, vocab)
|
1010 |
of = OutputFile(fname_out)
|
1011 |
+
of.write_file_header(params, file_type)
|
1012 |
print("Writing vocab...")
|
1013 |
of.write_vocab(vocab)
|
1014 |
|
|
|
1044 |
raise Exception(f"Unexpected combination of types: {name_to_type}")
|
1045 |
|
1046 |
|
1047 |
+
def do_necessary_conversions(model: LazyModel, params: Params) -> LazyModel:
|
1048 |
model = handle_quantization(model)
|
1049 |
|
1050 |
if "lm_head.weight" in model:
|
1051 |
+
model = convert_transformers_to_orig(model, params)
|
1052 |
model = filter_and_sort_tensors(model)
|
1053 |
|
1054 |
return model
|
|
|
1154 |
return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None)
|
1155 |
|
1156 |
|
1157 |
+
def default_outfile(model_paths: List[Path], file_type: GGMLFileType) -> Path:
|
1158 |
namestr = {
|
1159 |
GGMLFileType.AllF32: "f32",
|
1160 |
GGMLFileType.MostlyF16: "f16",
|
1161 |
GGMLFileType.MostlyQ4_0: "q4_0",
|
1162 |
GGMLFileType.MostlyQ4_1: "q4_1",
|
1163 |
GGMLFileType.PerLayerIsQ4_1: "q4_1",
|
1164 |
+
}[file_type]
|
1165 |
ret = model_paths[0].parent / f"ggml-model-{namestr}.bin"
|
1166 |
if ret in model_paths:
|
1167 |
sys.stderr.write(
|
|
|
1211 |
else:
|
1212 |
vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[0].parent
|
1213 |
vocab = load_vocab(vocab_dir)
|
1214 |
+
params = Params.load(model_plus)
|
1215 |
model = model_plus.model
|
1216 |
+
model = do_necessary_conversions(model, params)
|
1217 |
output_type = pick_output_type(model, args.outtype)
|
1218 |
model = convert_to_output_type(model, output_type)
|
1219 |
+
outfile = args.outfile or default_outfile(model_plus.paths, output_type)
|
1220 |
+
OutputFile.write_all(outfile, params, output_type, model, vocab)
|
|
|
1221 |
print(f"Wrote {outfile}")
|
1222 |
|
1223 |
|
examples/CMakeLists.txt
CHANGED
@@ -38,6 +38,7 @@ else()
|
|
38 |
add_subdirectory(benchmark)
|
39 |
add_subdirectory(baby-llama)
|
40 |
add_subdirectory(train-text-from-scratch)
|
|
|
41 |
if (LLAMA_METAL)
|
42 |
add_subdirectory(metal)
|
43 |
endif()
|
|
|
38 |
add_subdirectory(benchmark)
|
39 |
add_subdirectory(baby-llama)
|
40 |
add_subdirectory(train-text-from-scratch)
|
41 |
+
add_subdirectory(simple)
|
42 |
if (LLAMA_METAL)
|
43 |
add_subdirectory(metal)
|
44 |
endif()
|
examples/common.cpp
CHANGED
@@ -106,9 +106,6 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
|
106 |
}
|
107 |
|
108 |
if (arg == "-s" || arg == "--seed") {
|
109 |
-
#if defined(GGML_USE_CUBLAS)
|
110 |
-
fprintf(stderr, "WARNING: when using cuBLAS generation results are NOT guaranteed to be reproducible.\n");
|
111 |
-
#endif
|
112 |
if (++i >= argc) {
|
113 |
invalid_param = true;
|
114 |
break;
|
@@ -539,7 +536,7 @@ std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::s
|
|
539 |
return res;
|
540 |
}
|
541 |
|
542 |
-
struct llama_context
|
543 |
auto lparams = llama_context_default_params();
|
544 |
|
545 |
lparams.n_ctx = params.n_ctx;
|
@@ -555,25 +552,33 @@ struct llama_context * llama_init_from_gpt_params(const gpt_params & params) {
|
|
555 |
lparams.logits_all = params.perplexity;
|
556 |
lparams.embedding = params.embedding;
|
557 |
|
558 |
-
|
|
|
|
|
|
|
|
|
559 |
|
|
|
560 |
if (lctx == NULL) {
|
561 |
-
fprintf(stderr, "%s: error: failed to
|
562 |
-
|
|
|
563 |
}
|
564 |
|
565 |
if (!params.lora_adapter.empty()) {
|
566 |
-
int err =
|
567 |
params.lora_adapter.c_str(),
|
568 |
params.lora_base.empty() ? NULL : params.lora_base.c_str(),
|
569 |
params.n_threads);
|
570 |
if (err != 0) {
|
571 |
fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
|
572 |
-
|
|
|
|
|
573 |
}
|
574 |
}
|
575 |
|
576 |
-
return lctx;
|
577 |
}
|
578 |
|
579 |
void console_init(console_state & con_st) {
|
|
|
106 |
}
|
107 |
|
108 |
if (arg == "-s" || arg == "--seed") {
|
|
|
|
|
|
|
109 |
if (++i >= argc) {
|
110 |
invalid_param = true;
|
111 |
break;
|
|
|
536 |
return res;
|
537 |
}
|
538 |
|
539 |
+
std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(const gpt_params & params) {
|
540 |
auto lparams = llama_context_default_params();
|
541 |
|
542 |
lparams.n_ctx = params.n_ctx;
|
|
|
552 |
lparams.logits_all = params.perplexity;
|
553 |
lparams.embedding = params.embedding;
|
554 |
|
555 |
+
llama_model * model = llama_load_model_from_file(params.model.c_str(), lparams);
|
556 |
+
if (model == NULL) {
|
557 |
+
fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
|
558 |
+
return std::make_tuple(nullptr, nullptr);
|
559 |
+
}
|
560 |
|
561 |
+
llama_context * lctx = llama_new_context_with_model(model, lparams);
|
562 |
if (lctx == NULL) {
|
563 |
+
fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str());
|
564 |
+
llama_free_model(model);
|
565 |
+
return std::make_tuple(nullptr, nullptr);
|
566 |
}
|
567 |
|
568 |
if (!params.lora_adapter.empty()) {
|
569 |
+
int err = llama_model_apply_lora_from_file(model,
|
570 |
params.lora_adapter.c_str(),
|
571 |
params.lora_base.empty() ? NULL : params.lora_base.c_str(),
|
572 |
params.n_threads);
|
573 |
if (err != 0) {
|
574 |
fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
|
575 |
+
llama_free(lctx);
|
576 |
+
llama_free_model(model);
|
577 |
+
return std::make_tuple(nullptr, nullptr);
|
578 |
}
|
579 |
}
|
580 |
|
581 |
+
return std::make_tuple(model, lctx);
|
582 |
}
|
583 |
|
584 |
void console_init(console_state & con_st) {
|
examples/common.h
CHANGED
@@ -9,6 +9,7 @@
|
|
9 |
#include <random>
|
10 |
#include <thread>
|
11 |
#include <unordered_map>
|
|
|
12 |
|
13 |
#if !defined (_WIN32)
|
14 |
#include <stdio.h>
|
@@ -95,7 +96,7 @@ std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::s
|
|
95 |
// Model utils
|
96 |
//
|
97 |
|
98 |
-
struct llama_context
|
99 |
|
100 |
//
|
101 |
// Console utils
|
|
|
9 |
#include <random>
|
10 |
#include <thread>
|
11 |
#include <unordered_map>
|
12 |
+
#include <tuple>
|
13 |
|
14 |
#if !defined (_WIN32)
|
15 |
#include <stdio.h>
|
|
|
96 |
// Model utils
|
97 |
//
|
98 |
|
99 |
+
std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(const gpt_params & params);
|
100 |
|
101 |
//
|
102 |
// Console utils
|
examples/embedding/embedding.cpp
CHANGED
@@ -37,11 +37,12 @@ int main(int argc, char ** argv) {
|
|
37 |
|
38 |
llama_init_backend();
|
39 |
|
|
|
40 |
llama_context * ctx;
|
41 |
|
42 |
// load the model
|
43 |
-
ctx = llama_init_from_gpt_params(params);
|
44 |
-
if (
|
45 |
fprintf(stderr, "%s: error: unable to load model\n", __func__);
|
46 |
return 1;
|
47 |
}
|
@@ -90,6 +91,7 @@ int main(int argc, char ** argv) {
|
|
90 |
|
91 |
llama_print_timings(ctx);
|
92 |
llama_free(ctx);
|
|
|
93 |
|
94 |
return 0;
|
95 |
}
|
|
|
37 |
|
38 |
llama_init_backend();
|
39 |
|
40 |
+
llama_model * model;
|
41 |
llama_context * ctx;
|
42 |
|
43 |
// load the model
|
44 |
+
std::tie(model, ctx) = llama_init_from_gpt_params(params);
|
45 |
+
if (model == NULL) {
|
46 |
fprintf(stderr, "%s: error: unable to load model\n", __func__);
|
47 |
return 1;
|
48 |
}
|
|
|
91 |
|
92 |
llama_print_timings(ctx);
|
93 |
llama_free(ctx);
|
94 |
+
llama_free_model(model);
|
95 |
|
96 |
return 0;
|
97 |
}
|
examples/main/main.cpp
CHANGED
@@ -107,12 +107,13 @@ int main(int argc, char ** argv) {
|
|
107 |
|
108 |
llama_init_backend();
|
109 |
|
|
|
110 |
llama_context * ctx;
|
111 |
g_ctx = &ctx;
|
112 |
|
113 |
// load the model and apply lora adapter, if any
|
114 |
-
ctx = llama_init_from_gpt_params(params);
|
115 |
-
if (
|
116 |
fprintf(stderr, "%s: error: unable to load model\n", __func__);
|
117 |
return 1;
|
118 |
}
|
@@ -139,6 +140,7 @@ int main(int argc, char ** argv) {
|
|
139 |
|
140 |
llama_print_timings(ctx);
|
141 |
llama_free(ctx);
|
|
|
142 |
|
143 |
return 0;
|
144 |
}
|
@@ -147,6 +149,7 @@ int main(int argc, char ** argv) {
|
|
147 |
if (params.export_cgraph) {
|
148 |
llama_eval_export(ctx, "llama.ggml");
|
149 |
llama_free(ctx);
|
|
|
150 |
|
151 |
return 0;
|
152 |
}
|
@@ -354,7 +357,7 @@ int main(int argc, char ** argv) {
|
|
354 |
if ((int)embd.size() > max_embd_size) {
|
355 |
auto skipped_tokens = embd.size() - max_embd_size;
|
356 |
console_set_color(con_st, CONSOLE_COLOR_ERROR);
|
357 |
-
printf("<<input too long: skipped %
|
358 |
console_set_color(con_st, CONSOLE_COLOR_DEFAULT);
|
359 |
fflush(stdout);
|
360 |
embd.resize(max_embd_size);
|
@@ -666,6 +669,7 @@ int main(int argc, char ** argv) {
|
|
666 |
|
667 |
llama_print_timings(ctx);
|
668 |
llama_free(ctx);
|
|
|
669 |
|
670 |
return 0;
|
671 |
}
|
|
|
107 |
|
108 |
llama_init_backend();
|
109 |
|
110 |
+
llama_model * model;
|
111 |
llama_context * ctx;
|
112 |
g_ctx = &ctx;
|
113 |
|
114 |
// load the model and apply lora adapter, if any
|
115 |
+
std::tie(model, ctx) = llama_init_from_gpt_params(params);
|
116 |
+
if (model == NULL) {
|
117 |
fprintf(stderr, "%s: error: unable to load model\n", __func__);
|
118 |
return 1;
|
119 |
}
|
|
|
140 |
|
141 |
llama_print_timings(ctx);
|
142 |
llama_free(ctx);
|
143 |
+
llama_free_model(model);
|
144 |
|
145 |
return 0;
|
146 |
}
|
|
|
149 |
if (params.export_cgraph) {
|
150 |
llama_eval_export(ctx, "llama.ggml");
|
151 |
llama_free(ctx);
|
152 |
+
llama_free_model(model);
|
153 |
|
154 |
return 0;
|
155 |
}
|
|
|
357 |
if ((int)embd.size() > max_embd_size) {
|
358 |
auto skipped_tokens = embd.size() - max_embd_size;
|
359 |
console_set_color(con_st, CONSOLE_COLOR_ERROR);
|
360 |
+
printf("<<input too long: skipped %zu token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
|
361 |
console_set_color(con_st, CONSOLE_COLOR_DEFAULT);
|
362 |
fflush(stdout);
|
363 |
embd.resize(max_embd_size);
|
|
|
669 |
|
670 |
llama_print_timings(ctx);
|
671 |
llama_free(ctx);
|
672 |
+
llama_free_model(model);
|
673 |
|
674 |
return 0;
|
675 |
}
|
examples/metal/metal.cpp
CHANGED
@@ -40,8 +40,10 @@ int main(int argc, char ** argv) {
|
|
40 |
// this allocates all Metal resources and memory buffers
|
41 |
auto * ctx_metal = ggml_metal_init();
|
42 |
|
43 |
-
|
44 |
-
|
|
|
|
|
45 |
|
46 |
// main
|
47 |
{
|
|
|
40 |
// this allocates all Metal resources and memory buffers
|
41 |
auto * ctx_metal = ggml_metal_init();
|
42 |
|
43 |
+
const size_t max_size_data = ggml_get_max_tensor_size(ctx_data);
|
44 |
+
const size_t max_size_eval = ggml_get_max_tensor_size(ctx_eval);
|
45 |
+
ggml_metal_add_buffer(ctx_metal, "data", ggml_get_mem_buffer(ctx_data), ggml_get_mem_size(ctx_data), max_size_data);
|
46 |
+
ggml_metal_add_buffer(ctx_metal, "eval", ggml_get_mem_buffer(ctx_eval), ggml_get_mem_size(ctx_eval), max_size_eval);
|
47 |
|
48 |
// main
|
49 |
{
|
examples/perplexity/perplexity.cpp
CHANGED
@@ -149,11 +149,12 @@ int main(int argc, char ** argv) {
|
|
149 |
|
150 |
llama_init_backend();
|
151 |
|
|
|
152 |
llama_context * ctx;
|
153 |
|
154 |
// load the model and apply lora adapter, if any
|
155 |
-
ctx = llama_init_from_gpt_params(params);
|
156 |
-
if (
|
157 |
fprintf(stderr, "%s: error: unable to load model\n", __func__);
|
158 |
return 1;
|
159 |
}
|
@@ -169,6 +170,7 @@ int main(int argc, char ** argv) {
|
|
169 |
|
170 |
llama_print_timings(ctx);
|
171 |
llama_free(ctx);
|
|
|
172 |
|
173 |
return 0;
|
174 |
}
|
|
|
149 |
|
150 |
llama_init_backend();
|
151 |
|
152 |
+
llama_model * model;
|
153 |
llama_context * ctx;
|
154 |
|
155 |
// load the model and apply lora adapter, if any
|
156 |
+
std::tie(model, ctx) = llama_init_from_gpt_params(params);
|
157 |
+
if (model == NULL) {
|
158 |
fprintf(stderr, "%s: error: unable to load model\n", __func__);
|
159 |
return 1;
|
160 |
}
|
|
|
170 |
|
171 |
llama_print_timings(ctx);
|
172 |
llama_free(ctx);
|
173 |
+
llama_free_model(model);
|
174 |
|
175 |
return 0;
|
176 |
}
|
examples/quantize-stats/quantize-stats.cpp
CHANGED
@@ -320,6 +320,7 @@ int main(int argc, char ** argv) {
|
|
320 |
fprintf(stderr, "Loading model\n");
|
321 |
|
322 |
const int64_t t_main_start_us = ggml_time_us();
|
|
|
323 |
llama_context * ctx;
|
324 |
|
325 |
{
|
@@ -330,12 +331,20 @@ int main(int argc, char ** argv) {
|
|
330 |
lparams.f16_kv = false;
|
331 |
lparams.use_mlock = false;
|
332 |
|
333 |
-
|
334 |
|
335 |
-
if (
|
336 |
fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
|
337 |
return 1;
|
338 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
339 |
}
|
340 |
|
341 |
const auto &tensors = llama_internal_get_tensor_map(ctx);
|
@@ -357,6 +366,7 @@ int main(int argc, char ** argv) {
|
|
357 |
fprintf(stderr, "%s: error: Quantization should be tested with a float model, "
|
358 |
"this model contains already quantized layers (%s is type %d)\n", __func__, kv_tensor.first.c_str(), kv_tensor.second->type);
|
359 |
llama_free(ctx);
|
|
|
360 |
return 1;
|
361 |
}
|
362 |
included_layers++;
|
@@ -415,6 +425,7 @@ int main(int argc, char ** argv) {
|
|
415 |
|
416 |
|
417 |
llama_free(ctx);
|
|
|
418 |
// report timing
|
419 |
{
|
420 |
const int64_t t_main_end_us = ggml_time_us();
|
|
|
320 |
fprintf(stderr, "Loading model\n");
|
321 |
|
322 |
const int64_t t_main_start_us = ggml_time_us();
|
323 |
+
llama_model * model;
|
324 |
llama_context * ctx;
|
325 |
|
326 |
{
|
|
|
331 |
lparams.f16_kv = false;
|
332 |
lparams.use_mlock = false;
|
333 |
|
334 |
+
model = llama_load_model_from_file(params.model.c_str(), lparams);
|
335 |
|
336 |
+
if (model == NULL) {
|
337 |
fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
|
338 |
return 1;
|
339 |
}
|
340 |
+
|
341 |
+
ctx = llama_new_context_with_model(model, lparams);
|
342 |
+
|
343 |
+
if (ctx == NULL) {
|
344 |
+
fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str());
|
345 |
+
llama_free_model(model);
|
346 |
+
return 1;
|
347 |
+
}
|
348 |
}
|
349 |
|
350 |
const auto &tensors = llama_internal_get_tensor_map(ctx);
|
|
|
366 |
fprintf(stderr, "%s: error: Quantization should be tested with a float model, "
|
367 |
"this model contains already quantized layers (%s is type %d)\n", __func__, kv_tensor.first.c_str(), kv_tensor.second->type);
|
368 |
llama_free(ctx);
|
369 |
+
llama_free_model(model);
|
370 |
return 1;
|
371 |
}
|
372 |
included_layers++;
|
|
|
425 |
|
426 |
|
427 |
llama_free(ctx);
|
428 |
+
llama_free_model(model);
|
429 |
// report timing
|
430 |
{
|
431 |
const int64_t t_main_end_us = ggml_time_us();
|
examples/save-load-state/save-load-state.cpp
CHANGED
@@ -35,12 +35,22 @@ int main(int argc, char ** argv) {
|
|
35 |
auto last_n_tokens_data = std::vector<llama_token>(params.repeat_last_n, 0);
|
36 |
|
37 |
// init
|
38 |
-
auto
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
39 |
auto tokens = std::vector<llama_token>(params.n_ctx);
|
40 |
auto n_prompt_tokens = llama_tokenize(ctx, params.prompt.c_str(), tokens.data(), int(tokens.size()), true);
|
41 |
|
42 |
if (n_prompt_tokens < 1) {
|
43 |
fprintf(stderr, "%s : failed to tokenize prompt\n", __func__);
|
|
|
|
|
44 |
return 1;
|
45 |
}
|
46 |
|
@@ -84,6 +94,8 @@ int main(int argc, char ** argv) {
|
|
84 |
printf("%s", next_token_str);
|
85 |
if (llama_eval(ctx, &next_token, 1, n_past, params.n_threads)) {
|
86 |
fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
|
|
|
|
|
87 |
return 1;
|
88 |
}
|
89 |
n_past += 1;
|
@@ -91,23 +103,27 @@ int main(int argc, char ** argv) {
|
|
91 |
|
92 |
printf("\n\n");
|
93 |
|
94 |
-
// free old
|
95 |
llama_free(ctx);
|
96 |
|
97 |
-
//
|
98 |
-
auto ctx2 =
|
99 |
|
100 |
// Load state (rng, logits, embedding and kv_cache) from file
|
101 |
{
|
102 |
FILE *fp_read = fopen("dump_state.bin", "rb");
|
103 |
if (state_size != llama_get_state_size(ctx2)) {
|
104 |
fprintf(stderr, "\n%s : failed to validate state size\n", __func__);
|
|
|
|
|
105 |
return 1;
|
106 |
}
|
107 |
|
108 |
const size_t ret = fread(state_mem, 1, state_size, fp_read);
|
109 |
if (ret != state_size) {
|
110 |
fprintf(stderr, "\n%s : failed to read state\n", __func__);
|
|
|
|
|
111 |
return 1;
|
112 |
}
|
113 |
|
@@ -138,6 +154,8 @@ int main(int argc, char ** argv) {
|
|
138 |
printf("%s", next_token_str);
|
139 |
if (llama_eval(ctx2, &next_token, 1, n_past, params.n_threads)) {
|
140 |
fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
|
|
|
|
|
141 |
return 1;
|
142 |
}
|
143 |
n_past += 1;
|
@@ -145,5 +163,8 @@ int main(int argc, char ** argv) {
|
|
145 |
|
146 |
printf("\n\n");
|
147 |
|
|
|
|
|
|
|
148 |
return 0;
|
149 |
}
|
|
|
35 |
auto last_n_tokens_data = std::vector<llama_token>(params.repeat_last_n, 0);
|
36 |
|
37 |
// init
|
38 |
+
auto model = llama_load_model_from_file(params.model.c_str(), lparams);
|
39 |
+
if (model == nullptr) {
|
40 |
+
return 1;
|
41 |
+
}
|
42 |
+
auto ctx = llama_new_context_with_model(model, lparams);
|
43 |
+
if (ctx == nullptr) {
|
44 |
+
llama_free_model(model);
|
45 |
+
return 1;
|
46 |
+
}
|
47 |
auto tokens = std::vector<llama_token>(params.n_ctx);
|
48 |
auto n_prompt_tokens = llama_tokenize(ctx, params.prompt.c_str(), tokens.data(), int(tokens.size()), true);
|
49 |
|
50 |
if (n_prompt_tokens < 1) {
|
51 |
fprintf(stderr, "%s : failed to tokenize prompt\n", __func__);
|
52 |
+
llama_free(ctx);
|
53 |
+
llama_free_model(model);
|
54 |
return 1;
|
55 |
}
|
56 |
|
|
|
94 |
printf("%s", next_token_str);
|
95 |
if (llama_eval(ctx, &next_token, 1, n_past, params.n_threads)) {
|
96 |
fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
|
97 |
+
llama_free(ctx);
|
98 |
+
llama_free_model(model);
|
99 |
return 1;
|
100 |
}
|
101 |
n_past += 1;
|
|
|
103 |
|
104 |
printf("\n\n");
|
105 |
|
106 |
+
// free old context
|
107 |
llama_free(ctx);
|
108 |
|
109 |
+
// make new context
|
110 |
+
auto ctx2 = llama_new_context_with_model(model, lparams);
|
111 |
|
112 |
// Load state (rng, logits, embedding and kv_cache) from file
|
113 |
{
|
114 |
FILE *fp_read = fopen("dump_state.bin", "rb");
|
115 |
if (state_size != llama_get_state_size(ctx2)) {
|
116 |
fprintf(stderr, "\n%s : failed to validate state size\n", __func__);
|
117 |
+
llama_free(ctx2);
|
118 |
+
llama_free_model(model);
|
119 |
return 1;
|
120 |
}
|
121 |
|
122 |
const size_t ret = fread(state_mem, 1, state_size, fp_read);
|
123 |
if (ret != state_size) {
|
124 |
fprintf(stderr, "\n%s : failed to read state\n", __func__);
|
125 |
+
llama_free(ctx2);
|
126 |
+
llama_free_model(model);
|
127 |
return 1;
|
128 |
}
|
129 |
|
|
|
154 |
printf("%s", next_token_str);
|
155 |
if (llama_eval(ctx2, &next_token, 1, n_past, params.n_threads)) {
|
156 |
fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
|
157 |
+
llama_free(ctx2);
|
158 |
+
llama_free_model(model);
|
159 |
return 1;
|
160 |
}
|
161 |
n_past += 1;
|
|
|
163 |
|
164 |
printf("\n\n");
|
165 |
|
166 |
+
llama_free(ctx2);
|
167 |
+
llama_free_model(model);
|
168 |
+
|
169 |
return 0;
|
170 |
}
|
examples/server/README.md
CHANGED
@@ -21,6 +21,7 @@ Command line options:
|
|
21 |
- `-to N`, `--timeout N`: Server read/write timeout in seconds. Default `600`.
|
22 |
- `--host`: Set the hostname or ip address to listen. Default `127.0.0.1`.
|
23 |
- `--port`: Set the port to listen. Default: `8080`.
|
|
|
24 |
|
25 |
## Build
|
26 |
|
@@ -119,14 +120,14 @@ node .
|
|
119 |
|
120 |
`top_p`: Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P (default: 0.9).
|
121 |
|
122 |
-
`n_predict`: Set the number of tokens to predict when generating text. **Note:** May exceed the set limit slightly if the last token is a partial multibyte character.
|
123 |
|
124 |
`n_keep`: Specify the number of tokens from the initial prompt to retain when the model resets its internal context.
|
125 |
By default, this value is set to 0 (meaning no tokens are kept). Use `-1` to retain all tokens from the initial prompt.
|
126 |
|
127 |
`stream`: It allows receiving each predicted token in real-time instead of waiting for the completion to finish. To enable this, set to `true`.
|
128 |
|
129 |
-
`prompt`: Provide a prompt. Internally, the prompt is compared, and it detects if a part has already been evaluated, and the remaining part will be evaluate.
|
130 |
|
131 |
`stop`: Specify a JSON array of stopping strings.
|
132 |
These words will not be included in the completion, so make sure to add them to the prompt for the next iteration (default: []).
|
@@ -163,6 +164,14 @@ node .
|
|
163 |
|
164 |
`content`: Set the text to tokenize.
|
165 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
166 |
## More examples
|
167 |
|
168 |
### Interactive mode
|
|
|
21 |
- `-to N`, `--timeout N`: Server read/write timeout in seconds. Default `600`.
|
22 |
- `--host`: Set the hostname or ip address to listen. Default `127.0.0.1`.
|
23 |
- `--port`: Set the port to listen. Default: `8080`.
|
24 |
+
- `--embedding`: Enable embedding extraction. Default: disabled.
|
25 |
|
26 |
## Build
|
27 |
|
|
|
120 |
|
121 |
`top_p`: Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P (default: 0.9).
|
122 |
|
123 |
+
`n_predict`: Set the number of tokens to predict when generating text. **Note:** May exceed the set limit slightly if the last token is a partial multibyte character. When 0, no tokens will be generated but the prompt is evaluated into the cache. (default: 128, -1 = infinity).
|
124 |
|
125 |
`n_keep`: Specify the number of tokens from the initial prompt to retain when the model resets its internal context.
|
126 |
By default, this value is set to 0 (meaning no tokens are kept). Use `-1` to retain all tokens from the initial prompt.
|
127 |
|
128 |
`stream`: It allows receiving each predicted token in real-time instead of waiting for the completion to finish. To enable this, set to `true`.
|
129 |
|
130 |
+
`prompt`: Provide a prompt. Internally, the prompt is compared against the previous one to detect whether a part has already been evaluated, and only the remaining part will be evaluated. A space is inserted at the front, like main.cpp does.
|
131 |
|
132 |
`stop`: Specify a JSON array of stopping strings.
|
133 |
These words will not be included in the completion, so make sure to add them to the prompt for the next iteration (default: []).
|
|
|
164 |
|
165 |
`content`: Set the text to tokenize.
|
166 |
|
167 |
+
Note that the special `BOS` token is not added in front of the text and also a space character is not inserted automatically as it is for `/completion`.
|
168 |
+
|
169 |
+
- **POST** `/embedding`: Generate embedding of a given text just as [the embedding example](../embedding) does.
|
170 |
+
|
171 |
+
*Options:*
|
172 |
+
|
173 |
+
`content`: Set the text to process.
|
174 |
+
|
175 |
## More examples
|
176 |
|
177 |
### Interactive mode
|
examples/server/server.cpp
CHANGED
@@ -115,6 +115,7 @@ struct llama_server_context {
|
|
115 |
std::vector<llama_token> embd;
|
116 |
std::vector<llama_token> last_n_tokens;
|
117 |
|
|
|
118 |
llama_context * ctx = nullptr;
|
119 |
gpt_params params;
|
120 |
|
@@ -130,6 +131,10 @@ struct llama_server_context {
|
|
130 |
llama_free(ctx);
|
131 |
ctx = nullptr;
|
132 |
}
|
|
|
|
|
|
|
|
|
133 |
}
|
134 |
|
135 |
void rewind() {
|
@@ -150,8 +155,8 @@ struct llama_server_context {
|
|
150 |
|
151 |
bool loadModel(const gpt_params & params_) {
|
152 |
params = params_;
|
153 |
-
ctx = llama_init_from_gpt_params(params);
|
154 |
-
if (
|
155 |
LOG_ERROR("unable to load model", { { "model", params_.model } });
|
156 |
return false;
|
157 |
}
|
@@ -254,6 +259,11 @@ struct llama_server_context {
|
|
254 |
n_past += n_eval;
|
255 |
}
|
256 |
|
|
|
|
|
|
|
|
|
|
|
257 |
// out of user input, sample next token
|
258 |
const float temp = params.temp;
|
259 |
const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k;
|
@@ -419,6 +429,19 @@ struct llama_server_context {
|
|
419 |
|
420 |
return token_text;
|
421 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
422 |
};
|
423 |
|
424 |
static void server_print_usage(const char * argv0, const gpt_params & params,
|
@@ -457,6 +480,7 @@ static void server_print_usage(const char * argv0, const gpt_params & params,
|
|
457 |
fprintf(stderr, " --host ip address to listen (default (default: %s)\n", sparams.hostname.c_str());
|
458 |
fprintf(stderr, " --port PORT port to listen (default (default: %d)\n", sparams.port);
|
459 |
fprintf(stderr, " -to N, --timeout N server read/write timeout in seconds (default: %d)\n", sparams.read_timeout);
|
|
|
460 |
fprintf(stderr, "\n");
|
461 |
}
|
462 |
|
@@ -603,6 +627,8 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
|
|
603 |
params.use_mlock = true;
|
604 |
} else if (arg == "--no-mmap") {
|
605 |
params.use_mmap = false;
|
|
|
|
|
606 |
} else {
|
607 |
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
|
608 |
server_print_usage(argv[0], default_params, default_sparams);
|
@@ -646,6 +672,12 @@ static json format_generation_settings(llama_server_context & llama) {
|
|
646 |
};
|
647 |
}
|
648 |
|
|
|
|
|
|
|
|
|
|
|
|
|
649 |
static json format_final_response(llama_server_context & llama, const std::string & content) {
|
650 |
return json {
|
651 |
{ "content", content },
|
@@ -881,12 +913,27 @@ int main(int argc, char ** argv) {
|
|
881 |
|
882 |
svr.Post("/tokenize", [&llama](const Request & req, Response & res) {
|
883 |
const json body = json::parse(req.body);
|
884 |
-
const std::string content = body
|
885 |
const std::vector<llama_token> tokens = llama_tokenize(llama.ctx, content, false);
|
886 |
const json data = format_tokenizer_response(tokens);
|
887 |
return res.set_content(data.dump(), "application/json");
|
888 |
});
|
889 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
890 |
svr.set_logger(log_server_request);
|
891 |
|
892 |
svr.set_exception_handler([](const Request &, Response & res, std::exception_ptr ep) {
|
|
|
115 |
std::vector<llama_token> embd;
|
116 |
std::vector<llama_token> last_n_tokens;
|
117 |
|
118 |
+
llama_model * model = nullptr;
|
119 |
llama_context * ctx = nullptr;
|
120 |
gpt_params params;
|
121 |
|
|
|
131 |
llama_free(ctx);
|
132 |
ctx = nullptr;
|
133 |
}
|
134 |
+
if (model) {
|
135 |
+
llama_free_model(model);
|
136 |
+
model = nullptr;
|
137 |
+
}
|
138 |
}
|
139 |
|
140 |
void rewind() {
|
|
|
155 |
|
156 |
bool loadModel(const gpt_params & params_) {
|
157 |
params = params_;
|
158 |
+
std::tie(model, ctx) = llama_init_from_gpt_params(params);
|
159 |
+
if (model == nullptr) {
|
160 |
LOG_ERROR("unable to load model", { { "model", params_.model } });
|
161 |
return false;
|
162 |
}
|
|
|
259 |
n_past += n_eval;
|
260 |
}
|
261 |
|
262 |
+
if (params.n_predict == 0) {
|
263 |
+
has_next_token = false;
|
264 |
+
return llama_token_eos();
|
265 |
+
}
|
266 |
+
|
267 |
// out of user input, sample next token
|
268 |
const float temp = params.temp;
|
269 |
const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k;
|
|
|
429 |
|
430 |
return token_text;
|
431 |
}
|
432 |
+
|
433 |
+
std::vector<float> getEmbedding() {
|
434 |
+
static const int n_embd = llama_n_embd(ctx);
|
435 |
+
if (!params.embedding) {
|
436 |
+
LOG_WARNING("embedding disabled", {
|
437 |
+
{ "params.embedding", params.embedding },
|
438 |
+
});
|
439 |
+
return std::vector<float>(n_embd, 0.0f);
|
440 |
+
}
|
441 |
+
const float * data = llama_get_embeddings(ctx);
|
442 |
+
std::vector<float> embedding(data, data + n_embd);
|
443 |
+
return embedding;
|
444 |
+
}
|
445 |
};
|
446 |
|
447 |
static void server_print_usage(const char * argv0, const gpt_params & params,
|
|
|
480 |
fprintf(stderr, " --host ip address to listen (default (default: %s)\n", sparams.hostname.c_str());
|
481 |
fprintf(stderr, " --port PORT port to listen (default (default: %d)\n", sparams.port);
|
482 |
fprintf(stderr, " -to N, --timeout N server read/write timeout in seconds (default: %d)\n", sparams.read_timeout);
|
483 |
+
fprintf(stderr, " --embedding enable embedding vector output (default: %s)\n", params.embedding ? "enabled" : "disabled");
|
484 |
fprintf(stderr, "\n");
|
485 |
}
|
486 |
|
|
|
627 |
params.use_mlock = true;
|
628 |
} else if (arg == "--no-mmap") {
|
629 |
params.use_mmap = false;
|
630 |
+
} else if (arg == "--embedding") {
|
631 |
+
params.embedding = true;
|
632 |
} else {
|
633 |
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
|
634 |
server_print_usage(argv[0], default_params, default_sparams);
|
|
|
672 |
};
|
673 |
}
|
674 |
|
675 |
+
static json format_embedding_response(llama_server_context & llama) {
|
676 |
+
return json {
|
677 |
+
{ "embedding", llama.getEmbedding() },
|
678 |
+
};
|
679 |
+
}
|
680 |
+
|
681 |
static json format_final_response(llama_server_context & llama, const std::string & content) {
|
682 |
return json {
|
683 |
{ "content", content },
|
|
|
913 |
|
914 |
svr.Post("/tokenize", [&llama](const Request & req, Response & res) {
|
915 |
const json body = json::parse(req.body);
|
916 |
+
const std::string content = body.value("content", "");
|
917 |
const std::vector<llama_token> tokens = llama_tokenize(llama.ctx, content, false);
|
918 |
const json data = format_tokenizer_response(tokens);
|
919 |
return res.set_content(data.dump(), "application/json");
|
920 |
});
|
921 |
|
922 |
+
svr.Post("/embedding", [&llama](const Request & req, Response & res) {
|
923 |
+
const json body = json::parse(req.body);
|
924 |
+
|
925 |
+
llama.rewind();
|
926 |
+
llama_reset_timings(llama.ctx);
|
927 |
+
llama.params.prompt = body.value("content", "");
|
928 |
+
llama.params.n_predict = 0;
|
929 |
+
llama.loadPrompt();
|
930 |
+
llama.beginCompletion();
|
931 |
+
llama.doCompletion();
|
932 |
+
|
933 |
+
const json data = format_embedding_response(llama);
|
934 |
+
return res.set_content(data.dump(), "application/json");
|
935 |
+
});
|
936 |
+
|
937 |
svr.set_logger(log_server_request);
|
938 |
|
939 |
svr.set_exception_handler([](const Request &, Response & res, std::exception_ptr ep) {
|
examples/simple/simple.cpp
CHANGED
@@ -68,11 +68,12 @@ int main(int argc, char ** argv)
|
|
68 |
|
69 |
llama_init_backend();
|
70 |
|
71 |
-
|
|
|
72 |
|
73 |
-
ctx = llama_init_from_gpt_params( params );
|
74 |
|
75 |
-
if (
|
76 |
{
|
77 |
fprintf( stderr , "%s: error: unable to load model\n" , __func__ );
|
78 |
return 1;
|
@@ -170,6 +171,7 @@ int main(int argc, char ** argv)
|
|
170 |
} // wend of main loop
|
171 |
|
172 |
llama_free( ctx );
|
|
|
173 |
|
174 |
return 0;
|
175 |
}
|
|
|
68 |
|
69 |
llama_init_backend();
|
70 |
|
71 |
+
llama_model * model;
|
72 |
+
llama_context * ctx;
|
73 |
|
74 |
+
std::tie(model, ctx) = llama_init_from_gpt_params( params );
|
75 |
|
76 |
+
if ( model == NULL )
|
77 |
{
|
78 |
fprintf( stderr , "%s: error: unable to load model\n" , __func__ );
|
79 |
return 1;
|
|
|
171 |
} // wend of main loop
|
172 |
|
173 |
llama_free( ctx );
|
174 |
+
llama_free_model( model );
|
175 |
|
176 |
return 0;
|
177 |
}
|
examples/train-text-from-scratch/train-text-from-scratch.cpp
CHANGED
@@ -3054,7 +3054,8 @@ int main(int argc, char ** argv) {
|
|
3054 |
struct llama_context_params llama_params = llama_context_default_params();
|
3055 |
llama_params.vocab_only = true;
|
3056 |
|
3057 |
-
struct
|
|
|
3058 |
|
3059 |
struct llama_vocab vocab;
|
3060 |
{
|
@@ -3395,6 +3396,8 @@ int main(int argc, char ** argv) {
|
|
3395 |
delete[] compute_addr;
|
3396 |
delete[] compute_buf_0;
|
3397 |
delete[] compute_buf_1;
|
|
|
|
|
3398 |
ggml_free(model.ctx);
|
3399 |
|
3400 |
return 0;
|
|
|
3054 |
struct llama_context_params llama_params = llama_context_default_params();
|
3055 |
llama_params.vocab_only = true;
|
3056 |
|
3057 |
+
struct llama_model * lmodel = llama_load_model_from_file(params.fn_vocab_model, llama_params);
|
3058 |
+
struct llama_context * lctx = llama_new_context_with_model(lmodel, llama_params);
|
3059 |
|
3060 |
struct llama_vocab vocab;
|
3061 |
{
|
|
|
3396 |
delete[] compute_addr;
|
3397 |
delete[] compute_buf_0;
|
3398 |
delete[] compute_buf_1;
|
3399 |
+
llama_free(lctx);
|
3400 |
+
llama_free_model(lmodel);
|
3401 |
ggml_free(model.ctx);
|
3402 |
|
3403 |
return 0;
|
expose.h
CHANGED
@@ -18,7 +18,7 @@ struct load_model_inputs
|
|
18 |
const bool unban_tokens;
|
19 |
const int clblast_info = 0;
|
20 |
const int blasbatchsize = 512;
|
21 |
-
const
|
22 |
const int forceversion = 0;
|
23 |
const int gpulayers = 0;
|
24 |
};
|
|
|
18 |
const bool unban_tokens;
|
19 |
const int clblast_info = 0;
|
20 |
const int blasbatchsize = 512;
|
21 |
+
const int debugmode = 0;
|
22 |
const int forceversion = 0;
|
23 |
const int gpulayers = 0;
|
24 |
};
|
ggml-cuda.cu
CHANGED
@@ -13,6 +13,10 @@
|
|
13 |
#include "ggml-cuda.h"
|
14 |
#include "ggml.h"
|
15 |
|
|
|
|
|
|
|
|
|
16 |
static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
|
17 |
|
18 |
#define CUDA_CHECK(err) \
|
@@ -46,7 +50,15 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
|
|
46 |
} while (0)
|
47 |
#endif // CUDART_VERSION >= 11
|
48 |
|
49 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
50 |
typedef void (*to_fp32_cuda_t)(const void * x, float * y, int k, cudaStream_t stream);
|
51 |
typedef void (*dot_kernel_k_t)(const void * vx, const int ib, const int iqs, const float * y, float & v);
|
52 |
typedef void (*cpy_kernel_t)(const char * cx, char * cdst);
|
@@ -230,82 +242,106 @@ static __global__ void rms_norm_f32(const float * x, float * dst, const int ncol
|
|
230 |
}
|
231 |
}
|
232 |
|
233 |
-
static __device__ void dequantize_q4_0(const void * vx, const int ib, const int iqs,
|
234 |
const block_q4_0 * x = (const block_q4_0 *) vx;
|
235 |
|
236 |
-
const
|
237 |
|
238 |
-
const
|
239 |
|
240 |
-
|
241 |
-
|
242 |
|
243 |
-
|
244 |
-
|
|
|
|
|
|
|
|
|
|
|
245 |
}
|
246 |
|
247 |
-
static __device__ void dequantize_q4_1(const void * vx, const int ib, const int iqs,
|
248 |
const block_q4_1 * x = (const block_q4_1 *) vx;
|
249 |
|
250 |
-
const
|
251 |
-
const
|
252 |
|
253 |
-
const
|
254 |
|
255 |
-
|
256 |
-
|
257 |
|
258 |
-
|
259 |
-
|
|
|
|
|
|
|
|
|
|
|
260 |
}
|
261 |
|
262 |
-
static __device__ void dequantize_q5_0(const void * vx, const int ib, const int iqs,
|
263 |
const block_q5_0 * x = (const block_q5_0 *) vx;
|
264 |
|
265 |
-
const
|
266 |
|
267 |
uint32_t qh;
|
268 |
memcpy(&qh, x[ib].qh, sizeof(qh));
|
269 |
|
270 |
-
const
|
271 |
-
const
|
272 |
|
273 |
-
|
274 |
-
|
275 |
|
276 |
-
|
277 |
-
|
|
|
|
|
|
|
|
|
|
|
278 |
}
|
279 |
|
280 |
-
static __device__ void dequantize_q5_1(const void * vx, const int ib, const int iqs,
|
281 |
const block_q5_1 * x = (const block_q5_1 *) vx;
|
282 |
|
283 |
-
const
|
284 |
-
const
|
285 |
|
286 |
uint32_t qh;
|
287 |
memcpy(&qh, x[ib].qh, sizeof(qh));
|
288 |
|
289 |
-
const
|
290 |
-
const
|
291 |
|
292 |
-
|
293 |
-
|
294 |
|
295 |
-
|
296 |
-
|
|
|
|
|
|
|
|
|
|
|
297 |
}
|
298 |
|
299 |
-
static __device__ void dequantize_q8_0(const void * vx, const int ib, const int iqs,
|
300 |
const block_q8_0 * x = (const block_q8_0 *) vx;
|
301 |
|
302 |
-
const
|
303 |
|
304 |
-
|
305 |
-
|
306 |
|
307 |
-
|
308 |
-
|
|
|
|
|
|
|
|
|
309 |
}
|
310 |
|
311 |
//================================== k-quants
|
@@ -479,15 +515,15 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float
|
|
479 |
|
480 |
const block_q2_K * x = (const block_q2_K *)vx + ib0;
|
481 |
|
482 |
-
const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31
|
483 |
-
const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0
|
484 |
|
485 |
const int step = 16/K_QUANTS_PER_ITERATION;
|
486 |
|
487 |
-
const int im = tid/step;
|
488 |
-
const int in = tid - step*im;
|
489 |
|
490 |
-
const int l0 = K_QUANTS_PER_ITERATION*in;
|
491 |
const int q_offset = 32*im + l0;
|
492 |
const int s_offset = 8*im;
|
493 |
const int y_offset = 128*im + l0;
|
@@ -542,27 +578,30 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float
|
|
542 |
}
|
543 |
}
|
544 |
|
545 |
-
static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float * yy, float * dst, const int ncols) {
|
546 |
|
547 |
const uint16_t kmask1 = 0x0303;
|
548 |
const uint16_t kmask2 = 0x0f0f;
|
549 |
|
550 |
-
const int row = blockIdx.
|
|
|
|
|
551 |
const int num_blocks_per_row = ncols / QK_K;
|
552 |
const int ib0 = row*num_blocks_per_row;
|
553 |
|
554 |
const block_q3_K * x = (const block_q3_K *)vx + ib0;
|
555 |
|
556 |
-
const int tid = threadIdx.x/
|
557 |
-
const int ix = threadIdx.x%
|
558 |
|
559 |
-
const int n =
|
560 |
-
const int
|
561 |
-
const int
|
|
|
562 |
|
563 |
const uint8_t m = 1 << (4*im);
|
564 |
|
565 |
-
const int l0 = n*in;
|
566 |
const int q_offset = 32*im + l0;
|
567 |
const int y_offset = 128*im + l0;
|
568 |
|
@@ -573,7 +612,7 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float
|
|
573 |
|
574 |
float tmp = 0; // partial sum for thread in warp
|
575 |
|
576 |
-
for (int i = ix; i < num_blocks_per_row; i +=
|
577 |
|
578 |
const float * y = yy + i * QK_K + y_offset;
|
579 |
const uint8_t * q = x[i].qs + q_offset;
|
@@ -614,22 +653,25 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float
|
|
614 |
}
|
615 |
}
|
616 |
|
617 |
-
static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float * yy, float * dst, const int ncols) {
|
618 |
|
619 |
const uint16_t kmask1 = 0x3f3f;
|
620 |
const uint16_t kmask2 = 0x0f0f;
|
621 |
const uint16_t kmask3 = 0xc0c0;
|
622 |
|
623 |
-
const int row = blockIdx.
|
|
|
624 |
const int num_blocks_per_row = ncols / QK_K;
|
625 |
const int ib0 = row*num_blocks_per_row;
|
626 |
|
627 |
-
const int tid = threadIdx.x/
|
628 |
-
const int ix = threadIdx.x%
|
629 |
|
630 |
-
const int
|
631 |
-
|
632 |
-
const int
|
|
|
|
|
633 |
|
634 |
const int im = il/2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
|
635 |
const int in = il%2;
|
@@ -645,7 +687,7 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float
|
|
645 |
|
646 |
float tmp = 0; // partial sum for thread in warp
|
647 |
|
648 |
-
for (int i = ix; i < num_blocks_per_row; i +=
|
649 |
|
650 |
const uint8_t * q1 = x[i].qs + q_offset;
|
651 |
const uint8_t * q2 = q1 + 64;
|
@@ -700,7 +742,7 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float
|
|
700 |
|
701 |
const int il = tid/4; // 0...3
|
702 |
const int ir = tid - 4*il;// 0...3
|
703 |
-
const int n =
|
704 |
|
705 |
const int im = il/2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
|
706 |
const int in = il%2;
|
@@ -739,11 +781,16 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float
|
|
739 |
float4 sum = {0.f, 0.f, 0.f, 0.f};
|
740 |
float smin = 0;
|
741 |
for (int l = 0; l < n; ++l) {
|
742 |
-
sum.x += y1[l+ 0] * ((ql1[l] & 0xF) + (qh[l] & (hm1 << 0) ? 16 : 0))
|
743 |
-
|
744 |
-
sum.
|
745 |
-
|
746 |
-
|
|
|
|
|
|
|
|
|
|
|
747 |
}
|
748 |
tmp += dall * (sum.x * sc[0] + sum.y * sc[1] + sum.z * sc[4] + sum.w * sc[5]) - dmin * smin;
|
749 |
|
@@ -839,11 +886,12 @@ static __global__ void dequantize_mul_mat_vec_q6_k(const void * vx, const float
|
|
839 |
}
|
840 |
}
|
841 |
|
842 |
-
static __device__ void convert_f16(const void * vx, const int ib, const int iqs,
|
843 |
const half * x = (const half *) vx;
|
844 |
|
845 |
-
|
846 |
-
|
|
|
847 |
}
|
848 |
|
849 |
template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
|
@@ -860,13 +908,15 @@ static __global__ void dequantize_block(const void * vx, float * y, const int k)
|
|
860 |
const int y_offset = qr == 1 ? 1 : qk/2;
|
861 |
|
862 |
// dequantize
|
863 |
-
|
864 |
-
|
865 |
-
|
|
|
|
|
866 |
}
|
867 |
|
868 |
template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
|
869 |
-
static __global__ void dequantize_mul_mat_vec(const void * vx, const
|
870 |
// qk = quantized weights per x block
|
871 |
// qr = number of quantized weights per data value in x block
|
872 |
const int row = blockIdx.y*blockDim.y + threadIdx.y;
|
@@ -881,7 +931,12 @@ static __global__ void dequantize_mul_mat_vec(const void * vx, const float * y,
|
|
881 |
const int vals_per_iter = iter_stride / WARP_SIZE; // num quantized vals per thread and i iter
|
882 |
const int y_offset = qr == 1 ? 1 : qk/2;
|
883 |
|
884 |
-
|
|
|
|
|
|
|
|
|
|
|
885 |
|
886 |
for (int i = 0; i < ncols; i += iter_stride) {
|
887 |
const int col = i + vals_per_iter*tid;
|
@@ -895,14 +950,21 @@ static __global__ void dequantize_mul_mat_vec(const void * vx, const float * y,
|
|
895 |
// process 2 vals per j iter
|
896 |
|
897 |
// dequantize
|
898 |
-
float v0, v1;
|
899 |
-
dequantize_kernel(vx, ib, iqs + j/qr, v0, v1);
|
900 |
// for qr = 2 the iqs needs to increase by 1 per j iter because 2 weights per data val
|
|
|
|
|
901 |
|
902 |
// matrix multiplication
|
903 |
-
tmp += v0 * y[iybs + iqs + j/qr + 0];
|
904 |
-
tmp += v1 * y[iybs + iqs + j/qr + y_offset];
|
905 |
// for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
906 |
}
|
907 |
}
|
908 |
|
@@ -914,7 +976,11 @@ static __global__ void dequantize_mul_mat_vec(const void * vx, const float * y,
|
|
914 |
}
|
915 |
|
916 |
if (tid == 0) {
|
|
|
|
|
|
|
917 |
dst[row] = tmp;
|
|
|
918 |
}
|
919 |
}
|
920 |
|
@@ -1209,7 +1275,7 @@ static void dequantize_row_q6_K_cuda(const void * vx, float * y, const int k, cu
|
|
1209 |
dequantize_block_q6_K<<<nb, 64, 0, stream>>>(vx, y);
|
1210 |
}
|
1211 |
|
1212 |
-
static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const
|
1213 |
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
|
1214 |
const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
|
1215 |
const dim3 block_nums(1, block_num_y, 1);
|
@@ -1218,7 +1284,7 @@ static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const float * y, f
|
|
1218 |
<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
|
1219 |
}
|
1220 |
|
1221 |
-
static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const
|
1222 |
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
|
1223 |
const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
|
1224 |
const dim3 block_nums(1, block_num_y, 1);
|
@@ -1227,7 +1293,7 @@ static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const float * y, f
|
|
1227 |
<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
|
1228 |
}
|
1229 |
|
1230 |
-
static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const
|
1231 |
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
|
1232 |
const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
|
1233 |
const dim3 block_nums(1, block_num_y, 1);
|
@@ -1236,7 +1302,7 @@ static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const float * y, f
|
|
1236 |
<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
|
1237 |
}
|
1238 |
|
1239 |
-
static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const
|
1240 |
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
|
1241 |
const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
|
1242 |
const dim3 block_nums(1, block_num_y, 1);
|
@@ -1245,7 +1311,7 @@ static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const float * y, f
|
|
1245 |
<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
|
1246 |
}
|
1247 |
|
1248 |
-
static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const
|
1249 |
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
|
1250 |
const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
|
1251 |
const dim3 block_nums(1, block_num_y, 1);
|
@@ -1256,7 +1322,7 @@ static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const float * y, f
|
|
1256 |
|
1257 |
static void dequantize_mul_mat_vec_q2_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
1258 |
GGML_ASSERT(ncols % QK_K == 0);
|
1259 |
-
const int ny = 2;
|
1260 |
const int block_num_y = (nrows + ny - 1) / ny;
|
1261 |
const dim3 block_nums(1, block_num_y, 1);
|
1262 |
const dim3 block_dims(32, ny, 1);
|
@@ -1265,14 +1331,20 @@ static void dequantize_mul_mat_vec_q2_K_cuda(const void * vx, const float * y, f
|
|
1265 |
|
1266 |
static void dequantize_mul_mat_vec_q3_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
1267 |
GGML_ASSERT(ncols % QK_K == 0);
|
1268 |
-
const
|
1269 |
-
|
|
|
|
|
|
|
1270 |
}
|
1271 |
|
1272 |
static void dequantize_mul_mat_vec_q4_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
1273 |
GGML_ASSERT(ncols % QK_K == 0);
|
1274 |
-
const
|
1275 |
-
|
|
|
|
|
|
|
1276 |
}
|
1277 |
|
1278 |
static void dequantize_mul_mat_vec_q5_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
@@ -1295,7 +1367,7 @@ static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, c
|
|
1295 |
dequantize_block<1, 1, convert_f16><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
|
1296 |
}
|
1297 |
|
1298 |
-
static void convert_mul_mat_vec_f16_cuda(const void * vx, const
|
1299 |
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
|
1300 |
const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
|
1301 |
const dim3 block_nums(1, block_num_y, 1);
|
@@ -1463,19 +1535,13 @@ static void * g_scratch_buffer = nullptr;
|
|
1463 |
static size_t g_scratch_size = 1024*1024*1024; // 1 GB by default
|
1464 |
static size_t g_scratch_offset = 0;
|
1465 |
|
1466 |
-
#define GGML_CUDA_MAX_STREAMS 8 // Set this to 1 for reproducible matrix multiplication.
|
1467 |
-
#define GGML_CUDA_MAX_EVENTS 64
|
1468 |
-
|
1469 |
static int g_device_count = -1;
|
1470 |
static int g_main_device = 0;
|
1471 |
static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
|
1472 |
|
1473 |
static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
|
1474 |
|
1475 |
-
static cudaStream_t g_cudaStreams_main[GGML_CUDA_MAX_DEVICES]
|
1476 |
-
|
1477 |
-
static cudaStream_t g_cudaStreams_memcpy_src1[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS] = { nullptr };
|
1478 |
-
static cudaEvent_t g_cudaEvents_memcpy_src1[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_EVENTS] = { nullptr };
|
1479 |
|
1480 |
void ggml_init_cublas() {
|
1481 |
static bool initialized = false;
|
@@ -1499,15 +1565,8 @@ void ggml_init_cublas() {
|
|
1499 |
for (int id = 0; id < g_device_count; ++id) {
|
1500 |
CUDA_CHECK(cudaSetDevice(id));
|
1501 |
|
1502 |
-
// create
|
1503 |
-
|
1504 |
-
CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams_main[id][i], cudaStreamNonBlocking));
|
1505 |
-
CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams_memcpy_src1[id][i], cudaStreamNonBlocking));
|
1506 |
-
}
|
1507 |
-
// create events
|
1508 |
-
for (int i = 0; i < GGML_CUDA_MAX_EVENTS; ++i) {
|
1509 |
-
CUDA_CHECK(cudaEventCreateWithFlags(&g_cudaEvents_memcpy_src1[id][i], cudaEventDisableTiming));
|
1510 |
-
}
|
1511 |
|
1512 |
// create cublas handle
|
1513 |
CUBLAS_CHECK(cublasCreate(&g_cublas_handles[id]));
|
@@ -1723,21 +1782,40 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec(
|
|
1723 |
const int64_t ne00 = src0->ne[0];
|
1724 |
const int64_t nrows = i01_high - i01_low;
|
1725 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1726 |
switch (src0->type) {
|
1727 |
case GGML_TYPE_Q4_0:
|
1728 |
-
dequantize_mul_mat_vec_q4_0_cuda(src0_ddq_i,
|
1729 |
break;
|
1730 |
case GGML_TYPE_Q4_1:
|
1731 |
-
dequantize_mul_mat_vec_q4_1_cuda(src0_ddq_i,
|
1732 |
break;
|
1733 |
case GGML_TYPE_Q5_0:
|
1734 |
-
dequantize_mul_mat_vec_q5_0_cuda(src0_ddq_i,
|
1735 |
break;
|
1736 |
case GGML_TYPE_Q5_1:
|
1737 |
-
dequantize_mul_mat_vec_q5_1_cuda(src0_ddq_i,
|
1738 |
break;
|
1739 |
case GGML_TYPE_Q8_0:
|
1740 |
-
dequantize_mul_mat_vec_q8_0_cuda(src0_ddq_i,
|
1741 |
break;
|
1742 |
case GGML_TYPE_Q2_K:
|
1743 |
dequantize_mul_mat_vec_q2_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
|
@@ -1755,7 +1833,7 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec(
|
|
1755 |
dequantize_mul_mat_vec_q6_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
|
1756 |
break;
|
1757 |
case GGML_TYPE_F16:
|
1758 |
-
convert_mul_mat_vec_f16_cuda(src0_ddq_i,
|
1759 |
break;
|
1760 |
default:
|
1761 |
GGML_ASSERT(false);
|
@@ -1763,6 +1841,12 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec(
|
|
1763 |
}
|
1764 |
CUDA_CHECK(cudaGetLastError());
|
1765 |
|
|
|
|
|
|
|
|
|
|
|
|
|
1766 |
(void) src1;
|
1767 |
(void) dst;
|
1768 |
(void) src0_ddf_i;
|
@@ -1974,6 +2058,12 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
|
|
1974 |
size_t src1_asf[GGML_CUDA_MAX_DEVICES] = {0};
|
1975 |
size_t dst_asf[GGML_CUDA_MAX_DEVICES] = {0};
|
1976 |
|
|
|
|
|
|
|
|
|
|
|
|
|
1977 |
for (int id = 0; id < g_device_count; ++id) {
|
1978 |
if (!split && id != g_main_device) {
|
1979 |
continue;
|
@@ -2072,9 +2162,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
|
|
2072 |
}
|
2073 |
const int64_t i11 = i13*ne12 + i12;
|
2074 |
|
2075 |
-
cudaStream_t cudaStream_main
|
2076 |
-
cudaStream_t cudaStream_memcpy_src1 = g_cudaStreams_memcpy_src1[id][i0 % GGML_CUDA_MAX_STREAMS];
|
2077 |
-
cudaEvent_t cudaEvent_memcpy_src1 = g_cudaEvents_memcpy_src1[id][i0 % GGML_CUDA_MAX_EVENTS];
|
2078 |
|
2079 |
// for split tensors the data begins at i0 == i0_offset_low
|
2080 |
char * src0_ddq_i = src0_ddq[id] + (i0 - i0_offset_low)*src0_stride*src0_ts/src0_bs;
|
@@ -2102,14 +2190,14 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
|
|
2102 |
if (src1->backend == GGML_BACKEND_CPU) {
|
2103 |
GGML_ASSERT(!flatten_rows || nrows0 == ggml_nrows(src1));
|
2104 |
int64_t nrows1 = flatten_rows ? nrows0 : ne11;
|
2105 |
-
CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf_i, src1, i03, i02, 0, nrows1,
|
2106 |
} else if (src1->backend == GGML_BACKEND_GPU && src1_is_contiguous) {
|
2107 |
if (id != g_main_device) {
|
2108 |
GGML_ASSERT(!flatten_rows);
|
2109 |
float * src1_ddf_i_source = (float *) src1_extra->data_device[g_main_device];
|
2110 |
src1_ddf_i_source += i11*src1_stride;
|
2111 |
CUDA_CHECK(cudaMemcpyAsync(src1_ddf_i, src1_ddf_i_source, src1_stride*sizeof(float),
|
2112 |
-
cudaMemcpyDeviceToDevice,
|
2113 |
}
|
2114 |
} else if (src1_on_device && !src1_is_contiguous) {
|
2115 |
GGML_ASSERT(!split);
|
@@ -2118,7 +2206,6 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
|
|
2118 |
GGML_ASSERT(false);
|
2119 |
}
|
2120 |
}
|
2121 |
-
CUDA_CHECK(cudaEventRecord(cudaEvent_memcpy_src1, cudaStream_memcpy_src1));
|
2122 |
|
2123 |
if (!src0_on_device || !src0_is_contiguous) {
|
2124 |
if (src0_is_f32) {
|
@@ -2134,9 +2221,6 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
|
|
2134 |
CUDA_CHECK(cudaGetLastError());
|
2135 |
}
|
2136 |
|
2137 |
-
// wait with main stream until src1 memcpy is done
|
2138 |
-
CUDA_CHECK(cudaStreamWaitEvent(cudaStream_main, cudaEvent_memcpy_src1, 0));
|
2139 |
-
|
2140 |
// do the computation
|
2141 |
op(src0, src1, dst, src0_ddq_i, src0_ddf_i, src1_ddf_i, dst_ddf_i, i02, i01_low, i01_high, i11, cudaStream_main);
|
2142 |
|
@@ -2174,8 +2258,13 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
|
|
2174 |
|
2175 |
// wait until each device is finished, then free their buffers
|
2176 |
for (int id = 0; id < g_device_count; ++id) {
|
|
|
|
|
|
|
|
|
2177 |
CUDA_CHECK(cudaSetDevice(id));
|
2178 |
CUDA_CHECK(cudaDeviceSynchronize());
|
|
|
2179 |
if (src0_asq[id] > 0) {
|
2180 |
ggml_cuda_pool_free(src0_ddq[id], src0_asq[id]);
|
2181 |
}
|
@@ -2241,7 +2330,7 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr
|
|
2241 |
const int64_t ne02 = src0->ne[2];
|
2242 |
|
2243 |
CUDA_CHECK(cudaSetDevice(g_main_device));
|
2244 |
-
cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device]
|
2245 |
|
2246 |
struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
|
2247 |
void * src0_ddq = src0_extra->data_device[g_main_device];
|
@@ -2253,8 +2342,6 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr
|
|
2253 |
float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
|
2254 |
|
2255 |
ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, cudaStream_main);
|
2256 |
-
|
2257 |
-
CUDA_CHECK(cudaDeviceSynchronize());
|
2258 |
}
|
2259 |
|
2260 |
void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
|
@@ -2272,7 +2359,7 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1
|
|
2272 |
const int64_t nb02 = src0->nb[2];
|
2273 |
|
2274 |
CUDA_CHECK(cudaSetDevice(g_main_device));
|
2275 |
-
cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device]
|
2276 |
|
2277 |
struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
|
2278 |
void * src0_ddq = src0_extra->data_device[g_main_device];
|
@@ -2287,8 +2374,6 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1
|
|
2287 |
const int channel_stride_x = nb02 / sizeof(half);
|
2288 |
|
2289 |
ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, channel_stride_x, cudaStream_main);
|
2290 |
-
|
2291 |
-
CUDA_CHECK(cudaDeviceSynchronize());
|
2292 |
}
|
2293 |
|
2294 |
void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
@@ -2344,7 +2429,7 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
|
|
2344 |
const int64_t nb12 = src1->nb[2];
|
2345 |
|
2346 |
CUDA_CHECK(cudaSetDevice(g_main_device));
|
2347 |
-
cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device]
|
2348 |
|
2349 |
const struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
|
2350 |
const struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
|
@@ -2362,8 +2447,6 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
|
|
2362 |
GGML_ASSERT(false);
|
2363 |
}
|
2364 |
|
2365 |
-
CUDA_CHECK(cudaDeviceSynchronize());
|
2366 |
-
|
2367 |
(void) dst;
|
2368 |
}
|
2369 |
|
@@ -2552,7 +2635,7 @@ void ggml_cuda_free_scratch() {
|
|
2552 |
bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor){
|
2553 |
ggml_cuda_func_t func;
|
2554 |
const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
|
2555 |
-
|| tensor->src0->backend == GGML_BACKEND_GPU || tensor->src0->backend == GGML_BACKEND_GPU_SPLIT
|
2556 |
|| (tensor->src1 != nullptr && tensor->src1->backend == GGML_BACKEND_GPU);
|
2557 |
|
2558 |
switch (tensor->op) {
|
|
|
13 |
#include "ggml-cuda.h"
|
14 |
#include "ggml.h"
|
15 |
|
16 |
+
#if defined(_MSC_VER)
|
17 |
+
#pragma warning(disable: 4244 4267) // possible loss of data
|
18 |
+
#endif
|
19 |
+
|
20 |
static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
|
21 |
|
22 |
#define CUDA_CHECK(err) \
|
|
|
50 |
} while (0)
|
51 |
#endif // CUDART_VERSION >= 11
|
52 |
|
53 |
+
#ifdef GGML_CUDA_DMMV_F16
|
54 |
+
typedef half dfloat; // dequantize float
|
55 |
+
typedef half2 dfloat2;
|
56 |
+
#else
|
57 |
+
typedef float dfloat; // dequantize float
|
58 |
+
typedef float2 dfloat2;
|
59 |
+
#endif //GGML_CUDA_DMMV_F16
|
60 |
+
|
61 |
+
typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v);
|
62 |
typedef void (*to_fp32_cuda_t)(const void * x, float * y, int k, cudaStream_t stream);
|
63 |
typedef void (*dot_kernel_k_t)(const void * vx, const int ib, const int iqs, const float * y, float & v);
|
64 |
typedef void (*cpy_kernel_t)(const char * cx, char * cdst);
|
|
|
242 |
}
|
243 |
}
|
244 |
|
245 |
+
static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
|
246 |
const block_q4_0 * x = (const block_q4_0 *) vx;
|
247 |
|
248 |
+
const dfloat d = x[ib].d;
|
249 |
|
250 |
+
const int vui = x[ib].qs[iqs];
|
251 |
|
252 |
+
v.x = vui & 0xF;
|
253 |
+
v.y = vui >> 4;
|
254 |
|
255 |
+
#ifdef GGML_CUDA_DMMV_F16
|
256 |
+
v = __hsub2(v, {8.0f, 8.0f});
|
257 |
+
v = __hmul2(v, {d, d});
|
258 |
+
#else
|
259 |
+
v.x = (v.x - 8.0f) * d;
|
260 |
+
v.y = (v.y - 8.0f) * d;
|
261 |
+
#endif // GGML_CUDA_DMMV_F16
|
262 |
}
|
263 |
|
264 |
+
static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
|
265 |
const block_q4_1 * x = (const block_q4_1 *) vx;
|
266 |
|
267 |
+
const dfloat d = x[ib].d;
|
268 |
+
const dfloat m = x[ib].m;
|
269 |
|
270 |
+
const int vui = x[ib].qs[iqs];
|
271 |
|
272 |
+
v.x = vui & 0xF;
|
273 |
+
v.y = vui >> 4;
|
274 |
|
275 |
+
#ifdef GGML_CUDA_DMMV_F16
|
276 |
+
v = __hmul2(v, {d, d});
|
277 |
+
v = __hadd2(v, {m, m});
|
278 |
+
#else
|
279 |
+
v.x = (v.x * d) + m;
|
280 |
+
v.y = (v.y * d) + m;
|
281 |
+
#endif // GGML_CUDA_DMMV_F16
|
282 |
}
|
283 |
|
284 |
+
static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
|
285 |
const block_q5_0 * x = (const block_q5_0 *) vx;
|
286 |
|
287 |
+
const dfloat d = x[ib].d;
|
288 |
|
289 |
uint32_t qh;
|
290 |
memcpy(&qh, x[ib].qh, sizeof(qh));
|
291 |
|
292 |
+
const int xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10;
|
293 |
+
const int xh_1 = ((qh >> (iqs + 12)) ) & 0x10;
|
294 |
|
295 |
+
v.x = ((x[ib].qs[iqs] & 0xf) | xh_0);
|
296 |
+
v.y = ((x[ib].qs[iqs] >> 4) | xh_1);
|
297 |
|
298 |
+
#ifdef GGML_CUDA_DMMV_F16
|
299 |
+
v = __hsub2(v, {16.0f, 16.0f});
|
300 |
+
v = __hmul2(v, {d, d});
|
301 |
+
#else
|
302 |
+
v.x = (v.x - 16.0f) * d;
|
303 |
+
v.y = (v.y - 16.0f) * d;
|
304 |
+
#endif // GGML_CUDA_DMMV_F16
|
305 |
}
|
306 |
|
307 |
+
static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
|
308 |
const block_q5_1 * x = (const block_q5_1 *) vx;
|
309 |
|
310 |
+
const dfloat d = x[ib].d;
|
311 |
+
const dfloat m = x[ib].m;
|
312 |
|
313 |
uint32_t qh;
|
314 |
memcpy(&qh, x[ib].qh, sizeof(qh));
|
315 |
|
316 |
+
const int xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10;
|
317 |
+
const int xh_1 = ((qh >> (iqs + 12)) ) & 0x10;
|
318 |
|
319 |
+
v.x = ((x[ib].qs[iqs] & 0xf) | xh_0);
|
320 |
+
v.y = ((x[ib].qs[iqs] >> 4) | xh_1);
|
321 |
|
322 |
+
#ifdef GGML_CUDA_DMMV_F16
|
323 |
+
v = __hmul2(v, {d, d});
|
324 |
+
v = __hadd2(v, {m, m});
|
325 |
+
#else
|
326 |
+
v.x = (v.x * d) + m;
|
327 |
+
v.y = (v.y * d) + m;
|
328 |
+
#endif // GGML_CUDA_DMMV_F16
|
329 |
}
|
330 |
|
331 |
+
static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
|
332 |
const block_q8_0 * x = (const block_q8_0 *) vx;
|
333 |
|
334 |
+
const dfloat d = x[ib].d;
|
335 |
|
336 |
+
v.x = x[ib].qs[iqs + 0];
|
337 |
+
v.y = x[ib].qs[iqs + 1];
|
338 |
|
339 |
+
#ifdef GGML_CUDA_DMMV_F16
|
340 |
+
v = __hmul2(v, {d, d});
|
341 |
+
#else
|
342 |
+
v.x *= d;
|
343 |
+
v.y *= d;
|
344 |
+
#endif // GGML_CUDA_DMMV_F16
|
345 |
}
|
346 |
|
347 |
//================================== k-quants
|
|
|
515 |
|
516 |
const block_q2_K * x = (const block_q2_K *)vx + ib0;
|
517 |
|
518 |
+
const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...15
|
519 |
+
const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0,1
|
520 |
|
521 |
const int step = 16/K_QUANTS_PER_ITERATION;
|
522 |
|
523 |
+
const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
|
524 |
+
const int in = tid - step*im; // 0...15 or 0...7
|
525 |
|
526 |
+
const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15 or 0...14 in steps of 2
|
527 |
const int q_offset = 32*im + l0;
|
528 |
const int s_offset = 8*im;
|
529 |
const int y_offset = 128*im + l0;
|
|
|
578 |
}
|
579 |
}
|
580 |
|
581 |
+
static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
|
582 |
|
583 |
const uint16_t kmask1 = 0x0303;
|
584 |
const uint16_t kmask2 = 0x0f0f;
|
585 |
|
586 |
+
const int row = blockIdx.y*blockDim.y + threadIdx.y;
|
587 |
+
if (row > nrows) return;
|
588 |
+
|
589 |
const int num_blocks_per_row = ncols / QK_K;
|
590 |
const int ib0 = row*num_blocks_per_row;
|
591 |
|
592 |
const block_q3_K * x = (const block_q3_K *)vx + ib0;
|
593 |
|
594 |
+
const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16
|
595 |
+
const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0,1
|
596 |
|
597 |
+
const int n = K_QUANTS_PER_ITERATION; // iterations in the inner loop
|
598 |
+
const int step = 16/K_QUANTS_PER_ITERATION;
|
599 |
+
const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
|
600 |
+
const int in = tid - step*im; // 0....15 or 0...7
|
601 |
|
602 |
const uint8_t m = 1 << (4*im);
|
603 |
|
604 |
+
const int l0 = n*in; // 0...15 or 0...14 in steps of 2
|
605 |
const int q_offset = 32*im + l0;
|
606 |
const int y_offset = 128*im + l0;
|
607 |
|
|
|
612 |
|
613 |
float tmp = 0; // partial sum for thread in warp
|
614 |
|
615 |
+
for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
|
616 |
|
617 |
const float * y = yy + i * QK_K + y_offset;
|
618 |
const uint8_t * q = x[i].qs + q_offset;
|
|
|
653 |
}
|
654 |
}
|
655 |
|
656 |
+
static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
|
657 |
|
658 |
const uint16_t kmask1 = 0x3f3f;
|
659 |
const uint16_t kmask2 = 0x0f0f;
|
660 |
const uint16_t kmask3 = 0xc0c0;
|
661 |
|
662 |
+
const int row = blockIdx.y*blockDim.y + threadIdx.y;
|
663 |
+
if (row > nrows) return;
|
664 |
const int num_blocks_per_row = ncols / QK_K;
|
665 |
const int ib0 = row*num_blocks_per_row;
|
666 |
|
667 |
+
const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16
|
668 |
+
const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0,1
|
669 |
|
670 |
+
const int step = 8/K_QUANTS_PER_ITERATION; // 8 or 4
|
671 |
+
|
672 |
+
const int il = tid/step; // 0...3
|
673 |
+
const int ir = tid - step*il; // 0...7 or 0...3
|
674 |
+
const int n = 2 * K_QUANTS_PER_ITERATION; // 2 or 4
|
675 |
|
676 |
const int im = il/2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
|
677 |
const int in = il%2;
|
|
|
687 |
|
688 |
float tmp = 0; // partial sum for thread in warp
|
689 |
|
690 |
+
for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
|
691 |
|
692 |
const uint8_t * q1 = x[i].qs + q_offset;
|
693 |
const uint8_t * q2 = q1 + 64;
|
|
|
742 |
|
743 |
const int il = tid/4; // 0...3
|
744 |
const int ir = tid - 4*il;// 0...3
|
745 |
+
const int n = 2;
|
746 |
|
747 |
const int im = il/2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
|
748 |
const int in = il%2;
|
|
|
781 |
float4 sum = {0.f, 0.f, 0.f, 0.f};
|
782 |
float smin = 0;
|
783 |
for (int l = 0; l < n; ++l) {
|
784 |
+
sum.x += y1[l+ 0] * ((ql1[l+ 0] & 0xF) + (qh[l+ 0] & (hm1 << 0) ? 16 : 0))
|
785 |
+
+ y1[l+16] * ((ql1[l+16] & 0xF) + (qh[l+16] & (hm1 << 0) ? 16 : 0));
|
786 |
+
sum.y += y1[l+32] * ((ql1[l+ 0] >> 4) + (qh[l+ 0] & (hm1 << 1) ? 16 : 0))
|
787 |
+
+ y1[l+48] * ((ql1[l+16] >> 4) + (qh[l+16] & (hm1 << 1) ? 16 : 0));
|
788 |
+
sum.z += y2[l+ 0] * ((ql2[l+ 0] & 0xF) + (qh[l+ 0] & (hm2 << 0) ? 16 : 0))
|
789 |
+
+ y2[l+16] * ((ql2[l+16] & 0xF) + (qh[l+16] & (hm2 << 0) ? 16 : 0));
|
790 |
+
sum.w += y2[l+32] * ((ql2[l+ 0] >> 4) + (qh[l+ 0] & (hm2 << 1) ? 16 : 0))
|
791 |
+
+ y2[l+48] * ((ql2[l+16] >> 4) + (qh[l+16] & (hm2 << 1) ? 16 : 0));
|
792 |
+
smin += (y1[l] + y1[l+16]) * sc[2] + (y1[l+32] + y1[l+48]) * sc[3]
|
793 |
+
+ (y2[l] + y2[l+16]) * sc[6] + (y2[l+32] + y2[l+48]) * sc[7];
|
794 |
}
|
795 |
tmp += dall * (sum.x * sc[0] + sum.y * sc[1] + sum.z * sc[4] + sum.w * sc[5]) - dmin * smin;
|
796 |
|
|
|
886 |
}
|
887 |
}
|
888 |
|
889 |
+
static __device__ void convert_f16(const void * vx, const int ib, const int iqs, dfloat2 & v){
|
890 |
const half * x = (const half *) vx;
|
891 |
|
892 |
+
// automatic half -> float type cast if dfloat == float
|
893 |
+
v.x = x[ib + iqs + 0];
|
894 |
+
v.y = x[ib + iqs + 1];
|
895 |
}
|
896 |
|
897 |
template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
|
|
|
908 |
const int y_offset = qr == 1 ? 1 : qk/2;
|
909 |
|
910 |
// dequantize
|
911 |
+
dfloat2 v;
|
912 |
+
dequantize_kernel(vx, ib, iqs, v);
|
913 |
+
|
914 |
+
y[iybs + iqs + 0] = v.x;
|
915 |
+
y[iybs + iqs + y_offset] = v.y;
|
916 |
}
|
917 |
|
918 |
template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
|
919 |
+
static __global__ void dequantize_mul_mat_vec(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows) {
|
920 |
// qk = quantized weights per x block
|
921 |
// qr = number of quantized weights per data value in x block
|
922 |
const int row = blockIdx.y*blockDim.y + threadIdx.y;
|
|
|
931 |
const int vals_per_iter = iter_stride / WARP_SIZE; // num quantized vals per thread and i iter
|
932 |
const int y_offset = qr == 1 ? 1 : qk/2;
|
933 |
|
934 |
+
// partial sum for each thread
|
935 |
+
#ifdef GGML_CUDA_DMMV_F16
|
936 |
+
half2 tmp = {0.0f, 0.0f}; // two sums for f16 to take advantage of half2 intrinsics
|
937 |
+
#else
|
938 |
+
float tmp = 0.0f;
|
939 |
+
#endif // GGML_CUDA_DMMV_F16
|
940 |
|
941 |
for (int i = 0; i < ncols; i += iter_stride) {
|
942 |
const int col = i + vals_per_iter*tid;
|
|
|
950 |
// process 2 vals per j iter
|
951 |
|
952 |
// dequantize
|
|
|
|
|
953 |
// for qr = 2 the iqs needs to increase by 1 per j iter because 2 weights per data val
|
954 |
+
dfloat2 v;
|
955 |
+
dequantize_kernel(vx, ib, iqs + j/qr, v);
|
956 |
|
957 |
// matrix multiplication
|
|
|
|
|
958 |
// for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2
|
959 |
+
#ifdef GGML_CUDA_DMMV_F16
|
960 |
+
tmp += __hmul2(v, {
|
961 |
+
y[iybs + iqs + j/qr + 0],
|
962 |
+
y[iybs + iqs + j/qr + y_offset]
|
963 |
+
});
|
964 |
+
#else
|
965 |
+
tmp += v.x * y[iybs + iqs + j/qr + 0];
|
966 |
+
tmp += v.y * y[iybs + iqs + j/qr + y_offset];
|
967 |
+
#endif // GGML_CUDA_DMMV_F16
|
968 |
}
|
969 |
}
|
970 |
|
|
|
976 |
}
|
977 |
|
978 |
if (tid == 0) {
|
979 |
+
#ifdef GGML_CUDA_DMMV_F16
|
980 |
+
dst[row] = tmp.x + tmp.y;
|
981 |
+
#else
|
982 |
dst[row] = tmp;
|
983 |
+
#endif // GGML_CUDA_DMMV_F16
|
984 |
}
|
985 |
}
|
986 |
|
|
|
1275 |
dequantize_block_q6_K<<<nb, 64, 0, stream>>>(vx, y);
|
1276 |
}
|
1277 |
|
1278 |
+
static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
1279 |
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
|
1280 |
const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
|
1281 |
const dim3 block_nums(1, block_num_y, 1);
|
|
|
1284 |
<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
|
1285 |
}
|
1286 |
|
1287 |
+
static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
1288 |
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
|
1289 |
const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
|
1290 |
const dim3 block_nums(1, block_num_y, 1);
|
|
|
1293 |
<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
|
1294 |
}
|
1295 |
|
1296 |
+
static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
1297 |
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
|
1298 |
const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
|
1299 |
const dim3 block_nums(1, block_num_y, 1);
|
|
|
1302 |
<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
|
1303 |
}
|
1304 |
|
1305 |
+
static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
1306 |
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
|
1307 |
const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
|
1308 |
const dim3 block_nums(1, block_num_y, 1);
|
|
|
1311 |
<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
|
1312 |
}
|
1313 |
|
1314 |
+
static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
1315 |
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
|
1316 |
const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
|
1317 |
const dim3 block_nums(1, block_num_y, 1);
|
|
|
1322 |
|
1323 |
static void dequantize_mul_mat_vec_q2_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
1324 |
GGML_ASSERT(ncols % QK_K == 0);
|
1325 |
+
const int ny = 2; // very slightly faster than 1 even when K_QUANTS_PER_ITERATION = 2
|
1326 |
const int block_num_y = (nrows + ny - 1) / ny;
|
1327 |
const dim3 block_nums(1, block_num_y, 1);
|
1328 |
const dim3 block_dims(32, ny, 1);
|
|
|
1331 |
|
1332 |
static void dequantize_mul_mat_vec_q3_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
1333 |
GGML_ASSERT(ncols % QK_K == 0);
|
1334 |
+
const int ny = 2 / K_QUANTS_PER_ITERATION;
|
1335 |
+
const int block_num_y = (nrows + ny - 1) / ny;
|
1336 |
+
const dim3 block_nums(1, block_num_y, 1);
|
1337 |
+
const dim3 block_dims(32, ny, 1);
|
1338 |
+
dequantize_mul_mat_vec_q3_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
|
1339 |
}
|
1340 |
|
1341 |
static void dequantize_mul_mat_vec_q4_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
1342 |
GGML_ASSERT(ncols % QK_K == 0);
|
1343 |
+
const int ny = 2 / K_QUANTS_PER_ITERATION;
|
1344 |
+
const int block_num_y = (nrows + ny - 1) / ny;
|
1345 |
+
const dim3 block_nums(1, block_num_y, 1);
|
1346 |
+
const dim3 block_dims(32, ny, 1);
|
1347 |
+
dequantize_mul_mat_vec_q4_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
|
1348 |
}
|
1349 |
|
1350 |
static void dequantize_mul_mat_vec_q5_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
|
|
1367 |
dequantize_block<1, 1, convert_f16><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
|
1368 |
}
|
1369 |
|
1370 |
+
static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
1371 |
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
|
1372 |
const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
|
1373 |
const dim3 block_nums(1, block_num_y, 1);
|
|
|
1535 |
static size_t g_scratch_size = 1024*1024*1024; // 1 GB by default
|
1536 |
static size_t g_scratch_offset = 0;
|
1537 |
|
|
|
|
|
|
|
1538 |
static int g_device_count = -1;
|
1539 |
static int g_main_device = 0;
|
1540 |
static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
|
1541 |
|
1542 |
static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
|
1543 |
|
1544 |
+
static cudaStream_t g_cudaStreams_main[GGML_CUDA_MAX_DEVICES] = { nullptr };
|
|
|
|
|
|
|
1545 |
|
1546 |
void ggml_init_cublas() {
|
1547 |
static bool initialized = false;
|
|
|
1565 |
for (int id = 0; id < g_device_count; ++id) {
|
1566 |
CUDA_CHECK(cudaSetDevice(id));
|
1567 |
|
1568 |
+
// create main stream
|
1569 |
+
CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams_main[id], cudaStreamNonBlocking));
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1570 |
|
1571 |
// create cublas handle
|
1572 |
CUBLAS_CHECK(cublasCreate(&g_cublas_handles[id]));
|
|
|
1782 |
const int64_t ne00 = src0->ne[0];
|
1783 |
const int64_t nrows = i01_high - i01_low;
|
1784 |
|
1785 |
+
// on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
|
1786 |
+
#ifdef GGML_CUDA_DMMV_F16
|
1787 |
+
size_t ash;
|
1788 |
+
dfloat * src1_dfloat = nullptr; // dfloat == half
|
1789 |
+
|
1790 |
+
bool src1_convert_f16 = src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
|
1791 |
+
src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 ||
|
1792 |
+
src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;
|
1793 |
+
|
1794 |
+
if (src1_convert_f16) {
|
1795 |
+
src1_dfloat = (half *) ggml_cuda_pool_malloc(ne00*sizeof(half), &ash);
|
1796 |
+
ggml_cpy_f32_f16_cuda((char *) src1_ddf_i, (char *) src1_dfloat, ne00,
|
1797 |
+
ne00, 1, sizeof(float), 0, 0,
|
1798 |
+
ne00, 1, sizeof(half), 0, 0, cudaStream_main);
|
1799 |
+
}
|
1800 |
+
#else
|
1801 |
+
dfloat * src1_dfloat = src1_ddf_i; // dfloat == float, no conversion
|
1802 |
+
#endif // GGML_CUDA_DMMV_F16
|
1803 |
+
|
1804 |
switch (src0->type) {
|
1805 |
case GGML_TYPE_Q4_0:
|
1806 |
+
dequantize_mul_mat_vec_q4_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
|
1807 |
break;
|
1808 |
case GGML_TYPE_Q4_1:
|
1809 |
+
dequantize_mul_mat_vec_q4_1_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
|
1810 |
break;
|
1811 |
case GGML_TYPE_Q5_0:
|
1812 |
+
dequantize_mul_mat_vec_q5_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
|
1813 |
break;
|
1814 |
case GGML_TYPE_Q5_1:
|
1815 |
+
dequantize_mul_mat_vec_q5_1_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
|
1816 |
break;
|
1817 |
case GGML_TYPE_Q8_0:
|
1818 |
+
dequantize_mul_mat_vec_q8_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
|
1819 |
break;
|
1820 |
case GGML_TYPE_Q2_K:
|
1821 |
dequantize_mul_mat_vec_q2_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
|
|
|
1833 |
dequantize_mul_mat_vec_q6_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
|
1834 |
break;
|
1835 |
case GGML_TYPE_F16:
|
1836 |
+
convert_mul_mat_vec_f16_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
|
1837 |
break;
|
1838 |
default:
|
1839 |
GGML_ASSERT(false);
|
|
|
1841 |
}
|
1842 |
CUDA_CHECK(cudaGetLastError());
|
1843 |
|
1844 |
+
#ifdef GGML_CUDA_DMMV_F16
|
1845 |
+
if (src1_convert_f16) {
|
1846 |
+
ggml_cuda_pool_free(src1_dfloat, ash);
|
1847 |
+
}
|
1848 |
+
#endif // GGML_CUDA_DMMV_F16
|
1849 |
+
|
1850 |
(void) src1;
|
1851 |
(void) dst;
|
1852 |
(void) src0_ddf_i;
|
|
|
2058 |
size_t src1_asf[GGML_CUDA_MAX_DEVICES] = {0};
|
2059 |
size_t dst_asf[GGML_CUDA_MAX_DEVICES] = {0};
|
2060 |
|
2061 |
+
// if multiple GPUs are used they need to wait for the main GPU to finish
|
2062 |
+
if (split && g_device_count > 1) {
|
2063 |
+
CUDA_CHECK(cudaSetDevice(g_main_device));
|
2064 |
+
CUDA_CHECK(cudaDeviceSynchronize());
|
2065 |
+
}
|
2066 |
+
|
2067 |
for (int id = 0; id < g_device_count; ++id) {
|
2068 |
if (!split && id != g_main_device) {
|
2069 |
continue;
|
|
|
2162 |
}
|
2163 |
const int64_t i11 = i13*ne12 + i12;
|
2164 |
|
2165 |
+
cudaStream_t cudaStream_main = g_cudaStreams_main[id];
|
|
|
|
|
2166 |
|
2167 |
// for split tensors the data begins at i0 == i0_offset_low
|
2168 |
char * src0_ddq_i = src0_ddq[id] + (i0 - i0_offset_low)*src0_stride*src0_ts/src0_bs;
|
|
|
2190 |
if (src1->backend == GGML_BACKEND_CPU) {
|
2191 |
GGML_ASSERT(!flatten_rows || nrows0 == ggml_nrows(src1));
|
2192 |
int64_t nrows1 = flatten_rows ? nrows0 : ne11;
|
2193 |
+
CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf_i, src1, i03, i02, 0, nrows1, cudaStream_main));
|
2194 |
} else if (src1->backend == GGML_BACKEND_GPU && src1_is_contiguous) {
|
2195 |
if (id != g_main_device) {
|
2196 |
GGML_ASSERT(!flatten_rows);
|
2197 |
float * src1_ddf_i_source = (float *) src1_extra->data_device[g_main_device];
|
2198 |
src1_ddf_i_source += i11*src1_stride;
|
2199 |
CUDA_CHECK(cudaMemcpyAsync(src1_ddf_i, src1_ddf_i_source, src1_stride*sizeof(float),
|
2200 |
+
cudaMemcpyDeviceToDevice, cudaStream_main));
|
2201 |
}
|
2202 |
} else if (src1_on_device && !src1_is_contiguous) {
|
2203 |
GGML_ASSERT(!split);
|
|
|
2206 |
GGML_ASSERT(false);
|
2207 |
}
|
2208 |
}
|
|
|
2209 |
|
2210 |
if (!src0_on_device || !src0_is_contiguous) {
|
2211 |
if (src0_is_f32) {
|
|
|
2221 |
CUDA_CHECK(cudaGetLastError());
|
2222 |
}
|
2223 |
|
|
|
|
|
|
|
2224 |
// do the computation
|
2225 |
op(src0, src1, dst, src0_ddq_i, src0_ddf_i, src1_ddf_i, dst_ddf_i, i02, i01_low, i01_high, i11, cudaStream_main);
|
2226 |
|
|
|
2258 |
|
2259 |
// wait until each device is finished, then free their buffers
|
2260 |
for (int id = 0; id < g_device_count; ++id) {
|
2261 |
+
if (src0_asq[id] == 0 && src0_asf[id] == 0 && src1_asf[id] == 0 && dst_asf[id] == 0) {
|
2262 |
+
continue;
|
2263 |
+
}
|
2264 |
+
|
2265 |
CUDA_CHECK(cudaSetDevice(id));
|
2266 |
CUDA_CHECK(cudaDeviceSynchronize());
|
2267 |
+
|
2268 |
if (src0_asq[id] > 0) {
|
2269 |
ggml_cuda_pool_free(src0_ddq[id], src0_asq[id]);
|
2270 |
}
|
|
|
2330 |
const int64_t ne02 = src0->ne[2];
|
2331 |
|
2332 |
CUDA_CHECK(cudaSetDevice(g_main_device));
|
2333 |
+
cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device];
|
2334 |
|
2335 |
struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
|
2336 |
void * src0_ddq = src0_extra->data_device[g_main_device];
|
|
|
2342 |
float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
|
2343 |
|
2344 |
ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, cudaStream_main);
|
|
|
|
|
2345 |
}
|
2346 |
|
2347 |
void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
|
|
|
2359 |
const int64_t nb02 = src0->nb[2];
|
2360 |
|
2361 |
CUDA_CHECK(cudaSetDevice(g_main_device));
|
2362 |
+
cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device];
|
2363 |
|
2364 |
struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
|
2365 |
void * src0_ddq = src0_extra->data_device[g_main_device];
|
|
|
2374 |
const int channel_stride_x = nb02 / sizeof(half);
|
2375 |
|
2376 |
ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, channel_stride_x, cudaStream_main);
|
|
|
|
|
2377 |
}
|
2378 |
|
2379 |
void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
|
2429 |
const int64_t nb12 = src1->nb[2];
|
2430 |
|
2431 |
CUDA_CHECK(cudaSetDevice(g_main_device));
|
2432 |
+
cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device];
|
2433 |
|
2434 |
const struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
|
2435 |
const struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
|
|
|
2447 |
GGML_ASSERT(false);
|
2448 |
}
|
2449 |
|
|
|
|
|
2450 |
(void) dst;
|
2451 |
}
|
2452 |
|
|
|
2635 |
bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor){
|
2636 |
ggml_cuda_func_t func;
|
2637 |
const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
|
2638 |
+
|| (tensor->src0 != nullptr && (tensor->src0->backend == GGML_BACKEND_GPU || tensor->src0->backend == GGML_BACKEND_GPU_SPLIT))
|
2639 |
|| (tensor->src1 != nullptr && tensor->src1->backend == GGML_BACKEND_GPU);
|
2640 |
|
2641 |
switch (tensor->op) {
|
ggml-metal.h
CHANGED
@@ -41,12 +41,15 @@ void ggml_metal_free(struct ggml_metal_context * ctx);
|
|
41 |
// - make sure to map all buffers used in the graph before calling ggml_metal_graph_compute
|
42 |
// - the mapping is used during computation to determine the arguments of the compute kernels
|
43 |
// - you don't need to keep the host memory buffer allocated as it is never accessed by Metal
|
|
|
|
|
44 |
//
|
45 |
bool ggml_metal_add_buffer(
|
46 |
struct ggml_metal_context * ctx,
|
47 |
const char * name,
|
48 |
void * data,
|
49 |
-
size_t size
|
|
|
50 |
|
51 |
// set data from host memory into the device
|
52 |
void ggml_metal_set_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
|
|
|
41 |
// - make sure to map all buffers used in the graph before calling ggml_metal_graph_compute
|
42 |
// - the mapping is used during computation to determine the arguments of the compute kernels
|
43 |
// - you don't need to keep the host memory buffer allocated as it is never accessed by Metal
|
44 |
+
// - max_size specifies the maximum size of a tensor and is used to create shared views such
|
45 |
+
// that it is guaranteed that the tensor will fit in at least one of the views
|
46 |
//
|
47 |
bool ggml_metal_add_buffer(
|
48 |
struct ggml_metal_context * ctx,
|
49 |
const char * name,
|
50 |
void * data,
|
51 |
+
size_t size,
|
52 |
+
size_t max_size);
|
53 |
|
54 |
// set data from host memory into the device
|
55 |
void ggml_metal_set_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
|
ggml-metal.m
CHANGED
@@ -183,6 +183,14 @@ struct ggml_metal_context * ggml_metal_init(void) {
|
|
183 |
#undef GGML_METAL_ADD_KERNEL
|
184 |
}
|
185 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
186 |
return ctx;
|
187 |
}
|
188 |
|
@@ -199,10 +207,13 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
|
|
199 |
static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_metal_context * ctx, struct ggml_tensor * t, size_t * offs) {
|
200 |
//fprintf(stderr, "%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach);
|
201 |
|
|
|
|
|
|
|
202 |
for (int i = 0; i < ctx->n_buffers; ++i) {
|
203 |
const int64_t ioffs = (int64_t) t->data - (int64_t) ctx->buffers[i].data;
|
204 |
|
205 |
-
if (ioffs >= 0 && ioffs
|
206 |
*offs = (size_t) ioffs;
|
207 |
|
208 |
//fprintf(stderr, "%s: '%s' tensor '%16s', offs = %8ld\n", __func__, ctx->buffers[i].name, t->name, *offs);
|
@@ -220,7 +231,8 @@ bool ggml_metal_add_buffer(
|
|
220 |
struct ggml_metal_context * ctx,
|
221 |
const char * name,
|
222 |
void * data,
|
223 |
-
size_t size
|
|
|
224 |
if (ctx->n_buffers >= GGML_METAL_MAX_BUFFERS) {
|
225 |
fprintf(stderr, "%s: too many buffers\n", __func__);
|
226 |
return false;
|
@@ -237,30 +249,68 @@ bool ggml_metal_add_buffer(
|
|
237 |
}
|
238 |
}
|
239 |
|
240 |
-
size_t
|
241 |
-
|
242 |
-
|
243 |
-
|
|
|
244 |
}
|
245 |
|
246 |
-
|
247 |
-
ctx->
|
248 |
-
|
|
|
|
|
249 |
|
250 |
-
|
251 |
-
|
252 |
-
|
253 |
-
|
254 |
-
|
|
|
255 |
|
256 |
-
|
257 |
-
|
258 |
-
|
259 |
} else {
|
260 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
261 |
}
|
262 |
|
263 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
264 |
}
|
265 |
|
266 |
return true;
|
@@ -765,18 +815,23 @@ void ggml_metal_graph_compute(
|
|
765 |
} break;
|
766 |
case GGML_OP_ALIBI:
|
767 |
{
|
|
|
|
|
|
|
|
|
768 |
GGML_ASSERT((src0t == GGML_TYPE_F32));
|
769 |
-
|
|
|
770 |
const int n_head = ((int32_t *) src1->data)[1];
|
771 |
const float max_bias = ((float *) src1->data)[2];
|
|
|
772 |
if (__builtin_popcount(n_head) != 1) {
|
773 |
GGML_ASSERT(false && "only power-of-two n_head implemented");
|
774 |
}
|
|
|
775 |
const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
|
776 |
const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
|
777 |
-
|
778 |
-
encoder = [command_buffer computeCommandEncoder];
|
779 |
-
}
|
780 |
[encoder setComputePipelineState:ctx->pipeline_alibi_f32];
|
781 |
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
782 |
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
@@ -904,4 +959,14 @@ void ggml_metal_graph_compute(
|
|
904 |
dispatch_barrier_sync(queue, ^{});
|
905 |
|
906 |
[command_buffers[n_cb - 1] waitUntilCompleted];
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
907 |
}
|
|
|
183 |
#undef GGML_METAL_ADD_KERNEL
|
184 |
}
|
185 |
|
186 |
+
fprintf(stderr, "%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
|
187 |
+
fprintf(stderr, "%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
|
188 |
+
if (ctx->device.maxTransferRate != 0) {
|
189 |
+
fprintf(stderr, "%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0);
|
190 |
+
} else {
|
191 |
+
fprintf(stderr, "%s: maxTransferRate = built-in GPU\n", __func__);
|
192 |
+
}
|
193 |
+
|
194 |
return ctx;
|
195 |
}
|
196 |
|
|
|
207 |
static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_metal_context * ctx, struct ggml_tensor * t, size_t * offs) {
|
208 |
//fprintf(stderr, "%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach);
|
209 |
|
210 |
+
const int64_t tsize = ggml_nbytes(t);
|
211 |
+
|
212 |
+
// find the view that contains the tensor fully
|
213 |
for (int i = 0; i < ctx->n_buffers; ++i) {
|
214 |
const int64_t ioffs = (int64_t) t->data - (int64_t) ctx->buffers[i].data;
|
215 |
|
216 |
+
if (ioffs >= 0 && ioffs + tsize <= (int64_t) ctx->buffers[i].size) {
|
217 |
*offs = (size_t) ioffs;
|
218 |
|
219 |
//fprintf(stderr, "%s: '%s' tensor '%16s', offs = %8ld\n", __func__, ctx->buffers[i].name, t->name, *offs);
|
|
|
231 |
struct ggml_metal_context * ctx,
|
232 |
const char * name,
|
233 |
void * data,
|
234 |
+
size_t size,
|
235 |
+
size_t max_size) {
|
236 |
if (ctx->n_buffers >= GGML_METAL_MAX_BUFFERS) {
|
237 |
fprintf(stderr, "%s: too many buffers\n", __func__);
|
238 |
return false;
|
|
|
249 |
}
|
250 |
}
|
251 |
|
252 |
+
const size_t size_page = getpagesize();
|
253 |
+
|
254 |
+
size_t size_aligned = size;
|
255 |
+
if ((size_aligned % size_page) != 0) {
|
256 |
+
size_aligned += (size_page - (size_aligned % size_page));
|
257 |
}
|
258 |
|
259 |
+
// the buffer fits into the max buffer size allowed by the device
|
260 |
+
if (size_aligned <= ctx->device.maxBufferLength) {
|
261 |
+
ctx->buffers[ctx->n_buffers].name = name;
|
262 |
+
ctx->buffers[ctx->n_buffers].data = data;
|
263 |
+
ctx->buffers[ctx->n_buffers].size = size;
|
264 |
|
265 |
+
ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil];
|
266 |
+
|
267 |
+
if (ctx->buffers[ctx->n_buffers].metal == nil) {
|
268 |
+
fprintf(stderr, "%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_aligned / 1024.0 / 1024.0);
|
269 |
+
return false;
|
270 |
+
}
|
271 |
|
272 |
+
fprintf(stderr, "%s: allocated '%-16s' buffer, size = %8.2f MB", __func__, name, size_aligned / 1024.0 / 1024.0);
|
273 |
+
|
274 |
+
++ctx->n_buffers;
|
275 |
} else {
|
276 |
+
// this overlap between the views will guarantee that the tensor with the maximum size will fully fit into
|
277 |
+
// one of the views
|
278 |
+
const size_t size_ovlp = ((max_size + size_page - 1) / size_page + 1) * size_page; // round-up 2 pages just in case
|
279 |
+
const size_t size_step = ctx->device.maxBufferLength - size_ovlp;
|
280 |
+
const size_t size_view = ctx->device.maxBufferLength;
|
281 |
+
|
282 |
+
for (size_t i = 0; i < size; i += size_step) {
|
283 |
+
const size_t size_step_aligned = (i + size_view <= size) ? size_view : (size_aligned - i);
|
284 |
+
|
285 |
+
ctx->buffers[ctx->n_buffers].name = name;
|
286 |
+
ctx->buffers[ctx->n_buffers].data = (void *) ((uint8_t *) data + i);
|
287 |
+
ctx->buffers[ctx->n_buffers].size = size_step_aligned;
|
288 |
+
|
289 |
+
ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil];
|
290 |
+
|
291 |
+
if (ctx->buffers[ctx->n_buffers].metal == nil) {
|
292 |
+
fprintf(stderr, "%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_step_aligned / 1024.0 / 1024.0);
|
293 |
+
return false;
|
294 |
+
}
|
295 |
+
|
296 |
+
fprintf(stderr, "%s: allocated '%-16s' buffer, size = %8.2f MB, offs = %12ld", __func__, name, size_step_aligned / 1024.0 / 1024.0, i);
|
297 |
+
if (i + size_step < size) {
|
298 |
+
fprintf(stderr, "\n");
|
299 |
+
}
|
300 |
+
|
301 |
+
++ctx->n_buffers;
|
302 |
+
}
|
303 |
}
|
304 |
|
305 |
+
fprintf(stderr, ", (%8.2f / %8.2f)",
|
306 |
+
ctx->device.currentAllocatedSize / 1024.0 / 1024.0,
|
307 |
+
ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
|
308 |
+
|
309 |
+
if (ctx->device.currentAllocatedSize > ctx->device.recommendedMaxWorkingSetSize) {
|
310 |
+
fprintf(stderr, ", warning: current allocated size is greater than the recommended max working set size\n");
|
311 |
+
} else {
|
312 |
+
fprintf(stderr, "\n");
|
313 |
+
}
|
314 |
}
|
315 |
|
316 |
return true;
|
|
|
815 |
} break;
|
816 |
case GGML_OP_ALIBI:
|
817 |
{
|
818 |
+
if (encoder == nil) {
|
819 |
+
encoder = [command_buffer computeCommandEncoder];
|
820 |
+
}
|
821 |
+
|
822 |
GGML_ASSERT((src0t == GGML_TYPE_F32));
|
823 |
+
|
824 |
+
const int n_past = ((int32_t *) src1->data)[0]; UNUSED(n_past);
|
825 |
const int n_head = ((int32_t *) src1->data)[1];
|
826 |
const float max_bias = ((float *) src1->data)[2];
|
827 |
+
|
828 |
if (__builtin_popcount(n_head) != 1) {
|
829 |
GGML_ASSERT(false && "only power-of-two n_head implemented");
|
830 |
}
|
831 |
+
|
832 |
const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
|
833 |
const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
|
834 |
+
|
|
|
|
|
835 |
[encoder setComputePipelineState:ctx->pipeline_alibi_f32];
|
836 |
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
837 |
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
|
|
959 |
dispatch_barrier_sync(queue, ^{});
|
960 |
|
961 |
[command_buffers[n_cb - 1] waitUntilCompleted];
|
962 |
+
|
963 |
+
// check status of command buffers
|
964 |
+
// needed to detect if the device ran out-of-memory for example (#1881)
|
965 |
+
for (int i = 0; i < n_cb; i++) {
|
966 |
+
MTLCommandBufferStatus status = (MTLCommandBufferStatus) [command_buffers[i] status];
|
967 |
+
if (status != MTLCommandBufferStatusCompleted) {
|
968 |
+
fprintf(stderr, "%s: command buffer %d failed with status %lu\n", __func__, i, status);
|
969 |
+
GGML_ASSERT(false);
|
970 |
+
}
|
971 |
+
}
|
972 |
}
|
ggml-opencl.cpp
CHANGED
@@ -16,13 +16,25 @@
|
|
16 |
|
17 |
#include "ggml.h"
|
18 |
|
|
|
|
|
|
|
|
|
19 |
#define CL_DMMV_BLOCK_SIZE 32
|
20 |
|
|
|
|
|
|
|
|
|
|
|
|
|
21 |
#define MULTILINE_QUOTE(...) #__VA_ARGS__
|
22 |
static std::string program_source = MULTILINE_QUOTE(
|
23 |
|
24 |
typedef char int8_t;
|
25 |
typedef uchar uint8_t;
|
|
|
|
|
26 |
typedef int int32_t;
|
27 |
typedef uint uint32_t;
|
28 |
|
@@ -172,7 +184,9 @@ void convert_f16(__global half* x, const int ib, const int iqs, float* v0, float
|
|
172 |
*v0 = vload_half(0, &x[ib + 0]);
|
173 |
*v1 = vload_half(0, &x[ib + 1]);
|
174 |
}
|
|
|
175 |
|
|
|
176 |
inline void get_scale_min_k4(int j, const __global uint8_t *q, uint8_t *d, uint8_t *m)
|
177 |
{
|
178 |
if (j < 4)
|
@@ -196,7 +210,7 @@ __kernel void dequantize_block_q2_K(__global const struct block_q2_K *x, __globa
|
|
196 |
const int is = 8 * n + l / 16;
|
197 |
|
198 |
const uint8_t q = x[i].qs[32 * n + l];
|
199 |
-
__global float *y = yy + i *
|
200 |
|
201 |
const float dall = vload_half(0, &x[i].d);
|
202 |
const float dmin = vload_half(0, &x[i].dmin);
|
@@ -228,7 +242,7 @@ __kernel void dequantize_block_q3_K(__global const struct block_q3_K *x, __globa
|
|
228 |
float d_all = vload_half(0, &x[i].d);
|
229 |
float dl = d_all * (us - 32);
|
230 |
|
231 |
-
__global float *y = yy + i *
|
232 |
const __global uint8_t *q = x[i].qs + 32 * n;
|
233 |
const __global uint8_t *hm = x[i].hmask;
|
234 |
|
@@ -245,7 +259,7 @@ __kernel void dequantize_block_q4_K(__global const struct block_q4_K *x, __globa
|
|
245 |
const int is = 2 * il;
|
246 |
const int n = 4;
|
247 |
|
248 |
-
__global float *y = yy + i *
|
249 |
|
250 |
const float dall = vload_half(0, &x[i].d);
|
251 |
const float dmin = vload_half(0, &x[i].dmin);
|
@@ -274,7 +288,7 @@ __kernel void dequantize_block_q5_K(__global const struct block_q5_K *x, __globa
|
|
274 |
const int ir = tid % 16;
|
275 |
const int is = 2 * il;
|
276 |
|
277 |
-
__global float *y = yy + i *
|
278 |
|
279 |
const float dall = vload_half(0, &x[i].d);
|
280 |
const float dmin = vload_half(0, &x[i].dmin);
|
@@ -306,7 +320,7 @@ __kernel void dequantize_block_q6_K(__global const struct block_q6_K *x, __globa
|
|
306 |
const int il = tid - 32 * ip;
|
307 |
const int is = 8 * ip + il / 16;
|
308 |
|
309 |
-
__global float *y = yy + i *
|
310 |
|
311 |
const float d = vload_half(0, &x[i].d);
|
312 |
|
@@ -320,161 +334,383 @@ __kernel void dequantize_block_q6_K(__global const struct block_q6_K *x, __globa
|
|
320 |
y[96] = d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32);
|
321 |
}
|
322 |
|
|
|
323 |
|
324 |
-
|
325 |
|
326 |
-
int
|
327 |
-
int
|
328 |
-
int l = r / 8;
|
329 |
|
330 |
-
__global const
|
331 |
-
__global const uint8_t *q = x[ib].qs + 32 * n + l;
|
332 |
-
__global const uint8_t *s = x[ib].scales + 8 * n;
|
333 |
|
334 |
-
const
|
335 |
-
const
|
336 |
|
337 |
-
|
338 |
-
+ y[ 32] * (dall * ((s[2] & 0xF) * ((q[ 0] >> 2) & 3)) - dmin * (s[2] >> 4))
|
339 |
-
+ y[ 64] * (dall * ((s[4] & 0xF) * ((q[ 0] >> 4) & 3)) - dmin * (s[4] >> 4))
|
340 |
-
+ y[ 96] * (dall * ((s[6] & 0xF) * ((q[ 0] >> 6) & 3)) - dmin * (s[6] >> 4))
|
341 |
-
+ y[ 16] * (dall * ((s[1] & 0xF) * ((q[16] >> 0) & 3)) - dmin * (s[1] >> 4))
|
342 |
-
+ y[ 48] * (dall * ((s[3] & 0xF) * ((q[16] >> 2) & 3)) - dmin * (s[3] >> 4))
|
343 |
-
+ y[ 80] * (dall * ((s[5] & 0xF) * ((q[16] >> 4) & 3)) - dmin * (s[5] >> 4))
|
344 |
-
+ y[112] * (dall * ((s[7] & 0xF) * ((q[16] >> 6) & 3)) - dmin * (s[7] >> 4));
|
345 |
|
346 |
-
|
347 |
-
|
348 |
|
349 |
-
|
|
|
|
|
|
|
350 |
|
351 |
-
|
352 |
-
const uint32_t kmask2 = 0x0f0f0f0f;
|
353 |
|
354 |
-
uint32_t aux[
|
355 |
-
|
|
|
356 |
|
357 |
-
int
|
358 |
-
int r = iqs - 128*n;
|
359 |
-
int l = r/8;
|
360 |
|
361 |
-
|
362 |
-
|
363 |
-
__global const uint8_t * hm = x[ib].hmask + l;
|
364 |
-
const int8_t * s = (const int8_t *)utmp + 8*n;
|
365 |
|
366 |
-
|
367 |
-
|
368 |
-
aux[2] = x[ib].scales[8] | x[ib].scales[9] << 8 | x[ib].scales[10] << 16 | x[ib].scales[11] << 24;
|
369 |
|
370 |
-
|
371 |
-
|
372 |
-
|
373 |
-
|
|
|
374 |
|
375 |
-
|
376 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
377 |
|
378 |
-
|
379 |
-
|
380 |
-
+ y[ 64] * (s[4] - 32) * (((q[ 0] >> 4) & 3) - (hm[ 0] & (m << 2) ? 0 : 4))
|
381 |
-
+ y[ 96] * (s[6] - 32) * (((q[ 0] >> 6) & 3) - (hm[ 0] & (m << 3) ? 0 : 4))
|
382 |
-
+ y[ 16] * (s[1] - 32) * (((q[16] >> 0) & 3) - (hm[16] & (m << 0) ? 0 : 4))
|
383 |
-
+ y[ 48] * (s[3] - 32) * (((q[16] >> 2) & 3) - (hm[16] & (m << 1) ? 0 : 4))
|
384 |
-
+ y[ 80] * (s[5] - 32) * (((q[16] >> 4) & 3) - (hm[16] & (m << 2) ? 0 : 4))
|
385 |
-
+ y[112] * (s[7] - 32) * (((q[16] >> 6) & 3) - (hm[16] & (m << 3) ? 0 : 4));
|
386 |
|
387 |
-
|
388 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
389 |
}
|
390 |
|
391 |
-
void
|
|
|
|
|
392 |
|
393 |
-
const int
|
394 |
-
const int ir = (iqs - 64*j)/2; // ir is in 0...28 in steps of 4
|
395 |
-
const int is = 2*j; // is is in 0...6 in steps of 2
|
396 |
|
397 |
-
|
398 |
-
|
399 |
|
400 |
-
const
|
401 |
-
const float dmin = vload_half(0, &x[ib].dmin);
|
402 |
|
403 |
-
|
404 |
-
|
405 |
-
|
406 |
-
const
|
407 |
-
|
408 |
-
const
|
409 |
-
const
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
410 |
|
411 |
-
float sum = 0;
|
412 |
-
for (int k = 0; k < 4; ++k) {
|
413 |
-
sum += y[k + 0] * (d1 * (q[k] & 0xF) - m1);
|
414 |
-
sum += y[k + 32] * (d2 * (q[k] >> 4) - m2);
|
415 |
}
|
416 |
|
417 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
418 |
}
|
419 |
|
420 |
-
void
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
421 |
|
422 |
-
const int
|
423 |
-
const int
|
424 |
-
const int is = 2*j;
|
425 |
|
426 |
-
|
427 |
-
__global const uint8_t * ql = x[ib].qs + 32*j + ir;
|
428 |
-
__global const uint8_t * qh = x[ib].qh + ir;
|
429 |
|
430 |
-
const
|
431 |
-
const
|
|
|
432 |
|
433 |
-
|
434 |
-
|
435 |
-
|
436 |
-
const
|
437 |
-
|
438 |
-
const
|
439 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
440 |
|
441 |
-
uint8_t hm = 1 << is;
|
442 |
-
float sum = 0;
|
443 |
-
for (int k = 0; k < 4; ++k) {
|
444 |
-
sum += y[k + 0] * (d1 * ((ql[k] & 0xF) + (qh[k] & hm ? 16 : 0)) - m1);
|
445 |
-
}
|
446 |
-
hm <<= 1;
|
447 |
-
for (int k = 0; k < 4; ++k) {
|
448 |
-
sum += y[k + 32] * (d2 * ((ql[k] >> 4) + (qh[k] & hm ? 16 : 0)) - m2);
|
449 |
}
|
450 |
-
*result = sum;
|
451 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
452 |
}
|
453 |
|
454 |
-
void
|
455 |
|
|
|
|
|
|
|
456 |
|
457 |
-
const int
|
458 |
-
const int
|
459 |
-
const int
|
460 |
|
461 |
-
|
|
|
462 |
|
463 |
-
const
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
464 |
|
465 |
-
|
466 |
-
|
467 |
-
|
|
|
|
|
468 |
|
469 |
-
|
470 |
-
|
471 |
-
+ y[ 64] * d * sc[4] * ((int8_t)((ql[ 0] >> 4) | (((qh[ 0] >> 4) & 3) << 4)) - 32)
|
472 |
-
+ y[ 96] * d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh[ 0] >> 6) & 3) << 4)) - 32)
|
473 |
-
+ y[ 16] * d * sc[1] * ((int8_t)((ql[16] & 0xF) | (((qh[16] >> 0) & 3) << 4)) - 32)
|
474 |
-
+ y[ 48] * d * sc[3] * ((int8_t)((ql[48] & 0xF) | (((qh[16] >> 2) & 3) << 4)) - 32)
|
475 |
-
+ y[ 80] * d * sc[5] * ((int8_t)((ql[16] >> 4) | (((qh[16] >> 4) & 3) << 4)) - 32)
|
476 |
-
+ y[112] * d * sc[7] * ((int8_t)((ql[48] >> 4) | (((qh[16] >> 6) & 3) << 4)) - 32);
|
477 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
478 |
}
|
479 |
|
480 |
);
|
@@ -546,44 +782,6 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float
|
|
546 |
}
|
547 |
);
|
548 |
|
549 |
-
std::string dequant_mul_mat_vec_k_template = MULTILINE_QUOTE(
|
550 |
-
__kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float* y, __global float* dst, const int ncols) {
|
551 |
-
const int block_size = get_local_size(0);
|
552 |
-
const int row = get_group_id(0);
|
553 |
-
const int tid = get_local_id(0);
|
554 |
-
|
555 |
-
const int iter_stride = 256;
|
556 |
-
const int vals_per_iter = iter_stride / block_size;
|
557 |
-
const int num_blocks_per_row = ncols / 256;
|
558 |
-
const int ib0 = row*num_blocks_per_row;
|
559 |
-
|
560 |
-
tmp[tid] = 0;
|
561 |
-
|
562 |
-
for (int i = 0; i < ncols; i += iter_stride) {
|
563 |
-
const int col = i + vals_per_iter*tid;
|
564 |
-
const int ib = ib0 + col/256; // x block index
|
565 |
-
const int iqs = col%256; // x quant index
|
566 |
-
const int iybs = col - col%256; // y block start index
|
567 |
-
|
568 |
-
// dequantize
|
569 |
-
float v;
|
570 |
-
DOT_KERNEL(x, ib, iqs, y + iybs, &v);
|
571 |
-
tmp[tid] += v;
|
572 |
-
}
|
573 |
-
|
574 |
-
// sum up partial sums and write back result
|
575 |
-
barrier(CLK_LOCAL_MEM_FENCE);
|
576 |
-
for (int s=block_size/2; s>0; s>>=1) {
|
577 |
-
if (tid < s) {
|
578 |
-
tmp[tid] += tmp[tid + s];
|
579 |
-
}
|
580 |
-
barrier(CLK_LOCAL_MEM_FENCE);
|
581 |
-
}
|
582 |
-
if (tid == 0) {
|
583 |
-
dst[row] = tmp[0];
|
584 |
-
}
|
585 |
-
}
|
586 |
-
);
|
587 |
|
588 |
std::string mul_template = MULTILINE_QUOTE(
|
589 |
__kernel void KERNEL_NAME(__global TYPE* x, const int x_offset, __global TYPE* y, const int y_offset, __global TYPE* dst, const int dst_offset, const int ky) {
|
@@ -648,18 +846,6 @@ std::array<std::string, 2> mul_str_values = {
|
|
648 |
"mul_f32", "float"
|
649 |
};
|
650 |
|
651 |
-
std::array<std::string, 3> dmmv_k_str_keys = {
|
652 |
-
"KERNEL_NAME", "X_TYPE", "DOT_KERNEL"
|
653 |
-
};
|
654 |
-
|
655 |
-
std::array<std::string, 15> dmmv_k_str_values = {
|
656 |
-
"dequantize_mul_mat_vec_q2_K", "struct block_q2_K", "vec_dot_q2_K",
|
657 |
-
"dequantize_mul_mat_vec_q3_K", "struct block_q3_K", "vec_dot_q3_K",
|
658 |
-
"dequantize_mul_mat_vec_q4_K", "struct block_q4_K", "vec_dot_q4_K",
|
659 |
-
"dequantize_mul_mat_vec_q5_K", "struct block_q5_K", "vec_dot_q5_K",
|
660 |
-
"dequantize_mul_mat_vec_q6_K", "struct block_q6_K", "vec_dot_q6_K",
|
661 |
-
};
|
662 |
-
|
663 |
std::string& replace(std::string& s, const std::string& from, const std::string& to) {
|
664 |
size_t pos = 0;
|
665 |
while ((pos = s.find(from, pos)) != std::string::npos) {
|
@@ -672,6 +858,7 @@ std::string& replace(std::string& s, const std::string& from, const std::string&
|
|
672 |
std::string generate_kernels() {
|
673 |
std::stringstream src;
|
674 |
src << program_source << '\n';
|
|
|
675 |
for (size_t i = 0; i < dequant_str_values.size(); i += dequant_str_keys.size()) {
|
676 |
std::string dequant_kernel = dequant_template;
|
677 |
std::string dmmv_kernel = dequant_mul_mat_vec_template;
|
@@ -689,13 +876,6 @@ std::string generate_kernels() {
|
|
689 |
}
|
690 |
src << mul_kernel << '\n';
|
691 |
}
|
692 |
-
for (size_t i = 0; i < dmmv_k_str_values.size(); i += dmmv_k_str_keys.size()) {
|
693 |
-
std::string dmmv_k_kernel = dequant_mul_mat_vec_k_template;
|
694 |
-
for (size_t j = 0; j < dmmv_k_str_keys.size(); j++) {
|
695 |
-
replace(dmmv_k_kernel, dmmv_k_str_keys[j], dmmv_k_str_values[i + j]);
|
696 |
-
}
|
697 |
-
src << dmmv_k_kernel << '\n';
|
698 |
-
}
|
699 |
|
700 |
return src.str();
|
701 |
}
|
@@ -728,10 +908,11 @@ static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, co
|
|
728 |
exit(1);
|
729 |
}
|
730 |
|
731 |
-
|
732 |
-
"-DQK4_0=32 -DQR4_0=2 -DQK4_1=32 -DQR4_1=2 -DQK5_0=32 -DQR5_0=2 -DQK5_1=32 -DQR5_1=2 -DQK8_0=32 -DQR8_0=1"
|
|
|
733 |
|
734 |
-
err = clBuildProgram(p, 0, NULL, compile_opts, NULL, NULL);
|
735 |
if(err < 0) {
|
736 |
|
737 |
clGetProgramBuildInfo(p, dev, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
|
|
|
16 |
|
17 |
#include "ggml.h"
|
18 |
|
19 |
+
#if defined(_MSC_VER)
|
20 |
+
#pragma warning(disable: 4244 4267) // possible loss of data
|
21 |
+
#endif
|
22 |
+
|
23 |
#define CL_DMMV_BLOCK_SIZE 32
|
24 |
|
25 |
+
#ifndef K_QUANTS_PER_ITERATION
|
26 |
+
#define K_QUANTS_PER_ITERATION 1
|
27 |
+
#else
|
28 |
+
static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2");
|
29 |
+
#endif
|
30 |
+
|
31 |
#define MULTILINE_QUOTE(...) #__VA_ARGS__
|
32 |
static std::string program_source = MULTILINE_QUOTE(
|
33 |
|
34 |
typedef char int8_t;
|
35 |
typedef uchar uint8_t;
|
36 |
+
typedef short int16_t;
|
37 |
+
typedef ushort uint16_t;
|
38 |
typedef int int32_t;
|
39 |
typedef uint uint32_t;
|
40 |
|
|
|
184 |
*v0 = vload_half(0, &x[ib + 0]);
|
185 |
*v1 = vload_half(0, &x[ib + 1]);
|
186 |
}
|
187 |
+
);
|
188 |
|
189 |
+
static std::string k_quants_source = MULTILINE_QUOTE(
|
190 |
inline void get_scale_min_k4(int j, const __global uint8_t *q, uint8_t *d, uint8_t *m)
|
191 |
{
|
192 |
if (j < 4)
|
|
|
210 |
const int is = 8 * n + l / 16;
|
211 |
|
212 |
const uint8_t q = x[i].qs[32 * n + l];
|
213 |
+
__global float *y = yy + i * QK_K + 128 * n;
|
214 |
|
215 |
const float dall = vload_half(0, &x[i].d);
|
216 |
const float dmin = vload_half(0, &x[i].dmin);
|
|
|
242 |
float d_all = vload_half(0, &x[i].d);
|
243 |
float dl = d_all * (us - 32);
|
244 |
|
245 |
+
__global float *y = yy + i * QK_K + 128 * n + 32 * j;
|
246 |
const __global uint8_t *q = x[i].qs + 32 * n;
|
247 |
const __global uint8_t *hm = x[i].hmask;
|
248 |
|
|
|
259 |
const int is = 2 * il;
|
260 |
const int n = 4;
|
261 |
|
262 |
+
__global float *y = yy + i * QK_K + 64 * il + n * ir;
|
263 |
|
264 |
const float dall = vload_half(0, &x[i].d);
|
265 |
const float dmin = vload_half(0, &x[i].dmin);
|
|
|
288 |
const int ir = tid % 16;
|
289 |
const int is = 2 * il;
|
290 |
|
291 |
+
__global float *y = yy + i * QK_K + 64 * il + 2 * ir;
|
292 |
|
293 |
const float dall = vload_half(0, &x[i].d);
|
294 |
const float dmin = vload_half(0, &x[i].dmin);
|
|
|
320 |
const int il = tid - 32 * ip;
|
321 |
const int is = 8 * ip + il / 16;
|
322 |
|
323 |
+
__global float *y = yy + i * QK_K + 128 * ip + il;
|
324 |
|
325 |
const float d = vload_half(0, &x[i].d);
|
326 |
|
|
|
334 |
y[96] = d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32);
|
335 |
}
|
336 |
|
337 |
+
// Fused dequantize + matrix-vector product for Q2_K quantized rows.
//
// One work-group handles one output row (row = get_group_id(0)) and writes
// a single float to dst[row].
//
//   xx    - quantized matrix, ncols / QK_K blocks per row
//   tmp   - local scratch for the per-work-item partial sums; indices 0..31
//           are used, so it must hold at least 32 floats
//   yy    - dense input vector (ncols floats)
//   dst   - output vector, one float per row
//   ncols - number of columns; assumed to be a multiple of QK_K (TODO confirm
//           against the host-side launcher)
//
// NOTE(review): the reduction below assumes a work-group of exactly 32
// work-items (CL_DMMV_BLOCK_SIZE in this file) - confirm at the launch site.
__kernel void dequantize_mul_mat_vec_q2_K(__global const struct block_q2_K * xx, __local float* tmp, __global float* yy, __global float* dst, const int ncols) {

    const int row = get_group_id(0);

    const int num_blocks_per_row = ncols / QK_K;
    const int ib0 = row*num_blocks_per_row;

    __global const struct block_q2_K * x = xx + ib0;

    const int lid = get_local_id(0);              // raw local id, used for the reduction
    const int tid = lid/K_QUANTS_PER_ITERATION;   // 0...31 or 0...15
    const int ix  = lid%K_QUANTS_PER_ITERATION;   // 0 or 0,1

    const int step = 16/K_QUANTS_PER_ITERATION;

    const int im = tid/step;                      // 0 or 1. 0 computes 0..., 1 computes 128...
    const int in = tid - step*im;                 // 0...15 or 0...7

    const int l0 = K_QUANTS_PER_ITERATION*in;     // 0...15 or 0...14 in steps of 2
    const int q_offset = 32*im + l0;
    const int s_offset = 8*im;
    const int y_offset = 128*im + l0;

    // every work-item owns exactly one slot in 0..31
    tmp[16 * ix + tid] = 0;

    // aux[0..1] receive the low nibbles of 8 scale bytes (read through d),
    // aux[2..3] receive the high nibbles of the same bytes (read through m)
    uint32_t aux[4];
    const uint8_t * d = (const uint8_t *)aux;
    const uint8_t * m = (const uint8_t *)(aux + 2);

    // work-items with the same ix walk the blocks of the row with stride K_QUANTS_PER_ITERATION
    for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {

        __global const float   * y = yy + i * QK_K + y_offset;
        __global const uint8_t * q = x[i].qs + q_offset;

        const float dall = vload_half(0, &x[i].d);
        const float dmin = vload_half(0, &x[i].dmin);

        __global const uint32_t * a = (__global const uint32_t *)(x[i].scales + s_offset);
        aux[0] = a[0] & 0x0f0f0f0f;
        aux[1] = a[1] & 0x0f0f0f0f;
        aux[2] = (a[0] >> 4) & 0x0f0f0f0f;
        aux[3] = (a[1] >> 4) & 0x0f0f0f0f;

        // sum1 accumulates the part scaled by dall, sum2 the part scaled by dmin
        float sum1 = 0, sum2 = 0;
        for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
            sum1 += y[l+  0] * d[0] * ((q[l+ 0] >> 0) & 3)
                  + y[l+ 32] * d[2] * ((q[l+ 0] >> 2) & 3)
                  + y[l+ 64] * d[4] * ((q[l+ 0] >> 4) & 3)
                  + y[l+ 96] * d[6] * ((q[l+ 0] >> 6) & 3)
                  + y[l+ 16] * d[1] * ((q[l+16] >> 0) & 3)
                  + y[l+ 48] * d[3] * ((q[l+16] >> 2) & 3)
                  + y[l+ 80] * d[5] * ((q[l+16] >> 4) & 3)
                  + y[l+112] * d[7] * ((q[l+16] >> 6) & 3);
            sum2 += y[l+  0] * m[0] + y[l+32] * m[2] + y[l+64] * m[4] + y[l+ 96] * m[6]
                  + y[l+ 16] * m[1] + y[l+48] * m[3] + y[l+80] * m[5] + y[l+112] * m[7];

        }
        tmp[16 * ix + tid] += dall * sum1 - dmin * sum2;

    }

    // sum up partial sums and write back result
    //
    // The reduction is indexed by the raw local id rather than by tid: with
    // K_QUANTS_PER_ITERATION == 2 two work-items share the same tid and would
    // both update tmp[tid] without synchronization (data race). With
    // K_QUANTS_PER_ITERATION == 1, lid == tid and nothing changes.
    barrier(CLK_LOCAL_MEM_FENCE);
    for (int stride = 16; stride > 0; stride >>= 1) {
        if (lid < stride) {
            tmp[lid] += tmp[lid + stride];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if (lid == 0) {
        dst[row] = tmp[0];
    }
}
|
409 |
|
410 |
+
// Fused dequantize + matrix-vector product for Q3_K quantized rows.
//
// One work-group handles one output row (row = get_group_id(0)) and writes
// a single float to dst[row].
//
//   xx    - quantized matrix, ncols / QK_K blocks per row
//   tmp   - local scratch for the per-work-item partial sums; indices 0..31
//           are used, so it must hold at least 32 floats
//   yy    - dense input vector (ncols floats)
//   dst   - output vector, one float per row
//   ncols - number of columns; assumed to be a multiple of QK_K (TODO confirm
//           against the host-side launcher)
//
// NOTE(review): the reduction below assumes a work-group of exactly 32
// work-items (CL_DMMV_BLOCK_SIZE in this file) - confirm at the launch site.
__kernel void dequantize_mul_mat_vec_q3_K(__global const struct block_q3_K * xx, __local float* tmp, __global float* yy, __global float* dst, const int ncols) {
    const uint16_t kmask1 = 0x0303;
    const uint16_t kmask2 = 0x0f0f;

    const int row = get_group_id(0);

    const int num_blocks_per_row = ncols / QK_K;
    const int ib0 = row*num_blocks_per_row;

    __global const struct block_q3_K * x = xx + ib0;

    const int lid = get_local_id(0);              // raw local id, used for the reduction
    const int tid = lid/K_QUANTS_PER_ITERATION;   // 0...31 or 0...15
    const int ix  = lid%K_QUANTS_PER_ITERATION;   // 0 or 0,1

    const int n    = K_QUANTS_PER_ITERATION;      // iterations in the inner loop
    const int step = 16/K_QUANTS_PER_ITERATION;
    const int im   = tid/step;                    // 0 or 1. 0 computes 0..., 1 computes 128...
    const int in   = tid - step*im;               // 0...15 or 0...7

    // lowest high-bit mask bit for this half of the block (bit 0 or bit 4)
    const uint8_t m = 1 << (4*im);

    const int l0 = n*in;                          // 0...15 or 0...14 in steps of 2
    const int q_offset =  32*im + l0;
    const int y_offset = 128*im + l0;

    // the 8 unpacked scale bytes for this half are rebuilt into utmp and read through s
    uint16_t utmp[4];
    const int8_t * s = (const int8_t *)utmp;

    const uint16_t s_shift = 4*im;

    // every work-item owns exactly one slot in 0..31
    tmp[16 * ix + tid] = 0;

    // work-items with the same ix walk the blocks of the row with stride K_QUANTS_PER_ITERATION
    for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {

        __global const float   * y = yy + i * QK_K + y_offset;
        __global const uint8_t * q = x[i].qs + q_offset;
        __global const uint8_t * h = x[i].hmask + l0;

        // low 4 bits come from a[0..3], the upper 2 bits from a[4..5]
        __global const uint16_t * a = (__global const uint16_t *)x[i].scales;
        utmp[0] = ((a[0] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 0)) & kmask1) << 4);
        utmp[1] = ((a[1] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 0)) & kmask1) << 4);
        utmp[2] = ((a[2] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 2)) & kmask1) << 4);
        utmp[3] = ((a[3] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 2)) & kmask1) << 4);

        const float d = vload_half(0, &x[i].d);

        float sum = 0;
        for (int l = 0; l < n; ++l) {
            sum += y[l+  0] * (s[0] - 32) * (((q[l] >> 0) & 3) - (h[l] & (m << 0) ? 0 : 4))
                 + y[l+ 32] * (s[2] - 32) * (((q[l] >> 2) & 3) - (h[l] & (m << 1) ? 0 : 4))
                 + y[l+ 64] * (s[4] - 32) * (((q[l] >> 4) & 3) - (h[l] & (m << 2) ? 0 : 4))
                 + y[l+ 96] * (s[6] - 32) * (((q[l] >> 6) & 3) - (h[l] & (m << 3) ? 0 : 4));
            sum += y[l+ 16] * (s[1] - 32) * (((q[l+16] >> 0) & 3) - (h[l+16] & (m << 0) ? 0 : 4))
                 + y[l+ 48] * (s[3] - 32) * (((q[l+16] >> 2) & 3) - (h[l+16] & (m << 1) ? 0 : 4))
                 + y[l+ 80] * (s[5] - 32) * (((q[l+16] >> 4) & 3) - (h[l+16] & (m << 2) ? 0 : 4))
                 + y[l+112] * (s[7] - 32) * (((q[l+16] >> 6) & 3) - (h[l+16] & (m << 3) ? 0 : 4));
        }
        tmp[16 * ix + tid] += d * sum;

    }

    // sum up partial sums and write back result
    //
    // The reduction is indexed by the raw local id rather than by tid: with
    // K_QUANTS_PER_ITERATION == 2 two work-items share the same tid and would
    // both update tmp[tid] without synchronization (data race). With
    // K_QUANTS_PER_ITERATION == 1, lid == tid and nothing changes.
    barrier(CLK_LOCAL_MEM_FENCE);
    for (int stride = 16; stride > 0; stride >>= 1) {
        if (lid < stride) {
            tmp[lid] += tmp[lid + stride];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if (lid == 0) {
        dst[row] = tmp[0];
    }
}
|
483 |
|
484 |
+
// Fused dequantize + matrix-vector product for Q4_K quantized rows.
//
// One work-group handles one output row (row = get_group_id(0)) and writes
// a single float to dst[row].
//
//   xx    - quantized matrix, ncols / QK_K blocks per row
//   tmp   - local scratch for the per-work-item partial sums; indices 0..31
//           are used, so it must hold at least 32 floats
//   yy    - dense input vector (ncols floats)
//   dst   - output vector, one float per row
//   ncols - number of columns; assumed to be a multiple of QK_K (TODO confirm
//           against the host-side launcher)
//
// NOTE(review): the reduction below assumes a work-group of exactly 32
// work-items (CL_DMMV_BLOCK_SIZE in this file) - confirm at the launch site.
__kernel void dequantize_mul_mat_vec_q4_K(__global const struct block_q4_K * xx, __local float* tmp, __global float* yy, __global float* dst, const int ncols) {

    // TODO: rename these masks later (names are provisional, kept from testing)
    const uint16_t kmask1 = 0x3f3f;
    const uint16_t kmask2 = 0x0f0f;
    const uint16_t kmask3 = 0xc0c0;

    const int row = get_group_id(0);
    const int num_blocks_per_row = ncols / QK_K;
    const int ib0 = row*num_blocks_per_row;

    const int lid = get_local_id(0);              // raw local id, used for the reduction
    const int tid = lid/K_QUANTS_PER_ITERATION;   // 0...31 or 0...15
    const int ix  = lid%K_QUANTS_PER_ITERATION;   // 0 or 0,1

    const int step = 8/K_QUANTS_PER_ITERATION;

    const int il = tid/step;                      // 0...3
    const int ir = tid - step*il;                 // 0...7 or 0...3
    const int n  = 2*K_QUANTS_PER_ITERATION;      // iterations in the inner loop

    const int im = il/2;  // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
    const int in = il%2;

    const int l0 = n*(2*ir + in);
    const int q_offset = 32*im + l0;
    const int y_offset = 64*im + l0;

    // 8 bytes unpacked from the block scales, read byte-wise through sc:
    // sc[0], sc[1], sc[4], sc[5] multiply the dall term,
    // sc[2], sc[3], sc[6], sc[7] multiply the dmin term
    uint16_t aux[4];
    const uint8_t * sc = (const uint8_t *)aux;

    __global const struct block_q4_K * x = xx + ib0;

    // every work-item owns exactly one slot in 0..31
    tmp[16 * ix + tid] = 0;

    // work-items with the same ix walk the blocks of the row with stride K_QUANTS_PER_ITERATION
    for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {

        __global const uint8_t * q1 = x[i].qs + q_offset;
        __global const uint8_t * q2 = q1 + 64;
        __global const float   * y1 = yy + i*QK_K + y_offset;
        __global const float   * y2 = y1 + 128;

        const float dall = vload_half(0, &x[i].d);
        const float dmin = vload_half(0, &x[i].dmin);

        __global const uint16_t * a = (__global const uint16_t *)x[i].scales;
        aux[0] = a[im+0] & kmask1;
        aux[1] = a[im+2] & kmask1;
        aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
        aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);

        // s.x/s.y: low/high nibbles of q1, s.z/s.w: low/high nibbles of q2
        float4 s = (float4)(0.f);
        float smin = 0;
        for (int l = 0; l < n; ++l) {
            s.x += y1[l] * (q1[l] & 0xF); s.y += y1[l+32] * (q1[l] >> 4);
            s.z += y2[l] * (q2[l] & 0xF); s.w += y2[l+32] * (q2[l] >> 4);
            smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
        }
        tmp[16 * ix + tid] += dall * (s.x * sc[0] + s.y * sc[1] + s.z * sc[4] + s.w * sc[5]) - dmin * smin;

    }

    // sum up partial sums and write back result
    //
    // The reduction is indexed by the raw local id rather than by tid: with
    // K_QUANTS_PER_ITERATION == 2 two work-items share the same tid and would
    // both update tmp[tid] without synchronization (data race). With
    // K_QUANTS_PER_ITERATION == 1, lid == tid and nothing changes.
    barrier(CLK_LOCAL_MEM_FENCE);
    for (int stride = 16; stride > 0; stride >>= 1) {
        if (lid < stride) {
            tmp[lid] += tmp[lid + stride];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if (lid == 0) {
        dst[row] = tmp[0];
    }
}
|
557 |
|
558 |
+
// Fused dequantize + matrix-vector product for Q5_K blocks.
// Each work-group produces one output element: dst[row], with row = get_group_id(0).
//
//   xx    - quantized matrix, ncols/QK_K blocks per row
//   tmp   - local scratch for the partial sums; slots 0..31 are used below, so the
//           host presumably launches 32 work-items per group with 32 floats of
//           local memory (TODO confirm against the host-side launch code)
//   yy    - dense input vector (only read by this kernel)
//   dst   - output vector, one float per work-group
//   ncols - number of elements per matrix row
__kernel void dequantize_mul_mat_vec_q5_K(__global const struct block_q5_K * xx, __local float* tmp, __global float* yy, __global float* dst, const int ncols) {

    // masks used to unpack the 6-bit scale/min values stored in x[i].scales
    const uint16_t kmask1 = 0x3f3f;
    const uint16_t kmask2 = 0x0f0f;
    const uint16_t kmask3 = 0xc0c0;

    const int row = get_group_id(0);
    const int num_blocks_per_row = ncols / QK_K;
    const int ib0 = row*num_blocks_per_row;

    const int lid = get_local_id(0); // unique per work-item, used for the final reduction
    const int tid = lid/2;  // 0...15 (for a 32 work-item group)
    const int ix  = lid%2;  // 0 handles even blocks of the row, 1 handles odd blocks

    const int il = tid/4;     // 0...3
    const int ir = tid - 4*il;// 0...3
    const int n  = 2;         // consecutive quants handled per position by one work-item

    const int im = il/2;  // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
    const int in = il%2;

    const int l0 = n*(2*ir + in);
    const int q_offset = 32*im + l0;
    const int y_offset = 64*im + l0;

    // bit masks selecting, inside a qh byte, the extra high bit (worth 16) of each quant
    const uint8_t hm1 = 1 << (2*im);
    const uint8_t hm2 = hm1 << 4;

    // aux holds the unpacked scales/mins; sc views the same storage byte by byte
    uint16_t aux[4];
    const uint8_t * sc = (const uint8_t *)aux;

    __global const struct block_q5_K * x = xx + ib0;

    tmp[16 * ix + tid] = 0;

    for (int i = ix; i < num_blocks_per_row; i += 2) {

        __global const uint8_t * ql1 = x[i].qs + q_offset;
        __global const uint8_t * ql2 = ql1 + 64;
        __global const uint8_t * qh  = x[i].qh + l0;
        __global const float   * y1  = yy + i*QK_K + y_offset;
        __global const float   * y2  = y1 + 128;

        const float dall = vload_half(0, &x[i].d);
        const float dmin = vload_half(0, &x[i].dmin);

        // unpack scales (sc[0], sc[1], sc[4], sc[5]) and mins (sc[2], sc[3], sc[6], sc[7])
        __global const uint16_t * a = (__global const uint16_t *)x[i].scales;
        aux[0] = a[im+0] & kmask1;
        aux[1] = a[im+2] & kmask1;
        aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
        aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);

        float4 sum = (float4)(0.f);
        float smin = 0;
        for (int l = 0; l < n; ++l) {
            // quant value = low/high nibble of ql plus 16 when the matching qh bit is set
            sum.x += y1[l+ 0] * ((ql1[l+ 0] & 0xF) + (qh[l+ 0] & (hm1 << 0) ? 16 : 0))
                   + y1[l+16] * ((ql1[l+16] & 0xF) + (qh[l+16] & (hm1 << 0) ? 16 : 0));
            sum.y += y1[l+32] * ((ql1[l+ 0] >>  4) + (qh[l+ 0] & (hm1 << 1) ? 16 : 0))
                   + y1[l+48] * ((ql1[l+16] >>  4) + (qh[l+16] & (hm1 << 1) ? 16 : 0));
            sum.z += y2[l+ 0] * ((ql2[l+ 0] & 0xF) + (qh[l+ 0] & (hm2 << 0) ? 16 : 0))
                   + y2[l+16] * ((ql2[l+16] & 0xF) + (qh[l+16] & (hm2 << 0) ? 16 : 0));
            sum.w += y2[l+32] * ((ql2[l+ 0] >>  4) + (qh[l+ 0] & (hm2 << 1) ? 16 : 0))
                   + y2[l+48] * ((ql2[l+16] >>  4) + (qh[l+16] & (hm2 << 1) ? 16 : 0));
            // the min term only needs the sum of the y values of each group
            smin += (y1[l] + y1[l+16]) * sc[2] + (y1[l+32] + y1[l+48]) * sc[3]
                  + (y2[l] + y2[l+16]) * sc[6] + (y2[l+32] + y2[l+48]) * sc[7];
        }
        tmp[16 * ix + tid] += dall * (sum.x * sc[0] + sum.y * sc[1] + sum.z * sc[4] + sum.w * sc[5]) - dmin * smin;

    }

    // sum up partial sums and write back result
    // The reduction is indexed by the unique local id rather than by tid: two
    // work-items share every tid, and letting both update tmp[tid] between the
    // same pair of barriers is a data race (the value is added twice whenever
    // the two work-items do not execute in lockstep).
    barrier(CLK_LOCAL_MEM_FENCE);
    for (int s=16; s>0; s>>=1) {
        if (lid < s) {
            tmp[lid] += tmp[lid + s];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if (lid == 0) {
        dst[row] = tmp[0];
    }
}
|
639 |
+
|
640 |
+
// Fused dequantize + matrix-vector product for Q6_K blocks.
// Each work-group produces one output element: dst[row], with row = get_group_id(0).
//
//   xx    - quantized matrix, ncols/QK_K blocks per row
//   tmp   - local scratch for the partial sums; slots 0..31 are used below, so the
//           host presumably launches 32 work-items per group with 32 floats of
//           local memory (TODO confirm against the host-side launch code)
//   yy    - dense input vector
//   dst   - output vector, one float per work-group
//   ncols - number of elements per matrix row
//
// K_QUANTS_PER_ITERATION (a build-time define, 1 or 2 per the comments below) is the
// number of work-items that share one position and split the blocks of a row.
__kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx, __local float* tmp, __global const float * yy, __global float * dst, const int ncols) {

    const int row = get_group_id(0);

    const int num_blocks_per_row = ncols / QK_K;
    const int ib0 = row*num_blocks_per_row;

    __global const struct block_q6_K * x = xx + ib0;

    const int lid = get_local_id(0); // unique per work-item, used for the final reduction
    const int tid = lid/K_QUANTS_PER_ITERATION;  // 0...31 or 0...15
    const int ix  = lid%K_QUANTS_PER_ITERATION;  // 0 or 0, 1

    const int step = 16/K_QUANTS_PER_ITERATION; // 16 or 8

    const int im = tid/step;       // 0 or 1. 0 computes 0..., 1 computes 128...
    const int in = tid - step*im;  // 0...15 or 0...7

#if K_QUANTS_PER_ITERATION == 1
    const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15
    const int is = 0;
#else
    const int l0 = 4 * in; // 0, 4, 8, ..., 28
    const int is = in / 4;
#endif
    const int ql_offset = 64*im + l0;
    const int qh_offset = 32*im + l0;
    const int s_offset  =  8*im + is;
    const int y_offset  = 128*im + l0;

    tmp[16 * ix + tid] = 0; // partial sum for thread in warp

    for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {

        __global const float   * y  = yy + i * QK_K + y_offset;
        __global const uint8_t * ql = x[i].ql + ql_offset;
        __global const uint8_t * qh = x[i].qh + qh_offset;
        __global const int8_t  * s  = x[i].scales + s_offset;

        const float d = vload_half(0, &x[i].d);

        // Each quant is rebuilt from a 4-bit nibble of ql plus 2 bits taken from qh
        // (placed at bit positions 4..5), then shifted by -32.
#if K_QUANTS_PER_ITERATION == 1
        float sum = y[ 0] * s[0] * d * ((int8_t)((ql[ 0] & 0xF) | ((qh[ 0] & 0x03) << 4)) - 32)
                  + y[16] * s[1] * d * ((int8_t)((ql[16] & 0xF) | ((qh[16] & 0x03) << 4)) - 32)
                  + y[32] * s[2] * d * ((int8_t)((ql[32] & 0xF) | ((qh[ 0] & 0x0c) << 2)) - 32)
                  + y[48] * s[3] * d * ((int8_t)((ql[48] & 0xF) | ((qh[16] & 0x0c) << 2)) - 32)
                  + y[64] * s[4] * d * ((int8_t)((ql[ 0]  >> 4) | ((qh[ 0] & 0x30) >> 0)) - 32)
                  + y[80] * s[5] * d * ((int8_t)((ql[16]  >> 4) | ((qh[16] & 0x30) >> 0)) - 32)
                  + y[96] * s[6] * d * ((int8_t)((ql[32]  >> 4) | ((qh[ 0] & 0xc0) >> 2)) - 32)
                  +y[112] * s[7] * d * ((int8_t)((ql[48]  >> 4) | ((qh[16] & 0xc0) >> 2)) - 32);
        tmp[16 * ix + tid] += sum;
#else
        float sum = 0;
        for (int l = 0; l < 4; ++l) {
            sum += y[l+ 0] * s[0] * d * ((int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32)
                 + y[l+32] * s[2] * d * ((int8_t)((ql[l+32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32)
                 + y[l+64] * s[4] * d * ((int8_t)((ql[l+ 0]  >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32)
                 + y[l+96] * s[6] * d * ((int8_t)((ql[l+32]  >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32);
        }
        tmp[16 * ix + tid] += sum;
#endif

    }

    // sum up partial sums and write back result
    // The reduction is indexed by the unique local id rather than by tid: when
    // K_QUANTS_PER_ITERATION is 2, two work-items share every tid, and letting both
    // update tmp[tid] between the same pair of barriers is a data race (the value
    // is added twice whenever the two work-items do not execute in lockstep).
    // For K_QUANTS_PER_ITERATION == 1, lid equals tid and nothing changes.
    barrier(CLK_LOCAL_MEM_FENCE);
    for (int s=16; s>0; s>>=1) {
        if (lid < s) {
            tmp[lid] += tmp[lid + s];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if (lid == 0) {
        dst[row] = tmp[0];
    }
}
|
715 |
|
716 |
);
|
|
|
782 |
}
|
783 |
);
|
784 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
785 |
|
786 |
std::string mul_template = MULTILINE_QUOTE(
|
787 |
__kernel void KERNEL_NAME(__global TYPE* x, const int x_offset, __global TYPE* y, const int y_offset, __global TYPE* dst, const int dst_offset, const int ky) {
|
|
|
846 |
"mul_f32", "float"
|
847 |
};
|
848 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
849 |
std::string& replace(std::string& s, const std::string& from, const std::string& to) {
|
850 |
size_t pos = 0;
|
851 |
while ((pos = s.find(from, pos)) != std::string::npos) {
|
|
|
858 |
std::string generate_kernels() {
|
859 |
std::stringstream src;
|
860 |
src << program_source << '\n';
|
861 |
+
src << k_quants_source << '\n';
|
862 |
for (size_t i = 0; i < dequant_str_values.size(); i += dequant_str_keys.size()) {
|
863 |
std::string dequant_kernel = dequant_template;
|
864 |
std::string dmmv_kernel = dequant_mul_mat_vec_template;
|
|
|
876 |
}
|
877 |
src << mul_kernel << '\n';
|
878 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
879 |
|
880 |
return src.str();
|
881 |
}
|
|
|
908 |
exit(1);
|
909 |
}
|
910 |
|
911 |
+
std::string compile_opts = "-cl-mad-enable -cl-unsafe-math-optimizations -cl-finite-math-only -cl-fast-relaxed-math "
|
912 |
+
"-DQK4_0=32 -DQR4_0=2 -DQK4_1=32 -DQR4_1=2 -DQK5_0=32 -DQR5_0=2 -DQK5_1=32 -DQR5_1=2 -DQK8_0=32 -DQR8_0=1 "
|
913 |
+
"-DQK_K=256 -DK_QUANTS_PER_ITERATION=" + std::to_string(K_QUANTS_PER_ITERATION);
|
914 |
|
915 |
+
err = clBuildProgram(p, 0, NULL, compile_opts.c_str(), NULL, NULL);
|
916 |
if(err < 0) {
|
917 |
|
918 |
clGetProgramBuildInfo(p, dev, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
|
ggml.c
CHANGED
@@ -24,6 +24,7 @@
|
|
24 |
#include <stdio.h>
|
25 |
#include <float.h>
|
26 |
#include <limits.h>
|
|
|
27 |
|
28 |
#ifdef GGML_USE_METAL
|
29 |
#include <unistd.h>
|
@@ -112,6 +113,7 @@ typedef void* thread_ret_t;
|
|
112 |
/*#define GGML_PERF*/
|
113 |
#define GGML_DEBUG 0
|
114 |
#define GGML_GELU_FP16
|
|
|
115 |
#define GGML_SILU_FP16
|
116 |
|
117 |
#define GGML_SOFT_MAX_UNROLL 4
|
@@ -340,6 +342,9 @@ static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
|
|
340 |
// precomputed gelu table for f16 (128 KB)
|
341 |
static ggml_fp16_t table_gelu_f16[1 << 16];
|
342 |
|
|
|
|
|
|
|
343 |
// precomputed silu table for f16 (128 KB)
|
344 |
static ggml_fp16_t table_silu_f16[1 << 16];
|
345 |
|
@@ -1677,14 +1682,17 @@ quantize_fns_t ggml_internal_get_quantize_fn(size_t i) {
|
|
1677 |
#define GGML_F32x4_REDUCE_ONE(x) vaddvq_f32(x)
|
1678 |
#define GGML_F32x4_REDUCE(res, x) \
|
1679 |
{ \
|
1680 |
-
|
1681 |
-
|
|
|
1682 |
} \
|
1683 |
-
|
1684 |
-
|
|
|
1685 |
} \
|
1686 |
-
|
1687 |
-
|
|
|
1688 |
} \
|
1689 |
res = GGML_F32x4_REDUCE_ONE(x[0]); \
|
1690 |
}
|
@@ -1715,14 +1723,17 @@ quantize_fns_t ggml_internal_get_quantize_fn(size_t i) {
|
|
1715 |
#define GGML_F16x8_MUL vmulq_f16
|
1716 |
#define GGML_F16x8_REDUCE(res, x) \
|
1717 |
{ \
|
1718 |
-
|
1719 |
-
|
|
|
1720 |
} \
|
1721 |
-
|
1722 |
-
|
|
|
1723 |
} \
|
1724 |
-
|
1725 |
-
|
|
|
1726 |
} \
|
1727 |
const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 (x[0])); \
|
1728 |
const float32x4_t t1 = vcvt_f32_f16(vget_high_f16(x[0])); \
|
@@ -1789,14 +1800,17 @@ quantize_fns_t ggml_internal_get_quantize_fn(size_t i) {
|
|
1789 |
#define GGML_F32x8_MUL _mm256_mul_ps
|
1790 |
#define GGML_F32x8_REDUCE(res, x) \
|
1791 |
{ \
|
1792 |
-
|
1793 |
-
|
|
|
1794 |
} \
|
1795 |
-
|
1796 |
-
|
|
|
1797 |
} \
|
1798 |
-
|
1799 |
-
|
|
|
1800 |
} \
|
1801 |
const __m128 t0 = _mm_add_ps(_mm256_castps256_ps128(x[0]), \
|
1802 |
_mm256_extractf128_ps(x[0], 1)); \
|
@@ -1886,14 +1900,17 @@ static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
|
|
1886 |
#define GGML_F32x4_MUL vec_mul
|
1887 |
#define GGML_F32x4_REDUCE(res, x) \
|
1888 |
{ \
|
1889 |
-
|
1890 |
-
|
|
|
1891 |
} \
|
1892 |
-
|
1893 |
-
|
|
|
1894 |
} \
|
1895 |
-
|
1896 |
-
|
|
|
1897 |
} \
|
1898 |
res = vec_extract(x[0], 0) + \
|
1899 |
vec_extract(x[0], 1) + \
|
@@ -1949,14 +1966,17 @@ static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
|
|
1949 |
#define GGML_F32x4_MUL wasm_f32x4_mul
|
1950 |
#define GGML_F32x4_REDUCE(res, x) \
|
1951 |
{ \
|
1952 |
-
|
1953 |
-
|
|
|
1954 |
} \
|
1955 |
-
|
1956 |
-
|
|
|
1957 |
} \
|
1958 |
-
|
1959 |
-
|
|
|
1960 |
} \
|
1961 |
res = wasm_f32x4_extract_lane(x[0], 0) + \
|
1962 |
wasm_f32x4_extract_lane(x[0], 1) + \
|
@@ -2011,14 +2031,17 @@ inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) {
|
|
2011 |
#define GGML_F16x4_MUL wasm_f32x4_mul
|
2012 |
#define GGML_F16x4_REDUCE(res, x) \
|
2013 |
{ \
|
2014 |
-
|
2015 |
-
|
|
|
2016 |
} \
|
2017 |
-
|
2018 |
-
|
|
|
2019 |
} \
|
2020 |
-
|
2021 |
-
|
|
|
2022 |
} \
|
2023 |
res = wasm_f32x4_extract_lane(x[0], 0) + \
|
2024 |
wasm_f32x4_extract_lane(x[0], 1) + \
|
@@ -2060,14 +2083,17 @@ inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) {
|
|
2060 |
#define GGML_F32x4_MUL _mm_mul_ps
|
2061 |
#define GGML_F32x4_REDUCE(res, x) \
|
2062 |
{ \
|
2063 |
-
|
2064 |
-
|
|
|
2065 |
} \
|
2066 |
-
|
2067 |
-
|
|
|
2068 |
} \
|
2069 |
-
|
2070 |
-
|
|
|
2071 |
} \
|
2072 |
const __m128 t0 = _mm_hadd_ps(x[0], x[0]); \
|
2073 |
res = _mm_cvtss_f32(_mm_hadd_ps(t0, t0)); \
|
@@ -3356,6 +3382,7 @@ inline static void ggml_vec_step_f32 (const int n, float * y, const float * x) {
|
|
3356 |
inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
|
3357 |
|
3358 |
static const float GELU_COEF_A = 0.044715f;
|
|
|
3359 |
static const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
|
3360 |
|
3361 |
inline static float ggml_gelu_f32(float x) {
|
@@ -3386,6 +3413,34 @@ inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
|
|
3386 |
}
|
3387 |
#endif
|
3388 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3389 |
// Sigmoid Linear Unit (SiLU) function
|
3390 |
inline static float ggml_silu_f32(float x) {
|
3391 |
return x/(1.0f + expf(-x));
|
@@ -3616,6 +3671,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
|
|
3616 |
"STEP",
|
3617 |
"RELU",
|
3618 |
"GELU",
|
|
|
3619 |
"SILU",
|
3620 |
"SILU_BACK",
|
3621 |
"NORM",
|
@@ -3644,12 +3700,15 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
|
|
3644 |
"ROPE_BACK",
|
3645 |
"ALIBI",
|
3646 |
"CLAMP",
|
3647 |
-
"
|
3648 |
-
"
|
|
|
3649 |
|
3650 |
"FLASH_ATTN",
|
3651 |
"FLASH_FF",
|
3652 |
"FLASH_ATTN_BACK",
|
|
|
|
|
3653 |
|
3654 |
"MAP_UNARY",
|
3655 |
"MAP_BINARY",
|
@@ -3658,7 +3717,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
|
|
3658 |
"CROSS_ENTROPY_LOSS_BACK",
|
3659 |
};
|
3660 |
|
3661 |
-
static_assert(GGML_OP_COUNT ==
|
3662 |
|
3663 |
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
3664 |
"none",
|
@@ -3684,6 +3743,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
|
3684 |
"step(x)",
|
3685 |
"relu(x)",
|
3686 |
"gelu(x)",
|
|
|
3687 |
"silu(x)",
|
3688 |
"silu_back(x)",
|
3689 |
"norm(x)",
|
@@ -3712,12 +3772,15 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
|
3712 |
"rope_back(x)",
|
3713 |
"alibi(x)",
|
3714 |
"clamp(x)",
|
3715 |
-
"
|
3716 |
-
"
|
|
|
3717 |
|
3718 |
"flash_attn(x)",
|
3719 |
"flash_ff(x)",
|
3720 |
"flash_attn_back(x)",
|
|
|
|
|
3721 |
|
3722 |
"f(x)",
|
3723 |
"f(x,y)",
|
@@ -3726,7 +3789,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
|
3726 |
"cross_entropy_loss_back(x,y)",
|
3727 |
};
|
3728 |
|
3729 |
-
static_assert(GGML_OP_COUNT ==
|
3730 |
|
3731 |
static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
|
3732 |
static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");
|
@@ -4017,7 +4080,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
|
|
4017 |
// initialize time system (required on Windows)
|
4018 |
ggml_time_init();
|
4019 |
|
4020 |
-
// initialize GELU, SILU and EXP F32 tables
|
4021 |
{
|
4022 |
const uint64_t t_start = ggml_time_us(); UNUSED(t_start);
|
4023 |
|
@@ -4027,13 +4090,14 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
|
|
4027 |
memcpy(&ii, &ui, sizeof(ii));
|
4028 |
const float f = table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(ii);
|
4029 |
table_gelu_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_f32(f));
|
|
|
4030 |
table_silu_f16[i] = GGML_FP32_TO_FP16(ggml_silu_f32(f));
|
4031 |
table_exp_f16[i] = GGML_FP32_TO_FP16(expf(f));
|
4032 |
}
|
4033 |
|
4034 |
const uint64_t t_end = ggml_time_us(); UNUSED(t_end);
|
4035 |
|
4036 |
-
GGML_PRINT_DEBUG("%s: GELU, SILU and EXP tables initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
|
4037 |
}
|
4038 |
|
4039 |
// initialize g_state
|
@@ -4154,14 +4218,34 @@ void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc) {
|
|
4154 |
ctx->no_alloc = no_alloc;
|
4155 |
}
|
4156 |
|
4157 |
-
void * ggml_get_mem_buffer(struct ggml_context * ctx) {
|
4158 |
return ctx->mem_buffer;
|
4159 |
}
|
4160 |
|
4161 |
-
size_t ggml_get_mem_size(struct ggml_context * ctx) {
|
4162 |
return ctx->mem_size;
|
4163 |
}
|
4164 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4165 |
// IMPORTANT:
|
4166 |
// when creating "opt" tensors, always save and load the scratch buffer
|
4167 |
// this is an error prone process, but it is necessary to support inplace
|
@@ -4645,15 +4729,25 @@ const char * ggml_get_name(const struct ggml_tensor * tensor) {
|
|
4645 |
return tensor->name;
|
4646 |
}
|
4647 |
|
4648 |
-
|
4649 |
strncpy(tensor->name, name, sizeof(tensor->name));
|
4650 |
tensor->name[sizeof(tensor->name) - 1] = '\0';
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4651 |
}
|
4652 |
|
4653 |
struct ggml_tensor * ggml_view_tensor(
|
4654 |
struct ggml_context * ctx,
|
4655 |
const struct ggml_tensor * src) {
|
4656 |
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, src->data);
|
|
|
4657 |
|
4658 |
result->nb[0] = src->nb[0];
|
4659 |
result->nb[1] = src->nb[1];
|
@@ -5426,6 +5520,40 @@ struct ggml_tensor * ggml_gelu_inplace(
|
|
5426 |
return ggml_gelu_impl(ctx, a, true);
|
5427 |
}
|
5428 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
5429 |
// ggml_silu
|
5430 |
|
5431 |
struct ggml_tensor * ggml_silu_impl(
|
@@ -5781,6 +5909,11 @@ struct ggml_tensor * ggml_cpy_impl(
|
|
5781 |
|
5782 |
// make a view of the destination
|
5783 |
struct ggml_tensor * result = ggml_view_tensor(ctx, b);
|
|
|
|
|
|
|
|
|
|
|
5784 |
|
5785 |
result->op = GGML_OP_CPY;
|
5786 |
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
@@ -5817,6 +5950,7 @@ struct ggml_tensor * ggml_cont_impl(
|
|
5817 |
}
|
5818 |
|
5819 |
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
|
|
|
5820 |
|
5821 |
result->op = GGML_OP_CONT;
|
5822 |
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
@@ -5860,6 +5994,7 @@ struct ggml_tensor * ggml_reshape(
|
|
5860 |
}
|
5861 |
|
5862 |
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, b->n_dims, b->ne, a->data);
|
|
|
5863 |
|
5864 |
result->op = GGML_OP_RESHAPE;
|
5865 |
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
@@ -5884,6 +6019,7 @@ struct ggml_tensor * ggml_reshape_1d(
|
|
5884 |
|
5885 |
const int64_t ne[1] = { ne0 };
|
5886 |
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, ne, a->data);
|
|
|
5887 |
|
5888 |
result->op = GGML_OP_RESHAPE;
|
5889 |
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
@@ -5909,6 +6045,7 @@ struct ggml_tensor * ggml_reshape_2d(
|
|
5909 |
|
5910 |
const int64_t ne[2] = { ne0, ne1 };
|
5911 |
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, a->data);
|
|
|
5912 |
|
5913 |
result->op = GGML_OP_RESHAPE;
|
5914 |
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
@@ -5935,6 +6072,7 @@ struct ggml_tensor * ggml_reshape_3d(
|
|
5935 |
|
5936 |
const int64_t ne[3] = { ne0, ne1, ne2 };
|
5937 |
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, a->data);
|
|
|
5938 |
|
5939 |
result->op = GGML_OP_RESHAPE;
|
5940 |
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
@@ -5963,6 +6101,7 @@ struct ggml_tensor * ggml_reshape_4d(
|
|
5963 |
|
5964 |
const int64_t ne[4] = { ne0, ne1, ne2, ne3 };
|
5965 |
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, a->data);
|
|
|
5966 |
|
5967 |
result->op = GGML_OP_RESHAPE;
|
5968 |
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
@@ -5987,10 +6126,12 @@ struct ggml_tensor * ggml_view_1d(
|
|
5987 |
}
|
5988 |
|
5989 |
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, &ne0, (char *) a->data + offset);
|
|
|
5990 |
|
5991 |
ggml_scratch_save(ctx);
|
5992 |
|
5993 |
struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
|
|
|
5994 |
memcpy(offs->data, &offset, 2*sizeof(int32_t));
|
5995 |
|
5996 |
ggml_scratch_load(ctx);
|
@@ -6023,10 +6164,12 @@ struct ggml_tensor * ggml_view_2d(
|
|
6023 |
const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, 1, 1 };
|
6024 |
|
6025 |
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, (char *) a->data + offset);
|
|
|
6026 |
|
6027 |
ggml_scratch_save(ctx);
|
6028 |
|
6029 |
struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
|
|
|
6030 |
memcpy(offs->data, &offset, 2*sizeof(int32_t));
|
6031 |
|
6032 |
ggml_scratch_load(ctx);
|
@@ -6065,10 +6208,12 @@ struct ggml_tensor * ggml_view_3d(
|
|
6065 |
const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, 1 };
|
6066 |
|
6067 |
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, (char *) a->data + offset);
|
|
|
6068 |
|
6069 |
ggml_scratch_save(ctx);
|
6070 |
|
6071 |
struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
|
|
|
6072 |
memcpy(offs->data, &offset, 2*sizeof(int32_t));
|
6073 |
|
6074 |
ggml_scratch_load(ctx);
|
@@ -6109,10 +6254,12 @@ struct ggml_tensor * ggml_view_4d(
|
|
6109 |
const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, ne3 };
|
6110 |
|
6111 |
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, (char *) a->data + offset);
|
|
|
6112 |
|
6113 |
ggml_scratch_save(ctx);
|
6114 |
|
6115 |
struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
|
|
|
6116 |
memcpy(offs->data, &offset, 2*sizeof(int32_t));
|
6117 |
|
6118 |
ggml_scratch_load(ctx);
|
@@ -6158,6 +6305,7 @@ struct ggml_tensor * ggml_permute(
|
|
6158 |
}
|
6159 |
|
6160 |
struct ggml_tensor * result = ggml_view_tensor(ctx, a);
|
|
|
6161 |
|
6162 |
int ne[GGML_MAX_DIMS];
|
6163 |
int nb[GGML_MAX_DIMS];
|
@@ -6217,6 +6365,7 @@ struct ggml_tensor * ggml_transpose(
|
|
6217 |
}
|
6218 |
|
6219 |
struct ggml_tensor * result = ggml_view_tensor(ctx, a);
|
|
|
6220 |
|
6221 |
result->ne[0] = a->ne[1];
|
6222 |
result->ne[1] = a->ne[0];
|
@@ -6625,7 +6774,7 @@ struct ggml_tensor * ggml_clamp(
|
|
6625 |
|
6626 |
ggml_scratch_save(ctx);
|
6627 |
|
6628 |
-
struct ggml_tensor * b = ggml_new_tensor_1d(ctx,
|
6629 |
|
6630 |
((float *) b->data)[0] = min;
|
6631 |
((float *) b->data)[1] = max;
|
@@ -6640,9 +6789,9 @@ struct ggml_tensor * ggml_clamp(
|
|
6640 |
return result;
|
6641 |
}
|
6642 |
|
6643 |
-
//
|
6644 |
|
6645 |
-
struct ggml_tensor *
|
6646 |
struct ggml_context * ctx,
|
6647 |
struct ggml_tensor * a,
|
6648 |
struct ggml_tensor * b) {
|
@@ -6659,7 +6808,7 @@ struct ggml_tensor * ggml_conv_1d_1s(
|
|
6659 |
const int64_t ne[4] = { b->ne[0], a->ne[2], 1, 1, };
|
6660 |
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
|
6661 |
|
6662 |
-
result->op =
|
6663 |
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
6664 |
result->src0 = a;
|
6665 |
result->src1 = b;
|
@@ -6667,9 +6816,9 @@ struct ggml_tensor * ggml_conv_1d_1s(
|
|
6667 |
return result;
|
6668 |
}
|
6669 |
|
6670 |
-
//
|
6671 |
|
6672 |
-
struct ggml_tensor *
|
6673 |
struct ggml_context * ctx,
|
6674 |
struct ggml_tensor * a,
|
6675 |
struct ggml_tensor * b) {
|
@@ -6686,7 +6835,35 @@ struct ggml_tensor * ggml_conv_1d_2s(
|
|
6686 |
const int64_t ne[4] = { b->ne[0]/2, a->ne[2], 1, 1, };
|
6687 |
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
|
6688 |
|
6689 |
-
result->op =
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
6690 |
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
6691 |
result->src0 = a;
|
6692 |
result->src1 = b;
|
@@ -6820,6 +6997,89 @@ struct ggml_tensor * ggml_flash_attn_back(
|
|
6820 |
return result;
|
6821 |
}
|
6822 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
6823 |
|
6824 |
// ggml_map_unary
|
6825 |
|
@@ -7898,7 +8158,7 @@ static void ggml_compute_forward_add_q_f32(
|
|
7898 |
|
7899 |
void * src0_row = (void *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03));
|
7900 |
float * src1_row = (float *)((char *) src1->data + (i11*nb11 + i12*nb12 + i13*nb13));
|
7901 |
-
void * dst_row = (void *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*
|
7902 |
|
7903 |
assert(ne00 % 32 == 0);
|
7904 |
|
@@ -9459,8 +9719,65 @@ static void ggml_compute_forward_gelu(
|
|
9459 |
GGML_ASSERT(false);
|
9460 |
} break;
|
9461 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
9462 |
|
9463 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
9464 |
}
|
9465 |
|
9466 |
// ggml_compute_forward_silu
|
@@ -10858,7 +11175,7 @@ static void ggml_compute_forward_set_f32(
|
|
10858 |
const int im2 = (ne12 == 0 ? 0 : ne12-1);
|
10859 |
const int im3 = (ne13 == 0 ? 0 : ne13-1);
|
10860 |
|
10861 |
-
GGML_ASSERT(offset + im0*nb0 + im1*nb1 + im2*nb2 + im3*nb3
|
10862 |
|
10863 |
GGML_ASSERT(nb10 == sizeof(float));
|
10864 |
|
@@ -11579,8 +11896,9 @@ static void ggml_compute_forward_alibi_f32(
|
|
11579 |
const struct ggml_tensor * src1,
|
11580 |
struct ggml_tensor * dst) {
|
11581 |
assert(params->ith == 0);
|
11582 |
-
|
11583 |
-
|
|
|
11584 |
|
11585 |
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
11586 |
return;
|
@@ -11643,8 +11961,9 @@ static void ggml_compute_forward_alibi_f16(
|
|
11643 |
const struct ggml_tensor * src1,
|
11644 |
struct ggml_tensor * dst) {
|
11645 |
assert(params->ith == 0);
|
11646 |
-
|
11647 |
-
|
|
|
11648 |
|
11649 |
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
11650 |
return;
|
@@ -11746,15 +12065,16 @@ static void ggml_compute_forward_clamp_f32(
|
|
11746 |
const struct ggml_tensor * src1,
|
11747 |
struct ggml_tensor * dst) {
|
11748 |
assert(params->ith == 0);
|
11749 |
-
|
11750 |
-
|
|
|
11751 |
|
11752 |
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
11753 |
return;
|
11754 |
}
|
11755 |
|
11756 |
-
const
|
11757 |
-
const
|
11758 |
|
11759 |
const int ith = params->ith;
|
11760 |
const int nth = params->nth;
|
@@ -12312,9 +12632,9 @@ static void ggml_compute_forward_rope_back(
|
|
12312 |
}
|
12313 |
}
|
12314 |
|
12315 |
-
//
|
12316 |
|
12317 |
-
static void
|
12318 |
const struct ggml_compute_params * params,
|
12319 |
const struct ggml_tensor * src0,
|
12320 |
const struct ggml_tensor * src1,
|
@@ -12434,7 +12754,7 @@ static void ggml_compute_forward_conv_1d_1s_f16_f32(
|
|
12434 |
}
|
12435 |
}
|
12436 |
|
12437 |
-
static void
|
12438 |
const struct ggml_compute_params * params,
|
12439 |
const struct ggml_tensor * src0,
|
12440 |
const struct ggml_tensor * src1,
|
@@ -12554,7 +12874,7 @@ static void ggml_compute_forward_conv_1d_1s_f32(
|
|
12554 |
}
|
12555 |
}
|
12556 |
|
12557 |
-
static void
|
12558 |
const struct ggml_compute_params * params,
|
12559 |
const struct ggml_tensor * src0,
|
12560 |
const struct ggml_tensor * src1,
|
@@ -12562,11 +12882,11 @@ static void ggml_compute_forward_conv_1d_1s(
|
|
12562 |
switch (src0->type) {
|
12563 |
case GGML_TYPE_F16:
|
12564 |
{
|
12565 |
-
|
12566 |
} break;
|
12567 |
case GGML_TYPE_F32:
|
12568 |
{
|
12569 |
-
|
12570 |
} break;
|
12571 |
default:
|
12572 |
{
|
@@ -12575,9 +12895,9 @@ static void ggml_compute_forward_conv_1d_1s(
|
|
12575 |
}
|
12576 |
}
|
12577 |
|
12578 |
-
//
|
12579 |
|
12580 |
-
static void
|
12581 |
const struct ggml_compute_params * params,
|
12582 |
const struct ggml_tensor * src0,
|
12583 |
const struct ggml_tensor * src1,
|
@@ -12697,7 +13017,7 @@ static void ggml_compute_forward_conv_1d_2s_f16_f32(
|
|
12697 |
}
|
12698 |
}
|
12699 |
|
12700 |
-
static void
|
12701 |
const struct ggml_compute_params * params,
|
12702 |
const struct ggml_tensor * src0,
|
12703 |
const struct ggml_tensor * src1,
|
@@ -12817,7 +13137,7 @@ static void ggml_compute_forward_conv_1d_2s_f32(
|
|
12817 |
}
|
12818 |
}
|
12819 |
|
12820 |
-
static void
|
12821 |
const struct ggml_compute_params * params,
|
12822 |
const struct ggml_tensor * src0,
|
12823 |
const struct ggml_tensor * src1,
|
@@ -12825,11 +13145,148 @@ static void ggml_compute_forward_conv_1d_2s(
|
|
12825 |
switch (src0->type) {
|
12826 |
case GGML_TYPE_F16:
|
12827 |
{
|
12828 |
-
|
12829 |
} break;
|
12830 |
case GGML_TYPE_F32:
|
12831 |
{
|
12832 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
12833 |
} break;
|
12834 |
default:
|
12835 |
{
|
@@ -13932,6 +14389,145 @@ static void ggml_compute_forward_flash_attn_back(
|
|
13932 |
}
|
13933 |
}
|
13934 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
13935 |
// ggml_compute_forward_map_unary
|
13936 |
|
13937 |
static void ggml_compute_forward_map_unary_f32(
|
@@ -14315,7 +14911,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
|
|
14315 |
if (skip_cpu) {
|
14316 |
return;
|
14317 |
}
|
14318 |
-
GGML_ASSERT(tensor->src0->backend == GGML_BACKEND_CPU);
|
14319 |
GGML_ASSERT(tensor->src1 == NULL || tensor->src1->backend == GGML_BACKEND_CPU);
|
14320 |
#endif // GGML_USE_CUBLAS
|
14321 |
|
@@ -14404,6 +15000,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
|
|
14404 |
{
|
14405 |
ggml_compute_forward_gelu(params, tensor->src0, tensor);
|
14406 |
} break;
|
|
|
|
|
|
|
|
|
14407 |
case GGML_OP_SILU:
|
14408 |
{
|
14409 |
ggml_compute_forward_silu(params, tensor->src0, tensor);
|
@@ -14508,19 +15108,23 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
|
|
14508 |
{
|
14509 |
ggml_compute_forward_clamp(params, tensor->src0, tensor->src1, tensor);
|
14510 |
} break;
|
14511 |
-
case
|
|
|
|
|
|
|
|
|
14512 |
{
|
14513 |
-
|
14514 |
} break;
|
14515 |
-
case
|
14516 |
{
|
14517 |
-
|
14518 |
} break;
|
14519 |
case GGML_OP_FLASH_ATTN:
|
14520 |
{
|
14521 |
-
int32_t t = ggml_get_i32_1d(tensor->opt[1], 0);
|
14522 |
GGML_ASSERT(t == 0 || t == 1);
|
14523 |
-
bool masked = t != 0;
|
14524 |
ggml_compute_forward_flash_attn(params, tensor->src0, tensor->src1, tensor->opt[0], masked, tensor);
|
14525 |
} break;
|
14526 |
case GGML_OP_FLASH_FF:
|
@@ -14534,6 +15138,14 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
|
|
14534 |
bool masked = t != 0;
|
14535 |
ggml_compute_forward_flash_attn_back(params, tensor->src0, tensor->src1, tensor->opt[0], tensor->opt[1], masked, tensor);
|
14536 |
} break;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
14537 |
case GGML_OP_MAP_UNARY:
|
14538 |
{
|
14539 |
const ggml_unary_op_f32_t fun = *((ggml_unary_op_f32_t *)tensor->opt[0]->data);
|
@@ -14805,6 +15417,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
|
|
14805 |
{
|
14806 |
GGML_ASSERT(false); // TODO: not implemented
|
14807 |
} break;
|
|
|
|
|
|
|
|
|
14808 |
case GGML_OP_ALIBI:
|
14809 |
{
|
14810 |
GGML_ASSERT(false); // TODO: not implemented
|
@@ -15167,11 +15783,15 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
|
|
15167 |
// noop
|
15168 |
}
|
15169 |
} break;
|
15170 |
-
case
|
|
|
|
|
|
|
|
|
15171 |
{
|
15172 |
GGML_ASSERT(false); // TODO: not implemented
|
15173 |
} break;
|
15174 |
-
case
|
15175 |
{
|
15176 |
GGML_ASSERT(false); // TODO: not implemented
|
15177 |
} break;
|
@@ -15340,6 +15960,8 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
|
|
15340 |
{
|
15341 |
GGML_ASSERT(false); // not supported
|
15342 |
} break;
|
|
|
|
|
15343 |
case GGML_OP_MAP_UNARY:
|
15344 |
case GGML_OP_MAP_BINARY:
|
15345 |
{
|
@@ -15413,7 +16035,7 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor *
|
|
15413 |
GGML_ASSERT(cgraph->n_leafs < GGML_MAX_NODES);
|
15414 |
|
15415 |
if (strlen(node->name) == 0) {
|
15416 |
-
|
15417 |
}
|
15418 |
|
15419 |
cgraph->leafs[cgraph->n_leafs] = node;
|
@@ -15422,7 +16044,7 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor *
|
|
15422 |
GGML_ASSERT(cgraph->n_nodes < GGML_MAX_NODES);
|
15423 |
|
15424 |
if (strlen(node->name) == 0) {
|
15425 |
-
|
15426 |
}
|
15427 |
|
15428 |
cgraph->nodes[cgraph->n_nodes] = node;
|
@@ -15748,6 +16370,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
|
|
15748 |
} break;
|
15749 |
case GGML_OP_MUL:
|
15750 |
case GGML_OP_GELU:
|
|
|
15751 |
case GGML_OP_SILU:
|
15752 |
case GGML_OP_SILU_BACK:
|
15753 |
case GGML_OP_NORM:
|
@@ -15854,8 +16477,8 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
|
|
15854 |
{
|
15855 |
node->n_tasks = 1; //TODO
|
15856 |
} break;
|
15857 |
-
case
|
15858 |
-
case
|
15859 |
{
|
15860 |
node->n_tasks = n_threads;
|
15861 |
|
@@ -15882,6 +16505,41 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
|
|
15882 |
GGML_ASSERT(false);
|
15883 |
}
|
15884 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
15885 |
work_size = MAX(work_size, cur);
|
15886 |
} break;
|
15887 |
case GGML_OP_FLASH_ATTN:
|
@@ -15943,6 +16601,8 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
|
|
15943 |
|
15944 |
work_size = MAX(work_size, cur);
|
15945 |
} break;
|
|
|
|
|
15946 |
case GGML_OP_MAP_UNARY:
|
15947 |
case GGML_OP_MAP_BINARY:
|
15948 |
{
|
@@ -16475,16 +17135,20 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
|
|
16475 |
|
16476 |
if (!*ctx_data) {
|
16477 |
fprintf(stderr, "%s: failed to create ggml context\n", __func__);
|
|
|
16478 |
return result;
|
16479 |
}
|
16480 |
}
|
16481 |
|
16482 |
data = ggml_new_tensor_1d(*ctx_data, GGML_TYPE_I8, fsize);
|
16483 |
|
16484 |
-
|
16485 |
-
|
16486 |
-
|
16487 |
-
|
|
|
|
|
|
|
16488 |
}
|
16489 |
|
16490 |
fclose(fin);
|
@@ -16764,6 +17428,26 @@ static struct ggml_tensor * ggml_graph_get_parent(const struct ggml_cgraph * cgr
|
|
16764 |
return NULL;
|
16765 |
}
|
16766 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
16767 |
void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename) {
|
16768 |
char color[16];
|
16769 |
|
@@ -16799,7 +17483,9 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
|
|
16799 |
(void *) node, color);
|
16800 |
|
16801 |
if (strlen(node->name) > 0) {
|
16802 |
-
fprintf(fp, "%s |", node->name);
|
|
|
|
|
16803 |
}
|
16804 |
|
16805 |
if (node->n_dims == 2) {
|
@@ -16808,7 +17494,6 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
|
|
16808 |
fprintf(fp, "%d [%" PRId64 ", %" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], node->ne[2], GGML_OP_SYMBOL[node->op]);
|
16809 |
}
|
16810 |
|
16811 |
-
|
16812 |
if (node->grad) {
|
16813 |
fprintf(fp, " | <g>%s\"; ]\n", GGML_OP_SYMBOL[node->grad->op]);
|
16814 |
} else {
|
@@ -16827,18 +17512,29 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
|
|
16827 |
(void *) node, color);
|
16828 |
|
16829 |
if (strlen(node->name) > 0) {
|
16830 |
-
|
|
|
|
|
16831 |
}
|
16832 |
-
|
16833 |
-
|
16834 |
-
|
16835 |
-
|
16836 |
-
|
16837 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
16838 |
}
|
16839 |
-
|
16840 |
-
else {
|
16841 |
-
fprintf(fp, "CONST %d [%" PRId64 ", %" PRId64 "]", i, node->ne[0], node->ne[1]);
|
16842 |
}
|
16843 |
fprintf(fp, "\"; ]\n");
|
16844 |
}
|
@@ -16846,30 +17542,20 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
|
|
16846 |
for (int i = 0; i < gb->n_nodes; i++) {
|
16847 |
struct ggml_tensor * node = gb->nodes[i];
|
16848 |
|
16849 |
-
struct ggml_tensor * parent = ggml_graph_get_parent(gb, node);
|
16850 |
-
|
16851 |
if (node->src0) {
|
16852 |
-
|
16853 |
-
|
16854 |
-
fprintf(fp, " \"%p\":%s -> \"%p\":%s [ arrowhead = %s; style = %s; label = \"x\"; ]\n",
|
16855 |
-
parent0 ? (void *) parent0 : (void *) node->src0,
|
16856 |
-
parent0 ? "g" : "x",
|
16857 |
-
parent ? (void *) parent : (void *) node,
|
16858 |
-
parent ? "g" : "x",
|
16859 |
-
parent ? "empty" : "vee",
|
16860 |
-
parent ? "dashed" : "solid");
|
16861 |
}
|
16862 |
|
16863 |
if (node->src1) {
|
16864 |
-
|
16865 |
-
|
16866 |
-
|
16867 |
-
|
16868 |
-
|
16869 |
-
|
16870 |
-
|
16871 |
-
|
16872 |
-
|
16873 |
}
|
16874 |
}
|
16875 |
|
@@ -16877,15 +17563,19 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
|
|
16877 |
struct ggml_tensor * node = gb->leafs[i];
|
16878 |
|
16879 |
if (node->src0) {
|
16880 |
-
|
16881 |
-
(void *) node->src0, "x",
|
16882 |
-
(void *) node, "x");
|
16883 |
}
|
16884 |
|
16885 |
if (node->src1) {
|
16886 |
-
|
16887 |
-
|
16888 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
16889 |
}
|
16890 |
}
|
16891 |
|
@@ -17604,7 +18294,6 @@ GGML_API void ggml_opt_init(
|
|
17604 |
ggml_set_zero(opt->lbfgs.g);
|
17605 |
ggml_set_zero(opt->lbfgs.gp);
|
17606 |
ggml_set_zero(opt->lbfgs.d);
|
17607 |
-
ggml_set_zero(opt->lbfgs.pf);
|
17608 |
if (opt->lbfgs.pf) {
|
17609 |
ggml_set_zero(opt->lbfgs.pf);
|
17610 |
}
|
|
|
24 |
#include <stdio.h>
|
25 |
#include <float.h>
|
26 |
#include <limits.h>
|
27 |
+
#include <stdarg.h>
|
28 |
|
29 |
#ifdef GGML_USE_METAL
|
30 |
#include <unistd.h>
|
|
|
113 |
/*#define GGML_PERF*/
|
114 |
#define GGML_DEBUG 0
|
115 |
#define GGML_GELU_FP16
|
116 |
+
#define GGML_GELU_QUICK_FP16
|
117 |
#define GGML_SILU_FP16
|
118 |
|
119 |
#define GGML_SOFT_MAX_UNROLL 4
|
|
|
342 |
// precomputed gelu table for f16 (128 KB)
|
343 |
static ggml_fp16_t table_gelu_f16[1 << 16];
|
344 |
|
345 |
+
// precomputed quick gelu table for f16 (128 KB)
|
346 |
+
static ggml_fp16_t table_gelu_quick_f16[1 << 16];
|
347 |
+
|
348 |
// precomputed silu table for f16 (128 KB)
|
349 |
static ggml_fp16_t table_silu_f16[1 << 16];
|
350 |
|
|
|
1682 |
#define GGML_F32x4_REDUCE_ONE(x) vaddvq_f32(x)
|
1683 |
#define GGML_F32x4_REDUCE(res, x) \
|
1684 |
{ \
|
1685 |
+
int offset = GGML_F32_ARR >> 1; \
|
1686 |
+
for (int i = 0; i < offset; ++i) { \
|
1687 |
+
x[i] = vaddq_f32(x[i], x[offset+i]); \
|
1688 |
} \
|
1689 |
+
offset >>= 1; \
|
1690 |
+
for (int i = 0; i < offset; ++i) { \
|
1691 |
+
x[i] = vaddq_f32(x[i], x[offset+i]); \
|
1692 |
} \
|
1693 |
+
offset >>= 1; \
|
1694 |
+
for (int i = 0; i < offset; ++i) { \
|
1695 |
+
x[i] = vaddq_f32(x[i], x[offset+i]); \
|
1696 |
} \
|
1697 |
res = GGML_F32x4_REDUCE_ONE(x[0]); \
|
1698 |
}
|
|
|
1723 |
#define GGML_F16x8_MUL vmulq_f16
|
1724 |
#define GGML_F16x8_REDUCE(res, x) \
|
1725 |
{ \
|
1726 |
+
int offset = GGML_F16_ARR >> 1; \
|
1727 |
+
for (int i = 0; i < offset; ++i) { \
|
1728 |
+
x[i] = vaddq_f16(x[i], x[offset+i]); \
|
1729 |
} \
|
1730 |
+
offset >>= 1; \
|
1731 |
+
for (int i = 0; i < offset; ++i) { \
|
1732 |
+
x[i] = vaddq_f16(x[i], x[offset+i]); \
|
1733 |
} \
|
1734 |
+
offset >>= 1; \
|
1735 |
+
for (int i = 0; i < offset; ++i) { \
|
1736 |
+
x[i] = vaddq_f16(x[i], x[offset+i]); \
|
1737 |
} \
|
1738 |
const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 (x[0])); \
|
1739 |
const float32x4_t t1 = vcvt_f32_f16(vget_high_f16(x[0])); \
|
|
|
1800 |
#define GGML_F32x8_MUL _mm256_mul_ps
|
1801 |
#define GGML_F32x8_REDUCE(res, x) \
|
1802 |
{ \
|
1803 |
+
int offset = GGML_F32_ARR >> 1; \
|
1804 |
+
for (int i = 0; i < offset; ++i) { \
|
1805 |
+
x[i] = _mm256_add_ps(x[i], x[offset+i]); \
|
1806 |
} \
|
1807 |
+
offset >>= 1; \
|
1808 |
+
for (int i = 0; i < offset; ++i) { \
|
1809 |
+
x[i] = _mm256_add_ps(x[i], x[offset+i]); \
|
1810 |
} \
|
1811 |
+
offset >>= 1; \
|
1812 |
+
for (int i = 0; i < offset; ++i) { \
|
1813 |
+
x[i] = _mm256_add_ps(x[i], x[offset+i]); \
|
1814 |
} \
|
1815 |
const __m128 t0 = _mm_add_ps(_mm256_castps256_ps128(x[0]), \
|
1816 |
_mm256_extractf128_ps(x[0], 1)); \
|
|
|
1900 |
#define GGML_F32x4_MUL vec_mul
|
1901 |
#define GGML_F32x4_REDUCE(res, x) \
|
1902 |
{ \
|
1903 |
+
int offset = GGML_F32_ARR >> 1; \
|
1904 |
+
for (int i = 0; i < offset; ++i) { \
|
1905 |
+
x[i] = vec_add(x[i], x[offset+i]); \
|
1906 |
} \
|
1907 |
+
offset >>= 1; \
|
1908 |
+
for (int i = 0; i < offset; ++i) { \
|
1909 |
+
x[i] = vec_add(x[i], x[offset+i]); \
|
1910 |
} \
|
1911 |
+
offset >>= 1; \
|
1912 |
+
for (int i = 0; i < offset; ++i) { \
|
1913 |
+
x[i] = vec_add(x[i], x[offset+i]); \
|
1914 |
} \
|
1915 |
res = vec_extract(x[0], 0) + \
|
1916 |
vec_extract(x[0], 1) + \
|
|
|
1966 |
#define GGML_F32x4_MUL wasm_f32x4_mul
|
1967 |
#define GGML_F32x4_REDUCE(res, x) \
|
1968 |
{ \
|
1969 |
+
int offset = GGML_F32_ARR >> 1; \
|
1970 |
+
for (int i = 0; i < offset; ++i) { \
|
1971 |
+
x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
|
1972 |
} \
|
1973 |
+
offset >>= 1; \
|
1974 |
+
for (int i = 0; i < offset; ++i) { \
|
1975 |
+
x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
|
1976 |
} \
|
1977 |
+
offset >>= 1; \
|
1978 |
+
for (int i = 0; i < offset; ++i) { \
|
1979 |
+
x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
|
1980 |
} \
|
1981 |
res = wasm_f32x4_extract_lane(x[0], 0) + \
|
1982 |
wasm_f32x4_extract_lane(x[0], 1) + \
|
|
|
2031 |
#define GGML_F16x4_MUL wasm_f32x4_mul
|
2032 |
#define GGML_F16x4_REDUCE(res, x) \
|
2033 |
{ \
|
2034 |
+
int offset = GGML_F16_ARR >> 1; \
|
2035 |
+
for (int i = 0; i < offset; ++i) { \
|
2036 |
+
x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
|
2037 |
} \
|
2038 |
+
offset >>= 1; \
|
2039 |
+
for (int i = 0; i < offset; ++i) { \
|
2040 |
+
x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
|
2041 |
} \
|
2042 |
+
offset >>= 1; \
|
2043 |
+
for (int i = 0; i < offset; ++i) { \
|
2044 |
+
x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
|
2045 |
} \
|
2046 |
res = wasm_f32x4_extract_lane(x[0], 0) + \
|
2047 |
wasm_f32x4_extract_lane(x[0], 1) + \
|
|
|
2083 |
#define GGML_F32x4_MUL _mm_mul_ps
|
2084 |
#define GGML_F32x4_REDUCE(res, x) \
|
2085 |
{ \
|
2086 |
+
int offset = GGML_F32_ARR >> 1; \
|
2087 |
+
for (int i = 0; i < offset; ++i) { \
|
2088 |
+
x[i] = _mm_add_ps(x[i], x[offset+i]); \
|
2089 |
} \
|
2090 |
+
offset >>= 1; \
|
2091 |
+
for (int i = 0; i < offset; ++i) { \
|
2092 |
+
x[i] = _mm_add_ps(x[i], x[offset+i]); \
|
2093 |
} \
|
2094 |
+
offset >>= 1; \
|
2095 |
+
for (int i = 0; i < offset; ++i) { \
|
2096 |
+
x[i] = _mm_add_ps(x[i], x[offset+i]); \
|
2097 |
} \
|
2098 |
const __m128 t0 = _mm_hadd_ps(x[0], x[0]); \
|
2099 |
res = _mm_cvtss_f32(_mm_hadd_ps(t0, t0)); \
|
|
|
3382 |
inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
|
3383 |
|
3384 |
static const float GELU_COEF_A = 0.044715f;
|
3385 |
+
static const float GELU_QUICK_COEF = -1.702f;
|
3386 |
static const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
|
3387 |
|
3388 |
inline static float ggml_gelu_f32(float x) {
|
|
|
3413 |
}
|
3414 |
#endif
|
3415 |
|
3416 |
+
inline static float ggml_gelu_quick_f32(float x) {
|
3417 |
+
return x*(1.0f/(1.0f+expf(GELU_QUICK_COEF*x)));
|
3418 |
+
}
|
3419 |
+
|
3420 |
+
//inline static void ggml_vec_gelu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
|
3421 |
+
// const uint16_t * i16 = (const uint16_t *) x;
|
3422 |
+
// for (int i = 0; i < n; ++i) {
|
3423 |
+
// y[i] = table_gelu_quick_f16[i16[i]];
|
3424 |
+
// }
|
3425 |
+
//}
|
3426 |
+
|
3427 |
+
#ifdef GGML_GELU_QUICK_FP16
|
3428 |
+
inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float * x) {
|
3429 |
+
uint16_t t;
|
3430 |
+
for (int i = 0; i < n; ++i) {
|
3431 |
+
ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]);
|
3432 |
+
memcpy(&t, &fp16, sizeof(uint16_t));
|
3433 |
+
y[i] = GGML_FP16_TO_FP32(table_gelu_quick_f16[t]);
|
3434 |
+
}
|
3435 |
+
}
|
3436 |
+
#else
|
3437 |
+
inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float * x) {
|
3438 |
+
for (int i = 0; i < n; ++i) {
|
3439 |
+
y[i] = ggml_gelu_quick_f32(x[i]);
|
3440 |
+
}
|
3441 |
+
}
|
3442 |
+
#endif
|
3443 |
+
|
3444 |
// Sigmoid Linear Unit (SiLU) function
|
3445 |
inline static float ggml_silu_f32(float x) {
|
3446 |
return x/(1.0f + expf(-x));
|
|
|
3671 |
"STEP",
|
3672 |
"RELU",
|
3673 |
"GELU",
|
3674 |
+
"GELU_QUICK",
|
3675 |
"SILU",
|
3676 |
"SILU_BACK",
|
3677 |
"NORM",
|
|
|
3700 |
"ROPE_BACK",
|
3701 |
"ALIBI",
|
3702 |
"CLAMP",
|
3703 |
+
"CONV_1D_S1_PH",
|
3704 |
+
"CONV_1D_S2_PH",
|
3705 |
+
"CONV_2D_SK_P0",
|
3706 |
|
3707 |
"FLASH_ATTN",
|
3708 |
"FLASH_FF",
|
3709 |
"FLASH_ATTN_BACK",
|
3710 |
+
"WIN_PART",
|
3711 |
+
"WIN_UNPART",
|
3712 |
|
3713 |
"MAP_UNARY",
|
3714 |
"MAP_BINARY",
|
|
|
3717 |
"CROSS_ENTROPY_LOSS_BACK",
|
3718 |
};
|
3719 |
|
3720 |
+
static_assert(GGML_OP_COUNT == 61, "GGML_OP_COUNT != 61");
|
3721 |
|
3722 |
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
3723 |
"none",
|
|
|
3743 |
"step(x)",
|
3744 |
"relu(x)",
|
3745 |
"gelu(x)",
|
3746 |
+
"gelu_quick(x)",
|
3747 |
"silu(x)",
|
3748 |
"silu_back(x)",
|
3749 |
"norm(x)",
|
|
|
3772 |
"rope_back(x)",
|
3773 |
"alibi(x)",
|
3774 |
"clamp(x)",
|
3775 |
+
"conv_1d_s1_ph(x)",
|
3776 |
+
"conv_1d_s2_ph(x)",
|
3777 |
+
"conv_2d_sk_p0(x)",
|
3778 |
|
3779 |
"flash_attn(x)",
|
3780 |
"flash_ff(x)",
|
3781 |
"flash_attn_back(x)",
|
3782 |
+
"win_part(x)",
|
3783 |
+
"win_unpart(x)",
|
3784 |
|
3785 |
"f(x)",
|
3786 |
"f(x,y)",
|
|
|
3789 |
"cross_entropy_loss_back(x,y)",
|
3790 |
};
|
3791 |
|
3792 |
+
static_assert(GGML_OP_COUNT == 61, "GGML_OP_COUNT != 61");
|
3793 |
|
3794 |
static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
|
3795 |
static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");
|
|
|
4080 |
// initialize time system (required on Windows)
|
4081 |
ggml_time_init();
|
4082 |
|
4083 |
+
// initialize GELU, Quick GELU, SILU and EXP F32 tables
|
4084 |
{
|
4085 |
const uint64_t t_start = ggml_time_us(); UNUSED(t_start);
|
4086 |
|
|
|
4090 |
memcpy(&ii, &ui, sizeof(ii));
|
4091 |
const float f = table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(ii);
|
4092 |
table_gelu_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_f32(f));
|
4093 |
+
table_gelu_quick_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_quick_f32(f));
|
4094 |
table_silu_f16[i] = GGML_FP32_TO_FP16(ggml_silu_f32(f));
|
4095 |
table_exp_f16[i] = GGML_FP32_TO_FP16(expf(f));
|
4096 |
}
|
4097 |
|
4098 |
const uint64_t t_end = ggml_time_us(); UNUSED(t_end);
|
4099 |
|
4100 |
+
GGML_PRINT_DEBUG("%s: GELU, Quick GELU, SILU and EXP tables initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
|
4101 |
}
|
4102 |
|
4103 |
// initialize g_state
|
|
|
4218 |
ctx->no_alloc = no_alloc;
|
4219 |
}
|
4220 |
|
4221 |
+
void * ggml_get_mem_buffer(const struct ggml_context * ctx) {
|
4222 |
return ctx->mem_buffer;
|
4223 |
}
|
4224 |
|
4225 |
+
size_t ggml_get_mem_size(const struct ggml_context * ctx) {
|
4226 |
return ctx->mem_size;
|
4227 |
}
|
4228 |
|
4229 |
+
size_t ggml_get_max_tensor_size(const struct ggml_context * ctx) {
|
4230 |
+
size_t max_size = 0;
|
4231 |
+
|
4232 |
+
struct ggml_object * obj = ctx->objects_begin;
|
4233 |
+
|
4234 |
+
while (obj != NULL) {
|
4235 |
+
struct ggml_tensor * tensor = (struct ggml_tensor *) ((char *) ctx->mem_buffer + obj->offs);
|
4236 |
+
|
4237 |
+
const size_t size = ggml_nbytes(tensor);
|
4238 |
+
|
4239 |
+
if (max_size < size) {
|
4240 |
+
max_size = size;
|
4241 |
+
}
|
4242 |
+
|
4243 |
+
obj = obj->next;
|
4244 |
+
}
|
4245 |
+
|
4246 |
+
return max_size;
|
4247 |
+
}
|
4248 |
+
|
4249 |
// IMPORTANT:
|
4250 |
// when creating "opt" tensors, always save and load the scratch buffer
|
4251 |
// this is an error prone process, but it is necessary to support inplace
|
|
|
4729 |
return tensor->name;
|
4730 |
}
|
4731 |
|
4732 |
+
struct ggml_tensor * ggml_set_name(struct ggml_tensor * tensor, const char * name) {
|
4733 |
strncpy(tensor->name, name, sizeof(tensor->name));
|
4734 |
tensor->name[sizeof(tensor->name) - 1] = '\0';
|
4735 |
+
return tensor;
|
4736 |
+
}
|
4737 |
+
|
4738 |
+
struct ggml_tensor * ggml_format_name(struct ggml_tensor * tensor, const char * fmt, ...) {
|
4739 |
+
va_list args;
|
4740 |
+
va_start(args, fmt);
|
4741 |
+
vsnprintf(tensor->name, sizeof(tensor->name), fmt, args);
|
4742 |
+
va_end(args);
|
4743 |
+
return tensor;
|
4744 |
}
|
4745 |
|
4746 |
struct ggml_tensor * ggml_view_tensor(
|
4747 |
struct ggml_context * ctx,
|
4748 |
const struct ggml_tensor * src) {
|
4749 |
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, src->data);
|
4750 |
+
ggml_format_name(result, "%s (view)", src->name);
|
4751 |
|
4752 |
result->nb[0] = src->nb[0];
|
4753 |
result->nb[1] = src->nb[1];
|
|
|
5520 |
return ggml_gelu_impl(ctx, a, true);
|
5521 |
}
|
5522 |
|
5523 |
+
// ggml_gelu_quick
|
5524 |
+
|
5525 |
+
struct ggml_tensor * ggml_gelu_quick_impl(
|
5526 |
+
struct ggml_context * ctx,
|
5527 |
+
struct ggml_tensor * a,
|
5528 |
+
bool inplace) {
|
5529 |
+
bool is_node = false;
|
5530 |
+
|
5531 |
+
if (!inplace && (a->grad)) {
|
5532 |
+
is_node = true;
|
5533 |
+
}
|
5534 |
+
|
5535 |
+
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
|
5536 |
+
|
5537 |
+
result->op = GGML_OP_GELU_QUICK;
|
5538 |
+
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
5539 |
+
result->src0 = a;
|
5540 |
+
result->src1 = NULL;
|
5541 |
+
|
5542 |
+
return result;
|
5543 |
+
}
|
5544 |
+
|
5545 |
+
struct ggml_tensor * ggml_gelu_quick(
|
5546 |
+
struct ggml_context * ctx,
|
5547 |
+
struct ggml_tensor * a) {
|
5548 |
+
return ggml_gelu_quick_impl(ctx, a, false);
|
5549 |
+
}
|
5550 |
+
|
5551 |
+
struct ggml_tensor * ggml_gelu_quick_inplace(
|
5552 |
+
struct ggml_context * ctx,
|
5553 |
+
struct ggml_tensor * a) {
|
5554 |
+
return ggml_gelu_quick_impl(ctx, a, true);
|
5555 |
+
}
|
5556 |
+
|
5557 |
// ggml_silu
|
5558 |
|
5559 |
struct ggml_tensor * ggml_silu_impl(
|
|
|
5909 |
|
5910 |
// make a view of the destination
|
5911 |
struct ggml_tensor * result = ggml_view_tensor(ctx, b);
|
5912 |
+
if (strlen(b->name) > 0) {
|
5913 |
+
ggml_format_name(result, "%s (copy of %s)", b->name, a->name);
|
5914 |
+
} else {
|
5915 |
+
ggml_format_name(result, "%s (copy)", a->name);
|
5916 |
+
}
|
5917 |
|
5918 |
result->op = GGML_OP_CPY;
|
5919 |
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
|
|
5950 |
}
|
5951 |
|
5952 |
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
|
5953 |
+
ggml_format_name(result, "%s (cont)", a->name);
|
5954 |
|
5955 |
result->op = GGML_OP_CONT;
|
5956 |
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
|
|
5994 |
}
|
5995 |
|
5996 |
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, b->n_dims, b->ne, a->data);
|
5997 |
+
ggml_format_name(result, "%s (reshaped)", a->name);
|
5998 |
|
5999 |
result->op = GGML_OP_RESHAPE;
|
6000 |
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
|
|
6019 |
|
6020 |
const int64_t ne[1] = { ne0 };
|
6021 |
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, ne, a->data);
|
6022 |
+
ggml_format_name(result, "%s (reshaped)", a->name);
|
6023 |
|
6024 |
result->op = GGML_OP_RESHAPE;
|
6025 |
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
|
|
6045 |
|
6046 |
const int64_t ne[2] = { ne0, ne1 };
|
6047 |
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, a->data);
|
6048 |
+
ggml_format_name(result, "%s (reshaped)", a->name);
|
6049 |
|
6050 |
result->op = GGML_OP_RESHAPE;
|
6051 |
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
|
|
6072 |
|
6073 |
const int64_t ne[3] = { ne0, ne1, ne2 };
|
6074 |
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, a->data);
|
6075 |
+
ggml_format_name(result, "%s (reshaped)", a->name);
|
6076 |
|
6077 |
result->op = GGML_OP_RESHAPE;
|
6078 |
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
|
|
6101 |
|
6102 |
const int64_t ne[4] = { ne0, ne1, ne2, ne3 };
|
6103 |
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, a->data);
|
6104 |
+
ggml_format_name(result, "%s (reshaped)", a->name);
|
6105 |
|
6106 |
result->op = GGML_OP_RESHAPE;
|
6107 |
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
|
|
6126 |
}
|
6127 |
|
6128 |
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, &ne0, (char *) a->data + offset);
|
6129 |
+
ggml_format_name(result, "%s (view)", a->name);
|
6130 |
|
6131 |
ggml_scratch_save(ctx);
|
6132 |
|
6133 |
struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
|
6134 |
+
ggml_set_name(offs, "offset");
|
6135 |
memcpy(offs->data, &offset, 2*sizeof(int32_t));
|
6136 |
|
6137 |
ggml_scratch_load(ctx);
|
|
|
6164 |
const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, 1, 1 };
|
6165 |
|
6166 |
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, (char *) a->data + offset);
|
6167 |
+
ggml_format_name(result, "%s (view)", a->name);
|
6168 |
|
6169 |
ggml_scratch_save(ctx);
|
6170 |
|
6171 |
struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
|
6172 |
+
ggml_set_name(offs, "offset");
|
6173 |
memcpy(offs->data, &offset, 2*sizeof(int32_t));
|
6174 |
|
6175 |
ggml_scratch_load(ctx);
|
|
|
6208 |
const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, 1 };
|
6209 |
|
6210 |
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, (char *) a->data + offset);
|
6211 |
+
ggml_format_name(result, "%s (view)", a->name);
|
6212 |
|
6213 |
ggml_scratch_save(ctx);
|
6214 |
|
6215 |
struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
|
6216 |
+
ggml_set_name(offs, "offset");
|
6217 |
memcpy(offs->data, &offset, 2*sizeof(int32_t));
|
6218 |
|
6219 |
ggml_scratch_load(ctx);
|
|
|
6254 |
const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, ne3 };
|
6255 |
|
6256 |
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, (char *) a->data + offset);
|
6257 |
+
ggml_format_name(result, "%s (view)", a->name);
|
6258 |
|
6259 |
ggml_scratch_save(ctx);
|
6260 |
|
6261 |
struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
|
6262 |
+
ggml_set_name(offs, "offset");
|
6263 |
memcpy(offs->data, &offset, 2*sizeof(int32_t));
|
6264 |
|
6265 |
ggml_scratch_load(ctx);
|
|
|
6305 |
}
|
6306 |
|
6307 |
struct ggml_tensor * result = ggml_view_tensor(ctx, a);
|
6308 |
+
ggml_format_name(result, "%s (permuted)", a->name);
|
6309 |
|
6310 |
int ne[GGML_MAX_DIMS];
|
6311 |
int nb[GGML_MAX_DIMS];
|
|
|
6365 |
}
|
6366 |
|
6367 |
struct ggml_tensor * result = ggml_view_tensor(ctx, a);
|
6368 |
+
ggml_format_name(result, "%s (transposed)", a->name);
|
6369 |
|
6370 |
result->ne[0] = a->ne[1];
|
6371 |
result->ne[1] = a->ne[0];
|
|
|
6774 |
|
6775 |
ggml_scratch_save(ctx);
|
6776 |
|
6777 |
+
struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 2);
|
6778 |
|
6779 |
((float *) b->data)[0] = min;
|
6780 |
((float *) b->data)[1] = max;
|
|
|
6789 |
return result;
|
6790 |
}
|
6791 |
|
6792 |
+
// ggml_conv_1d_s1_ph
|
6793 |
|
6794 |
+
struct ggml_tensor * ggml_conv_1d_s1_ph(
|
6795 |
struct ggml_context * ctx,
|
6796 |
struct ggml_tensor * a,
|
6797 |
struct ggml_tensor * b) {
|
|
|
6808 |
const int64_t ne[4] = { b->ne[0], a->ne[2], 1, 1, };
|
6809 |
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
|
6810 |
|
6811 |
+
result->op = GGML_OP_CONV_1D_S1_PH;
|
6812 |
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
6813 |
result->src0 = a;
|
6814 |
result->src1 = b;
|
|
|
6816 |
return result;
|
6817 |
}
|
6818 |
|
6819 |
+
// ggml_conv_1d_s2_ph
|
6820 |
|
6821 |
+
struct ggml_tensor * ggml_conv_1d_s2_ph(
|
6822 |
struct ggml_context * ctx,
|
6823 |
struct ggml_tensor * a,
|
6824 |
struct ggml_tensor * b) {
|
|
|
6835 |
const int64_t ne[4] = { b->ne[0]/2, a->ne[2], 1, 1, };
|
6836 |
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
|
6837 |
|
6838 |
+
result->op = GGML_OP_CONV_1D_S2_PH;
|
6839 |
+
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
6840 |
+
result->src0 = a;
|
6841 |
+
result->src1 = b;
|
6842 |
+
|
6843 |
+
return result;
|
6844 |
+
}
|
6845 |
+
|
6846 |
+
// ggml_conv_2d_sk_p0
|
6847 |
+
|
6848 |
+
struct ggml_tensor * ggml_conv_2d_sk_p0(
|
6849 |
+
struct ggml_context * ctx,
|
6850 |
+
struct ggml_tensor * a,
|
6851 |
+
struct ggml_tensor * b) {
|
6852 |
+
GGML_ASSERT(b->ne[3] == 1);
|
6853 |
+
GGML_ASSERT(a->ne[2] == b->ne[2]);
|
6854 |
+
GGML_ASSERT(b->ne[0] % a->ne[0] == 0);
|
6855 |
+
GGML_ASSERT(b->ne[1] % a->ne[1] == 0);
|
6856 |
+
bool is_node = false;
|
6857 |
+
|
6858 |
+
if (a->grad || b->grad) {
|
6859 |
+
GGML_ASSERT(false); // TODO: implement backward
|
6860 |
+
is_node = true;
|
6861 |
+
}
|
6862 |
+
|
6863 |
+
const int64_t ne[4] = { b->ne[0]/a->ne[0], b->ne[1]/a->ne[1], a->ne[3], 1, };
|
6864 |
+
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
|
6865 |
+
|
6866 |
+
result->op = GGML_OP_CONV_2D_SK_P0;
|
6867 |
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
6868 |
result->src0 = a;
|
6869 |
result->src1 = b;
|
|
|
6997 |
return result;
|
6998 |
}
|
6999 |
|
7000 |
+
// ggml_win_part
|
7001 |
+
|
7002 |
+
struct ggml_tensor * ggml_win_part(
|
7003 |
+
struct ggml_context * ctx,
|
7004 |
+
struct ggml_tensor * a,
|
7005 |
+
int w) {
|
7006 |
+
GGML_ASSERT(a->ne[3] == 1);
|
7007 |
+
GGML_ASSERT(a->type == GGML_TYPE_F32);
|
7008 |
+
|
7009 |
+
bool is_node = false;
|
7010 |
+
|
7011 |
+
if (a->grad) {
|
7012 |
+
GGML_ASSERT(false); // TODO: implement backward
|
7013 |
+
is_node = true;
|
7014 |
+
}
|
7015 |
+
|
7016 |
+
// padding
|
7017 |
+
const int px = (w - a->ne[1]%w)%w;
|
7018 |
+
const int py = (w - a->ne[2]%w)%w;
|
7019 |
+
|
7020 |
+
const int npx = (px + a->ne[1])/w;
|
7021 |
+
const int npy = (py + a->ne[2])/w;
|
7022 |
+
const int np = npx*npy;
|
7023 |
+
|
7024 |
+
const int64_t ne[4] = { a->ne[0], w, w, np, };
|
7025 |
+
|
7026 |
+
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
|
7027 |
+
|
7028 |
+
ggml_scratch_save(ctx);
|
7029 |
+
|
7030 |
+
struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3);
|
7031 |
+
|
7032 |
+
((int32_t *) b->data)[0] = npx;
|
7033 |
+
((int32_t *) b->data)[1] = npy;
|
7034 |
+
((int32_t *) b->data)[2] = w;
|
7035 |
+
|
7036 |
+
ggml_scratch_load(ctx);
|
7037 |
+
|
7038 |
+
result->op = GGML_OP_WIN_PART;
|
7039 |
+
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
7040 |
+
result->src0 = a;
|
7041 |
+
result->src1 = NULL;
|
7042 |
+
result->opt[0] = b;
|
7043 |
+
|
7044 |
+
return result;
|
7045 |
+
}
|
7046 |
+
|
7047 |
+
// ggml_win_unpart
|
7048 |
+
|
7049 |
+
struct ggml_tensor * ggml_win_unpart(
|
7050 |
+
struct ggml_context * ctx,
|
7051 |
+
struct ggml_tensor * a,
|
7052 |
+
int w0,
|
7053 |
+
int h0,
|
7054 |
+
int w) {
|
7055 |
+
GGML_ASSERT(a->type == GGML_TYPE_F32);
|
7056 |
+
|
7057 |
+
bool is_node = false;
|
7058 |
+
|
7059 |
+
if (a->grad) {
|
7060 |
+
GGML_ASSERT(false); // TODO: implement backward
|
7061 |
+
is_node = true;
|
7062 |
+
}
|
7063 |
+
|
7064 |
+
const int64_t ne[4] = { a->ne[0], w0, h0, 1, };
|
7065 |
+
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne);
|
7066 |
+
|
7067 |
+
ggml_scratch_save(ctx);
|
7068 |
+
|
7069 |
+
struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 1);
|
7070 |
+
|
7071 |
+
((int32_t *) b->data)[0] = w;
|
7072 |
+
|
7073 |
+
ggml_scratch_load(ctx);
|
7074 |
+
|
7075 |
+
result->op = GGML_OP_WIN_UNPART;
|
7076 |
+
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
7077 |
+
result->src0 = a;
|
7078 |
+
result->src1 = NULL;
|
7079 |
+
result->opt[0] = b;
|
7080 |
+
|
7081 |
+
return result;
|
7082 |
+
}
|
7083 |
|
7084 |
// ggml_map_unary
|
7085 |
|
|
|
8158 |
|
8159 |
void * src0_row = (void *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03));
|
8160 |
float * src1_row = (float *)((char *) src1->data + (i11*nb11 + i12*nb12 + i13*nb13));
|
8161 |
+
void * dst_row = (void *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb3));
|
8162 |
|
8163 |
assert(ne00 % 32 == 0);
|
8164 |
|
|
|
9719 |
GGML_ASSERT(false);
|
9720 |
} break;
|
9721 |
}
|
9722 |
+
}
|
9723 |
+
|
9724 |
+
// ggml_compute_forward_gelu_quick
|
9725 |
+
|
9726 |
+
static void ggml_compute_forward_gelu_quick_f32(
|
9727 |
+
const struct ggml_compute_params * params,
|
9728 |
+
const struct ggml_tensor * src0,
|
9729 |
+
struct ggml_tensor * dst) {
|
9730 |
+
GGML_ASSERT(ggml_is_contiguous(src0));
|
9731 |
+
GGML_ASSERT(ggml_is_contiguous(dst));
|
9732 |
+
GGML_ASSERT(ggml_are_same_shape(src0, dst));
|
9733 |
+
|
9734 |
+
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
9735 |
+
return;
|
9736 |
+
}
|
9737 |
+
|
9738 |
+
const int ith = params->ith;
|
9739 |
+
const int nth = params->nth;
|
9740 |
+
|
9741 |
+
const int nc = src0->ne[0];
|
9742 |
+
const int nr = ggml_nrows(src0);
|
9743 |
+
|
9744 |
+
// rows per thread
|
9745 |
+
const int dr = (nr + nth - 1)/nth;
|
9746 |
+
|
9747 |
+
// row range for this thread
|
9748 |
+
const int ir0 = dr*ith;
|
9749 |
+
const int ir1 = MIN(ir0 + dr, nr);
|
9750 |
+
|
9751 |
+
for (int i1 = ir0; i1 < ir1; i1++) {
|
9752 |
+
ggml_vec_gelu_quick_f32(nc,
|
9753 |
+
(float *) ((char *) dst->data + i1*( dst->nb[1])),
|
9754 |
+
(float *) ((char *) src0->data + i1*(src0->nb[1])));
|
9755 |
+
|
9756 |
+
#ifndef NDEBUG
|
9757 |
+
for (int k = 0; k < nc; k++) {
|
9758 |
+
const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k];
|
9759 |
+
UNUSED(x);
|
9760 |
+
assert(!isnan(x));
|
9761 |
+
assert(!isinf(x));
|
9762 |
+
}
|
9763 |
+
#endif
|
9764 |
+
}
|
9765 |
+
}
|
9766 |
|
9767 |
+
static void ggml_compute_forward_gelu_quick(
|
9768 |
+
const struct ggml_compute_params * params,
|
9769 |
+
const struct ggml_tensor * src0,
|
9770 |
+
struct ggml_tensor * dst) {
|
9771 |
+
switch (src0->type) {
|
9772 |
+
case GGML_TYPE_F32:
|
9773 |
+
{
|
9774 |
+
ggml_compute_forward_gelu_quick_f32(params, src0, dst);
|
9775 |
+
} break;
|
9776 |
+
default:
|
9777 |
+
{
|
9778 |
+
GGML_ASSERT(false);
|
9779 |
+
} break;
|
9780 |
+
}
|
9781 |
}
|
9782 |
|
9783 |
// ggml_compute_forward_silu
|
|
|
11175 |
const int im2 = (ne12 == 0 ? 0 : ne12-1);
|
11176 |
const int im3 = (ne13 == 0 ? 0 : ne13-1);
|
11177 |
|
11178 |
+
GGML_ASSERT(offset + im0*nb0 + im1*nb1 + im2*nb2 + im3*nb3 <= ggml_nbytes(dst));
|
11179 |
|
11180 |
GGML_ASSERT(nb10 == sizeof(float));
|
11181 |
|
|
|
11896 |
const struct ggml_tensor * src1,
|
11897 |
struct ggml_tensor * dst) {
|
11898 |
assert(params->ith == 0);
|
11899 |
+
|
11900 |
+
GGML_ASSERT(src1->type == GGML_TYPE_I32);
|
11901 |
+
GGML_ASSERT(ggml_nelements(src1) == 3);
|
11902 |
|
11903 |
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
11904 |
return;
|
|
|
11961 |
const struct ggml_tensor * src1,
|
11962 |
struct ggml_tensor * dst) {
|
11963 |
assert(params->ith == 0);
|
11964 |
+
|
11965 |
+
GGML_ASSERT(src1->type == GGML_TYPE_I32);
|
11966 |
+
GGML_ASSERT(ggml_nelements(src1) == 3);
|
11967 |
|
11968 |
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
11969 |
return;
|
|
|
12065 |
const struct ggml_tensor * src1,
|
12066 |
struct ggml_tensor * dst) {
|
12067 |
assert(params->ith == 0);
|
12068 |
+
|
12069 |
+
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
12070 |
+
GGML_ASSERT(ggml_nelements(src1) == 2);
|
12071 |
|
12072 |
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
12073 |
return;
|
12074 |
}
|
12075 |
|
12076 |
+
const float min = ((float *) src1->data)[0];
|
12077 |
+
const float max = ((float *) src1->data)[1];
|
12078 |
|
12079 |
const int ith = params->ith;
|
12080 |
const int nth = params->nth;
|
|
|
12632 |
}
|
12633 |
}
|
12634 |
|
12635 |
+
// ggml_compute_forward_conv_1d_s1_ph
|
12636 |
|
12637 |
+
static void ggml_compute_forward_conv_1d_s1_ph_f16_f32(
|
12638 |
const struct ggml_compute_params * params,
|
12639 |
const struct ggml_tensor * src0,
|
12640 |
const struct ggml_tensor * src1,
|
|
|
12754 |
}
|
12755 |
}
|
12756 |
|
12757 |
+
static void ggml_compute_forward_conv_1d_s1_ph_f32(
|
12758 |
const struct ggml_compute_params * params,
|
12759 |
const struct ggml_tensor * src0,
|
12760 |
const struct ggml_tensor * src1,
|
|
|
12874 |
}
|
12875 |
}
|
12876 |
|
12877 |
+
static void ggml_compute_forward_conv_1d_s1_ph(
|
12878 |
const struct ggml_compute_params * params,
|
12879 |
const struct ggml_tensor * src0,
|
12880 |
const struct ggml_tensor * src1,
|
|
|
12882 |
switch (src0->type) {
|
12883 |
case GGML_TYPE_F16:
|
12884 |
{
|
12885 |
+
ggml_compute_forward_conv_1d_s1_ph_f16_f32(params, src0, src1, dst);
|
12886 |
} break;
|
12887 |
case GGML_TYPE_F32:
|
12888 |
{
|
12889 |
+
ggml_compute_forward_conv_1d_s1_ph_f32(params, src0, src1, dst);
|
12890 |
} break;
|
12891 |
default:
|
12892 |
{
|
|
|
12895 |
}
|
12896 |
}
|
12897 |
|
12898 |
+
// ggml_compute_forward_conv_1d_s2_ph
|
12899 |
|
12900 |
+
static void ggml_compute_forward_conv_1d_s2_ph_f16_f32(
|
12901 |
const struct ggml_compute_params * params,
|
12902 |
const struct ggml_tensor * src0,
|
12903 |
const struct ggml_tensor * src1,
|
|
|
13017 |
}
|
13018 |
}
|
13019 |
|
13020 |
+
static void ggml_compute_forward_conv_1d_s2_ph_f32(
|
13021 |
const struct ggml_compute_params * params,
|
13022 |
const struct ggml_tensor * src0,
|
13023 |
const struct ggml_tensor * src1,
|
|
|
13137 |
}
|
13138 |
}
|
13139 |
|
13140 |
+
static void ggml_compute_forward_conv_1d_s2_ph(
|
13141 |
const struct ggml_compute_params * params,
|
13142 |
const struct ggml_tensor * src0,
|
13143 |
const struct ggml_tensor * src1,
|
|
|
13145 |
switch (src0->type) {
|
13146 |
case GGML_TYPE_F16:
|
13147 |
{
|
13148 |
+
ggml_compute_forward_conv_1d_s2_ph_f16_f32(params, src0, src1, dst);
|
13149 |
} break;
|
13150 |
case GGML_TYPE_F32:
|
13151 |
{
|
13152 |
+
ggml_compute_forward_conv_1d_s2_ph_f32(params, src0, src1, dst);
|
13153 |
+
} break;
|
13154 |
+
default:
|
13155 |
+
{
|
13156 |
+
GGML_ASSERT(false);
|
13157 |
+
} break;
|
13158 |
+
}
|
13159 |
+
}
|
13160 |
+
|
13161 |
+
// ggml_compute_forward_conv_2d_sk_p0
|
13162 |
+
|
13163 |
+
static void ggml_compute_forward_conv_2d_sk_p0_f16_f32(
|
13164 |
+
const struct ggml_compute_params * params,
|
13165 |
+
const struct ggml_tensor * src0,
|
13166 |
+
const struct ggml_tensor * src1,
|
13167 |
+
struct ggml_tensor * dst) {
|
13168 |
+
GGML_ASSERT(src0->type == GGML_TYPE_F16);
|
13169 |
+
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
13170 |
+
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
13171 |
+
|
13172 |
+
int64_t t0 = ggml_perf_time_us();
|
13173 |
+
UNUSED(t0);
|
13174 |
+
|
13175 |
+
const int ne00 = src0->ne[0];
|
13176 |
+
const int ne01 = src0->ne[1];
|
13177 |
+
const int ne02 = src0->ne[2];
|
13178 |
+
//const int ne03 = src0->ne[3];
|
13179 |
+
|
13180 |
+
const int ne10 = src1->ne[0];
|
13181 |
+
//const int ne11 = src1->ne[1];
|
13182 |
+
const int ne12 = src1->ne[2];
|
13183 |
+
//const int ne13 = src1->ne[3];
|
13184 |
+
|
13185 |
+
const int ne0 = dst->ne[0];
|
13186 |
+
const int ne1 = dst->ne[1];
|
13187 |
+
const int ne2 = dst->ne[2];
|
13188 |
+
//const int ne3 = dst->ne[3];
|
13189 |
+
//const int ne = ne0*ne1*ne2*ne3;
|
13190 |
+
|
13191 |
+
const int nb00 = src0->nb[0];
|
13192 |
+
//const int nb01 = src0->nb[1];
|
13193 |
+
//const int nb02 = src0->nb[2];
|
13194 |
+
const int nb03 = src0->nb[3];
|
13195 |
+
|
13196 |
+
const int nb10 = src1->nb[0];
|
13197 |
+
//const int nb11 = src1->nb[1];
|
13198 |
+
const int nb12 = src1->nb[2];
|
13199 |
+
//const int nb13 = src1->nb[3];
|
13200 |
+
|
13201 |
+
//const int nb0 = dst->nb[0];
|
13202 |
+
//const int nb1 = dst->nb[1];
|
13203 |
+
const int nb2 = dst->nb[2];
|
13204 |
+
//const int nb3 = dst->nb[3];
|
13205 |
+
|
13206 |
+
const int ith = params->ith;
|
13207 |
+
const int nth = params->nth;
|
13208 |
+
|
13209 |
+
const int nk0 = ne00;
|
13210 |
+
const int nk1 = ne01;
|
13211 |
+
|
13212 |
+
// size of the convolution row - the kernel size unrolled across all channels
|
13213 |
+
// round-up so it is more suitable for SIMD
|
13214 |
+
const int ew0 = ggml_up32(nk0*nk1*ne02);
|
13215 |
+
|
13216 |
+
GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
|
13217 |
+
GGML_ASSERT(nb10 == sizeof(float));
|
13218 |
+
|
13219 |
+
if (params->type == GGML_TASK_INIT) {
|
13220 |
+
// TODO: fix this memset (wsize is overestimated)
|
13221 |
+
memset(params->wdata, 0, params->wsize);
|
13222 |
+
|
13223 |
+
// prepare source data (src1)
|
13224 |
+
{
|
13225 |
+
ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
|
13226 |
+
|
13227 |
+
for (int i12 = 0; i12 < ne12; i12++) {
|
13228 |
+
const float * const src = (float *)((char *) src1->data + i12*nb12);
|
13229 |
+
ggml_fp16_t * dst_data = wdata;
|
13230 |
+
|
13231 |
+
for (int i1 = 0; i1 < ne1; i1++) {
|
13232 |
+
for (int i0 = 0; i0 < ne0; i0++) {
|
13233 |
+
for (int ik1 = 0; ik1 < nk1; ik1++) {
|
13234 |
+
for (int ik0 = 0; ik0 < nk0; ik0++) {
|
13235 |
+
dst_data[(i1*ne0 + i0)*ew0 + i12*(nk0*nk1) + ik1*nk0 + ik0] =
|
13236 |
+
GGML_FP32_TO_FP16(src[(i1*nk1 + ik1)*ne10 + (i0*nk0 + ik0)]);
|
13237 |
+
}
|
13238 |
+
}
|
13239 |
+
}
|
13240 |
+
}
|
13241 |
+
}
|
13242 |
+
}
|
13243 |
+
|
13244 |
+
return;
|
13245 |
+
}
|
13246 |
+
|
13247 |
+
if (params->type == GGML_TASK_FINALIZE) {
|
13248 |
+
return;
|
13249 |
+
}
|
13250 |
+
|
13251 |
+
// total patches in dst
|
13252 |
+
const int np = ne2;
|
13253 |
+
|
13254 |
+
// patches per thread
|
13255 |
+
const int dp = (np + nth - 1)/nth;
|
13256 |
+
|
13257 |
+
// patch range for this thread
|
13258 |
+
const int ip0 = dp*ith;
|
13259 |
+
const int ip1 = MIN(ip0 + dp, np);
|
13260 |
+
|
13261 |
+
ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
|
13262 |
+
|
13263 |
+
for (int i2 = ip0; i2 < ip1; i2++) {
|
13264 |
+
float * dst_data = (float *)((char *) dst->data + i2*nb2);
|
13265 |
+
|
13266 |
+
for (int i1 = 0; i1 < ne1; ++i1) {
|
13267 |
+
for (int i0 = 0; i0 < ne0; ++i0) {
|
13268 |
+
ggml_vec_dot_f16(ew0, dst_data + i1*ne0 + i0,
|
13269 |
+
(ggml_fp16_t *) ((char *) src0->data + i2*nb03),
|
13270 |
+
(ggml_fp16_t *) wdata + (i1*ne0 + i0)*ew0);
|
13271 |
+
}
|
13272 |
+
}
|
13273 |
+
}
|
13274 |
+
}
|
13275 |
+
|
13276 |
+
static void ggml_compute_forward_conv_2d_sk_p0(
|
13277 |
+
const struct ggml_compute_params * params,
|
13278 |
+
const struct ggml_tensor * src0,
|
13279 |
+
const struct ggml_tensor * src1,
|
13280 |
+
struct ggml_tensor * dst) {
|
13281 |
+
switch (src0->type) {
|
13282 |
+
case GGML_TYPE_F16:
|
13283 |
+
{
|
13284 |
+
ggml_compute_forward_conv_2d_sk_p0_f16_f32(params, src0, src1, dst);
|
13285 |
+
} break;
|
13286 |
+
case GGML_TYPE_F32:
|
13287 |
+
{
|
13288 |
+
//ggml_compute_forward_conv_2d_sk_p0_f32(params, src0, src1, dst);
|
13289 |
+
GGML_ASSERT(false);
|
13290 |
} break;
|
13291 |
default:
|
13292 |
{
|
|
|
14389 |
}
|
14390 |
}
|
14391 |
|
14392 |
+
// ggml_compute_forward_win_part
|
14393 |
+
|
14394 |
+
static void ggml_compute_forward_win_part_f32(
|
14395 |
+
const struct ggml_compute_params * params,
|
14396 |
+
const struct ggml_tensor * src0,
|
14397 |
+
const struct ggml_tensor * opt0,
|
14398 |
+
struct ggml_tensor * dst) {
|
14399 |
+
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
14400 |
+
return;
|
14401 |
+
}
|
14402 |
+
|
14403 |
+
const int64_t ne00 = src0->ne[0]; UNUSED(ne00);
|
14404 |
+
const int64_t ne01 = src0->ne[1];
|
14405 |
+
const int64_t ne02 = src0->ne[2];
|
14406 |
+
const int64_t ne03 = src0->ne[3]; UNUSED(ne03);
|
14407 |
+
|
14408 |
+
const int64_t ne0 = dst->ne[0];
|
14409 |
+
const int64_t ne1 = dst->ne[1];
|
14410 |
+
const int64_t ne2 = dst->ne[2];
|
14411 |
+
const int64_t ne3 = dst->ne[3]; UNUSED(ne3);
|
14412 |
+
|
14413 |
+
const int32_t nep0 = ((const int32_t *)(opt0->data))[0];
|
14414 |
+
const int32_t nep1 = ((const int32_t *)(opt0->data))[1];
|
14415 |
+
const int32_t w = ((const int32_t *)(opt0->data))[2];
|
14416 |
+
|
14417 |
+
assert(ne00 == ne0);
|
14418 |
+
assert(ne3 == nep0*nep1);
|
14419 |
+
|
14420 |
+
// TODO: optimize / multi-thread
|
14421 |
+
for (int py = 0; py < nep1; ++py) {
|
14422 |
+
for (int px = 0; px < nep0; ++px) {
|
14423 |
+
const int64_t i3 = py*nep0 + px;
|
14424 |
+
for (int64_t i2 = 0; i2 < ne2; ++i2) {
|
14425 |
+
for (int64_t i1 = 0; i1 < ne1; ++i1) {
|
14426 |
+
for (int64_t i0 = 0; i0 < ne0; ++i0) {
|
14427 |
+
const int64_t i02 = py*w + i2;
|
14428 |
+
const int64_t i01 = px*w + i1;
|
14429 |
+
const int64_t i00 = i0;
|
14430 |
+
|
14431 |
+
const int64_t i = i3*ne2*ne1*ne0 + i2*ne1*ne0 + i1*ne0 + i0;
|
14432 |
+
const int64_t j = i02*ne01*ne00 + i01*ne00 + i00;
|
14433 |
+
|
14434 |
+
if (py*w + i2 >= ne02 || px*w + i1 >= ne01) {
|
14435 |
+
((float *) dst->data)[i] = 0.0f;
|
14436 |
+
} else {
|
14437 |
+
((float *) dst->data)[i] = ((float *) src0->data)[j];
|
14438 |
+
}
|
14439 |
+
}
|
14440 |
+
}
|
14441 |
+
}
|
14442 |
+
}
|
14443 |
+
}
|
14444 |
+
}
|
14445 |
+
|
14446 |
+
static void ggml_compute_forward_win_part(
|
14447 |
+
const struct ggml_compute_params * params,
|
14448 |
+
const struct ggml_tensor * src0,
|
14449 |
+
const struct ggml_tensor * opt0,
|
14450 |
+
struct ggml_tensor * dst) {
|
14451 |
+
switch (src0->type) {
|
14452 |
+
case GGML_TYPE_F32:
|
14453 |
+
{
|
14454 |
+
ggml_compute_forward_win_part_f32(params, src0, opt0, dst);
|
14455 |
+
} break;
|
14456 |
+
default:
|
14457 |
+
{
|
14458 |
+
GGML_ASSERT(false);
|
14459 |
+
} break;
|
14460 |
+
}
|
14461 |
+
}
|
14462 |
+
|
14463 |
+
// ggml_compute_forward_win_unpart
|
14464 |
+
|
14465 |
+
static void ggml_compute_forward_win_unpart_f32(
|
14466 |
+
const struct ggml_compute_params * params,
|
14467 |
+
const struct ggml_tensor * src0,
|
14468 |
+
const struct ggml_tensor * opt0,
|
14469 |
+
struct ggml_tensor * dst) {
|
14470 |
+
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
14471 |
+
return;
|
14472 |
+
}
|
14473 |
+
|
14474 |
+
const int64_t ne00 = src0->ne[0];
|
14475 |
+
const int64_t ne01 = src0->ne[1];
|
14476 |
+
const int64_t ne02 = src0->ne[2];
|
14477 |
+
//const int64_t ne03 = src0->ne[3];
|
14478 |
+
|
14479 |
+
const int64_t ne0 = dst->ne[0];
|
14480 |
+
const int64_t ne1 = dst->ne[1];
|
14481 |
+
const int64_t ne2 = dst->ne[2];
|
14482 |
+
|
14483 |
+
const int32_t w = ((const int32_t *)(opt0->data))[0];
|
14484 |
+
|
14485 |
+
// padding
|
14486 |
+
const int px = (w - ne1%w)%w;
|
14487 |
+
//const int py = (w - ne2%w)%w;
|
14488 |
+
|
14489 |
+
const int npx = (px + ne1)/w;
|
14490 |
+
//const int npy = (py + ne2)/w;
|
14491 |
+
|
14492 |
+
assert(ne0 == ne00);
|
14493 |
+
|
14494 |
+
// TODO: optimize / multi-thread
|
14495 |
+
for (int64_t i2 = 0; i2 < ne2; ++i2) {
|
14496 |
+
for (int64_t i1 = 0; i1 < ne1; ++i1) {
|
14497 |
+
for (int64_t i0 = 0; i0 < ne0; ++i0) {
|
14498 |
+
const int ip2 = i2/w;
|
14499 |
+
const int ip1 = i1/w;
|
14500 |
+
|
14501 |
+
const int64_t i02 = i2%w;
|
14502 |
+
const int64_t i01 = i1%w;
|
14503 |
+
const int64_t i00 = i0;
|
14504 |
+
|
14505 |
+
const int64_t i = (ip2*npx + ip1)*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00 + i00;
|
14506 |
+
const int64_t j = i2*ne1*ne0 + i1*ne0 + i0;
|
14507 |
+
|
14508 |
+
((float *) dst->data)[j] = ((float *) src0->data)[i];
|
14509 |
+
}
|
14510 |
+
}
|
14511 |
+
}
|
14512 |
+
}
|
14513 |
+
|
14514 |
+
static void ggml_compute_forward_win_unpart(
|
14515 |
+
const struct ggml_compute_params * params,
|
14516 |
+
const struct ggml_tensor * src0,
|
14517 |
+
const struct ggml_tensor * opt0,
|
14518 |
+
struct ggml_tensor * dst) {
|
14519 |
+
switch (src0->type) {
|
14520 |
+
case GGML_TYPE_F32:
|
14521 |
+
{
|
14522 |
+
ggml_compute_forward_win_unpart_f32(params, src0, opt0, dst);
|
14523 |
+
} break;
|
14524 |
+
default:
|
14525 |
+
{
|
14526 |
+
GGML_ASSERT(false);
|
14527 |
+
} break;
|
14528 |
+
}
|
14529 |
+
}
|
14530 |
+
|
14531 |
// ggml_compute_forward_map_unary
|
14532 |
|
14533 |
static void ggml_compute_forward_map_unary_f32(
|
|
|
14911 |
if (skip_cpu) {
|
14912 |
return;
|
14913 |
}
|
14914 |
+
GGML_ASSERT(tensor->src0 == NULL || tensor->src0->backend == GGML_BACKEND_CPU);
|
14915 |
GGML_ASSERT(tensor->src1 == NULL || tensor->src1->backend == GGML_BACKEND_CPU);
|
14916 |
#endif // GGML_USE_CUBLAS
|
14917 |
|
|
|
15000 |
{
|
15001 |
ggml_compute_forward_gelu(params, tensor->src0, tensor);
|
15002 |
} break;
|
15003 |
+
case GGML_OP_GELU_QUICK:
|
15004 |
+
{
|
15005 |
+
ggml_compute_forward_gelu_quick(params, tensor->src0, tensor);
|
15006 |
+
} break;
|
15007 |
case GGML_OP_SILU:
|
15008 |
{
|
15009 |
ggml_compute_forward_silu(params, tensor->src0, tensor);
|
|
|
15108 |
{
|
15109 |
ggml_compute_forward_clamp(params, tensor->src0, tensor->src1, tensor);
|
15110 |
} break;
|
15111 |
+
case GGML_OP_CONV_1D_S1_PH:
|
15112 |
+
{
|
15113 |
+
ggml_compute_forward_conv_1d_s1_ph(params, tensor->src0, tensor->src1, tensor);
|
15114 |
+
} break;
|
15115 |
+
case GGML_OP_CONV_1D_S2_PH:
|
15116 |
{
|
15117 |
+
ggml_compute_forward_conv_1d_s2_ph(params, tensor->src0, tensor->src1, tensor);
|
15118 |
} break;
|
15119 |
+
case GGML_OP_CONV_2D_SK_P0:
|
15120 |
{
|
15121 |
+
ggml_compute_forward_conv_2d_sk_p0(params, tensor->src0, tensor->src1, tensor);
|
15122 |
} break;
|
15123 |
case GGML_OP_FLASH_ATTN:
|
15124 |
{
|
15125 |
+
const int32_t t = ggml_get_i32_1d(tensor->opt[1], 0);
|
15126 |
GGML_ASSERT(t == 0 || t == 1);
|
15127 |
+
const bool masked = t != 0;
|
15128 |
ggml_compute_forward_flash_attn(params, tensor->src0, tensor->src1, tensor->opt[0], masked, tensor);
|
15129 |
} break;
|
15130 |
case GGML_OP_FLASH_FF:
|
|
|
15138 |
bool masked = t != 0;
|
15139 |
ggml_compute_forward_flash_attn_back(params, tensor->src0, tensor->src1, tensor->opt[0], tensor->opt[1], masked, tensor);
|
15140 |
} break;
|
15141 |
+
case GGML_OP_WIN_PART:
|
15142 |
+
{
|
15143 |
+
ggml_compute_forward_win_part(params, tensor->src0, tensor->opt[0], tensor);
|
15144 |
+
} break;
|
15145 |
+
case GGML_OP_WIN_UNPART:
|
15146 |
+
{
|
15147 |
+
ggml_compute_forward_win_unpart(params, tensor->src0, tensor->opt[0], tensor);
|
15148 |
+
} break;
|
15149 |
case GGML_OP_MAP_UNARY:
|
15150 |
{
|
15151 |
const ggml_unary_op_f32_t fun = *((ggml_unary_op_f32_t *)tensor->opt[0]->data);
|
|
|
15417 |
{
|
15418 |
GGML_ASSERT(false); // TODO: not implemented
|
15419 |
} break;
|
15420 |
+
case GGML_OP_GELU_QUICK:
|
15421 |
+
{
|
15422 |
+
GGML_ASSERT(false); // TODO: not implemented
|
15423 |
+
} break;
|
15424 |
case GGML_OP_ALIBI:
|
15425 |
{
|
15426 |
GGML_ASSERT(false); // TODO: not implemented
|
|
|
15783 |
// noop
|
15784 |
}
|
15785 |
} break;
|
15786 |
+
case GGML_OP_CONV_1D_S1_PH:
|
15787 |
+
{
|
15788 |
+
GGML_ASSERT(false); // TODO: not implemented
|
15789 |
+
} break;
|
15790 |
+
case GGML_OP_CONV_1D_S2_PH:
|
15791 |
{
|
15792 |
GGML_ASSERT(false); // TODO: not implemented
|
15793 |
} break;
|
15794 |
+
case GGML_OP_CONV_2D_SK_P0:
|
15795 |
{
|
15796 |
GGML_ASSERT(false); // TODO: not implemented
|
15797 |
} break;
|
|
|
15960 |
{
|
15961 |
GGML_ASSERT(false); // not supported
|
15962 |
} break;
|
15963 |
+
case GGML_OP_WIN_PART:
|
15964 |
+
case GGML_OP_WIN_UNPART:
|
15965 |
case GGML_OP_MAP_UNARY:
|
15966 |
case GGML_OP_MAP_BINARY:
|
15967 |
{
|
|
|
16035 |
GGML_ASSERT(cgraph->n_leafs < GGML_MAX_NODES);
|
16036 |
|
16037 |
if (strlen(node->name) == 0) {
|
16038 |
+
ggml_format_name(node, "leaf_%d", cgraph->n_leafs);
|
16039 |
}
|
16040 |
|
16041 |
cgraph->leafs[cgraph->n_leafs] = node;
|
|
|
16044 |
GGML_ASSERT(cgraph->n_nodes < GGML_MAX_NODES);
|
16045 |
|
16046 |
if (strlen(node->name) == 0) {
|
16047 |
+
ggml_format_name(node, "node_%d", cgraph->n_nodes);
|
16048 |
}
|
16049 |
|
16050 |
cgraph->nodes[cgraph->n_nodes] = node;
|
|
|
16370 |
} break;
|
16371 |
case GGML_OP_MUL:
|
16372 |
case GGML_OP_GELU:
|
16373 |
+
case GGML_OP_GELU_QUICK:
|
16374 |
case GGML_OP_SILU:
|
16375 |
case GGML_OP_SILU_BACK:
|
16376 |
case GGML_OP_NORM:
|
|
|
16477 |
{
|
16478 |
node->n_tasks = 1; //TODO
|
16479 |
} break;
|
16480 |
+
case GGML_OP_CONV_1D_S1_PH:
|
16481 |
+
case GGML_OP_CONV_1D_S2_PH:
|
16482 |
{
|
16483 |
node->n_tasks = n_threads;
|
16484 |
|
|
|
16505 |
GGML_ASSERT(false);
|
16506 |
}
|
16507 |
|
16508 |
+
work_size = MAX(work_size, cur);
|
16509 |
+
} break;
|
16510 |
+
case GGML_OP_CONV_2D_SK_P0:
|
16511 |
+
{
|
16512 |
+
node->n_tasks = n_threads;
|
16513 |
+
|
16514 |
+
GGML_ASSERT(node->src1->ne[3] == 1);
|
16515 |
+
|
16516 |
+
const int64_t ne00 = node->src0->ne[0]; // W
|
16517 |
+
const int64_t ne01 = node->src0->ne[1]; // H
|
16518 |
+
const int64_t ne02 = node->src0->ne[2]; // C
|
16519 |
+
const int64_t ne03 = node->src0->ne[3]; // N
|
16520 |
+
|
16521 |
+
const int64_t ne10 = node->src1->ne[0]; // W
|
16522 |
+
const int64_t ne11 = node->src1->ne[1]; // H
|
16523 |
+
const int64_t ne12 = node->src1->ne[2]; // C
|
16524 |
+
|
16525 |
+
const int64_t nk = ne00*ne01;
|
16526 |
+
|
16527 |
+
UNUSED(ne02);
|
16528 |
+
UNUSED(ne03);
|
16529 |
+
UNUSED(nk);
|
16530 |
+
|
16531 |
+
size_t cur = 0;
|
16532 |
+
|
16533 |
+
if (node->src0->type == GGML_TYPE_F16 &&
|
16534 |
+
node->src1->type == GGML_TYPE_F32) {
|
16535 |
+
cur = sizeof(ggml_fp16_t)*(ne10*ne11*ne12);
|
16536 |
+
} else if (node->src0->type == GGML_TYPE_F32 &&
|
16537 |
+
node->src1->type == GGML_TYPE_F32) {
|
16538 |
+
cur = sizeof(float)* (ne10*ne11*ne12);
|
16539 |
+
} else {
|
16540 |
+
GGML_ASSERT(false);
|
16541 |
+
}
|
16542 |
+
|
16543 |
work_size = MAX(work_size, cur);
|
16544 |
} break;
|
16545 |
case GGML_OP_FLASH_ATTN:
|
|
|
16601 |
|
16602 |
work_size = MAX(work_size, cur);
|
16603 |
} break;
|
16604 |
+
case GGML_OP_WIN_PART:
|
16605 |
+
case GGML_OP_WIN_UNPART:
|
16606 |
case GGML_OP_MAP_UNARY:
|
16607 |
case GGML_OP_MAP_BINARY:
|
16608 |
{
|
|
|
17135 |
|
17136 |
if (!*ctx_data) {
|
17137 |
fprintf(stderr, "%s: failed to create ggml context\n", __func__);
|
17138 |
+
fclose(fin);
|
17139 |
return result;
|
17140 |
}
|
17141 |
}
|
17142 |
|
17143 |
data = ggml_new_tensor_1d(*ctx_data, GGML_TYPE_I8, fsize);
|
17144 |
|
17145 |
+
{
|
17146 |
+
const size_t ret = fread(data->data, sizeof(char), fsize, fin);
|
17147 |
+
if (ret != fsize) {
|
17148 |
+
fprintf(stderr, "%s: failed to read %s\n", __func__, fname);
|
17149 |
+
fclose(fin);
|
17150 |
+
return result;
|
17151 |
+
}
|
17152 |
}
|
17153 |
|
17154 |
fclose(fin);
|
|
|
17428 |
return NULL;
|
17429 |
}
|
17430 |
|
17431 |
+
// Write one DOT edge "parent -> node" for a graph node to fp.
//
// fp     - output stream for the DOT text
// gb     - graph passed to ggml_graph_get_parent() for both endpoints
// node   - edge head
// parent - edge tail
// label  - text for the edge's label attribute
//
// When ggml_graph_get_parent() returns a tensor for an endpoint, that tensor
// is used as the endpoint with port "g"; otherwise the endpoint itself is
// used with port "x". A non-NULL result for `node` also switches the edge to
// an empty arrowhead and dashed style (vee / solid otherwise).
// NOTE(review): presumably ggml_graph_get_parent() finds the tensor whose
// gradient is the given tensor - confirm against its definition.
static void ggml_graph_dump_dot_node_edge(FILE * fp, const struct ggml_cgraph * gb, struct ggml_tensor * node, struct ggml_tensor * parent, const char * label) {
    struct ggml_tensor * head_owner = ggml_graph_get_parent(gb, node);
    struct ggml_tensor * tail_owner = ggml_graph_get_parent(gb, parent);

    // tail endpoint (drawn first in the DOT edge statement)
    void       * tail_id   = (void *) parent;
    const char * tail_port = "x";
    if (tail_owner) {
        tail_id   = (void *) tail_owner;
        tail_port = "g";
    }

    // head endpoint plus the edge appearance that depends on it
    void       * head_id   = (void *) node;
    const char * head_port = "x";
    const char * arrowhead = "vee";
    const char * style     = "solid";
    if (head_owner) {
        head_id   = (void *) head_owner;
        head_port = "g";
        arrowhead = "empty";
        style     = "dashed";
    }

    fprintf(fp, " \"%p\":%s -> \"%p\":%s [ arrowhead = %s; style = %s; label = \"%s\"; ]\n",
            tail_id, tail_port,
            head_id, head_port,
            arrowhead,
            style,
            label);
}
|
17443 |
+
|
17444 |
+
// Write one DOT edge "parent -> node" for a leaf tensor to fp.
//
// fp     - output stream for the DOT text
// node   - edge head
// parent - edge tail
// label  - text for the edge's label attribute
//
// Both endpoints are identified by their pointer value and always use port "x".
static void ggml_graph_dump_dot_leaf_edge(FILE * fp, struct ggml_tensor * node, struct ggml_tensor * parent, const char * label) {
    const char * const port = "x";

    void * const tail_id = (void *) parent;
    void * const head_id = (void *) node;

    fprintf(fp, " \"%p\":%s -> \"%p\":%s [ label = \"%s\"; ]\n",
            tail_id, port,
            head_id, port,
            label);
}
|
17450 |
+
|
17451 |
void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename) {
|
17452 |
char color[16];
|
17453 |
|
|
|
17483 |
(void *) node, color);
|
17484 |
|
17485 |
if (strlen(node->name) > 0) {
|
17486 |
+
fprintf(fp, "%s (%s)|", node->name, ggml_type_name(node->type));
|
17487 |
+
} else {
|
17488 |
+
fprintf(fp, "(%s)|", ggml_type_name(node->type));
|
17489 |
}
|
17490 |
|
17491 |
if (node->n_dims == 2) {
|
|
|
17494 |
fprintf(fp, "%d [%" PRId64 ", %" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], node->ne[2], GGML_OP_SYMBOL[node->op]);
|
17495 |
}
|
17496 |
|
|
|
17497 |
if (node->grad) {
|
17498 |
fprintf(fp, " | <g>%s\"; ]\n", GGML_OP_SYMBOL[node->grad->op]);
|
17499 |
} else {
|
|
|
17512 |
(void *) node, color);
|
17513 |
|
17514 |
if (strlen(node->name) > 0) {
|
17515 |
+
fprintf(fp, "%s (%s)|", node->name, ggml_type_name(node->type));
|
17516 |
+
} else {
|
17517 |
+
fprintf(fp, "(%s)|", ggml_type_name(node->type));
|
17518 |
}
|
17519 |
+
|
17520 |
+
fprintf(fp, "CONST %d [%" PRId64 ", %" PRId64 "]", i, node->ne[0], node->ne[1]);
|
17521 |
+
if (ggml_nelements(node) < 5) {
|
17522 |
+
fprintf(fp, " | (");
|
17523 |
+
for (int j = 0; j < ggml_nelements(node); j++) {
|
17524 |
+
if (node->type == GGML_TYPE_I8 || node->type == GGML_TYPE_I16 || node->type == GGML_TYPE_I32) {
|
17525 |
+
fprintf(fp, "%d", ggml_get_i32_1d(node, j));
|
17526 |
+
}
|
17527 |
+
else if (node->type == GGML_TYPE_F32 || node->type == GGML_TYPE_F16) {
|
17528 |
+
fprintf(fp, "%.1e", (double)ggml_get_f32_1d(node, j));
|
17529 |
+
}
|
17530 |
+
else {
|
17531 |
+
fprintf(fp, "#");
|
17532 |
+
}
|
17533 |
+
if (j < ggml_nelements(node) - 1) {
|
17534 |
+
fprintf(fp, ", ");
|
17535 |
+
}
|
17536 |
}
|
17537 |
+
fprintf(fp, ")");
|
|
|
|
|
17538 |
}
|
17539 |
fprintf(fp, "\"; ]\n");
|
17540 |
}
|
|
|
17542 |
for (int i = 0; i < gb->n_nodes; i++) {
|
17543 |
struct ggml_tensor * node = gb->nodes[i];
|
17544 |
|
|
|
|
|
17545 |
if (node->src0) {
|
17546 |
+
ggml_graph_dump_dot_node_edge(fp, gb, node, node->src0, "x");
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
17547 |
}
|
17548 |
|
17549 |
if (node->src1) {
|
17550 |
+
ggml_graph_dump_dot_node_edge(fp, gb, node, node->src1, "y");
|
17551 |
+
}
|
17552 |
+
|
17553 |
+
for (int j = 0; j < GGML_MAX_OPT; j++) {
|
17554 |
+
if (node->opt[j]) {
|
17555 |
+
char label[16];
|
17556 |
+
snprintf(label, sizeof(label), "opt %d", j);
|
17557 |
+
ggml_graph_dump_dot_node_edge(fp, gb, node, node->opt[j], label);
|
17558 |
+
}
|
17559 |
}
|
17560 |
}
|
17561 |
|
|
|
17563 |
struct ggml_tensor * node = gb->leafs[i];
|
17564 |
|
17565 |
if (node->src0) {
|
17566 |
+
ggml_graph_dump_dot_leaf_edge(fp, node, node->src0, "x");
|
|
|
|
|
17567 |
}
|
17568 |
|
17569 |
if (node->src1) {
|
17570 |
+
ggml_graph_dump_dot_leaf_edge(fp, node, node->src1, "y");
|
17571 |
+
}
|
17572 |
+
|
17573 |
+
for (int j = 0; j < GGML_MAX_OPT; j++) {
|
17574 |
+
if (node->opt[j]) {
|
17575 |
+
char label[16];
|
17576 |
+
snprintf(label, sizeof(label), "opt %d", j);
|
17577 |
+
ggml_graph_dump_dot_leaf_edge(fp, node, node->opt[j], label);
|
17578 |
+
}
|
17579 |
}
|
17580 |
}
|
17581 |
|
|
|
18294 |
ggml_set_zero(opt->lbfgs.g);
|
18295 |
ggml_set_zero(opt->lbfgs.gp);
|
18296 |
ggml_set_zero(opt->lbfgs.d);
|
|
|
18297 |
if (opt->lbfgs.pf) {
|
18298 |
ggml_set_zero(opt->lbfgs.pf);
|
18299 |
}
|
ggml.h
CHANGED
@@ -303,6 +303,7 @@ extern "C" {
|
|
303 |
GGML_OP_STEP,
|
304 |
GGML_OP_RELU,
|
305 |
GGML_OP_GELU,
|
|
|
306 |
GGML_OP_SILU,
|
307 |
GGML_OP_SILU_BACK,
|
308 |
GGML_OP_NORM, // normalize
|
@@ -331,12 +332,15 @@ extern "C" {
|
|
331 |
GGML_OP_ROPE_BACK,
|
332 |
GGML_OP_ALIBI,
|
333 |
GGML_OP_CLAMP,
|
334 |
-
|
335 |
-
|
|
|
336 |
|
337 |
GGML_OP_FLASH_ATTN,
|
338 |
GGML_OP_FLASH_FF,
|
339 |
GGML_OP_FLASH_ATTN_BACK,
|
|
|
|
|
340 |
|
341 |
GGML_OP_MAP_UNARY,
|
342 |
GGML_OP_MAP_BINARY,
|
@@ -500,8 +504,9 @@ extern "C" {
|
|
500 |
GGML_API size_t ggml_set_scratch (struct ggml_context * ctx, struct ggml_scratch scratch);
|
501 |
GGML_API void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc);
|
502 |
|
503 |
-
GGML_API void * ggml_get_mem_buffer(struct ggml_context * ctx);
|
504 |
-
GGML_API size_t ggml_get_mem_size
|
|
|
505 |
|
506 |
GGML_API struct ggml_tensor * ggml_new_tensor(
|
507 |
struct ggml_context * ctx,
|
@@ -556,8 +561,9 @@ extern "C" {
|
|
556 |
GGML_API void * ggml_get_data (const struct ggml_tensor * tensor);
|
557 |
GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
|
558 |
|
559 |
-
GGML_API const char *
|
560 |
-
GGML_API
|
|
|
561 |
|
562 |
//
|
563 |
// operations on tensors with backpropagation
|
@@ -610,24 +616,47 @@ extern "C" {
|
|
610 |
struct ggml_tensor * a,
|
611 |
struct ggml_tensor * b);
|
612 |
|
|
|
|
|
|
|
|
|
|
|
613 |
GGML_API struct ggml_tensor * ggml_mul(
|
614 |
struct ggml_context * ctx,
|
615 |
struct ggml_tensor * a,
|
616 |
struct ggml_tensor * b);
|
617 |
|
|
|
|
|
|
|
|
|
|
|
618 |
GGML_API struct ggml_tensor * ggml_div(
|
619 |
struct ggml_context * ctx,
|
620 |
struct ggml_tensor * a,
|
621 |
struct ggml_tensor * b);
|
622 |
|
|
|
|
|
|
|
|
|
|
|
623 |
GGML_API struct ggml_tensor * ggml_sqr(
|
624 |
struct ggml_context * ctx,
|
625 |
struct ggml_tensor * a);
|
626 |
|
|
|
|
|
|
|
|
|
627 |
GGML_API struct ggml_tensor * ggml_sqrt(
|
628 |
struct ggml_context * ctx,
|
629 |
struct ggml_tensor * a);
|
630 |
|
|
|
|
|
|
|
|
|
631 |
GGML_API struct ggml_tensor * ggml_log(
|
632 |
struct ggml_context * ctx,
|
633 |
struct ggml_tensor * a);
|
@@ -667,31 +696,67 @@ extern "C" {
|
|
667 |
struct ggml_context * ctx,
|
668 |
struct ggml_tensor * a);
|
669 |
|
|
|
|
|
|
|
|
|
670 |
GGML_API struct ggml_tensor * ggml_sgn(
|
671 |
struct ggml_context * ctx,
|
672 |
struct ggml_tensor * a);
|
673 |
|
|
|
|
|
|
|
|
|
674 |
GGML_API struct ggml_tensor * ggml_neg(
|
675 |
struct ggml_context * ctx,
|
676 |
struct ggml_tensor * a);
|
677 |
|
|
|
|
|
|
|
|
|
678 |
GGML_API struct ggml_tensor * ggml_step(
|
679 |
struct ggml_context * ctx,
|
680 |
struct ggml_tensor * a);
|
681 |
|
|
|
|
|
|
|
|
|
682 |
GGML_API struct ggml_tensor * ggml_relu(
|
683 |
struct ggml_context * ctx,
|
684 |
struct ggml_tensor * a);
|
685 |
|
|
|
|
|
|
|
|
|
686 |
// TODO: double-check this computation is correct
|
687 |
GGML_API struct ggml_tensor * ggml_gelu(
|
688 |
struct ggml_context * ctx,
|
689 |
struct ggml_tensor * a);
|
690 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
691 |
GGML_API struct ggml_tensor * ggml_silu(
|
692 |
struct ggml_context * ctx,
|
693 |
struct ggml_tensor * a);
|
694 |
|
|
|
|
|
|
|
|
|
695 |
// a - x
|
696 |
// b - dy
|
697 |
GGML_API struct ggml_tensor * ggml_silu_back(
|
@@ -705,10 +770,18 @@ extern "C" {
|
|
705 |
struct ggml_context * ctx,
|
706 |
struct ggml_tensor * a);
|
707 |
|
|
|
|
|
|
|
|
|
708 |
GGML_API struct ggml_tensor * ggml_rms_norm(
|
709 |
struct ggml_context * ctx,
|
710 |
struct ggml_tensor * a);
|
711 |
|
|
|
|
|
|
|
|
|
712 |
// a - x
|
713 |
// b - dy
|
714 |
GGML_API struct ggml_tensor * ggml_rms_norm_back(
|
@@ -998,16 +1071,55 @@ extern "C" {
|
|
998 |
float min,
|
999 |
float max);
|
1000 |
|
1001 |
-
//
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1002 |
// TODO: we don't support extra parameters for now
|
1003 |
// that's why we are hard-coding the stride, padding, and dilation
|
1004 |
// not great ..
|
1005 |
-
|
|
|
|
|
|
|
|
|
|
|
1006 |
struct ggml_context * ctx,
|
1007 |
struct ggml_tensor * a,
|
1008 |
struct ggml_tensor * b);
|
1009 |
|
1010 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1011 |
struct ggml_context * ctx,
|
1012 |
struct ggml_tensor * a,
|
1013 |
struct ggml_tensor * b);
|
@@ -1035,6 +1147,26 @@ extern "C" {
|
|
1035 |
struct ggml_tensor * c0,
|
1036 |
struct ggml_tensor * c1);
|
1037 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1038 |
// Mapping operations
|
1039 |
typedef void (*ggml_unary_op_f32_t)(const int, float *, const float *);
|
1040 |
typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
|
|
|
303 |
GGML_OP_STEP,
|
304 |
GGML_OP_RELU,
|
305 |
GGML_OP_GELU,
|
306 |
+
GGML_OP_GELU_QUICK,
|
307 |
GGML_OP_SILU,
|
308 |
GGML_OP_SILU_BACK,
|
309 |
GGML_OP_NORM, // normalize
|
|
|
332 |
GGML_OP_ROPE_BACK,
|
333 |
GGML_OP_ALIBI,
|
334 |
GGML_OP_CLAMP,
|
335 |
+
GGML_OP_CONV_1D_S1_PH,
|
336 |
+
GGML_OP_CONV_1D_S2_PH,
|
337 |
+
GGML_OP_CONV_2D_SK_P0,
|
338 |
|
339 |
GGML_OP_FLASH_ATTN,
|
340 |
GGML_OP_FLASH_FF,
|
341 |
GGML_OP_FLASH_ATTN_BACK,
|
342 |
+
GGML_OP_WIN_PART,
|
343 |
+
GGML_OP_WIN_UNPART,
|
344 |
|
345 |
GGML_OP_MAP_UNARY,
|
346 |
GGML_OP_MAP_BINARY,
|
|
|
504 |
GGML_API size_t ggml_set_scratch (struct ggml_context * ctx, struct ggml_scratch scratch);
|
505 |
GGML_API void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc);
|
506 |
|
507 |
+
GGML_API void * ggml_get_mem_buffer (const struct ggml_context * ctx);
|
508 |
+
GGML_API size_t ggml_get_mem_size (const struct ggml_context * ctx);
|
509 |
+
GGML_API size_t ggml_get_max_tensor_size(const struct ggml_context * ctx);
|
510 |
|
511 |
GGML_API struct ggml_tensor * ggml_new_tensor(
|
512 |
struct ggml_context * ctx,
|
|
|
561 |
GGML_API void * ggml_get_data (const struct ggml_tensor * tensor);
|
562 |
GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
|
563 |
|
564 |
+
GGML_API const char * ggml_get_name(const struct ggml_tensor * tensor);
|
565 |
+
GGML_API struct ggml_tensor * ggml_set_name(struct ggml_tensor * tensor, const char * name);
|
566 |
+
GGML_API struct ggml_tensor * ggml_format_name(struct ggml_tensor * tensor, const char * fmt, ...);
|
567 |
|
568 |
//
|
569 |
// operations on tensors with backpropagation
|
|
|
616 |
struct ggml_tensor * a,
|
617 |
struct ggml_tensor * b);
|
618 |
|
619 |
+
GGML_API struct ggml_tensor * ggml_sub_inplace(
|
620 |
+
struct ggml_context * ctx,
|
621 |
+
struct ggml_tensor * a,
|
622 |
+
struct ggml_tensor * b);
|
623 |
+
|
624 |
GGML_API struct ggml_tensor * ggml_mul(
|
625 |
struct ggml_context * ctx,
|
626 |
struct ggml_tensor * a,
|
627 |
struct ggml_tensor * b);
|
628 |
|
629 |
+
GGML_API struct ggml_tensor * ggml_mul_inplace(
|
630 |
+
struct ggml_context * ctx,
|
631 |
+
struct ggml_tensor * a,
|
632 |
+
struct ggml_tensor * b);
|
633 |
+
|
634 |
GGML_API struct ggml_tensor * ggml_div(
|
635 |
struct ggml_context * ctx,
|
636 |
struct ggml_tensor * a,
|
637 |
struct ggml_tensor * b);
|
638 |
|
639 |
+
GGML_API struct ggml_tensor * ggml_div_inplace(
|
640 |
+
struct ggml_context * ctx,
|
641 |
+
struct ggml_tensor * a,
|
642 |
+
struct ggml_tensor * b);
|
643 |
+
|
644 |
GGML_API struct ggml_tensor * ggml_sqr(
|
645 |
struct ggml_context * ctx,
|
646 |
struct ggml_tensor * a);
|
647 |
|
648 |
+
GGML_API struct ggml_tensor * ggml_sqr_inplace(
|
649 |
+
struct ggml_context * ctx,
|
650 |
+
struct ggml_tensor * a);
|
651 |
+
|
652 |
GGML_API struct ggml_tensor * ggml_sqrt(
|
653 |
struct ggml_context * ctx,
|
654 |
struct ggml_tensor * a);
|
655 |
|
656 |
+
GGML_API struct ggml_tensor * ggml_sqrt_inplace(
|
657 |
+
struct ggml_context * ctx,
|
658 |
+
struct ggml_tensor * a);
|
659 |
+
|
660 |
GGML_API struct ggml_tensor * ggml_log(
|
661 |
struct ggml_context * ctx,
|
662 |
struct ggml_tensor * a);
|
|
|
696 |
struct ggml_context * ctx,
|
697 |
struct ggml_tensor * a);
|
698 |
|
699 |
+
GGML_API struct ggml_tensor * ggml_abs_inplace(
|
700 |
+
struct ggml_context * ctx,
|
701 |
+
struct ggml_tensor * a);
|
702 |
+
|
703 |
GGML_API struct ggml_tensor * ggml_sgn(
|
704 |
struct ggml_context * ctx,
|
705 |
struct ggml_tensor * a);
|
706 |
|
707 |
+
GGML_API struct ggml_tensor * ggml_sgn_inplace(
|
708 |
+
struct ggml_context * ctx,
|
709 |
+
struct ggml_tensor * a);
|
710 |
+
|
711 |
GGML_API struct ggml_tensor * ggml_neg(
|
712 |
struct ggml_context * ctx,
|
713 |
struct ggml_tensor * a);
|
714 |
|
715 |
+
GGML_API struct ggml_tensor * ggml_neg_inplace(
|
716 |
+
struct ggml_context * ctx,
|
717 |
+
struct ggml_tensor * a);
|
718 |
+
|
719 |
GGML_API struct ggml_tensor * ggml_step(
|
720 |
struct ggml_context * ctx,
|
721 |
struct ggml_tensor * a);
|
722 |
|
723 |
+
GGML_API struct ggml_tensor * ggml_step_inplace(
|
724 |
+
struct ggml_context * ctx,
|
725 |
+
struct ggml_tensor * a);
|
726 |
+
|
727 |
GGML_API struct ggml_tensor * ggml_relu(
|
728 |
struct ggml_context * ctx,
|
729 |
struct ggml_tensor * a);
|
730 |
|
731 |
+
GGML_API struct ggml_tensor * ggml_relu_inplace(
|
732 |
+
struct ggml_context * ctx,
|
733 |
+
struct ggml_tensor * a);
|
734 |
+
|
735 |
// TODO: double-check this computation is correct
|
736 |
GGML_API struct ggml_tensor * ggml_gelu(
|
737 |
struct ggml_context * ctx,
|
738 |
struct ggml_tensor * a);
|
739 |
|
740 |
+
GGML_API struct ggml_tensor * ggml_gelu_inplace(
|
741 |
+
struct ggml_context * ctx,
|
742 |
+
struct ggml_tensor * a);
|
743 |
+
|
744 |
+
GGML_API struct ggml_tensor * ggml_gelu_quick(
|
745 |
+
struct ggml_context * ctx,
|
746 |
+
struct ggml_tensor * a);
|
747 |
+
|
748 |
+
GGML_API struct ggml_tensor * ggml_gelu_quick_inplace(
|
749 |
+
struct ggml_context * ctx,
|
750 |
+
struct ggml_tensor * a);
|
751 |
+
|
752 |
GGML_API struct ggml_tensor * ggml_silu(
|
753 |
struct ggml_context * ctx,
|
754 |
struct ggml_tensor * a);
|
755 |
|
756 |
+
GGML_API struct ggml_tensor * ggml_silu_inplace(
|
757 |
+
struct ggml_context * ctx,
|
758 |
+
struct ggml_tensor * a);
|
759 |
+
|
760 |
// a - x
|
761 |
// b - dy
|
762 |
GGML_API struct ggml_tensor * ggml_silu_back(
|
|
|
770 |
struct ggml_context * ctx,
|
771 |
struct ggml_tensor * a);
|
772 |
|
773 |
+
GGML_API struct ggml_tensor * ggml_norm_inplace(
|
774 |
+
struct ggml_context * ctx,
|
775 |
+
struct ggml_tensor * a);
|
776 |
+
|
777 |
GGML_API struct ggml_tensor * ggml_rms_norm(
|
778 |
struct ggml_context * ctx,
|
779 |
struct ggml_tensor * a);
|
780 |
|
781 |
+
GGML_API struct ggml_tensor * ggml_rms_norm_inplace(
|
782 |
+
struct ggml_context * ctx,
|
783 |
+
struct ggml_tensor * a);
|
784 |
+
|
785 |
// a - x
|
786 |
// b - dy
|
787 |
GGML_API struct ggml_tensor * ggml_rms_norm_back(
|
|
|
1071 |
float min,
|
1072 |
float max);
|
1073 |
|
1074 |
+
// TODO: implement general-purpose convolutions
|
1075 |
+
// GGML_API struct ggml_tensor * ggml_conv_1d(
|
1076 |
+
// struct ggml_context * ctx,
|
1077 |
+
// struct ggml_tensor * a,
|
1078 |
+
// struct ggml_tensor * b,
|
1079 |
+
// int s0,
|
1080 |
+
// int p0,
|
1081 |
+
// int d0);
|
1082 |
+
//
|
1083 |
+
// GGML_API struct ggml_tensor * ggml_conv_2d(
|
1084 |
+
// struct ggml_context * ctx,
|
1085 |
+
// struct ggml_tensor * a,
|
1086 |
+
// struct ggml_tensor * b,
|
1087 |
+
// int s0,
|
1088 |
+
// int s1,
|
1089 |
+
// int p0,
|
1090 |
+
// int p1,
|
1091 |
+
// int d0,
|
1092 |
+
// int d1);
|
1093 |
+
|
1094 |
+
// padding = half
|
1095 |
// TODO: we don't support extra parameters for now
|
1096 |
// that's why we are hard-coding the stride, padding, and dilation
|
1097 |
// not great ..
|
1098 |
+
// example:
|
1099 |
+
// a: 3 80 768 1
|
1100 |
+
// b: 3000 80 1 1
|
1101 |
+
// res: 3000 768 1 1
|
1102 |
+
// used in whisper
|
1103 |
+
GGML_API struct ggml_tensor * ggml_conv_1d_s1_ph(
|
1104 |
struct ggml_context * ctx,
|
1105 |
struct ggml_tensor * a,
|
1106 |
struct ggml_tensor * b);
|
1107 |
|
1108 |
+
// used in whisper
|
1109 |
+
GGML_API struct ggml_tensor * ggml_conv_1d_s2_ph(
|
1110 |
+
struct ggml_context * ctx,
|
1111 |
+
struct ggml_tensor * a,
|
1112 |
+
struct ggml_tensor * b);
|
1113 |
+
|
1114 |
+
// kernel size is a->ne[0] x a->ne[1]
|
1115 |
+
// stride is equal to kernel size
|
1116 |
+
// padding is zero
|
1117 |
+
// example:
|
1118 |
+
// a: 16 16 3 768
|
1119 |
+
// b: 1024 1024 3 1
|
1120 |
+
// res: 64 64 768 1
|
1121 |
+
// used in sam
|
1122 |
+
GGML_API struct ggml_tensor * ggml_conv_2d_sk_p0(
|
1123 |
struct ggml_context * ctx,
|
1124 |
struct ggml_tensor * a,
|
1125 |
struct ggml_tensor * b);
|
|
|
1147 |
struct ggml_tensor * c0,
|
1148 |
struct ggml_tensor * c1);
|
1149 |
|
1150 |
+
// partition into non-overlapping windows with padding if needed
|
1151 |
+
// example:
|
1152 |
+
// a: 768 64 64 1
|
1153 |
+
// w: 14
|
1154 |
+
// res: 768 14 14 25
|
1155 |
+
// used in sam
|
1156 |
+
GGML_API struct ggml_tensor * ggml_win_part(
|
1157 |
+
struct ggml_context * ctx,
|
1158 |
+
struct ggml_tensor * a,
|
1159 |
+
int w);
|
1160 |
+
|
1161 |
+
// reverse of ggml_win_part
|
1162 |
+
// used in sam
|
1163 |
+
GGML_API struct ggml_tensor * ggml_win_unpart(
|
1164 |
+
struct ggml_context * ctx,
|
1165 |
+
struct ggml_tensor * a,
|
1166 |
+
int w0,
|
1167 |
+
int h0,
|
1168 |
+
int w);
|
1169 |
+
|
1170 |
// Mapping operations
|
1171 |
typedef void (*ggml_unary_op_f32_t)(const int, float *, const float *);
|
1172 |
typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
|
gpttype_adapter.cpp
CHANGED
@@ -68,7 +68,7 @@ static int n_batch = 8;
|
|
68 |
static bool useSmartContext = false;
|
69 |
static bool unbanTokens = false;
|
70 |
static int blasbatchsize = 512;
|
71 |
-
static
|
72 |
static std::string modelname;
|
73 |
static std::vector<gpt_vocab::id> last_n_tokens;
|
74 |
static std::vector<gpt_vocab::id> current_context_tokens;
|
@@ -78,6 +78,7 @@ static std::vector<int> smartcontext;
|
|
78 |
static std::vector<std::string> stop_sequence;
|
79 |
static std::vector<llama_token_data> top_picks;
|
80 |
static int remaining_tokens = 0;
|
|
|
81 |
static std::string concat_output = "";
|
82 |
|
83 |
inline bool IsNanCheck(float f)
|
@@ -118,7 +119,7 @@ llama_token sample_token(llama_token_data_array * candidates, std::mt19937 & rng
|
|
118 |
std::discrete_distribution<> dist(probs.begin(), probs.end());
|
119 |
int idx = dist(rng);
|
120 |
|
121 |
-
if(debugmode)
|
122 |
{
|
123 |
top_picks.push_back(candidates->data[idx]);
|
124 |
for (size_t i = 0; (i < candidates->size && i<4); ++i)
|
@@ -308,8 +309,13 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
|
|
308 |
params.memory_f16 = inputs.f16_kv;
|
309 |
params.n_ctx = inputs.max_context_length;
|
310 |
|
311 |
-
neox_ctx_v2.hparams.n_ctx
|
312 |
-
=
|
|
|
|
|
|
|
|
|
|
|
313 |
|
314 |
printf("System Info: %s\n", llama_print_system_info());
|
315 |
SetQuantsUnshuffled(false);
|
@@ -387,9 +393,15 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
|
|
387 |
{
|
388 |
printf("\nAttempting to apply LORA adapter: %s\n", lora_filename.c_str());
|
389 |
|
|
|
|
|
|
|
|
|
|
|
|
|
390 |
int err = llama_apply_lora_from_file(llama_ctx_v3,
|
391 |
lora_filename.c_str(),
|
392 |
-
|
393 |
n_threads);
|
394 |
if (err != 0)
|
395 |
{
|
@@ -539,7 +551,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
|
|
539 |
return res;
|
540 |
}
|
541 |
// determine the required inference memory per token:
|
542 |
-
gpt2_eval(gpt2_ctx_v3, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token,
|
543 |
return ModelLoadResult::SUCCESS;
|
544 |
}
|
545 |
else
|
@@ -606,14 +618,14 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
|
|
606 |
}
|
607 |
|
608 |
// determine the required inference memory per token:
|
609 |
-
gptj_eval(gptj_ctx_v3, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);
|
610 |
|
611 |
//if the logits are NAN or duplicated, it means the model is incompatible
|
612 |
std::vector<float> oldlogits(logits);
|
613 |
|
614 |
//this is another hack because they change the library - we run the eval through the model
|
615 |
//twice and compare logits. if they give the same logits for different inputs, model is broken
|
616 |
-
gptj_eval(gptj_ctx_v3, params.n_threads, 0, {4, 5, 6, 7}, logits, mem_per_token);
|
617 |
|
618 |
if(logits.size()>0 && (IsNanCheck(logits[0]) || LogitsDuplicated(oldlogits,logits)))
|
619 |
{
|
@@ -665,7 +677,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
|
|
665 |
{
|
666 |
if(file_format==FileFormat::NEOX_6|| file_format==FileFormat::NEOX_7)
|
667 |
{
|
668 |
-
ModelLoadResult res = gpt_neox_model_load(params.model, neox_ctx_v3, vocab, file_format);
|
669 |
if(res==ModelLoadResult::FAIL)
|
670 |
{
|
671 |
fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
|
@@ -678,7 +690,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
|
|
678 |
}
|
679 |
|
680 |
// determine the required inference memory per token:
|
681 |
-
gpt_neox_eval(neox_ctx_v3, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);
|
682 |
|
683 |
return ModelLoadResult::SUCCESS;
|
684 |
}
|
@@ -727,7 +739,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
|
|
727 |
}
|
728 |
else if(file_format==FileFormat::MPT_1)
|
729 |
{
|
730 |
-
bool res = mpt_model_load(params.model, mpt_ctx_v3, vocab);
|
731 |
if(res==false)
|
732 |
{
|
733 |
fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
|
@@ -735,7 +747,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
|
|
735 |
}
|
736 |
|
737 |
// determine the required inference memory per token:
|
738 |
-
mpt_eval(mpt_ctx_v3, params.n_threads, 0, { 0, 1, 2, 3 }, logits, false, mem_per_token);
|
739 |
return ModelLoadResult::SUCCESS;
|
740 |
}
|
741 |
else
|
@@ -748,6 +760,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
|
|
748 |
|
749 |
bool gpttype_generate_abort()
|
750 |
{
|
|
|
751 |
remaining_tokens = 0;
|
752 |
return true;
|
753 |
}
|
@@ -888,12 +901,13 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
|
|
888 |
current_context_tokens.resize(n_past);
|
889 |
|
890 |
remaining_tokens = params.n_predict;
|
891 |
-
|
892 |
int input_consumed = 0;
|
893 |
std::mt19937 rng(params.seed);
|
894 |
concat_output = "";
|
895 |
|
896 |
bool startedsampling = false;
|
|
|
897 |
|
898 |
timer_start();
|
899 |
double time1 = 0, time2 = 0;
|
@@ -981,9 +995,12 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
|
|
981 |
printf("Bad format!");
|
982 |
}
|
983 |
|
984 |
-
|
|
|
|
|
|
|
985 |
|
986 |
-
if (debugmode)
|
987 |
{
|
988 |
std::string outstr = "";
|
989 |
printf("\n[Debug: Dump Input Tokens, format: %d]\n", file_format);
|
@@ -1013,7 +1030,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
|
|
1013 |
// predict
|
1014 |
unsigned int embdsize = embd.size();
|
1015 |
//print progress
|
1016 |
-
if (!startedsampling)
|
1017 |
{
|
1018 |
printf("\rProcessing Prompt%s (%d / %d tokens)", (blasmode ? " [BLAS]" : ""), input_consumed, embd_inp.size());
|
1019 |
}
|
@@ -1065,7 +1082,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
|
|
1065 |
}
|
1066 |
else if(file_format==FileFormat::GPT2_4)
|
1067 |
{
|
1068 |
-
evalres = gpt2_eval(gpt2_ctx_v3, params.n_threads, n_past, embd, logits, mem_per_token,
|
1069 |
}
|
1070 |
else if(file_format==FileFormat::NEOX_1 || file_format == FileFormat::NEOX_2 || file_format == FileFormat::NEOX_3 || file_format==FileFormat::NEOX_4 || file_format==FileFormat::NEOX_5)
|
1071 |
{
|
@@ -1073,7 +1090,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
|
|
1073 |
}
|
1074 |
else if(file_format==FileFormat::NEOX_6|| file_format==FileFormat::NEOX_7)
|
1075 |
{
|
1076 |
-
evalres = gpt_neox_eval(neox_ctx_v3, params.n_threads, n_past, embd, logits, mem_per_token);
|
1077 |
}
|
1078 |
else if(file_format==FileFormat::GPTJ_1 || file_format==FileFormat::GPTJ_2)
|
1079 |
{
|
@@ -1085,11 +1102,11 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
|
|
1085 |
}
|
1086 |
else if(file_format==FileFormat::GPTJ_5)
|
1087 |
{
|
1088 |
-
evalres = gptj_eval(gptj_ctx_v3, params.n_threads, n_past, embd, logits, mem_per_token);
|
1089 |
}
|
1090 |
else if(file_format==FileFormat::MPT_1)
|
1091 |
{
|
1092 |
-
evalres = mpt_eval(mpt_ctx_v3, params.n_threads, n_past, embd, logits, false, mem_per_token);
|
1093 |
}
|
1094 |
else
|
1095 |
{
|
@@ -1126,7 +1143,10 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
|
|
1126 |
params.n_threads = original_threads;
|
1127 |
time1 = timer_check();
|
1128 |
timer_start();
|
1129 |
-
|
|
|
|
|
|
|
1130 |
}
|
1131 |
|
1132 |
unsigned int eosID = 0;
|
@@ -1229,11 +1249,11 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
|
|
1229 |
concat_output += tokenizedstr;
|
1230 |
}
|
1231 |
|
1232 |
-
if (startedsampling)
|
1233 |
{
|
1234 |
printf("\rGenerating (%d / %d tokens)", (params.n_predict - remaining_tokens), params.n_predict);
|
1235 |
}
|
1236 |
-
if(debugmode && top_picks.size()>0)
|
1237 |
{
|
1238 |
printf(" [");
|
1239 |
bool firstloop = true;
|
@@ -1253,6 +1273,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
|
|
1253 |
|
1254 |
if(unbanTokens && id==eosID)
|
1255 |
{
|
|
|
1256 |
printf("\n(EOS token triggered!)");
|
1257 |
remaining_tokens = 0;
|
1258 |
}
|
@@ -1263,7 +1284,10 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
|
|
1263 |
{
|
1264 |
stopper_unused_tokens = remaining_tokens;
|
1265 |
remaining_tokens = 0;
|
1266 |
-
|
|
|
|
|
|
|
1267 |
break;
|
1268 |
}
|
1269 |
}
|
|
|
68 |
static bool useSmartContext = false;
|
69 |
static bool unbanTokens = false;
|
70 |
static int blasbatchsize = 512;
|
71 |
+
static int debugmode = 0; //-1 = hide all, 0 = normal, 1 = showall
|
72 |
static std::string modelname;
|
73 |
static std::vector<gpt_vocab::id> last_n_tokens;
|
74 |
static std::vector<gpt_vocab::id> current_context_tokens;
|
|
|
78 |
static std::vector<std::string> stop_sequence;
|
79 |
static std::vector<llama_token_data> top_picks;
|
80 |
static int remaining_tokens = 0;
|
81 |
+
static int stopper_unused_tokens = 0;
|
82 |
static std::string concat_output = "";
|
83 |
|
84 |
inline bool IsNanCheck(float f)
|
|
|
119 |
std::discrete_distribution<> dist(probs.begin(), probs.end());
|
120 |
int idx = dist(rng);
|
121 |
|
122 |
+
if(debugmode==1)
|
123 |
{
|
124 |
top_picks.push_back(candidates->data[idx]);
|
125 |
for (size_t i = 0; (i < candidates->size && i<4); ++i)
|
|
|
309 |
params.memory_f16 = inputs.f16_kv;
|
310 |
params.n_ctx = inputs.max_context_length;
|
311 |
|
312 |
+
neox_ctx_v2.hparams.n_ctx = neox_ctx_v3.hparams.n_ctx
|
313 |
+
= gptj_ctx_v1.hparams.n_ctx = gptj_ctx_v2.hparams.n_ctx = gptj_ctx_v3.hparams.n_ctx
|
314 |
+
= gpt2_ctx_v1.hparams.n_ctx = gpt2_ctx_v2.hparams.n_ctx = gpt2_ctx_v3.hparams.n_ctx
|
315 |
+
= mpt_ctx_v3.hparams.n_ctx = params.n_ctx;
|
316 |
+
|
317 |
+
//this is used for the mem_per_token eval, openblas needs more RAM
|
318 |
+
bool use_scratch = ggml_cpu_has_gpublas();
|
319 |
|
320 |
printf("System Info: %s\n", llama_print_system_info());
|
321 |
SetQuantsUnshuffled(false);
|
|
|
393 |
{
|
394 |
printf("\nAttempting to apply LORA adapter: %s\n", lora_filename.c_str());
|
395 |
|
396 |
+
const char * lora_base_arg = NULL;
|
397 |
+
if (lora_base != "") {
|
398 |
+
printf("Using LORA base model: %s\n", lora_base.c_str());
|
399 |
+
lora_base_arg = lora_base.c_str();
|
400 |
+
}
|
401 |
+
|
402 |
int err = llama_apply_lora_from_file(llama_ctx_v3,
|
403 |
lora_filename.c_str(),
|
404 |
+
lora_base_arg,
|
405 |
n_threads);
|
406 |
if (err != 0)
|
407 |
{
|
|
|
551 |
return res;
|
552 |
}
|
553 |
// determine the required inference memory per token:
|
554 |
+
gpt2_eval(gpt2_ctx_v3, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token, use_scratch);
|
555 |
return ModelLoadResult::SUCCESS;
|
556 |
}
|
557 |
else
|
|
|
618 |
}
|
619 |
|
620 |
// determine the required inference memory per token:
|
621 |
+
gptj_eval(gptj_ctx_v3, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token, use_scratch);
|
622 |
|
623 |
//if the logits are NAN or duplicated, it means the model is incompatible
|
624 |
std::vector<float> oldlogits(logits);
|
625 |
|
626 |
//this is another hack because they change the library - we run the eval through the model
|
627 |
//twice and compare logits. if they give the same logits for different inputs, model is broken
|
628 |
+
gptj_eval(gptj_ctx_v3, params.n_threads, 0, {4, 5, 6, 7}, logits, mem_per_token, use_scratch);
|
629 |
|
630 |
if(logits.size()>0 && (IsNanCheck(logits[0]) || LogitsDuplicated(oldlogits,logits)))
|
631 |
{
|
|
|
677 |
{
|
678 |
if(file_format==FileFormat::NEOX_6|| file_format==FileFormat::NEOX_7)
|
679 |
{
|
680 |
+
ModelLoadResult res = gpt_neox_model_load(params.model, neox_ctx_v3, vocab, file_format, inputs.gpulayers);
|
681 |
if(res==ModelLoadResult::FAIL)
|
682 |
{
|
683 |
fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
|
|
|
690 |
}
|
691 |
|
692 |
// determine the required inference memory per token:
|
693 |
+
gpt_neox_eval(neox_ctx_v3, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token, use_scratch);
|
694 |
|
695 |
return ModelLoadResult::SUCCESS;
|
696 |
}
|
|
|
739 |
}
|
740 |
else if(file_format==FileFormat::MPT_1)
|
741 |
{
|
742 |
+
bool res = mpt_model_load(params.model, mpt_ctx_v3, vocab, inputs.gpulayers);
|
743 |
if(res==false)
|
744 |
{
|
745 |
fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
|
|
|
747 |
}
|
748 |
|
749 |
// determine the required inference memory per token:
|
750 |
+
mpt_eval(mpt_ctx_v3, params.n_threads, 0, { 0, 1, 2, 3 }, logits, false, mem_per_token, use_scratch);
|
751 |
return ModelLoadResult::SUCCESS;
|
752 |
}
|
753 |
else
|
|
|
760 |
|
761 |
bool gpttype_generate_abort()
|
762 |
{
|
763 |
+
stopper_unused_tokens = remaining_tokens;
|
764 |
remaining_tokens = 0;
|
765 |
return true;
|
766 |
}
|
|
|
901 |
current_context_tokens.resize(n_past);
|
902 |
|
903 |
remaining_tokens = params.n_predict;
|
904 |
+
stopper_unused_tokens = 0;
|
905 |
int input_consumed = 0;
|
906 |
std::mt19937 rng(params.seed);
|
907 |
concat_output = "";
|
908 |
|
909 |
bool startedsampling = false;
|
910 |
+
bool use_scratch = true; //for normal inference always use scratch
|
911 |
|
912 |
timer_start();
|
913 |
double time1 = 0, time2 = 0;
|
|
|
995 |
printf("Bad format!");
|
996 |
}
|
997 |
|
998 |
+
if(debugmode!=-1)
|
999 |
+
{
|
1000 |
+
printf("\n");
|
1001 |
+
}
|
1002 |
|
1003 |
+
if (debugmode==1)
|
1004 |
{
|
1005 |
std::string outstr = "";
|
1006 |
printf("\n[Debug: Dump Input Tokens, format: %d]\n", file_format);
|
|
|
1030 |
// predict
|
1031 |
unsigned int embdsize = embd.size();
|
1032 |
//print progress
|
1033 |
+
if (!startedsampling && debugmode!=-1)
|
1034 |
{
|
1035 |
printf("\rProcessing Prompt%s (%d / %d tokens)", (blasmode ? " [BLAS]" : ""), input_consumed, embd_inp.size());
|
1036 |
}
|
|
|
1082 |
}
|
1083 |
else if(file_format==FileFormat::GPT2_4)
|
1084 |
{
|
1085 |
+
evalres = gpt2_eval(gpt2_ctx_v3, params.n_threads, n_past, embd, logits, mem_per_token, use_scratch);
|
1086 |
}
|
1087 |
else if(file_format==FileFormat::NEOX_1 || file_format == FileFormat::NEOX_2 || file_format == FileFormat::NEOX_3 || file_format==FileFormat::NEOX_4 || file_format==FileFormat::NEOX_5)
|
1088 |
{
|
|
|
1090 |
}
|
1091 |
else if(file_format==FileFormat::NEOX_6|| file_format==FileFormat::NEOX_7)
|
1092 |
{
|
1093 |
+
evalres = gpt_neox_eval(neox_ctx_v3, params.n_threads, n_past, embd, logits, mem_per_token, use_scratch);
|
1094 |
}
|
1095 |
else if(file_format==FileFormat::GPTJ_1 || file_format==FileFormat::GPTJ_2)
|
1096 |
{
|
|
|
1102 |
}
|
1103 |
else if(file_format==FileFormat::GPTJ_5)
|
1104 |
{
|
1105 |
+
evalres = gptj_eval(gptj_ctx_v3, params.n_threads, n_past, embd, logits, mem_per_token, use_scratch);
|
1106 |
}
|
1107 |
else if(file_format==FileFormat::MPT_1)
|
1108 |
{
|
1109 |
+
evalres = mpt_eval(mpt_ctx_v3, params.n_threads, n_past, embd, logits, false, mem_per_token, use_scratch);
|
1110 |
}
|
1111 |
else
|
1112 |
{
|
|
|
1143 |
params.n_threads = original_threads;
|
1144 |
time1 = timer_check();
|
1145 |
timer_start();
|
1146 |
+
if(debugmode!=-1)
|
1147 |
+
{
|
1148 |
+
printf("\n");
|
1149 |
+
}
|
1150 |
}
|
1151 |
|
1152 |
unsigned int eosID = 0;
|
|
|
1249 |
concat_output += tokenizedstr;
|
1250 |
}
|
1251 |
|
1252 |
+
if (startedsampling && debugmode!=-1)
|
1253 |
{
|
1254 |
printf("\rGenerating (%d / %d tokens)", (params.n_predict - remaining_tokens), params.n_predict);
|
1255 |
}
|
1256 |
+
if(debugmode==1 && top_picks.size()>0)
|
1257 |
{
|
1258 |
printf(" [");
|
1259 |
bool firstloop = true;
|
|
|
1273 |
|
1274 |
if(unbanTokens && id==eosID)
|
1275 |
{
|
1276 |
+
stopper_unused_tokens = remaining_tokens;
|
1277 |
printf("\n(EOS token triggered!)");
|
1278 |
remaining_tokens = 0;
|
1279 |
}
|
|
|
1284 |
{
|
1285 |
stopper_unused_tokens = remaining_tokens;
|
1286 |
remaining_tokens = 0;
|
1287 |
+
if(debugmode!=-1)
|
1288 |
+
{
|
1289 |
+
printf("\n(Stop sequence triggered: <%s>)", matched.c_str());
|
1290 |
+
}
|
1291 |
break;
|
1292 |
}
|
1293 |
}
|
klite.embd
CHANGED
The diff for this file is too large to render.
See raw diff
|
|
koboldcpp.py
CHANGED
@@ -26,7 +26,7 @@ class load_model_inputs(ctypes.Structure):
|
|
26 |
("unban_tokens", ctypes.c_bool),
|
27 |
("clblast_info", ctypes.c_int),
|
28 |
("blasbatchsize", ctypes.c_int),
|
29 |
-
("debugmode", ctypes.
|
30 |
("forceversion", ctypes.c_int),
|
31 |
("gpulayers", ctypes.c_int)]
|
32 |
|
@@ -221,10 +221,12 @@ def utfprint(str):
|
|
221 |
#################################################################
|
222 |
friendlymodelname = "concedo/koboldcpp" # local kobold api apparently needs a hardcoded known HF model name
|
223 |
maxctx = 2048
|
224 |
-
|
|
|
225 |
modelbusy = False
|
226 |
defaultport = 5001
|
227 |
-
KcppVersion = "1.
|
|
|
228 |
|
229 |
class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
|
230 |
sys_version = ""
|
@@ -238,6 +240,12 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
|
|
238 |
def __call__(self, *args, **kwargs):
|
239 |
super().__init__(*args, **kwargs)
|
240 |
|
|
|
|
|
|
|
|
|
|
|
|
|
241 |
async def generate_text(self, newprompt, genparams, basic_api_flag, stream_flag):
|
242 |
|
243 |
def run_blocking():
|
@@ -281,7 +289,8 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
|
|
281 |
else:
|
282 |
recvtxt = run_blocking()
|
283 |
|
284 |
-
|
|
|
285 |
|
286 |
res = {"data": {"seqs":[recvtxt]}} if basic_api_flag else {"results": [{"text": recvtxt}]}
|
287 |
|
@@ -345,7 +354,7 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
|
|
345 |
|
346 |
|
347 |
def do_GET(self):
|
348 |
-
global maxctx,
|
349 |
self.path = self.path.rstrip('/')
|
350 |
response_body = None
|
351 |
|
@@ -371,10 +380,10 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
|
|
371 |
response_body = (json.dumps({'result': friendlymodelname }).encode())
|
372 |
|
373 |
elif self.path.endswith(('/api/v1/config/max_length', '/api/latest/config/max_length')):
|
374 |
-
response_body = (json.dumps({"value":
|
375 |
|
376 |
elif self.path.endswith(('/api/v1/config/max_context_length', '/api/latest/config/max_context_length')):
|
377 |
-
response_body = (json.dumps({"value": maxctx}).encode())
|
378 |
|
379 |
elif self.path.endswith(('/api/v1/config/soft_prompt', '/api/latest/config/soft_prompt')):
|
380 |
response_body = (json.dumps({"value":""}).encode())
|
@@ -414,7 +423,7 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
|
|
414 |
self.send_response(200)
|
415 |
self.end_headers()
|
416 |
self.wfile.write(json.dumps({"success": ("true" if ag else "false")}).encode())
|
417 |
-
print("
|
418 |
modelbusy = False
|
419 |
return
|
420 |
|
@@ -453,7 +462,8 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
|
|
453 |
utfprint("Body Err: " + str(body))
|
454 |
return self.send_response(503)
|
455 |
|
456 |
-
|
|
|
457 |
|
458 |
modelbusy = True
|
459 |
|
@@ -714,10 +724,17 @@ def main(args):
|
|
714 |
sys.exit(2)
|
715 |
|
716 |
if args.hordeconfig and args.hordeconfig[0]!="":
|
717 |
-
global friendlymodelname,
|
718 |
friendlymodelname = "koboldcpp/"+args.hordeconfig[0]
|
719 |
if len(args.hordeconfig) > 1:
|
720 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
721 |
|
722 |
if args.highpriority:
|
723 |
print("Setting process to Higher Priority - Use Caution")
|
@@ -839,9 +856,9 @@ if __name__ == '__main__':
|
|
839 |
parser.add_argument("--nommap", help="If set, do not use mmap to load newer models", action='store_true')
|
840 |
parser.add_argument("--usemlock", help="For Apple Systems. Force system to keep model in RAM rather than swapping or compressing", action='store_true')
|
841 |
parser.add_argument("--noavx2", help="Do not use AVX2 instructions, a slower compatibility mode for older devices. Does not work with --clblast.", action='store_true')
|
842 |
-
parser.add_argument("--debugmode", help="Shows additional debug info in the terminal.", action='
|
843 |
parser.add_argument("--skiplauncher", help="Doesn't display or use the new GUI launcher.", action='store_true')
|
844 |
-
parser.add_argument("--hordeconfig", help="Sets the display model name to something else, for easy use on AI Horde.
|
845 |
compatgroup = parser.add_mutually_exclusive_group()
|
846 |
compatgroup.add_argument("--noblas", help="Do not use OpenBLAS for accelerated prompt ingestion", action='store_true')
|
847 |
compatgroup.add_argument("--useclblast", help="Use CLBlast instead of OpenBLAS for prompt ingestion. Must specify exactly 2 arguments, platform ID and device ID (e.g. --useclblast 1 0).", type=int, choices=range(0,9), nargs=2)
|
|
|
26 |
("unban_tokens", ctypes.c_bool),
|
27 |
("clblast_info", ctypes.c_int),
|
28 |
("blasbatchsize", ctypes.c_int),
|
29 |
+
("debugmode", ctypes.c_int),
|
30 |
("forceversion", ctypes.c_int),
|
31 |
("gpulayers", ctypes.c_int)]
|
32 |
|
|
|
221 |
#################################################################
|
222 |
friendlymodelname = "concedo/koboldcpp" # local kobold api apparently needs a hardcoded known HF model name
|
223 |
maxctx = 2048
|
224 |
+
maxhordectx = 1024
|
225 |
+
maxhordelen = 256
|
226 |
modelbusy = False
|
227 |
defaultport = 5001
|
228 |
+
KcppVersion = "1.33"
|
229 |
+
showdebug = True
|
230 |
|
231 |
class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
|
232 |
sys_version = ""
|
|
|
240 |
def __call__(self, *args, **kwargs):
|
241 |
super().__init__(*args, **kwargs)
|
242 |
|
243 |
+
def log_message(self, format, *args):
|
244 |
+
global showdebug
|
245 |
+
if showdebug:
|
246 |
+
super().log_message(format, *args)
|
247 |
+
pass
|
248 |
+
|
249 |
async def generate_text(self, newprompt, genparams, basic_api_flag, stream_flag):
|
250 |
|
251 |
def run_blocking():
|
|
|
289 |
else:
|
290 |
recvtxt = run_blocking()
|
291 |
|
292 |
+
if args.debugmode!=-1:
|
293 |
+
utfprint("\nOutput: " + recvtxt)
|
294 |
|
295 |
res = {"data": {"seqs":[recvtxt]}} if basic_api_flag else {"results": [{"text": recvtxt}]}
|
296 |
|
|
|
354 |
|
355 |
|
356 |
def do_GET(self):
|
357 |
+
global maxctx, maxhordelen, friendlymodelname, KcppVersion, streamLock
|
358 |
self.path = self.path.rstrip('/')
|
359 |
response_body = None
|
360 |
|
|
|
380 |
response_body = (json.dumps({'result': friendlymodelname }).encode())
|
381 |
|
382 |
elif self.path.endswith(('/api/v1/config/max_length', '/api/latest/config/max_length')):
|
383 |
+
response_body = (json.dumps({"value": maxhordelen}).encode())
|
384 |
|
385 |
elif self.path.endswith(('/api/v1/config/max_context_length', '/api/latest/config/max_context_length')):
|
386 |
+
response_body = (json.dumps({"value": min(maxctx,maxhordectx)}).encode())
|
387 |
|
388 |
elif self.path.endswith(('/api/v1/config/soft_prompt', '/api/latest/config/soft_prompt')):
|
389 |
response_body = (json.dumps({"value":""}).encode())
|
|
|
423 |
self.send_response(200)
|
424 |
self.end_headers()
|
425 |
self.wfile.write(json.dumps({"success": ("true" if ag else "false")}).encode())
|
426 |
+
print("\nGeneration Aborted")
|
427 |
modelbusy = False
|
428 |
return
|
429 |
|
|
|
462 |
utfprint("Body Err: " + str(body))
|
463 |
return self.send_response(503)
|
464 |
|
465 |
+
if args.debugmode!=-1:
|
466 |
+
utfprint("\nInput: " + json.dumps(genparams))
|
467 |
|
468 |
modelbusy = True
|
469 |
|
|
|
724 |
sys.exit(2)
|
725 |
|
726 |
if args.hordeconfig and args.hordeconfig[0]!="":
|
727 |
+
global friendlymodelname, maxhordelen, maxhordectx, showdebug
|
728 |
friendlymodelname = "koboldcpp/"+args.hordeconfig[0]
|
729 |
if len(args.hordeconfig) > 1:
|
730 |
+
maxhordelen = int(args.hordeconfig[1])
|
731 |
+
if len(args.hordeconfig) > 2:
|
732 |
+
maxhordectx = int(args.hordeconfig[2])
|
733 |
+
if args.debugmode == 0:
|
734 |
+
args.debugmode = -1
|
735 |
+
|
736 |
+
if args.debugmode != 1:
|
737 |
+
showdebug = False
|
738 |
|
739 |
if args.highpriority:
|
740 |
print("Setting process to Higher Priority - Use Caution")
|
|
|
856 |
parser.add_argument("--nommap", help="If set, do not use mmap to load newer models", action='store_true')
|
857 |
parser.add_argument("--usemlock", help="For Apple Systems. Force system to keep model in RAM rather than swapping or compressing", action='store_true')
|
858 |
parser.add_argument("--noavx2", help="Do not use AVX2 instructions, a slower compatibility mode for older devices. Does not work with --clblast.", action='store_true')
|
859 |
+
parser.add_argument("--debugmode", help="Shows additional debug info in the terminal.", action='store_const', const=1, default=0)
|
860 |
parser.add_argument("--skiplauncher", help="Doesn't display or use the new GUI launcher.", action='store_true')
|
861 |
+
parser.add_argument("--hordeconfig", help="Sets the display model name to something else, for easy use on AI Horde. Optional additional parameters set the horde max genlength and max ctxlen.",metavar=('[hordename]', '[hordelength] [hordectx]'), nargs='+')
|
862 |
compatgroup = parser.add_mutually_exclusive_group()
|
863 |
compatgroup.add_argument("--noblas", help="Do not use OpenBLAS for accelerated prompt ingestion", action='store_true')
|
864 |
compatgroup.add_argument("--useclblast", help="Use CLBlast instead of OpenBLAS for prompt ingestion. Must specify exactly 2 arguments, platform ID and device ID (e.g. --useclblast 1 0).", type=int, choices=range(0,9), nargs=2)
|
llama.cpp
CHANGED
@@ -19,6 +19,11 @@
|
|
19 |
#ifdef GGML_USE_METAL
|
20 |
#include "ggml-metal.h"
|
21 |
#endif
|
|
|
|
|
|
|
|
|
|
|
22 |
|
23 |
#include <array>
|
24 |
#include <ctime>
|
@@ -75,7 +80,7 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
|
|
75 |
{ MODEL_3B, 256ull * MB },
|
76 |
{ MODEL_7B, 512ull * MB },
|
77 |
{ MODEL_13B, 512ull * MB },
|
78 |
-
{ MODEL_30B,
|
79 |
{ MODEL_65B, 1024ull * MB },
|
80 |
};
|
81 |
return k_sizes;
|
@@ -87,7 +92,7 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
|
|
87 |
{ MODEL_3B, 256ull * MB },
|
88 |
{ MODEL_7B, 512ull * MB },
|
89 |
{ MODEL_13B, 512ull * MB },
|
90 |
-
{ MODEL_30B,
|
91 |
{ MODEL_65B, 1024ull * MB },
|
92 |
};
|
93 |
return k_sizes;
|
@@ -100,7 +105,7 @@ static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
|
|
100 |
{ MODEL_3B, 682ull * MB },
|
101 |
{ MODEL_7B, 1026ull * MB },
|
102 |
{ MODEL_13B, 1608ull * MB },
|
103 |
-
{ MODEL_30B,
|
104 |
{ MODEL_65B, 5120ull * MB },
|
105 |
};
|
106 |
return k_sizes;
|
@@ -114,7 +119,7 @@ static const std::map<e_model, size_t> & MEM_REQ_EVAL()
|
|
114 |
{ MODEL_3B, 512ull * MB },
|
115 |
{ MODEL_7B, 800ull * MB },
|
116 |
{ MODEL_13B, 1024ull * MB },
|
117 |
-
{ MODEL_30B,
|
118 |
{ MODEL_65B, 1536ull * MB },
|
119 |
};
|
120 |
return k_sizes;
|
@@ -177,6 +182,19 @@ struct llama_kv_cache {
|
|
177 |
}
|
178 |
};
|
179 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
180 |
struct llama_model {
|
181 |
e_model type = MODEL_UNKNOWN;
|
182 |
|
@@ -193,10 +211,6 @@ struct llama_model {
|
|
193 |
// context
|
194 |
struct ggml_context * ctx = NULL;
|
195 |
|
196 |
-
// key + value cache for the self attention
|
197 |
-
// TODO: move to llama_state
|
198 |
-
struct llama_kv_cache kv_self;
|
199 |
-
|
200 |
// the model memory buffer
|
201 |
llama_ctx_buffer buf;
|
202 |
|
@@ -210,6 +224,11 @@ struct llama_model {
|
|
210 |
// for quantize-stats only
|
211 |
std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;
|
212 |
|
|
|
|
|
|
|
|
|
|
|
213 |
~llama_model() {
|
214 |
if (ctx) {
|
215 |
ggml_free(ctx);
|
@@ -228,24 +247,11 @@ struct llama_model {
|
|
228 |
}
|
229 |
};
|
230 |
|
231 |
-
struct llama_vocab {
|
232 |
-
using id = int32_t;
|
233 |
-
using token = std::string;
|
234 |
-
|
235 |
-
struct token_score {
|
236 |
-
token tok;
|
237 |
-
float score;
|
238 |
-
};
|
239 |
-
|
240 |
-
std::unordered_map<token, id> token_to_id;
|
241 |
-
std::vector<token_score> id_to_token;
|
242 |
-
};
|
243 |
-
|
244 |
struct llama_context {
|
|
|
|
|
245 |
std::mt19937 rng;
|
246 |
|
247 |
-
int64_t t_load_us = 0;
|
248 |
-
int64_t t_start_us = 0;
|
249 |
bool has_evaluated_once = false;
|
250 |
|
251 |
int64_t t_sample_us = 0;
|
@@ -256,8 +262,16 @@ struct llama_context {
|
|
256 |
int32_t n_eval = 0; // number of eval calls
|
257 |
int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
|
258 |
|
259 |
-
llama_model model;
|
260 |
-
llama_vocab vocab;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
261 |
|
262 |
size_t mem_per_token = 0;
|
263 |
|
@@ -886,6 +900,7 @@ static bool kv_cache_init(
|
|
886 |
const int64_t n_elements = n_embd*n_mem;
|
887 |
|
888 |
cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
|
|
|
889 |
|
890 |
struct ggml_init_params params;
|
891 |
params.mem_size = cache.buf.size;
|
@@ -904,6 +919,7 @@ static bool kv_cache_init(
|
|
904 |
ggml_set_name(cache.k, "cache_k");
|
905 |
ggml_set_name(cache.v, "cache_v");
|
906 |
|
|
|
907 |
#ifdef GGML_USE_CUBLAS
|
908 |
if (n_gpu_layers > n_layer + 1) {
|
909 |
ggml_cuda_assign_buffers_no_scratch(cache.v);
|
@@ -918,21 +934,21 @@ static bool kv_cache_init(
|
|
918 |
|
919 |
struct llama_context_params llama_context_default_params() {
|
920 |
struct llama_context_params result = {
|
|
|
921 |
/*.n_ctx =*/ 512,
|
922 |
/*.n_batch =*/ 512,
|
923 |
/*.gpu_layers =*/ 0,
|
924 |
/*.main_gpu =*/ 0,
|
925 |
/*.tensor_split =*/ {0},
|
|
|
|
|
926 |
/*.low_vram =*/ false,
|
927 |
-
/*.seed =*/ -1,
|
928 |
/*.f16_kv =*/ true,
|
929 |
/*.logits_all =*/ false,
|
930 |
/*.vocab_only =*/ false,
|
931 |
/*.use_mmap =*/ true,
|
932 |
/*.use_mlock =*/ false,
|
933 |
/*.embedding =*/ false,
|
934 |
-
/*.progress_callback =*/ nullptr,
|
935 |
-
/*.progress_callback_user_data =*/ nullptr,
|
936 |
};
|
937 |
|
938 |
return result;
|
@@ -1026,7 +1042,8 @@ static const char *llama_model_type_name(e_model type) {
|
|
1026 |
|
1027 |
static void llama_model_load_internal(
|
1028 |
const std::string & fname,
|
1029 |
-
|
|
|
1030 |
int n_ctx,
|
1031 |
int n_batch,
|
1032 |
int n_gpu_layers,
|
@@ -1040,12 +1057,11 @@ static void llama_model_load_internal(
|
|
1040 |
llama_progress_callback progress_callback,
|
1041 |
void * progress_callback_user_data) {
|
1042 |
|
1043 |
-
|
1044 |
|
1045 |
std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname, use_mmap, vocab_only));
|
1046 |
|
1047 |
-
|
1048 |
-
auto & model = lctx.model;
|
1049 |
model.hparams = ml->file_loaders.at(0)->hparams;
|
1050 |
model.n_gpu_layers = n_gpu_layers;
|
1051 |
llama_file_version file_version = ml->file_loaders.at(0)->file_version;
|
@@ -1115,15 +1131,15 @@ static void llama_model_load_internal(
|
|
1115 |
|
1116 |
// create the ggml context
|
1117 |
{
|
1118 |
-
|
1119 |
if (use_mlock) {
|
1120 |
-
|
1121 |
-
|
1122 |
}
|
1123 |
|
1124 |
struct ggml_init_params params = {
|
1125 |
-
/*.mem_size =*/
|
1126 |
-
/*.mem_buffer =*/
|
1127 |
/*.no_alloc =*/ ml->use_mmap,
|
1128 |
};
|
1129 |
|
@@ -1253,7 +1269,7 @@ static void llama_model_load_internal(
|
|
1253 |
vram_scratch = n_batch * MB;
|
1254 |
ggml_cuda_set_scratch_size(vram_scratch);
|
1255 |
if (n_gpu_layers > 0) {
|
1256 |
-
fprintf(stderr, "%s: allocating batch_size x 1 MB = %
|
1257 |
__func__, vram_scratch / MB);
|
1258 |
}
|
1259 |
}
|
@@ -1304,7 +1320,7 @@ static void llama_model_load_internal(
|
|
1304 |
}
|
1305 |
#endif
|
1306 |
|
1307 |
-
ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &
|
1308 |
|
1309 |
if (progress_callback) {
|
1310 |
progress_callback(1.0f, progress_callback_user_data);
|
@@ -1314,12 +1330,13 @@ static void llama_model_load_internal(
|
|
1314 |
|
1315 |
// loading time will be recalculate after the first eval, so
|
1316 |
// we take page faults deferred by mmap() into consideration
|
1317 |
-
|
1318 |
}
|
1319 |
|
1320 |
static bool llama_model_load(
|
1321 |
const std::string & fname,
|
1322 |
-
|
|
|
1323 |
int n_ctx,
|
1324 |
int n_batch,
|
1325 |
int n_gpu_layers,
|
@@ -1333,7 +1350,7 @@ static bool llama_model_load(
|
|
1333 |
llama_progress_callback progress_callback,
|
1334 |
void *progress_callback_user_data) {
|
1335 |
try {
|
1336 |
-
llama_model_load_internal(fname,
|
1337 |
use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
|
1338 |
return true;
|
1339 |
} catch (const std::exception & err) {
|
@@ -1371,7 +1388,7 @@ static bool llama_eval_internal(
|
|
1371 |
const auto & model = lctx.model;
|
1372 |
const auto & hparams = model.hparams;
|
1373 |
|
1374 |
-
const auto & kv_self =
|
1375 |
|
1376 |
LLAMA_ASSERT(!!kv_self.ctx);
|
1377 |
|
@@ -1613,7 +1630,7 @@ static bool llama_eval_internal(
|
|
1613 |
model.layers[il].w1,
|
1614 |
cur);
|
1615 |
offload_func(cur);
|
1616 |
-
ggml_set_name(cur, "
|
1617 |
|
1618 |
// SILU activation
|
1619 |
cur = ggml_silu(ctx0, cur);
|
@@ -1650,11 +1667,7 @@ static bool llama_eval_internal(
|
|
1650 |
{
|
1651 |
cur = ggml_rms_norm(ctx0, inpL);
|
1652 |
offload_func_nr(cur);
|
1653 |
-
ggml_set_name(cur, "
|
1654 |
-
|
1655 |
-
cur = ggml_rms_norm(ctx0, cur);
|
1656 |
-
offload_func_nr(cur);
|
1657 |
-
ggml_set_name(cur, "rms_norm_after");
|
1658 |
|
1659 |
// cur = cur*norm(broadcasted)
|
1660 |
cur = ggml_mul(ctx0, cur, model.norm);
|
@@ -1723,7 +1736,7 @@ static bool llama_eval_internal(
|
|
1723 |
//memcpy(embd_w.data(), ggml_get_data(cur), sizeof(float)*n_vocab*N);
|
1724 |
|
1725 |
// update kv token count
|
1726 |
-
lctx.
|
1727 |
|
1728 |
// extract logits
|
1729 |
{
|
@@ -2002,9 +2015,10 @@ void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * can
|
|
2002 |
for (size_t i = 0; i < candidates->size; ++i) {
|
2003 |
cum_sum += candidates->data[i].p;
|
2004 |
|
2005 |
-
// Check if the running sum is
|
2006 |
-
|
2007 |
-
|
|
|
2008 |
break;
|
2009 |
}
|
2010 |
}
|
@@ -2489,8 +2503,23 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
2489 |
} else {
|
2490 |
new_type = quantized_type;
|
2491 |
#ifdef GGML_USE_K_QUANTS
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2492 |
if (tensor.name == "output.weight") {
|
2493 |
-
|
|
|
|
|
|
|
|
|
2494 |
} else if (tensor.name.find("attention.wv.weight") != std::string::npos) {
|
2495 |
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
|
2496 |
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
|
@@ -2616,12 +2645,39 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
2616 |
// interface implementation
|
2617 |
//
|
2618 |
|
2619 |
-
struct
|
2620 |
const char * path_model,
|
2621 |
struct llama_context_params params) {
|
2622 |
ggml_time_init();
|
2623 |
|
2624 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2625 |
|
2626 |
if (params.seed < 0) {
|
2627 |
params.seed = time(NULL);
|
@@ -2649,24 +2705,16 @@ struct llama_context * llama_init_from_file(
|
|
2649 |
|
2650 |
ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
|
2651 |
|
2652 |
-
if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_batch, params.n_gpu_layers, params.main_gpu,
|
2653 |
-
params.tensor_split, params.low_vram, memory_type, params.use_mmap, params.use_mlock,
|
2654 |
-
params.vocab_only, params.progress_callback, params.progress_callback_user_data)) {
|
2655 |
-
fprintf(stderr, "%s: failed to load model\n", __func__);
|
2656 |
-
llama_free(ctx);
|
2657 |
-
return nullptr;
|
2658 |
-
}
|
2659 |
-
|
2660 |
// reserve memory for context buffers
|
2661 |
if (!params.vocab_only) {
|
2662 |
-
if (!kv_cache_init(ctx->model.hparams, ctx->
|
2663 |
fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__);
|
2664 |
llama_free(ctx);
|
2665 |
return nullptr;
|
2666 |
}
|
2667 |
|
2668 |
{
|
2669 |
-
const size_t memory_size = ggml_nbytes(ctx->
|
2670 |
fprintf(stderr, "%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
|
2671 |
}
|
2672 |
|
@@ -2694,16 +2742,21 @@ struct llama_context * llama_init_from_file(
|
|
2694 |
// this allocates all Metal resources and memory buffers
|
2695 |
ctx->ctx_metal = ggml_metal_init();
|
2696 |
|
2697 |
-
void *data_ptr
|
2698 |
size_t data_size = 0;
|
|
|
2699 |
if (params.use_mmap) {
|
2700 |
-
data_ptr
|
2701 |
-
data_size= ctx->model.mapping->size;
|
2702 |
} else {
|
2703 |
-
data_ptr
|
2704 |
-
data_size= ggml_get_mem_size(ctx->model.ctx);
|
2705 |
}
|
2706 |
|
|
|
|
|
|
|
|
|
2707 |
#define LLAMA_METAL_CHECK_BUF(result) \
|
2708 |
if (!(result)) { \
|
2709 |
fprintf(stderr, "%s: failed to add buffer\n", __func__); \
|
@@ -2711,12 +2764,13 @@ struct llama_context * llama_init_from_file(
|
|
2711 |
return NULL; \
|
2712 |
}
|
2713 |
|
2714 |
-
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size));
|
2715 |
-
|
|
|
|
|
2716 |
|
2717 |
-
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "
|
2718 |
-
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "
|
2719 |
-
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size));
|
2720 |
#undef LLAMA_METAL_CHECK_BUF
|
2721 |
}
|
2722 |
#endif
|
@@ -2724,7 +2778,23 @@ struct llama_context * llama_init_from_file(
|
|
2724 |
return ctx;
|
2725 |
}
|
2726 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2727 |
void llama_free(struct llama_context * ctx) {
|
|
|
|
|
|
|
2728 |
delete ctx;
|
2729 |
}
|
2730 |
|
@@ -2741,11 +2811,9 @@ int llama_model_quantize(
|
|
2741 |
}
|
2742 |
}
|
2743 |
|
2744 |
-
int llama_apply_lora_from_file_internal(struct
|
2745 |
fprintf(stderr, "%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);
|
2746 |
|
2747 |
-
auto & model = ctx->model;
|
2748 |
-
|
2749 |
const int64_t t_start_lora_us = ggml_time_us();
|
2750 |
|
2751 |
auto fin = std::ifstream(path_lora, std::ios::binary);
|
@@ -2988,7 +3056,16 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
|
|
2988 |
|
2989 |
int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
|
2990 |
try {
|
2991 |
-
return llama_apply_lora_from_file_internal(ctx, path_lora, path_base_model, n_threads);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2992 |
} catch (const std::exception & err) {
|
2993 |
fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.what());
|
2994 |
return 1;
|
@@ -2996,7 +3073,7 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor
|
|
2996 |
}
|
2997 |
|
2998 |
int llama_get_kv_cache_token_count(const struct llama_context * ctx) {
|
2999 |
-
return ctx->
|
3000 |
}
|
3001 |
|
3002 |
#define LLAMA_MAX_RNG_STATE (64*1024)
|
@@ -3021,7 +3098,7 @@ size_t llama_get_state_size(const struct llama_context * ctx) {
|
|
3021 |
const size_t s_embedding = ctx->embedding.size() * sizeof(float);
|
3022 |
const size_t s_kv_size = sizeof(size_t);
|
3023 |
const size_t s_kv_ntok = sizeof(int);
|
3024 |
-
const size_t s_kv = ctx->
|
3025 |
|
3026 |
const size_t s_total = (
|
3027 |
+ s_rng_size
|
@@ -3087,7 +3164,7 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
|
|
3087 |
|
3088 |
// copy kv cache
|
3089 |
{
|
3090 |
-
const auto & kv_self = ctx->
|
3091 |
const auto & hparams = ctx->model.hparams;
|
3092 |
const int n_layer = hparams.n_layer;
|
3093 |
const int n_embd = hparams.n_embd;
|
@@ -3102,9 +3179,7 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
|
|
3102 |
if (kv_size) {
|
3103 |
const size_t elt_size = ggml_element_size(kv_self.k);
|
3104 |
|
3105 |
-
|
3106 |
-
|
3107 |
-
ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
|
3108 |
ggml_cgraph gf{};
|
3109 |
gf.n_threads = 1;
|
3110 |
|
@@ -3193,7 +3268,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
|
|
3193 |
|
3194 |
// set kv cache
|
3195 |
{
|
3196 |
-
const auto & kv_self = ctx->
|
3197 |
const auto & hparams = ctx->model.hparams;
|
3198 |
const int n_layer = hparams.n_layer;
|
3199 |
const int n_embd = hparams.n_embd;
|
@@ -3210,9 +3285,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
|
|
3210 |
|
3211 |
const size_t elt_size = ggml_element_size(kv_self.k);
|
3212 |
|
3213 |
-
|
3214 |
-
|
3215 |
-
ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
|
3216 |
ggml_cgraph gf{};
|
3217 |
gf.n_threads = 1;
|
3218 |
|
@@ -3239,7 +3312,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
|
|
3239 |
ggml_free(cpy_ctx);
|
3240 |
}
|
3241 |
|
3242 |
-
ctx->
|
3243 |
}
|
3244 |
|
3245 |
const size_t nread = inp - src;
|
@@ -3447,9 +3520,12 @@ void llama_print_timings(struct llama_context * ctx) {
|
|
3447 |
|
3448 |
fprintf(stderr, "\n");
|
3449 |
fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0);
|
3450 |
-
fprintf(stderr, "%s: sample time = %8.2f ms / %5d runs (%8.2f ms per token
|
3451 |
-
|
3452 |
-
fprintf(stderr, "%s:
|
|
|
|
|
|
|
3453 |
fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0);
|
3454 |
}
|
3455 |
|
@@ -3483,6 +3559,6 @@ const char * llama_print_system_info(void) {
|
|
3483 |
}
|
3484 |
|
3485 |
// For internal test use
|
3486 |
-
std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
|
3487 |
return ctx->model.tensors_by_name;
|
3488 |
}
|
|
|
19 |
#ifdef GGML_USE_METAL
|
20 |
#include "ggml-metal.h"
|
21 |
#endif
|
22 |
+
#ifdef GGML_USE_K_QUANTS
|
23 |
+
#ifndef QK_K
|
24 |
+
#define QK_K 256
|
25 |
+
#endif
|
26 |
+
#endif
|
27 |
|
28 |
#include <array>
|
29 |
#include <ctime>
|
|
|
80 |
{ MODEL_3B, 256ull * MB },
|
81 |
{ MODEL_7B, 512ull * MB },
|
82 |
{ MODEL_13B, 512ull * MB },
|
83 |
+
{ MODEL_30B, 640ull * MB },
|
84 |
{ MODEL_65B, 1024ull * MB },
|
85 |
};
|
86 |
return k_sizes;
|
|
|
92 |
{ MODEL_3B, 256ull * MB },
|
93 |
{ MODEL_7B, 512ull * MB },
|
94 |
{ MODEL_13B, 512ull * MB },
|
95 |
+
{ MODEL_30B, 640ull * MB },
|
96 |
{ MODEL_65B, 1024ull * MB },
|
97 |
};
|
98 |
return k_sizes;
|
|
|
105 |
{ MODEL_3B, 682ull * MB },
|
106 |
{ MODEL_7B, 1026ull * MB },
|
107 |
{ MODEL_13B, 1608ull * MB },
|
108 |
+
{ MODEL_30B, 3224ull * MB },
|
109 |
{ MODEL_65B, 5120ull * MB },
|
110 |
};
|
111 |
return k_sizes;
|
|
|
119 |
{ MODEL_3B, 512ull * MB },
|
120 |
{ MODEL_7B, 800ull * MB },
|
121 |
{ MODEL_13B, 1024ull * MB },
|
122 |
+
{ MODEL_30B, 1380ull * MB },
|
123 |
{ MODEL_65B, 1536ull * MB },
|
124 |
};
|
125 |
return k_sizes;
|
|
|
182 |
}
|
183 |
};
|
184 |
|
185 |
+
struct llama_vocab {
|
186 |
+
using id = int32_t;
|
187 |
+
using token = std::string;
|
188 |
+
|
189 |
+
struct token_score {
|
190 |
+
token tok;
|
191 |
+
float score;
|
192 |
+
};
|
193 |
+
|
194 |
+
std::unordered_map<token, id> token_to_id;
|
195 |
+
std::vector<token_score> id_to_token;
|
196 |
+
};
|
197 |
+
|
198 |
struct llama_model {
|
199 |
e_model type = MODEL_UNKNOWN;
|
200 |
|
|
|
211 |
// context
|
212 |
struct ggml_context * ctx = NULL;
|
213 |
|
|
|
|
|
|
|
|
|
214 |
// the model memory buffer
|
215 |
llama_ctx_buffer buf;
|
216 |
|
|
|
224 |
// for quantize-stats only
|
225 |
std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;
|
226 |
|
227 |
+
int64_t t_load_us = 0;
|
228 |
+
int64_t t_start_us = 0;
|
229 |
+
|
230 |
+
llama_vocab vocab;
|
231 |
+
|
232 |
~llama_model() {
|
233 |
if (ctx) {
|
234 |
ggml_free(ctx);
|
|
|
247 |
}
|
248 |
};
|
249 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
250 |
struct llama_context {
|
251 |
+
llama_context(const llama_model & model, const llama_vocab & vocab) : model(model), vocab(vocab), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
|
252 |
+
|
253 |
std::mt19937 rng;
|
254 |
|
|
|
|
|
255 |
bool has_evaluated_once = false;
|
256 |
|
257 |
int64_t t_sample_us = 0;
|
|
|
262 |
int32_t n_eval = 0; // number of eval calls
|
263 |
int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
|
264 |
|
265 |
+
const llama_model & model;
|
266 |
+
const llama_vocab & vocab;
|
267 |
+
|
268 |
+
bool model_owner = false;
|
269 |
+
|
270 |
+
int64_t t_load_us;
|
271 |
+
int64_t t_start_us;
|
272 |
+
|
273 |
+
// key + value cache for the self attention
|
274 |
+
struct llama_kv_cache kv_self;
|
275 |
|
276 |
size_t mem_per_token = 0;
|
277 |
|
|
|
900 |
const int64_t n_elements = n_embd*n_mem;
|
901 |
|
902 |
cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
|
903 |
+
cache.n = 0;
|
904 |
|
905 |
struct ggml_init_params params;
|
906 |
params.mem_size = cache.buf.size;
|
|
|
919 |
ggml_set_name(cache.k, "cache_k");
|
920 |
ggml_set_name(cache.v, "cache_v");
|
921 |
|
922 |
+
(void) n_gpu_layers;
|
923 |
#ifdef GGML_USE_CUBLAS
|
924 |
if (n_gpu_layers > n_layer + 1) {
|
925 |
ggml_cuda_assign_buffers_no_scratch(cache.v);
|
|
|
934 |
|
935 |
struct llama_context_params llama_context_default_params() {
|
936 |
struct llama_context_params result = {
|
937 |
+
/*.seed =*/ -1,
|
938 |
/*.n_ctx =*/ 512,
|
939 |
/*.n_batch =*/ 512,
|
940 |
/*.gpu_layers =*/ 0,
|
941 |
/*.main_gpu =*/ 0,
|
942 |
/*.tensor_split =*/ {0},
|
943 |
+
/*.progress_callback =*/ nullptr,
|
944 |
+
/*.progress_callback_user_data =*/ nullptr,
|
945 |
/*.low_vram =*/ false,
|
|
|
946 |
/*.f16_kv =*/ true,
|
947 |
/*.logits_all =*/ false,
|
948 |
/*.vocab_only =*/ false,
|
949 |
/*.use_mmap =*/ true,
|
950 |
/*.use_mlock =*/ false,
|
951 |
/*.embedding =*/ false,
|
|
|
|
|
952 |
};
|
953 |
|
954 |
return result;
|
|
|
1042 |
|
1043 |
static void llama_model_load_internal(
|
1044 |
const std::string & fname,
|
1045 |
+
llama_model & model,
|
1046 |
+
llama_vocab & vocab,
|
1047 |
int n_ctx,
|
1048 |
int n_batch,
|
1049 |
int n_gpu_layers,
|
|
|
1057 |
llama_progress_callback progress_callback,
|
1058 |
void * progress_callback_user_data) {
|
1059 |
|
1060 |
+
model.t_start_us = ggml_time_us();
|
1061 |
|
1062 |
std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname, use_mmap, vocab_only));
|
1063 |
|
1064 |
+
vocab = std::move(ml->file_loaders.at(0)->vocab);
|
|
|
1065 |
model.hparams = ml->file_loaders.at(0)->hparams;
|
1066 |
model.n_gpu_layers = n_gpu_layers;
|
1067 |
llama_file_version file_version = ml->file_loaders.at(0)->file_version;
|
|
|
1131 |
|
1132 |
// create the ggml context
|
1133 |
{
|
1134 |
+
model.buf.resize(ctx_size);
|
1135 |
if (use_mlock) {
|
1136 |
+
model.mlock_buf.init(model.buf.addr);
|
1137 |
+
model.mlock_buf.grow_to(model.buf.size);
|
1138 |
}
|
1139 |
|
1140 |
struct ggml_init_params params = {
|
1141 |
+
/*.mem_size =*/ model.buf.size,
|
1142 |
+
/*.mem_buffer =*/ model.buf.addr,
|
1143 |
/*.no_alloc =*/ ml->use_mmap,
|
1144 |
};
|
1145 |
|
|
|
1269 |
vram_scratch = n_batch * MB;
|
1270 |
ggml_cuda_set_scratch_size(vram_scratch);
|
1271 |
if (n_gpu_layers > 0) {
|
1272 |
+
fprintf(stderr, "%s: allocating batch_size x 1 MB = %zd MB VRAM for the scratch buffer\n",
|
1273 |
__func__, vram_scratch / MB);
|
1274 |
}
|
1275 |
}
|
|
|
1320 |
}
|
1321 |
#endif
|
1322 |
|
1323 |
+
ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &model.mlock_mmap : NULL);
|
1324 |
|
1325 |
if (progress_callback) {
|
1326 |
progress_callback(1.0f, progress_callback_user_data);
|
|
|
1330 |
|
1331 |
// loading time will be recalculate after the first eval, so
|
1332 |
// we take page faults deferred by mmap() into consideration
|
1333 |
+
model.t_load_us = ggml_time_us() - model.t_start_us;
|
1334 |
}
|
1335 |
|
1336 |
static bool llama_model_load(
|
1337 |
const std::string & fname,
|
1338 |
+
llama_model & model,
|
1339 |
+
llama_vocab & vocab,
|
1340 |
int n_ctx,
|
1341 |
int n_batch,
|
1342 |
int n_gpu_layers,
|
|
|
1350 |
llama_progress_callback progress_callback,
|
1351 |
void *progress_callback_user_data) {
|
1352 |
try {
|
1353 |
+
llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, low_vram, memory_type,
|
1354 |
use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
|
1355 |
return true;
|
1356 |
} catch (const std::exception & err) {
|
|
|
1388 |
const auto & model = lctx.model;
|
1389 |
const auto & hparams = model.hparams;
|
1390 |
|
1391 |
+
const auto & kv_self = lctx.kv_self;
|
1392 |
|
1393 |
LLAMA_ASSERT(!!kv_self.ctx);
|
1394 |
|
|
|
1630 |
model.layers[il].w1,
|
1631 |
cur);
|
1632 |
offload_func(cur);
|
1633 |
+
ggml_set_name(cur, "result_w1");
|
1634 |
|
1635 |
// SILU activation
|
1636 |
cur = ggml_silu(ctx0, cur);
|
|
|
1667 |
{
|
1668 |
cur = ggml_rms_norm(ctx0, inpL);
|
1669 |
offload_func_nr(cur);
|
1670 |
+
ggml_set_name(cur, "rms_norm_2");
|
|
|
|
|
|
|
|
|
1671 |
|
1672 |
// cur = cur*norm(broadcasted)
|
1673 |
cur = ggml_mul(ctx0, cur, model.norm);
|
|
|
1736 |
//memcpy(embd_w.data(), ggml_get_data(cur), sizeof(float)*n_vocab*N);
|
1737 |
|
1738 |
// update kv token count
|
1739 |
+
lctx.kv_self.n = n_past + N;
|
1740 |
|
1741 |
// extract logits
|
1742 |
{
|
|
|
2015 |
for (size_t i = 0; i < candidates->size; ++i) {
|
2016 |
cum_sum += candidates->data[i].p;
|
2017 |
|
2018 |
+
// Check if the running sum is at least p or if we have kept at least min_keep tokens
|
2019 |
+
// we set the last index to i+1 to indicate that the current iterate should be included in the set
|
2020 |
+
if (cum_sum >= p && i + 1 >= min_keep) {
|
2021 |
+
last_idx = i + 1;
|
2022 |
break;
|
2023 |
}
|
2024 |
}
|
|
|
2503 |
} else {
|
2504 |
new_type = quantized_type;
|
2505 |
#ifdef GGML_USE_K_QUANTS
|
2506 |
+
if (quantized_type == GGML_TYPE_Q2_K || quantized_type == GGML_TYPE_Q3_K || quantized_type == GGML_TYPE_Q4_K ||
|
2507 |
+
quantized_type == GGML_TYPE_Q5_K || quantized_type == GGML_TYPE_Q6_K) {
|
2508 |
+
int nx = tensor.ne.at(0);
|
2509 |
+
int ny = tensor.ne.at(1);
|
2510 |
+
if (nx % QK_K != 0 || ny % QK_K != 0) {
|
2511 |
+
fprintf(stderr, "\n\n========================= Tensor sizes %d x %d are not divisible by %d\n",nx,ny,QK_K);
|
2512 |
+
fprintf(stderr, "This is required to be able to use k-quants for now!\n");
|
2513 |
+
fprintf(stderr, "========================================================================================\n\n");
|
2514 |
+
throw std::runtime_error("Unsupported tensor size encountered\n");
|
2515 |
+
}
|
2516 |
+
}
|
2517 |
if (tensor.name == "output.weight") {
|
2518 |
+
int nx = tensor.ne.at(0);
|
2519 |
+
int ny = tensor.ne.at(1);
|
2520 |
+
if (nx % QK_K == 0 && ny % QK_K == 0) {
|
2521 |
+
new_type = GGML_TYPE_Q6_K;
|
2522 |
+
}
|
2523 |
} else if (tensor.name.find("attention.wv.weight") != std::string::npos) {
|
2524 |
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
|
2525 |
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
|
|
|
2645 |
// interface implementation
|
2646 |
//
|
2647 |
|
2648 |
+
struct llama_model * llama_load_model_from_file(
|
2649 |
const char * path_model,
|
2650 |
struct llama_context_params params) {
|
2651 |
ggml_time_init();
|
2652 |
|
2653 |
+
llama_model * model = new llama_model;
|
2654 |
+
|
2655 |
+
ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
|
2656 |
+
|
2657 |
+
if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gpu_layers,
|
2658 |
+
params.main_gpu, params.tensor_split, params.low_vram, memory_type, params.use_mmap, params.use_mlock,
|
2659 |
+
params.vocab_only, params.progress_callback, params.progress_callback_user_data)) {
|
2660 |
+
delete model;
|
2661 |
+
fprintf(stderr, "%s: failed to load model\n", __func__);
|
2662 |
+
return nullptr;
|
2663 |
+
}
|
2664 |
+
|
2665 |
+
return model;
|
2666 |
+
}
|
2667 |
+
|
2668 |
+
void llama_free_model(struct llama_model * model) {
|
2669 |
+
delete model;
|
2670 |
+
}
|
2671 |
+
|
2672 |
+
struct llama_context * llama_new_context_with_model(
|
2673 |
+
struct llama_model * model,
|
2674 |
+
struct llama_context_params params) {
|
2675 |
+
|
2676 |
+
if (!model) {
|
2677 |
+
return nullptr;
|
2678 |
+
}
|
2679 |
+
|
2680 |
+
llama_context * ctx = new llama_context(*model, model->vocab);
|
2681 |
|
2682 |
if (params.seed < 0) {
|
2683 |
params.seed = time(NULL);
|
|
|
2705 |
|
2706 |
ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
|
2707 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2708 |
// reserve memory for context buffers
|
2709 |
if (!params.vocab_only) {
|
2710 |
+
if (!kv_cache_init(ctx->model.hparams, ctx->kv_self, memory_type, ctx->model.hparams.n_ctx, params.n_gpu_layers)) {
|
2711 |
fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__);
|
2712 |
llama_free(ctx);
|
2713 |
return nullptr;
|
2714 |
}
|
2715 |
|
2716 |
{
|
2717 |
+
const size_t memory_size = ggml_nbytes(ctx->kv_self.k) + ggml_nbytes(ctx->kv_self.v);
|
2718 |
fprintf(stderr, "%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
|
2719 |
}
|
2720 |
|
|
|
2742 |
// this allocates all Metal resources and memory buffers
|
2743 |
ctx->ctx_metal = ggml_metal_init();
|
2744 |
|
2745 |
+
void * data_ptr = NULL;
|
2746 |
size_t data_size = 0;
|
2747 |
+
|
2748 |
if (params.use_mmap) {
|
2749 |
+
data_ptr = ctx->model.mapping->addr;
|
2750 |
+
data_size = ctx->model.mapping->size;
|
2751 |
} else {
|
2752 |
+
data_ptr = ggml_get_mem_buffer(ctx->model.ctx);
|
2753 |
+
data_size = ggml_get_mem_size (ctx->model.ctx);
|
2754 |
}
|
2755 |
|
2756 |
+
const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);
|
2757 |
+
|
2758 |
+
printf("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);
|
2759 |
+
|
2760 |
#define LLAMA_METAL_CHECK_BUF(result) \
|
2761 |
if (!(result)) { \
|
2762 |
fprintf(stderr, "%s: failed to add buffer\n", __func__); \
|
|
|
2764 |
return NULL; \
|
2765 |
}
|
2766 |
|
2767 |
+
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size));
|
2768 |
+
|
2769 |
+
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size, 0));
|
2770 |
+
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->kv_self.buf.addr, ctx->kv_self.buf.size, 0));
|
2771 |
|
2772 |
+
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size, 0));
|
2773 |
+
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size, 0));
|
|
|
2774 |
#undef LLAMA_METAL_CHECK_BUF
|
2775 |
}
|
2776 |
#endif
|
|
|
2778 |
return ctx;
|
2779 |
}
|
2780 |
|
2781 |
+
struct llama_context * llama_init_from_file(
|
2782 |
+
const char * path_model,
|
2783 |
+
struct llama_context_params params) {
|
2784 |
+
|
2785 |
+
struct llama_model * model = llama_load_model_from_file(path_model, params);
|
2786 |
+
if (!model) {
|
2787 |
+
return nullptr;
|
2788 |
+
}
|
2789 |
+
struct llama_context * ctx = llama_new_context_with_model(model, params);
|
2790 |
+
ctx->model_owner = true;
|
2791 |
+
return ctx;
|
2792 |
+
}
|
2793 |
+
|
2794 |
void llama_free(struct llama_context * ctx) {
|
2795 |
+
if (ctx->model_owner) {
|
2796 |
+
delete &ctx->model;
|
2797 |
+
}
|
2798 |
delete ctx;
|
2799 |
}
|
2800 |
|
|
|
2811 |
}
|
2812 |
}
|
2813 |
|
2814 |
+
int llama_apply_lora_from_file_internal(const struct llama_model & model, const char * path_lora, const char * path_base_model, int n_threads) {
|
2815 |
fprintf(stderr, "%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);
|
2816 |
|
|
|
|
|
2817 |
const int64_t t_start_lora_us = ggml_time_us();
|
2818 |
|
2819 |
auto fin = std::ifstream(path_lora, std::ios::binary);
|
|
|
3056 |
|
3057 |
int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
|
3058 |
try {
|
3059 |
+
return llama_apply_lora_from_file_internal(ctx->model, path_lora, path_base_model, n_threads);
|
3060 |
+
} catch (const std::exception & err) {
|
3061 |
+
fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.what());
|
3062 |
+
return 1;
|
3063 |
+
}
|
3064 |
+
}
|
3065 |
+
|
3066 |
+
int llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, const char * path_base_model, int n_threads) {
|
3067 |
+
try {
|
3068 |
+
return llama_apply_lora_from_file_internal(*model, path_lora, path_base_model, n_threads);
|
3069 |
} catch (const std::exception & err) {
|
3070 |
fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.what());
|
3071 |
return 1;
|
|
|
3073 |
}
|
3074 |
|
3075 |
int llama_get_kv_cache_token_count(const struct llama_context * ctx) {
|
3076 |
+
return ctx->kv_self.n;
|
3077 |
}
|
3078 |
|
3079 |
#define LLAMA_MAX_RNG_STATE (64*1024)
|
|
|
3098 |
const size_t s_embedding = ctx->embedding.size() * sizeof(float);
|
3099 |
const size_t s_kv_size = sizeof(size_t);
|
3100 |
const size_t s_kv_ntok = sizeof(int);
|
3101 |
+
const size_t s_kv = ctx->kv_self.buf.size;
|
3102 |
|
3103 |
const size_t s_total = (
|
3104 |
+ s_rng_size
|
|
|
3164 |
|
3165 |
// copy kv cache
|
3166 |
{
|
3167 |
+
const auto & kv_self = ctx->kv_self;
|
3168 |
const auto & hparams = ctx->model.hparams;
|
3169 |
const int n_layer = hparams.n_layer;
|
3170 |
const int n_embd = hparams.n_embd;
|
|
|
3179 |
if (kv_size) {
|
3180 |
const size_t elt_size = ggml_element_size(kv_self.k);
|
3181 |
|
3182 |
+
ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
|
|
|
|
|
3183 |
ggml_cgraph gf{};
|
3184 |
gf.n_threads = 1;
|
3185 |
|
|
|
3268 |
|
3269 |
// set kv cache
|
3270 |
{
|
3271 |
+
const auto & kv_self = ctx->kv_self;
|
3272 |
const auto & hparams = ctx->model.hparams;
|
3273 |
const int n_layer = hparams.n_layer;
|
3274 |
const int n_embd = hparams.n_embd;
|
|
|
3285 |
|
3286 |
const size_t elt_size = ggml_element_size(kv_self.k);
|
3287 |
|
3288 |
+
ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
|
|
|
|
|
3289 |
ggml_cgraph gf{};
|
3290 |
gf.n_threads = 1;
|
3291 |
|
|
|
3312 |
ggml_free(cpy_ctx);
|
3313 |
}
|
3314 |
|
3315 |
+
ctx->kv_self.n = kv_ntok;
|
3316 |
}
|
3317 |
|
3318 |
const size_t nread = inp - src;
|
|
|
3520 |
|
3521 |
fprintf(stderr, "\n");
|
3522 |
fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0);
|
3523 |
+
fprintf(stderr, "%s: sample time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
|
3524 |
+
__func__, 1e-3 * ctx->t_sample_us, n_sample, 1e-3 * ctx->t_sample_us / n_sample, 1e6 / ctx->t_sample_us * n_sample);
|
3525 |
+
fprintf(stderr, "%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
|
3526 |
+
__func__, 1e-3 * ctx->t_p_eval_us, n_p_eval, 1e-3 * ctx->t_p_eval_us / n_p_eval, 1e6 / ctx->t_p_eval_us * n_p_eval);
|
3527 |
+
fprintf(stderr, "%s: eval time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
|
3528 |
+
__func__, 1e-3 * ctx->t_eval_us, n_eval, 1e-3 * ctx->t_eval_us / n_eval, 1e6 / ctx->t_eval_us * n_eval);
|
3529 |
fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0);
|
3530 |
}
|
3531 |
|
|
|
3559 |
}
|
3560 |
|
3561 |
// For internal test use
|
3562 |
+
const std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
|
3563 |
return ctx->model.tensors_by_name;
|
3564 |
}
|
llama.h
CHANGED
@@ -26,6 +26,14 @@
|
|
26 |
# define LLAMA_API
|
27 |
#endif
|
28 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
29 |
#define LLAMA_FILE_MAGIC_GGJT 0x67676a74u // 'ggjt'
|
30 |
#define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla'
|
31 |
#define LLAMA_FILE_MAGIC_GGMF 0x67676d66u // 'ggmf'
|
@@ -53,6 +61,7 @@ extern "C" {
|
|
53 |
// TODO: show sample usage
|
54 |
//
|
55 |
|
|
|
56 |
struct llama_context;
|
57 |
|
58 |
typedef int llama_token;
|
@@ -71,28 +80,27 @@ extern "C" {
|
|
71 |
|
72 |
typedef void (*llama_progress_callback)(float progress, void *ctx);
|
73 |
|
74 |
-
|
|
|
75 |
int n_ctx; // text context
|
76 |
int n_batch; // prompt processing batch size
|
77 |
int n_gpu_layers; // number of layers to store in VRAM
|
78 |
int main_gpu; // the GPU that is used for scratch and small tensors
|
79 |
float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs
|
80 |
-
|
81 |
-
|
|
|
|
|
82 |
|
|
|
|
|
83 |
bool f16_kv; // use fp16 for KV cache
|
84 |
bool logits_all; // the llama_eval() call computes all logits, not just the last one
|
85 |
bool vocab_only; // only load the vocabulary, no weights
|
86 |
bool use_mmap; // use mmap if possible
|
87 |
bool use_mlock; // force system to keep model in RAM
|
88 |
bool embedding; // embedding mode only
|
89 |
-
|
90 |
-
// called with a progress value between 0 and 1, pass NULL to disable
|
91 |
-
llama_progress_callback progress_callback;
|
92 |
-
// context pointer passed to the progress callback
|
93 |
-
void * progress_callback_user_data;
|
94 |
};
|
95 |
-
|
96 |
// model file types
|
97 |
enum llama_ftype {
|
98 |
LLAMA_FTYPE_ALL_F32 = 0,
|
@@ -137,12 +145,23 @@ extern "C" {
|
|
137 |
|
138 |
LLAMA_API int64_t llama_time_us();
|
139 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
140 |
// Various functions for loading a ggml llama model.
|
141 |
// Allocate (almost) all memory needed for the model.
|
142 |
// Return NULL on failure
|
143 |
-
LLAMA_API struct llama_context * llama_init_from_file(
|
144 |
const char * path_model,
|
145 |
-
struct llama_context_params params)
|
|
|
146 |
|
147 |
// Frees all allocated memory
|
148 |
LLAMA_API void llama_free(struct llama_context * ctx);
|
@@ -159,8 +178,15 @@ extern "C" {
|
|
159 |
// The model needs to be reloaded before applying a new adapter, otherwise the adapter
|
160 |
// will be applied on top of the previous one
|
161 |
// Returns 0 on success
|
162 |
-
LLAMA_API int llama_apply_lora_from_file(
|
163 |
struct llama_context * ctx,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
164 |
const char * path_lora,
|
165 |
const char * path_base_model,
|
166 |
int n_threads);
|
@@ -311,7 +337,7 @@ extern "C" {
|
|
311 |
#include <string>
|
312 |
struct ggml_tensor;
|
313 |
|
314 |
-
std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx);
|
315 |
|
316 |
#endif
|
317 |
|
|
|
26 |
# define LLAMA_API
|
27 |
#endif
|
28 |
|
29 |
+
#ifdef __GNUC__
|
30 |
+
# define DEPRECATED(func, hint) func __attribute__((deprecated(hint)))
|
31 |
+
#elif defined(_MSC_VER)
|
32 |
+
# define DEPRECATED(func, hint) __declspec(deprecated(hint)) func
|
33 |
+
#else
|
34 |
+
# define DEPRECATED(func, hint) func
|
35 |
+
#endif
|
36 |
+
|
37 |
#define LLAMA_FILE_MAGIC_GGJT 0x67676a74u // 'ggjt'
|
38 |
#define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla'
|
39 |
#define LLAMA_FILE_MAGIC_GGMF 0x67676d66u // 'ggmf'
|
|
|
61 |
// TODO: show sample usage
|
62 |
//
|
63 |
|
64 |
+
struct llama_model;
|
65 |
struct llama_context;
|
66 |
|
67 |
typedef int llama_token;
|
|
|
80 |
|
81 |
typedef void (*llama_progress_callback)(float progress, void *ctx);
|
82 |
|
83 |
+
struct llama_context_params {
|
84 |
+
int seed; // RNG seed, -1 for random
|
85 |
int n_ctx; // text context
|
86 |
int n_batch; // prompt processing batch size
|
87 |
int n_gpu_layers; // number of layers to store in VRAM
|
88 |
int main_gpu; // the GPU that is used for scratch and small tensors
|
89 |
float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs
|
90 |
+
// called with a progress value between 0 and 1, pass NULL to disable
|
91 |
+
llama_progress_callback progress_callback;
|
92 |
+
// context pointer passed to the progress callback
|
93 |
+
void * progress_callback_user_data;
|
94 |
|
95 |
+
// Keep the booleans together to avoid misalignment during copy-by-value.
|
96 |
+
bool low_vram; // if true, reduce VRAM usage at the cost of performance
|
97 |
bool f16_kv; // use fp16 for KV cache
|
98 |
bool logits_all; // the llama_eval() call computes all logits, not just the last one
|
99 |
bool vocab_only; // only load the vocabulary, no weights
|
100 |
bool use_mmap; // use mmap if possible
|
101 |
bool use_mlock; // force system to keep model in RAM
|
102 |
bool embedding; // embedding mode only
|
|
|
|
|
|
|
|
|
|
|
103 |
};
|
|
|
104 |
// model file types
|
105 |
enum llama_ftype {
|
106 |
LLAMA_FTYPE_ALL_F32 = 0,
|
|
|
145 |
|
146 |
LLAMA_API int64_t llama_time_us();
|
147 |
|
148 |
+
LLAMA_API struct llama_model * llama_load_model_from_file(
|
149 |
+
const char * path_model,
|
150 |
+
struct llama_context_params params);
|
151 |
+
|
152 |
+
LLAMA_API void llama_free_model(struct llama_model * model);
|
153 |
+
|
154 |
+
LLAMA_API struct llama_context * llama_new_context_with_model(
|
155 |
+
struct llama_model * model,
|
156 |
+
struct llama_context_params params);
|
157 |
+
|
158 |
// Various functions for loading a ggml llama model.
|
159 |
// Allocate (almost) all memory needed for the model.
|
160 |
// Return NULL on failure
|
161 |
+
LLAMA_API DEPRECATED(struct llama_context * llama_init_from_file(
|
162 |
const char * path_model,
|
163 |
+
struct llama_context_params params),
|
164 |
+
"please use llama_load_model_from_file combined with llama_new_context_with_model instead");
|
165 |
|
166 |
// Frees all allocated memory
|
167 |
LLAMA_API void llama_free(struct llama_context * ctx);
|
|
|
178 |
// The model needs to be reloaded before applying a new adapter, otherwise the adapter
|
179 |
// will be applied on top of the previous one
|
180 |
// Returns 0 on success
|
181 |
+
LLAMA_API DEPRECATED(int llama_apply_lora_from_file(
|
182 |
struct llama_context * ctx,
|
183 |
+
const char * path_lora,
|
184 |
+
const char * path_base_model,
|
185 |
+
int n_threads),
|
186 |
+
"please use llama_model_apply_lora_from_file instead");
|
187 |
+
|
188 |
+
LLAMA_API int llama_model_apply_lora_from_file(
|
189 |
+
const struct llama_model * model,
|
190 |
const char * path_lora,
|
191 |
const char * path_base_model,
|
192 |
int n_threads);
|
|
|
337 |
#include <string>
|
338 |
struct ggml_tensor;
|
339 |
|
340 |
+
const std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx);
|
341 |
|
342 |
#endif
|
343 |
|
model_adapter.cpp
CHANGED
@@ -98,7 +98,7 @@ void print_tok_vec(std::vector<float> &embd)
|
|
98 |
//we need to read more to determine
|
99 |
int32_t vocabsiz = 0;
|
100 |
fin.read((char *) &vocabsiz, sizeof(int32_t));
|
101 |
-
if(vocabsiz==4096) //actually the d_model for mpt
|
102 |
{
|
103 |
fileformat = FileFormat::MPT_1;
|
104 |
}
|
|
|
98 |
//we need to read more to determine
|
99 |
int32_t vocabsiz = 0;
|
100 |
fin.read((char *) &vocabsiz, sizeof(int32_t));
|
101 |
+
if(vocabsiz==4096 || vocabsiz==7168) //actually the d_model for mpt
|
102 |
{
|
103 |
fileformat = FileFormat::MPT_1;
|
104 |
}
|
otherarch/gpt2_v3.cpp
CHANGED
@@ -12,6 +12,7 @@
|
|
12 |
#include <string>
|
13 |
#include <vector>
|
14 |
#include <iostream>
|
|
|
15 |
|
16 |
#include "model_adapter.h"
|
17 |
|
@@ -39,6 +40,8 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g
|
|
39 |
}
|
40 |
}
|
41 |
|
|
|
|
|
42 |
// load hparams
|
43 |
{
|
44 |
auto & hparams = model.hparams;
|
@@ -53,7 +56,7 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g
|
|
53 |
const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
|
54 |
|
55 |
printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
|
56 |
-
printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
|
57 |
printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
|
58 |
printf("%s: n_head = %d\n", __func__, hparams.n_head);
|
59 |
printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
|
@@ -90,9 +93,19 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g
|
|
90 |
|
91 |
// if (i < 10) fprintf(stderr, "%.s: vocab[%d] = '%s'\n", __func__, i, word.c_str());
|
92 |
}
|
93 |
-
}
|
94 |
|
95 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
96 |
|
97 |
// for the big tensors, we have the option to store the data in 16-bit floats or quantized
|
98 |
// in order to save memory and also to speed up the computation
|
@@ -144,10 +157,10 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g
|
|
144 |
ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w
|
145 |
ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b
|
146 |
|
147 |
-
ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(
|
148 |
-
ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(
|
149 |
|
150 |
-
ctx_size += (6 + 12*n_layer)*
|
151 |
|
152 |
printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
|
153 |
}
|
@@ -158,7 +171,6 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g
|
|
158 |
params.mem_size = ctx_size;
|
159 |
params.mem_buffer = NULL;
|
160 |
params.no_alloc = false;
|
161 |
-
|
162 |
|
163 |
model.ctx = ggml_init(params);
|
164 |
if (!model.ctx) {
|
@@ -247,11 +259,11 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g
|
|
247 |
const int n_layer = hparams.n_layer;
|
248 |
const int n_ctx = hparams.n_ctx;
|
249 |
|
250 |
-
const int n_mem = n_layer*n_ctx;
|
251 |
const int n_elements = n_embd*n_mem;
|
252 |
|
253 |
-
model.memory_k = ggml_new_tensor_1d(ctx,
|
254 |
-
model.memory_v = ggml_new_tensor_1d(ctx,
|
255 |
|
256 |
const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v);
|
257 |
|
@@ -293,14 +305,14 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g
|
|
293 |
}
|
294 |
|
295 |
auto tensor = model.tensors[name.data()];
|
296 |
-
if (
|
297 |
-
fprintf(stderr, "%s: tensor '%s' has wrong
|
|
|
298 |
return ModelLoadResult::FAIL;
|
299 |
}
|
300 |
-
|
301 |
-
|
302 |
-
|
303 |
-
__func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]);
|
304 |
return ModelLoadResult::FAIL;
|
305 |
}
|
306 |
|
@@ -336,6 +348,28 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g
|
|
336 |
|
337 |
fin.close();
|
338 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
339 |
|
340 |
return ModelLoadResult::SUCCESS;
|
341 |
}
|
@@ -355,7 +389,7 @@ bool gpt2_eval(
|
|
355 |
const std::vector<gpt_vocab::id> & embd_inp,
|
356 |
std::vector<float> & embd_w,
|
357 |
size_t & mem_per_token,
|
358 |
-
|
359 |
const int N = embd_inp.size();
|
360 |
|
361 |
const auto & hparams = model.hparams;
|
@@ -369,8 +403,16 @@ bool gpt2_eval(
|
|
369 |
static size_t buf_size = 256u*1024*1024;
|
370 |
static void * buf = malloc(buf_size);
|
371 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
372 |
if (mem_per_token > 0 && (mem_per_token*N*2 + 64u*1024*1024) > buf_size) {
|
373 |
-
const size_t buf_size_new = 320u*1024*1024 + 1.
|
374 |
//printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
|
375 |
|
376 |
// reallocate
|
@@ -380,7 +422,7 @@ bool gpt2_eval(
|
|
380 |
buf = realloc(buf, buf_size);
|
381 |
if (buf == nullptr)
|
382 |
{
|
383 |
-
fprintf(stderr, "%s: failed to allocate %zu bytes
|
384 |
return false;
|
385 |
}
|
386 |
}
|
@@ -390,7 +432,7 @@ bool gpt2_eval(
|
|
390 |
params.mem_size = buf_size;
|
391 |
params.mem_buffer = buf;
|
392 |
params.no_alloc = false;
|
393 |
-
|
394 |
|
395 |
struct ggml_context * ctx0 = ggml_init(params);
|
396 |
struct ggml_cgraph gf = {};
|
@@ -413,6 +455,10 @@ bool gpt2_eval(
|
|
413 |
for (int il = 0; il < n_layer; ++il) {
|
414 |
struct ggml_tensor * cur;
|
415 |
|
|
|
|
|
|
|
|
|
416 |
// norm
|
417 |
{
|
418 |
// [ 768, N]
|
@@ -559,6 +605,10 @@ bool gpt2_eval(
|
|
559 |
|
560 |
struct ggml_tensor * inpFF = cur;
|
561 |
|
|
|
|
|
|
|
|
|
562 |
// feed-forward network
|
563 |
{
|
564 |
// norm
|
@@ -615,6 +665,10 @@ bool gpt2_eval(
|
|
615 |
inpL = ggml_add(ctx0, cur, inpFF);
|
616 |
}
|
617 |
|
|
|
|
|
|
|
|
|
618 |
// norm
|
619 |
{
|
620 |
// [ 768, N]
|
@@ -629,6 +683,10 @@ bool gpt2_eval(
|
|
629 |
ggml_repeat(ctx0, model.ln_f_b, inpL));
|
630 |
}
|
631 |
|
|
|
|
|
|
|
|
|
632 |
// inpL = WTE * inpL
|
633 |
// [ 768, 50257] - model.lm_head
|
634 |
// [ 768, N] - inpL
|
|
|
12 |
#include <string>
|
13 |
#include <vector>
|
14 |
#include <iostream>
|
15 |
+
#include <algorithm>
|
16 |
|
17 |
#include "model_adapter.h"
|
18 |
|
|
|
40 |
}
|
41 |
}
|
42 |
|
43 |
+
int32_t origmaxctx = model.hparams.n_ctx;
|
44 |
+
|
45 |
// load hparams
|
46 |
{
|
47 |
auto & hparams = model.hparams;
|
|
|
56 |
const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
|
57 |
|
58 |
printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
|
59 |
+
printf("%s: n_ctx = %d (%d)\n", __func__, hparams.n_ctx,origmaxctx);
|
60 |
printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
|
61 |
printf("%s: n_head = %d\n", __func__, hparams.n_head);
|
62 |
printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
|
|
|
93 |
|
94 |
// if (i < 10) fprintf(stderr, "%.s: vocab[%d] = '%s'\n", __func__, i, word.c_str());
|
95 |
}
|
|
|
96 |
|
97 |
+
// Add StarChat special tokens.
|
98 |
+
for (const std::string & token : {
|
99 |
+
"<|system|>",
|
100 |
+
"<|user|>",
|
101 |
+
"<|assistant|>",
|
102 |
+
"<|end|>",
|
103 |
+
}) {
|
104 |
+
if (vocab.token_to_id.find(token) != vocab.token_to_id.end()) {
|
105 |
+
vocab.add_special_token(token);
|
106 |
+
}
|
107 |
+
}
|
108 |
+
}
|
109 |
|
110 |
// for the big tensors, we have the option to store the data in 16-bit floats or quantized
|
111 |
// in order to save memory and also to speed up the computation
|
|
|
157 |
ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w
|
158 |
ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b
|
159 |
|
160 |
+
ctx_size += std::max(origmaxctx,n_ctx)*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F16); // memory_k
|
161 |
+
ctx_size += std::max(origmaxctx,n_ctx)*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F16); // memory_v
|
162 |
|
163 |
+
ctx_size += (6 + 12*n_layer)*1024; // object overhead
|
164 |
|
165 |
printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
|
166 |
}
|
|
|
171 |
params.mem_size = ctx_size;
|
172 |
params.mem_buffer = NULL;
|
173 |
params.no_alloc = false;
|
|
|
174 |
|
175 |
model.ctx = ggml_init(params);
|
176 |
if (!model.ctx) {
|
|
|
259 |
const int n_layer = hparams.n_layer;
|
260 |
const int n_ctx = hparams.n_ctx;
|
261 |
|
262 |
+
const int n_mem = n_layer*std::max(origmaxctx,n_ctx);
|
263 |
const int n_elements = n_embd*n_mem;
|
264 |
|
265 |
+
model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements);
|
266 |
+
model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements);
|
267 |
|
268 |
const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v);
|
269 |
|
|
|
305 |
}
|
306 |
|
307 |
auto tensor = model.tensors[name.data()];
|
308 |
+
if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
|
309 |
+
fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
|
310 |
+
__func__, name.data(), (int) tensor->ne[0], (int) tensor->ne[1], ne[0], ne[1]);
|
311 |
return ModelLoadResult::FAIL;
|
312 |
}
|
313 |
+
if (ggml_nelements(tensor) != nelements) {
|
314 |
+
fprintf(stderr, "%s: tensor '%s' has wrong size in model file. got %d, expected %d\n",
|
315 |
+
__func__, name.data(), (int) ggml_nelements(tensor), nelements);
|
|
|
316 |
return ModelLoadResult::FAIL;
|
317 |
}
|
318 |
|
|
|
348 |
|
349 |
fin.close();
|
350 |
|
351 |
+
//gpu offload
|
352 |
+
#if defined(GGML_USE_CLBLAST)
|
353 |
+
if(gpulayers>0)
|
354 |
+
{
|
355 |
+
const auto & hparams = model.hparams;
|
356 |
+
size_t vram_total = 0;
|
357 |
+
const int n_gpu = std::min(gpulayers, int(hparams.n_layer));
|
358 |
+
fprintf(stderr, "%s: [opencl] offloading %d layers to GPU\n", __func__, n_gpu);
|
359 |
+
for (int i = 0; i < n_gpu; ++i) {
|
360 |
+
const auto & layer = model.layers[i];
|
361 |
+
layer.c_attn_attn_w->backend = GGML_BACKEND_GPU;
|
362 |
+
layer.c_attn_proj_w->backend = GGML_BACKEND_GPU;
|
363 |
+
layer.c_mlp_fc_w->backend = GGML_BACKEND_GPU;
|
364 |
+
layer.c_mlp_proj_w->backend = GGML_BACKEND_GPU;
|
365 |
+
ggml_cl_transform_tensor(layer.c_attn_attn_w->data,layer.c_attn_attn_w); vram_total += ggml_nbytes(layer.c_attn_attn_w);
|
366 |
+
ggml_cl_transform_tensor(layer.c_attn_proj_w->data,layer.c_attn_proj_w); vram_total += ggml_nbytes(layer.c_attn_proj_w);
|
367 |
+
ggml_cl_transform_tensor(layer.c_mlp_fc_w->data,layer.c_mlp_fc_w); vram_total += ggml_nbytes(layer.c_mlp_fc_w);
|
368 |
+
ggml_cl_transform_tensor(layer.c_mlp_proj_w->data,layer.c_mlp_proj_w); vram_total += ggml_nbytes(layer.c_mlp_proj_w);
|
369 |
+
}
|
370 |
+
fprintf(stderr, "%s: [opencl] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
|
371 |
+
}
|
372 |
+
#endif
|
373 |
|
374 |
return ModelLoadResult::SUCCESS;
|
375 |
}
|
|
|
389 |
const std::vector<gpt_vocab::id> & embd_inp,
|
390 |
std::vector<float> & embd_w,
|
391 |
size_t & mem_per_token,
|
392 |
+
bool use_scratch) {
|
393 |
const int N = embd_inp.size();
|
394 |
|
395 |
const auto & hparams = model.hparams;
|
|
|
403 |
static size_t buf_size = 256u*1024*1024;
|
404 |
static void * buf = malloc(buf_size);
|
405 |
|
406 |
+
// use 2 scratch buffers
|
407 |
+
// TODO: very hacky solution - reimplement in a more elegant way
|
408 |
+
static size_t scr0_size = (n_embd>2400?512u:256u)*1024*1024;
|
409 |
+
static size_t scr1_size = (n_embd>2400?512u:256u)*1024*1024;
|
410 |
+
|
411 |
+
static void * scr0 = malloc(scr0_size);
|
412 |
+
static void * scr1 = malloc(scr1_size);
|
413 |
+
|
414 |
if (mem_per_token > 0 && (mem_per_token*N*2 + 64u*1024*1024) > buf_size) {
|
415 |
+
const size_t buf_size_new = 320u*1024*1024 + 1.2*(mem_per_token*N); // add 10% to account for ggml object overhead
|
416 |
//printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
|
417 |
|
418 |
// reallocate
|
|
|
422 |
buf = realloc(buf, buf_size);
|
423 |
if (buf == nullptr)
|
424 |
{
|
425 |
+
fprintf(stderr, "%s: failed to allocate %zu bytes. Try reducing batch size.\n", __func__, buf_size);
|
426 |
return false;
|
427 |
}
|
428 |
}
|
|
|
432 |
params.mem_size = buf_size;
|
433 |
params.mem_buffer = buf;
|
434 |
params.no_alloc = false;
|
435 |
+
|
436 |
|
437 |
struct ggml_context * ctx0 = ggml_init(params);
|
438 |
struct ggml_cgraph gf = {};
|
|
|
455 |
for (int il = 0; il < n_layer; ++il) {
|
456 |
struct ggml_tensor * cur;
|
457 |
|
458 |
+
if(use_scratch){
|
459 |
+
ggml_set_scratch(ctx0, { 0, scr0_size, scr0, });
|
460 |
+
}
|
461 |
+
|
462 |
// norm
|
463 |
{
|
464 |
// [ 768, N]
|
|
|
605 |
|
606 |
struct ggml_tensor * inpFF = cur;
|
607 |
|
608 |
+
if(use_scratch){
|
609 |
+
ggml_set_scratch(ctx0, { 0, scr1_size, scr1, });
|
610 |
+
}
|
611 |
+
|
612 |
// feed-forward network
|
613 |
{
|
614 |
// norm
|
|
|
665 |
inpL = ggml_add(ctx0, cur, inpFF);
|
666 |
}
|
667 |
|
668 |
+
if(use_scratch){
|
669 |
+
ggml_set_scratch(ctx0, { 0, scr0_size, scr0, });
|
670 |
+
}
|
671 |
+
|
672 |
// norm
|
673 |
{
|
674 |
// [ 768, N]
|
|
|
683 |
ggml_repeat(ctx0, model.ln_f_b, inpL));
|
684 |
}
|
685 |
|
686 |
+
if(use_scratch){
|
687 |
+
ggml_set_scratch(ctx0, { 0, 0, nullptr, });
|
688 |
+
}
|
689 |
+
|
690 |
// inpL = WTE * inpL
|
691 |
// [ 768, 50257] - model.lm_head
|
692 |
// [ 768, N] - inpL
|
otherarch/gptj_v3.cpp
CHANGED
@@ -12,10 +12,13 @@
|
|
12 |
#include <string>
|
13 |
#include <vector>
|
14 |
#include <iostream>
|
|
|
15 |
|
16 |
#include "model_adapter.h"
|
17 |
|
18 |
-
|
|
|
|
|
19 |
|
20 |
// load the model's weights from a file
|
21 |
ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, gpt_vocab & vocab, int gpulayers) {
|
@@ -37,6 +40,8 @@ ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, g
|
|
37 |
}
|
38 |
}
|
39 |
|
|
|
|
|
40 |
// load hparams
|
41 |
{
|
42 |
auto & hparams = model.hparams;
|
@@ -52,7 +57,7 @@ ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, g
|
|
52 |
const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
|
53 |
|
54 |
printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
|
55 |
-
printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
|
56 |
printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
|
57 |
printf("%s: n_head = %d\n", __func__, hparams.n_head);
|
58 |
printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
|
@@ -136,8 +141,8 @@ ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, g
|
|
136 |
ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w
|
137 |
ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b
|
138 |
|
139 |
-
ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(memory_type); // memory_k
|
140 |
-
ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(memory_type); // memory_v
|
141 |
|
142 |
ctx_size += (5 + 10*n_layer)*512; // object overhead
|
143 |
|
@@ -150,7 +155,7 @@ ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, g
|
|
150 |
params.mem_size = ctx_size;
|
151 |
params.mem_buffer = NULL;
|
152 |
params.no_alloc = false;
|
153 |
-
|
154 |
|
155 |
model.ctx = ggml_init(params);
|
156 |
if (!model.ctx) {
|
@@ -230,7 +235,7 @@ ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, g
|
|
230 |
const int n_layer = hparams.n_layer;
|
231 |
const int n_ctx = hparams.n_ctx;
|
232 |
|
233 |
-
const int n_mem = n_layer*n_ctx;
|
234 |
const int n_elements = n_embd*n_mem;
|
235 |
|
236 |
model.memory_k = ggml_new_tensor_1d(ctx, memory_type, n_elements);
|
@@ -281,7 +286,7 @@ ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, g
|
|
281 |
fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
|
282 |
return ModelLoadResult::FAIL;
|
283 |
}
|
284 |
-
|
285 |
|
286 |
if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
|
287 |
|
@@ -298,7 +303,7 @@ ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, g
|
|
298 |
__func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]);
|
299 |
return ModelLoadResult::FAIL;
|
300 |
}
|
301 |
-
|
302 |
}
|
303 |
|
304 |
// for debugging
|
@@ -331,7 +336,32 @@ ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, g
|
|
331 |
|
332 |
fin.close();
|
333 |
|
334 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
335 |
|
336 |
return ModelLoadResult::SUCCESS;
|
337 |
}
|
@@ -352,7 +382,8 @@ bool gptj_eval(
|
|
352 |
const int n_past,
|
353 |
const std::vector<gpt_vocab::id> & embd_inp,
|
354 |
std::vector<float> & embd_w,
|
355 |
-
size_t & mem_per_token
|
|
|
356 |
const int N = embd_inp.size();
|
357 |
|
358 |
const auto & hparams = model.hparams;
|
@@ -367,8 +398,16 @@ bool gptj_eval(
|
|
367 |
static size_t buf_size = 256u*1024*1024;
|
368 |
static void * buf = malloc(buf_size);
|
369 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
370 |
if (mem_per_token > 0 && (mem_per_token*N*2 + 64u*1024*1024) > buf_size) {
|
371 |
-
const size_t buf_size_new = 320u*1024*1024 + 1.
|
372 |
//printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
|
373 |
|
374 |
// reallocate
|
@@ -378,7 +417,7 @@ bool gptj_eval(
|
|
378 |
buf = realloc(buf, buf_size);
|
379 |
if (buf == nullptr)
|
380 |
{
|
381 |
-
fprintf(stderr, "%s: failed to allocate %zu bytes
|
382 |
return false;
|
383 |
}
|
384 |
}
|
@@ -388,7 +427,7 @@ bool gptj_eval(
|
|
388 |
params.mem_size = buf_size;
|
389 |
params.mem_buffer = buf;
|
390 |
params.no_alloc = false;
|
391 |
-
|
392 |
|
393 |
struct ggml_context * ctx0 = ggml_init(params);
|
394 |
struct ggml_cgraph gf = {};
|
@@ -403,6 +442,10 @@ bool gptj_eval(
|
|
403 |
for (int il = 0; il < n_layer; ++il) {
|
404 |
struct ggml_tensor * cur;
|
405 |
|
|
|
|
|
|
|
|
|
406 |
// norm
|
407 |
{
|
408 |
cur = ggml_norm(ctx0, inpL);
|
@@ -490,6 +533,10 @@ bool gptj_eval(
|
|
490 |
cur);
|
491 |
}
|
492 |
|
|
|
|
|
|
|
|
|
493 |
struct ggml_tensor * inpFF = cur;
|
494 |
|
495 |
// feed-forward network
|
@@ -525,6 +572,10 @@ bool gptj_eval(
|
|
525 |
inpL = ggml_add(ctx0, cur, inpL);
|
526 |
}
|
527 |
|
|
|
|
|
|
|
|
|
528 |
// norm
|
529 |
{
|
530 |
inpL = ggml_norm(ctx0, inpL);
|
@@ -537,6 +588,10 @@ bool gptj_eval(
|
|
537 |
ggml_repeat(ctx0, model.ln_f_b, inpL));
|
538 |
}
|
539 |
|
|
|
|
|
|
|
|
|
540 |
// lm_head
|
541 |
{
|
542 |
inpL = ggml_mul_mat(ctx0, model.lmh_g, inpL);
|
|
|
12 |
#include <string>
|
13 |
#include <vector>
|
14 |
#include <iostream>
|
15 |
+
#include <algorithm>
|
16 |
|
17 |
#include "model_adapter.h"
|
18 |
|
19 |
+
#if defined(GGML_USE_CLBLAST)
|
20 |
+
#include "ggml-opencl.h"
|
21 |
+
#endif
|
22 |
|
23 |
// load the model's weights from a file
|
24 |
ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, gpt_vocab & vocab, int gpulayers) {
|
|
|
40 |
}
|
41 |
}
|
42 |
|
43 |
+
int32_t origmaxctx = model.hparams.n_ctx;
|
44 |
+
|
45 |
// load hparams
|
46 |
{
|
47 |
auto & hparams = model.hparams;
|
|
|
57 |
const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
|
58 |
|
59 |
printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
|
60 |
+
printf("%s: n_ctx = %d (%d)\n", __func__, hparams.n_ctx,origmaxctx);
|
61 |
printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
|
62 |
printf("%s: n_head = %d\n", __func__, hparams.n_head);
|
63 |
printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
|
|
|
141 |
ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w
|
142 |
ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b
|
143 |
|
144 |
+
ctx_size += std::max(origmaxctx,n_ctx)*n_layer*n_embd*ggml_type_sizef(memory_type); // memory_k
|
145 |
+
ctx_size += std::max(origmaxctx,n_ctx)*n_layer*n_embd*ggml_type_sizef(memory_type); // memory_v
|
146 |
|
147 |
ctx_size += (5 + 10*n_layer)*512; // object overhead
|
148 |
|
|
|
155 |
params.mem_size = ctx_size;
|
156 |
params.mem_buffer = NULL;
|
157 |
params.no_alloc = false;
|
158 |
+
|
159 |
|
160 |
model.ctx = ggml_init(params);
|
161 |
if (!model.ctx) {
|
|
|
235 |
const int n_layer = hparams.n_layer;
|
236 |
const int n_ctx = hparams.n_ctx;
|
237 |
|
238 |
+
const int n_mem = n_layer*std::max(origmaxctx,n_ctx);
|
239 |
const int n_elements = n_embd*n_mem;
|
240 |
|
241 |
model.memory_k = ggml_new_tensor_1d(ctx, memory_type, n_elements);
|
|
|
286 |
fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
|
287 |
return ModelLoadResult::FAIL;
|
288 |
}
|
289 |
+
|
290 |
|
291 |
if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
|
292 |
|
|
|
303 |
__func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]);
|
304 |
return ModelLoadResult::FAIL;
|
305 |
}
|
306 |
+
|
307 |
}
|
308 |
|
309 |
// for debugging
|
|
|
336 |
|
337 |
fin.close();
|
338 |
|
339 |
+
//gpu offload
|
340 |
+
#if defined(GGML_USE_CLBLAST)
|
341 |
+
if(gpulayers>0)
|
342 |
+
{
|
343 |
+
const auto & hparams = model.hparams;
|
344 |
+
size_t vram_total = 0;
|
345 |
+
const int n_gpu = std::min(gpulayers, int(hparams.n_layer));
|
346 |
+
fprintf(stderr, "%s: [opencl] offloading %d layers to GPU\n", __func__, n_gpu);
|
347 |
+
for (int i = 0; i < n_gpu; ++i) {
|
348 |
+
const auto & layer = model.layers[i];
|
349 |
+
layer.c_attn_q_proj_w->backend = GGML_BACKEND_GPU;
|
350 |
+
layer.c_attn_k_proj_w->backend = GGML_BACKEND_GPU;
|
351 |
+
layer.c_attn_v_proj_w->backend = GGML_BACKEND_GPU;
|
352 |
+
layer.c_attn_proj_w->backend = GGML_BACKEND_GPU;
|
353 |
+
layer.c_mlp_fc_w->backend = GGML_BACKEND_GPU;
|
354 |
+
layer.c_mlp_proj_w->backend = GGML_BACKEND_GPU;
|
355 |
+
ggml_cl_transform_tensor(layer.c_attn_q_proj_w->data,layer.c_attn_q_proj_w); vram_total += ggml_nbytes(layer.c_attn_q_proj_w);
|
356 |
+
ggml_cl_transform_tensor(layer.c_attn_k_proj_w->data,layer.c_attn_k_proj_w); vram_total += ggml_nbytes(layer.c_attn_k_proj_w);
|
357 |
+
ggml_cl_transform_tensor(layer.c_attn_v_proj_w->data,layer.c_attn_v_proj_w); vram_total += ggml_nbytes(layer.c_attn_v_proj_w);
|
358 |
+
ggml_cl_transform_tensor(layer.c_attn_proj_w->data,layer.c_attn_proj_w); vram_total += ggml_nbytes(layer.c_attn_proj_w);
|
359 |
+
ggml_cl_transform_tensor(layer.c_mlp_fc_w->data,layer.c_mlp_fc_w); vram_total += ggml_nbytes(layer.c_mlp_fc_w);
|
360 |
+
ggml_cl_transform_tensor(layer.c_mlp_proj_w->data,layer.c_mlp_proj_w); vram_total += ggml_nbytes(layer.c_mlp_proj_w);
|
361 |
+
}
|
362 |
+
fprintf(stderr, "%s: [opencl] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
|
363 |
+
}
|
364 |
+
#endif
|
365 |
|
366 |
return ModelLoadResult::SUCCESS;
|
367 |
}
|
|
|
382 |
const int n_past,
|
383 |
const std::vector<gpt_vocab::id> & embd_inp,
|
384 |
std::vector<float> & embd_w,
|
385 |
+
size_t & mem_per_token,
|
386 |
+
bool use_scratch) {
|
387 |
const int N = embd_inp.size();
|
388 |
|
389 |
const auto & hparams = model.hparams;
|
|
|
398 |
static size_t buf_size = 256u*1024*1024;
|
399 |
static void * buf = malloc(buf_size);
|
400 |
|
401 |
+
// use 2 scratch buffers
|
402 |
+
// TODO: very hacky solution - reimplement in a more elegant way
|
403 |
+
static size_t scr0_size = 512u*1024*1024;
|
404 |
+
static size_t scr1_size = 512u*1024*1024;
|
405 |
+
|
406 |
+
static void * scr0 = malloc(scr0_size);
|
407 |
+
static void * scr1 = malloc(scr1_size);
|
408 |
+
|
409 |
if (mem_per_token > 0 && (mem_per_token*N*2 + 64u*1024*1024) > buf_size) {
|
410 |
+
const size_t buf_size_new = 320u*1024*1024 + 1.2*(mem_per_token*N); // fixed 320 MB base plus 20% headroom to account for ggml object overhead
|
411 |
//printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
|
412 |
|
413 |
// reallocate
|
|
|
417 |
buf = realloc(buf, buf_size);
|
418 |
if (buf == nullptr)
|
419 |
{
|
420 |
+
fprintf(stderr, "%s: failed to allocate %zu bytes. Try reducing batch size.\n", __func__, buf_size);
|
421 |
return false;
|
422 |
}
|
423 |
}
|
|
|
427 |
params.mem_size = buf_size;
|
428 |
params.mem_buffer = buf;
|
429 |
params.no_alloc = false;
|
430 |
+
|
431 |
|
432 |
struct ggml_context * ctx0 = ggml_init(params);
|
433 |
struct ggml_cgraph gf = {};
|
|
|
442 |
for (int il = 0; il < n_layer; ++il) {
|
443 |
struct ggml_tensor * cur;
|
444 |
|
445 |
+
if(use_scratch){
|
446 |
+
ggml_set_scratch(ctx0, { 0, scr0_size, scr0, });
|
447 |
+
}
|
448 |
+
|
449 |
// norm
|
450 |
{
|
451 |
cur = ggml_norm(ctx0, inpL);
|
|
|
533 |
cur);
|
534 |
}
|
535 |
|
536 |
+
if(use_scratch){
|
537 |
+
ggml_set_scratch(ctx0, { 0, scr1_size, scr1, });
|
538 |
+
}
|
539 |
+
|
540 |
struct ggml_tensor * inpFF = cur;
|
541 |
|
542 |
// feed-forward network
|
|
|
572 |
inpL = ggml_add(ctx0, cur, inpL);
|
573 |
}
|
574 |
|
575 |
+
if(use_scratch){
|
576 |
+
ggml_set_scratch(ctx0, { 0, scr0_size, scr0, });
|
577 |
+
}
|
578 |
+
|
579 |
// norm
|
580 |
{
|
581 |
inpL = ggml_norm(ctx0, inpL);
|
|
|
588 |
ggml_repeat(ctx0, model.ln_f_b, inpL));
|
589 |
}
|
590 |
|
591 |
+
if(use_scratch){
|
592 |
+
ggml_set_scratch(ctx0, { 0, 0, nullptr, });
|
593 |
+
}
|
594 |
+
|
595 |
// lm_head
|
596 |
{
|
597 |
inpL = ggml_mul_mat(ctx0, model.lmh_g, inpL);
|
otherarch/llama_v2.cpp
CHANGED
@@ -59,7 +59,7 @@ static const std::map<e_model2, size_t> & MEM_REQ_SCRATCH0_2()
|
|
59 |
{ MODEL_UNKNOWN_2, 512ull * MB_2 },
|
60 |
{ MODEL_7B_2, 512ull * MB_2 },
|
61 |
{ MODEL_13B_2, 512ull * MB_2 },
|
62 |
-
{ MODEL_30B_2,
|
63 |
{ MODEL_65B_2, 1024ull * MB_2 },
|
64 |
};
|
65 |
return k_sizes;
|
@@ -71,7 +71,7 @@ static const std::map<e_model2, size_t> & MEM_REQ_SCRATCH1_2()
|
|
71 |
{ MODEL_UNKNOWN_2, 512ull * MB_2 },
|
72 |
{ MODEL_7B_2, 512ull * MB_2 },
|
73 |
{ MODEL_13B_2, 512ull * MB_2 },
|
74 |
-
{ MODEL_30B_2,
|
75 |
{ MODEL_65B_2, 1024ull * MB_2 },
|
76 |
};
|
77 |
return k_sizes;
|
|
|
59 |
{ MODEL_UNKNOWN_2, 512ull * MB_2 },
|
60 |
{ MODEL_7B_2, 512ull * MB_2 },
|
61 |
{ MODEL_13B_2, 512ull * MB_2 },
|
62 |
+
{ MODEL_30B_2, 640ull * MB_2 },
|
63 |
{ MODEL_65B_2, 1024ull * MB_2 },
|
64 |
};
|
65 |
return k_sizes;
|
|
|
71 |
{ MODEL_UNKNOWN_2, 512ull * MB_2 },
|
72 |
{ MODEL_7B_2, 512ull * MB_2 },
|
73 |
{ MODEL_13B_2, 512ull * MB_2 },
|
74 |
+
{ MODEL_30B_2, 640ull * MB_2 },
|
75 |
{ MODEL_65B_2, 1024ull * MB_2 },
|
76 |
};
|
77 |
return k_sizes;
|
otherarch/mpt_v3.cpp
CHANGED
@@ -12,13 +12,16 @@
|
|
12 |
#include <string>
|
13 |
#include <vector>
|
14 |
#include <iostream>
|
|
|
15 |
|
16 |
#include "model_adapter.h"
|
17 |
|
18 |
-
|
|
|
|
|
19 |
|
20 |
// load the model's weights from a file
|
21 |
-
bool mpt_model_load(const std::string & fname, mpt_model & model, gpt_vocab & vocab) {
|
22 |
printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
|
23 |
|
24 |
auto fin = std::ifstream(fname, std::ios::binary);
|
@@ -75,7 +78,7 @@ bool mpt_model_load(const std::string & fname, mpt_model & model, gpt_vocab & vo
|
|
75 |
std::string word;
|
76 |
std::vector<char> buf(128);
|
77 |
|
78 |
-
for (int i = 0; i < n_vocab; i++) {
|
79 |
uint32_t len;
|
80 |
fin.read((char *) &len, sizeof(len));
|
81 |
|
@@ -83,6 +86,16 @@ bool mpt_model_load(const std::string & fname, mpt_model & model, gpt_vocab & vo
|
|
83 |
fin.read((char *) buf.data(), len);
|
84 |
word.assign(buf.data(), len);
|
85 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
86 |
vocab.token_to_id[word] = i;
|
87 |
vocab.id_to_token[i] = word;
|
88 |
}
|
@@ -120,8 +133,8 @@ bool mpt_model_load(const std::string & fname, mpt_model & model, gpt_vocab & vo
|
|
120 |
ctx_size += n_layer * (4 * n_embd * n_embd * ggml_type_sizef(wtype)); // mlp_mlp_up_weight
|
121 |
ctx_size += n_layer * (n_embd * n_embd * 4 * ggml_type_sizef(wtype)); // mlp_mlp_down_weight
|
122 |
|
123 |
-
ctx_size +=
|
124 |
-
ctx_size +=
|
125 |
|
126 |
ctx_size += (6 + 6 * n_layer) * 512; // object overhead
|
127 |
|
@@ -278,6 +291,29 @@ bool mpt_model_load(const std::string & fname, mpt_model & model, gpt_vocab & vo
|
|
278 |
|
279 |
fin.close();
|
280 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
281 |
return true;
|
282 |
}
|
283 |
|
@@ -290,7 +326,8 @@ bool mpt_model_load(const std::string & fname, mpt_model & model, gpt_vocab & vo
|
|
290 |
// - embd_w: the predicted logits for the next token
|
291 |
//
|
292 |
bool mpt_eval(const mpt_model & model, const int n_threads, const int n_past,
|
293 |
-
const std::vector<gpt_vocab::id> & embd_inp, std::vector<float> & embd_w,
|
|
|
294 |
const int N = embd_inp.size();
|
295 |
|
296 |
const auto & hparams = model.hparams;
|
@@ -306,22 +343,26 @@ bool mpt_eval(const mpt_model & model, const int n_threads, const int n_past,
|
|
306 |
|
307 |
// use 2 scratch buffers
|
308 |
// TODO: very hacky solution - reimplement in a more elegant way
|
309 |
-
|
310 |
-
static
|
|
|
311 |
|
312 |
-
static
|
313 |
static void * scr1 = malloc(scr1_size);
|
314 |
|
315 |
-
if (mem_per_token > 0 && mem_per_token *
|
316 |
-
const size_t buf_size_new = 1.
|
317 |
// printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__,
|
318 |
// buf_size, buf_size_new);
|
319 |
// reallocate
|
320 |
-
|
321 |
-
|
322 |
-
|
323 |
-
|
324 |
-
|
|
|
|
|
|
|
325 |
}
|
326 |
}
|
327 |
|
@@ -343,7 +384,9 @@ bool mpt_eval(const mpt_model & model, const int n_threads, const int n_past,
|
|
343 |
|
344 |
struct ggml_tensor * cur;
|
345 |
|
|
|
346 |
ggml_set_scratch(ctx0, { 0, scr0_size, scr0, });
|
|
|
347 |
|
348 |
// a = self.ln_1(x)
|
349 |
{
|
@@ -439,7 +482,9 @@ bool mpt_eval(const mpt_model & model, const int n_threads, const int n_past,
|
|
439 |
|
440 |
inpL = ggml_add(ctx0, inpL, cur);
|
441 |
|
|
|
442 |
ggml_set_scratch(ctx0, { 0, scr1_size, scr1, });
|
|
|
443 |
|
444 |
// m = self.ln_2(x)
|
445 |
{
|
@@ -465,7 +510,9 @@ bool mpt_eval(const mpt_model & model, const int n_threads, const int n_past,
|
|
465 |
inpL = ggml_add(ctx0, inpL, cur);
|
466 |
}
|
467 |
|
|
|
468 |
ggml_set_scratch(ctx0, { 0, scr0_size, scr0, });
|
|
|
469 |
|
470 |
// norm
|
471 |
{
|
@@ -474,7 +521,9 @@ bool mpt_eval(const mpt_model & model, const int n_threads, const int n_past,
|
|
474 |
inpL = ggml_mul(ctx0, ggml_repeat(ctx0, model.norm_f_weight, inpL), inpL);
|
475 |
}
|
476 |
|
|
|
477 |
ggml_set_scratch(ctx0, { 0, 0, nullptr, });
|
|
|
478 |
|
479 |
// output embedding weight tied to input embedding
|
480 |
inpL = ggml_mul_mat(ctx0, model.wte_weight, inpL);
|
|
|
12 |
#include <string>
|
13 |
#include <vector>
|
14 |
#include <iostream>
|
15 |
+
#include <algorithm>
|
16 |
|
17 |
#include "model_adapter.h"
|
18 |
|
19 |
+
#if defined(GGML_USE_CLBLAST)
|
20 |
+
#include "ggml-opencl.h"
|
21 |
+
#endif
|
22 |
|
23 |
// load the model's weights from a file
|
24 |
+
bool mpt_model_load(const std::string & fname, mpt_model & model, gpt_vocab & vocab, int gpulayers) {
|
25 |
printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
|
26 |
|
27 |
auto fin = std::ifstream(fname, std::ios::binary);
|
|
|
78 |
std::string word;
|
79 |
std::vector<char> buf(128);
|
80 |
|
81 |
+
for (int i = 0; i < n_vocab; i++) {
|
82 |
uint32_t len;
|
83 |
fin.read((char *) &len, sizeof(len));
|
84 |
|
|
|
86 |
fin.read((char *) buf.data(), len);
|
87 |
word.assign(buf.data(), len);
|
88 |
|
89 |
+
// Convert token from utf-8
|
90 |
+
// std::wstring word_multibytes = convert_to_wstring(word);
|
91 |
+
// if(word_multibytes!=L"")
|
92 |
+
// {
|
93 |
+
// word.resize(word_multibytes.size());
|
94 |
+
// for (int w = 0; w < word_multibytes.size(); w++) {
|
95 |
+
// word[w] = uint8_t(word_multibytes[w]);
|
96 |
+
// }
|
97 |
+
// }
|
98 |
+
|
99 |
vocab.token_to_id[word] = i;
|
100 |
vocab.id_to_token[i] = word;
|
101 |
}
|
|
|
133 |
ctx_size += n_layer * (4 * n_embd * n_embd * ggml_type_sizef(wtype)); // mlp_mlp_up_weight
|
134 |
ctx_size += n_layer * (n_embd * n_embd * 4 * ggml_type_sizef(wtype)); // mlp_mlp_down_weight
|
135 |
|
136 |
+
ctx_size += n_ctx * n_layer * n_embd * ggml_type_sizef(GGML_TYPE_F16); // memory_k
|
137 |
+
ctx_size += n_ctx * n_layer * n_embd * ggml_type_sizef(GGML_TYPE_F16); // memory_v
|
138 |
|
139 |
ctx_size += (6 + 6 * n_layer) * 512; // object overhead
|
140 |
|
|
|
291 |
|
292 |
fin.close();
|
293 |
|
294 |
+
//gpu offload
|
295 |
+
#if defined(GGML_USE_CLBLAST)
|
296 |
+
if(gpulayers>0)
|
297 |
+
{
|
298 |
+
const auto & hparams = model.hparams;
|
299 |
+
size_t vram_total = 0;
|
300 |
+
const int n_gpu = std::min(gpulayers, int(hparams.n_layers));
|
301 |
+
fprintf(stderr, "%s: [opencl] offloading %d layers to GPU\n", __func__, n_gpu);
|
302 |
+
for (int i = 0; i < n_gpu; ++i) {
|
303 |
+
const auto & layer = model.layers[i];
|
304 |
+
layer.ffn_up_proj->backend = GGML_BACKEND_GPU;
|
305 |
+
layer.ffn_down_proj->backend = GGML_BACKEND_GPU;
|
306 |
+
layer.c_attn_wqkv_weight->backend = GGML_BACKEND_GPU;
|
307 |
+
layer.c_attn_out_proj_weight->backend = GGML_BACKEND_GPU;
|
308 |
+
ggml_cl_transform_tensor(layer.ffn_up_proj->data,layer.ffn_up_proj); vram_total += ggml_nbytes(layer.ffn_up_proj);
|
309 |
+
ggml_cl_transform_tensor(layer.ffn_down_proj->data,layer.ffn_down_proj); vram_total += ggml_nbytes(layer.ffn_down_proj);
|
310 |
+
ggml_cl_transform_tensor(layer.c_attn_wqkv_weight->data,layer.c_attn_wqkv_weight); vram_total += ggml_nbytes(layer.c_attn_wqkv_weight);
|
311 |
+
ggml_cl_transform_tensor(layer.c_attn_out_proj_weight->data,layer.c_attn_out_proj_weight); vram_total += ggml_nbytes(layer.c_attn_out_proj_weight);
|
312 |
+
}
|
313 |
+
fprintf(stderr, "%s: [opencl] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
|
314 |
+
}
|
315 |
+
#endif
|
316 |
+
|
317 |
return true;
|
318 |
}
|
319 |
|
|
|
326 |
// - embd_w: the predicted logits for the next token
|
327 |
//
|
328 |
bool mpt_eval(const mpt_model & model, const int n_threads, const int n_past,
|
329 |
+
const std::vector<gpt_vocab::id> & embd_inp, std::vector<float> & embd_w,
|
330 |
+
bool logits_all, size_t & mem_per_token, bool use_scratch) {
|
331 |
const int N = embd_inp.size();
|
332 |
|
333 |
const auto & hparams = model.hparams;
|
|
|
343 |
|
344 |
// use 2 scratch buffers
|
345 |
// TODO: very hacky solution - reimplement in a more elegant way
|
346 |
+
//MPT 30B needs more scratch memory
|
347 |
+
static size_t scr0_size = (n_embd>=7168?2048u:1024u)*1024*1024;
|
348 |
+
static size_t scr1_size = (n_embd>=7168?2048u:1024u)*1024*1024;
|
349 |
|
350 |
+
static void * scr0 = malloc(scr0_size);
|
351 |
static void * scr1 = malloc(scr1_size);
|
352 |
|
353 |
+
if (mem_per_token > 0 && (mem_per_token*N*2 + 64u*1024*1024) > buf_size) {
|
354 |
+
const size_t buf_size_new = 320u*1024*1024 + 1.2*(mem_per_token*N); // fixed 320 MB base plus 20% headroom to account for ggml object overhead
|
355 |
// printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__,
|
356 |
// buf_size, buf_size_new);
|
357 |
// reallocate
|
358 |
+
if (buf_size_new > buf_size)
|
359 |
+
{
|
360 |
+
buf_size = buf_size_new;
|
361 |
+
buf = realloc(buf, buf_size);
|
362 |
+
if (buf == nullptr) {
|
363 |
+
fprintf(stderr, "%s: failed to allocate %zu bytes. Try reducing batch size.\n", __func__, buf_size);
|
364 |
+
return false;
|
365 |
+
}
|
366 |
}
|
367 |
}
|
368 |
|
|
|
384 |
|
385 |
struct ggml_tensor * cur;
|
386 |
|
387 |
+
if(use_scratch){
|
388 |
ggml_set_scratch(ctx0, { 0, scr0_size, scr0, });
|
389 |
+
}
|
390 |
|
391 |
// a = self.ln_1(x)
|
392 |
{
|
|
|
482 |
|
483 |
inpL = ggml_add(ctx0, inpL, cur);
|
484 |
|
485 |
+
if(use_scratch){
|
486 |
ggml_set_scratch(ctx0, { 0, scr1_size, scr1, });
|
487 |
+
}
|
488 |
|
489 |
// m = self.ln_2(x)
|
490 |
{
|
|
|
510 |
inpL = ggml_add(ctx0, inpL, cur);
|
511 |
}
|
512 |
|
513 |
+
if(use_scratch){
|
514 |
ggml_set_scratch(ctx0, { 0, scr0_size, scr0, });
|
515 |
+
}
|
516 |
|
517 |
// norm
|
518 |
{
|
|
|
521 |
inpL = ggml_mul(ctx0, ggml_repeat(ctx0, model.norm_f_weight, inpL), inpL);
|
522 |
}
|
523 |
|
524 |
+
if(use_scratch){
|
525 |
ggml_set_scratch(ctx0, { 0, 0, nullptr, });
|
526 |
+
}
|
527 |
|
528 |
// output embedding weight tied to input embedding
|
529 |
inpL = ggml_mul_mat(ctx0, model.wte_weight, inpL);
|
otherarch/neox_v3.cpp
CHANGED
@@ -12,11 +12,14 @@
|
|
12 |
#include <string>
|
13 |
#include <vector>
|
14 |
#include <iostream>
|
|
|
15 |
|
16 |
-
|
|
|
|
|
17 |
|
18 |
// load the model's weights from a file
|
19 |
-
ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt_vocab & vocab, FileFormat file_format) {
|
20 |
printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
|
21 |
|
22 |
auto fin = std::ifstream(fname, std::ios::binary);
|
@@ -35,30 +38,25 @@ ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model &
|
|
35 |
}
|
36 |
}
|
37 |
|
|
|
|
|
38 |
// load hparams
|
39 |
{
|
40 |
auto & hparams = model.hparams;
|
41 |
-
|
42 |
fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
|
43 |
fin.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
|
44 |
fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd));
|
45 |
fin.read((char *) &hparams.n_head, sizeof(hparams.n_head));
|
46 |
fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
|
47 |
fin.read((char *) &hparams.n_rot, sizeof(hparams.n_rot));
|
48 |
-
|
49 |
-
{
|
50 |
-
fin.read((char *) &hparams.par_res, sizeof(hparams.par_res));
|
51 |
-
}
|
52 |
-
if(file_format==FileFormat::NEOX_3)
|
53 |
-
{
|
54 |
-
hparams.par_res = 0;
|
55 |
-
}
|
56 |
fin.read((char *) &hparams.ftype, sizeof(hparams.ftype));
|
57 |
|
58 |
const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
|
59 |
|
60 |
printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
|
61 |
-
printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
|
62 |
printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
|
63 |
printf("%s: n_head = %d\n", __func__, hparams.n_head);
|
64 |
printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
|
@@ -107,10 +105,10 @@ ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model &
|
|
107 |
{
|
108 |
const auto & hparams = model.hparams;
|
109 |
|
110 |
-
const
|
111 |
-
const
|
112 |
-
const
|
113 |
-
const
|
114 |
|
115 |
ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g
|
116 |
ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b
|
@@ -138,10 +136,10 @@ ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model &
|
|
138 |
ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w
|
139 |
ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b
|
140 |
|
141 |
-
ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F16); // memory_k
|
142 |
-
ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F16); // memory_v
|
143 |
|
144 |
-
ctx_size += (6 + 16*n_layer)*
|
145 |
|
146 |
printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
|
147 |
}
|
@@ -152,7 +150,7 @@ ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model &
|
|
152 |
params.mem_size = ctx_size;
|
153 |
params.mem_buffer = NULL;
|
154 |
params.no_alloc = false;
|
155 |
-
|
156 |
model.ctx = ggml_init(params);
|
157 |
if (!model.ctx) {
|
158 |
fprintf(stderr, "%s: ggml_init() failed\n", __func__);
|
@@ -237,7 +235,7 @@ ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model &
|
|
237 |
const int n_layer = hparams.n_layer;
|
238 |
const int n_ctx = hparams.n_ctx;
|
239 |
|
240 |
-
const int64_t n_mem = n_layer*n_ctx;
|
241 |
const int64_t n_elements = n_embd*n_mem;
|
242 |
|
243 |
model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements);
|
@@ -300,22 +298,7 @@ ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model &
|
|
300 |
printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
|
301 |
}
|
302 |
|
303 |
-
size_t bpe = ggml_type_size(ggml_type(ttype));
|
304 |
-
|
305 |
-
if(file_format==FileFormat::NEOX_1)
|
306 |
-
{
|
307 |
-
switch (ttype) {
|
308 |
-
case 0: bpe = ggml_type_size(GGML_TYPE_F32); break;
|
309 |
-
case 1: bpe = ggml_type_size(GGML_TYPE_F16); break;
|
310 |
-
case 2: bpe = ggml_type_size(GGML_TYPE_Q4_0); assert(ne[0] % 64 == 0); break;
|
311 |
-
case 3: bpe = ggml_type_size(GGML_TYPE_Q4_1); assert(ne[0] % 64 == 0); break;
|
312 |
-
default:
|
313 |
-
{
|
314 |
-
fprintf(stderr, "%s: unknown ftype %d in model file\n", __func__, ttype);
|
315 |
-
return ModelLoadResult::FAIL;
|
316 |
-
}
|
317 |
-
};
|
318 |
-
}
|
319 |
|
320 |
if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
|
321 |
fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
|
@@ -340,6 +323,29 @@ ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model &
|
|
340 |
|
341 |
fin.close();
|
342 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
343 |
return ModelLoadResult::SUCCESS;
|
344 |
}
|
345 |
|
@@ -394,7 +400,8 @@ bool gpt_neox_eval(
|
|
394 |
const int n_past,
|
395 |
const std::vector<gpt_vocab::id> & embd_inp,
|
396 |
std::vector<float> & embd_w,
|
397 |
-
size_t & mem_per_token
|
|
|
398 |
const int N = embd_inp.size();
|
399 |
|
400 |
const auto & hparams = model.hparams;
|
@@ -409,8 +416,16 @@ bool gpt_neox_eval(
|
|
409 |
static size_t buf_size = 256u*1024*1024;
|
410 |
static void * buf = malloc(buf_size);
|
411 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
412 |
if (mem_per_token > 0 && (mem_per_token*N*2 + 64u*1024*1024) > buf_size) {
|
413 |
-
const size_t buf_size_new = 360u*1024*1024 + 1.
|
414 |
//printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
|
415 |
|
416 |
// reallocate
|
@@ -420,7 +435,7 @@ bool gpt_neox_eval(
|
|
420 |
buf = realloc(buf, buf_size);
|
421 |
if (buf == nullptr)
|
422 |
{
|
423 |
-
fprintf(stderr, "%s: failed to allocate %zu bytes
|
424 |
return false;
|
425 |
}
|
426 |
}
|
@@ -430,7 +445,7 @@ bool gpt_neox_eval(
|
|
430 |
params.mem_size = buf_size;
|
431 |
params.mem_buffer = buf;
|
432 |
params.no_alloc = false;
|
433 |
-
|
434 |
|
435 |
struct ggml_context * ctx0 = ggml_init(params);
|
436 |
struct ggml_cgraph gf = {};
|
@@ -445,6 +460,10 @@ bool gpt_neox_eval(
|
|
445 |
for (int il = 0; il < n_layer; ++il) {
|
446 |
struct ggml_tensor * cur;
|
447 |
|
|
|
|
|
|
|
|
|
448 |
// self-attention
|
449 |
{
|
450 |
{
|
@@ -548,6 +567,10 @@ bool gpt_neox_eval(
|
|
548 |
}
|
549 |
}
|
550 |
|
|
|
|
|
|
|
|
|
551 |
if (hparams.par_res == 0) {
|
552 |
struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpL);
|
553 |
|
@@ -570,6 +593,10 @@ bool gpt_neox_eval(
|
|
570 |
}
|
571 |
}
|
572 |
|
|
|
|
|
|
|
|
|
573 |
// norm
|
574 |
{
|
575 |
inpL = ggml_norm(ctx0, inpL);
|
@@ -582,6 +609,10 @@ bool gpt_neox_eval(
|
|
582 |
ggml_repeat(ctx0, model.ln_f_b, inpL));
|
583 |
}
|
584 |
|
|
|
|
|
|
|
|
|
585 |
// lm_head
|
586 |
{
|
587 |
inpL = ggml_mul_mat(ctx0, model.lmh_g, inpL);
|
|
|
12 |
#include <string>
|
13 |
#include <vector>
|
14 |
#include <iostream>
|
15 |
+
#include <algorithm>
|
16 |
|
17 |
+
#if defined(GGML_USE_CLBLAST)
|
18 |
+
#include "ggml-opencl.h"
|
19 |
+
#endif
|
20 |
|
21 |
// load the model's weights from a file
|
22 |
+
ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt_vocab & vocab, FileFormat file_format, int gpulayers) {
|
23 |
printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
|
24 |
|
25 |
auto fin = std::ifstream(fname, std::ios::binary);
|
|
|
38 |
}
|
39 |
}
|
40 |
|
41 |
+
int32_t origmaxctx = model.hparams.n_ctx;
|
42 |
+
|
43 |
// load hparams
|
44 |
{
|
45 |
auto & hparams = model.hparams;
|
46 |
+
|
47 |
fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
|
48 |
fin.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
|
49 |
fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd));
|
50 |
fin.read((char *) &hparams.n_head, sizeof(hparams.n_head));
|
51 |
fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
|
52 |
fin.read((char *) &hparams.n_rot, sizeof(hparams.n_rot));
|
53 |
+
fin.read((char *) &hparams.par_res, sizeof(hparams.par_res));
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
54 |
fin.read((char *) &hparams.ftype, sizeof(hparams.ftype));
|
55 |
|
56 |
const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
|
57 |
|
58 |
printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
|
59 |
+
printf("%s: n_ctx = %d (%d)\n", __func__, hparams.n_ctx,origmaxctx);
|
60 |
printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
|
61 |
printf("%s: n_head = %d\n", __func__, hparams.n_head);
|
62 |
printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
|
|
|
105 |
{
|
106 |
const auto & hparams = model.hparams;
|
107 |
|
108 |
+
const size_t n_embd = hparams.n_embd;
|
109 |
+
const size_t n_layer = hparams.n_layer;
|
110 |
+
const size_t n_ctx = hparams.n_ctx;
|
111 |
+
const size_t n_vocab = hparams.n_vocab;
|
112 |
|
113 |
ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g
|
114 |
ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b
|
|
|
136 |
ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w
|
137 |
ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b
|
138 |
|
139 |
+
ctx_size += std::max((size_t)origmaxctx,n_ctx)*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F16); // memory_k
|
140 |
+
ctx_size += std::max((size_t)origmaxctx,n_ctx)*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F16); // memory_v
|
141 |
|
142 |
+
ctx_size += (6 + 16*n_layer)*1024; // object overhead
|
143 |
|
144 |
printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
|
145 |
}
|
|
|
150 |
params.mem_size = ctx_size;
|
151 |
params.mem_buffer = NULL;
|
152 |
params.no_alloc = false;
|
153 |
+
|
154 |
model.ctx = ggml_init(params);
|
155 |
if (!model.ctx) {
|
156 |
fprintf(stderr, "%s: ggml_init() failed\n", __func__);
|
|
|
235 |
const int n_layer = hparams.n_layer;
|
236 |
const int n_ctx = hparams.n_ctx;
|
237 |
|
238 |
+
const int64_t n_mem = n_layer*std::max(origmaxctx,n_ctx);
|
239 |
const int64_t n_elements = n_embd*n_mem;
|
240 |
|
241 |
model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements);
|
|
|
298 |
printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
|
299 |
}
|
300 |
|
301 |
+
const size_t bpe = ggml_type_size(ggml_type(ttype));
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
302 |
|
303 |
if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
|
304 |
fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
|
|
|
323 |
|
324 |
fin.close();
|
325 |
|
326 |
+
//gpu offload
|
327 |
+
#if defined(GGML_USE_CLBLAST)
|
328 |
+
if(gpulayers>0)
|
329 |
+
{
|
330 |
+
const auto & hparams = model.hparams;
|
331 |
+
size_t vram_total = 0;
|
332 |
+
const int n_gpu = std::min(gpulayers, int(hparams.n_layer));
|
333 |
+
fprintf(stderr, "%s: [opencl] offloading %d layers to GPU\n", __func__, n_gpu);
|
334 |
+
for (int i = 0; i < n_gpu; ++i) {
|
335 |
+
const auto & layer = model.layers[i];
|
336 |
+
layer.c_attn_attn_w->backend = GGML_BACKEND_GPU;
|
337 |
+
layer.c_attn_proj_w->backend = GGML_BACKEND_GPU;
|
338 |
+
layer.c_mlp_fc_w->backend = GGML_BACKEND_GPU;
|
339 |
+
layer.c_mlp_proj_w->backend = GGML_BACKEND_GPU;
|
340 |
+
ggml_cl_transform_tensor(layer.c_attn_attn_w->data,layer.c_attn_attn_w); vram_total += ggml_nbytes(layer.c_attn_attn_w);
|
341 |
+
ggml_cl_transform_tensor(layer.c_attn_proj_w->data,layer.c_attn_proj_w); vram_total += ggml_nbytes(layer.c_attn_proj_w);
|
342 |
+
ggml_cl_transform_tensor(layer.c_mlp_fc_w->data,layer.c_mlp_fc_w); vram_total += ggml_nbytes(layer.c_mlp_fc_w);
|
343 |
+
ggml_cl_transform_tensor(layer.c_mlp_proj_w->data,layer.c_mlp_proj_w); vram_total += ggml_nbytes(layer.c_mlp_proj_w);
|
344 |
+
}
|
345 |
+
fprintf(stderr, "%s: [opencl] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
|
346 |
+
}
|
347 |
+
#endif
|
348 |
+
|
349 |
return ModelLoadResult::SUCCESS;
|
350 |
}
|
351 |
|
|
|
400 |
const int n_past,
|
401 |
const std::vector<gpt_vocab::id> & embd_inp,
|
402 |
std::vector<float> & embd_w,
|
403 |
+
size_t & mem_per_token,
|
404 |
+
bool use_scratch) {
|
405 |
const int N = embd_inp.size();
|
406 |
|
407 |
const auto & hparams = model.hparams;
|
|
|
416 |
static size_t buf_size = 256u*1024*1024;
|
417 |
static void * buf = malloc(buf_size);
|
418 |
|
419 |
+
// use 2 scratch buffers
|
420 |
+
// TODO: very hacky solution - reimplement in a more elegant way
|
421 |
+
static size_t scr0_size = (n_embd>2400?512u:256u)*1024*1024;
|
422 |
+
static size_t scr1_size = (n_embd>2400?512u:256u)*1024*1024;
|
423 |
+
|
424 |
+
static void * scr0 = malloc(scr0_size);
|
425 |
+
static void * scr1 = malloc(scr1_size);
|
426 |
+
|
427 |
if (mem_per_token > 0 && (mem_per_token*N*2 + 64u*1024*1024) > buf_size) {
|
428 |
+
const size_t buf_size_new = 360u*1024*1024 + 1.2*(mem_per_token*N); // fixed 360 MB base plus 20% headroom to account for ggml object overhead
|
429 |
//printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
|
430 |
|
431 |
// reallocate
|
|
|
435 |
buf = realloc(buf, buf_size);
|
436 |
if (buf == nullptr)
|
437 |
{
|
438 |
+
fprintf(stderr, "%s: failed to allocate %zu bytes. Try reducing batch size.\n", __func__, buf_size);
|
439 |
return false;
|
440 |
}
|
441 |
}
|
|
|
445 |
params.mem_size = buf_size;
|
446 |
params.mem_buffer = buf;
|
447 |
params.no_alloc = false;
|
448 |
+
|
449 |
|
450 |
struct ggml_context * ctx0 = ggml_init(params);
|
451 |
struct ggml_cgraph gf = {};
|
|
|
460 |
for (int il = 0; il < n_layer; ++il) {
|
461 |
struct ggml_tensor * cur;
|
462 |
|
463 |
+
if(use_scratch){
|
464 |
+
ggml_set_scratch(ctx0, { 0, scr0_size, scr0, });
|
465 |
+
}
|
466 |
+
|
467 |
// self-attention
|
468 |
{
|
469 |
{
|
|
|
567 |
}
|
568 |
}
|
569 |
|
570 |
+
if(use_scratch){
|
571 |
+
ggml_set_scratch(ctx0, { 0, scr1_size, scr1, });
|
572 |
+
}
|
573 |
+
|
574 |
if (hparams.par_res == 0) {
|
575 |
struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpL);
|
576 |
|
|
|
593 |
}
|
594 |
}
|
595 |
|
596 |
+
if(use_scratch){
|
597 |
+
ggml_set_scratch(ctx0, { 0, scr0_size, scr0, });
|
598 |
+
}
|
599 |
+
|
600 |
// norm
|
601 |
{
|
602 |
inpL = ggml_norm(ctx0, inpL);
|
|
|
609 |
ggml_repeat(ctx0, model.ln_f_b, inpL));
|
610 |
}
|
611 |
|
612 |
+
if(use_scratch){
|
613 |
+
ggml_set_scratch(ctx0, { 0, 0, nullptr, });
|
614 |
+
}
|
615 |
+
|
616 |
// lm_head
|
617 |
{
|
618 |
inpL = ggml_mul_mat(ctx0, model.lmh_g, inpL);
|
otherarch/otherarch.h
CHANGED
@@ -43,7 +43,6 @@ struct gptj_layer {
|
|
43 |
struct ggml_tensor * c_mlp_fc_b;
|
44 |
|
45 |
struct ggml_tensor * c_mlp_proj_w;
|
46 |
-
struct ggml_tensor * c_mlp_proj_w_trans; //for backwards compatibility
|
47 |
struct ggml_tensor * c_mlp_proj_b;
|
48 |
};
|
49 |
struct gptj_layer_v2 {
|
|
|
43 |
struct ggml_tensor * c_mlp_fc_b;
|
44 |
|
45 |
struct ggml_tensor * c_mlp_proj_w;
|
|
|
46 |
struct ggml_tensor * c_mlp_proj_b;
|
47 |
};
|
48 |
struct gptj_layer_v2 {
|
otherarch/utils.cpp
CHANGED
@@ -122,8 +122,27 @@ std::string convert_to_utf8(const std::wstring & input) {
|
|
122 |
|
123 |
|
124 |
std::wstring convert_to_wstring(const std::string & input) {
|
125 |
-
|
126 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
127 |
}
|
128 |
|
129 |
std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text) {
|
@@ -132,31 +151,34 @@ std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::stri
|
|
132 |
// first split the text into words
|
133 |
{
|
134 |
std::string str = text;
|
135 |
-
std::string pat = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)";
|
136 |
|
137 |
// Generate the subpattern from the special_tokens vector if it's not empty
|
138 |
if (!vocab.special_tokens.empty()) {
|
|
|
139 |
std::string special_tokens_subpattern;
|
140 |
for (const auto & token : vocab.special_tokens) {
|
141 |
if (!special_tokens_subpattern.empty()) {
|
142 |
special_tokens_subpattern += "|";
|
143 |
}
|
144 |
-
special_tokens_subpattern += token;
|
145 |
}
|
146 |
|
147 |
-
|
148 |
-
|
149 |
-
|
150 |
-
|
151 |
-
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
-
|
156 |
-
|
|
|
157 |
}
|
158 |
-
|
159 |
}
|
|
|
|
|
160 |
}
|
161 |
|
162 |
// find the longest token that forms each word in words:
|
@@ -185,15 +207,15 @@ std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::stri
|
|
185 |
|
186 |
bool should_transpose_layer(std::string name)
|
187 |
{
|
188 |
-
|
189 |
-
if(name.find(".mlp.fc_in.weight")!=std::string::npos ||
|
190 |
-
name.find(".attn.out_proj.weight")!=std::string::npos ||
|
191 |
-
name.find(".attn.q_proj.weight")!=std::string::npos ||
|
192 |
-
name.find(".attn.k_proj.weight")!=std::string::npos ||
|
193 |
name.find(".attn.v_proj.weight")!=std::string::npos ||
|
194 |
-
name.find("/attn/c_attn/w")!=std::string::npos ||
|
195 |
-
name.find("/attn/c_proj/w")!=std::string::npos ||
|
196 |
-
name.find("/mlp/c_fc/w")!=std::string::npos ||
|
197 |
name.find("/mlp/c_proj/w")!=std::string::npos)
|
198 |
{
|
199 |
return true;
|
|
|
122 |
|
123 |
|
124 |
// Decode a UTF-8 byte string into a wide string.
//
// Returns the decoded text, or an empty wide string if the conversion throws
// any exception (std::range_error included). Callers therefore cannot tell a
// failed conversion apart from an empty input.
std::wstring convert_to_wstring(const std::string & input) {
    std::wstring decoded;
    try {
        std::wstring_convert<std::codecvt_utf8<wchar_t>> utf8_codec;
        decoded = utf8_codec.from_bytes(input);
    } catch (...) {
        // Every failure mode is reported the same way: as an empty result.
        decoded.clear();
    }
    return decoded;
}
|
134 |
+
|
135 |
+
// Split `str` into GPT-2 style pre-tokenization "words" and append them, in
// order of appearance, to `words`. Existing entries in `words` are kept.
//
// A word is one match of the pattern below: an English contraction suffix
// ('s, 't, 're, 've, 'm, 'll, 'd), a run of letters, a run of digits or a run
// of other non-space characters (each optionally preceded by one space), or a
// run of whitespace.
void gpt_split_words(std::string str, std::vector<std::string>& words) {
    // The pattern never changes, so compile it once instead of on every call;
    // this function is invoked repeatedly while splitting around special tokens.
    static const std::regex re(R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)");

    // Walk the matches in place rather than re-copying the remaining suffix
    // after each match. The pattern has no capture groups, so each match
    // contributes exactly one word (the whole match).
    const std::sregex_iterator last;
    for (std::sregex_iterator it(str.begin(), str.end(), re); it != last; ++it) {
        words.push_back(it->str());
    }
}
|
147 |
|
148 |
std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text) {
|
|
|
151 |
// first split the text into words
|
152 |
{
|
153 |
std::string str = text;
|
|
|
154 |
|
155 |
// Generate the subpattern from the special_tokens vector if it's not empty
|
156 |
if (!vocab.special_tokens.empty()) {
|
157 |
+
const std::regex escape(R"([\[\\\^\$\.\|\?\*\+\(\)\{\}])");
|
158 |
std::string special_tokens_subpattern;
|
159 |
for (const auto & token : vocab.special_tokens) {
|
160 |
if (!special_tokens_subpattern.empty()) {
|
161 |
special_tokens_subpattern += "|";
|
162 |
}
|
163 |
+
special_tokens_subpattern += std::regex_replace(token, escape, R"(\$&)");
|
164 |
}
|
165 |
|
166 |
+
std::regex re(special_tokens_subpattern);
|
167 |
+
std::smatch m;
|
168 |
+
// Split the text by special tokens.
|
169 |
+
while (std::regex_search(str, m, re)) {
|
170 |
+
// Split the substrings in-between special tokens into words.
|
171 |
+
gpt_split_words(m.prefix(), words);
|
172 |
+
// Add matched special tokens as words.
|
173 |
+
for (auto x : m) {
|
174 |
+
words.push_back(x);
|
175 |
+
}
|
176 |
+
str = m.suffix();
|
177 |
}
|
178 |
+
// Remaining text without special tokens will be handled below.
|
179 |
}
|
180 |
+
|
181 |
+
gpt_split_words(str, words);
|
182 |
}
|
183 |
|
184 |
// find the longest token that forms each word in words:
|
|
|
207 |
|
208 |
bool should_transpose_layer(std::string name)
|
209 |
{
|
210 |
+
|
211 |
+
if(name.find(".mlp.fc_in.weight")!=std::string::npos ||
|
212 |
+
name.find(".attn.out_proj.weight")!=std::string::npos ||
|
213 |
+
name.find(".attn.q_proj.weight")!=std::string::npos ||
|
214 |
+
name.find(".attn.k_proj.weight")!=std::string::npos ||
|
215 |
name.find(".attn.v_proj.weight")!=std::string::npos ||
|
216 |
+
name.find("/attn/c_attn/w")!=std::string::npos ||
|
217 |
+
name.find("/attn/c_proj/w")!=std::string::npos ||
|
218 |
+
name.find("/mlp/c_fc/w")!=std::string::npos ||
|
219 |
name.find("/mlp/c_proj/w")!=std::string::npos)
|
220 |
{
|
221 |
return true;
|
otherarch/utils.h
CHANGED
@@ -34,6 +34,12 @@ void utreplace(std::string & str, const std::string & needle, const std::string
|
|
34 |
// poor-man's JSON parsing
|
35 |
std::map<std::string, int32_t> json_parse(const std::string & fname);
|
36 |
|
|
|
|
|
|
|
|
|
|
|
|
|
37 |
// split text into tokens
|
38 |
//
|
39 |
// ref: https://github.com/openai/gpt-2/blob/a74da5d99abaaba920de8131d64da2862a8f213b/src/encoder.py#L53
|
|
|
34 |
// poor-man's JSON parsing
|
35 |
std::map<std::string, int32_t> json_parse(const std::string & fname);
|
36 |
|
37 |
+
// Going by the name and signature, converts a wide string to a UTF-8 encoded
// std::string. NOTE(review): the definition is not visible from this header;
// confirm the encoding and error behaviour against utils.cpp.
std::string convert_to_utf8(const std::wstring & input);
|
38 |
+
|
39 |
+
// Decodes the UTF-8 bytes in `input` into a std::wstring.
// Returns an empty wide string if the conversion throws (e.g. on malformed
// input) rather than propagating the exception.
std::wstring convert_to_wstring(const std::string & input);
|
40 |
+
|
41 |
+
// Splits `str` into GPT-2 style pre-tokenization pieces (contraction suffixes,
// letter runs, digit runs, other non-space runs, whitespace runs) and appends
// them, in order, to `words`. Existing entries in `words` are kept.
void gpt_split_words(std::string str, std::vector<std::string>& words);
|
42 |
+
|
43 |
// split text into tokens
|
44 |
//
|
45 |
// ref: https://github.com/openai/gpt-2/blob/a74da5d99abaaba920de8131d64da2862a8f213b/src/encoder.py#L53
|
spm-headers/ggml.h
CHANGED
@@ -303,6 +303,7 @@ extern "C" {
|
|
303 |
GGML_OP_STEP,
|
304 |
GGML_OP_RELU,
|
305 |
GGML_OP_GELU,
|
|
|
306 |
GGML_OP_SILU,
|
307 |
GGML_OP_SILU_BACK,
|
308 |
GGML_OP_NORM, // normalize
|
@@ -331,12 +332,15 @@ extern "C" {
|
|
331 |
GGML_OP_ROPE_BACK,
|
332 |
GGML_OP_ALIBI,
|
333 |
GGML_OP_CLAMP,
|
334 |
-
|
335 |
-
|
|
|
336 |
|
337 |
GGML_OP_FLASH_ATTN,
|
338 |
GGML_OP_FLASH_FF,
|
339 |
GGML_OP_FLASH_ATTN_BACK,
|
|
|
|
|
340 |
|
341 |
GGML_OP_MAP_UNARY,
|
342 |
GGML_OP_MAP_BINARY,
|
@@ -500,8 +504,9 @@ extern "C" {
|
|
500 |
GGML_API size_t ggml_set_scratch (struct ggml_context * ctx, struct ggml_scratch scratch);
|
501 |
GGML_API void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc);
|
502 |
|
503 |
-
GGML_API void * ggml_get_mem_buffer(struct ggml_context * ctx);
|
504 |
-
GGML_API size_t ggml_get_mem_size
|
|
|
505 |
|
506 |
GGML_API struct ggml_tensor * ggml_new_tensor(
|
507 |
struct ggml_context * ctx,
|
@@ -556,8 +561,9 @@ extern "C" {
|
|
556 |
GGML_API void * ggml_get_data (const struct ggml_tensor * tensor);
|
557 |
GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
|
558 |
|
559 |
-
GGML_API const char *
|
560 |
-
GGML_API
|
|
|
561 |
|
562 |
//
|
563 |
// operations on tensors with backpropagation
|
@@ -610,24 +616,47 @@ extern "C" {
|
|
610 |
struct ggml_tensor * a,
|
611 |
struct ggml_tensor * b);
|
612 |
|
|
|
|
|
|
|
|
|
|
|
613 |
GGML_API struct ggml_tensor * ggml_mul(
|
614 |
struct ggml_context * ctx,
|
615 |
struct ggml_tensor * a,
|
616 |
struct ggml_tensor * b);
|
617 |
|
|
|
|
|
|
|
|
|
|
|
618 |
GGML_API struct ggml_tensor * ggml_div(
|
619 |
struct ggml_context * ctx,
|
620 |
struct ggml_tensor * a,
|
621 |
struct ggml_tensor * b);
|
622 |
|
|
|
|
|
|
|
|
|
|
|
623 |
GGML_API struct ggml_tensor * ggml_sqr(
|
624 |
struct ggml_context * ctx,
|
625 |
struct ggml_tensor * a);
|
626 |
|
|
|
|
|
|
|
|
|
627 |
GGML_API struct ggml_tensor * ggml_sqrt(
|
628 |
struct ggml_context * ctx,
|
629 |
struct ggml_tensor * a);
|
630 |
|
|
|
|
|
|
|
|
|
631 |
GGML_API struct ggml_tensor * ggml_log(
|
632 |
struct ggml_context * ctx,
|
633 |
struct ggml_tensor * a);
|
@@ -667,31 +696,67 @@ extern "C" {
|
|
667 |
struct ggml_context * ctx,
|
668 |
struct ggml_tensor * a);
|
669 |
|
|
|
|
|
|
|
|
|
670 |
GGML_API struct ggml_tensor * ggml_sgn(
|
671 |
struct ggml_context * ctx,
|
672 |
struct ggml_tensor * a);
|
673 |
|
|
|
|
|
|
|
|
|
674 |
GGML_API struct ggml_tensor * ggml_neg(
|
675 |
struct ggml_context * ctx,
|
676 |
struct ggml_tensor * a);
|
677 |
|
|
|
|
|
|
|
|
|
678 |
GGML_API struct ggml_tensor * ggml_step(
|
679 |
struct ggml_context * ctx,
|
680 |
struct ggml_tensor * a);
|
681 |
|
|
|
|
|
|
|
|
|
682 |
GGML_API struct ggml_tensor * ggml_relu(
|
683 |
struct ggml_context * ctx,
|
684 |
struct ggml_tensor * a);
|
685 |
|
|
|
|
|
|
|
|
|
686 |
// TODO: double-check this computation is correct
|
687 |
GGML_API struct ggml_tensor * ggml_gelu(
|
688 |
struct ggml_context * ctx,
|
689 |
struct ggml_tensor * a);
|
690 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
691 |
GGML_API struct ggml_tensor * ggml_silu(
|
692 |
struct ggml_context * ctx,
|
693 |
struct ggml_tensor * a);
|
694 |
|
|
|
|
|
|
|
|
|
695 |
// a - x
|
696 |
// b - dy
|
697 |
GGML_API struct ggml_tensor * ggml_silu_back(
|
@@ -705,10 +770,18 @@ extern "C" {
|
|
705 |
struct ggml_context * ctx,
|
706 |
struct ggml_tensor * a);
|
707 |
|
|
|
|
|
|
|
|
|
708 |
GGML_API struct ggml_tensor * ggml_rms_norm(
|
709 |
struct ggml_context * ctx,
|
710 |
struct ggml_tensor * a);
|
711 |
|
|
|
|
|
|
|
|
|
712 |
// a - x
|
713 |
// b - dy
|
714 |
GGML_API struct ggml_tensor * ggml_rms_norm_back(
|
@@ -998,16 +1071,55 @@ extern "C" {
|
|
998 |
float min,
|
999 |
float max);
|
1000 |
|
1001 |
-
//
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1002 |
// TODO: we don't support extra parameters for now
|
1003 |
// that's why we are hard-coding the stride, padding, and dilation
|
1004 |
// not great ..
|
1005 |
-
|
|
|
|
|
|
|
|
|
|
|
1006 |
struct ggml_context * ctx,
|
1007 |
struct ggml_tensor * a,
|
1008 |
struct ggml_tensor * b);
|
1009 |
|
1010 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1011 |
struct ggml_context * ctx,
|
1012 |
struct ggml_tensor * a,
|
1013 |
struct ggml_tensor * b);
|
@@ -1035,6 +1147,26 @@ extern "C" {
|
|
1035 |
struct ggml_tensor * c0,
|
1036 |
struct ggml_tensor * c1);
|
1037 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1038 |
// Mapping operations
|
1039 |
typedef void (*ggml_unary_op_f32_t)(const int, float *, const float *);
|
1040 |
typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
|
|
|
303 |
GGML_OP_STEP,
|
304 |
GGML_OP_RELU,
|
305 |
GGML_OP_GELU,
|
306 |
+
GGML_OP_GELU_QUICK,
|
307 |
GGML_OP_SILU,
|
308 |
GGML_OP_SILU_BACK,
|
309 |
GGML_OP_NORM, // normalize
|
|
|
332 |
GGML_OP_ROPE_BACK,
|
333 |
GGML_OP_ALIBI,
|
334 |
GGML_OP_CLAMP,
|
335 |
+
GGML_OP_CONV_1D_S1_PH,
|
336 |
+
GGML_OP_CONV_1D_S2_PH,
|
337 |
+
GGML_OP_CONV_2D_SK_P0,
|
338 |
|
339 |
GGML_OP_FLASH_ATTN,
|
340 |
GGML_OP_FLASH_FF,
|
341 |
GGML_OP_FLASH_ATTN_BACK,
|
342 |
+
GGML_OP_WIN_PART,
|
343 |
+
GGML_OP_WIN_UNPART,
|
344 |
|
345 |
GGML_OP_MAP_UNARY,
|
346 |
GGML_OP_MAP_BINARY,
|
|
|
504 |
GGML_API size_t ggml_set_scratch (struct ggml_context * ctx, struct ggml_scratch scratch);
|
505 |
GGML_API void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc);
|
506 |
|
507 |
+
GGML_API void * ggml_get_mem_buffer (const struct ggml_context * ctx);
|
508 |
+
GGML_API size_t ggml_get_mem_size (const struct ggml_context * ctx);
|
509 |
+
GGML_API size_t ggml_get_max_tensor_size(const struct ggml_context * ctx);
|
510 |
|
511 |
GGML_API struct ggml_tensor * ggml_new_tensor(
|
512 |
struct ggml_context * ctx,
|
|
|
561 |
GGML_API void * ggml_get_data (const struct ggml_tensor * tensor);
|
562 |
GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
|
563 |
|
564 |
+
GGML_API const char * ggml_get_name(const struct ggml_tensor * tensor);
|
565 |
+
GGML_API struct ggml_tensor * ggml_set_name(struct ggml_tensor * tensor, const char * name);
|
566 |
+
GGML_API struct ggml_tensor * ggml_format_name(struct ggml_tensor * tensor, const char * fmt, ...);
|
567 |
|
568 |
//
|
569 |
// operations on tensors with backpropagation
|
|
|
616 |
struct ggml_tensor * a,
|
617 |
struct ggml_tensor * b);
|
618 |
|
619 |
+
GGML_API struct ggml_tensor * ggml_sub_inplace(
|
620 |
+
struct ggml_context * ctx,
|
621 |
+
struct ggml_tensor * a,
|
622 |
+
struct ggml_tensor * b);
|
623 |
+
|
624 |
GGML_API struct ggml_tensor * ggml_mul(
|
625 |
struct ggml_context * ctx,
|
626 |
struct ggml_tensor * a,
|
627 |
struct ggml_tensor * b);
|
628 |
|
629 |
+
GGML_API struct ggml_tensor * ggml_mul_inplace(
|
630 |
+
struct ggml_context * ctx,
|
631 |
+
struct ggml_tensor * a,
|
632 |
+
struct ggml_tensor * b);
|
633 |
+
|
634 |
GGML_API struct ggml_tensor * ggml_div(
|
635 |
struct ggml_context * ctx,
|
636 |
struct ggml_tensor * a,
|
637 |
struct ggml_tensor * b);
|
638 |
|
639 |
+
GGML_API struct ggml_tensor * ggml_div_inplace(
|
640 |
+
struct ggml_context * ctx,
|
641 |
+
struct ggml_tensor * a,
|
642 |
+
struct ggml_tensor * b);
|
643 |
+
|
644 |
GGML_API struct ggml_tensor * ggml_sqr(
|
645 |
struct ggml_context * ctx,
|
646 |
struct ggml_tensor * a);
|
647 |
|
648 |
+
GGML_API struct ggml_tensor * ggml_sqr_inplace(
|
649 |
+
struct ggml_context * ctx,
|
650 |
+
struct ggml_tensor * a);
|
651 |
+
|
652 |
GGML_API struct ggml_tensor * ggml_sqrt(
|
653 |
struct ggml_context * ctx,
|
654 |
struct ggml_tensor * a);
|
655 |
|
656 |
+
GGML_API struct ggml_tensor * ggml_sqrt_inplace(
|
657 |
+
struct ggml_context * ctx,
|
658 |
+
struct ggml_tensor * a);
|
659 |
+
|
660 |
GGML_API struct ggml_tensor * ggml_log(
|
661 |
struct ggml_context * ctx,
|
662 |
struct ggml_tensor * a);
|
|
|
696 |
struct ggml_context * ctx,
|
697 |
struct ggml_tensor * a);
|
698 |
|
699 |
+
GGML_API struct ggml_tensor * ggml_abs_inplace(
|
700 |
+
struct ggml_context * ctx,
|
701 |
+
struct ggml_tensor * a);
|
702 |
+
|
703 |
GGML_API struct ggml_tensor * ggml_sgn(
|
704 |
struct ggml_context * ctx,
|
705 |
struct ggml_tensor * a);
|
706 |
|
707 |
+
GGML_API struct ggml_tensor * ggml_sgn_inplace(
|
708 |
+
struct ggml_context * ctx,
|
709 |
+
struct ggml_tensor * a);
|
710 |
+
|
711 |
GGML_API struct ggml_tensor * ggml_neg(
|
712 |
struct ggml_context * ctx,
|
713 |
struct ggml_tensor * a);
|
714 |
|
715 |
+
GGML_API struct ggml_tensor * ggml_neg_inplace(
|
716 |
+
struct ggml_context * ctx,
|
717 |
+
struct ggml_tensor * a);
|
718 |
+
|
719 |
GGML_API struct ggml_tensor * ggml_step(
|
720 |
struct ggml_context * ctx,
|
721 |
struct ggml_tensor * a);
|
722 |
|
723 |
+
GGML_API struct ggml_tensor * ggml_step_inplace(
|
724 |
+
struct ggml_context * ctx,
|
725 |
+
struct ggml_tensor * a);
|
726 |
+
|
727 |
GGML_API struct ggml_tensor * ggml_relu(
|
728 |
struct ggml_context * ctx,
|
729 |
struct ggml_tensor * a);
|
730 |
|
731 |
+
GGML_API struct ggml_tensor * ggml_relu_inplace(
|
732 |
+
struct ggml_context * ctx,
|
733 |
+
struct ggml_tensor * a);
|
734 |
+
|
735 |
// TODO: double-check this computation is correct
|
736 |
GGML_API struct ggml_tensor * ggml_gelu(
|
737 |
struct ggml_context * ctx,
|
738 |
struct ggml_tensor * a);
|
739 |
|
740 |
+
GGML_API struct ggml_tensor * ggml_gelu_inplace(
|
741 |
+
struct ggml_context * ctx,
|
742 |
+
struct ggml_tensor * a);
|
743 |
+
|
744 |
+
GGML_API struct ggml_tensor * ggml_gelu_quick(
|
745 |
+
struct ggml_context * ctx,
|
746 |
+
struct ggml_tensor * a);
|
747 |
+
|
748 |
+
GGML_API struct ggml_tensor * ggml_gelu_quick_inplace(
|
749 |
+
struct ggml_context * ctx,
|
750 |
+
struct ggml_tensor * a);
|
751 |
+
|
752 |
GGML_API struct ggml_tensor * ggml_silu(
|
753 |
struct ggml_context * ctx,
|
754 |
struct ggml_tensor * a);
|
755 |
|
756 |
+
GGML_API struct ggml_tensor * ggml_silu_inplace(
|
757 |
+
struct ggml_context * ctx,
|
758 |
+
struct ggml_tensor * a);
|
759 |
+
|
760 |
// a - x
|
761 |
// b - dy
|
762 |
GGML_API struct ggml_tensor * ggml_silu_back(
|
|
|
770 |
struct ggml_context * ctx,
|
771 |
struct ggml_tensor * a);
|
772 |
|
773 |
+
GGML_API struct ggml_tensor * ggml_norm_inplace(
|
774 |
+
struct ggml_context * ctx,
|
775 |
+
struct ggml_tensor * a);
|
776 |
+
|
777 |
GGML_API struct ggml_tensor * ggml_rms_norm(
|
778 |
struct ggml_context * ctx,
|
779 |
struct ggml_tensor * a);
|
780 |
|
781 |
+
GGML_API struct ggml_tensor * ggml_rms_norm_inplace(
|
782 |
+
struct ggml_context * ctx,
|
783 |
+
struct ggml_tensor * a);
|
784 |
+
|
785 |
// a - x
|
786 |
// b - dy
|
787 |
GGML_API struct ggml_tensor * ggml_rms_norm_back(
|
|
|
1071 |
float min,
|
1072 |
float max);
|
1073 |
|
1074 |
+
// TODO: implement general-purpose convolutions
|
1075 |
+
// GGML_API struct ggml_tensor * ggml_conv_1d(
|
1076 |
+
// struct ggml_context * ctx,
|
1077 |
+
// struct ggml_tensor * a,
|
1078 |
+
// struct ggml_tensor * b,
|
1079 |
+
// int s0,
|
1080 |
+
// int p0,
|
1081 |
+
// int d0);
|
1082 |
+
//
|
1083 |
+
// GGML_API struct ggml_tensor * ggml_conv_2d(
|
1084 |
+
// struct ggml_context * ctx,
|
1085 |
+
// struct ggml_tensor * a,
|
1086 |
+
// struct ggml_tensor * b,
|
1087 |
+
// int s0,
|
1088 |
+
// int s1,
|
1089 |
+
// int p0,
|
1090 |
+
// int p1,
|
1091 |
+
// int d0,
|
1092 |
+
// int d1);
|
1093 |
+
|
1094 |
+
// padding = half
|
1095 |
// TODO: we don't support extra parameters for now
|
1096 |
// that's why we are hard-coding the stride, padding, and dilation
|
1097 |
// not great ..
|
1098 |
+
// example:
|
1099 |
+
// a: 3 80 768 1
|
1100 |
+
// b: 3000 80 1 1
|
1101 |
+
// res: 3000 768 1 1
|
1102 |
+
// used in whisper
|
1103 |
+
GGML_API struct ggml_tensor * ggml_conv_1d_s1_ph(
|
1104 |
struct ggml_context * ctx,
|
1105 |
struct ggml_tensor * a,
|
1106 |
struct ggml_tensor * b);
|
1107 |
|
1108 |
+
// used in whisper
|
1109 |
+
GGML_API struct ggml_tensor * ggml_conv_1d_s2_ph(
|
1110 |
+
struct ggml_context * ctx,
|
1111 |
+
struct ggml_tensor * a,
|
1112 |
+
struct ggml_tensor * b);
|
1113 |
+
|
1114 |
+
// kernel size is a->ne[0] x a->ne[1]
|
1115 |
+
// stride is equal to kernel size
|
1116 |
+
// padding is zero
|
1117 |
+
// example:
|
1118 |
+
// a: 16 16 3 768
|
1119 |
+
// b: 1024 1024 3 1
|
1120 |
+
// res: 64 64 768 1
|
1121 |
+
// used in sam
|
1122 |
+
GGML_API struct ggml_tensor * ggml_conv_2d_sk_p0(
|
1123 |
struct ggml_context * ctx,
|
1124 |
struct ggml_tensor * a,
|
1125 |
struct ggml_tensor * b);
|
|
|
1147 |
struct ggml_tensor * c0,
|
1148 |
struct ggml_tensor * c1);
|
1149 |
|
1150 |
+
// partition into non-overlapping windows with padding if needed
|
1151 |
+
// example:
|
1152 |
+
// a: 768 64 64 1
|
1153 |
+
// w: 14
|
1154 |
+
// res: 768 14 14 25
|
1155 |
+
// used in sam
|
1156 |
+
GGML_API struct ggml_tensor * ggml_win_part(
|
1157 |
+
struct ggml_context * ctx,
|
1158 |
+
struct ggml_tensor * a,
|
1159 |
+
int w);
|
1160 |
+
|
1161 |
+
// reverse of ggml_win_part
|
1162 |
+
// used in sam
|
1163 |
+
GGML_API struct ggml_tensor * ggml_win_unpart(
|
1164 |
+
struct ggml_context * ctx,
|
1165 |
+
struct ggml_tensor * a,
|
1166 |
+
int w0,
|
1167 |
+
int h0,
|
1168 |
+
int w);
|
1169 |
+
|
1170 |
// Mapping operations
|
1171 |
typedef void (*ggml_unary_op_f32_t)(const int, float *, const float *);
|
1172 |
typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
|