Illumotion committed on
Commit
b439a8f
·
1 Parent(s): 93fbabe

Upload folder using huggingface_hub

Browse files
.gitignore CHANGED
@@ -31,6 +31,7 @@ out/
31
  /perplexity
32
  /embedding
33
  /train-text-from-scratch
 
34
  /benchmark-matmult
35
  /vdot
36
  /server
 
31
  /perplexity
32
  /embedding
33
  /train-text-from-scratch
34
+ /simple
35
  /benchmark-matmult
36
  /vdot
37
  /server
Dockerfile CHANGED
@@ -3,7 +3,7 @@ WORKDIR /app
3
  COPY . .
4
  RUN apt update \
5
  && apt install build-essential wget libopenblas-dev make -y \
6
- && make \
7
  && wget https://huggingface.co/Yoshiii/pygmalion-7b-ggml/resolve/main/pygmalion-7b-q5_K_M.bin\
8
  && apt remove build-essential wget make -y
9
 
 
3
  COPY . .
4
  RUN apt update \
5
  && apt install build-essential wget libopenblas-dev make -y \
6
+ && make LLAMA_OPENBLAS=1 \
7
  && wget https://huggingface.co/Yoshiii/pygmalion-7b-ggml/resolve/main/pygmalion-7b-q5_K_M.bin\
8
  && apt remove build-essential wget make -y
9
 
Makefile CHANGED
@@ -149,6 +149,14 @@ ifdef LLAMA_CUDA_DMMV_Y
149
  else
150
  NVCCFLAGS += -DGGML_CUDA_DMMV_Y=1
151
  endif # LLAMA_CUDA_DMMV_Y
 
 
 
 
 
 
 
 
152
  ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
153
  $(NVCC) $(NVCCFLAGS) $(CXXFLAGS) $(CUBLAS_CXXFLAGS) -Wno-pedantic -c $< -o $@
154
  ggml_v2-cuda.o: otherarch/ggml_v2-cuda.cu otherarch/ggml_v2-cuda.h
 
149
  else
150
  NVCCFLAGS += -DGGML_CUDA_DMMV_Y=1
151
  endif # LLAMA_CUDA_DMMV_Y
152
+ ifdef LLAMA_CUDA_DMMV_F16
153
+ NVCCFLAGS += -DGGML_CUDA_DMMV_F16
154
+ endif # LLAMA_CUDA_DMMV_F16
155
+ ifdef LLAMA_CUDA_KQUANTS_ITER
156
+ NVCCFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER)
157
+ else
158
+ NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2
159
+ endif
160
  ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
161
  $(NVCC) $(NVCCFLAGS) $(CXXFLAGS) $(CUBLAS_CXXFLAGS) -Wno-pedantic -c $< -o $@
162
  ggml_v2-cuda.o: otherarch/ggml_v2-cuda.cu otherarch/ggml_v2-cuda.h
Remote-Link.cmd ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ curl -L https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-windows-amd64.exe -o cloudflared.exe
2
+ cloudflared.exe tunnel --url localhost:5001
convert.py CHANGED
@@ -130,6 +130,14 @@ TENSORS_LIST = make_tensors_list()
130
  TENSORS_SET = set(TENSORS_LIST)
131
 
132
 
 
 
 
 
 
 
 
 
133
  @dataclass
134
  class Params:
135
  n_vocab: int
@@ -137,21 +145,61 @@ class Params:
137
  n_mult: int
138
  n_head: int
139
  n_layer: int
140
- file_type: GGMLFileType
141
 
142
  @staticmethod
143
- def guessed(model: 'LazyModel', file_type: GGMLFileType) -> 'Params':
144
- n_vocab, n_embd = model["tok_embeddings.weight"].shape
 
 
 
 
 
 
 
 
 
145
 
146
  return Params(
147
  n_vocab=n_vocab,
148
  n_embd=n_embd,
149
  n_mult=256,
150
- n_head=n_embd // 128,
151
- n_layer=next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model),
152
- file_type=file_type,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
153
  )
154
 
 
 
 
 
 
 
 
 
 
 
 
 
 
155
 
156
  class SentencePieceVocab:
157
  def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path]) -> None:
@@ -595,18 +643,17 @@ def permute_lazy(lazy_tensor: LazyTensor, n_head: int) -> LazyTensor:
595
  return LazyTensor(load, lazy_tensor.shape, lazy_tensor.data_type, f'permute({n_head}) ' + lazy_tensor.description)
596
 
597
 
598
- def convert_transformers_to_orig(model: LazyModel) -> LazyModel:
599
  out: LazyModel = {}
600
  out["tok_embeddings.weight"] = model["model.embed_tokens.weight"]
601
  out["norm.weight"] = model["model.norm.weight"]
602
  out["output.weight"] = model["lm_head.weight"]
603
 
604
- n_head = model["model.layers.0.self_attn.q_proj.weight"].shape[1] // 128
605
  for i in itertools.count():
606
  if f"model.layers.{i}.self_attn.q_proj.weight" not in model:
607
  break
608
- out[f"layers.{i}.attention.wq.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], n_head)
609
- out[f"layers.{i}.attention.wk.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], n_head)
610
  out[f"layers.{i}.attention.wv.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"]
611
  out[f"layers.{i}.attention.wo.weight"] = model[f"model.layers.{i}.self_attn.o_proj.weight"]
612
 
@@ -920,7 +967,7 @@ class OutputFile:
920
  def __init__(self, fname_out: Path) -> None:
921
  self.fout = open(fname_out, "wb")
922
 
923
- def write_file_header(self, params: Params) -> None:
924
  self.fout.write(b"ggjt"[::-1]) # magic
925
  values = [
926
  1, # file version
@@ -930,7 +977,7 @@ class OutputFile:
930
  params.n_head,
931
  params.n_layer,
932
  params.n_embd // params.n_head, # rot (obsolete)
933
- params.file_type.value,
934
  ]
935
  self.fout.write(struct.pack("i" * len(values), *values))
936
 
@@ -951,17 +998,17 @@ class OutputFile:
951
  def write_vocab_only(fname_out: Path, vocab: Vocab) -> None:
952
  of = OutputFile(fname_out)
953
  params = Params(n_vocab=vocab.vocab_size, n_embd=0, n_mult=0,
954
- n_head=1, n_layer=0, file_type=GGMLFileType.AllF32)
955
  of = OutputFile(fname_out)
956
- of.write_file_header(params)
957
  of.write_vocab(vocab)
958
  of.fout.close()
959
 
960
  @staticmethod
961
- def write_all(fname_out: Path, params: Params, model: LazyModel, vocab: Vocab) -> None:
962
  check_vocab_size(params, vocab)
963
  of = OutputFile(fname_out)
964
- of.write_file_header(params)
965
  print("Writing vocab...")
966
  of.write_vocab(vocab)
967
 
@@ -997,11 +1044,11 @@ def pick_output_type(model: LazyModel, output_type_str: Optional[str]) -> GGMLFi
997
  raise Exception(f"Unexpected combination of types: {name_to_type}")
998
 
999
 
1000
- def do_necessary_conversions(model: LazyModel) -> LazyModel:
1001
  model = handle_quantization(model)
1002
 
1003
  if "lm_head.weight" in model:
1004
- model = convert_transformers_to_orig(model)
1005
  model = filter_and_sort_tensors(model)
1006
 
1007
  return model
@@ -1107,14 +1154,14 @@ def load_vocab(path: Path) -> SentencePieceVocab:
1107
  return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None)
1108
 
1109
 
1110
- def default_outfile(model_paths: List[Path], params: Params) -> Path:
1111
  namestr = {
1112
  GGMLFileType.AllF32: "f32",
1113
  GGMLFileType.MostlyF16: "f16",
1114
  GGMLFileType.MostlyQ4_0: "q4_0",
1115
  GGMLFileType.MostlyQ4_1: "q4_1",
1116
  GGMLFileType.PerLayerIsQ4_1: "q4_1",
1117
- }[params.file_type]
1118
  ret = model_paths[0].parent / f"ggml-model-{namestr}.bin"
1119
  if ret in model_paths:
1120
  sys.stderr.write(
@@ -1164,13 +1211,13 @@ def main(args_in: Optional[List[str]] = None) -> None:
1164
  else:
1165
  vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[0].parent
1166
  vocab = load_vocab(vocab_dir)
 
1167
  model = model_plus.model
1168
- model = do_necessary_conversions(model)
1169
  output_type = pick_output_type(model, args.outtype)
1170
  model = convert_to_output_type(model, output_type)
1171
- params = Params.guessed(model, output_type)
1172
- outfile = args.outfile or default_outfile(model_plus.paths, params)
1173
- OutputFile.write_all(outfile, params, model, vocab)
1174
  print(f"Wrote {outfile}")
1175
 
1176
 
 
130
  TENSORS_SET = set(TENSORS_LIST)
131
 
132
 
133
+ def find_n_mult(n_ff: int, n_embd: int) -> int:
134
+ # hardcoded magic range
135
+ for n_mult in range(256, 1, -1):
136
+ calc_ff = (((8*n_embd) // 3 + n_mult - 1) // n_mult)*n_mult
137
+ if calc_ff == n_ff:
138
+ return n_mult
139
+ return 1
140
+
141
  @dataclass
142
  class Params:
143
  n_vocab: int
 
145
  n_mult: int
146
  n_head: int
147
  n_layer: int
 
148
 
149
  @staticmethod
150
+ def guessed(model: 'LazyModel') -> 'Params':
151
+ # try transformer naming first
152
+ n_vocab, n_embd = model["model.embed_tokens.weight"].shape if "model.embed_tokens.weight" in model else model["tok_embeddings.weight"].shape
153
+
154
+ # try transformer naming first
155
+ if "model.layers.0.self_attn.q_proj.weight" in model:
156
+ n_layer=next(i for i in itertools.count() if f"model.layers.{i}.self_attn.q_proj.weight" not in model)
157
+ else:
158
+ n_layer=next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model)
159
+
160
+ n_head=n_embd // 128 # guessed
161
 
162
  return Params(
163
  n_vocab=n_vocab,
164
  n_embd=n_embd,
165
  n_mult=256,
166
+ n_head=n_head,
167
+ n_layer=n_layer,
168
+ )
169
+
170
+ @staticmethod
171
+ def loadHFTransformerJson(model: 'LazyModel', config_path: 'Path') -> 'Params':
172
+ config = json.load(open(config_path))
173
+
174
+ n_vocab = config["vocab_size"];
175
+ n_embd = config["hidden_size"];
176
+ n_head = config["num_attention_heads"];
177
+ n_layer = config["num_hidden_layers"];
178
+ n_ff = config["intermediate_size"];
179
+
180
+ n_mult = find_n_mult(n_ff, n_embd);
181
+
182
+ return Params(
183
+ n_vocab=n_vocab,
184
+ n_embd=n_embd,
185
+ n_mult=n_mult,
186
+ n_head=n_head,
187
+ n_layer=n_layer,
188
  )
189
 
190
+ @staticmethod
191
+ def load(model_plus: 'ModelPlus') -> 'Params':
192
+ orig_config_path = model_plus.paths[0].parent / "params.json"
193
+ hf_transformer_config_path = model_plus.paths[0].parent / "config.json"
194
+
195
+ if hf_transformer_config_path.exists():
196
+ params = Params.loadHFTransformerJson(model_plus.model, hf_transformer_config_path)
197
+ else:
198
+ params = Params.guessed(model_plus.model)
199
+
200
+ print(f'params: n_vocab:{params.n_vocab} n_embd:{params.n_embd} n_mult:{params.n_mult} n_head:{params.n_head} n_layer:{params.n_layer}')
201
+ return params
202
+
203
 
204
  class SentencePieceVocab:
205
  def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path]) -> None:
 
643
  return LazyTensor(load, lazy_tensor.shape, lazy_tensor.data_type, f'permute({n_head}) ' + lazy_tensor.description)
644
 
645
 
646
+ def convert_transformers_to_orig(model: LazyModel, params: Params) -> LazyModel:
647
  out: LazyModel = {}
648
  out["tok_embeddings.weight"] = model["model.embed_tokens.weight"]
649
  out["norm.weight"] = model["model.norm.weight"]
650
  out["output.weight"] = model["lm_head.weight"]
651
 
 
652
  for i in itertools.count():
653
  if f"model.layers.{i}.self_attn.q_proj.weight" not in model:
654
  break
655
+ out[f"layers.{i}.attention.wq.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head)
656
+ out[f"layers.{i}.attention.wk.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head)
657
  out[f"layers.{i}.attention.wv.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"]
658
  out[f"layers.{i}.attention.wo.weight"] = model[f"model.layers.{i}.self_attn.o_proj.weight"]
659
 
 
967
  def __init__(self, fname_out: Path) -> None:
968
  self.fout = open(fname_out, "wb")
969
 
970
+ def write_file_header(self, params: Params, file_type: GGMLFileType) -> None:
971
  self.fout.write(b"ggjt"[::-1]) # magic
972
  values = [
973
  1, # file version
 
977
  params.n_head,
978
  params.n_layer,
979
  params.n_embd // params.n_head, # rot (obsolete)
980
+ file_type.value,
981
  ]
982
  self.fout.write(struct.pack("i" * len(values), *values))
983
 
 
998
  def write_vocab_only(fname_out: Path, vocab: Vocab) -> None:
999
  of = OutputFile(fname_out)
1000
  params = Params(n_vocab=vocab.vocab_size, n_embd=0, n_mult=0,
1001
+ n_head=1, n_layer=0)
1002
  of = OutputFile(fname_out)
1003
+ of.write_file_header(params, file_type=GGMLFileType.AllF32)
1004
  of.write_vocab(vocab)
1005
  of.fout.close()
1006
 
1007
  @staticmethod
1008
+ def write_all(fname_out: Path, params: Params, file_type: GGMLFileType, model: LazyModel, vocab: Vocab) -> None:
1009
  check_vocab_size(params, vocab)
1010
  of = OutputFile(fname_out)
1011
+ of.write_file_header(params, file_type)
1012
  print("Writing vocab...")
1013
  of.write_vocab(vocab)
1014
 
 
1044
  raise Exception(f"Unexpected combination of types: {name_to_type}")
1045
 
1046
 
1047
+ def do_necessary_conversions(model: LazyModel, params: Params) -> LazyModel:
1048
  model = handle_quantization(model)
1049
 
1050
  if "lm_head.weight" in model:
1051
+ model = convert_transformers_to_orig(model, params)
1052
  model = filter_and_sort_tensors(model)
1053
 
1054
  return model
 
1154
  return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None)
1155
 
1156
 
1157
+ def default_outfile(model_paths: List[Path], file_type: GGMLFileType) -> Path:
1158
  namestr = {
1159
  GGMLFileType.AllF32: "f32",
1160
  GGMLFileType.MostlyF16: "f16",
1161
  GGMLFileType.MostlyQ4_0: "q4_0",
1162
  GGMLFileType.MostlyQ4_1: "q4_1",
1163
  GGMLFileType.PerLayerIsQ4_1: "q4_1",
1164
+ }[file_type]
1165
  ret = model_paths[0].parent / f"ggml-model-{namestr}.bin"
1166
  if ret in model_paths:
1167
  sys.stderr.write(
 
1211
  else:
1212
  vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[0].parent
1213
  vocab = load_vocab(vocab_dir)
1214
+ params = Params.load(model_plus)
1215
  model = model_plus.model
1216
+ model = do_necessary_conversions(model, params)
1217
  output_type = pick_output_type(model, args.outtype)
1218
  model = convert_to_output_type(model, output_type)
1219
+ outfile = args.outfile or default_outfile(model_plus.paths, output_type)
1220
+ OutputFile.write_all(outfile, params, output_type, model, vocab)
 
1221
  print(f"Wrote {outfile}")
1222
 
1223
 
examples/CMakeLists.txt CHANGED
@@ -38,6 +38,7 @@ else()
38
  add_subdirectory(benchmark)
39
  add_subdirectory(baby-llama)
40
  add_subdirectory(train-text-from-scratch)
 
41
  if (LLAMA_METAL)
42
  add_subdirectory(metal)
43
  endif()
 
38
  add_subdirectory(benchmark)
39
  add_subdirectory(baby-llama)
40
  add_subdirectory(train-text-from-scratch)
41
+ add_subdirectory(simple)
42
  if (LLAMA_METAL)
43
  add_subdirectory(metal)
44
  endif()
examples/common.cpp CHANGED
@@ -106,9 +106,6 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
106
  }
107
 
108
  if (arg == "-s" || arg == "--seed") {
109
- #if defined(GGML_USE_CUBLAS)
110
- fprintf(stderr, "WARNING: when using cuBLAS generation results are NOT guaranteed to be reproducible.\n");
111
- #endif
112
  if (++i >= argc) {
113
  invalid_param = true;
114
  break;
@@ -539,7 +536,7 @@ std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::s
539
  return res;
540
  }
541
 
542
- struct llama_context * llama_init_from_gpt_params(const gpt_params & params) {
543
  auto lparams = llama_context_default_params();
544
 
545
  lparams.n_ctx = params.n_ctx;
@@ -555,25 +552,33 @@ struct llama_context * llama_init_from_gpt_params(const gpt_params & params) {
555
  lparams.logits_all = params.perplexity;
556
  lparams.embedding = params.embedding;
557
 
558
- llama_context * lctx = llama_init_from_file(params.model.c_str(), lparams);
 
 
 
 
559
 
 
560
  if (lctx == NULL) {
561
- fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
562
- return NULL;
 
563
  }
564
 
565
  if (!params.lora_adapter.empty()) {
566
- int err = llama_apply_lora_from_file(lctx,
567
  params.lora_adapter.c_str(),
568
  params.lora_base.empty() ? NULL : params.lora_base.c_str(),
569
  params.n_threads);
570
  if (err != 0) {
571
  fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
572
- return NULL;
 
 
573
  }
574
  }
575
 
576
- return lctx;
577
  }
578
 
579
  void console_init(console_state & con_st) {
 
106
  }
107
 
108
  if (arg == "-s" || arg == "--seed") {
 
 
 
109
  if (++i >= argc) {
110
  invalid_param = true;
111
  break;
 
536
  return res;
537
  }
538
 
539
+ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(const gpt_params & params) {
540
  auto lparams = llama_context_default_params();
541
 
542
  lparams.n_ctx = params.n_ctx;
 
552
  lparams.logits_all = params.perplexity;
553
  lparams.embedding = params.embedding;
554
 
555
+ llama_model * model = llama_load_model_from_file(params.model.c_str(), lparams);
556
+ if (model == NULL) {
557
+ fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
558
+ return std::make_tuple(nullptr, nullptr);
559
+ }
560
 
561
+ llama_context * lctx = llama_new_context_with_model(model, lparams);
562
  if (lctx == NULL) {
563
+ fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str());
564
+ llama_free_model(model);
565
+ return std::make_tuple(nullptr, nullptr);
566
  }
567
 
568
  if (!params.lora_adapter.empty()) {
569
+ int err = llama_model_apply_lora_from_file(model,
570
  params.lora_adapter.c_str(),
571
  params.lora_base.empty() ? NULL : params.lora_base.c_str(),
572
  params.n_threads);
573
  if (err != 0) {
574
  fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
575
+ llama_free(lctx);
576
+ llama_free_model(model);
577
+ return std::make_tuple(nullptr, nullptr);
578
  }
579
  }
580
 
581
+ return std::make_tuple(model, lctx);
582
  }
583
 
584
  void console_init(console_state & con_st) {
examples/common.h CHANGED
@@ -9,6 +9,7 @@
9
  #include <random>
10
  #include <thread>
11
  #include <unordered_map>
 
12
 
13
  #if !defined (_WIN32)
14
  #include <stdio.h>
@@ -95,7 +96,7 @@ std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::s
95
  // Model utils
96
  //
97
 
98
- struct llama_context * llama_init_from_gpt_params(const gpt_params & params);
99
 
100
  //
101
  // Console utils
 
9
  #include <random>
10
  #include <thread>
11
  #include <unordered_map>
12
+ #include <tuple>
13
 
14
  #if !defined (_WIN32)
15
  #include <stdio.h>
 
96
  // Model utils
97
  //
98
 
99
+ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(const gpt_params & params);
100
 
101
  //
102
  // Console utils
examples/embedding/embedding.cpp CHANGED
@@ -37,11 +37,12 @@ int main(int argc, char ** argv) {
37
 
38
  llama_init_backend();
39
 
 
40
  llama_context * ctx;
41
 
42
  // load the model
43
- ctx = llama_init_from_gpt_params(params);
44
- if (ctx == NULL) {
45
  fprintf(stderr, "%s: error: unable to load model\n", __func__);
46
  return 1;
47
  }
@@ -90,6 +91,7 @@ int main(int argc, char ** argv) {
90
 
91
  llama_print_timings(ctx);
92
  llama_free(ctx);
 
93
 
94
  return 0;
95
  }
 
37
 
38
  llama_init_backend();
39
 
40
+ llama_model * model;
41
  llama_context * ctx;
42
 
43
  // load the model
44
+ std::tie(model, ctx) = llama_init_from_gpt_params(params);
45
+ if (model == NULL) {
46
  fprintf(stderr, "%s: error: unable to load model\n", __func__);
47
  return 1;
48
  }
 
91
 
92
  llama_print_timings(ctx);
93
  llama_free(ctx);
94
+ llama_free_model(model);
95
 
96
  return 0;
97
  }
examples/main/main.cpp CHANGED
@@ -107,12 +107,13 @@ int main(int argc, char ** argv) {
107
 
108
  llama_init_backend();
109
 
 
110
  llama_context * ctx;
111
  g_ctx = &ctx;
112
 
113
  // load the model and apply lora adapter, if any
114
- ctx = llama_init_from_gpt_params(params);
115
- if (ctx == NULL) {
116
  fprintf(stderr, "%s: error: unable to load model\n", __func__);
117
  return 1;
118
  }
@@ -139,6 +140,7 @@ int main(int argc, char ** argv) {
139
 
140
  llama_print_timings(ctx);
141
  llama_free(ctx);
 
142
 
143
  return 0;
144
  }
@@ -147,6 +149,7 @@ int main(int argc, char ** argv) {
147
  if (params.export_cgraph) {
148
  llama_eval_export(ctx, "llama.ggml");
149
  llama_free(ctx);
 
150
 
151
  return 0;
152
  }
@@ -354,7 +357,7 @@ int main(int argc, char ** argv) {
354
  if ((int)embd.size() > max_embd_size) {
355
  auto skipped_tokens = embd.size() - max_embd_size;
356
  console_set_color(con_st, CONSOLE_COLOR_ERROR);
357
- printf("<<input too long: skipped %" PRIu64 " token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
358
  console_set_color(con_st, CONSOLE_COLOR_DEFAULT);
359
  fflush(stdout);
360
  embd.resize(max_embd_size);
@@ -666,6 +669,7 @@ int main(int argc, char ** argv) {
666
 
667
  llama_print_timings(ctx);
668
  llama_free(ctx);
 
669
 
670
  return 0;
671
  }
 
107
 
108
  llama_init_backend();
109
 
110
+ llama_model * model;
111
  llama_context * ctx;
112
  g_ctx = &ctx;
113
 
114
  // load the model and apply lora adapter, if any
115
+ std::tie(model, ctx) = llama_init_from_gpt_params(params);
116
+ if (model == NULL) {
117
  fprintf(stderr, "%s: error: unable to load model\n", __func__);
118
  return 1;
119
  }
 
140
 
141
  llama_print_timings(ctx);
142
  llama_free(ctx);
143
+ llama_free_model(model);
144
 
145
  return 0;
146
  }
 
149
  if (params.export_cgraph) {
150
  llama_eval_export(ctx, "llama.ggml");
151
  llama_free(ctx);
152
+ llama_free_model(model);
153
 
154
  return 0;
155
  }
 
357
  if ((int)embd.size() > max_embd_size) {
358
  auto skipped_tokens = embd.size() - max_embd_size;
359
  console_set_color(con_st, CONSOLE_COLOR_ERROR);
360
+ printf("<<input too long: skipped %zu token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
361
  console_set_color(con_st, CONSOLE_COLOR_DEFAULT);
362
  fflush(stdout);
363
  embd.resize(max_embd_size);
 
669
 
670
  llama_print_timings(ctx);
671
  llama_free(ctx);
672
+ llama_free_model(model);
673
 
674
  return 0;
675
  }
examples/metal/metal.cpp CHANGED
@@ -40,8 +40,10 @@ int main(int argc, char ** argv) {
40
  // this allocates all Metal resources and memory buffers
41
  auto * ctx_metal = ggml_metal_init();
42
 
43
- ggml_metal_add_buffer(ctx_metal, "data", ggml_get_mem_buffer(ctx_data), ggml_get_mem_size(ctx_data));
44
- ggml_metal_add_buffer(ctx_metal, "eval", ggml_get_mem_buffer(ctx_eval), ggml_get_mem_size(ctx_eval));
 
 
45
 
46
  // main
47
  {
 
40
  // this allocates all Metal resources and memory buffers
41
  auto * ctx_metal = ggml_metal_init();
42
 
43
+ const size_t max_size_data = ggml_get_max_tensor_size(ctx_data);
44
+ const size_t max_size_eval = ggml_get_max_tensor_size(ctx_eval);
45
+ ggml_metal_add_buffer(ctx_metal, "data", ggml_get_mem_buffer(ctx_data), ggml_get_mem_size(ctx_data), max_size_data);
46
+ ggml_metal_add_buffer(ctx_metal, "eval", ggml_get_mem_buffer(ctx_eval), ggml_get_mem_size(ctx_eval), max_size_eval);
47
 
48
  // main
49
  {
examples/perplexity/perplexity.cpp CHANGED
@@ -149,11 +149,12 @@ int main(int argc, char ** argv) {
149
 
150
  llama_init_backend();
151
 
 
152
  llama_context * ctx;
153
 
154
  // load the model and apply lora adapter, if any
155
- ctx = llama_init_from_gpt_params(params);
156
- if (ctx == NULL) {
157
  fprintf(stderr, "%s: error: unable to load model\n", __func__);
158
  return 1;
159
  }
@@ -169,6 +170,7 @@ int main(int argc, char ** argv) {
169
 
170
  llama_print_timings(ctx);
171
  llama_free(ctx);
 
172
 
173
  return 0;
174
  }
 
149
 
150
  llama_init_backend();
151
 
152
+ llama_model * model;
153
  llama_context * ctx;
154
 
155
  // load the model and apply lora adapter, if any
156
+ std::tie(model, ctx) = llama_init_from_gpt_params(params);
157
+ if (model == NULL) {
158
  fprintf(stderr, "%s: error: unable to load model\n", __func__);
159
  return 1;
160
  }
 
170
 
171
  llama_print_timings(ctx);
172
  llama_free(ctx);
173
+ llama_free_model(model);
174
 
175
  return 0;
176
  }
examples/quantize-stats/quantize-stats.cpp CHANGED
@@ -320,6 +320,7 @@ int main(int argc, char ** argv) {
320
  fprintf(stderr, "Loading model\n");
321
 
322
  const int64_t t_main_start_us = ggml_time_us();
 
323
  llama_context * ctx;
324
 
325
  {
@@ -330,12 +331,20 @@ int main(int argc, char ** argv) {
330
  lparams.f16_kv = false;
331
  lparams.use_mlock = false;
332
 
333
- ctx = llama_init_from_file(params.model.c_str(), lparams);
334
 
335
- if (ctx == NULL) {
336
  fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
337
  return 1;
338
  }
 
 
 
 
 
 
 
 
339
  }
340
 
341
  const auto &tensors = llama_internal_get_tensor_map(ctx);
@@ -357,6 +366,7 @@ int main(int argc, char ** argv) {
357
  fprintf(stderr, "%s: error: Quantization should be tested with a float model, "
358
  "this model contains already quantized layers (%s is type %d)\n", __func__, kv_tensor.first.c_str(), kv_tensor.second->type);
359
  llama_free(ctx);
 
360
  return 1;
361
  }
362
  included_layers++;
@@ -415,6 +425,7 @@ int main(int argc, char ** argv) {
415
 
416
 
417
  llama_free(ctx);
 
418
  // report timing
419
  {
420
  const int64_t t_main_end_us = ggml_time_us();
 
320
  fprintf(stderr, "Loading model\n");
321
 
322
  const int64_t t_main_start_us = ggml_time_us();
323
+ llama_model * model;
324
  llama_context * ctx;
325
 
326
  {
 
331
  lparams.f16_kv = false;
332
  lparams.use_mlock = false;
333
 
334
+ model = llama_load_model_from_file(params.model.c_str(), lparams);
335
 
336
+ if (model == NULL) {
337
  fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
338
  return 1;
339
  }
340
+
341
+ ctx = llama_new_context_with_model(model, lparams);
342
+
343
+ if (ctx == NULL) {
344
+ fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str());
345
+ llama_free_model(model);
346
+ return 1;
347
+ }
348
  }
349
 
350
  const auto &tensors = llama_internal_get_tensor_map(ctx);
 
366
  fprintf(stderr, "%s: error: Quantization should be tested with a float model, "
367
  "this model contains already quantized layers (%s is type %d)\n", __func__, kv_tensor.first.c_str(), kv_tensor.second->type);
368
  llama_free(ctx);
369
+ llama_free_model(model);
370
  return 1;
371
  }
372
  included_layers++;
 
425
 
426
 
427
  llama_free(ctx);
428
+ llama_free_model(model);
429
  // report timing
430
  {
431
  const int64_t t_main_end_us = ggml_time_us();
examples/save-load-state/save-load-state.cpp CHANGED
@@ -35,12 +35,22 @@ int main(int argc, char ** argv) {
35
  auto last_n_tokens_data = std::vector<llama_token>(params.repeat_last_n, 0);
36
 
37
  // init
38
- auto ctx = llama_init_from_file(params.model.c_str(), lparams);
 
 
 
 
 
 
 
 
39
  auto tokens = std::vector<llama_token>(params.n_ctx);
40
  auto n_prompt_tokens = llama_tokenize(ctx, params.prompt.c_str(), tokens.data(), int(tokens.size()), true);
41
 
42
  if (n_prompt_tokens < 1) {
43
  fprintf(stderr, "%s : failed to tokenize prompt\n", __func__);
 
 
44
  return 1;
45
  }
46
 
@@ -84,6 +94,8 @@ int main(int argc, char ** argv) {
84
  printf("%s", next_token_str);
85
  if (llama_eval(ctx, &next_token, 1, n_past, params.n_threads)) {
86
  fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
 
 
87
  return 1;
88
  }
89
  n_past += 1;
@@ -91,23 +103,27 @@ int main(int argc, char ** argv) {
91
 
92
  printf("\n\n");
93
 
94
- // free old model
95
  llama_free(ctx);
96
 
97
- // load new model
98
- auto ctx2 = llama_init_from_file(params.model.c_str(), lparams);
99
 
100
  // Load state (rng, logits, embedding and kv_cache) from file
101
  {
102
  FILE *fp_read = fopen("dump_state.bin", "rb");
103
  if (state_size != llama_get_state_size(ctx2)) {
104
  fprintf(stderr, "\n%s : failed to validate state size\n", __func__);
 
 
105
  return 1;
106
  }
107
 
108
  const size_t ret = fread(state_mem, 1, state_size, fp_read);
109
  if (ret != state_size) {
110
  fprintf(stderr, "\n%s : failed to read state\n", __func__);
 
 
111
  return 1;
112
  }
113
 
@@ -138,6 +154,8 @@ int main(int argc, char ** argv) {
138
  printf("%s", next_token_str);
139
  if (llama_eval(ctx2, &next_token, 1, n_past, params.n_threads)) {
140
  fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
 
 
141
  return 1;
142
  }
143
  n_past += 1;
@@ -145,5 +163,8 @@ int main(int argc, char ** argv) {
145
 
146
  printf("\n\n");
147
 
 
 
 
148
  return 0;
149
  }
 
35
  auto last_n_tokens_data = std::vector<llama_token>(params.repeat_last_n, 0);
36
 
37
  // init
38
+ auto model = llama_load_model_from_file(params.model.c_str(), lparams);
39
+ if (model == nullptr) {
40
+ return 1;
41
+ }
42
+ auto ctx = llama_new_context_with_model(model, lparams);
43
+ if (ctx == nullptr) {
44
+ llama_free_model(model);
45
+ return 1;
46
+ }
47
  auto tokens = std::vector<llama_token>(params.n_ctx);
48
  auto n_prompt_tokens = llama_tokenize(ctx, params.prompt.c_str(), tokens.data(), int(tokens.size()), true);
49
 
50
  if (n_prompt_tokens < 1) {
51
  fprintf(stderr, "%s : failed to tokenize prompt\n", __func__);
52
+ llama_free(ctx);
53
+ llama_free_model(model);
54
  return 1;
55
  }
56
 
 
94
  printf("%s", next_token_str);
95
  if (llama_eval(ctx, &next_token, 1, n_past, params.n_threads)) {
96
  fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
97
+ llama_free(ctx);
98
+ llama_free_model(model);
99
  return 1;
100
  }
101
  n_past += 1;
 
103
 
104
  printf("\n\n");
105
 
106
+ // free old context
107
  llama_free(ctx);
108
 
109
+ // make new context
110
+ auto ctx2 = llama_new_context_with_model(model, lparams);
111
 
112
  // Load state (rng, logits, embedding and kv_cache) from file
113
  {
114
  FILE *fp_read = fopen("dump_state.bin", "rb");
115
  if (state_size != llama_get_state_size(ctx2)) {
116
  fprintf(stderr, "\n%s : failed to validate state size\n", __func__);
117
+ llama_free(ctx2);
118
+ llama_free_model(model);
119
  return 1;
120
  }
121
 
122
  const size_t ret = fread(state_mem, 1, state_size, fp_read);
123
  if (ret != state_size) {
124
  fprintf(stderr, "\n%s : failed to read state\n", __func__);
125
+ llama_free(ctx2);
126
+ llama_free_model(model);
127
  return 1;
128
  }
129
 
 
154
  printf("%s", next_token_str);
155
  if (llama_eval(ctx2, &next_token, 1, n_past, params.n_threads)) {
156
  fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
157
+ llama_free(ctx2);
158
+ llama_free_model(model);
159
  return 1;
160
  }
161
  n_past += 1;
 
163
 
164
  printf("\n\n");
165
 
166
+ llama_free(ctx2);
167
+ llama_free_model(model);
168
+
169
  return 0;
170
  }
examples/server/README.md CHANGED
@@ -21,6 +21,7 @@ Command line options:
21
  - `-to N`, `--timeout N`: Server read/write timeout in seconds. Default `600`.
22
  - `--host`: Set the hostname or ip address to listen. Default `127.0.0.1`.
23
  - `--port`: Set the port to listen. Default: `8080`.
 
24
 
25
  ## Build
26
 
@@ -119,14 +120,14 @@ node .
119
 
120
  `top_p`: Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P (default: 0.9).
121
 
122
- `n_predict`: Set the number of tokens to predict when generating text. **Note:** May exceed the set limit slightly if the last token is a partial multibyte character. (default: 128, -1 = infinity).
123
 
124
  `n_keep`: Specify the number of tokens from the initial prompt to retain when the model resets its internal context.
125
  By default, this value is set to 0 (meaning no tokens are kept). Use `-1` to retain all tokens from the initial prompt.
126
 
127
  `stream`: It allows receiving each predicted token in real-time instead of waiting for the completion to finish. To enable this, set to `true`.
128
 
129
- `prompt`: Provide a prompt. Internally, the prompt is compared, and it detects if a part has already been evaluated, and the remaining part will be evaluate.
130
 
131
  `stop`: Specify a JSON array of stopping strings.
132
  These words will not be included in the completion, so make sure to add them to the prompt for the next iteration (default: []).
@@ -163,6 +164,14 @@ node .
163
 
164
  `content`: Set the text to tokenize.
165
 
 
 
 
 
 
 
 
 
166
  ## More examples
167
 
168
  ### Interactive mode
 
21
  - `-to N`, `--timeout N`: Server read/write timeout in seconds. Default `600`.
22
  - `--host`: Set the hostname or ip address to listen. Default `127.0.0.1`.
23
  - `--port`: Set the port to listen. Default: `8080`.
24
+ - `--embedding`: Enable embedding extraction. Default: disabled.
25
 
26
  ## Build
27
 
 
120
 
121
  `top_p`: Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P (default: 0.9).
122
 
123
+ `n_predict`: Set the number of tokens to predict when generating text. **Note:** May exceed the set limit slightly if the last token is a partial multibyte character. When 0, no tokens will be generated but the prompt is evaluated into the cache. (default: 128, -1 = infinity).
124
 
125
  `n_keep`: Specify the number of tokens from the initial prompt to retain when the model resets its internal context.
126
  By default, this value is set to 0 (meaning no tokens are kept). Use `-1` to retain all tokens from the initial prompt.
127
 
128
  `stream`: It allows receiving each predicted token in real-time instead of waiting for the completion to finish. To enable this, set to `true`.
129
 
130
+ `prompt`: Provide a prompt. Internally, the prompt is compared against the previous one to detect whether a part has already been evaluated, and only the remaining part is evaluated. A space is inserted at the front, as main.cpp does.
131
 
132
  `stop`: Specify a JSON array of stopping strings.
133
  These words will not be included in the completion, so make sure to add them to the prompt for the next iteration (default: []).
 
164
 
165
  `content`: Set the text to tokenize.
166
 
167
+ Note that the special `BOS` token is not added in front of the text, and a space character is not inserted automatically as it is for `/completion`.
168
+
169
+ - **POST** `/embedding`: Generate embedding of a given text just as [the embedding example](../embedding) does.
170
+
171
+ *Options:*
172
+
173
+ `content`: Set the text to process.
174
+
175
  ## More examples
176
 
177
  ### Interactive mode
examples/server/server.cpp CHANGED
@@ -115,6 +115,7 @@ struct llama_server_context {
115
  std::vector<llama_token> embd;
116
  std::vector<llama_token> last_n_tokens;
117
 
 
118
  llama_context * ctx = nullptr;
119
  gpt_params params;
120
 
@@ -130,6 +131,10 @@ struct llama_server_context {
130
  llama_free(ctx);
131
  ctx = nullptr;
132
  }
 
 
 
 
133
  }
134
 
135
  void rewind() {
@@ -150,8 +155,8 @@ struct llama_server_context {
150
 
151
  bool loadModel(const gpt_params & params_) {
152
  params = params_;
153
- ctx = llama_init_from_gpt_params(params);
154
- if (ctx == nullptr) {
155
  LOG_ERROR("unable to load model", { { "model", params_.model } });
156
  return false;
157
  }
@@ -254,6 +259,11 @@ struct llama_server_context {
254
  n_past += n_eval;
255
  }
256
 
 
 
 
 
 
257
  // out of user input, sample next token
258
  const float temp = params.temp;
259
  const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k;
@@ -419,6 +429,19 @@ struct llama_server_context {
419
 
420
  return token_text;
421
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
422
  };
423
 
424
  static void server_print_usage(const char * argv0, const gpt_params & params,
@@ -457,6 +480,7 @@ static void server_print_usage(const char * argv0, const gpt_params & params,
457
  fprintf(stderr, " --host ip address to listen (default (default: %s)\n", sparams.hostname.c_str());
458
  fprintf(stderr, " --port PORT port to listen (default (default: %d)\n", sparams.port);
459
  fprintf(stderr, " -to N, --timeout N server read/write timeout in seconds (default: %d)\n", sparams.read_timeout);
 
460
  fprintf(stderr, "\n");
461
  }
462
 
@@ -603,6 +627,8 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
603
  params.use_mlock = true;
604
  } else if (arg == "--no-mmap") {
605
  params.use_mmap = false;
 
 
606
  } else {
607
  fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
608
  server_print_usage(argv[0], default_params, default_sparams);
@@ -646,6 +672,12 @@ static json format_generation_settings(llama_server_context & llama) {
646
  };
647
  }
648
 
 
 
 
 
 
 
649
  static json format_final_response(llama_server_context & llama, const std::string & content) {
650
  return json {
651
  { "content", content },
@@ -881,12 +913,27 @@ int main(int argc, char ** argv) {
881
 
882
  svr.Post("/tokenize", [&llama](const Request & req, Response & res) {
883
  const json body = json::parse(req.body);
884
- const std::string content = body["content"].get<std::string>();
885
  const std::vector<llama_token> tokens = llama_tokenize(llama.ctx, content, false);
886
  const json data = format_tokenizer_response(tokens);
887
  return res.set_content(data.dump(), "application/json");
888
  });
889
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
890
  svr.set_logger(log_server_request);
891
 
892
  svr.set_exception_handler([](const Request &, Response & res, std::exception_ptr ep) {
 
115
  std::vector<llama_token> embd;
116
  std::vector<llama_token> last_n_tokens;
117
 
118
+ llama_model * model = nullptr;
119
  llama_context * ctx = nullptr;
120
  gpt_params params;
121
 
 
131
  llama_free(ctx);
132
  ctx = nullptr;
133
  }
134
+ if (model) {
135
+ llama_free_model(model);
136
+ model = nullptr;
137
+ }
138
  }
139
 
140
  void rewind() {
 
155
 
156
  bool loadModel(const gpt_params & params_) {
157
  params = params_;
158
+ std::tie(model, ctx) = llama_init_from_gpt_params(params);
159
+ if (model == nullptr) {
160
  LOG_ERROR("unable to load model", { { "model", params_.model } });
161
  return false;
162
  }
 
259
  n_past += n_eval;
260
  }
261
 
262
+ if (params.n_predict == 0) {
263
+ has_next_token = false;
264
+ return llama_token_eos();
265
+ }
266
+
267
  // out of user input, sample next token
268
  const float temp = params.temp;
269
  const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k;
 
429
 
430
  return token_text;
431
  }
432
+
433
+ std::vector<float> getEmbedding() {
434
+ static const int n_embd = llama_n_embd(ctx);
435
+ if (!params.embedding) {
436
+ LOG_WARNING("embedding disabled", {
437
+ { "params.embedding", params.embedding },
438
+ });
439
+ return std::vector<float>(n_embd, 0.0f);
440
+ }
441
+ const float * data = llama_get_embeddings(ctx);
442
+ std::vector<float> embedding(data, data + n_embd);
443
+ return embedding;
444
+ }
445
  };
446
 
447
  static void server_print_usage(const char * argv0, const gpt_params & params,
 
480
  fprintf(stderr, " --host ip address to listen (default (default: %s)\n", sparams.hostname.c_str());
481
  fprintf(stderr, " --port PORT port to listen (default (default: %d)\n", sparams.port);
482
  fprintf(stderr, " -to N, --timeout N server read/write timeout in seconds (default: %d)\n", sparams.read_timeout);
483
+ fprintf(stderr, " --embedding enable embedding vector output (default: %s)\n", params.embedding ? "enabled" : "disabled");
484
  fprintf(stderr, "\n");
485
  }
486
 
 
627
  params.use_mlock = true;
628
  } else if (arg == "--no-mmap") {
629
  params.use_mmap = false;
630
+ } else if (arg == "--embedding") {
631
+ params.embedding = true;
632
  } else {
633
  fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
634
  server_print_usage(argv[0], default_params, default_sparams);
 
672
  };
673
  }
674
 
675
+ static json format_embedding_response(llama_server_context & llama) {
676
+ return json {
677
+ { "embedding", llama.getEmbedding() },
678
+ };
679
+ }
680
+
681
  static json format_final_response(llama_server_context & llama, const std::string & content) {
682
  return json {
683
  { "content", content },
 
913
 
914
  svr.Post("/tokenize", [&llama](const Request & req, Response & res) {
915
  const json body = json::parse(req.body);
916
+ const std::string content = body.value("content", "");
917
  const std::vector<llama_token> tokens = llama_tokenize(llama.ctx, content, false);
918
  const json data = format_tokenizer_response(tokens);
919
  return res.set_content(data.dump(), "application/json");
920
  });
921
 
922
+ svr.Post("/embedding", [&llama](const Request & req, Response & res) {
923
+ const json body = json::parse(req.body);
924
+
925
+ llama.rewind();
926
+ llama_reset_timings(llama.ctx);
927
+ llama.params.prompt = body.value("content", "");
928
+ llama.params.n_predict = 0;
929
+ llama.loadPrompt();
930
+ llama.beginCompletion();
931
+ llama.doCompletion();
932
+
933
+ const json data = format_embedding_response(llama);
934
+ return res.set_content(data.dump(), "application/json");
935
+ });
936
+
937
  svr.set_logger(log_server_request);
938
 
939
  svr.set_exception_handler([](const Request &, Response & res, std::exception_ptr ep) {
examples/simple/simple.cpp CHANGED
@@ -68,11 +68,12 @@ int main(int argc, char ** argv)
68
 
69
  llama_init_backend();
70
 
71
- llama_context * ctx ;
 
72
 
73
- ctx = llama_init_from_gpt_params( params );
74
 
75
- if ( ctx == NULL )
76
  {
77
  fprintf( stderr , "%s: error: unable to load model\n" , __func__ );
78
  return 1;
@@ -170,6 +171,7 @@ int main(int argc, char ** argv)
170
  } // wend of main loop
171
 
172
  llama_free( ctx );
 
173
 
174
  return 0;
175
  }
 
68
 
69
  llama_init_backend();
70
 
71
+ llama_model * model;
72
+ llama_context * ctx;
73
 
74
+ std::tie(model, ctx) = llama_init_from_gpt_params( params );
75
 
76
+ if ( model == NULL )
77
  {
78
  fprintf( stderr , "%s: error: unable to load model\n" , __func__ );
79
  return 1;
 
171
  } // wend of main loop
172
 
173
  llama_free( ctx );
174
+ llama_free_model( model );
175
 
176
  return 0;
177
  }
examples/train-text-from-scratch/train-text-from-scratch.cpp CHANGED
@@ -3054,7 +3054,8 @@ int main(int argc, char ** argv) {
3054
  struct llama_context_params llama_params = llama_context_default_params();
3055
  llama_params.vocab_only = true;
3056
 
3057
- struct llama_context * lctx = llama_init_from_file(params.fn_vocab_model, llama_params);
 
3058
 
3059
  struct llama_vocab vocab;
3060
  {
@@ -3395,6 +3396,8 @@ int main(int argc, char ** argv) {
3395
  delete[] compute_addr;
3396
  delete[] compute_buf_0;
3397
  delete[] compute_buf_1;
 
 
3398
  ggml_free(model.ctx);
3399
 
3400
  return 0;
 
3054
  struct llama_context_params llama_params = llama_context_default_params();
3055
  llama_params.vocab_only = true;
3056
 
3057
+ struct llama_model * lmodel = llama_load_model_from_file(params.fn_vocab_model, llama_params);
3058
+ struct llama_context * lctx = llama_new_context_with_model(lmodel, llama_params);
3059
 
3060
  struct llama_vocab vocab;
3061
  {
 
3396
  delete[] compute_addr;
3397
  delete[] compute_buf_0;
3398
  delete[] compute_buf_1;
3399
+ llama_free(lctx);
3400
+ llama_free_model(lmodel);
3401
  ggml_free(model.ctx);
3402
 
3403
  return 0;
expose.h CHANGED
@@ -18,7 +18,7 @@ struct load_model_inputs
18
  const bool unban_tokens;
19
  const int clblast_info = 0;
20
  const int blasbatchsize = 512;
21
- const bool debugmode;
22
  const int forceversion = 0;
23
  const int gpulayers = 0;
24
  };
 
18
  const bool unban_tokens;
19
  const int clblast_info = 0;
20
  const int blasbatchsize = 512;
21
+ const int debugmode = 0;
22
  const int forceversion = 0;
23
  const int gpulayers = 0;
24
  };
ggml-cuda.cu CHANGED
@@ -13,6 +13,10 @@
13
  #include "ggml-cuda.h"
14
  #include "ggml.h"
15
 
 
 
 
 
16
  static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
17
 
18
  #define CUDA_CHECK(err) \
@@ -46,7 +50,15 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
46
  } while (0)
47
  #endif // CUDART_VERSION >= 11
48
 
49
- typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, float & v0, float & v1);
 
 
 
 
 
 
 
 
50
  typedef void (*to_fp32_cuda_t)(const void * x, float * y, int k, cudaStream_t stream);
51
  typedef void (*dot_kernel_k_t)(const void * vx, const int ib, const int iqs, const float * y, float & v);
52
  typedef void (*cpy_kernel_t)(const char * cx, char * cdst);
@@ -230,82 +242,106 @@ static __global__ void rms_norm_f32(const float * x, float * dst, const int ncol
230
  }
231
  }
232
 
233
- static __device__ void dequantize_q4_0(const void * vx, const int ib, const int iqs, float & v0, float & v1){
234
  const block_q4_0 * x = (const block_q4_0 *) vx;
235
 
236
- const float d = x[ib].d;
237
 
238
- const uint8_t vui = x[ib].qs[iqs];
239
 
240
- const int8_t vi0 = vui & 0xF;
241
- const int8_t vi1 = vui >> 4;
242
 
243
- v0 = (vi0 - 8)*d;
244
- v1 = (vi1 - 8)*d;
 
 
 
 
 
245
  }
246
 
247
- static __device__ void dequantize_q4_1(const void * vx, const int ib, const int iqs, float & v0, float & v1){
248
  const block_q4_1 * x = (const block_q4_1 *) vx;
249
 
250
- const float d = x[ib].d;
251
- const float m = x[ib].m;
252
 
253
- const uint8_t vui = x[ib].qs[iqs];
254
 
255
- const int8_t vi0 = vui & 0xF;
256
- const int8_t vi1 = vui >> 4;
257
 
258
- v0 = vi0*d + m;
259
- v1 = vi1*d + m;
 
 
 
 
 
260
  }
261
 
262
- static __device__ void dequantize_q5_0(const void * vx, const int ib, const int iqs, float & v0, float & v1){
263
  const block_q5_0 * x = (const block_q5_0 *) vx;
264
 
265
- const float d = x[ib].d;
266
 
267
  uint32_t qh;
268
  memcpy(&qh, x[ib].qh, sizeof(qh));
269
 
270
- const uint8_t xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10;
271
- const uint8_t xh_1 = ((qh >> (iqs + 12)) ) & 0x10;
272
 
273
- const int32_t x0 = ((x[ib].qs[iqs] & 0xf) | xh_0) - 16;
274
- const int32_t x1 = ((x[ib].qs[iqs] >> 4) | xh_1) - 16;
275
 
276
- v0 = x0*d;
277
- v1 = x1*d;
 
 
 
 
 
278
  }
279
 
280
- static __device__ void dequantize_q5_1(const void * vx, const int ib, const int iqs, float & v0, float & v1){
281
  const block_q5_1 * x = (const block_q5_1 *) vx;
282
 
283
- const float d = x[ib].d;
284
- const float m = x[ib].m;
285
 
286
  uint32_t qh;
287
  memcpy(&qh, x[ib].qh, sizeof(qh));
288
 
289
- const uint8_t xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10;
290
- const uint8_t xh_1 = ((qh >> (iqs + 12)) ) & 0x10;
291
 
292
- const int32_t x0 = ((x[ib].qs[iqs] & 0xf) | xh_0);
293
- const int32_t x1 = ((x[ib].qs[iqs] >> 4) | xh_1);
294
 
295
- v0 = x0*d + m;
296
- v1 = x1*d + m;
 
 
 
 
 
297
  }
298
 
299
- static __device__ void dequantize_q8_0(const void * vx, const int ib, const int iqs, float & v0, float & v1){
300
  const block_q8_0 * x = (const block_q8_0 *) vx;
301
 
302
- const float d = x[ib].d;
303
 
304
- const int8_t vi0 = x[ib].qs[iqs + 0];
305
- const int8_t vi1 = x[ib].qs[iqs + 1];
306
 
307
- v0 = vi0*d;
308
- v1 = vi1*d;
 
 
 
 
309
  }
310
 
311
  //================================== k-quants
@@ -479,15 +515,15 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float
479
 
480
  const block_q2_K * x = (const block_q2_K *)vx + ib0;
481
 
482
- const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31
483
- const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0
484
 
485
  const int step = 16/K_QUANTS_PER_ITERATION;
486
 
487
- const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
488
- const int in = tid - step*im; // 0...7
489
 
490
- const int l0 = K_QUANTS_PER_ITERATION*in; // 0...14 in steps of 4
491
  const int q_offset = 32*im + l0;
492
  const int s_offset = 8*im;
493
  const int y_offset = 128*im + l0;
@@ -542,27 +578,30 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float
542
  }
543
  }
544
 
545
- static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float * yy, float * dst, const int ncols) {
546
 
547
  const uint16_t kmask1 = 0x0303;
548
  const uint16_t kmask2 = 0x0f0f;
549
 
550
- const int row = blockIdx.x;
 
 
551
  const int num_blocks_per_row = ncols / QK_K;
552
  const int ib0 = row*num_blocks_per_row;
553
 
554
  const block_q3_K * x = (const block_q3_K *)vx + ib0;
555
 
556
- const int tid = threadIdx.x/2; // 0...15
557
- const int ix = threadIdx.x%2; // 0, 1
558
 
559
- const int n = 2; // iterations in the inner loop
560
- const int im = tid/8; // 0 or 1. 0 computes 0..., 1 computes 128...
561
- const int in = tid - 8*im; // 0...7
 
562
 
563
  const uint8_t m = 1 << (4*im);
564
 
565
- const int l0 = n*in; // 0...28 in steps of 4
566
  const int q_offset = 32*im + l0;
567
  const int y_offset = 128*im + l0;
568
 
@@ -573,7 +612,7 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float
573
 
574
  float tmp = 0; // partial sum for thread in warp
575
 
576
- for (int i = ix; i < num_blocks_per_row; i += 2) {
577
 
578
  const float * y = yy + i * QK_K + y_offset;
579
  const uint8_t * q = x[i].qs + q_offset;
@@ -614,22 +653,25 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float
614
  }
615
  }
616
 
617
- static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float * yy, float * dst, const int ncols) {
618
 
619
  const uint16_t kmask1 = 0x3f3f;
620
  const uint16_t kmask2 = 0x0f0f;
621
  const uint16_t kmask3 = 0xc0c0;
622
 
623
- const int row = blockIdx.x;
 
624
  const int num_blocks_per_row = ncols / QK_K;
625
  const int ib0 = row*num_blocks_per_row;
626
 
627
- const int tid = threadIdx.x/2; // 0...15
628
- const int ix = threadIdx.x%2;
629
 
630
- const int il = tid/4; // 0...3
631
- const int ir = tid - 4*il;// 0...3
632
- const int n = 4;
 
 
633
 
634
  const int im = il/2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
635
  const int in = il%2;
@@ -645,7 +687,7 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float
645
 
646
  float tmp = 0; // partial sum for thread in warp
647
 
648
- for (int i = ix; i < num_blocks_per_row; i += 2) {
649
 
650
  const uint8_t * q1 = x[i].qs + q_offset;
651
  const uint8_t * q2 = q1 + 64;
@@ -700,7 +742,7 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float
700
 
701
  const int il = tid/4; // 0...3
702
  const int ir = tid - 4*il;// 0...3
703
- const int n = 4;
704
 
705
  const int im = il/2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
706
  const int in = il%2;
@@ -739,11 +781,16 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float
739
  float4 sum = {0.f, 0.f, 0.f, 0.f};
740
  float smin = 0;
741
  for (int l = 0; l < n; ++l) {
742
- sum.x += y1[l+ 0] * ((ql1[l] & 0xF) + (qh[l] & (hm1 << 0) ? 16 : 0));
743
- sum.y += y1[l+32] * ((ql1[l] >> 4) + (qh[l] & (hm1 << 1) ? 16 : 0));
744
- sum.z += y2[l+ 0] * ((ql2[l] & 0xF) + (qh[l] & (hm2 << 0) ? 16 : 0));
745
- sum.w += y2[l+32] * ((ql2[l] >> 4) + (qh[l] & (hm2 << 1) ? 16 : 0));
746
- smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
 
 
 
 
 
747
  }
748
  tmp += dall * (sum.x * sc[0] + sum.y * sc[1] + sum.z * sc[4] + sum.w * sc[5]) - dmin * smin;
749
 
@@ -839,11 +886,12 @@ static __global__ void dequantize_mul_mat_vec_q6_k(const void * vx, const float
839
  }
840
  }
841
 
842
- static __device__ void convert_f16(const void * vx, const int ib, const int iqs, float & v0, float & v1){
843
  const half * x = (const half *) vx;
844
 
845
- v0 = __half2float(x[ib + iqs + 0]);
846
- v1 = __half2float(x[ib + iqs + 1]);
 
847
  }
848
 
849
  template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
@@ -860,13 +908,15 @@ static __global__ void dequantize_block(const void * vx, float * y, const int k)
860
  const int y_offset = qr == 1 ? 1 : qk/2;
861
 
862
  // dequantize
863
- float & v0 = y[iybs + iqs + 0];
864
- float & v1 = y[iybs + iqs + y_offset];
865
- dequantize_kernel(vx, ib, iqs, v0, v1);
 
 
866
  }
867
 
868
  template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
869
- static __global__ void dequantize_mul_mat_vec(const void * vx, const float * y, float * dst, const int ncols, const int nrows) {
870
  // qk = quantized weights per x block
871
  // qr = number of quantized weights per data value in x block
872
  const int row = blockIdx.y*blockDim.y + threadIdx.y;
@@ -881,7 +931,12 @@ static __global__ void dequantize_mul_mat_vec(const void * vx, const float * y,
881
  const int vals_per_iter = iter_stride / WARP_SIZE; // num quantized vals per thread and i iter
882
  const int y_offset = qr == 1 ? 1 : qk/2;
883
 
884
- float tmp = 0.0f; // partial sum for thread in warp
 
 
 
 
 
885
 
886
  for (int i = 0; i < ncols; i += iter_stride) {
887
  const int col = i + vals_per_iter*tid;
@@ -895,14 +950,21 @@ static __global__ void dequantize_mul_mat_vec(const void * vx, const float * y,
895
  // process 2 vals per j iter
896
 
897
  // dequantize
898
- float v0, v1;
899
- dequantize_kernel(vx, ib, iqs + j/qr, v0, v1);
900
  // for qr = 2 the iqs needs to increase by 1 per j iter because 2 weights per data val
 
 
901
 
902
  // matrix multiplication
903
- tmp += v0 * y[iybs + iqs + j/qr + 0];
904
- tmp += v1 * y[iybs + iqs + j/qr + y_offset];
905
  // for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2
 
 
 
 
 
 
 
 
 
906
  }
907
  }
908
 
@@ -914,7 +976,11 @@ static __global__ void dequantize_mul_mat_vec(const void * vx, const float * y,
914
  }
915
 
916
  if (tid == 0) {
 
 
 
917
  dst[row] = tmp;
 
918
  }
919
  }
920
 
@@ -1209,7 +1275,7 @@ static void dequantize_row_q6_K_cuda(const void * vx, float * y, const int k, cu
1209
  dequantize_block_q6_K<<<nb, 64, 0, stream>>>(vx, y);
1210
  }
1211
 
1212
- static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1213
  GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
1214
  const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
1215
  const dim3 block_nums(1, block_num_y, 1);
@@ -1218,7 +1284,7 @@ static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const float * y, f
1218
  <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
1219
  }
1220
 
1221
- static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1222
  GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
1223
  const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
1224
  const dim3 block_nums(1, block_num_y, 1);
@@ -1227,7 +1293,7 @@ static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const float * y, f
1227
  <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
1228
  }
1229
 
1230
- static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1231
  GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
1232
  const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
1233
  const dim3 block_nums(1, block_num_y, 1);
@@ -1236,7 +1302,7 @@ static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const float * y, f
1236
  <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
1237
  }
1238
 
1239
- static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1240
  GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
1241
  const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
1242
  const dim3 block_nums(1, block_num_y, 1);
@@ -1245,7 +1311,7 @@ static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const float * y, f
1245
  <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
1246
  }
1247
 
1248
- static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1249
  GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
1250
  const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
1251
  const dim3 block_nums(1, block_num_y, 1);
@@ -1256,7 +1322,7 @@ static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const float * y, f
1256
 
1257
  static void dequantize_mul_mat_vec_q2_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1258
  GGML_ASSERT(ncols % QK_K == 0);
1259
- const int ny = 2;
1260
  const int block_num_y = (nrows + ny - 1) / ny;
1261
  const dim3 block_nums(1, block_num_y, 1);
1262
  const dim3 block_dims(32, ny, 1);
@@ -1265,14 +1331,20 @@ static void dequantize_mul_mat_vec_q2_K_cuda(const void * vx, const float * y, f
1265
 
1266
  static void dequantize_mul_mat_vec_q3_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1267
  GGML_ASSERT(ncols % QK_K == 0);
1268
- const dim3 block_dims(32, 1, 1);
1269
- dequantize_mul_mat_vec_q3_k<<<nrows, block_dims, 0, stream>>>(vx, y, dst, ncols);
 
 
 
1270
  }
1271
 
1272
  static void dequantize_mul_mat_vec_q4_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1273
  GGML_ASSERT(ncols % QK_K == 0);
1274
- const dim3 block_dims(32, 1, 1);
1275
- dequantize_mul_mat_vec_q4_k<<<nrows, block_dims, 0, stream>>>(vx, y, dst, ncols);
 
 
 
1276
  }
1277
 
1278
  static void dequantize_mul_mat_vec_q5_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
@@ -1295,7 +1367,7 @@ static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, c
1295
  dequantize_block<1, 1, convert_f16><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
1296
  }
1297
 
1298
- static void convert_mul_mat_vec_f16_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1299
  GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
1300
  const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
1301
  const dim3 block_nums(1, block_num_y, 1);
@@ -1463,19 +1535,13 @@ static void * g_scratch_buffer = nullptr;
1463
  static size_t g_scratch_size = 1024*1024*1024; // 1 GB by default
1464
  static size_t g_scratch_offset = 0;
1465
 
1466
- #define GGML_CUDA_MAX_STREAMS 8 // Set this to 1 for reproducible matrix multiplication.
1467
- #define GGML_CUDA_MAX_EVENTS 64
1468
-
1469
  static int g_device_count = -1;
1470
  static int g_main_device = 0;
1471
  static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
1472
 
1473
  static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
1474
 
1475
- static cudaStream_t g_cudaStreams_main[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS] = { nullptr };
1476
-
1477
- static cudaStream_t g_cudaStreams_memcpy_src1[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS] = { nullptr };
1478
- static cudaEvent_t g_cudaEvents_memcpy_src1[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_EVENTS] = { nullptr };
1479
 
1480
  void ggml_init_cublas() {
1481
  static bool initialized = false;
@@ -1499,15 +1565,8 @@ void ggml_init_cublas() {
1499
  for (int id = 0; id < g_device_count; ++id) {
1500
  CUDA_CHECK(cudaSetDevice(id));
1501
 
1502
- // create streams
1503
- for (int i = 0; i < GGML_CUDA_MAX_STREAMS; ++i) {
1504
- CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams_main[id][i], cudaStreamNonBlocking));
1505
- CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams_memcpy_src1[id][i], cudaStreamNonBlocking));
1506
- }
1507
- // create events
1508
- for (int i = 0; i < GGML_CUDA_MAX_EVENTS; ++i) {
1509
- CUDA_CHECK(cudaEventCreateWithFlags(&g_cudaEvents_memcpy_src1[id][i], cudaEventDisableTiming));
1510
- }
1511
 
1512
  // create cublas handle
1513
  CUBLAS_CHECK(cublasCreate(&g_cublas_handles[id]));
@@ -1723,21 +1782,40 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec(
1723
  const int64_t ne00 = src0->ne[0];
1724
  const int64_t nrows = i01_high - i01_low;
1725
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1726
  switch (src0->type) {
1727
  case GGML_TYPE_Q4_0:
1728
- dequantize_mul_mat_vec_q4_0_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
1729
  break;
1730
  case GGML_TYPE_Q4_1:
1731
- dequantize_mul_mat_vec_q4_1_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
1732
  break;
1733
  case GGML_TYPE_Q5_0:
1734
- dequantize_mul_mat_vec_q5_0_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
1735
  break;
1736
  case GGML_TYPE_Q5_1:
1737
- dequantize_mul_mat_vec_q5_1_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
1738
  break;
1739
  case GGML_TYPE_Q8_0:
1740
- dequantize_mul_mat_vec_q8_0_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
1741
  break;
1742
  case GGML_TYPE_Q2_K:
1743
  dequantize_mul_mat_vec_q2_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
@@ -1755,7 +1833,7 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec(
1755
  dequantize_mul_mat_vec_q6_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
1756
  break;
1757
  case GGML_TYPE_F16:
1758
- convert_mul_mat_vec_f16_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
1759
  break;
1760
  default:
1761
  GGML_ASSERT(false);
@@ -1763,6 +1841,12 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec(
1763
  }
1764
  CUDA_CHECK(cudaGetLastError());
1765
 
 
 
 
 
 
 
1766
  (void) src1;
1767
  (void) dst;
1768
  (void) src0_ddf_i;
@@ -1974,6 +2058,12 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
1974
  size_t src1_asf[GGML_CUDA_MAX_DEVICES] = {0};
1975
  size_t dst_asf[GGML_CUDA_MAX_DEVICES] = {0};
1976
 
 
 
 
 
 
 
1977
  for (int id = 0; id < g_device_count; ++id) {
1978
  if (!split && id != g_main_device) {
1979
  continue;
@@ -2072,9 +2162,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
2072
  }
2073
  const int64_t i11 = i13*ne12 + i12;
2074
 
2075
- cudaStream_t cudaStream_main = g_cudaStreams_main[id][i0 % GGML_CUDA_MAX_STREAMS];
2076
- cudaStream_t cudaStream_memcpy_src1 = g_cudaStreams_memcpy_src1[id][i0 % GGML_CUDA_MAX_STREAMS];
2077
- cudaEvent_t cudaEvent_memcpy_src1 = g_cudaEvents_memcpy_src1[id][i0 % GGML_CUDA_MAX_EVENTS];
2078
 
2079
  // for split tensors the data begins at i0 == i0_offset_low
2080
  char * src0_ddq_i = src0_ddq[id] + (i0 - i0_offset_low)*src0_stride*src0_ts/src0_bs;
@@ -2102,14 +2190,14 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
2102
  if (src1->backend == GGML_BACKEND_CPU) {
2103
  GGML_ASSERT(!flatten_rows || nrows0 == ggml_nrows(src1));
2104
  int64_t nrows1 = flatten_rows ? nrows0 : ne11;
2105
- CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf_i, src1, i03, i02, 0, nrows1, cudaStream_memcpy_src1));
2106
  } else if (src1->backend == GGML_BACKEND_GPU && src1_is_contiguous) {
2107
  if (id != g_main_device) {
2108
  GGML_ASSERT(!flatten_rows);
2109
  float * src1_ddf_i_source = (float *) src1_extra->data_device[g_main_device];
2110
  src1_ddf_i_source += i11*src1_stride;
2111
  CUDA_CHECK(cudaMemcpyAsync(src1_ddf_i, src1_ddf_i_source, src1_stride*sizeof(float),
2112
- cudaMemcpyDeviceToDevice, cudaStream_memcpy_src1));
2113
  }
2114
  } else if (src1_on_device && !src1_is_contiguous) {
2115
  GGML_ASSERT(!split);
@@ -2118,7 +2206,6 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
2118
  GGML_ASSERT(false);
2119
  }
2120
  }
2121
- CUDA_CHECK(cudaEventRecord(cudaEvent_memcpy_src1, cudaStream_memcpy_src1));
2122
 
2123
  if (!src0_on_device || !src0_is_contiguous) {
2124
  if (src0_is_f32) {
@@ -2134,9 +2221,6 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
2134
  CUDA_CHECK(cudaGetLastError());
2135
  }
2136
 
2137
- // wait with main stream until src1 memcpy is done
2138
- CUDA_CHECK(cudaStreamWaitEvent(cudaStream_main, cudaEvent_memcpy_src1, 0));
2139
-
2140
  // do the computation
2141
  op(src0, src1, dst, src0_ddq_i, src0_ddf_i, src1_ddf_i, dst_ddf_i, i02, i01_low, i01_high, i11, cudaStream_main);
2142
 
@@ -2174,8 +2258,13 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
2174
 
2175
  // wait until each device is finished, then free their buffers
2176
  for (int id = 0; id < g_device_count; ++id) {
 
 
 
 
2177
  CUDA_CHECK(cudaSetDevice(id));
2178
  CUDA_CHECK(cudaDeviceSynchronize());
 
2179
  if (src0_asq[id] > 0) {
2180
  ggml_cuda_pool_free(src0_ddq[id], src0_asq[id]);
2181
  }
@@ -2241,7 +2330,7 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr
2241
  const int64_t ne02 = src0->ne[2];
2242
 
2243
  CUDA_CHECK(cudaSetDevice(g_main_device));
2244
- cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device][0];
2245
 
2246
  struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
2247
  void * src0_ddq = src0_extra->data_device[g_main_device];
@@ -2253,8 +2342,6 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr
2253
  float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
2254
 
2255
  ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, cudaStream_main);
2256
-
2257
- CUDA_CHECK(cudaDeviceSynchronize());
2258
  }
2259
 
2260
  void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
@@ -2272,7 +2359,7 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1
2272
  const int64_t nb02 = src0->nb[2];
2273
 
2274
  CUDA_CHECK(cudaSetDevice(g_main_device));
2275
- cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device][0];
2276
 
2277
  struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
2278
  void * src0_ddq = src0_extra->data_device[g_main_device];
@@ -2287,8 +2374,6 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1
2287
  const int channel_stride_x = nb02 / sizeof(half);
2288
 
2289
  ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, channel_stride_x, cudaStream_main);
2290
-
2291
- CUDA_CHECK(cudaDeviceSynchronize());
2292
  }
2293
 
2294
  void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -2344,7 +2429,7 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
2344
  const int64_t nb12 = src1->nb[2];
2345
 
2346
  CUDA_CHECK(cudaSetDevice(g_main_device));
2347
- cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device][0];
2348
 
2349
  const struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
2350
  const struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
@@ -2362,8 +2447,6 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
2362
  GGML_ASSERT(false);
2363
  }
2364
 
2365
- CUDA_CHECK(cudaDeviceSynchronize());
2366
-
2367
  (void) dst;
2368
  }
2369
 
@@ -2552,7 +2635,7 @@ void ggml_cuda_free_scratch() {
2552
  bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor){
2553
  ggml_cuda_func_t func;
2554
  const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
2555
- || tensor->src0->backend == GGML_BACKEND_GPU || tensor->src0->backend == GGML_BACKEND_GPU_SPLIT
2556
  || (tensor->src1 != nullptr && tensor->src1->backend == GGML_BACKEND_GPU);
2557
 
2558
  switch (tensor->op) {
 
13
  #include "ggml-cuda.h"
14
  #include "ggml.h"
15
 
16
+ #if defined(_MSC_VER)
17
+ #pragma warning(disable: 4244 4267) // possible loss of data
18
+ #endif
19
+
20
  static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
21
 
22
  #define CUDA_CHECK(err) \
 
50
  } while (0)
51
  #endif // CUDART_VERSION >= 11
52
 
53
+ #ifdef GGML_CUDA_DMMV_F16
54
+ typedef half dfloat; // dequantize float
55
+ typedef half2 dfloat2;
56
+ #else
57
+ typedef float dfloat; // dequantize float
58
+ typedef float2 dfloat2;
59
+ #endif //GGML_CUDA_DMMV_F16
60
+
61
+ typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v);
62
  typedef void (*to_fp32_cuda_t)(const void * x, float * y, int k, cudaStream_t stream);
63
  typedef void (*dot_kernel_k_t)(const void * vx, const int ib, const int iqs, const float * y, float & v);
64
  typedef void (*cpy_kernel_t)(const char * cx, char * cdst);
 
242
  }
243
  }
244
 
245
// Dequantize the two 4-bit values packed in byte `iqs` of Q4_0 block `ib`.
//   vx  : base pointer, reinterpreted as an array of block_q4_0
//   ib  : block index into that array
//   iqs : byte index into the block's qs[] array
//   v   : output pair; v.x comes from the low nibble, v.y from the high nibble
// Each value is computed as (q - 8) * d, where d is the block's scale.
// With GGML_CUDA_DMMV_F16 defined the arithmetic uses half2 intrinsics, otherwise plain float math.
+ static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
246
  const block_q4_0 * x = (const block_q4_0 *) vx;
247
 
248
+ const dfloat d = x[ib].d;
249
 
250
+ const int vui = x[ib].qs[iqs];
251
 
252
+ v.x = vui & 0xF;
253
+ v.y = vui >> 4;
254
 
255
+ #ifdef GGML_CUDA_DMMV_F16
256
+ v = __hsub2(v, {8.0f, 8.0f});
257
+ v = __hmul2(v, {d, d});
258
+ #else
259
+ v.x = (v.x - 8.0f) * d;
260
+ v.y = (v.y - 8.0f) * d;
261
+ #endif // GGML_CUDA_DMMV_F16
262
  }
263
 
264
// Dequantize the two 4-bit values packed in byte `iqs` of Q4_1 block `ib`.
//   vx  : base pointer, reinterpreted as an array of block_q4_1
//   ib  : block index into that array
//   iqs : byte index into the block's qs[] array
//   v   : output pair; v.x comes from the low nibble, v.y from the high nibble
// Each value is computed as q * d + m, using the block's d and m fields.
// With GGML_CUDA_DMMV_F16 defined the arithmetic uses half2 intrinsics, otherwise plain float math.
+ static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
265
  const block_q4_1 * x = (const block_q4_1 *) vx;
266
 
267
+ const dfloat d = x[ib].d;
268
+ const dfloat m = x[ib].m;
269
 
270
+ const int vui = x[ib].qs[iqs];
271
 
272
+ v.x = vui & 0xF;
273
+ v.y = vui >> 4;
274
 
275
+ #ifdef GGML_CUDA_DMMV_F16
276
+ v = __hmul2(v, {d, d});
277
+ v = __hadd2(v, {m, m});
278
+ #else
279
+ v.x = (v.x * d) + m;
280
+ v.y = (v.y * d) + m;
281
+ #endif // GGML_CUDA_DMMV_F16
282
  }
283
 
284
// Dequantize the two 5-bit values associated with byte `iqs` of Q5_0 block `ib`.
//   vx  : base pointer, reinterpreted as an array of block_q5_0
//   ib  : block index into that array
//   iqs : byte index into the block's qs[] array
//   v   : output pair
// The low four bits of each value come from the nibbles of qs[iqs]; the fifth bit comes
// from the 32-bit qh word: bit `iqs` for v.x and bit `iqs + 16` for v.y, both placed at
// bit position 4 (mask 0x10). Each value is then computed as (q - 16) * d.
// With GGML_CUDA_DMMV_F16 defined the arithmetic uses half2 intrinsics, otherwise plain float math.
+ static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
285
  const block_q5_0 * x = (const block_q5_0 *) vx;
286
 
287
+ const dfloat d = x[ib].d;
288
 
289
  uint32_t qh;
290
  memcpy(&qh, x[ib].qh, sizeof(qh));
291
 
292
+ const int xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10;
293
+ const int xh_1 = ((qh >> (iqs + 12)) ) & 0x10;
294
 
295
+ v.x = ((x[ib].qs[iqs] & 0xf) | xh_0);
296
+ v.y = ((x[ib].qs[iqs] >> 4) | xh_1);
297
 
298
+ #ifdef GGML_CUDA_DMMV_F16
299
+ v = __hsub2(v, {16.0f, 16.0f});
300
+ v = __hmul2(v, {d, d});
301
+ #else
302
+ v.x = (v.x - 16.0f) * d;
303
+ v.y = (v.y - 16.0f) * d;
304
+ #endif // GGML_CUDA_DMMV_F16
305
  }
306
 
307
// Dequantize the two 5-bit values associated with byte `iqs` of Q5_1 block `ib`.
//   vx  : base pointer, reinterpreted as an array of block_q5_1
//   ib  : block index into that array
//   iqs : byte index into the block's qs[] array
//   v   : output pair
// The low four bits of each value come from the nibbles of qs[iqs]; the fifth bit comes
// from the 32-bit qh word: bit `iqs` for v.x and bit `iqs + 16` for v.y, both placed at
// bit position 4 (mask 0x10). Each value is then computed as q * d + m.
// With GGML_CUDA_DMMV_F16 defined the arithmetic uses half2 intrinsics, otherwise plain float math.
+ static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
308
  const block_q5_1 * x = (const block_q5_1 *) vx;
309
 
310
+ const dfloat d = x[ib].d;
311
+ const dfloat m = x[ib].m;
312
 
313
  uint32_t qh;
314
  memcpy(&qh, x[ib].qh, sizeof(qh));
315
 
316
+ const int xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10;
317
+ const int xh_1 = ((qh >> (iqs + 12)) ) & 0x10;
318
 
319
+ v.x = ((x[ib].qs[iqs] & 0xf) | xh_0);
320
+ v.y = ((x[ib].qs[iqs] >> 4) | xh_1);
321
 
322
+ #ifdef GGML_CUDA_DMMV_F16
323
+ v = __hmul2(v, {d, d});
324
+ v = __hadd2(v, {m, m});
325
+ #else
326
+ v.x = (v.x * d) + m;
327
+ v.y = (v.y * d) + m;
328
+ #endif // GGML_CUDA_DMMV_F16
329
  }
330
 
331
// Dequantize two consecutive values of Q8_0 block `ib`.
//   vx  : base pointer, reinterpreted as an array of block_q8_0
//   ib  : block index into that array
//   iqs : index of the first of the two qs[] entries to read
//   v   : output pair; v.x = qs[iqs] * d, v.y = qs[iqs + 1] * d
// With GGML_CUDA_DMMV_F16 defined the scaling uses the half2 multiply intrinsic, otherwise plain float math.
+ static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
332
  const block_q8_0 * x = (const block_q8_0 *) vx;
333
 
334
+ const dfloat d = x[ib].d;
335
 
336
+ v.x = x[ib].qs[iqs + 0];
337
+ v.y = x[ib].qs[iqs + 1];
338
 
339
+ #ifdef GGML_CUDA_DMMV_F16
340
+ v = __hmul2(v, {d, d});
341
+ #else
342
+ v.x *= d;
343
+ v.y *= d;
344
+ #endif // GGML_CUDA_DMMV_F16
345
  }
346
 
347
  //================================== k-quants
 
515
 
516
  const block_q2_K * x = (const block_q2_K *)vx + ib0;
517
 
518
+ const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...15
519
+ const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0,1
520
 
521
  const int step = 16/K_QUANTS_PER_ITERATION;
522
 
523
+ const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
524
+ const int in = tid - step*im; // 0...15 or 0...7
525
 
526
+ const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15 or 0...14 in steps of 2
527
  const int q_offset = 32*im + l0;
528
  const int s_offset = 8*im;
529
  const int y_offset = 128*im + l0;
 
578
  }
579
  }
580
 
581
+ static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
582
 
583
  const uint16_t kmask1 = 0x0303;
584
  const uint16_t kmask2 = 0x0f0f;
585
 
586
+ const int row = blockIdx.y*blockDim.y + threadIdx.y;
587
+ if (row > nrows) return;
588
+
589
  const int num_blocks_per_row = ncols / QK_K;
590
  const int ib0 = row*num_blocks_per_row;
591
 
592
  const block_q3_K * x = (const block_q3_K *)vx + ib0;
593
 
594
+ const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...15
595
+ const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0,1
596
 
597
+ const int n = K_QUANTS_PER_ITERATION; // iterations in the inner loop
598
+ const int step = 16/K_QUANTS_PER_ITERATION;
599
+ const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
600
+ const int in = tid - step*im; // 0....15 or 0...7
601
 
602
  const uint8_t m = 1 << (4*im);
603
 
604
+ const int l0 = n*in; // 0...15 or 0...14 in steps of 2
605
  const int q_offset = 32*im + l0;
606
  const int y_offset = 128*im + l0;
607
 
 
612
 
613
  float tmp = 0; // partial sum for thread in warp
614
 
615
+ for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
616
 
617
  const float * y = yy + i * QK_K + y_offset;
618
  const uint8_t * q = x[i].qs + q_offset;
 
653
  }
654
  }
655
 
656
+ static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
657
 
658
  const uint16_t kmask1 = 0x3f3f;
659
  const uint16_t kmask2 = 0x0f0f;
660
  const uint16_t kmask3 = 0xc0c0;
661
 
662
+ const int row = blockIdx.y*blockDim.y + threadIdx.y;
663
+ if (row > nrows) return;
664
  const int num_blocks_per_row = ncols / QK_K;
665
  const int ib0 = row*num_blocks_per_row;
666
 
667
+ const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...15
668
+ const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0,1
669
 
670
+ const int step = 8/K_QUANTS_PER_ITERATION; // 8 or 4
671
+
672
+ const int il = tid/step; // 0...3
673
+ const int ir = tid - step*il; // 0...7 or 0...3
674
+ const int n = 2 * K_QUANTS_PER_ITERATION; // 2 or 4
675
 
676
  const int im = il/2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
677
  const int in = il%2;
 
687
 
688
  float tmp = 0; // partial sum for thread in warp
689
 
690
+ for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
691
 
692
  const uint8_t * q1 = x[i].qs + q_offset;
693
  const uint8_t * q2 = q1 + 64;
 
742
 
743
  const int il = tid/4; // 0...3
744
  const int ir = tid - 4*il;// 0...3
745
+ const int n = 2;
746
 
747
  const int im = il/2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
748
  const int in = il%2;
 
781
  float4 sum = {0.f, 0.f, 0.f, 0.f};
782
  float smin = 0;
783
  for (int l = 0; l < n; ++l) {
784
+ sum.x += y1[l+ 0] * ((ql1[l+ 0] & 0xF) + (qh[l+ 0] & (hm1 << 0) ? 16 : 0))
785
+ + y1[l+16] * ((ql1[l+16] & 0xF) + (qh[l+16] & (hm1 << 0) ? 16 : 0));
786
+ sum.y += y1[l+32] * ((ql1[l+ 0] >> 4) + (qh[l+ 0] & (hm1 << 1) ? 16 : 0))
787
+ + y1[l+48] * ((ql1[l+16] >> 4) + (qh[l+16] & (hm1 << 1) ? 16 : 0));
788
+ sum.z += y2[l+ 0] * ((ql2[l+ 0] & 0xF) + (qh[l+ 0] & (hm2 << 0) ? 16 : 0))
789
+ + y2[l+16] * ((ql2[l+16] & 0xF) + (qh[l+16] & (hm2 << 0) ? 16 : 0));
790
+ sum.w += y2[l+32] * ((ql2[l+ 0] >> 4) + (qh[l+ 0] & (hm2 << 1) ? 16 : 0))
791
+ + y2[l+48] * ((ql2[l+16] >> 4) + (qh[l+16] & (hm2 << 1) ? 16 : 0));
792
+ smin += (y1[l] + y1[l+16]) * sc[2] + (y1[l+32] + y1[l+48]) * sc[3]
793
+ + (y2[l] + y2[l+16]) * sc[6] + (y2[l+32] + y2[l+48]) * sc[7];
794
  }
795
  tmp += dall * (sum.x * sc[0] + sum.y * sc[1] + sum.z * sc[4] + sum.w * sc[5]) - dmin * smin;
796
 
 
886
  }
887
  }
888
 
889
// "Dequantize" callback for unquantized f16 data: it has the same signature as the
// dequantize_q* functions so it can be passed as a dequantize_kernel_t template argument.
//   vx  : base pointer, reinterpreted as an array of half
//   ib  : first part of the element offset
//   iqs : second part of the element offset
//   v   : output pair; receives elements [ib + iqs] and [ib + iqs + 1]
+ static __device__ void convert_f16(const void * vx, const int ib, const int iqs, dfloat2 & v){
890
  const half * x = (const half *) vx;
891
 
892
+ // automatic half -> float type cast if dfloat == float
893
+ v.x = x[ib + iqs + 0];
894
+ v.y = x[ib + iqs + 1];
895
  }
896
 
897
  template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
 
908
  const int y_offset = qr == 1 ? 1 : qk/2;
909
 
910
  // dequantize
911
+ dfloat2 v;
912
+ dequantize_kernel(vx, ib, iqs, v);
913
+
914
+ y[iybs + iqs + 0] = v.x;
915
+ y[iybs + iqs + y_offset] = v.y;
916
  }
917
 
918
  template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
919
+ static __global__ void dequantize_mul_mat_vec(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows) {
920
  // qk = quantized weights per x block
921
  // qr = number of quantized weights per data value in x block
922
  const int row = blockIdx.y*blockDim.y + threadIdx.y;
 
931
  const int vals_per_iter = iter_stride / WARP_SIZE; // num quantized vals per thread and i iter
932
  const int y_offset = qr == 1 ? 1 : qk/2;
933
 
934
+ // partial sum for each thread
935
+ #ifdef GGML_CUDA_DMMV_F16
936
+ half2 tmp = {0.0f, 0.0f}; // two sums for f16 to take advantage of half2 intrinsics
937
+ #else
938
+ float tmp = 0.0f;
939
+ #endif // GGML_CUDA_DMMV_F16
940
 
941
  for (int i = 0; i < ncols; i += iter_stride) {
942
  const int col = i + vals_per_iter*tid;
 
950
  // process 2 vals per j iter
951
 
952
  // dequantize
 
 
953
  // for qr = 2 the iqs needs to increase by 1 per j iter because 2 weights per data val
954
+ dfloat2 v;
955
+ dequantize_kernel(vx, ib, iqs + j/qr, v);
956
 
957
  // matrix multiplication
 
 
958
  // for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2
959
+ #ifdef GGML_CUDA_DMMV_F16
960
+ tmp += __hmul2(v, {
961
+ y[iybs + iqs + j/qr + 0],
962
+ y[iybs + iqs + j/qr + y_offset]
963
+ });
964
+ #else
965
+ tmp += v.x * y[iybs + iqs + j/qr + 0];
966
+ tmp += v.y * y[iybs + iqs + j/qr + y_offset];
967
+ #endif // GGML_CUDA_DMMV_F16
968
  }
969
  }
970
 
 
976
  }
977
 
978
  if (tid == 0) {
979
+ #ifdef GGML_CUDA_DMMV_F16
980
+ dst[row] = tmp.x + tmp.y;
981
+ #else
982
  dst[row] = tmp;
983
+ #endif // GGML_CUDA_DMMV_F16
984
  }
985
  }
986
 
 
1275
  dequantize_block_q6_K<<<nb, 64, 0, stream>>>(vx, y);
1276
  }
1277
 
1278
+ static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1279
  GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
1280
  const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
1281
  const dim3 block_nums(1, block_num_y, 1);
 
1284
  <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
1285
  }
1286
 
1287
+ static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1288
  GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
1289
  const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
1290
  const dim3 block_nums(1, block_num_y, 1);
 
1293
  <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
1294
  }
1295
 
1296
+ static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1297
  GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
1298
  const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
1299
  const dim3 block_nums(1, block_num_y, 1);
 
1302
  <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
1303
  }
1304
 
1305
+ static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1306
  GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
1307
  const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
1308
  const dim3 block_nums(1, block_num_y, 1);
 
1311
  <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
1312
  }
1313
 
1314
+ static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1315
  GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
1316
  const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
1317
  const dim3 block_nums(1, block_num_y, 1);
 
1322
 
1323
  static void dequantize_mul_mat_vec_q2_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1324
  GGML_ASSERT(ncols % QK_K == 0);
1325
+ const int ny = 2; // very slightly faster than 1 even when K_QUANTS_PER_ITERATION = 2
1326
  const int block_num_y = (nrows + ny - 1) / ny;
1327
  const dim3 block_nums(1, block_num_y, 1);
1328
  const dim3 block_dims(32, ny, 1);
 
1331
 
1332
  // Host-side launcher for the dequantize_mul_mat_vec_q3_k kernel on `stream`.
  //   vx    : quantized matrix data, passed through to the kernel
  //   y     : float input vector, passed through to the kernel
  //   dst   : float output, passed through to the kernel
  //   ncols : number of columns; must be a multiple of QK_K (asserted)
  //   nrows : number of rows to process
  // Launch shape: blocks of 32 x ny threads, with ceil(nrows / ny) blocks along y.
  // ny = 2 / K_QUANTS_PER_ITERATION, i.e. 2 rows per block when the macro is 1 and 1 row when it is 2.
  // NOTE(review): a K_QUANTS_PER_ITERATION greater than 2 makes ny 0 and the block_num_y division divides by zero.
  // NOTE(review): with ny == 2 and an odd nrows the last block has a thread with row == nrows; the kernel's
  // guard is `if (row > nrows) return;`, which lets that row through - confirm whether it should be `>=`.
  static void dequantize_mul_mat_vec_q3_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1333
  GGML_ASSERT(ncols % QK_K == 0);
1334
+ const int ny = 2 / K_QUANTS_PER_ITERATION;
1335
+ const int block_num_y = (nrows + ny - 1) / ny;
1336
+ const dim3 block_nums(1, block_num_y, 1);
1337
+ const dim3 block_dims(32, ny, 1);
1338
+ dequantize_mul_mat_vec_q3_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
1339
  }
1340
 
1341
  // Host-side launcher for the dequantize_mul_mat_vec_q4_k kernel on `stream`.
  //   vx    : quantized matrix data, passed through to the kernel
  //   y     : float input vector, passed through to the kernel
  //   dst   : float output, passed through to the kernel
  //   ncols : number of columns; must be a multiple of QK_K (asserted)
  //   nrows : number of rows to process
  // Launch shape: blocks of 32 x ny threads, with ceil(nrows / ny) blocks along y.
  // ny = 2 / K_QUANTS_PER_ITERATION, i.e. 2 rows per block when the macro is 1 and 1 row when it is 2.
  // NOTE(review): a K_QUANTS_PER_ITERATION greater than 2 makes ny 0 and the block_num_y division divides by zero.
  // NOTE(review): with ny == 2 and an odd nrows the last block has a thread with row == nrows; the kernel's
  // guard is `if (row > nrows) return;`, which lets that row through - confirm whether it should be `>=`.
  static void dequantize_mul_mat_vec_q4_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1342
  GGML_ASSERT(ncols % QK_K == 0);
1343
+ const int ny = 2 / K_QUANTS_PER_ITERATION;
1344
+ const int block_num_y = (nrows + ny - 1) / ny;
1345
+ const dim3 block_nums(1, block_num_y, 1);
1346
+ const dim3 block_dims(32, ny, 1);
1347
+ dequantize_mul_mat_vec_q4_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
1348
  }
1349
 
1350
  static void dequantize_mul_mat_vec_q5_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
 
1367
  dequantize_block<1, 1, convert_f16><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
1368
  }
1369
 
1370
+ static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1371
  GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
1372
  const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
1373
  const dim3 block_nums(1, block_num_y, 1);
 
1535
  static size_t g_scratch_size = 1024*1024*1024; // 1 GB by default
1536
  static size_t g_scratch_offset = 0;
1537
 
 
 
 
1538
  static int g_device_count = -1;
1539
  static int g_main_device = 0;
1540
  static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
1541
 
1542
  static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
1543
 
1544
+ static cudaStream_t g_cudaStreams_main[GGML_CUDA_MAX_DEVICES] = { nullptr };
 
 
 
1545
 
1546
  void ggml_init_cublas() {
1547
  static bool initialized = false;
 
1565
  for (int id = 0; id < g_device_count; ++id) {
1566
  CUDA_CHECK(cudaSetDevice(id));
1567
 
1568
+ // create main stream
1569
+ CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams_main[id], cudaStreamNonBlocking));
 
 
 
 
 
 
 
1570
 
1571
  // create cublas handle
1572
  CUBLAS_CHECK(cublasCreate(&g_cublas_handles[id]));
 
1782
  const int64_t ne00 = src0->ne[0];
1783
  const int64_t nrows = i01_high - i01_low;
1784
 
1785
+ // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
1786
+ #ifdef GGML_CUDA_DMMV_F16
1787
+ size_t ash;
1788
+ dfloat * src1_dfloat = nullptr; // dfloat == half
1789
+
1790
+ bool src1_convert_f16 = src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
1791
+ src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 ||
1792
+ src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;
1793
+
1794
+ if (src1_convert_f16) {
1795
+ src1_dfloat = (half *) ggml_cuda_pool_malloc(ne00*sizeof(half), &ash);
1796
+ ggml_cpy_f32_f16_cuda((char *) src1_ddf_i, (char *) src1_dfloat, ne00,
1797
+ ne00, 1, sizeof(float), 0, 0,
1798
+ ne00, 1, sizeof(half), 0, 0, cudaStream_main);
1799
+ }
1800
+ #else
1801
+ dfloat * src1_dfloat = src1_ddf_i; // dfloat == float, no conversion
1802
+ #endif // GGML_CUDA_DMMV_F16
1803
+
1804
  switch (src0->type) {
1805
  case GGML_TYPE_Q4_0:
1806
+ dequantize_mul_mat_vec_q4_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
1807
  break;
1808
  case GGML_TYPE_Q4_1:
1809
+ dequantize_mul_mat_vec_q4_1_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
1810
  break;
1811
  case GGML_TYPE_Q5_0:
1812
+ dequantize_mul_mat_vec_q5_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
1813
  break;
1814
  case GGML_TYPE_Q5_1:
1815
+ dequantize_mul_mat_vec_q5_1_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
1816
  break;
1817
  case GGML_TYPE_Q8_0:
1818
+ dequantize_mul_mat_vec_q8_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
1819
  break;
1820
  case GGML_TYPE_Q2_K:
1821
  dequantize_mul_mat_vec_q2_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
 
1833
  dequantize_mul_mat_vec_q6_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
1834
  break;
1835
  case GGML_TYPE_F16:
1836
+ convert_mul_mat_vec_f16_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
1837
  break;
1838
  default:
1839
  GGML_ASSERT(false);
 
1841
  }
1842
  CUDA_CHECK(cudaGetLastError());
1843
 
1844
+ #ifdef GGML_CUDA_DMMV_F16
1845
+ if (src1_convert_f16) {
1846
+ ggml_cuda_pool_free(src1_dfloat, ash);
1847
+ }
1848
+ #endif // GGML_CUDA_DMMV_F16
1849
+
1850
  (void) src1;
1851
  (void) dst;
1852
  (void) src0_ddf_i;
 
2058
  size_t src1_asf[GGML_CUDA_MAX_DEVICES] = {0};
2059
  size_t dst_asf[GGML_CUDA_MAX_DEVICES] = {0};
2060
 
2061
+ // if multiple GPUs are used they need to wait for the main GPU to finish
2062
+ if (split && g_device_count > 1) {
2063
+ CUDA_CHECK(cudaSetDevice(g_main_device));
2064
+ CUDA_CHECK(cudaDeviceSynchronize());
2065
+ }
2066
+
2067
  for (int id = 0; id < g_device_count; ++id) {
2068
  if (!split && id != g_main_device) {
2069
  continue;
 
2162
  }
2163
  const int64_t i11 = i13*ne12 + i12;
2164
 
2165
+ cudaStream_t cudaStream_main = g_cudaStreams_main[id];
 
 
2166
 
2167
  // for split tensors the data begins at i0 == i0_offset_low
2168
  char * src0_ddq_i = src0_ddq[id] + (i0 - i0_offset_low)*src0_stride*src0_ts/src0_bs;
 
2190
  if (src1->backend == GGML_BACKEND_CPU) {
2191
  GGML_ASSERT(!flatten_rows || nrows0 == ggml_nrows(src1));
2192
  int64_t nrows1 = flatten_rows ? nrows0 : ne11;
2193
+ CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf_i, src1, i03, i02, 0, nrows1, cudaStream_main));
2194
  } else if (src1->backend == GGML_BACKEND_GPU && src1_is_contiguous) {
2195
  if (id != g_main_device) {
2196
  GGML_ASSERT(!flatten_rows);
2197
  float * src1_ddf_i_source = (float *) src1_extra->data_device[g_main_device];
2198
  src1_ddf_i_source += i11*src1_stride;
2199
  CUDA_CHECK(cudaMemcpyAsync(src1_ddf_i, src1_ddf_i_source, src1_stride*sizeof(float),
2200
+ cudaMemcpyDeviceToDevice, cudaStream_main));
2201
  }
2202
  } else if (src1_on_device && !src1_is_contiguous) {
2203
  GGML_ASSERT(!split);
 
2206
  GGML_ASSERT(false);
2207
  }
2208
  }
 
2209
 
2210
  if (!src0_on_device || !src0_is_contiguous) {
2211
  if (src0_is_f32) {
 
2221
  CUDA_CHECK(cudaGetLastError());
2222
  }
2223
 
 
 
 
2224
  // do the computation
2225
  op(src0, src1, dst, src0_ddq_i, src0_ddf_i, src1_ddf_i, dst_ddf_i, i02, i01_low, i01_high, i11, cudaStream_main);
2226
 
 
2258
 
2259
  // wait until each device is finished, then free their buffers
2260
  for (int id = 0; id < g_device_count; ++id) {
2261
+ if (src0_asq[id] == 0 && src0_asf[id] == 0 && src1_asf[id] == 0 && dst_asf[id] == 0) {
2262
+ continue;
2263
+ }
2264
+
2265
  CUDA_CHECK(cudaSetDevice(id));
2266
  CUDA_CHECK(cudaDeviceSynchronize());
2267
+
2268
  if (src0_asq[id] > 0) {
2269
  ggml_cuda_pool_free(src0_ddq[id], src0_asq[id]);
2270
  }
 
2330
  const int64_t ne02 = src0->ne[2];
2331
 
2332
  CUDA_CHECK(cudaSetDevice(g_main_device));
2333
+ cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device];
2334
 
2335
  struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
2336
  void * src0_ddq = src0_extra->data_device[g_main_device];
 
2342
  float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
2343
 
2344
  ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, cudaStream_main);
 
 
2345
  }
2346
 
2347
  void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
 
2359
  const int64_t nb02 = src0->nb[2];
2360
 
2361
  CUDA_CHECK(cudaSetDevice(g_main_device));
2362
+ cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device];
2363
 
2364
  struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
2365
  void * src0_ddq = src0_extra->data_device[g_main_device];
 
2374
  const int channel_stride_x = nb02 / sizeof(half);
2375
 
2376
  ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, channel_stride_x, cudaStream_main);
 
 
2377
  }
2378
 
2379
  void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 
2429
  const int64_t nb12 = src1->nb[2];
2430
 
2431
  CUDA_CHECK(cudaSetDevice(g_main_device));
2432
+ cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device];
2433
 
2434
  const struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
2435
  const struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
 
2447
  GGML_ASSERT(false);
2448
  }
2449
 
 
 
2450
  (void) dst;
2451
  }
2452
 
 
2635
  bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor){
2636
  ggml_cuda_func_t func;
2637
  const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
2638
+ || (tensor->src0 != nullptr && (tensor->src0->backend == GGML_BACKEND_GPU || tensor->src0->backend == GGML_BACKEND_GPU_SPLIT))
2639
  || (tensor->src1 != nullptr && tensor->src1->backend == GGML_BACKEND_GPU);
2640
 
2641
  switch (tensor->op) {
ggml-metal.h CHANGED
@@ -41,12 +41,15 @@ void ggml_metal_free(struct ggml_metal_context * ctx);
41
  // - make sure to map all buffers used in the graph before calling ggml_metal_graph_compute
42
  // - the mapping is used during computation to determine the arguments of the compute kernels
43
  // - you don't need to keep the host memory buffer allocated as it is never accessed by Metal
 
 
44
  //
45
  bool ggml_metal_add_buffer(
46
  struct ggml_metal_context * ctx,
47
  const char * name,
48
  void * data,
49
- size_t size);
 
50
 
51
  // set data from host memory into the device
52
  void ggml_metal_set_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
 
41
  // - make sure to map all buffers used in the graph before calling ggml_metal_graph_compute
42
  // - the mapping is used during computation to determine the arguments of the compute kernels
43
  // - you don't need to keep the host memory buffer allocated as it is never accessed by Metal
44
+ // - max_size specifies the maximum size of a tensor and is used to create shared views such
45
+ // that it is guaranteed that the tensor will fit in at least one of the views
46
  //
47
  bool ggml_metal_add_buffer(
48
  struct ggml_metal_context * ctx,
49
  const char * name,
50
  void * data,
51
+ size_t size,
52
+ size_t max_size);
53
 
54
  // set data from host memory into the device
55
  void ggml_metal_set_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
ggml-metal.m CHANGED
@@ -183,6 +183,14 @@ struct ggml_metal_context * ggml_metal_init(void) {
183
  #undef GGML_METAL_ADD_KERNEL
184
  }
185
 
 
 
 
 
 
 
 
 
186
  return ctx;
187
  }
188
 
@@ -199,10 +207,13 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
199
  static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_metal_context * ctx, struct ggml_tensor * t, size_t * offs) {
200
  //fprintf(stderr, "%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach);
201
 
 
 
 
202
  for (int i = 0; i < ctx->n_buffers; ++i) {
203
  const int64_t ioffs = (int64_t) t->data - (int64_t) ctx->buffers[i].data;
204
 
205
- if (ioffs >= 0 && ioffs < (int64_t) ctx->buffers[i].size) {
206
  *offs = (size_t) ioffs;
207
 
208
  //fprintf(stderr, "%s: '%s' tensor '%16s', offs = %8ld\n", __func__, ctx->buffers[i].name, t->name, *offs);
@@ -220,7 +231,8 @@ bool ggml_metal_add_buffer(
220
  struct ggml_metal_context * ctx,
221
  const char * name,
222
  void * data,
223
- size_t size) {
 
224
  if (ctx->n_buffers >= GGML_METAL_MAX_BUFFERS) {
225
  fprintf(stderr, "%s: too many buffers\n", __func__);
226
  return false;
@@ -237,30 +249,68 @@ bool ggml_metal_add_buffer(
237
  }
238
  }
239
 
240
- size_t page_size = getpagesize();
241
- size_t aligned_size = size;
242
- if ((aligned_size % page_size) != 0) {
243
- aligned_size += (page_size - (aligned_size % page_size));
 
244
  }
245
 
246
- ctx->buffers[ctx->n_buffers].name = name;
247
- ctx->buffers[ctx->n_buffers].data = data;
248
- ctx->buffers[ctx->n_buffers].size = size;
 
 
249
 
250
- if (ctx->device.maxBufferLength < aligned_size) {
251
- fprintf(stderr, "%s: buffer '%s' size %zu is larger than buffer maximum of %zu\n", __func__, name, aligned_size, ctx->device.maxBufferLength);
252
- return false;
253
- }
254
- ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:data length:aligned_size options:MTLResourceStorageModeShared deallocator:nil];
 
255
 
256
- if (ctx->buffers[ctx->n_buffers].metal == nil) {
257
- fprintf(stderr, "%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, aligned_size / 1024.0 / 1024.0);
258
- return false;
259
  } else {
260
- fprintf(stderr, "%s: allocated '%-16s' buffer, size = %8.2f MB\n", __func__, name, aligned_size / 1024.0 / 1024.0);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
261
  }
262
 
263
- ++ctx->n_buffers;
 
 
 
 
 
 
 
 
264
  }
265
 
266
  return true;
@@ -765,18 +815,23 @@ void ggml_metal_graph_compute(
765
  } break;
766
  case GGML_OP_ALIBI:
767
  {
 
 
 
 
768
  GGML_ASSERT((src0t == GGML_TYPE_F32));
769
- const int n_past = ((int32_t *) src1->data)[0];
 
770
  const int n_head = ((int32_t *) src1->data)[1];
771
  const float max_bias = ((float *) src1->data)[2];
 
772
  if (__builtin_popcount(n_head) != 1) {
773
  GGML_ASSERT(false && "only power-of-two n_head implemented");
774
  }
 
775
  const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
776
  const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
777
- if (encoder == nil) {
778
- encoder = [command_buffer computeCommandEncoder];
779
- }
780
  [encoder setComputePipelineState:ctx->pipeline_alibi_f32];
781
  [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
782
  [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
@@ -904,4 +959,14 @@ void ggml_metal_graph_compute(
904
  dispatch_barrier_sync(queue, ^{});
905
 
906
  [command_buffers[n_cb - 1] waitUntilCompleted];
 
 
 
 
 
 
 
 
 
 
907
  }
 
183
  #undef GGML_METAL_ADD_KERNEL
184
  }
185
 
186
+ fprintf(stderr, "%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
187
+ fprintf(stderr, "%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
188
+ if (ctx->device.maxTransferRate != 0) {
189
+ fprintf(stderr, "%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0);
190
+ } else {
191
+ fprintf(stderr, "%s: maxTransferRate = built-in GPU\n", __func__);
192
+ }
193
+
194
  return ctx;
195
  }
196
 
 
207
  static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_metal_context * ctx, struct ggml_tensor * t, size_t * offs) {
208
  //fprintf(stderr, "%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach);
209
 
210
+ const int64_t tsize = ggml_nbytes(t);
211
+
212
+ // find the view that contains the tensor fully
213
  for (int i = 0; i < ctx->n_buffers; ++i) {
214
  const int64_t ioffs = (int64_t) t->data - (int64_t) ctx->buffers[i].data;
215
 
216
+ if (ioffs >= 0 && ioffs + tsize <= (int64_t) ctx->buffers[i].size) {
217
  *offs = (size_t) ioffs;
218
 
219
  //fprintf(stderr, "%s: '%s' tensor '%16s', offs = %8ld\n", __func__, ctx->buffers[i].name, t->name, *offs);
 
231
  struct ggml_metal_context * ctx,
232
  const char * name,
233
  void * data,
234
+ size_t size,
235
+ size_t max_size) {
236
  if (ctx->n_buffers >= GGML_METAL_MAX_BUFFERS) {
237
  fprintf(stderr, "%s: too many buffers\n", __func__);
238
  return false;
 
249
  }
250
  }
251
 
252
+ const size_t size_page = getpagesize();
253
+
254
+ size_t size_aligned = size;
255
+ if ((size_aligned % size_page) != 0) {
256
+ size_aligned += (size_page - (size_aligned % size_page));
257
  }
258
 
259
+ // the buffer fits into the max buffer size allowed by the device
260
+ if (size_aligned <= ctx->device.maxBufferLength) {
261
+ ctx->buffers[ctx->n_buffers].name = name;
262
+ ctx->buffers[ctx->n_buffers].data = data;
263
+ ctx->buffers[ctx->n_buffers].size = size;
264
 
265
+ ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil];
266
+
267
+ if (ctx->buffers[ctx->n_buffers].metal == nil) {
268
+ fprintf(stderr, "%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_aligned / 1024.0 / 1024.0);
269
+ return false;
270
+ }
271
 
272
+ fprintf(stderr, "%s: allocated '%-16s' buffer, size = %8.2f MB", __func__, name, size_aligned / 1024.0 / 1024.0);
273
+
274
+ ++ctx->n_buffers;
275
  } else {
276
+ // this overlap between the views will guarantee that the tensor with the maximum size will fully fit into
277
+ // one of the views
278
+ const size_t size_ovlp = ((max_size + size_page - 1) / size_page + 1) * size_page; // round max_size up to a whole page, plus one extra page just in case
279
+ const size_t size_step = ctx->device.maxBufferLength - size_ovlp;
280
+ const size_t size_view = ctx->device.maxBufferLength;
281
+
282
+ for (size_t i = 0; i < size; i += size_step) {
283
+ const size_t size_step_aligned = (i + size_view <= size) ? size_view : (size_aligned - i);
284
+
285
+ ctx->buffers[ctx->n_buffers].name = name;
286
+ ctx->buffers[ctx->n_buffers].data = (void *) ((uint8_t *) data + i);
287
+ ctx->buffers[ctx->n_buffers].size = size_step_aligned;
288
+
289
+ ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil];
290
+
291
+ if (ctx->buffers[ctx->n_buffers].metal == nil) {
292
+ fprintf(stderr, "%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_step_aligned / 1024.0 / 1024.0);
293
+ return false;
294
+ }
295
+
296
+ fprintf(stderr, "%s: allocated '%-16s' buffer, size = %8.2f MB, offs = %12ld", __func__, name, size_step_aligned / 1024.0 / 1024.0, i);
297
+ if (i + size_step < size) {
298
+ fprintf(stderr, "\n");
299
+ }
300
+
301
+ ++ctx->n_buffers;
302
+ }
303
  }
304
 
305
+ fprintf(stderr, ", (%8.2f / %8.2f)",
306
+ ctx->device.currentAllocatedSize / 1024.0 / 1024.0,
307
+ ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
308
+
309
+ if (ctx->device.currentAllocatedSize > ctx->device.recommendedMaxWorkingSetSize) {
310
+ fprintf(stderr, ", warning: current allocated size is greater than the recommended max working set size\n");
311
+ } else {
312
+ fprintf(stderr, "\n");
313
+ }
314
  }
315
 
316
  return true;
 
815
  } break;
816
  case GGML_OP_ALIBI:
817
  {
818
+ if (encoder == nil) {
819
+ encoder = [command_buffer computeCommandEncoder];
820
+ }
821
+
822
  GGML_ASSERT((src0t == GGML_TYPE_F32));
823
+
824
+ const int n_past = ((int32_t *) src1->data)[0]; UNUSED(n_past);
825
  const int n_head = ((int32_t *) src1->data)[1];
826
  const float max_bias = ((float *) src1->data)[2];
827
+
828
  if (__builtin_popcount(n_head) != 1) {
829
  GGML_ASSERT(false && "only power-of-two n_head implemented");
830
  }
831
+
832
  const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
833
  const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
834
+
 
 
835
  [encoder setComputePipelineState:ctx->pipeline_alibi_f32];
836
  [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
837
  [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
 
959
  dispatch_barrier_sync(queue, ^{});
960
 
961
  [command_buffers[n_cb - 1] waitUntilCompleted];
962
+
963
+ // check status of command buffers
964
+ // needed to detect if the device ran out-of-memory for example (#1881)
965
+ for (int i = 0; i < n_cb; i++) {
966
+ MTLCommandBufferStatus status = (MTLCommandBufferStatus) [command_buffers[i] status];
967
+ if (status != MTLCommandBufferStatusCompleted) {
968
+ fprintf(stderr, "%s: command buffer %d failed with status %lu\n", __func__, i, status);
969
+ GGML_ASSERT(false);
970
+ }
971
+ }
972
  }
ggml-opencl.cpp CHANGED
@@ -16,13 +16,25 @@
16
 
17
  #include "ggml.h"
18
 
 
 
 
 
19
  #define CL_DMMV_BLOCK_SIZE 32
20
 
 
 
 
 
 
 
21
  #define MULTILINE_QUOTE(...) #__VA_ARGS__
22
  static std::string program_source = MULTILINE_QUOTE(
23
 
24
  typedef char int8_t;
25
  typedef uchar uint8_t;
 
 
26
  typedef int int32_t;
27
  typedef uint uint32_t;
28
 
@@ -172,7 +184,9 @@ void convert_f16(__global half* x, const int ib, const int iqs, float* v0, float
172
  *v0 = vload_half(0, &x[ib + 0]);
173
  *v1 = vload_half(0, &x[ib + 1]);
174
  }
 
175
 
 
176
  inline void get_scale_min_k4(int j, const __global uint8_t *q, uint8_t *d, uint8_t *m)
177
  {
178
  if (j < 4)
@@ -196,7 +210,7 @@ __kernel void dequantize_block_q2_K(__global const struct block_q2_K *x, __globa
196
  const int is = 8 * n + l / 16;
197
 
198
  const uint8_t q = x[i].qs[32 * n + l];
199
- __global float *y = yy + i * 256 + 128 * n;
200
 
201
  const float dall = vload_half(0, &x[i].d);
202
  const float dmin = vload_half(0, &x[i].dmin);
@@ -228,7 +242,7 @@ __kernel void dequantize_block_q3_K(__global const struct block_q3_K *x, __globa
228
  float d_all = vload_half(0, &x[i].d);
229
  float dl = d_all * (us - 32);
230
 
231
- __global float *y = yy + i * 256 + 128 * n + 32 * j;
232
  const __global uint8_t *q = x[i].qs + 32 * n;
233
  const __global uint8_t *hm = x[i].hmask;
234
 
@@ -245,7 +259,7 @@ __kernel void dequantize_block_q4_K(__global const struct block_q4_K *x, __globa
245
  const int is = 2 * il;
246
  const int n = 4;
247
 
248
- __global float *y = yy + i * 256 + 64 * il + n * ir;
249
 
250
  const float dall = vload_half(0, &x[i].d);
251
  const float dmin = vload_half(0, &x[i].dmin);
@@ -274,7 +288,7 @@ __kernel void dequantize_block_q5_K(__global const struct block_q5_K *x, __globa
274
  const int ir = tid % 16;
275
  const int is = 2 * il;
276
 
277
- __global float *y = yy + i * 256 + 64 * il + 2 * ir;
278
 
279
  const float dall = vload_half(0, &x[i].d);
280
  const float dmin = vload_half(0, &x[i].dmin);
@@ -306,7 +320,7 @@ __kernel void dequantize_block_q6_K(__global const struct block_q6_K *x, __globa
306
  const int il = tid - 32 * ip;
307
  const int is = 8 * ip + il / 16;
308
 
309
- __global float *y = yy + i * 256 + 128 * ip + il;
310
 
311
  const float d = vload_half(0, &x[i].d);
312
 
@@ -320,161 +334,383 @@ __kernel void dequantize_block_q6_K(__global const struct block_q6_K *x, __globa
320
  y[96] = d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32);
321
  }
322
 
 
323
 
324
- void vec_dot_q2_K(__global const struct block_q2_K* x, const int ib, const int iqs, const __global float *yy, float *result) {
325
 
326
- int n = iqs / 128;
327
- int r = iqs - 128 * n;
328
- int l = r / 8;
329
 
330
- __global const float *y = yy + 128 * n + l;
331
- __global const uint8_t *q = x[ib].qs + 32 * n + l;
332
- __global const uint8_t *s = x[ib].scales + 8 * n;
333
 
334
- const float dall = vload_half(0, &x[ib].d);
335
- const float dmin = vload_half(0, &x[ib].dmin);
336
 
337
- float sum = y[ 0] * (dall * ((s[0] & 0xF) * ((q[ 0] >> 0) & 3)) - dmin * (s[0] >> 4))
338
- + y[ 32] * (dall * ((s[2] & 0xF) * ((q[ 0] >> 2) & 3)) - dmin * (s[2] >> 4))
339
- + y[ 64] * (dall * ((s[4] & 0xF) * ((q[ 0] >> 4) & 3)) - dmin * (s[4] >> 4))
340
- + y[ 96] * (dall * ((s[6] & 0xF) * ((q[ 0] >> 6) & 3)) - dmin * (s[6] >> 4))
341
- + y[ 16] * (dall * ((s[1] & 0xF) * ((q[16] >> 0) & 3)) - dmin * (s[1] >> 4))
342
- + y[ 48] * (dall * ((s[3] & 0xF) * ((q[16] >> 2) & 3)) - dmin * (s[3] >> 4))
343
- + y[ 80] * (dall * ((s[5] & 0xF) * ((q[16] >> 4) & 3)) - dmin * (s[5] >> 4))
344
- + y[112] * (dall * ((s[7] & 0xF) * ((q[16] >> 6) & 3)) - dmin * (s[7] >> 4));
345
 
346
- *result = sum;
347
- }
348
 
349
- void vec_dot_q3_K(__global const struct block_q3_K* x, const int ib, const int iqs, const __global float *yy, float *result) {
 
 
 
350
 
351
- const uint32_t kmask1 = 0x03030303;
352
- const uint32_t kmask2 = 0x0f0f0f0f;
353
 
354
- uint32_t aux[3];
355
- uint32_t utmp[4];
 
356
 
357
- int n = iqs/128;
358
- int r = iqs - 128*n;
359
- int l = r/8;
360
 
361
- __global const float * y = yy + 128*n + l;
362
- __global const uint8_t * q = x[ib].qs + 32*n + l;
363
- __global const uint8_t * hm = x[ib].hmask + l;
364
- const int8_t * s = (const int8_t *)utmp + 8*n;
365
 
366
- aux[0] = x[ib].scales[0] | x[ib].scales[1] << 8 | x[ib].scales[2] << 16 | x[ib].scales[3] << 24;
367
- aux[1] = x[ib].scales[4] | x[ib].scales[5] << 8 | x[ib].scales[6] << 16 | x[ib].scales[7] << 24;
368
- aux[2] = x[ib].scales[8] | x[ib].scales[9] << 8 | x[ib].scales[10] << 16 | x[ib].scales[11] << 24;
369
 
370
- utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4);
371
- utmp[2] = ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4);
372
- utmp[1] = (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4);
373
- utmp[0] = (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4);
 
374
 
375
- const float dall = vload_half(0, &x[ib].d);
376
- const uint8_t m = 1 << (4*n);
 
 
 
 
 
 
 
 
 
 
377
 
378
- float sum = y[ 0] * (s[0] - 32) * (((q[ 0] >> 0) & 3) - (hm[ 0] & (m << 0) ? 0 : 4))
379
- + y[ 32] * (s[2] - 32) * (((q[ 0] >> 2) & 3) - (hm[ 0] & (m << 1) ? 0 : 4))
380
- + y[ 64] * (s[4] - 32) * (((q[ 0] >> 4) & 3) - (hm[ 0] & (m << 2) ? 0 : 4))
381
- + y[ 96] * (s[6] - 32) * (((q[ 0] >> 6) & 3) - (hm[ 0] & (m << 3) ? 0 : 4))
382
- + y[ 16] * (s[1] - 32) * (((q[16] >> 0) & 3) - (hm[16] & (m << 0) ? 0 : 4))
383
- + y[ 48] * (s[3] - 32) * (((q[16] >> 2) & 3) - (hm[16] & (m << 1) ? 0 : 4))
384
- + y[ 80] * (s[5] - 32) * (((q[16] >> 4) & 3) - (hm[16] & (m << 2) ? 0 : 4))
385
- + y[112] * (s[7] - 32) * (((q[16] >> 6) & 3) - (hm[16] & (m << 3) ? 0 : 4));
386
 
387
- *result = sum * dall;
388
 
 
 
 
 
 
 
 
 
 
 
 
389
  }
390
 
391
- void vec_dot_q4_K(__global const struct block_q4_K* x, const int ib, const int iqs, const __global float *yy, float *result) {
 
 
392
 
393
- const int j = iqs / 64; // j is in 0...3
394
- const int ir = (iqs - 64*j)/2; // ir is in 0...28 in steps of 4
395
- const int is = 2*j; // is is in 0...6 in steps of 2
396
 
397
- __global const float * y = yy + 64*j + ir;
398
- __global const uint8_t * q = x[ib].qs + 32*j + ir;
399
 
400
- const float dall = vload_half(0, &x[ib].d);
401
- const float dmin = vload_half(0, &x[ib].dmin);
402
 
403
- uint8_t sc, m;
404
- get_scale_min_k4(is + 0, x[ib].scales, &sc, &m);
405
- const float d1 = dall * sc;
406
- const float m1 = dmin * m;
407
- get_scale_min_k4(is + 1, x[ib].scales, &sc, &m);
408
- const float d2 = dall * sc;
409
- const float m2 = dmin * m;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
410
 
411
- float sum = 0;
412
- for (int k = 0; k < 4; ++k) {
413
- sum += y[k + 0] * (d1 * (q[k] & 0xF) - m1);
414
- sum += y[k + 32] * (d2 * (q[k] >> 4) - m2);
415
  }
416
 
417
- *result = sum;
 
 
 
 
 
 
 
 
 
 
418
  }
419
 
420
- void vec_dot_q5_K(__global const struct block_q5_K* x, const int ib, const int iqs, const __global float *yy, float *result) {
 
 
 
 
 
 
 
 
 
421
 
422
- const int j = iqs / 64;
423
- const int ir = (iqs - 64*j)/2;
424
- const int is = 2*j;
425
 
426
- __global const float * y = yy + 64*j + ir;
427
- __global const uint8_t * ql = x[ib].qs + 32*j + ir;
428
- __global const uint8_t * qh = x[ib].qh + ir;
429
 
430
- const float dall = vload_half(0, &x[ib].d);
431
- const float dmin = vload_half(0, &x[ib].dmin);
 
432
 
433
- uint8_t sc, m;
434
- get_scale_min_k4(is + 0, x[ib].scales, &sc, &m);
435
- const float d1 = dall * sc;
436
- const float m1 = dmin * m;
437
- get_scale_min_k4(is + 1, x[ib].scales, &sc, &m);
438
- const float d2 = dall * sc;
439
- const float m2 = dmin * m;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
440
 
441
- uint8_t hm = 1 << is;
442
- float sum = 0;
443
- for (int k = 0; k < 4; ++k) {
444
- sum += y[k + 0] * (d1 * ((ql[k] & 0xF) + (qh[k] & hm ? 16 : 0)) - m1);
445
- }
446
- hm <<= 1;
447
- for (int k = 0; k < 4; ++k) {
448
- sum += y[k + 32] * (d2 * ((ql[k] >> 4) + (qh[k] & hm ? 16 : 0)) - m2);
449
  }
450
- *result = sum;
451
 
 
 
 
 
 
 
 
 
 
 
 
452
  }
453
 
454
- void vec_dot_q6_K(__global const struct block_q6_K* x, const int ib, const int iqs, const __global float *yy, float *result) {
455
 
 
 
 
456
 
457
- const int ip = iqs / 128; // 0 or 1
458
- const int il = (iqs - 128*ip)/8; // 0...15
459
- const int is = 8*ip;
460
 
461
- __global const float * y = yy + 128*ip + il;
 
462
 
463
- const float d = vload_half(0, &x[ib].d);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
464
 
465
- __global const uint8_t * ql = x[ib].ql + 64*ip + il;
466
- __global const uint8_t * qh = x[ib].qh + 32*ip + il;
467
- __global const int8_t * sc = x[ib].scales + is;
 
 
468
 
469
- *result = y[ 0] * d * sc[0] * ((int8_t)((ql[ 0] & 0xF) | (((qh[ 0] >> 0) & 3) << 4)) - 32)
470
- + y[ 32] * d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh[ 0] >> 2) & 3) << 4)) - 32)
471
- + y[ 64] * d * sc[4] * ((int8_t)((ql[ 0] >> 4) | (((qh[ 0] >> 4) & 3) << 4)) - 32)
472
- + y[ 96] * d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh[ 0] >> 6) & 3) << 4)) - 32)
473
- + y[ 16] * d * sc[1] * ((int8_t)((ql[16] & 0xF) | (((qh[16] >> 0) & 3) << 4)) - 32)
474
- + y[ 48] * d * sc[3] * ((int8_t)((ql[48] & 0xF) | (((qh[16] >> 2) & 3) << 4)) - 32)
475
- + y[ 80] * d * sc[5] * ((int8_t)((ql[16] >> 4) | (((qh[16] >> 4) & 3) << 4)) - 32)
476
- + y[112] * d * sc[7] * ((int8_t)((ql[48] >> 4) | (((qh[16] >> 6) & 3) << 4)) - 32);
477
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
478
  }
479
 
480
  );
@@ -546,44 +782,6 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float
546
  }
547
  );
548
 
549
- std::string dequant_mul_mat_vec_k_template = MULTILINE_QUOTE(
550
- __kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float* y, __global float* dst, const int ncols) {
551
- const int block_size = get_local_size(0);
552
- const int row = get_group_id(0);
553
- const int tid = get_local_id(0);
554
-
555
- const int iter_stride = 256;
556
- const int vals_per_iter = iter_stride / block_size;
557
- const int num_blocks_per_row = ncols / 256;
558
- const int ib0 = row*num_blocks_per_row;
559
-
560
- tmp[tid] = 0;
561
-
562
- for (int i = 0; i < ncols; i += iter_stride) {
563
- const int col = i + vals_per_iter*tid;
564
- const int ib = ib0 + col/256; // x block index
565
- const int iqs = col%256; // x quant index
566
- const int iybs = col - col%256; // y block start index
567
-
568
- // dequantize
569
- float v;
570
- DOT_KERNEL(x, ib, iqs, y + iybs, &v);
571
- tmp[tid] += v;
572
- }
573
-
574
- // sum up partial sums and write back result
575
- barrier(CLK_LOCAL_MEM_FENCE);
576
- for (int s=block_size/2; s>0; s>>=1) {
577
- if (tid < s) {
578
- tmp[tid] += tmp[tid + s];
579
- }
580
- barrier(CLK_LOCAL_MEM_FENCE);
581
- }
582
- if (tid == 0) {
583
- dst[row] = tmp[0];
584
- }
585
- }
586
- );
587
 
588
  std::string mul_template = MULTILINE_QUOTE(
589
  __kernel void KERNEL_NAME(__global TYPE* x, const int x_offset, __global TYPE* y, const int y_offset, __global TYPE* dst, const int dst_offset, const int ky) {
@@ -648,18 +846,6 @@ std::array<std::string, 2> mul_str_values = {
648
  "mul_f32", "float"
649
  };
650
 
651
- std::array<std::string, 3> dmmv_k_str_keys = {
652
- "KERNEL_NAME", "X_TYPE", "DOT_KERNEL"
653
- };
654
-
655
- std::array<std::string, 15> dmmv_k_str_values = {
656
- "dequantize_mul_mat_vec_q2_K", "struct block_q2_K", "vec_dot_q2_K",
657
- "dequantize_mul_mat_vec_q3_K", "struct block_q3_K", "vec_dot_q3_K",
658
- "dequantize_mul_mat_vec_q4_K", "struct block_q4_K", "vec_dot_q4_K",
659
- "dequantize_mul_mat_vec_q5_K", "struct block_q5_K", "vec_dot_q5_K",
660
- "dequantize_mul_mat_vec_q6_K", "struct block_q6_K", "vec_dot_q6_K",
661
- };
662
-
663
  std::string& replace(std::string& s, const std::string& from, const std::string& to) {
664
  size_t pos = 0;
665
  while ((pos = s.find(from, pos)) != std::string::npos) {
@@ -672,6 +858,7 @@ std::string& replace(std::string& s, const std::string& from, const std::string&
672
  std::string generate_kernels() {
673
  std::stringstream src;
674
  src << program_source << '\n';
 
675
  for (size_t i = 0; i < dequant_str_values.size(); i += dequant_str_keys.size()) {
676
  std::string dequant_kernel = dequant_template;
677
  std::string dmmv_kernel = dequant_mul_mat_vec_template;
@@ -689,13 +876,6 @@ std::string generate_kernels() {
689
  }
690
  src << mul_kernel << '\n';
691
  }
692
- for (size_t i = 0; i < dmmv_k_str_values.size(); i += dmmv_k_str_keys.size()) {
693
- std::string dmmv_k_kernel = dequant_mul_mat_vec_k_template;
694
- for (size_t j = 0; j < dmmv_k_str_keys.size(); j++) {
695
- replace(dmmv_k_kernel, dmmv_k_str_keys[j], dmmv_k_str_values[i + j]);
696
- }
697
- src << dmmv_k_kernel << '\n';
698
- }
699
 
700
  return src.str();
701
  }
@@ -728,10 +908,11 @@ static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, co
728
  exit(1);
729
  }
730
 
731
- const char* compile_opts = "-cl-mad-enable -cl-unsafe-math-optimizations -cl-finite-math-only -cl-fast-relaxed-math "
732
- "-DQK4_0=32 -DQR4_0=2 -DQK4_1=32 -DQR4_1=2 -DQK5_0=32 -DQR5_0=2 -DQK5_1=32 -DQR5_1=2 -DQK8_0=32 -DQR8_0=1";
 
733
 
734
- err = clBuildProgram(p, 0, NULL, compile_opts, NULL, NULL);
735
  if(err < 0) {
736
 
737
  clGetProgramBuildInfo(p, dev, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
 
16
 
17
  #include "ggml.h"
18
 
19
+ #if defined(_MSC_VER)
20
+ #pragma warning(disable: 4244 4267) // possible loss of data
21
+ #endif
22
+
23
  #define CL_DMMV_BLOCK_SIZE 32
24
 
25
+ #ifndef K_QUANTS_PER_ITERATION
26
+ #define K_QUANTS_PER_ITERATION 1
27
+ #else
28
+ static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2");
29
+ #endif
30
+
31
  #define MULTILINE_QUOTE(...) #__VA_ARGS__
32
  static std::string program_source = MULTILINE_QUOTE(
33
 
34
  typedef char int8_t;
35
  typedef uchar uint8_t;
36
+ typedef short int16_t;
37
+ typedef ushort uint16_t;
38
  typedef int int32_t;
39
  typedef uint uint32_t;
40
 
 
184
  *v0 = vload_half(0, &x[ib + 0]);
185
  *v1 = vload_half(0, &x[ib + 1]);
186
  }
187
+ );
188
 
189
+ static std::string k_quants_source = MULTILINE_QUOTE(
190
  inline void get_scale_min_k4(int j, const __global uint8_t *q, uint8_t *d, uint8_t *m)
191
  {
192
  if (j < 4)
 
210
  const int is = 8 * n + l / 16;
211
 
212
  const uint8_t q = x[i].qs[32 * n + l];
213
+ __global float *y = yy + i * QK_K + 128 * n;
214
 
215
  const float dall = vload_half(0, &x[i].d);
216
  const float dmin = vload_half(0, &x[i].dmin);
 
242
  float d_all = vload_half(0, &x[i].d);
243
  float dl = d_all * (us - 32);
244
 
245
+ __global float *y = yy + i * QK_K + 128 * n + 32 * j;
246
  const __global uint8_t *q = x[i].qs + 32 * n;
247
  const __global uint8_t *hm = x[i].hmask;
248
 
 
259
  const int is = 2 * il;
260
  const int n = 4;
261
 
262
+ __global float *y = yy + i * QK_K + 64 * il + n * ir;
263
 
264
  const float dall = vload_half(0, &x[i].d);
265
  const float dmin = vload_half(0, &x[i].dmin);
 
288
  const int ir = tid % 16;
289
  const int is = 2 * il;
290
 
291
+ __global float *y = yy + i * QK_K + 64 * il + 2 * ir;
292
 
293
  const float dall = vload_half(0, &x[i].d);
294
  const float dmin = vload_half(0, &x[i].dmin);
 
320
  const int il = tid - 32 * ip;
321
  const int is = 8 * ip + il / 16;
322
 
323
+ __global float *y = yy + i * QK_K + 128 * ip + il;
324
 
325
  const float d = vload_half(0, &x[i].d);
326
 
 
334
  y[96] = d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32);
335
  }
336
 
337
+ __kernel void dequantize_mul_mat_vec_q2_K(__global const struct block_q2_K * xx, __local float* tmp, __global float* yy, __global float* dst, const int ncols) {
338
 
339
+ const int row = get_group_id(0);
340
 
341
+ const int num_blocks_per_row = ncols / QK_K;
342
+ const int ib0 = row*num_blocks_per_row;
 
343
 
344
+ __global const struct block_q2_K * x = xx + ib0;
 
 
345
 
346
+ const int tid = get_local_id(0)/K_QUANTS_PER_ITERATION; // 0...31 or 0...15
347
+ const int ix = get_local_id(0)%K_QUANTS_PER_ITERATION; // 0 or 0,1
348
 
349
+ const int step = 16/K_QUANTS_PER_ITERATION;
 
 
 
 
 
 
 
350
 
351
+ const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
352
+ const int in = tid - step*im; // 0...15 or 0...7
353
 
354
+ const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15 or 0...14 in steps of 2
355
+ const int q_offset = 32*im + l0;
356
+ const int s_offset = 8*im;
357
+ const int y_offset = 128*im + l0;
358
 
359
+ tmp[16 * ix + tid] = 0;
 
360
 
361
+ uint32_t aux[4];
362
+ const uint8_t * d = (const uint8_t *)aux;
363
+ const uint8_t * m = (const uint8_t *)(aux + 2);
364
 
365
+ for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
 
 
366
 
367
+ __global const float * y = yy + i * QK_K + y_offset;
368
+ __global const uint8_t * q = x[i].qs + q_offset;
 
 
369
 
370
+ const float dall = vload_half(0, &x[i].d);
371
+ const float dmin = vload_half(0, &x[i].dmin);
 
372
 
373
+ __global const uint32_t * a = (__global const uint32_t *)(x[i].scales + s_offset);
374
+ aux[0] = a[0] & 0x0f0f0f0f;
375
+ aux[1] = a[1] & 0x0f0f0f0f;
376
+ aux[2] = (a[0] >> 4) & 0x0f0f0f0f;
377
+ aux[3] = (a[1] >> 4) & 0x0f0f0f0f;
378
 
379
+ float sum1 = 0, sum2 = 0;
380
+ for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
381
+ sum1 += y[l+ 0] * d[0] * ((q[l+ 0] >> 0) & 3)
382
+ + y[l+32] * d[2] * ((q[l+ 0] >> 2) & 3)
383
+ + y[l+64] * d[4] * ((q[l+ 0] >> 4) & 3)
384
+ + y[l+96] * d[6] * ((q[l+ 0] >> 6) & 3)
385
+ + y[l+16] * d[1] * ((q[l+16] >> 0) & 3)
386
+ + y[l+48] * d[3] * ((q[l+16] >> 2) & 3)
387
+ + y[l+80] * d[5] * ((q[l+16] >> 4) & 3)
388
+ +y[l+112] * d[7] * ((q[l+16] >> 6) & 3);
389
+ sum2 += y[l+ 0] * m[0] + y[l+32] * m[2] + y[l+64] * m[4] + y[ l+96] * m[6]
390
+ + y[l+16] * m[1] + y[l+48] * m[3] + y[l+80] * m[5] + y[l+112] * m[7];
391
 
392
+ }
393
+ tmp[16 * ix + tid] += dall * sum1 - dmin * sum2;
 
 
 
 
 
 
394
 
395
+ }
396
 
397
+ // sum up partial sums and write back result
398
+ barrier(CLK_LOCAL_MEM_FENCE);
399
+ for (int s=16; s>0; s>>=1) {
400
+ if (tid < s) {
401
+ tmp[tid] += tmp[tid + s];
402
+ }
403
+ barrier(CLK_LOCAL_MEM_FENCE);
404
+ }
405
+ if (tid == 0) {
406
+ dst[row] = tmp[0];
407
+ }
408
  }
409
 
410
+ __kernel void dequantize_mul_mat_vec_q3_K(__global const struct block_q3_K * xx, __local float* tmp, __global float* yy, __global float* dst, const int ncols) {
411
+ const uint16_t kmask1 = 0x0303;
412
+ const uint16_t kmask2 = 0x0f0f;
413
 
414
+ const int row = get_group_id(0);
 
 
415
 
416
+ const int num_blocks_per_row = ncols / QK_K;
417
+ const int ib0 = row*num_blocks_per_row;
418
 
419
+ __global const struct block_q3_K * x = xx + ib0;
 
420
 
421
+ const int tid = get_local_id(0)/K_QUANTS_PER_ITERATION; // 0...31 or 0...15
422
+ const int ix = get_local_id(0)%K_QUANTS_PER_ITERATION; // 0 or 0,1
423
+
424
+ const int n = K_QUANTS_PER_ITERATION; // iterations in the inner loop
425
+ const int step = 16/K_QUANTS_PER_ITERATION;
426
+ const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
427
+ const int in = tid - step*im; // 0....15 or 0...7
428
+
429
+ const uint8_t m = 1 << (4*im);
430
+
431
+ const int l0 = n*in; // 0...15 or 0...14 in steps of 2
432
+ const int q_offset = 32*im + l0;
433
+ const int y_offset = 128*im + l0;
434
+
435
+ uint16_t utmp[4];
436
+ const int8_t * s = (const int8_t *)utmp;
437
+
438
+ const uint16_t s_shift = 4*im;
439
+
440
+ tmp[16 * ix + tid] = 0;
441
+
442
+ for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
443
+
444
+ __global const float * y = yy + i * QK_K + y_offset;
445
+ __global const uint8_t * q = x[i].qs + q_offset;
446
+ __global const uint8_t * h = x[i].hmask + l0;
447
+
448
+ __global const uint16_t * a = (__global const uint16_t *)x[i].scales;
449
+ utmp[0] = ((a[0] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 0)) & kmask1) << 4);
450
+ utmp[1] = ((a[1] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 0)) & kmask1) << 4);
451
+ utmp[2] = ((a[2] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 2)) & kmask1) << 4);
452
+ utmp[3] = ((a[3] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 2)) & kmask1) << 4);
453
+
454
+ const float d = vload_half(0, &x[i].d);
455
+
456
+ float sum = 0;
457
+ for (int l = 0; l < n; ++l) {
458
+ sum += y[l+ 0] * (s[0] - 32) * (((q[l] >> 0) & 3) - (h[l] & (m << 0) ? 0 : 4))
459
+ + y[l+32] * (s[2] - 32) * (((q[l] >> 2) & 3) - (h[l] & (m << 1) ? 0 : 4))
460
+ + y[l+64] * (s[4] - 32) * (((q[l] >> 4) & 3) - (h[l] & (m << 2) ? 0 : 4))
461
+ + y[l+96] * (s[6] - 32) * (((q[l] >> 6) & 3) - (h[l] & (m << 3) ? 0 : 4));
462
+ sum += y[l+16] * (s[1] - 32) * (((q[l+16] >> 0) & 3) - (h[l+16] & (m << 0) ? 0 : 4))
463
+ + y[l+48] * (s[3] - 32) * (((q[l+16] >> 2) & 3) - (h[l+16] & (m << 1) ? 0 : 4))
464
+ + y[l+80] * (s[5] - 32) * (((q[l+16] >> 4) & 3) - (h[l+16] & (m << 2) ? 0 : 4))
465
+ + y[l+112] * (s[7] - 32) * (((q[l+16] >> 6) & 3) - (h[l+16] & (m << 3) ? 0 : 4));
466
+ }
467
+ tmp[16 * ix + tid] += d * sum;
468
 
 
 
 
 
469
  }
470
 
471
+ // sum up partial sums and write back result
472
+ barrier(CLK_LOCAL_MEM_FENCE);
473
+ for (int s=16; s>0; s>>=1) {
474
+ if (tid < s) {
475
+ tmp[tid] += tmp[tid + s];
476
+ }
477
+ barrier(CLK_LOCAL_MEM_FENCE);
478
+ }
479
+ if (tid == 0) {
480
+ dst[row] = tmp[0];
481
+ }
482
  }
483
 
484
+ __kernel void dequantize_mul_mat_vec_q4_K(__global const struct block_q4_K * xx, __local float* tmp, __global float* yy, __global float* dst, const int ncols) {
485
+
486
+ // TODO: rename later; kept as-is for now, just for testing
487
+ const uint16_t kmask1 = 0x3f3f;
488
+ const uint16_t kmask2 = 0x0f0f;
489
+ const uint16_t kmask3 = 0xc0c0;
490
+
491
+ const int row = get_group_id(0);
492
+ const int num_blocks_per_row = ncols / QK_K;
493
+ const int ib0 = row*num_blocks_per_row;
494
 
495
+ const int tid = get_local_id(0)/K_QUANTS_PER_ITERATION; // 0...31 or 0...15
496
+ const int ix = get_local_id(0)%K_QUANTS_PER_ITERATION;
 
497
 
498
+ const int step = 8/K_QUANTS_PER_ITERATION;
 
 
499
 
500
+ const int il = tid/step; // 0...3
501
+ const int ir = tid - step*il;// 0...7 or 0...3
502
+ const int n = 2*K_QUANTS_PER_ITERATION;
503
 
504
+ const int im = il/2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
505
+ const int in = il%2;
506
+
507
+ const int l0 = n*(2*ir + in);
508
+ const int q_offset = 32*im + l0;
509
+ const int y_offset = 64*im + l0;
510
+
511
+ uint16_t aux[4];
512
+ const uint8_t * sc = (const uint8_t *)aux;
513
+
514
+ __global const struct block_q4_K * x = xx + ib0;
515
+
516
+ tmp[16 * ix + tid] = 0;
517
+
518
+ for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
519
+
520
+ __global const uint8_t * q1 = x[i].qs + q_offset;
521
+ __global const uint8_t * q2 = q1 + 64;
522
+ __global const float * y1 = yy + i*QK_K + y_offset;
523
+ __global const float * y2 = y1 + 128;
524
+
525
+ const float dall = vload_half(0, &x[i].d);
526
+ const float dmin = vload_half(0, &x[i].dmin);
527
+
528
+ __global const uint16_t * a = (__global const uint16_t *)x[i].scales;
529
+ aux[0] = a[im+0] & kmask1;
530
+ aux[1] = a[im+2] & kmask1;
531
+ aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
532
+ aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);
533
+
534
+ float4 s = (float4)(0.f);
535
+ float smin = 0;
536
+ for (int l = 0; l < n; ++l) {
537
+ s.x += y1[l] * (q1[l] & 0xF); s.y += y1[l+32] * (q1[l] >> 4);
538
+ s.z += y2[l] * (q2[l] & 0xF); s.w += y2[l+32] * (q2[l] >> 4);
539
+ smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
540
+ }
541
+ tmp[16 * ix + tid] += dall * (s.x * sc[0] + s.y * sc[1] + s.z * sc[4] + s.w * sc[5]) - dmin * smin;
542
 
 
 
 
 
 
 
 
 
543
  }
 
544
 
545
+ // sum up partial sums and write back result
546
+ barrier(CLK_LOCAL_MEM_FENCE);
547
+ for (int s=16; s>0; s>>=1) {
548
+ if (tid < s) {
549
+ tmp[tid] += tmp[tid + s];
550
+ }
551
+ barrier(CLK_LOCAL_MEM_FENCE);
552
+ }
553
+ if (tid == 0) {
554
+ dst[row] = tmp[0];
555
+ }
556
  }
557
 
558
+ __kernel void dequantize_mul_mat_vec_q5_K(__global const struct block_q5_K * xx, __local float* tmp, __global float* yy, __global float* dst, const int ncols) {
559
 
560
+ const uint16_t kmask1 = 0x3f3f;
561
+ const uint16_t kmask2 = 0x0f0f;
562
+ const uint16_t kmask3 = 0xc0c0;
563
 
564
+ const int row = get_group_id(0);
565
+ const int num_blocks_per_row = ncols / QK_K;
566
+ const int ib0 = row*num_blocks_per_row;
567
 
568
+ const int tid = get_local_id(0)/2; // 0...15
569
+ const int ix = get_local_id(0)%2;
570
 
571
+ const int il = tid/4; // 0...3
572
+ const int ir = tid - 4*il;// 0...3
573
+ const int n = 2;
574
+
575
+ const int im = il/2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
576
+ const int in = il%2;
577
+
578
+ const int l0 = n*(2*ir + in);
579
+ const int q_offset = 32*im + l0;
580
+ const int y_offset = 64*im + l0;
581
+
582
+ const uint8_t hm1 = 1 << (2*im);
583
+ const uint8_t hm2 = hm1 << 4;
584
+
585
+ uint16_t aux[4];
586
+ const uint8_t * sc = (const uint8_t *)aux;
587
+
588
+ __global const struct block_q5_K * x = xx + ib0;
589
+
590
+ tmp[16 * ix + tid] = 0;
591
+
592
+ for (int i = ix; i < num_blocks_per_row; i += 2) {
593
 
594
+ __global const uint8_t * ql1 = x[i].qs + q_offset;
595
+ __global const uint8_t * ql2 = ql1 + 64;
596
+ __global const uint8_t * qh = x[i].qh + l0;
597
+ __global const float * y1 = yy + i*QK_K + y_offset;
598
+ __global const float * y2 = y1 + 128;
599
 
600
+ const float dall = vload_half(0, &x[i].d);
601
+ const float dmin = vload_half(0, &x[i].dmin);
 
 
 
 
 
 
602
 
603
+ __global const uint16_t * a = (__global const uint16_t *)x[i].scales;
604
+ aux[0] = a[im+0] & kmask1;
605
+ aux[1] = a[im+2] & kmask1;
606
+ aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
607
+ aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);
608
+
609
+ float4 sum = (float4)(0.f);
610
+ float smin = 0;
611
+ for (int l = 0; l < n; ++l) {
612
+ sum.x += y1[l+ 0] * ((ql1[l+ 0] & 0xF) + (qh[l+ 0] & (hm1 << 0) ? 16 : 0))
613
+ + y1[l+16] * ((ql1[l+16] & 0xF) + (qh[l+16] & (hm1 << 0) ? 16 : 0));
614
+ sum.y += y1[l+32] * ((ql1[l+ 0] >> 4) + (qh[l+ 0] & (hm1 << 1) ? 16 : 0))
615
+ + y1[l+48] * ((ql1[l+16] >> 4) + (qh[l+16] & (hm1 << 1) ? 16 : 0));
616
+ sum.z += y2[l+ 0] * ((ql2[l+ 0] & 0xF) + (qh[l+ 0] & (hm2 << 0) ? 16 : 0))
617
+ + y2[l+16] * ((ql2[l+16] & 0xF) + (qh[l+16] & (hm2 << 0) ? 16 : 0));
618
+ sum.w += y2[l+32] * ((ql2[l+ 0] >> 4) + (qh[l+ 0] & (hm2 << 1) ? 16 : 0))
619
+ + y2[l+48] * ((ql2[l+16] >> 4) + (qh[l+16] & (hm2 << 1) ? 16 : 0));
620
+ smin += (y1[l] + y1[l+16]) * sc[2] + (y1[l+32] + y1[l+48]) * sc[3]
621
+ + (y2[l] + y2[l+16]) * sc[6] + (y2[l+32] + y2[l+48]) * sc[7];
622
+ }
623
+ tmp[16 * ix + tid] += dall * (sum.x * sc[0] + sum.y * sc[1] + sum.z * sc[4] + sum.w * sc[5]) - dmin * smin;
624
+
625
+ }
626
+
627
+ // sum up partial sums and write back result
628
+ barrier(CLK_LOCAL_MEM_FENCE);
629
+ for (int s=16; s>0; s>>=1) {
630
+ if (tid < s) {
631
+ tmp[tid] += tmp[tid + s];
632
+ }
633
+ barrier(CLK_LOCAL_MEM_FENCE);
634
+ }
635
+ if (tid == 0) {
636
+ dst[row] = tmp[0];
637
+ }
638
+ }
639
+
640
+ __kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx, __local float* tmp, __global const float * yy, __global float * dst, const int ncols) {
641
+
642
+ const int row = get_group_id(0);
643
+
644
+ const int num_blocks_per_row = ncols / QK_K;
645
+ const int ib0 = row*num_blocks_per_row;
646
+
647
+ __global const struct block_q6_K * x = xx + ib0;
648
+
649
+ const int tid = get_local_id(0)/K_QUANTS_PER_ITERATION; // 0...31 or 0...16
650
+ const int ix = get_local_id(0)%K_QUANTS_PER_ITERATION; // 0 or 0, 1
651
+
652
+ const int step = 16/K_QUANTS_PER_ITERATION; // 16 or 8
653
+
654
+ const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
655
+ const int in = tid - step*im; // 0...15 or 0...7
656
+
657
+ #if K_QUANTS_PER_ITERATION == 1
658
+ const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15
659
+ const int is = 0;
660
+ #else
661
+ const int l0 = 4 * in; // 0, 4, 8, ..., 28
662
+ const int is = in / 4;
663
+ #endif
664
+ const int ql_offset = 64*im + l0;
665
+ const int qh_offset = 32*im + l0;
666
+ const int s_offset = 8*im + is;
667
+ const int y_offset = 128*im + l0;
668
+
669
+ tmp[16 * ix + tid] = 0; // partial sum for thread in warp
670
+
671
+ for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
672
+
673
+ __global const float * y = yy + i * QK_K + y_offset;
674
+ __global const uint8_t * ql = x[i].ql + ql_offset;
675
+ __global const uint8_t * qh = x[i].qh + qh_offset;
676
+ __global const int8_t * s = x[i].scales + s_offset;
677
+
678
+ const float d = vload_half(0, &x[i].d);
679
+
680
+ #if K_QUANTS_PER_ITERATION == 1
681
+ float sum = y[ 0] * s[0] * d * ((int8_t)((ql[ 0] & 0xF) | ((qh[ 0] & 0x03) << 4)) - 32)
682
+ + y[16] * s[1] * d * ((int8_t)((ql[16] & 0xF) | ((qh[16] & 0x03) << 4)) - 32)
683
+ + y[32] * s[2] * d * ((int8_t)((ql[32] & 0xF) | ((qh[ 0] & 0x0c) << 2)) - 32)
684
+ + y[48] * s[3] * d * ((int8_t)((ql[48] & 0xF) | ((qh[16] & 0x0c) << 2)) - 32)
685
+ + y[64] * s[4] * d * ((int8_t)((ql[ 0] >> 4) | ((qh[ 0] & 0x30) >> 0)) - 32)
686
+ + y[80] * s[5] * d * ((int8_t)((ql[16] >> 4) | ((qh[16] & 0x30) >> 0)) - 32)
687
+ + y[96] * s[6] * d * ((int8_t)((ql[32] >> 4) | ((qh[ 0] & 0xc0) >> 2)) - 32)
688
+ +y[112] * s[7] * d * ((int8_t)((ql[48] >> 4) | ((qh[16] & 0xc0) >> 2)) - 32);
689
+ tmp[16 * ix + tid] += sum;
690
+ #else
691
+ float sum = 0;
692
+ for (int l = 0; l < 4; ++l) {
693
+ sum += y[l+ 0] * s[0] * d * ((int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32)
694
+ + y[l+32] * s[2] * d * ((int8_t)((ql[l+32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32)
695
+ + y[l+64] * s[4] * d * ((int8_t)((ql[l+ 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32)
696
+ + y[l+96] * s[6] * d * ((int8_t)((ql[l+32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32);
697
+ }
698
+ tmp[16 * ix + tid] += sum;
699
+ #endif
700
+
701
+ }
702
+
703
+ // sum up partial sums and write back result
704
+ barrier(CLK_LOCAL_MEM_FENCE);
705
+ for (int s=16; s>0; s>>=1) {
706
+ if (tid < s) {
707
+ tmp[tid] += tmp[tid + s];
708
+ }
709
+ barrier(CLK_LOCAL_MEM_FENCE);
710
+ }
711
+ if (tid == 0) {
712
+ dst[row] = tmp[0];
713
+ }
714
  }
715
 
716
  );
 
782
  }
783
  );
784
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
785
 
786
  std::string mul_template = MULTILINE_QUOTE(
787
  __kernel void KERNEL_NAME(__global TYPE* x, const int x_offset, __global TYPE* y, const int y_offset, __global TYPE* dst, const int dst_offset, const int ky) {
 
846
  "mul_f32", "float"
847
  };
848
 
 
 
 
 
 
 
 
 
 
 
 
 
849
  std::string& replace(std::string& s, const std::string& from, const std::string& to) {
850
  size_t pos = 0;
851
  while ((pos = s.find(from, pos)) != std::string::npos) {
 
858
  std::string generate_kernels() {
859
  std::stringstream src;
860
  src << program_source << '\n';
861
+ src << k_quants_source << '\n';
862
  for (size_t i = 0; i < dequant_str_values.size(); i += dequant_str_keys.size()) {
863
  std::string dequant_kernel = dequant_template;
864
  std::string dmmv_kernel = dequant_mul_mat_vec_template;
 
876
  }
877
  src << mul_kernel << '\n';
878
  }
 
 
 
 
 
 
 
879
 
880
  return src.str();
881
  }
 
908
  exit(1);
909
  }
910
 
911
+ std::string compile_opts = "-cl-mad-enable -cl-unsafe-math-optimizations -cl-finite-math-only -cl-fast-relaxed-math "
912
+ "-DQK4_0=32 -DQR4_0=2 -DQK4_1=32 -DQR4_1=2 -DQK5_0=32 -DQR5_0=2 -DQK5_1=32 -DQR5_1=2 -DQK8_0=32 -DQR8_0=1 "
913
+ "-DQK_K=256 -DK_QUANTS_PER_ITERATION=" + std::to_string(K_QUANTS_PER_ITERATION);
914
 
915
+ err = clBuildProgram(p, 0, NULL, compile_opts.c_str(), NULL, NULL);
916
  if(err < 0) {
917
 
918
  clGetProgramBuildInfo(p, dev, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
ggml.c CHANGED
@@ -24,6 +24,7 @@
24
  #include <stdio.h>
25
  #include <float.h>
26
  #include <limits.h>
 
27
 
28
  #ifdef GGML_USE_METAL
29
  #include <unistd.h>
@@ -112,6 +113,7 @@ typedef void* thread_ret_t;
112
  /*#define GGML_PERF*/
113
  #define GGML_DEBUG 0
114
  #define GGML_GELU_FP16
 
115
  #define GGML_SILU_FP16
116
 
117
  #define GGML_SOFT_MAX_UNROLL 4
@@ -340,6 +342,9 @@ static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
340
  // precomputed gelu table for f16 (128 KB)
341
  static ggml_fp16_t table_gelu_f16[1 << 16];
342
 
 
 
 
343
  // precomputed silu table for f16 (128 KB)
344
  static ggml_fp16_t table_silu_f16[1 << 16];
345
 
@@ -1677,14 +1682,17 @@ quantize_fns_t ggml_internal_get_quantize_fn(size_t i) {
1677
  #define GGML_F32x4_REDUCE_ONE(x) vaddvq_f32(x)
1678
  #define GGML_F32x4_REDUCE(res, x) \
1679
  { \
1680
- for (int i = 0; i < GGML_F32_ARR/2; ++i) { \
1681
- x[2*i] = vaddq_f32(x[2*i], x[2*i+1]); \
 
1682
  } \
1683
- for (int i = 0; i < GGML_F32_ARR/4; ++i) { \
1684
- x[4*i] = vaddq_f32(x[4*i], x[4*i+2]); \
 
1685
  } \
1686
- for (int i = 0; i < GGML_F32_ARR/8; ++i) { \
1687
- x[8*i] = vaddq_f32(x[8*i], x[8*i+4]); \
 
1688
  } \
1689
  res = GGML_F32x4_REDUCE_ONE(x[0]); \
1690
  }
@@ -1715,14 +1723,17 @@ quantize_fns_t ggml_internal_get_quantize_fn(size_t i) {
1715
  #define GGML_F16x8_MUL vmulq_f16
1716
  #define GGML_F16x8_REDUCE(res, x) \
1717
  { \
1718
- for (int i = 0; i < GGML_F16_ARR/2; ++i) { \
1719
- x[2*i] = vaddq_f16(x[2*i], x[2*i+1]); \
 
1720
  } \
1721
- for (int i = 0; i < GGML_F16_ARR/4; ++i) { \
1722
- x[4*i] = vaddq_f16(x[4*i], x[4*i+2]); \
 
1723
  } \
1724
- for (int i = 0; i < GGML_F16_ARR/8; ++i) { \
1725
- x[8*i] = vaddq_f16(x[8*i], x[8*i+4]); \
 
1726
  } \
1727
  const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 (x[0])); \
1728
  const float32x4_t t1 = vcvt_f32_f16(vget_high_f16(x[0])); \
@@ -1789,14 +1800,17 @@ quantize_fns_t ggml_internal_get_quantize_fn(size_t i) {
1789
  #define GGML_F32x8_MUL _mm256_mul_ps
1790
  #define GGML_F32x8_REDUCE(res, x) \
1791
  { \
1792
- for (int i = 0; i < GGML_F32_ARR/2; ++i) { \
1793
- x[2*i] = _mm256_add_ps(x[2*i], x[2*i+1]); \
 
1794
  } \
1795
- for (int i = 0; i < GGML_F32_ARR/4; ++i) { \
1796
- x[4*i] = _mm256_add_ps(x[4*i], x[4*i+2]); \
 
1797
  } \
1798
- for (int i = 0; i < GGML_F32_ARR/8; ++i) { \
1799
- x[8*i] = _mm256_add_ps(x[8*i], x[8*i+4]); \
 
1800
  } \
1801
  const __m128 t0 = _mm_add_ps(_mm256_castps256_ps128(x[0]), \
1802
  _mm256_extractf128_ps(x[0], 1)); \
@@ -1886,14 +1900,17 @@ static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
1886
  #define GGML_F32x4_MUL vec_mul
1887
  #define GGML_F32x4_REDUCE(res, x) \
1888
  { \
1889
- for (int i = 0; i < GGML_F32_ARR/2; ++i) { \
1890
- x[2*i] = vec_add(x[2*i], x[2*i+1]); \
 
1891
  } \
1892
- for (int i = 0; i < GGML_F32_ARR/4; ++i) { \
1893
- x[4*i] = vec_add(x[4*i], x[4*i+2]); \
 
1894
  } \
1895
- for (int i = 0; i < GGML_F32_ARR/8; ++i) { \
1896
- x[8*i] = vec_add(x[8*i], x[8*i+4]); \
 
1897
  } \
1898
  res = vec_extract(x[0], 0) + \
1899
  vec_extract(x[0], 1) + \
@@ -1949,14 +1966,17 @@ static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
1949
  #define GGML_F32x4_MUL wasm_f32x4_mul
1950
  #define GGML_F32x4_REDUCE(res, x) \
1951
  { \
1952
- for (int i = 0; i < GGML_F32_ARR/2; ++i) { \
1953
- x[2*i] = wasm_f32x4_add(x[2*i], x[2*i+1]); \
 
1954
  } \
1955
- for (int i = 0; i < GGML_F32_ARR/4; ++i) { \
1956
- x[4*i] = wasm_f32x4_add(x[4*i], x[4*i+2]); \
 
1957
  } \
1958
- for (int i = 0; i < GGML_F32_ARR/8; ++i) { \
1959
- x[8*i] = wasm_f32x4_add(x[8*i], x[8*i+4]); \
 
1960
  } \
1961
  res = wasm_f32x4_extract_lane(x[0], 0) + \
1962
  wasm_f32x4_extract_lane(x[0], 1) + \
@@ -2011,14 +2031,17 @@ inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) {
2011
  #define GGML_F16x4_MUL wasm_f32x4_mul
2012
  #define GGML_F16x4_REDUCE(res, x) \
2013
  { \
2014
- for (int i = 0; i < GGML_F16_ARR/2; ++i) { \
2015
- x[2*i] = wasm_f32x4_add(x[2*i], x[2*i+1]); \
 
2016
  } \
2017
- for (int i = 0; i < GGML_F16_ARR/4; ++i) { \
2018
- x[4*i] = wasm_f32x4_add(x[4*i], x[4*i+2]); \
 
2019
  } \
2020
- for (int i = 0; i < GGML_F16_ARR/8; ++i) { \
2021
- x[8*i] = wasm_f32x4_add(x[8*i], x[8*i+4]); \
 
2022
  } \
2023
  res = wasm_f32x4_extract_lane(x[0], 0) + \
2024
  wasm_f32x4_extract_lane(x[0], 1) + \
@@ -2060,14 +2083,17 @@ inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) {
2060
  #define GGML_F32x4_MUL _mm_mul_ps
2061
  #define GGML_F32x4_REDUCE(res, x) \
2062
  { \
2063
- for (int i = 0; i < GGML_F32_ARR/2; ++i) { \
2064
- x[2*i] = _mm_add_ps(x[2*i], x[2*i+1]); \
 
2065
  } \
2066
- for (int i = 0; i < GGML_F32_ARR/4; ++i) { \
2067
- x[4*i] = _mm_add_ps(x[4*i], x[4*i+2]); \
 
2068
  } \
2069
- for (int i = 0; i < GGML_F32_ARR/8; ++i) { \
2070
- x[8*i] = _mm_add_ps(x[8*i], x[8*i+4]); \
 
2071
  } \
2072
  const __m128 t0 = _mm_hadd_ps(x[0], x[0]); \
2073
  res = _mm_cvtss_f32(_mm_hadd_ps(t0, t0)); \
@@ -3356,6 +3382,7 @@ inline static void ggml_vec_step_f32 (const int n, float * y, const float * x) {
3356
  inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
3357
 
3358
  static const float GELU_COEF_A = 0.044715f;
 
3359
  static const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
3360
 
3361
  inline static float ggml_gelu_f32(float x) {
@@ -3386,6 +3413,34 @@ inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
3386
  }
3387
  #endif
3388
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3389
  // Sigmoid Linear Unit (SiLU) function
3390
  inline static float ggml_silu_f32(float x) {
3391
  return x/(1.0f + expf(-x));
@@ -3616,6 +3671,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
3616
  "STEP",
3617
  "RELU",
3618
  "GELU",
 
3619
  "SILU",
3620
  "SILU_BACK",
3621
  "NORM",
@@ -3644,12 +3700,15 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
3644
  "ROPE_BACK",
3645
  "ALIBI",
3646
  "CLAMP",
3647
- "CONV_1D_1S",
3648
- "CONV_1D_2S",
 
3649
 
3650
  "FLASH_ATTN",
3651
  "FLASH_FF",
3652
  "FLASH_ATTN_BACK",
 
 
3653
 
3654
  "MAP_UNARY",
3655
  "MAP_BINARY",
@@ -3658,7 +3717,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
3658
  "CROSS_ENTROPY_LOSS_BACK",
3659
  };
3660
 
3661
- static_assert(GGML_OP_COUNT == 57, "GGML_OP_COUNT != 57");
3662
 
3663
  static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
3664
  "none",
@@ -3684,6 +3743,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
3684
  "step(x)",
3685
  "relu(x)",
3686
  "gelu(x)",
 
3687
  "silu(x)",
3688
  "silu_back(x)",
3689
  "norm(x)",
@@ -3712,12 +3772,15 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
3712
  "rope_back(x)",
3713
  "alibi(x)",
3714
  "clamp(x)",
3715
- "conv_1d_1s(x)",
3716
- "conv_1d_2s(x)",
 
3717
 
3718
  "flash_attn(x)",
3719
  "flash_ff(x)",
3720
  "flash_attn_back(x)",
 
 
3721
 
3722
  "f(x)",
3723
  "f(x,y)",
@@ -3726,7 +3789,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
3726
  "cross_entropy_loss_back(x,y)",
3727
  };
3728
 
3729
- static_assert(GGML_OP_COUNT == 57, "GGML_OP_COUNT != 57");
3730
 
3731
  static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
3732
  static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");
@@ -4017,7 +4080,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
4017
  // initialize time system (required on Windows)
4018
  ggml_time_init();
4019
 
4020
- // initialize GELU, SILU and EXP F32 tables
4021
  {
4022
  const uint64_t t_start = ggml_time_us(); UNUSED(t_start);
4023
 
@@ -4027,13 +4090,14 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
4027
  memcpy(&ii, &ui, sizeof(ii));
4028
  const float f = table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(ii);
4029
  table_gelu_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_f32(f));
 
4030
  table_silu_f16[i] = GGML_FP32_TO_FP16(ggml_silu_f32(f));
4031
  table_exp_f16[i] = GGML_FP32_TO_FP16(expf(f));
4032
  }
4033
 
4034
  const uint64_t t_end = ggml_time_us(); UNUSED(t_end);
4035
 
4036
- GGML_PRINT_DEBUG("%s: GELU, SILU and EXP tables initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
4037
  }
4038
 
4039
  // initialize g_state
@@ -4154,14 +4218,34 @@ void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc) {
4154
  ctx->no_alloc = no_alloc;
4155
  }
4156
 
4157
- void * ggml_get_mem_buffer(struct ggml_context * ctx) {
4158
  return ctx->mem_buffer;
4159
  }
4160
 
4161
- size_t ggml_get_mem_size(struct ggml_context * ctx) {
4162
  return ctx->mem_size;
4163
  }
4164
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4165
  // IMPORTANT:
4166
  // when creating "opt" tensors, always save and load the scratch buffer
4167
  // this is an error prone process, but it is necessary to support inplace
@@ -4645,15 +4729,25 @@ const char * ggml_get_name(const struct ggml_tensor * tensor) {
4645
  return tensor->name;
4646
  }
4647
 
4648
- void ggml_set_name(struct ggml_tensor * tensor, const char * name) {
4649
  strncpy(tensor->name, name, sizeof(tensor->name));
4650
  tensor->name[sizeof(tensor->name) - 1] = '\0';
 
 
 
 
 
 
 
 
 
4651
  }
4652
 
4653
  struct ggml_tensor * ggml_view_tensor(
4654
  struct ggml_context * ctx,
4655
  const struct ggml_tensor * src) {
4656
  struct ggml_tensor * result = ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, src->data);
 
4657
 
4658
  result->nb[0] = src->nb[0];
4659
  result->nb[1] = src->nb[1];
@@ -5426,6 +5520,40 @@ struct ggml_tensor * ggml_gelu_inplace(
5426
  return ggml_gelu_impl(ctx, a, true);
5427
  }
5428
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5429
  // ggml_silu
5430
 
5431
  struct ggml_tensor * ggml_silu_impl(
@@ -5781,6 +5909,11 @@ struct ggml_tensor * ggml_cpy_impl(
5781
 
5782
  // make a view of the destination
5783
  struct ggml_tensor * result = ggml_view_tensor(ctx, b);
 
 
 
 
 
5784
 
5785
  result->op = GGML_OP_CPY;
5786
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -5817,6 +5950,7 @@ struct ggml_tensor * ggml_cont_impl(
5817
  }
5818
 
5819
  struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
5820
 
5821
  result->op = GGML_OP_CONT;
5822
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -5860,6 +5994,7 @@ struct ggml_tensor * ggml_reshape(
5860
  }
5861
 
5862
  struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, b->n_dims, b->ne, a->data);
 
5863
 
5864
  result->op = GGML_OP_RESHAPE;
5865
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -5884,6 +6019,7 @@ struct ggml_tensor * ggml_reshape_1d(
5884
 
5885
  const int64_t ne[1] = { ne0 };
5886
  struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, ne, a->data);
 
5887
 
5888
  result->op = GGML_OP_RESHAPE;
5889
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -5909,6 +6045,7 @@ struct ggml_tensor * ggml_reshape_2d(
5909
 
5910
  const int64_t ne[2] = { ne0, ne1 };
5911
  struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, a->data);
 
5912
 
5913
  result->op = GGML_OP_RESHAPE;
5914
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -5935,6 +6072,7 @@ struct ggml_tensor * ggml_reshape_3d(
5935
 
5936
  const int64_t ne[3] = { ne0, ne1, ne2 };
5937
  struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, a->data);
 
5938
 
5939
  result->op = GGML_OP_RESHAPE;
5940
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -5963,6 +6101,7 @@ struct ggml_tensor * ggml_reshape_4d(
5963
 
5964
  const int64_t ne[4] = { ne0, ne1, ne2, ne3 };
5965
  struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, a->data);
 
5966
 
5967
  result->op = GGML_OP_RESHAPE;
5968
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -5987,10 +6126,12 @@ struct ggml_tensor * ggml_view_1d(
5987
  }
5988
 
5989
  struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, &ne0, (char *) a->data + offset);
 
5990
 
5991
  ggml_scratch_save(ctx);
5992
 
5993
  struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
 
5994
  memcpy(offs->data, &offset, 2*sizeof(int32_t));
5995
 
5996
  ggml_scratch_load(ctx);
@@ -6023,10 +6164,12 @@ struct ggml_tensor * ggml_view_2d(
6023
  const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, 1, 1 };
6024
 
6025
  struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, (char *) a->data + offset);
 
6026
 
6027
  ggml_scratch_save(ctx);
6028
 
6029
  struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
 
6030
  memcpy(offs->data, &offset, 2*sizeof(int32_t));
6031
 
6032
  ggml_scratch_load(ctx);
@@ -6065,10 +6208,12 @@ struct ggml_tensor * ggml_view_3d(
6065
  const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, 1 };
6066
 
6067
  struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, (char *) a->data + offset);
 
6068
 
6069
  ggml_scratch_save(ctx);
6070
 
6071
  struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
 
6072
  memcpy(offs->data, &offset, 2*sizeof(int32_t));
6073
 
6074
  ggml_scratch_load(ctx);
@@ -6109,10 +6254,12 @@ struct ggml_tensor * ggml_view_4d(
6109
  const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, ne3 };
6110
 
6111
  struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, (char *) a->data + offset);
 
6112
 
6113
  ggml_scratch_save(ctx);
6114
 
6115
  struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
 
6116
  memcpy(offs->data, &offset, 2*sizeof(int32_t));
6117
 
6118
  ggml_scratch_load(ctx);
@@ -6158,6 +6305,7 @@ struct ggml_tensor * ggml_permute(
6158
  }
6159
 
6160
  struct ggml_tensor * result = ggml_view_tensor(ctx, a);
 
6161
 
6162
  int ne[GGML_MAX_DIMS];
6163
  int nb[GGML_MAX_DIMS];
@@ -6217,6 +6365,7 @@ struct ggml_tensor * ggml_transpose(
6217
  }
6218
 
6219
  struct ggml_tensor * result = ggml_view_tensor(ctx, a);
 
6220
 
6221
  result->ne[0] = a->ne[1];
6222
  result->ne[1] = a->ne[0];
@@ -6625,7 +6774,7 @@ struct ggml_tensor * ggml_clamp(
6625
 
6626
  ggml_scratch_save(ctx);
6627
 
6628
- struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3);
6629
 
6630
  ((float *) b->data)[0] = min;
6631
  ((float *) b->data)[1] = max;
@@ -6640,9 +6789,9 @@ struct ggml_tensor * ggml_clamp(
6640
  return result;
6641
  }
6642
 
6643
- // ggml_conv_1d_1s
6644
 
6645
- struct ggml_tensor * ggml_conv_1d_1s(
6646
  struct ggml_context * ctx,
6647
  struct ggml_tensor * a,
6648
  struct ggml_tensor * b) {
@@ -6659,7 +6808,7 @@ struct ggml_tensor * ggml_conv_1d_1s(
6659
  const int64_t ne[4] = { b->ne[0], a->ne[2], 1, 1, };
6660
  struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
6661
 
6662
- result->op = GGML_OP_CONV_1D_1S;
6663
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6664
  result->src0 = a;
6665
  result->src1 = b;
@@ -6667,9 +6816,9 @@ struct ggml_tensor * ggml_conv_1d_1s(
6667
  return result;
6668
  }
6669
 
6670
- // ggml_conv_1d_2s
6671
 
6672
- struct ggml_tensor * ggml_conv_1d_2s(
6673
  struct ggml_context * ctx,
6674
  struct ggml_tensor * a,
6675
  struct ggml_tensor * b) {
@@ -6686,7 +6835,35 @@ struct ggml_tensor * ggml_conv_1d_2s(
6686
  const int64_t ne[4] = { b->ne[0]/2, a->ne[2], 1, 1, };
6687
  struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
6688
 
6689
- result->op = GGML_OP_CONV_1D_2S;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6690
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6691
  result->src0 = a;
6692
  result->src1 = b;
@@ -6820,6 +6997,89 @@ struct ggml_tensor * ggml_flash_attn_back(
6820
  return result;
6821
  }
6822
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6823
 
6824
  // ggml_map_unary
6825
 
@@ -7898,7 +8158,7 @@ static void ggml_compute_forward_add_q_f32(
7898
 
7899
  void * src0_row = (void *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03));
7900
  float * src1_row = (float *)((char *) src1->data + (i11*nb11 + i12*nb12 + i13*nb13));
7901
- void * dst_row = (void *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb0));
7902
 
7903
  assert(ne00 % 32 == 0);
7904
 
@@ -9459,8 +9719,65 @@ static void ggml_compute_forward_gelu(
9459
  GGML_ASSERT(false);
9460
  } break;
9461
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9462
 
9463
- //printf("XXXXXXXX gelu\n");
 
 
 
 
 
 
 
 
 
 
 
 
 
9464
  }
9465
 
9466
  // ggml_compute_forward_silu
@@ -10858,7 +11175,7 @@ static void ggml_compute_forward_set_f32(
10858
  const int im2 = (ne12 == 0 ? 0 : ne12-1);
10859
  const int im3 = (ne13 == 0 ? 0 : ne13-1);
10860
 
10861
- GGML_ASSERT(offset + im0*nb0 + im1*nb1 + im2*nb2 + im3*nb3 < ggml_nbytes(dst));
10862
 
10863
  GGML_ASSERT(nb10 == sizeof(float));
10864
 
@@ -11579,8 +11896,9 @@ static void ggml_compute_forward_alibi_f32(
11579
  const struct ggml_tensor * src1,
11580
  struct ggml_tensor * dst) {
11581
  assert(params->ith == 0);
11582
- assert(src1->type == GGML_TYPE_I32);
11583
- assert(ggml_nelements(src1) == 3);
 
11584
 
11585
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
11586
  return;
@@ -11643,8 +11961,9 @@ static void ggml_compute_forward_alibi_f16(
11643
  const struct ggml_tensor * src1,
11644
  struct ggml_tensor * dst) {
11645
  assert(params->ith == 0);
11646
- assert(src1->type == GGML_TYPE_I32);
11647
- assert(ggml_nelements(src1) == 3);
 
11648
 
11649
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
11650
  return;
@@ -11746,15 +12065,16 @@ static void ggml_compute_forward_clamp_f32(
11746
  const struct ggml_tensor * src1,
11747
  struct ggml_tensor * dst) {
11748
  assert(params->ith == 0);
11749
- assert(src1->type == GGML_TYPE_I32);
11750
- assert(ggml_nelements(src1) == 2);
 
11751
 
11752
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
11753
  return;
11754
  }
11755
 
11756
- const int min = ((float *) src1->data)[0];
11757
- const int max = ((float *) src1->data)[1];
11758
 
11759
  const int ith = params->ith;
11760
  const int nth = params->nth;
@@ -12312,9 +12632,9 @@ static void ggml_compute_forward_rope_back(
12312
  }
12313
  }
12314
 
12315
- // ggml_compute_forward_conv_1d_1s
12316
 
12317
- static void ggml_compute_forward_conv_1d_1s_f16_f32(
12318
  const struct ggml_compute_params * params,
12319
  const struct ggml_tensor * src0,
12320
  const struct ggml_tensor * src1,
@@ -12434,7 +12754,7 @@ static void ggml_compute_forward_conv_1d_1s_f16_f32(
12434
  }
12435
  }
12436
 
12437
- static void ggml_compute_forward_conv_1d_1s_f32(
12438
  const struct ggml_compute_params * params,
12439
  const struct ggml_tensor * src0,
12440
  const struct ggml_tensor * src1,
@@ -12554,7 +12874,7 @@ static void ggml_compute_forward_conv_1d_1s_f32(
12554
  }
12555
  }
12556
 
12557
- static void ggml_compute_forward_conv_1d_1s(
12558
  const struct ggml_compute_params * params,
12559
  const struct ggml_tensor * src0,
12560
  const struct ggml_tensor * src1,
@@ -12562,11 +12882,11 @@ static void ggml_compute_forward_conv_1d_1s(
12562
  switch (src0->type) {
12563
  case GGML_TYPE_F16:
12564
  {
12565
- ggml_compute_forward_conv_1d_1s_f16_f32(params, src0, src1, dst);
12566
  } break;
12567
  case GGML_TYPE_F32:
12568
  {
12569
- ggml_compute_forward_conv_1d_1s_f32(params, src0, src1, dst);
12570
  } break;
12571
  default:
12572
  {
@@ -12575,9 +12895,9 @@ static void ggml_compute_forward_conv_1d_1s(
12575
  }
12576
  }
12577
 
12578
- // ggml_compute_forward_conv_1d_2s
12579
 
12580
- static void ggml_compute_forward_conv_1d_2s_f16_f32(
12581
  const struct ggml_compute_params * params,
12582
  const struct ggml_tensor * src0,
12583
  const struct ggml_tensor * src1,
@@ -12697,7 +13017,7 @@ static void ggml_compute_forward_conv_1d_2s_f16_f32(
12697
  }
12698
  }
12699
 
12700
- static void ggml_compute_forward_conv_1d_2s_f32(
12701
  const struct ggml_compute_params * params,
12702
  const struct ggml_tensor * src0,
12703
  const struct ggml_tensor * src1,
@@ -12817,7 +13137,7 @@ static void ggml_compute_forward_conv_1d_2s_f32(
12817
  }
12818
  }
12819
 
12820
- static void ggml_compute_forward_conv_1d_2s(
12821
  const struct ggml_compute_params * params,
12822
  const struct ggml_tensor * src0,
12823
  const struct ggml_tensor * src1,
@@ -12825,11 +13145,148 @@ static void ggml_compute_forward_conv_1d_2s(
12825
  switch (src0->type) {
12826
  case GGML_TYPE_F16:
12827
  {
12828
- ggml_compute_forward_conv_1d_2s_f16_f32(params, src0, src1, dst);
12829
  } break;
12830
  case GGML_TYPE_F32:
12831
  {
12832
- ggml_compute_forward_conv_1d_2s_f32(params, src0, src1, dst);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12833
  } break;
12834
  default:
12835
  {
@@ -13932,6 +14389,145 @@ static void ggml_compute_forward_flash_attn_back(
13932
  }
13933
  }
13934
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13935
  // ggml_compute_forward_map_unary
13936
 
13937
  static void ggml_compute_forward_map_unary_f32(
@@ -14315,7 +14911,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
14315
  if (skip_cpu) {
14316
  return;
14317
  }
14318
- GGML_ASSERT(tensor->src0->backend == GGML_BACKEND_CPU);
14319
  GGML_ASSERT(tensor->src1 == NULL || tensor->src1->backend == GGML_BACKEND_CPU);
14320
  #endif // GGML_USE_CUBLAS
14321
 
@@ -14404,6 +15000,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
14404
  {
14405
  ggml_compute_forward_gelu(params, tensor->src0, tensor);
14406
  } break;
 
 
 
 
14407
  case GGML_OP_SILU:
14408
  {
14409
  ggml_compute_forward_silu(params, tensor->src0, tensor);
@@ -14508,19 +15108,23 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
14508
  {
14509
  ggml_compute_forward_clamp(params, tensor->src0, tensor->src1, tensor);
14510
  } break;
14511
- case GGML_OP_CONV_1D_1S:
 
 
 
 
14512
  {
14513
- ggml_compute_forward_conv_1d_1s(params, tensor->src0, tensor->src1, tensor);
14514
  } break;
14515
- case GGML_OP_CONV_1D_2S:
14516
  {
14517
- ggml_compute_forward_conv_1d_2s(params, tensor->src0, tensor->src1, tensor);
14518
  } break;
14519
  case GGML_OP_FLASH_ATTN:
14520
  {
14521
- int32_t t = ggml_get_i32_1d(tensor->opt[1], 0);
14522
  GGML_ASSERT(t == 0 || t == 1);
14523
- bool masked = t != 0;
14524
  ggml_compute_forward_flash_attn(params, tensor->src0, tensor->src1, tensor->opt[0], masked, tensor);
14525
  } break;
14526
  case GGML_OP_FLASH_FF:
@@ -14534,6 +15138,14 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
14534
  bool masked = t != 0;
14535
  ggml_compute_forward_flash_attn_back(params, tensor->src0, tensor->src1, tensor->opt[0], tensor->opt[1], masked, tensor);
14536
  } break;
 
 
 
 
 
 
 
 
14537
  case GGML_OP_MAP_UNARY:
14538
  {
14539
  const ggml_unary_op_f32_t fun = *((ggml_unary_op_f32_t *)tensor->opt[0]->data);
@@ -14805,6 +15417,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
14805
  {
14806
  GGML_ASSERT(false); // TODO: not implemented
14807
  } break;
 
 
 
 
14808
  case GGML_OP_ALIBI:
14809
  {
14810
  GGML_ASSERT(false); // TODO: not implemented
@@ -15167,11 +15783,15 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15167
  // noop
15168
  }
15169
  } break;
15170
- case GGML_OP_CONV_1D_1S:
 
 
 
 
15171
  {
15172
  GGML_ASSERT(false); // TODO: not implemented
15173
  } break;
15174
- case GGML_OP_CONV_1D_2S:
15175
  {
15176
  GGML_ASSERT(false); // TODO: not implemented
15177
  } break;
@@ -15340,6 +15960,8 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
15340
  {
15341
  GGML_ASSERT(false); // not supported
15342
  } break;
 
 
15343
  case GGML_OP_MAP_UNARY:
15344
  case GGML_OP_MAP_BINARY:
15345
  {
@@ -15413,7 +16035,7 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor *
15413
  GGML_ASSERT(cgraph->n_leafs < GGML_MAX_NODES);
15414
 
15415
  if (strlen(node->name) == 0) {
15416
- snprintf(node->name, sizeof(node->name), "leaf_%d", cgraph->n_leafs);
15417
  }
15418
 
15419
  cgraph->leafs[cgraph->n_leafs] = node;
@@ -15422,7 +16044,7 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor *
15422
  GGML_ASSERT(cgraph->n_nodes < GGML_MAX_NODES);
15423
 
15424
  if (strlen(node->name) == 0) {
15425
- snprintf(node->name, sizeof(node->name), "node_%d", cgraph->n_nodes);
15426
  }
15427
 
15428
  cgraph->nodes[cgraph->n_nodes] = node;
@@ -15748,6 +16370,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
15748
  } break;
15749
  case GGML_OP_MUL:
15750
  case GGML_OP_GELU:
 
15751
  case GGML_OP_SILU:
15752
  case GGML_OP_SILU_BACK:
15753
  case GGML_OP_NORM:
@@ -15854,8 +16477,8 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
15854
  {
15855
  node->n_tasks = 1; //TODO
15856
  } break;
15857
- case GGML_OP_CONV_1D_1S:
15858
- case GGML_OP_CONV_1D_2S:
15859
  {
15860
  node->n_tasks = n_threads;
15861
 
@@ -15882,6 +16505,41 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
15882
  GGML_ASSERT(false);
15883
  }
15884
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15885
  work_size = MAX(work_size, cur);
15886
  } break;
15887
  case GGML_OP_FLASH_ATTN:
@@ -15943,6 +16601,8 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
15943
 
15944
  work_size = MAX(work_size, cur);
15945
  } break;
 
 
15946
  case GGML_OP_MAP_UNARY:
15947
  case GGML_OP_MAP_BINARY:
15948
  {
@@ -16475,16 +17135,20 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context **
16475
 
16476
  if (!*ctx_data) {
16477
  fprintf(stderr, "%s: failed to create ggml context\n", __func__);
 
16478
  return result;
16479
  }
16480
  }
16481
 
16482
  data = ggml_new_tensor_1d(*ctx_data, GGML_TYPE_I8, fsize);
16483
 
16484
- const size_t ret = fread(data->data, sizeof(char), fsize, fin);
16485
- if (ret != fsize) {
16486
- fprintf(stderr, "%s: failed to read %s\n", __func__, fname);
16487
- return result;
 
 
 
16488
  }
16489
 
16490
  fclose(fin);
@@ -16764,6 +17428,26 @@ static struct ggml_tensor * ggml_graph_get_parent(const struct ggml_cgraph * cgr
16764
  return NULL;
16765
  }
16766
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16767
  void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename) {
16768
  char color[16];
16769
 
@@ -16799,7 +17483,9 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
16799
  (void *) node, color);
16800
 
16801
  if (strlen(node->name) > 0) {
16802
- fprintf(fp, "%s |", node->name);
 
 
16803
  }
16804
 
16805
  if (node->n_dims == 2) {
@@ -16808,7 +17494,6 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
16808
  fprintf(fp, "%d [%" PRId64 ", %" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], node->ne[2], GGML_OP_SYMBOL[node->op]);
16809
  }
16810
 
16811
-
16812
  if (node->grad) {
16813
  fprintf(fp, " | <g>%s\"; ]\n", GGML_OP_SYMBOL[node->grad->op]);
16814
  } else {
@@ -16827,18 +17512,29 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
16827
  (void *) node, color);
16828
 
16829
  if (strlen(node->name) > 0) {
16830
- fprintf(fp, "%s | ", node->name);
 
 
16831
  }
16832
- if (ggml_nelements(node) == 1) {
16833
- if (node->type == GGML_TYPE_I8 || node->type == GGML_TYPE_I16 || node->type == GGML_TYPE_I32) {
16834
- fprintf(fp, "%d", ggml_get_i32_1d(node, 0));
16835
- }
16836
- else {
16837
- fprintf(fp, "%.1e", (double)ggml_get_f32_1d(node, 0));
 
 
 
 
 
 
 
 
 
 
 
16838
  }
16839
- }
16840
- else {
16841
- fprintf(fp, "CONST %d [%" PRId64 ", %" PRId64 "]", i, node->ne[0], node->ne[1]);
16842
  }
16843
  fprintf(fp, "\"; ]\n");
16844
  }
@@ -16846,30 +17542,20 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
16846
  for (int i = 0; i < gb->n_nodes; i++) {
16847
  struct ggml_tensor * node = gb->nodes[i];
16848
 
16849
- struct ggml_tensor * parent = ggml_graph_get_parent(gb, node);
16850
-
16851
  if (node->src0) {
16852
- struct ggml_tensor * parent0 = ggml_graph_get_parent(gb, node->src0);
16853
-
16854
- fprintf(fp, " \"%p\":%s -> \"%p\":%s [ arrowhead = %s; style = %s; label = \"x\"; ]\n",
16855
- parent0 ? (void *) parent0 : (void *) node->src0,
16856
- parent0 ? "g" : "x",
16857
- parent ? (void *) parent : (void *) node,
16858
- parent ? "g" : "x",
16859
- parent ? "empty" : "vee",
16860
- parent ? "dashed" : "solid");
16861
  }
16862
 
16863
  if (node->src1) {
16864
- struct ggml_tensor * parent1 = ggml_graph_get_parent(gb, node->src1);
16865
-
16866
- fprintf(fp, " \"%p\":%s -> \"%p\":%s [ arrowhead = %s; style = %s; label = \"y\"; ]\n",
16867
- parent1 ? (void *) parent1 : (void *) node->src1,
16868
- parent1 ? "g" : "x",
16869
- parent ? (void *) parent : (void *) node,
16870
- parent ? "g" : "x",
16871
- parent ? "empty" : "vee",
16872
- parent ? "dashed" : "solid");
16873
  }
16874
  }
16875
 
@@ -16877,15 +17563,19 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
16877
  struct ggml_tensor * node = gb->leafs[i];
16878
 
16879
  if (node->src0) {
16880
- fprintf(fp, " \"%p\":%s -> \"%p\":%s [ label = \"x\"; ]\n",
16881
- (void *) node->src0, "x",
16882
- (void *) node, "x");
16883
  }
16884
 
16885
  if (node->src1) {
16886
- fprintf(fp, " \"%p\":%s -> \"%p\":%s [ label = \"y\"; ]\n",
16887
- (void *) node->src1, "x",
16888
- (void *) node, "x");
 
 
 
 
 
 
16889
  }
16890
  }
16891
 
@@ -17604,7 +18294,6 @@ GGML_API void ggml_opt_init(
17604
  ggml_set_zero(opt->lbfgs.g);
17605
  ggml_set_zero(opt->lbfgs.gp);
17606
  ggml_set_zero(opt->lbfgs.d);
17607
- ggml_set_zero(opt->lbfgs.pf);
17608
  if (opt->lbfgs.pf) {
17609
  ggml_set_zero(opt->lbfgs.pf);
17610
  }
 
24
  #include <stdio.h>
25
  #include <float.h>
26
  #include <limits.h>
27
+ #include <stdarg.h>
28
 
29
  #ifdef GGML_USE_METAL
30
  #include <unistd.h>
 
113
  /*#define GGML_PERF*/
114
  #define GGML_DEBUG 0
115
  #define GGML_GELU_FP16
116
+ #define GGML_GELU_QUICK_FP16
117
  #define GGML_SILU_FP16
118
 
119
  #define GGML_SOFT_MAX_UNROLL 4
 
342
  // precomputed gelu table for f16 (128 KB)
343
  static ggml_fp16_t table_gelu_f16[1 << 16];
344
 
345
+ // precomputed quick gelu table for f16 (128 KB)
346
+ static ggml_fp16_t table_gelu_quick_f16[1 << 16];
347
+
348
  // precomputed silu table for f16 (128 KB)
349
  static ggml_fp16_t table_silu_f16[1 << 16];
350
 
 
1682
  #define GGML_F32x4_REDUCE_ONE(x) vaddvq_f32(x)
1683
  #define GGML_F32x4_REDUCE(res, x) \
1684
  { \
1685
+ int offset = GGML_F32_ARR >> 1; \
1686
+ for (int i = 0; i < offset; ++i) { \
1687
+ x[i] = vaddq_f32(x[i], x[offset+i]); \
1688
  } \
1689
+ offset >>= 1; \
1690
+ for (int i = 0; i < offset; ++i) { \
1691
+ x[i] = vaddq_f32(x[i], x[offset+i]); \
1692
  } \
1693
+ offset >>= 1; \
1694
+ for (int i = 0; i < offset; ++i) { \
1695
+ x[i] = vaddq_f32(x[i], x[offset+i]); \
1696
  } \
1697
  res = GGML_F32x4_REDUCE_ONE(x[0]); \
1698
  }
 
1723
  #define GGML_F16x8_MUL vmulq_f16
1724
  #define GGML_F16x8_REDUCE(res, x) \
1725
  { \
1726
+ int offset = GGML_F16_ARR >> 1; \
1727
+ for (int i = 0; i < offset; ++i) { \
1728
+ x[i] = vaddq_f16(x[i], x[offset+i]); \
1729
  } \
1730
+ offset >>= 1; \
1731
+ for (int i = 0; i < offset; ++i) { \
1732
+ x[i] = vaddq_f16(x[i], x[offset+i]); \
1733
  } \
1734
+ offset >>= 1; \
1735
+ for (int i = 0; i < offset; ++i) { \
1736
+ x[i] = vaddq_f16(x[i], x[offset+i]); \
1737
  } \
1738
  const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 (x[0])); \
1739
  const float32x4_t t1 = vcvt_f32_f16(vget_high_f16(x[0])); \
 
1800
  #define GGML_F32x8_MUL _mm256_mul_ps
1801
  #define GGML_F32x8_REDUCE(res, x) \
1802
  { \
1803
+ int offset = GGML_F32_ARR >> 1; \
1804
+ for (int i = 0; i < offset; ++i) { \
1805
+ x[i] = _mm256_add_ps(x[i], x[offset+i]); \
1806
  } \
1807
+ offset >>= 1; \
1808
+ for (int i = 0; i < offset; ++i) { \
1809
+ x[i] = _mm256_add_ps(x[i], x[offset+i]); \
1810
  } \
1811
+ offset >>= 1; \
1812
+ for (int i = 0; i < offset; ++i) { \
1813
+ x[i] = _mm256_add_ps(x[i], x[offset+i]); \
1814
  } \
1815
  const __m128 t0 = _mm_add_ps(_mm256_castps256_ps128(x[0]), \
1816
  _mm256_extractf128_ps(x[0], 1)); \
 
1900
  #define GGML_F32x4_MUL vec_mul
1901
  #define GGML_F32x4_REDUCE(res, x) \
1902
  { \
1903
+ int offset = GGML_F32_ARR >> 1; \
1904
+ for (int i = 0; i < offset; ++i) { \
1905
+ x[i] = vec_add(x[i], x[offset+i]); \
1906
  } \
1907
+ offset >>= 1; \
1908
+ for (int i = 0; i < offset; ++i) { \
1909
+ x[i] = vec_add(x[i], x[offset+i]); \
1910
  } \
1911
+ offset >>= 1; \
1912
+ for (int i = 0; i < offset; ++i) { \
1913
+ x[i] = vec_add(x[i], x[offset+i]); \
1914
  } \
1915
  res = vec_extract(x[0], 0) + \
1916
  vec_extract(x[0], 1) + \
 
1966
  #define GGML_F32x4_MUL wasm_f32x4_mul
1967
  #define GGML_F32x4_REDUCE(res, x) \
1968
  { \
1969
+ int offset = GGML_F32_ARR >> 1; \
1970
+ for (int i = 0; i < offset; ++i) { \
1971
+ x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
1972
  } \
1973
+ offset >>= 1; \
1974
+ for (int i = 0; i < offset; ++i) { \
1975
+ x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
1976
  } \
1977
+ offset >>= 1; \
1978
+ for (int i = 0; i < offset; ++i) { \
1979
+ x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
1980
  } \
1981
  res = wasm_f32x4_extract_lane(x[0], 0) + \
1982
  wasm_f32x4_extract_lane(x[0], 1) + \
 
2031
  #define GGML_F16x4_MUL wasm_f32x4_mul
2032
  #define GGML_F16x4_REDUCE(res, x) \
2033
  { \
2034
+ int offset = GGML_F16_ARR >> 1; \
2035
+ for (int i = 0; i < offset; ++i) { \
2036
+ x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
2037
  } \
2038
+ offset >>= 1; \
2039
+ for (int i = 0; i < offset; ++i) { \
2040
+ x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
2041
  } \
2042
+ offset >>= 1; \
2043
+ for (int i = 0; i < offset; ++i) { \
2044
+ x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
2045
  } \
2046
  res = wasm_f32x4_extract_lane(x[0], 0) + \
2047
  wasm_f32x4_extract_lane(x[0], 1) + \
 
2083
  #define GGML_F32x4_MUL _mm_mul_ps
2084
  #define GGML_F32x4_REDUCE(res, x) \
2085
  { \
2086
+ int offset = GGML_F32_ARR >> 1; \
2087
+ for (int i = 0; i < offset; ++i) { \
2088
+ x[i] = _mm_add_ps(x[i], x[offset+i]); \
2089
  } \
2090
+ offset >>= 1; \
2091
+ for (int i = 0; i < offset; ++i) { \
2092
+ x[i] = _mm_add_ps(x[i], x[offset+i]); \
2093
  } \
2094
+ offset >>= 1; \
2095
+ for (int i = 0; i < offset; ++i) { \
2096
+ x[i] = _mm_add_ps(x[i], x[offset+i]); \
2097
  } \
2098
  const __m128 t0 = _mm_hadd_ps(x[0], x[0]); \
2099
  res = _mm_cvtss_f32(_mm_hadd_ps(t0, t0)); \
 
3382
  inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
3383
 
3384
  static const float GELU_COEF_A = 0.044715f;
3385
+ static const float GELU_QUICK_COEF = -1.702f;
3386
  static const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
3387
 
3388
  inline static float ggml_gelu_f32(float x) {
 
3413
  }
3414
  #endif
3415
 
3416
// Scalar "quick GELU": returns x * 1/(1 + exp(GELU_QUICK_COEF * x)).
// GELU_QUICK_COEF is the negative constant -1.702f defined earlier in this
// file, so the second factor is a logistic sigmoid of 1.702*x.
+ inline static float ggml_gelu_quick_f32(float x) {
3417
+ return x*(1.0f/(1.0f+expf(GELU_QUICK_COEF*x)));
3418
+ }
3419
+
3420
+ //inline static void ggml_vec_gelu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
3421
+ // const uint16_t * i16 = (const uint16_t *) x;
3422
+ // for (int i = 0; i < n; ++i) {
3423
+ // y[i] = table_gelu_quick_f16[i16[i]];
3424
+ // }
3425
+ //}
3426
+
3427
// Applies quick GELU element-wise: y[i] = gelu_quick(x[i]) for i in [0, n).
// Two implementations are selected at compile time by GGML_GELU_QUICK_FP16.
+ #ifdef GGML_GELU_QUICK_FP16
3428
// Table-driven variant: each input is rounded to fp16, its 16 raw bits are
// used as an index into table_gelu_quick_f16, and the stored fp16 result is
// widened back to fp32. The result therefore carries fp16 precision.
+ inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float * x) {
3429
+ uint16_t t;
3430
+ for (int i = 0; i < n; ++i) {
3431
+ ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]);
3432
// memcpy reinterprets the fp16 bit pattern as an integer table index
+ memcpy(&t, &fp16, sizeof(uint16_t));
3433
+ y[i] = GGML_FP16_TO_FP32(table_gelu_quick_f16[t]);
3434
+ }
3435
+ }
3436
+ #else
3437
// Direct variant: evaluates ggml_gelu_quick_f32 for every element.
+ inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float * x) {
3438
+ for (int i = 0; i < n; ++i) {
3439
+ y[i] = ggml_gelu_quick_f32(x[i]);
3440
+ }
3441
+ }
3442
+ #endif
3443
+
3444
  // Sigmoid Linear Unit (SiLU) function
3445
  inline static float ggml_silu_f32(float x) {
3446
  return x/(1.0f + expf(-x));
 
3671
  "STEP",
3672
  "RELU",
3673
  "GELU",
3674
+ "GELU_QUICK",
3675
  "SILU",
3676
  "SILU_BACK",
3677
  "NORM",
 
3700
  "ROPE_BACK",
3701
  "ALIBI",
3702
  "CLAMP",
3703
+ "CONV_1D_S1_PH",
3704
+ "CONV_1D_S2_PH",
3705
+ "CONV_2D_SK_P0",
3706
 
3707
  "FLASH_ATTN",
3708
  "FLASH_FF",
3709
  "FLASH_ATTN_BACK",
3710
+ "WIN_PART",
3711
+ "WIN_UNPART",
3712
 
3713
  "MAP_UNARY",
3714
  "MAP_BINARY",
 
3717
  "CROSS_ENTROPY_LOSS_BACK",
3718
  };
3719
 
3720
+ static_assert(GGML_OP_COUNT == 61, "GGML_OP_COUNT != 61");
3721
 
3722
  static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
3723
  "none",
 
3743
  "step(x)",
3744
  "relu(x)",
3745
  "gelu(x)",
3746
+ "gelu_quick(x)",
3747
  "silu(x)",
3748
  "silu_back(x)",
3749
  "norm(x)",
 
3772
  "rope_back(x)",
3773
  "alibi(x)",
3774
  "clamp(x)",
3775
+ "conv_1d_s1_ph(x)",
3776
+ "conv_1d_s2_ph(x)",
3777
+ "conv_2d_sk_p0(x)",
3778
 
3779
  "flash_attn(x)",
3780
  "flash_ff(x)",
3781
  "flash_attn_back(x)",
3782
+ "win_part(x)",
3783
+ "win_unpart(x)",
3784
 
3785
  "f(x)",
3786
  "f(x,y)",
 
3789
  "cross_entropy_loss_back(x,y)",
3790
  };
3791
 
3792
+ static_assert(GGML_OP_COUNT == 61, "GGML_OP_COUNT != 61");
3793
 
3794
  static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
3795
  static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");
 
4080
  // initialize time system (required on Windows)
4081
  ggml_time_init();
4082
 
4083
+ // initialize GELU, Quick GELU, SILU and EXP F32 tables
4084
  {
4085
  const uint64_t t_start = ggml_time_us(); UNUSED(t_start);
4086
 
 
4090
  memcpy(&ii, &ui, sizeof(ii));
4091
  const float f = table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(ii);
4092
  table_gelu_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_f32(f));
4093
+ table_gelu_quick_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_quick_f32(f));
4094
  table_silu_f16[i] = GGML_FP32_TO_FP16(ggml_silu_f32(f));
4095
  table_exp_f16[i] = GGML_FP32_TO_FP16(expf(f));
4096
  }
4097
 
4098
  const uint64_t t_end = ggml_time_us(); UNUSED(t_end);
4099
 
4100
+ GGML_PRINT_DEBUG("%s: GELU, Quick GELU, SILU and EXP tables initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
4101
  }
4102
 
4103
  // initialize g_state
 
4218
  ctx->no_alloc = no_alloc;
4219
  }
4220
 
4221
// Returns the context's mem_buffer pointer (read-only accessor; ctx is not modified).
+ void * ggml_get_mem_buffer(const struct ggml_context * ctx) {
4222
  return ctx->mem_buffer;
4223
  }
4224
 
4225
// Returns the context's mem_size field (read-only accessor; ctx is not modified).
+ size_t ggml_get_mem_size(const struct ggml_context * ctx) {
4226
  return ctx->mem_size;
4227
  }
4228
 
4229
// Returns the largest ggml_nbytes() value among the objects stored in ctx,
// or 0 when the context holds no objects.
// Walks the object list from ctx->objects_begin through obj->next.
// NOTE(review): every object is interpreted as a ggml_tensor located at
// mem_buffer + obj->offs -- confirm that contexts only ever hold tensors.
+ size_t ggml_get_max_tensor_size(const struct ggml_context * ctx) {
4230
+ size_t max_size = 0;
4231
+
4232
+ struct ggml_object * obj = ctx->objects_begin;
4233
+
4234
+ while (obj != NULL) {
4235
+ struct ggml_tensor * tensor = (struct ggml_tensor *) ((char *) ctx->mem_buffer + obj->offs);
4236
+
4237
+ const size_t size = ggml_nbytes(tensor);
4238
+
4239
// keep the running maximum
+ if (max_size < size) {
4240
+ max_size = size;
4241
+ }
4242
+
4243
+ obj = obj->next;
4244
+ }
4245
+
4246
+ return max_size;
4247
+ }
4248
+
4249
  // IMPORTANT:
4250
  // when creating "opt" tensors, always save and load the scratch buffer
4251
  // this is an error prone process, but it is necessary to support inplace
 
4729
  return tensor->name;
4730
  }
4731
 
4732
// Copies `name` into tensor->name, truncating it to fit the fixed-size
// buffer, and returns the same tensor so calls can be chained.
+ struct ggml_tensor * ggml_set_name(struct ggml_tensor * tensor, const char * name) {
4733
  strncpy(tensor->name, name, sizeof(tensor->name));
4734
// strncpy does not NUL-terminate on truncation, so terminate explicitly
  tensor->name[sizeof(tensor->name) - 1] = '\0';
4735
+ return tensor;
4736
+ }
4737
+
4738
// printf-style variant of ggml_set_name: formats `fmt` and its variadic
// arguments into tensor->name via vsnprintf, bounded by sizeof(tensor->name)
// (longer output is truncated), and returns the same tensor.
+ struct ggml_tensor * ggml_format_name(struct ggml_tensor * tensor, const char * fmt, ...) {
4739
+ va_list args;
4740
+ va_start(args, fmt);
4741
+ vsnprintf(tensor->name, sizeof(tensor->name), fmt, args);
4742
+ va_end(args);
4743
+ return tensor;
4744
  }
4745
 
4746
  struct ggml_tensor * ggml_view_tensor(
4747
  struct ggml_context * ctx,
4748
  const struct ggml_tensor * src) {
4749
  struct ggml_tensor * result = ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, src->data);
4750
+ ggml_format_name(result, "%s (view)", src->name);
4751
 
4752
  result->nb[0] = src->nb[0];
4753
  result->nb[1] = src->nb[1];
 
5520
  return ggml_gelu_impl(ctx, a, true);
5521
  }
5522
 
5523
+ // ggml_gelu_quick
5524
+
5525
// Builds a GGML_OP_GELU_QUICK graph node whose single input is `a`.
//   inplace == true : the result is a view of `a` (ggml_view_tensor)
//   inplace == false: the result is a new tensor duplicated from `a`
// A gradient tensor is attached only for the non-inplace case when `a`
// itself has a gradient.
+ struct ggml_tensor * ggml_gelu_quick_impl(
5526
+ struct ggml_context * ctx,
5527
+ struct ggml_tensor * a,
5528
+ bool inplace) {
5529
+ bool is_node = false;
5530
+
5531
// inplace results never receive a gradient tensor
+ if (!inplace && (a->grad)) {
5532
+ is_node = true;
5533
+ }
5534
+
5535
+ struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
5536
+
5537
+ result->op = GGML_OP_GELU_QUICK;
5538
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
5539
+ result->src0 = a;
5540
+ result->src1 = NULL;
5541
+
5542
+ return result;
5543
+ }
5544
+
5545
// Quick GELU of `a` as a new tensor (calls ggml_gelu_quick_impl with inplace = false).
+ struct ggml_tensor * ggml_gelu_quick(
5546
+ struct ggml_context * ctx,
5547
+ struct ggml_tensor * a) {
5548
+ return ggml_gelu_quick_impl(ctx, a, false);
5549
+ }
5550
+
5551
// Quick GELU of `a` as a view of `a` (calls ggml_gelu_quick_impl with inplace = true).
+ struct ggml_tensor * ggml_gelu_quick_inplace(
5552
+ struct ggml_context * ctx,
5553
+ struct ggml_tensor * a) {
5554
+ return ggml_gelu_quick_impl(ctx, a, true);
5555
+ }
5556
+
5557
  // ggml_silu
5558
 
5559
  struct ggml_tensor * ggml_silu_impl(
 
5909
 
5910
  // make a view of the destination
5911
  struct ggml_tensor * result = ggml_view_tensor(ctx, b);
5912
+ if (strlen(b->name) > 0) {
5913
+ ggml_format_name(result, "%s (copy of %s)", b->name, a->name);
5914
+ } else {
5915
+ ggml_format_name(result, "%s (copy)", a->name);
5916
+ }
5917
 
5918
  result->op = GGML_OP_CPY;
5919
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
 
5950
  }
5951
 
5952
  struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
5953
+ ggml_format_name(result, "%s (cont)", a->name);
5954
 
5955
  result->op = GGML_OP_CONT;
5956
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
 
5994
  }
5995
 
5996
  struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, b->n_dims, b->ne, a->data);
5997
+ ggml_format_name(result, "%s (reshaped)", a->name);
5998
 
5999
  result->op = GGML_OP_RESHAPE;
6000
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
 
6019
 
6020
  const int64_t ne[1] = { ne0 };
6021
  struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, ne, a->data);
6022
+ ggml_format_name(result, "%s (reshaped)", a->name);
6023
 
6024
  result->op = GGML_OP_RESHAPE;
6025
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
 
6045
 
6046
  const int64_t ne[2] = { ne0, ne1 };
6047
  struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, a->data);
6048
+ ggml_format_name(result, "%s (reshaped)", a->name);
6049
 
6050
  result->op = GGML_OP_RESHAPE;
6051
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
 
6072
 
6073
  const int64_t ne[3] = { ne0, ne1, ne2 };
6074
  struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, a->data);
6075
+ ggml_format_name(result, "%s (reshaped)", a->name);
6076
 
6077
  result->op = GGML_OP_RESHAPE;
6078
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
 
6101
 
6102
  const int64_t ne[4] = { ne0, ne1, ne2, ne3 };
6103
  struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, a->data);
6104
+ ggml_format_name(result, "%s (reshaped)", a->name);
6105
 
6106
  result->op = GGML_OP_RESHAPE;
6107
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
 
6126
  }
6127
 
6128
  struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, &ne0, (char *) a->data + offset);
6129
+ ggml_format_name(result, "%s (view)", a->name);
6130
 
6131
  ggml_scratch_save(ctx);
6132
 
6133
  struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
6134
+ ggml_set_name(offs, "offset");
6135
  memcpy(offs->data, &offset, 2*sizeof(int32_t));
6136
 
6137
  ggml_scratch_load(ctx);
 
6164
  const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, 1, 1 };
6165
 
6166
  struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, (char *) a->data + offset);
6167
+ ggml_format_name(result, "%s (view)", a->name);
6168
 
6169
  ggml_scratch_save(ctx);
6170
 
6171
  struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
6172
+ ggml_set_name(offs, "offset");
6173
  memcpy(offs->data, &offset, 2*sizeof(int32_t));
6174
 
6175
  ggml_scratch_load(ctx);
 
6208
  const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, 1 };
6209
 
6210
  struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, (char *) a->data + offset);
6211
+ ggml_format_name(result, "%s (view)", a->name);
6212
 
6213
  ggml_scratch_save(ctx);
6214
 
6215
  struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
6216
+ ggml_set_name(offs, "offset");
6217
  memcpy(offs->data, &offset, 2*sizeof(int32_t));
6218
 
6219
  ggml_scratch_load(ctx);
 
6254
  const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, ne3 };
6255
 
6256
  struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, (char *) a->data + offset);
6257
+ ggml_format_name(result, "%s (view)", a->name);
6258
 
6259
  ggml_scratch_save(ctx);
6260
 
6261
  struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
6262
+ ggml_set_name(offs, "offset");
6263
  memcpy(offs->data, &offset, 2*sizeof(int32_t));
6264
 
6265
  ggml_scratch_load(ctx);
 
6305
  }
6306
 
6307
  struct ggml_tensor * result = ggml_view_tensor(ctx, a);
6308
+ ggml_format_name(result, "%s (permuted)", a->name);
6309
 
6310
  int ne[GGML_MAX_DIMS];
6311
  int nb[GGML_MAX_DIMS];
 
6365
  }
6366
 
6367
  struct ggml_tensor * result = ggml_view_tensor(ctx, a);
6368
+ ggml_format_name(result, "%s (transposed)", a->name);
6369
 
6370
  result->ne[0] = a->ne[1];
6371
  result->ne[1] = a->ne[0];
 
6774
 
6775
  ggml_scratch_save(ctx);
6776
 
6777
+ struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 2);
6778
 
6779
  ((float *) b->data)[0] = min;
6780
  ((float *) b->data)[1] = max;
 
6789
  return result;
6790
  }
6791
 
6792
+ // ggml_conv_1d_s1_ph
6793
 
6794
+ struct ggml_tensor * ggml_conv_1d_s1_ph(
6795
  struct ggml_context * ctx,
6796
  struct ggml_tensor * a,
6797
  struct ggml_tensor * b) {
 
6808
  const int64_t ne[4] = { b->ne[0], a->ne[2], 1, 1, };
6809
  struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
6810
 
6811
+ result->op = GGML_OP_CONV_1D_S1_PH;
6812
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6813
  result->src0 = a;
6814
  result->src1 = b;
 
6816
  return result;
6817
  }
6818
 
6819
+ // ggml_conv_1d_s2_ph
6820
 
6821
+ struct ggml_tensor * ggml_conv_1d_s2_ph(
6822
  struct ggml_context * ctx,
6823
  struct ggml_tensor * a,
6824
  struct ggml_tensor * b) {
 
6835
  const int64_t ne[4] = { b->ne[0]/2, a->ne[2], 1, 1, };
6836
  struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
6837
 
6838
+ result->op = GGML_OP_CONV_1D_S2_PH;
6839
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6840
+ result->src0 = a;
6841
+ result->src1 = b;
6842
+
6843
+ return result;
6844
+ }
6845
+
6846
+ // ggml_conv_2d_sk_p0
6847
+
6848
+ struct ggml_tensor * ggml_conv_2d_sk_p0(
6849
+ struct ggml_context * ctx,
6850
+ struct ggml_tensor * a,
6851
+ struct ggml_tensor * b) {
6852
+ GGML_ASSERT(b->ne[3] == 1);
6853
+ GGML_ASSERT(a->ne[2] == b->ne[2]);
6854
+ GGML_ASSERT(b->ne[0] % a->ne[0] == 0);
6855
+ GGML_ASSERT(b->ne[1] % a->ne[1] == 0);
6856
+ bool is_node = false;
6857
+
6858
+ if (a->grad || b->grad) {
6859
+ GGML_ASSERT(false); // TODO: implement backward
6860
+ is_node = true;
6861
+ }
6862
+
6863
+ const int64_t ne[4] = { b->ne[0]/a->ne[0], b->ne[1]/a->ne[1], a->ne[3], 1, };
6864
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
6865
+
6866
+ result->op = GGML_OP_CONV_2D_SK_P0;
6867
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6868
  result->src0 = a;
6869
  result->src1 = b;
 
6997
  return result;
6998
  }
6999
 
7000
+ // ggml_win_part
7001
+
7002
// Builds a GGML_OP_WIN_PART node that splits `a` into windows of side `w`.
// Requirements enforced by asserts: a->ne[3] == 1 and a is F32.
// The result is an F32 tensor of shape { a->ne[0], w, w, np }, where np is
// the number of windows needed to cover dims 1 and 2 after padding each up
// to a multiple of w.
// The values (npx, npy, w) are passed to the op in a 3-element I32 tensor
// stored in result->opt[0].
// Backward is not implemented: the function asserts if `a` has a gradient.
+ struct ggml_tensor * ggml_win_part(
7003
+ struct ggml_context * ctx,
7004
+ struct ggml_tensor * a,
7005
+ int w) {
7006
+ GGML_ASSERT(a->ne[3] == 1);
7007
+ GGML_ASSERT(a->type == GGML_TYPE_F32);
7008
+
7009
+ bool is_node = false;
7010
+
7011
+ if (a->grad) {
7012
+ GGML_ASSERT(false); // TODO: implement backward
7013
+ is_node = true;
7014
+ }
7015
+
7016
+ // padding
7017
// px/py: elements to add so that ne[1]/ne[2] become multiples of w (0 if already aligned)
+ const int px = (w - a->ne[1]%w)%w;
7018
+ const int py = (w - a->ne[2]%w)%w;
7019
+
7020
// npx/npy: window counts along dims 1 and 2; np: total windows
+ const int npx = (px + a->ne[1])/w;
7021
+ const int npy = (py + a->ne[2])/w;
7022
+ const int np = npx*npy;
7023
+
7024
+ const int64_t ne[4] = { a->ne[0], w, w, np, };
7025
+
7026
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
7027
+
7028
// the parameter tensor is created between scratch save/load
+ ggml_scratch_save(ctx);
7029
+
7030
+ struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3);
7031
+
7032
+ ((int32_t *) b->data)[0] = npx;
7033
+ ((int32_t *) b->data)[1] = npy;
7034
+ ((int32_t *) b->data)[2] = w;
7035
+
7036
+ ggml_scratch_load(ctx);
7037
+
7038
+ result->op = GGML_OP_WIN_PART;
7039
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7040
+ result->src0 = a;
7041
+ result->src1 = NULL;
7042
+ result->opt[0] = b;
7043
+
7044
+ return result;
7045
+ }
7046
+
7047
+ // ggml_win_unpart
7048
+
7049
// Builds a GGML_OP_WIN_UNPART node for `a` (asserted to be F32).
// The result is a 3-dimensional F32 tensor of shape { a->ne[0], w0, h0 }.
// The window size `w` is passed to the op in a 1-element I32 tensor stored
// in result->opt[0].
// Backward is not implemented: the function asserts if `a` has a gradient.
// NOTE(review): presumably the inverse of ggml_win_part, with w0/h0 the
// original unpadded extents -- confirm against the forward implementation.
+ struct ggml_tensor * ggml_win_unpart(
7050
+ struct ggml_context * ctx,
7051
+ struct ggml_tensor * a,
7052
+ int w0,
7053
+ int h0,
7054
+ int w) {
7055
+ GGML_ASSERT(a->type == GGML_TYPE_F32);
7056
+
7057
+ bool is_node = false;
7058
+
7059
+ if (a->grad) {
7060
+ GGML_ASSERT(false); // TODO: implement backward
7061
+ is_node = true;
7062
+ }
7063
+
7064
+ const int64_t ne[4] = { a->ne[0], w0, h0, 1, };
7065
+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne);
7066
+
7067
// the parameter tensor is created between scratch save/load
+ ggml_scratch_save(ctx);
7068
+
7069
+ struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 1);
7070
+
7071
+ ((int32_t *) b->data)[0] = w;
7072
+
7073
+ ggml_scratch_load(ctx);
7074
+
7075
+ result->op = GGML_OP_WIN_UNPART;
7076
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
7077
+ result->src0 = a;
7078
+ result->src1 = NULL;
7079
+ result->opt[0] = b;
7080
+
7081
+ return result;
7082
+ }
7083
 
7084
  // ggml_map_unary
7085
 
 
8158
 
8159
  void * src0_row = (void *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03));
8160
  float * src1_row = (float *)((char *) src1->data + (i11*nb11 + i12*nb12 + i13*nb13));
8161
+ void * dst_row = (void *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb3));
8162
 
8163
  assert(ne00 % 32 == 0);
8164
 
 
9719
  GGML_ASSERT(false);
9720
  } break;
9721
  }
9722
+ }
9723
+
9724
+ // ggml_compute_forward_gelu_quick
9725
+
9726
// Forward pass of quick GELU for F32 tensors: dst = gelu_quick(src0).
// src0 and dst must be contiguous and have the same shape (asserted).
// Rows are divided among params->nth threads; the thread with index
// params->ith handles rows [ir0, ir1). Nothing is done in the INIT and
// FINALIZE task phases.
+ static void ggml_compute_forward_gelu_quick_f32(
9727
+ const struct ggml_compute_params * params,
9728
+ const struct ggml_tensor * src0,
9729
+ struct ggml_tensor * dst) {
9730
+ GGML_ASSERT(ggml_is_contiguous(src0));
9731
+ GGML_ASSERT(ggml_is_contiguous(dst));
9732
+ GGML_ASSERT(ggml_are_same_shape(src0, dst));
9733
+
9734
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
9735
+ return;
9736
+ }
9737
+
9738
+ const int ith = params->ith;
9739
+ const int nth = params->nth;
9740
+
9741
// nc: elements per row, nr: total number of rows
+ const int nc = src0->ne[0];
9742
+ const int nr = ggml_nrows(src0);
9743
+
9744
+ // rows per thread
9745
// (ceiling division, so the last thread may receive fewer rows or none)
+ const int dr = (nr + nth - 1)/nth;
9746
+
9747
+ // row range for this thread
9748
+ const int ir0 = dr*ith;
9749
+ const int ir1 = MIN(ir0 + dr, nr);
9750
+
9751
+ for (int i1 = ir0; i1 < ir1; i1++) {
9752
// row i1 of each tensor is located via its nb[1] byte stride
+ ggml_vec_gelu_quick_f32(nc,
9753
+ (float *) ((char *) dst->data + i1*( dst->nb[1])),
9754
+ (float *) ((char *) src0->data + i1*(src0->nb[1])));
9755
+
9756
// debug builds only: check that the row just written has no NaN/Inf
+ #ifndef NDEBUG
9757
+ for (int k = 0; k < nc; k++) {
9758
+ const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k];
9759
+ UNUSED(x);
9760
+ assert(!isnan(x));
9761
+ assert(!isinf(x));
9762
+ }
9763
+ #endif
9764
+ }
9765
+ }
9766
 
9767
// Type dispatcher for the quick GELU forward pass.
// Only GGML_TYPE_F32 inputs are handled; any other src0 type hits GGML_ASSERT(false).
+ static void ggml_compute_forward_gelu_quick(
9768
+ const struct ggml_compute_params * params,
9769
+ const struct ggml_tensor * src0,
9770
+ struct ggml_tensor * dst) {
9771
+ switch (src0->type) {
9772
+ case GGML_TYPE_F32:
9773
+ {
9774
+ ggml_compute_forward_gelu_quick_f32(params, src0, dst);
9775
+ } break;
9776
+ default:
9777
+ {
9778
+ GGML_ASSERT(false);
9779
+ } break;
9780
+ }
9781
  }
9782
 
9783
  // ggml_compute_forward_silu
 
11175
  const int im2 = (ne12 == 0 ? 0 : ne12-1);
11176
  const int im3 = (ne13 == 0 ? 0 : ne13-1);
11177
 
11178
+ GGML_ASSERT(offset + im0*nb0 + im1*nb1 + im2*nb2 + im3*nb3 <= ggml_nbytes(dst));
11179
 
11180
  GGML_ASSERT(nb10 == sizeof(float));
11181
 
 
11896
  const struct ggml_tensor * src1,
11897
  struct ggml_tensor * dst) {
11898
  assert(params->ith == 0);
11899
+
11900
+ GGML_ASSERT(src1->type == GGML_TYPE_I32);
11901
+ GGML_ASSERT(ggml_nelements(src1) == 3);
11902
 
11903
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
11904
  return;
 
11961
  const struct ggml_tensor * src1,
11962
  struct ggml_tensor * dst) {
11963
  assert(params->ith == 0);
11964
+
11965
+ GGML_ASSERT(src1->type == GGML_TYPE_I32);
11966
+ GGML_ASSERT(ggml_nelements(src1) == 3);
11967
 
11968
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
11969
  return;
 
12065
  const struct ggml_tensor * src1,
12066
  struct ggml_tensor * dst) {
12067
  assert(params->ith == 0);
12068
+
12069
+ GGML_ASSERT(src1->type == GGML_TYPE_F32);
12070
+ GGML_ASSERT(ggml_nelements(src1) == 2);
12071
 
12072
  if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
12073
  return;
12074
  }
12075
 
12076
+ const float min = ((float *) src1->data)[0];
12077
+ const float max = ((float *) src1->data)[1];
12078
 
12079
  const int ith = params->ith;
12080
  const int nth = params->nth;
 
12632
  }
12633
  }
12634
 
12635
+ // ggml_compute_forward_conv_1d_s1_ph
12636
 
12637
+ static void ggml_compute_forward_conv_1d_s1_ph_f16_f32(
12638
  const struct ggml_compute_params * params,
12639
  const struct ggml_tensor * src0,
12640
  const struct ggml_tensor * src1,
 
12754
  }
12755
  }
12756
 
12757
+ static void ggml_compute_forward_conv_1d_s1_ph_f32(
12758
  const struct ggml_compute_params * params,
12759
  const struct ggml_tensor * src0,
12760
  const struct ggml_tensor * src1,
 
12874
  }
12875
  }
12876
 
12877
+ static void ggml_compute_forward_conv_1d_s1_ph(
12878
  const struct ggml_compute_params * params,
12879
  const struct ggml_tensor * src0,
12880
  const struct ggml_tensor * src1,
 
12882
  switch (src0->type) {
12883
  case GGML_TYPE_F16:
12884
  {
12885
+ ggml_compute_forward_conv_1d_s1_ph_f16_f32(params, src0, src1, dst);
12886
  } break;
12887
  case GGML_TYPE_F32:
12888
  {
12889
+ ggml_compute_forward_conv_1d_s1_ph_f32(params, src0, src1, dst);
12890
  } break;
12891
  default:
12892
  {
 
12895
  }
12896
  }
12897
 
12898
+ // ggml_compute_forward_conv_1d_s2_ph
12899
 
12900
+ static void ggml_compute_forward_conv_1d_s2_ph_f16_f32(
12901
  const struct ggml_compute_params * params,
12902
  const struct ggml_tensor * src0,
12903
  const struct ggml_tensor * src1,
 
13017
  }
13018
  }
13019
 
13020
+ static void ggml_compute_forward_conv_1d_s2_ph_f32(
13021
  const struct ggml_compute_params * params,
13022
  const struct ggml_tensor * src0,
13023
  const struct ggml_tensor * src1,
 
13137
  }
13138
  }
13139
 
13140
+ static void ggml_compute_forward_conv_1d_s2_ph(
13141
  const struct ggml_compute_params * params,
13142
  const struct ggml_tensor * src0,
13143
  const struct ggml_tensor * src1,
 
13145
  switch (src0->type) {
13146
  case GGML_TYPE_F16:
13147
  {
13148
+ ggml_compute_forward_conv_1d_s2_ph_f16_f32(params, src0, src1, dst);
13149
  } break;
13150
  case GGML_TYPE_F32:
13151
  {
13152
+ ggml_compute_forward_conv_1d_s2_ph_f32(params, src0, src1, dst);
13153
+ } break;
13154
+ default:
13155
+ {
13156
+ GGML_ASSERT(false);
13157
+ } break;
13158
+ }
13159
+ }
13160
+
13161
+ // ggml_compute_forward_conv_2d_sk_p0
13162
+
13163
+ static void ggml_compute_forward_conv_2d_sk_p0_f16_f32(
13164
+ const struct ggml_compute_params * params,
13165
+ const struct ggml_tensor * src0,
13166
+ const struct ggml_tensor * src1,
13167
+ struct ggml_tensor * dst) {
13168
+ GGML_ASSERT(src0->type == GGML_TYPE_F16);
13169
+ GGML_ASSERT(src1->type == GGML_TYPE_F32);
13170
+ GGML_ASSERT( dst->type == GGML_TYPE_F32);
13171
+
13172
+ int64_t t0 = ggml_perf_time_us();
13173
+ UNUSED(t0);
13174
+
13175
+ const int ne00 = src0->ne[0];
13176
+ const int ne01 = src0->ne[1];
13177
+ const int ne02 = src0->ne[2];
13178
+ //const int ne03 = src0->ne[3];
13179
+
13180
+ const int ne10 = src1->ne[0];
13181
+ //const int ne11 = src1->ne[1];
13182
+ const int ne12 = src1->ne[2];
13183
+ //const int ne13 = src1->ne[3];
13184
+
13185
+ const int ne0 = dst->ne[0];
13186
+ const int ne1 = dst->ne[1];
13187
+ const int ne2 = dst->ne[2];
13188
+ //const int ne3 = dst->ne[3];
13189
+ //const int ne = ne0*ne1*ne2*ne3;
13190
+
13191
+ const int nb00 = src0->nb[0];
13192
+ //const int nb01 = src0->nb[1];
13193
+ //const int nb02 = src0->nb[2];
13194
+ const int nb03 = src0->nb[3];
13195
+
13196
+ const int nb10 = src1->nb[0];
13197
+ //const int nb11 = src1->nb[1];
13198
+ const int nb12 = src1->nb[2];
13199
+ //const int nb13 = src1->nb[3];
13200
+
13201
+ //const int nb0 = dst->nb[0];
13202
+ //const int nb1 = dst->nb[1];
13203
+ const int nb2 = dst->nb[2];
13204
+ //const int nb3 = dst->nb[3];
13205
+
13206
+ const int ith = params->ith;
13207
+ const int nth = params->nth;
13208
+
13209
+ const int nk0 = ne00;
13210
+ const int nk1 = ne01;
13211
+
13212
+ // size of the convolution row - the kernel size unrolled across all channels
13213
+ // round-up so it is more suitable for SIMD
13214
+ const int ew0 = ggml_up32(nk0*nk1*ne02);
13215
+
13216
+ GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
13217
+ GGML_ASSERT(nb10 == sizeof(float));
13218
+
13219
+ if (params->type == GGML_TASK_INIT) {
13220
+ // TODO: fix this memset (wsize is overestimated)
13221
+ memset(params->wdata, 0, params->wsize);
13222
+
13223
+ // prepare source data (src1)
13224
+ {
13225
+ ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
13226
+
13227
+ for (int i12 = 0; i12 < ne12; i12++) {
13228
+ const float * const src = (float *)((char *) src1->data + i12*nb12);
13229
+ ggml_fp16_t * dst_data = wdata;
13230
+
13231
+ for (int i1 = 0; i1 < ne1; i1++) {
13232
+ for (int i0 = 0; i0 < ne0; i0++) {
13233
+ for (int ik1 = 0; ik1 < nk1; ik1++) {
13234
+ for (int ik0 = 0; ik0 < nk0; ik0++) {
13235
+ dst_data[(i1*ne0 + i0)*ew0 + i12*(nk0*nk1) + ik1*nk0 + ik0] =
13236
+ GGML_FP32_TO_FP16(src[(i1*nk1 + ik1)*ne10 + (i0*nk0 + ik0)]);
13237
+ }
13238
+ }
13239
+ }
13240
+ }
13241
+ }
13242
+ }
13243
+
13244
+ return;
13245
+ }
13246
+
13247
+ if (params->type == GGML_TASK_FINALIZE) {
13248
+ return;
13249
+ }
13250
+
13251
+ // total patches in dst
13252
+ const int np = ne2;
13253
+
13254
+ // patches per thread
13255
+ const int dp = (np + nth - 1)/nth;
13256
+
13257
+ // patch range for this thread
13258
+ const int ip0 = dp*ith;
13259
+ const int ip1 = MIN(ip0 + dp, np);
13260
+
13261
+ ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
13262
+
13263
+ for (int i2 = ip0; i2 < ip1; i2++) {
13264
+ float * dst_data = (float *)((char *) dst->data + i2*nb2);
13265
+
13266
+ for (int i1 = 0; i1 < ne1; ++i1) {
13267
+ for (int i0 = 0; i0 < ne0; ++i0) {
13268
+ ggml_vec_dot_f16(ew0, dst_data + i1*ne0 + i0,
13269
+ (ggml_fp16_t *) ((char *) src0->data + i2*nb03),
13270
+ (ggml_fp16_t *) wdata + (i1*ne0 + i0)*ew0);
13271
+ }
13272
+ }
13273
+ }
13274
+ }
13275
+
13276
+ static void ggml_compute_forward_conv_2d_sk_p0(
13277
+ const struct ggml_compute_params * params,
13278
+ const struct ggml_tensor * src0,
13279
+ const struct ggml_tensor * src1,
13280
+ struct ggml_tensor * dst) {
13281
+ switch (src0->type) {
13282
+ case GGML_TYPE_F16:
13283
+ {
13284
+ ggml_compute_forward_conv_2d_sk_p0_f16_f32(params, src0, src1, dst);
13285
+ } break;
13286
+ case GGML_TYPE_F32:
13287
+ {
13288
+ //ggml_compute_forward_conv_2d_sk_p0_f32(params, src0, src1, dst);
13289
+ GGML_ASSERT(false);
13290
  } break;
13291
  default:
13292
  {
 
14389
  }
14390
  }
14391
 
14392
+ // ggml_compute_forward_win_part
14393
+
14394
+ static void ggml_compute_forward_win_part_f32(
14395
+ const struct ggml_compute_params * params,
14396
+ const struct ggml_tensor * src0,
14397
+ const struct ggml_tensor * opt0,
14398
+ struct ggml_tensor * dst) {
14399
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
14400
+ return;
14401
+ }
14402
+
14403
+ const int64_t ne00 = src0->ne[0]; UNUSED(ne00);
14404
+ const int64_t ne01 = src0->ne[1];
14405
+ const int64_t ne02 = src0->ne[2];
14406
+ const int64_t ne03 = src0->ne[3]; UNUSED(ne03);
14407
+
14408
+ const int64_t ne0 = dst->ne[0];
14409
+ const int64_t ne1 = dst->ne[1];
14410
+ const int64_t ne2 = dst->ne[2];
14411
+ const int64_t ne3 = dst->ne[3]; UNUSED(ne3);
14412
+
14413
+ const int32_t nep0 = ((const int32_t *)(opt0->data))[0];
14414
+ const int32_t nep1 = ((const int32_t *)(opt0->data))[1];
14415
+ const int32_t w = ((const int32_t *)(opt0->data))[2];
14416
+
14417
+ assert(ne00 == ne0);
14418
+ assert(ne3 == nep0*nep1);
14419
+
14420
+ // TODO: optimize / multi-thread
14421
+ for (int py = 0; py < nep1; ++py) {
14422
+ for (int px = 0; px < nep0; ++px) {
14423
+ const int64_t i3 = py*nep0 + px;
14424
+ for (int64_t i2 = 0; i2 < ne2; ++i2) {
14425
+ for (int64_t i1 = 0; i1 < ne1; ++i1) {
14426
+ for (int64_t i0 = 0; i0 < ne0; ++i0) {
14427
+ const int64_t i02 = py*w + i2;
14428
+ const int64_t i01 = px*w + i1;
14429
+ const int64_t i00 = i0;
14430
+
14431
+ const int64_t i = i3*ne2*ne1*ne0 + i2*ne1*ne0 + i1*ne0 + i0;
14432
+ const int64_t j = i02*ne01*ne00 + i01*ne00 + i00;
14433
+
14434
+ if (py*w + i2 >= ne02 || px*w + i1 >= ne01) {
14435
+ ((float *) dst->data)[i] = 0.0f;
14436
+ } else {
14437
+ ((float *) dst->data)[i] = ((float *) src0->data)[j];
14438
+ }
14439
+ }
14440
+ }
14441
+ }
14442
+ }
14443
+ }
14444
+ }
14445
+
14446
+ static void ggml_compute_forward_win_part(
14447
+ const struct ggml_compute_params * params,
14448
+ const struct ggml_tensor * src0,
14449
+ const struct ggml_tensor * opt0,
14450
+ struct ggml_tensor * dst) {
14451
+ switch (src0->type) {
14452
+ case GGML_TYPE_F32:
14453
+ {
14454
+ ggml_compute_forward_win_part_f32(params, src0, opt0, dst);
14455
+ } break;
14456
+ default:
14457
+ {
14458
+ GGML_ASSERT(false);
14459
+ } break;
14460
+ }
14461
+ }
14462
+
14463
+ // ggml_compute_forward_win_unpart
14464
+
14465
+ static void ggml_compute_forward_win_unpart_f32(
14466
+ const struct ggml_compute_params * params,
14467
+ const struct ggml_tensor * src0,
14468
+ const struct ggml_tensor * opt0,
14469
+ struct ggml_tensor * dst) {
14470
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
14471
+ return;
14472
+ }
14473
+
14474
+ const int64_t ne00 = src0->ne[0];
14475
+ const int64_t ne01 = src0->ne[1];
14476
+ const int64_t ne02 = src0->ne[2];
14477
+ //const int64_t ne03 = src0->ne[3];
14478
+
14479
+ const int64_t ne0 = dst->ne[0];
14480
+ const int64_t ne1 = dst->ne[1];
14481
+ const int64_t ne2 = dst->ne[2];
14482
+
14483
+ const int32_t w = ((const int32_t *)(opt0->data))[0];
14484
+
14485
+ // padding
14486
+ const int px = (w - ne1%w)%w;
14487
+ //const int py = (w - ne2%w)%w;
14488
+
14489
+ const int npx = (px + ne1)/w;
14490
+ //const int npy = (py + ne2)/w;
14491
+
14492
+ assert(ne0 == ne00);
14493
+
14494
+ // TODO: optimize / multi-thread
14495
+ for (int64_t i2 = 0; i2 < ne2; ++i2) {
14496
+ for (int64_t i1 = 0; i1 < ne1; ++i1) {
14497
+ for (int64_t i0 = 0; i0 < ne0; ++i0) {
14498
+ const int ip2 = i2/w;
14499
+ const int ip1 = i1/w;
14500
+
14501
+ const int64_t i02 = i2%w;
14502
+ const int64_t i01 = i1%w;
14503
+ const int64_t i00 = i0;
14504
+
14505
+ const int64_t i = (ip2*npx + ip1)*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00 + i00;
14506
+ const int64_t j = i2*ne1*ne0 + i1*ne0 + i0;
14507
+
14508
+ ((float *) dst->data)[j] = ((float *) src0->data)[i];
14509
+ }
14510
+ }
14511
+ }
14512
+ }
14513
+
14514
+ static void ggml_compute_forward_win_unpart(
14515
+ const struct ggml_compute_params * params,
14516
+ const struct ggml_tensor * src0,
14517
+ const struct ggml_tensor * opt0,
14518
+ struct ggml_tensor * dst) {
14519
+ switch (src0->type) {
14520
+ case GGML_TYPE_F32:
14521
+ {
14522
+ ggml_compute_forward_win_unpart_f32(params, src0, opt0, dst);
14523
+ } break;
14524
+ default:
14525
+ {
14526
+ GGML_ASSERT(false);
14527
+ } break;
14528
+ }
14529
+ }
14530
+
14531
  // ggml_compute_forward_map_unary
14532
 
14533
  static void ggml_compute_forward_map_unary_f32(
 
14911
  if (skip_cpu) {
14912
  return;
14913
  }
14914
+ GGML_ASSERT(tensor->src0 == NULL || tensor->src0->backend == GGML_BACKEND_CPU);
14915
  GGML_ASSERT(tensor->src1 == NULL || tensor->src1->backend == GGML_BACKEND_CPU);
14916
  #endif // GGML_USE_CUBLAS
14917
 
 
15000
  {
15001
  ggml_compute_forward_gelu(params, tensor->src0, tensor);
15002
  } break;
15003
+ case GGML_OP_GELU_QUICK:
15004
+ {
15005
+ ggml_compute_forward_gelu_quick(params, tensor->src0, tensor);
15006
+ } break;
15007
  case GGML_OP_SILU:
15008
  {
15009
  ggml_compute_forward_silu(params, tensor->src0, tensor);
 
15108
  {
15109
  ggml_compute_forward_clamp(params, tensor->src0, tensor->src1, tensor);
15110
  } break;
15111
+ case GGML_OP_CONV_1D_S1_PH:
15112
+ {
15113
+ ggml_compute_forward_conv_1d_s1_ph(params, tensor->src0, tensor->src1, tensor);
15114
+ } break;
15115
+ case GGML_OP_CONV_1D_S2_PH:
15116
  {
15117
+ ggml_compute_forward_conv_1d_s2_ph(params, tensor->src0, tensor->src1, tensor);
15118
  } break;
15119
+ case GGML_OP_CONV_2D_SK_P0:
15120
  {
15121
+ ggml_compute_forward_conv_2d_sk_p0(params, tensor->src0, tensor->src1, tensor);
15122
  } break;
15123
  case GGML_OP_FLASH_ATTN:
15124
  {
15125
+ const int32_t t = ggml_get_i32_1d(tensor->opt[1], 0);
15126
  GGML_ASSERT(t == 0 || t == 1);
15127
+ const bool masked = t != 0;
15128
  ggml_compute_forward_flash_attn(params, tensor->src0, tensor->src1, tensor->opt[0], masked, tensor);
15129
  } break;
15130
  case GGML_OP_FLASH_FF:
 
15138
  bool masked = t != 0;
15139
  ggml_compute_forward_flash_attn_back(params, tensor->src0, tensor->src1, tensor->opt[0], tensor->opt[1], masked, tensor);
15140
  } break;
15141
+ case GGML_OP_WIN_PART:
15142
+ {
15143
+ ggml_compute_forward_win_part(params, tensor->src0, tensor->opt[0], tensor);
15144
+ } break;
15145
+ case GGML_OP_WIN_UNPART:
15146
+ {
15147
+ ggml_compute_forward_win_unpart(params, tensor->src0, tensor->opt[0], tensor);
15148
+ } break;
15149
  case GGML_OP_MAP_UNARY:
15150
  {
15151
  const ggml_unary_op_f32_t fun = *((ggml_unary_op_f32_t *)tensor->opt[0]->data);
 
15417
  {
15418
  GGML_ASSERT(false); // TODO: not implemented
15419
  } break;
15420
+ case GGML_OP_GELU_QUICK:
15421
+ {
15422
+ GGML_ASSERT(false); // TODO: not implemented
15423
+ } break;
15424
  case GGML_OP_ALIBI:
15425
  {
15426
  GGML_ASSERT(false); // TODO: not implemented
 
15783
  // noop
15784
  }
15785
  } break;
15786
+ case GGML_OP_CONV_1D_S1_PH:
15787
+ {
15788
+ GGML_ASSERT(false); // TODO: not implemented
15789
+ } break;
15790
+ case GGML_OP_CONV_1D_S2_PH:
15791
  {
15792
  GGML_ASSERT(false); // TODO: not implemented
15793
  } break;
15794
+ case GGML_OP_CONV_2D_SK_P0:
15795
  {
15796
  GGML_ASSERT(false); // TODO: not implemented
15797
  } break;
 
15960
  {
15961
  GGML_ASSERT(false); // not supported
15962
  } break;
15963
+ case GGML_OP_WIN_PART:
15964
+ case GGML_OP_WIN_UNPART:
15965
  case GGML_OP_MAP_UNARY:
15966
  case GGML_OP_MAP_BINARY:
15967
  {
 
16035
  GGML_ASSERT(cgraph->n_leafs < GGML_MAX_NODES);
16036
 
16037
  if (strlen(node->name) == 0) {
16038
+ ggml_format_name(node, "leaf_%d", cgraph->n_leafs);
16039
  }
16040
 
16041
  cgraph->leafs[cgraph->n_leafs] = node;
 
16044
  GGML_ASSERT(cgraph->n_nodes < GGML_MAX_NODES);
16045
 
16046
  if (strlen(node->name) == 0) {
16047
+ ggml_format_name(node, "node_%d", cgraph->n_nodes);
16048
  }
16049
 
16050
  cgraph->nodes[cgraph->n_nodes] = node;
 
16370
  } break;
16371
  case GGML_OP_MUL:
16372
  case GGML_OP_GELU:
16373
+ case GGML_OP_GELU_QUICK:
16374
  case GGML_OP_SILU:
16375
  case GGML_OP_SILU_BACK:
16376
  case GGML_OP_NORM:
 
16477
  {
16478
  node->n_tasks = 1; //TODO
16479
  } break;
16480
+ case GGML_OP_CONV_1D_S1_PH:
16481
+ case GGML_OP_CONV_1D_S2_PH:
16482
  {
16483
  node->n_tasks = n_threads;
16484
 
 
16505
  GGML_ASSERT(false);
16506
  }
16507
 
16508
+ work_size = MAX(work_size, cur);
16509
+ } break;
16510
+ case GGML_OP_CONV_2D_SK_P0:
16511
+ {
16512
+ node->n_tasks = n_threads;
16513
+
16514
+ GGML_ASSERT(node->src1->ne[3] == 1);
16515
+
16516
+ const int64_t ne00 = node->src0->ne[0]; // W
16517
+ const int64_t ne01 = node->src0->ne[1]; // H
16518
+ const int64_t ne02 = node->src0->ne[2]; // C
16519
+ const int64_t ne03 = node->src0->ne[3]; // N
16520
+
16521
+ const int64_t ne10 = node->src1->ne[0]; // W
16522
+ const int64_t ne11 = node->src1->ne[1]; // H
16523
+ const int64_t ne12 = node->src1->ne[2]; // C
16524
+
16525
+ const int64_t nk = ne00*ne01;
16526
+
16527
+ UNUSED(ne02);
16528
+ UNUSED(ne03);
16529
+ UNUSED(nk);
16530
+
16531
+ size_t cur = 0;
16532
+
16533
+ if (node->src0->type == GGML_TYPE_F16 &&
16534
+ node->src1->type == GGML_TYPE_F32) {
16535
+ cur = sizeof(ggml_fp16_t)*(ne10*ne11*ne12);
16536
+ } else if (node->src0->type == GGML_TYPE_F32 &&
16537
+ node->src1->type == GGML_TYPE_F32) {
16538
+ cur = sizeof(float)* (ne10*ne11*ne12);
16539
+ } else {
16540
+ GGML_ASSERT(false);
16541
+ }
16542
+
16543
  work_size = MAX(work_size, cur);
16544
  } break;
16545
  case GGML_OP_FLASH_ATTN:
 
16601
 
16602
  work_size = MAX(work_size, cur);
16603
  } break;
16604
+ case GGML_OP_WIN_PART:
16605
+ case GGML_OP_WIN_UNPART:
16606
  case GGML_OP_MAP_UNARY:
16607
  case GGML_OP_MAP_BINARY:
16608
  {
 
17135
 
17136
  if (!*ctx_data) {
17137
  fprintf(stderr, "%s: failed to create ggml context\n", __func__);
17138
+ fclose(fin);
17139
  return result;
17140
  }
17141
  }
17142
 
17143
  data = ggml_new_tensor_1d(*ctx_data, GGML_TYPE_I8, fsize);
17144
 
17145
+ {
17146
+ const size_t ret = fread(data->data, sizeof(char), fsize, fin);
17147
+ if (ret != fsize) {
17148
+ fprintf(stderr, "%s: failed to read %s\n", __func__, fname);
17149
+ fclose(fin);
17150
+ return result;
17151
+ }
17152
  }
17153
 
17154
  fclose(fin);
 
17428
  return NULL;
17429
  }
17430
 
17431
+ static void ggml_graph_dump_dot_node_edge(FILE * fp, const struct ggml_cgraph * gb, struct ggml_tensor * node, struct ggml_tensor * parent, const char * label) {
17432
+ struct ggml_tensor * gparent = ggml_graph_get_parent(gb, node);
17433
+ struct ggml_tensor * gparent0 = ggml_graph_get_parent(gb, parent);
17434
+ fprintf(fp, " \"%p\":%s -> \"%p\":%s [ arrowhead = %s; style = %s; label = \"%s\"; ]\n",
17435
+ gparent0 ? (void *) gparent0 : (void *) parent,
17436
+ gparent0 ? "g" : "x",
17437
+ gparent ? (void *) gparent : (void *) node,
17438
+ gparent ? "g" : "x",
17439
+ gparent ? "empty" : "vee",
17440
+ gparent ? "dashed" : "solid",
17441
+ label);
17442
+ }
17443
+
17444
+ static void ggml_graph_dump_dot_leaf_edge(FILE * fp, struct ggml_tensor * node, struct ggml_tensor * parent, const char * label) {
17445
+ fprintf(fp, " \"%p\":%s -> \"%p\":%s [ label = \"%s\"; ]\n",
17446
+ (void *) parent, "x",
17447
+ (void *) node, "x",
17448
+ label);
17449
+ }
17450
+
17451
  void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename) {
17452
  char color[16];
17453
 
 
17483
  (void *) node, color);
17484
 
17485
  if (strlen(node->name) > 0) {
17486
+ fprintf(fp, "%s (%s)|", node->name, ggml_type_name(node->type));
17487
+ } else {
17488
+ fprintf(fp, "(%s)|", ggml_type_name(node->type));
17489
  }
17490
 
17491
  if (node->n_dims == 2) {
 
17494
  fprintf(fp, "%d [%" PRId64 ", %" PRId64 ", %" PRId64 "] | <x>%s", i, node->ne[0], node->ne[1], node->ne[2], GGML_OP_SYMBOL[node->op]);
17495
  }
17496
 
 
17497
  if (node->grad) {
17498
  fprintf(fp, " | <g>%s\"; ]\n", GGML_OP_SYMBOL[node->grad->op]);
17499
  } else {
 
17512
  (void *) node, color);
17513
 
17514
  if (strlen(node->name) > 0) {
17515
+ fprintf(fp, "%s (%s)|", node->name, ggml_type_name(node->type));
17516
+ } else {
17517
+ fprintf(fp, "(%s)|", ggml_type_name(node->type));
17518
  }
17519
+
17520
+ fprintf(fp, "CONST %d [%" PRId64 ", %" PRId64 "]", i, node->ne[0], node->ne[1]);
17521
+ if (ggml_nelements(node) < 5) {
17522
+ fprintf(fp, " | (");
17523
+ for (int j = 0; j < ggml_nelements(node); j++) {
17524
+ if (node->type == GGML_TYPE_I8 || node->type == GGML_TYPE_I16 || node->type == GGML_TYPE_I32) {
17525
+ fprintf(fp, "%d", ggml_get_i32_1d(node, j));
17526
+ }
17527
+ else if (node->type == GGML_TYPE_F32 || node->type == GGML_TYPE_F16) {
17528
+ fprintf(fp, "%.1e", (double)ggml_get_f32_1d(node, j));
17529
+ }
17530
+ else {
17531
+ fprintf(fp, "#");
17532
+ }
17533
+ if (j < ggml_nelements(node) - 1) {
17534
+ fprintf(fp, ", ");
17535
+ }
17536
  }
17537
+ fprintf(fp, ")");
 
 
17538
  }
17539
  fprintf(fp, "\"; ]\n");
17540
  }
 
17542
  for (int i = 0; i < gb->n_nodes; i++) {
17543
  struct ggml_tensor * node = gb->nodes[i];
17544
 
 
 
17545
  if (node->src0) {
17546
+ ggml_graph_dump_dot_node_edge(fp, gb, node, node->src0, "x");
 
 
 
 
 
 
 
 
17547
  }
17548
 
17549
  if (node->src1) {
17550
+ ggml_graph_dump_dot_node_edge(fp, gb, node, node->src1, "y");
17551
+ }
17552
+
17553
+ for (int j = 0; j < GGML_MAX_OPT; j++) {
17554
+ if (node->opt[j]) {
17555
+ char label[16];
17556
+ snprintf(label, sizeof(label), "opt %d", j);
17557
+ ggml_graph_dump_dot_node_edge(fp, gb, node, node->opt[j], label);
17558
+ }
17559
  }
17560
  }
17561
 
 
17563
  struct ggml_tensor * node = gb->leafs[i];
17564
 
17565
  if (node->src0) {
17566
+ ggml_graph_dump_dot_leaf_edge(fp, node, node->src0, "x");
 
 
17567
  }
17568
 
17569
  if (node->src1) {
17570
+ ggml_graph_dump_dot_leaf_edge(fp, node, node->src1, "y");
17571
+ }
17572
+
17573
+ for (int j = 0; j < GGML_MAX_OPT; j++) {
17574
+ if (node->opt[j]) {
17575
+ char label[16];
17576
+ snprintf(label, sizeof(label), "opt %d", j);
17577
+ ggml_graph_dump_dot_leaf_edge(fp, node, node->opt[j], label);
17578
+ }
17579
  }
17580
  }
17581
 
 
18294
  ggml_set_zero(opt->lbfgs.g);
18295
  ggml_set_zero(opt->lbfgs.gp);
18296
  ggml_set_zero(opt->lbfgs.d);
 
18297
  if (opt->lbfgs.pf) {
18298
  ggml_set_zero(opt->lbfgs.pf);
18299
  }
ggml.h CHANGED
@@ -303,6 +303,7 @@ extern "C" {
303
  GGML_OP_STEP,
304
  GGML_OP_RELU,
305
  GGML_OP_GELU,
 
306
  GGML_OP_SILU,
307
  GGML_OP_SILU_BACK,
308
  GGML_OP_NORM, // normalize
@@ -331,12 +332,15 @@ extern "C" {
331
  GGML_OP_ROPE_BACK,
332
  GGML_OP_ALIBI,
333
  GGML_OP_CLAMP,
334
- GGML_OP_CONV_1D_1S,
335
- GGML_OP_CONV_1D_2S,
 
336
 
337
  GGML_OP_FLASH_ATTN,
338
  GGML_OP_FLASH_FF,
339
  GGML_OP_FLASH_ATTN_BACK,
 
 
340
 
341
  GGML_OP_MAP_UNARY,
342
  GGML_OP_MAP_BINARY,
@@ -500,8 +504,9 @@ extern "C" {
500
  GGML_API size_t ggml_set_scratch (struct ggml_context * ctx, struct ggml_scratch scratch);
501
  GGML_API void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc);
502
 
503
- GGML_API void * ggml_get_mem_buffer(struct ggml_context * ctx);
504
- GGML_API size_t ggml_get_mem_size (struct ggml_context * ctx);
 
505
 
506
  GGML_API struct ggml_tensor * ggml_new_tensor(
507
  struct ggml_context * ctx,
@@ -556,8 +561,9 @@ extern "C" {
556
  GGML_API void * ggml_get_data (const struct ggml_tensor * tensor);
557
  GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
558
 
559
- GGML_API const char * ggml_get_name(const struct ggml_tensor * tensor);
560
- GGML_API void ggml_set_name(struct ggml_tensor * tensor, const char * name);
 
561
 
562
  //
563
  // operations on tensors with backpropagation
@@ -610,24 +616,47 @@ extern "C" {
610
  struct ggml_tensor * a,
611
  struct ggml_tensor * b);
612
 
 
 
 
 
 
613
  GGML_API struct ggml_tensor * ggml_mul(
614
  struct ggml_context * ctx,
615
  struct ggml_tensor * a,
616
  struct ggml_tensor * b);
617
 
 
 
 
 
 
618
  GGML_API struct ggml_tensor * ggml_div(
619
  struct ggml_context * ctx,
620
  struct ggml_tensor * a,
621
  struct ggml_tensor * b);
622
 
 
 
 
 
 
623
  GGML_API struct ggml_tensor * ggml_sqr(
624
  struct ggml_context * ctx,
625
  struct ggml_tensor * a);
626
 
 
 
 
 
627
  GGML_API struct ggml_tensor * ggml_sqrt(
628
  struct ggml_context * ctx,
629
  struct ggml_tensor * a);
630
 
 
 
 
 
631
  GGML_API struct ggml_tensor * ggml_log(
632
  struct ggml_context * ctx,
633
  struct ggml_tensor * a);
@@ -667,31 +696,67 @@ extern "C" {
667
  struct ggml_context * ctx,
668
  struct ggml_tensor * a);
669
 
 
 
 
 
670
  GGML_API struct ggml_tensor * ggml_sgn(
671
  struct ggml_context * ctx,
672
  struct ggml_tensor * a);
673
 
 
 
 
 
674
  GGML_API struct ggml_tensor * ggml_neg(
675
  struct ggml_context * ctx,
676
  struct ggml_tensor * a);
677
 
 
 
 
 
678
  GGML_API struct ggml_tensor * ggml_step(
679
  struct ggml_context * ctx,
680
  struct ggml_tensor * a);
681
 
 
 
 
 
682
  GGML_API struct ggml_tensor * ggml_relu(
683
  struct ggml_context * ctx,
684
  struct ggml_tensor * a);
685
 
 
 
 
 
686
  // TODO: double-check this computation is correct
687
  GGML_API struct ggml_tensor * ggml_gelu(
688
  struct ggml_context * ctx,
689
  struct ggml_tensor * a);
690
 
 
 
 
 
 
 
 
 
 
 
 
 
691
  GGML_API struct ggml_tensor * ggml_silu(
692
  struct ggml_context * ctx,
693
  struct ggml_tensor * a);
694
 
 
 
 
 
695
  // a - x
696
  // b - dy
697
  GGML_API struct ggml_tensor * ggml_silu_back(
@@ -705,10 +770,18 @@ extern "C" {
705
  struct ggml_context * ctx,
706
  struct ggml_tensor * a);
707
 
 
 
 
 
708
  GGML_API struct ggml_tensor * ggml_rms_norm(
709
  struct ggml_context * ctx,
710
  struct ggml_tensor * a);
711
 
 
 
 
 
712
  // a - x
713
  // b - dy
714
  GGML_API struct ggml_tensor * ggml_rms_norm_back(
@@ -998,16 +1071,55 @@ extern "C" {
998
  float min,
999
  float max);
1000
 
1001
- // padding = 1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1002
  // TODO: we don't support extra parameters for now
1003
  // that's why we are hard-coding the stride, padding, and dilation
1004
  // not great ..
1005
- GGML_API struct ggml_tensor * ggml_conv_1d_1s(
 
 
 
 
 
1006
  struct ggml_context * ctx,
1007
  struct ggml_tensor * a,
1008
  struct ggml_tensor * b);
1009
 
1010
- GGML_API struct ggml_tensor * ggml_conv_1d_2s(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1011
  struct ggml_context * ctx,
1012
  struct ggml_tensor * a,
1013
  struct ggml_tensor * b);
@@ -1035,6 +1147,26 @@ extern "C" {
1035
  struct ggml_tensor * c0,
1036
  struct ggml_tensor * c1);
1037
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1038
  // Mapping operations
1039
  typedef void (*ggml_unary_op_f32_t)(const int, float *, const float *);
1040
  typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
 
303
  GGML_OP_STEP,
304
  GGML_OP_RELU,
305
  GGML_OP_GELU,
306
+ GGML_OP_GELU_QUICK,
307
  GGML_OP_SILU,
308
  GGML_OP_SILU_BACK,
309
  GGML_OP_NORM, // normalize
 
332
  GGML_OP_ROPE_BACK,
333
  GGML_OP_ALIBI,
334
  GGML_OP_CLAMP,
335
+ GGML_OP_CONV_1D_S1_PH,
336
+ GGML_OP_CONV_1D_S2_PH,
337
+ GGML_OP_CONV_2D_SK_P0,
338
 
339
  GGML_OP_FLASH_ATTN,
340
  GGML_OP_FLASH_FF,
341
  GGML_OP_FLASH_ATTN_BACK,
342
+ GGML_OP_WIN_PART,
343
+ GGML_OP_WIN_UNPART,
344
 
345
  GGML_OP_MAP_UNARY,
346
  GGML_OP_MAP_BINARY,
 
504
  GGML_API size_t ggml_set_scratch (struct ggml_context * ctx, struct ggml_scratch scratch);
505
  GGML_API void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc);
506
 
507
+ GGML_API void * ggml_get_mem_buffer (const struct ggml_context * ctx);
508
+ GGML_API size_t ggml_get_mem_size (const struct ggml_context * ctx);
509
+ GGML_API size_t ggml_get_max_tensor_size(const struct ggml_context * ctx);
510
 
511
  GGML_API struct ggml_tensor * ggml_new_tensor(
512
  struct ggml_context * ctx,
 
561
  GGML_API void * ggml_get_data (const struct ggml_tensor * tensor);
562
  GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
563
 
564
+ GGML_API const char * ggml_get_name(const struct ggml_tensor * tensor);
565
+ GGML_API struct ggml_tensor * ggml_set_name(struct ggml_tensor * tensor, const char * name);
566
+ GGML_API struct ggml_tensor * ggml_format_name(struct ggml_tensor * tensor, const char * fmt, ...);
567
 
568
  //
569
  // operations on tensors with backpropagation
 
616
  struct ggml_tensor * a,
617
  struct ggml_tensor * b);
618
 
619
+ GGML_API struct ggml_tensor * ggml_sub_inplace(
620
+ struct ggml_context * ctx,
621
+ struct ggml_tensor * a,
622
+ struct ggml_tensor * b);
623
+
624
  GGML_API struct ggml_tensor * ggml_mul(
625
  struct ggml_context * ctx,
626
  struct ggml_tensor * a,
627
  struct ggml_tensor * b);
628
 
629
+ GGML_API struct ggml_tensor * ggml_mul_inplace(
630
+ struct ggml_context * ctx,
631
+ struct ggml_tensor * a,
632
+ struct ggml_tensor * b);
633
+
634
  GGML_API struct ggml_tensor * ggml_div(
635
  struct ggml_context * ctx,
636
  struct ggml_tensor * a,
637
  struct ggml_tensor * b);
638
 
639
+ GGML_API struct ggml_tensor * ggml_div_inplace(
640
+ struct ggml_context * ctx,
641
+ struct ggml_tensor * a,
642
+ struct ggml_tensor * b);
643
+
644
  GGML_API struct ggml_tensor * ggml_sqr(
645
  struct ggml_context * ctx,
646
  struct ggml_tensor * a);
647
 
648
+ GGML_API struct ggml_tensor * ggml_sqr_inplace(
649
+ struct ggml_context * ctx,
650
+ struct ggml_tensor * a);
651
+
652
  GGML_API struct ggml_tensor * ggml_sqrt(
653
  struct ggml_context * ctx,
654
  struct ggml_tensor * a);
655
 
656
+ GGML_API struct ggml_tensor * ggml_sqrt_inplace(
657
+ struct ggml_context * ctx,
658
+ struct ggml_tensor * a);
659
+
660
  GGML_API struct ggml_tensor * ggml_log(
661
  struct ggml_context * ctx,
662
  struct ggml_tensor * a);
 
696
  struct ggml_context * ctx,
697
  struct ggml_tensor * a);
698
 
699
+ GGML_API struct ggml_tensor * ggml_abs_inplace(
700
+ struct ggml_context * ctx,
701
+ struct ggml_tensor * a);
702
+
703
  GGML_API struct ggml_tensor * ggml_sgn(
704
  struct ggml_context * ctx,
705
  struct ggml_tensor * a);
706
 
707
+ GGML_API struct ggml_tensor * ggml_sgn_inplace(
708
+ struct ggml_context * ctx,
709
+ struct ggml_tensor * a);
710
+
711
  GGML_API struct ggml_tensor * ggml_neg(
712
  struct ggml_context * ctx,
713
  struct ggml_tensor * a);
714
 
715
+ GGML_API struct ggml_tensor * ggml_neg_inplace(
716
+ struct ggml_context * ctx,
717
+ struct ggml_tensor * a);
718
+
719
  GGML_API struct ggml_tensor * ggml_step(
720
  struct ggml_context * ctx,
721
  struct ggml_tensor * a);
722
 
723
+ GGML_API struct ggml_tensor * ggml_step_inplace(
724
+ struct ggml_context * ctx,
725
+ struct ggml_tensor * a);
726
+
727
  GGML_API struct ggml_tensor * ggml_relu(
728
  struct ggml_context * ctx,
729
  struct ggml_tensor * a);
730
 
731
+ GGML_API struct ggml_tensor * ggml_relu_inplace(
732
+ struct ggml_context * ctx,
733
+ struct ggml_tensor * a);
734
+
735
  // TODO: double-check this computation is correct
736
  GGML_API struct ggml_tensor * ggml_gelu(
737
  struct ggml_context * ctx,
738
  struct ggml_tensor * a);
739
 
740
+ GGML_API struct ggml_tensor * ggml_gelu_inplace(
741
+ struct ggml_context * ctx,
742
+ struct ggml_tensor * a);
743
+
744
+ GGML_API struct ggml_tensor * ggml_gelu_quick(
745
+ struct ggml_context * ctx,
746
+ struct ggml_tensor * a);
747
+
748
+ GGML_API struct ggml_tensor * ggml_gelu_quick_inplace(
749
+ struct ggml_context * ctx,
750
+ struct ggml_tensor * a);
751
+
752
  GGML_API struct ggml_tensor * ggml_silu(
753
  struct ggml_context * ctx,
754
  struct ggml_tensor * a);
755
 
756
+ GGML_API struct ggml_tensor * ggml_silu_inplace(
757
+ struct ggml_context * ctx,
758
+ struct ggml_tensor * a);
759
+
760
  // a - x
761
  // b - dy
762
  GGML_API struct ggml_tensor * ggml_silu_back(
 
770
  struct ggml_context * ctx,
771
  struct ggml_tensor * a);
772
 
773
+ GGML_API struct ggml_tensor * ggml_norm_inplace(
774
+ struct ggml_context * ctx,
775
+ struct ggml_tensor * a);
776
+
777
  GGML_API struct ggml_tensor * ggml_rms_norm(
778
  struct ggml_context * ctx,
779
  struct ggml_tensor * a);
780
 
781
+ GGML_API struct ggml_tensor * ggml_rms_norm_inplace(
782
+ struct ggml_context * ctx,
783
+ struct ggml_tensor * a);
784
+
785
  // a - x
786
  // b - dy
787
  GGML_API struct ggml_tensor * ggml_rms_norm_back(
 
1071
  float min,
1072
  float max);
1073
 
1074
+ // TODO: implement general-purpose convolutions
1075
+ // GGML_API struct ggml_tensor * ggml_conv_1d(
1076
+ // struct ggml_context * ctx,
1077
+ // struct ggml_tensor * a,
1078
+ // struct ggml_tensor * b,
1079
+ // int s0
1080
+ // int p0,
1081
+ // int d0);
1082
+ //
1083
+ // GGML_API struct ggml_tensor * ggml_conv_2d(
1084
+ // struct ggml_context * ctx,
1085
+ // struct ggml_tensor * a,
1086
+ // struct ggml_tensor * b,
1087
+ // int s0,
1088
+ // int s1,
1089
+ // int p0,
1090
+ // int p1,
1091
+ // int d0,
1092
+ // int d1);
1093
+
1094
+ // padding = half
1095
  // TODO: we don't support extra parameters for now
1096
  // that's why we are hard-coding the stride, padding, and dilation
1097
  // not great ..
1098
+ // example:
1099
+ // a: 3 80 768 1
1100
+ // b: 3000 80 1 1
1101
+ // res: 3000 768 1 1
1102
+ // used in whisper
1103
+ GGML_API struct ggml_tensor * ggml_conv_1d_s1_ph(
1104
  struct ggml_context * ctx,
1105
  struct ggml_tensor * a,
1106
  struct ggml_tensor * b);
1107
 
1108
+ // used in whisper
1109
+ GGML_API struct ggml_tensor * ggml_conv_1d_s2_ph(
1110
+ struct ggml_context * ctx,
1111
+ struct ggml_tensor * a,
1112
+ struct ggml_tensor * b);
1113
+
1114
+ // kernel size is a->ne[0] x a->ne[1]
1115
+ // stride is equal to kernel size
1116
+ // padding is zero
1117
+ // example:
1118
+ // a: 16 16 3 768
1119
+ // b: 1024 1024 3 1
1120
+ // res: 64 64 768 1
1121
+ // used in sam
1122
+ GGML_API struct ggml_tensor * ggml_conv_2d_sk_p0(
1123
  struct ggml_context * ctx,
1124
  struct ggml_tensor * a,
1125
  struct ggml_tensor * b);
 
1147
  struct ggml_tensor * c0,
1148
  struct ggml_tensor * c1);
1149
 
1150
+ // partition into non-overlapping windows with padding if needed
1151
+ // example:
1152
+ // a: 768 64 64 1
1153
+ // w: 14
1154
+ // res: 768 14 14 25
1155
+ // used in sam
1156
+ GGML_API struct ggml_tensor * ggml_win_part(
1157
+ struct ggml_context * ctx,
1158
+ struct ggml_tensor * a,
1159
+ int w);
1160
+
1161
+ // reverse of ggml_win_part
1162
+ // used in sam
1163
+ GGML_API struct ggml_tensor * ggml_win_unpart(
1164
+ struct ggml_context * ctx,
1165
+ struct ggml_tensor * a,
1166
+ int w0,
1167
+ int h0,
1168
+ int w);
1169
+
1170
  // Mapping operations
1171
  typedef void (*ggml_unary_op_f32_t)(const int, float *, const float *);
1172
  typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
gpttype_adapter.cpp CHANGED
@@ -68,7 +68,7 @@ static int n_batch = 8;
68
  static bool useSmartContext = false;
69
  static bool unbanTokens = false;
70
  static int blasbatchsize = 512;
71
- static bool debugmode = false;
72
  static std::string modelname;
73
  static std::vector<gpt_vocab::id> last_n_tokens;
74
  static std::vector<gpt_vocab::id> current_context_tokens;
@@ -78,6 +78,7 @@ static std::vector<int> smartcontext;
78
  static std::vector<std::string> stop_sequence;
79
  static std::vector<llama_token_data> top_picks;
80
  static int remaining_tokens = 0;
 
81
  static std::string concat_output = "";
82
 
83
  inline bool IsNanCheck(float f)
@@ -118,7 +119,7 @@ llama_token sample_token(llama_token_data_array * candidates, std::mt19937 & rng
118
  std::discrete_distribution<> dist(probs.begin(), probs.end());
119
  int idx = dist(rng);
120
 
121
- if(debugmode)
122
  {
123
  top_picks.push_back(candidates->data[idx]);
124
  for (size_t i = 0; (i < candidates->size && i<4); ++i)
@@ -308,8 +309,13 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
308
  params.memory_f16 = inputs.f16_kv;
309
  params.n_ctx = inputs.max_context_length;
310
 
311
- neox_ctx_v2.hparams.n_ctx = gptj_ctx_v1.hparams.n_ctx = gptj_ctx_v2.hparams.n_ctx = gpt2_ctx_v1.hparams.n_ctx = gpt2_ctx_v2.hparams.n_ctx
312
- = neox_ctx_v3.hparams.n_ctx = gptj_ctx_v3.hparams.n_ctx = gptj_ctx_v3.hparams.n_ctx = mpt_ctx_v3.hparams.n_ctx = params.n_ctx;
 
 
 
 
 
313
 
314
  printf("System Info: %s\n", llama_print_system_info());
315
  SetQuantsUnshuffled(false);
@@ -387,9 +393,15 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
387
  {
388
  printf("\nAttempting to apply LORA adapter: %s\n", lora_filename.c_str());
389
 
 
 
 
 
 
 
390
  int err = llama_apply_lora_from_file(llama_ctx_v3,
391
  lora_filename.c_str(),
392
- NULL,
393
  n_threads);
394
  if (err != 0)
395
  {
@@ -539,7 +551,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
539
  return res;
540
  }
541
  // determine the required inference memory per token:
542
- gpt2_eval(gpt2_ctx_v3, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token, file_format);
543
  return ModelLoadResult::SUCCESS;
544
  }
545
  else
@@ -606,14 +618,14 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
606
  }
607
 
608
  // determine the required inference memory per token:
609
- gptj_eval(gptj_ctx_v3, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);
610
 
611
  //if the logits are NAN or duplicated, it means the model is incompatible
612
  std::vector<float> oldlogits(logits);
613
 
614
  //this is another hack because they change the library - we run the eval through the model
615
  //twice and compare logits. if they give the same logits for different inputs, model is broken
616
- gptj_eval(gptj_ctx_v3, params.n_threads, 0, {4, 5, 6, 7}, logits, mem_per_token);
617
 
618
  if(logits.size()>0 && (IsNanCheck(logits[0]) || LogitsDuplicated(oldlogits,logits)))
619
  {
@@ -665,7 +677,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
665
  {
666
  if(file_format==FileFormat::NEOX_6|| file_format==FileFormat::NEOX_7)
667
  {
668
- ModelLoadResult res = gpt_neox_model_load(params.model, neox_ctx_v3, vocab, file_format);
669
  if(res==ModelLoadResult::FAIL)
670
  {
671
  fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
@@ -678,7 +690,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
678
  }
679
 
680
  // determine the required inference memory per token:
681
- gpt_neox_eval(neox_ctx_v3, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);
682
 
683
  return ModelLoadResult::SUCCESS;
684
  }
@@ -727,7 +739,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
727
  }
728
  else if(file_format==FileFormat::MPT_1)
729
  {
730
- bool res = mpt_model_load(params.model, mpt_ctx_v3, vocab);
731
  if(res==false)
732
  {
733
  fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
@@ -735,7 +747,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
735
  }
736
 
737
  // determine the required inference memory per token:
738
- mpt_eval(mpt_ctx_v3, params.n_threads, 0, { 0, 1, 2, 3 }, logits, false, mem_per_token);
739
  return ModelLoadResult::SUCCESS;
740
  }
741
  else
@@ -748,6 +760,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
748
 
749
  bool gpttype_generate_abort()
750
  {
 
751
  remaining_tokens = 0;
752
  return true;
753
  }
@@ -888,12 +901,13 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
888
  current_context_tokens.resize(n_past);
889
 
890
  remaining_tokens = params.n_predict;
891
- int stopper_unused_tokens = 0;
892
  int input_consumed = 0;
893
  std::mt19937 rng(params.seed);
894
  concat_output = "";
895
 
896
  bool startedsampling = false;
 
897
 
898
  timer_start();
899
  double time1 = 0, time2 = 0;
@@ -981,9 +995,12 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
981
  printf("Bad format!");
982
  }
983
 
984
- printf("\n");
 
 
 
985
 
986
- if (debugmode)
987
  {
988
  std::string outstr = "";
989
  printf("\n[Debug: Dump Input Tokens, format: %d]\n", file_format);
@@ -1013,7 +1030,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
1013
  // predict
1014
  unsigned int embdsize = embd.size();
1015
  //print progress
1016
- if (!startedsampling)
1017
  {
1018
  printf("\rProcessing Prompt%s (%d / %d tokens)", (blasmode ? " [BLAS]" : ""), input_consumed, embd_inp.size());
1019
  }
@@ -1065,7 +1082,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
1065
  }
1066
  else if(file_format==FileFormat::GPT2_4)
1067
  {
1068
- evalres = gpt2_eval(gpt2_ctx_v3, params.n_threads, n_past, embd, logits, mem_per_token, file_format);
1069
  }
1070
  else if(file_format==FileFormat::NEOX_1 || file_format == FileFormat::NEOX_2 || file_format == FileFormat::NEOX_3 || file_format==FileFormat::NEOX_4 || file_format==FileFormat::NEOX_5)
1071
  {
@@ -1073,7 +1090,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
1073
  }
1074
  else if(file_format==FileFormat::NEOX_6|| file_format==FileFormat::NEOX_7)
1075
  {
1076
- evalres = gpt_neox_eval(neox_ctx_v3, params.n_threads, n_past, embd, logits, mem_per_token);
1077
  }
1078
  else if(file_format==FileFormat::GPTJ_1 || file_format==FileFormat::GPTJ_2)
1079
  {
@@ -1085,11 +1102,11 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
1085
  }
1086
  else if(file_format==FileFormat::GPTJ_5)
1087
  {
1088
- evalres = gptj_eval(gptj_ctx_v3, params.n_threads, n_past, embd, logits, mem_per_token);
1089
  }
1090
  else if(file_format==FileFormat::MPT_1)
1091
  {
1092
- evalres = mpt_eval(mpt_ctx_v3, params.n_threads, n_past, embd, logits, false, mem_per_token);
1093
  }
1094
  else
1095
  {
@@ -1126,7 +1143,10 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
1126
  params.n_threads = original_threads;
1127
  time1 = timer_check();
1128
  timer_start();
1129
- printf("\n");
 
 
 
1130
  }
1131
 
1132
  unsigned int eosID = 0;
@@ -1229,11 +1249,11 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
1229
  concat_output += tokenizedstr;
1230
  }
1231
 
1232
- if (startedsampling)
1233
  {
1234
  printf("\rGenerating (%d / %d tokens)", (params.n_predict - remaining_tokens), params.n_predict);
1235
  }
1236
- if(debugmode && top_picks.size()>0)
1237
  {
1238
  printf(" [");
1239
  bool firstloop = true;
@@ -1253,6 +1273,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
1253
 
1254
  if(unbanTokens && id==eosID)
1255
  {
 
1256
  printf("\n(EOS token triggered!)");
1257
  remaining_tokens = 0;
1258
  }
@@ -1263,7 +1284,10 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
1263
  {
1264
  stopper_unused_tokens = remaining_tokens;
1265
  remaining_tokens = 0;
1266
- printf("\n(Stop sequence triggered: <%s>)", matched.c_str());
 
 
 
1267
  break;
1268
  }
1269
  }
 
68
  static bool useSmartContext = false;
69
  static bool unbanTokens = false;
70
  static int blasbatchsize = 512;
71
+ static int debugmode = 0; //-1 = hide all, 0 = normal, 1 = showall
72
  static std::string modelname;
73
  static std::vector<gpt_vocab::id> last_n_tokens;
74
  static std::vector<gpt_vocab::id> current_context_tokens;
 
78
  static std::vector<std::string> stop_sequence;
79
  static std::vector<llama_token_data> top_picks;
80
  static int remaining_tokens = 0;
81
+ static int stopper_unused_tokens = 0;
82
  static std::string concat_output = "";
83
 
84
  inline bool IsNanCheck(float f)
 
119
  std::discrete_distribution<> dist(probs.begin(), probs.end());
120
  int idx = dist(rng);
121
 
122
+ if(debugmode==1)
123
  {
124
  top_picks.push_back(candidates->data[idx]);
125
  for (size_t i = 0; (i < candidates->size && i<4); ++i)
 
309
  params.memory_f16 = inputs.f16_kv;
310
  params.n_ctx = inputs.max_context_length;
311
 
312
+ neox_ctx_v2.hparams.n_ctx = neox_ctx_v3.hparams.n_ctx
313
+ = gptj_ctx_v1.hparams.n_ctx = gptj_ctx_v2.hparams.n_ctx = gptj_ctx_v3.hparams.n_ctx
314
+ = gpt2_ctx_v1.hparams.n_ctx = gpt2_ctx_v2.hparams.n_ctx = gpt2_ctx_v3.hparams.n_ctx
315
+ = mpt_ctx_v3.hparams.n_ctx = params.n_ctx;
316
+
317
+ //this is used for the mem_per_token eval, openblas needs more RAM
318
+ bool use_scratch = ggml_cpu_has_gpublas();
319
 
320
  printf("System Info: %s\n", llama_print_system_info());
321
  SetQuantsUnshuffled(false);
 
393
  {
394
  printf("\nAttempting to apply LORA adapter: %s\n", lora_filename.c_str());
395
 
396
+ const char * lora_base_arg = NULL;
397
+ if (lora_base != "") {
398
+ printf("Using LORA base model: %s\n", lora_base.c_str());
399
+ lora_base_arg = lora_base.c_str();
400
+ }
401
+
402
  int err = llama_apply_lora_from_file(llama_ctx_v3,
403
  lora_filename.c_str(),
404
+ lora_base_arg,
405
  n_threads);
406
  if (err != 0)
407
  {
 
551
  return res;
552
  }
553
  // determine the required inference memory per token:
554
+ gpt2_eval(gpt2_ctx_v3, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token, use_scratch);
555
  return ModelLoadResult::SUCCESS;
556
  }
557
  else
 
618
  }
619
 
620
  // determine the required inference memory per token:
621
+ gptj_eval(gptj_ctx_v3, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token, use_scratch);
622
 
623
  //if the logits are NAN or duplicated, it means the model is incompatible
624
  std::vector<float> oldlogits(logits);
625
 
626
  //this is another hack because they change the library - we run the eval through the model
627
  //twice and compare logits. if they give the same logits for different inputs, model is broken
628
+ gptj_eval(gptj_ctx_v3, params.n_threads, 0, {4, 5, 6, 7}, logits, mem_per_token, use_scratch);
629
 
630
  if(logits.size()>0 && (IsNanCheck(logits[0]) || LogitsDuplicated(oldlogits,logits)))
631
  {
 
677
  {
678
  if(file_format==FileFormat::NEOX_6|| file_format==FileFormat::NEOX_7)
679
  {
680
+ ModelLoadResult res = gpt_neox_model_load(params.model, neox_ctx_v3, vocab, file_format, inputs.gpulayers);
681
  if(res==ModelLoadResult::FAIL)
682
  {
683
  fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
 
690
  }
691
 
692
  // determine the required inference memory per token:
693
+ gpt_neox_eval(neox_ctx_v3, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token, use_scratch);
694
 
695
  return ModelLoadResult::SUCCESS;
696
  }
 
739
  }
740
  else if(file_format==FileFormat::MPT_1)
741
  {
742
+ bool res = mpt_model_load(params.model, mpt_ctx_v3, vocab, inputs.gpulayers);
743
  if(res==false)
744
  {
745
  fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
 
747
  }
748
 
749
  // determine the required inference memory per token:
750
+ mpt_eval(mpt_ctx_v3, params.n_threads, 0, { 0, 1, 2, 3 }, logits, false, mem_per_token, use_scratch);
751
  return ModelLoadResult::SUCCESS;
752
  }
753
  else
 
760
 
761
  bool gpttype_generate_abort()
762
  {
763
+ stopper_unused_tokens = remaining_tokens;
764
  remaining_tokens = 0;
765
  return true;
766
  }
 
901
  current_context_tokens.resize(n_past);
902
 
903
  remaining_tokens = params.n_predict;
904
+ stopper_unused_tokens = 0;
905
  int input_consumed = 0;
906
  std::mt19937 rng(params.seed);
907
  concat_output = "";
908
 
909
  bool startedsampling = false;
910
+ bool use_scratch = true; //for normal inference always use scratch
911
 
912
  timer_start();
913
  double time1 = 0, time2 = 0;
 
995
  printf("Bad format!");
996
  }
997
 
998
+ if(debugmode!=-1)
999
+ {
1000
+ printf("\n");
1001
+ }
1002
 
1003
+ if (debugmode==1)
1004
  {
1005
  std::string outstr = "";
1006
  printf("\n[Debug: Dump Input Tokens, format: %d]\n", file_format);
 
1030
  // predict
1031
  unsigned int embdsize = embd.size();
1032
  //print progress
1033
+ if (!startedsampling && debugmode!=-1)
1034
  {
1035
  printf("\rProcessing Prompt%s (%d / %d tokens)", (blasmode ? " [BLAS]" : ""), input_consumed, embd_inp.size());
1036
  }
 
1082
  }
1083
  else if(file_format==FileFormat::GPT2_4)
1084
  {
1085
+ evalres = gpt2_eval(gpt2_ctx_v3, params.n_threads, n_past, embd, logits, mem_per_token, use_scratch);
1086
  }
1087
  else if(file_format==FileFormat::NEOX_1 || file_format == FileFormat::NEOX_2 || file_format == FileFormat::NEOX_3 || file_format==FileFormat::NEOX_4 || file_format==FileFormat::NEOX_5)
1088
  {
 
1090
  }
1091
  else if(file_format==FileFormat::NEOX_6|| file_format==FileFormat::NEOX_7)
1092
  {
1093
+ evalres = gpt_neox_eval(neox_ctx_v3, params.n_threads, n_past, embd, logits, mem_per_token, use_scratch);
1094
  }
1095
  else if(file_format==FileFormat::GPTJ_1 || file_format==FileFormat::GPTJ_2)
1096
  {
 
1102
  }
1103
  else if(file_format==FileFormat::GPTJ_5)
1104
  {
1105
+ evalres = gptj_eval(gptj_ctx_v3, params.n_threads, n_past, embd, logits, mem_per_token, use_scratch);
1106
  }
1107
  else if(file_format==FileFormat::MPT_1)
1108
  {
1109
+ evalres = mpt_eval(mpt_ctx_v3, params.n_threads, n_past, embd, logits, false, mem_per_token, use_scratch);
1110
  }
1111
  else
1112
  {
 
1143
  params.n_threads = original_threads;
1144
  time1 = timer_check();
1145
  timer_start();
1146
+ if(debugmode!=-1)
1147
+ {
1148
+ printf("\n");
1149
+ }
1150
  }
1151
 
1152
  unsigned int eosID = 0;
 
1249
  concat_output += tokenizedstr;
1250
  }
1251
 
1252
+ if (startedsampling && debugmode!=-1)
1253
  {
1254
  printf("\rGenerating (%d / %d tokens)", (params.n_predict - remaining_tokens), params.n_predict);
1255
  }
1256
+ if(debugmode==1 && top_picks.size()>0)
1257
  {
1258
  printf(" [");
1259
  bool firstloop = true;
 
1273
 
1274
  if(unbanTokens && id==eosID)
1275
  {
1276
+ stopper_unused_tokens = remaining_tokens;
1277
  printf("\n(EOS token triggered!)");
1278
  remaining_tokens = 0;
1279
  }
 
1284
  {
1285
  stopper_unused_tokens = remaining_tokens;
1286
  remaining_tokens = 0;
1287
+ if(debugmode!=-1)
1288
+ {
1289
+ printf("\n(Stop sequence triggered: <%s>)", matched.c_str());
1290
+ }
1291
  break;
1292
  }
1293
  }
klite.embd CHANGED
The diff for this file is too large to render. See raw diff
 
koboldcpp.py CHANGED
@@ -26,7 +26,7 @@ class load_model_inputs(ctypes.Structure):
26
  ("unban_tokens", ctypes.c_bool),
27
  ("clblast_info", ctypes.c_int),
28
  ("blasbatchsize", ctypes.c_int),
29
- ("debugmode", ctypes.c_bool),
30
  ("forceversion", ctypes.c_int),
31
  ("gpulayers", ctypes.c_int)]
32
 
@@ -221,10 +221,12 @@ def utfprint(str):
221
  #################################################################
222
  friendlymodelname = "concedo/koboldcpp" # local kobold api apparently needs a hardcoded known HF model name
223
  maxctx = 2048
224
- maxlen = 256
 
225
  modelbusy = False
226
  defaultport = 5001
227
- KcppVersion = "1.31"
 
228
 
229
  class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
230
  sys_version = ""
@@ -238,6 +240,12 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
238
  def __call__(self, *args, **kwargs):
239
  super().__init__(*args, **kwargs)
240
 
 
 
 
 
 
 
241
  async def generate_text(self, newprompt, genparams, basic_api_flag, stream_flag):
242
 
243
  def run_blocking():
@@ -281,7 +289,8 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
281
  else:
282
  recvtxt = run_blocking()
283
 
284
- utfprint("\nOutput: " + recvtxt)
 
285
 
286
  res = {"data": {"seqs":[recvtxt]}} if basic_api_flag else {"results": [{"text": recvtxt}]}
287
 
@@ -345,7 +354,7 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
345
 
346
 
347
  def do_GET(self):
348
- global maxctx, maxlen, friendlymodelname, KcppVersion, streamLock
349
  self.path = self.path.rstrip('/')
350
  response_body = None
351
 
@@ -371,10 +380,10 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
371
  response_body = (json.dumps({'result': friendlymodelname }).encode())
372
 
373
  elif self.path.endswith(('/api/v1/config/max_length', '/api/latest/config/max_length')):
374
- response_body = (json.dumps({"value": maxlen}).encode())
375
 
376
  elif self.path.endswith(('/api/v1/config/max_context_length', '/api/latest/config/max_context_length')):
377
- response_body = (json.dumps({"value": maxctx}).encode())
378
 
379
  elif self.path.endswith(('/api/v1/config/soft_prompt', '/api/latest/config/soft_prompt')):
380
  response_body = (json.dumps({"value":""}).encode())
@@ -414,7 +423,7 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
414
  self.send_response(200)
415
  self.end_headers()
416
  self.wfile.write(json.dumps({"success": ("true" if ag else "false")}).encode())
417
- print("Generation Aborted")
418
  modelbusy = False
419
  return
420
 
@@ -453,7 +462,8 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
453
  utfprint("Body Err: " + str(body))
454
  return self.send_response(503)
455
 
456
- utfprint("\nInput: " + json.dumps(genparams))
 
457
 
458
  modelbusy = True
459
 
@@ -714,10 +724,17 @@ def main(args):
714
  sys.exit(2)
715
 
716
  if args.hordeconfig and args.hordeconfig[0]!="":
717
- global friendlymodelname, maxlen
718
  friendlymodelname = "koboldcpp/"+args.hordeconfig[0]
719
  if len(args.hordeconfig) > 1:
720
- maxlen = int(args.hordeconfig[1])
 
 
 
 
 
 
 
721
 
722
  if args.highpriority:
723
  print("Setting process to Higher Priority - Use Caution")
@@ -839,9 +856,9 @@ if __name__ == '__main__':
839
  parser.add_argument("--nommap", help="If set, do not use mmap to load newer models", action='store_true')
840
  parser.add_argument("--usemlock", help="For Apple Systems. Force system to keep model in RAM rather than swapping or compressing", action='store_true')
841
  parser.add_argument("--noavx2", help="Do not use AVX2 instructions, a slower compatibility mode for older devices. Does not work with --clblast.", action='store_true')
842
- parser.add_argument("--debugmode", help="Shows additional debug info in the terminal.", action='store_true')
843
  parser.add_argument("--skiplauncher", help="Doesn't display or use the new GUI launcher.", action='store_true')
844
- parser.add_argument("--hordeconfig", help="Sets the display model name to something else, for easy use on AI Horde. An optional second parameter sets the horde max gen length.",metavar=('[hordename]', '[hordelength]'), nargs='+')
845
  compatgroup = parser.add_mutually_exclusive_group()
846
  compatgroup.add_argument("--noblas", help="Do not use OpenBLAS for accelerated prompt ingestion", action='store_true')
847
  compatgroup.add_argument("--useclblast", help="Use CLBlast instead of OpenBLAS for prompt ingestion. Must specify exactly 2 arguments, platform ID and device ID (e.g. --useclblast 1 0).", type=int, choices=range(0,9), nargs=2)
 
26
  ("unban_tokens", ctypes.c_bool),
27
  ("clblast_info", ctypes.c_int),
28
  ("blasbatchsize", ctypes.c_int),
29
+ ("debugmode", ctypes.c_int),
30
  ("forceversion", ctypes.c_int),
31
  ("gpulayers", ctypes.c_int)]
32
 
 
221
  #################################################################
222
  friendlymodelname = "concedo/koboldcpp" # local kobold api apparently needs a hardcoded known HF model name
223
  maxctx = 2048
224
+ maxhordectx = 1024
225
+ maxhordelen = 256
226
  modelbusy = False
227
  defaultport = 5001
228
+ KcppVersion = "1.33"
229
+ showdebug = True
230
 
231
  class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
232
  sys_version = ""
 
240
  def __call__(self, *args, **kwargs):
241
  super().__init__(*args, **kwargs)
242
 
243
+ def log_message(self, format, *args):
244
+ global showdebug
245
+ if showdebug:
246
+ super().log_message(format, *args)
247
+ pass
248
+
249
  async def generate_text(self, newprompt, genparams, basic_api_flag, stream_flag):
250
 
251
  def run_blocking():
 
289
  else:
290
  recvtxt = run_blocking()
291
 
292
+ if args.debugmode!=-1:
293
+ utfprint("\nOutput: " + recvtxt)
294
 
295
  res = {"data": {"seqs":[recvtxt]}} if basic_api_flag else {"results": [{"text": recvtxt}]}
296
 
 
354
 
355
 
356
  def do_GET(self):
357
+ global maxctx, maxhordelen, friendlymodelname, KcppVersion, streamLock
358
  self.path = self.path.rstrip('/')
359
  response_body = None
360
 
 
380
  response_body = (json.dumps({'result': friendlymodelname }).encode())
381
 
382
  elif self.path.endswith(('/api/v1/config/max_length', '/api/latest/config/max_length')):
383
+ response_body = (json.dumps({"value": maxhordelen}).encode())
384
 
385
  elif self.path.endswith(('/api/v1/config/max_context_length', '/api/latest/config/max_context_length')):
386
+ response_body = (json.dumps({"value": min(maxctx,maxhordectx)}).encode())
387
 
388
  elif self.path.endswith(('/api/v1/config/soft_prompt', '/api/latest/config/soft_prompt')):
389
  response_body = (json.dumps({"value":""}).encode())
 
423
  self.send_response(200)
424
  self.end_headers()
425
  self.wfile.write(json.dumps({"success": ("true" if ag else "false")}).encode())
426
+ print("\nGeneration Aborted")
427
  modelbusy = False
428
  return
429
 
 
462
  utfprint("Body Err: " + str(body))
463
  return self.send_response(503)
464
 
465
+ if args.debugmode!=-1:
466
+ utfprint("\nInput: " + json.dumps(genparams))
467
 
468
  modelbusy = True
469
 
 
724
  sys.exit(2)
725
 
726
  if args.hordeconfig and args.hordeconfig[0]!="":
727
+ global friendlymodelname, maxhordelen, maxhordectx, showdebug
728
  friendlymodelname = "koboldcpp/"+args.hordeconfig[0]
729
  if len(args.hordeconfig) > 1:
730
+ maxhordelen = int(args.hordeconfig[1])
731
+ if len(args.hordeconfig) > 2:
732
+ maxhordectx = int(args.hordeconfig[2])
733
+ if args.debugmode == 0:
734
+ args.debugmode = -1
735
+
736
+ if args.debugmode != 1:
737
+ showdebug = False
738
 
739
  if args.highpriority:
740
  print("Setting process to Higher Priority - Use Caution")
 
856
  parser.add_argument("--nommap", help="If set, do not use mmap to load newer models", action='store_true')
857
  parser.add_argument("--usemlock", help="For Apple Systems. Force system to keep model in RAM rather than swapping or compressing", action='store_true')
858
  parser.add_argument("--noavx2", help="Do not use AVX2 instructions, a slower compatibility mode for older devices. Does not work with --clblast.", action='store_true')
859
+ parser.add_argument("--debugmode", help="Shows additional debug info in the terminal.", action='store_const', const=1, default=0)
860
  parser.add_argument("--skiplauncher", help="Doesn't display or use the new GUI launcher.", action='store_true')
861
+ parser.add_argument("--hordeconfig", help="Sets the display model name to something else, for easy use on AI Horde. Optional additional parameters set the horde max genlength and max ctxlen.",metavar=('[hordename]', '[hordelength] [hordectx]'), nargs='+')
862
  compatgroup = parser.add_mutually_exclusive_group()
863
  compatgroup.add_argument("--noblas", help="Do not use OpenBLAS for accelerated prompt ingestion", action='store_true')
864
  compatgroup.add_argument("--useclblast", help="Use CLBlast instead of OpenBLAS for prompt ingestion. Must specify exactly 2 arguments, platform ID and device ID (e.g. --useclblast 1 0).", type=int, choices=range(0,9), nargs=2)
llama.cpp CHANGED
@@ -19,6 +19,11 @@
19
  #ifdef GGML_USE_METAL
20
  #include "ggml-metal.h"
21
  #endif
 
 
 
 
 
22
 
23
  #include <array>
24
  #include <ctime>
@@ -75,7 +80,7 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
75
  { MODEL_3B, 256ull * MB },
76
  { MODEL_7B, 512ull * MB },
77
  { MODEL_13B, 512ull * MB },
78
- { MODEL_30B, 512ull * MB },
79
  { MODEL_65B, 1024ull * MB },
80
  };
81
  return k_sizes;
@@ -87,7 +92,7 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
87
  { MODEL_3B, 256ull * MB },
88
  { MODEL_7B, 512ull * MB },
89
  { MODEL_13B, 512ull * MB },
90
- { MODEL_30B, 512ull * MB },
91
  { MODEL_65B, 1024ull * MB },
92
  };
93
  return k_sizes;
@@ -100,7 +105,7 @@ static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
100
  { MODEL_3B, 682ull * MB },
101
  { MODEL_7B, 1026ull * MB },
102
  { MODEL_13B, 1608ull * MB },
103
- { MODEL_30B, 3124ull * MB },
104
  { MODEL_65B, 5120ull * MB },
105
  };
106
  return k_sizes;
@@ -114,7 +119,7 @@ static const std::map<e_model, size_t> & MEM_REQ_EVAL()
114
  { MODEL_3B, 512ull * MB },
115
  { MODEL_7B, 800ull * MB },
116
  { MODEL_13B, 1024ull * MB },
117
- { MODEL_30B, 1280ull * MB },
118
  { MODEL_65B, 1536ull * MB },
119
  };
120
  return k_sizes;
@@ -177,6 +182,19 @@ struct llama_kv_cache {
177
  }
178
  };
179
 
 
 
 
 
 
 
 
 
 
 
 
 
 
180
  struct llama_model {
181
  e_model type = MODEL_UNKNOWN;
182
 
@@ -193,10 +211,6 @@ struct llama_model {
193
  // context
194
  struct ggml_context * ctx = NULL;
195
 
196
- // key + value cache for the self attention
197
- // TODO: move to llama_state
198
- struct llama_kv_cache kv_self;
199
-
200
  // the model memory buffer
201
  llama_ctx_buffer buf;
202
 
@@ -210,6 +224,11 @@ struct llama_model {
210
  // for quantize-stats only
211
  std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;
212
 
 
 
 
 
 
213
  ~llama_model() {
214
  if (ctx) {
215
  ggml_free(ctx);
@@ -228,24 +247,11 @@ struct llama_model {
228
  }
229
  };
230
 
231
- struct llama_vocab {
232
- using id = int32_t;
233
- using token = std::string;
234
-
235
- struct token_score {
236
- token tok;
237
- float score;
238
- };
239
-
240
- std::unordered_map<token, id> token_to_id;
241
- std::vector<token_score> id_to_token;
242
- };
243
-
244
  struct llama_context {
 
 
245
  std::mt19937 rng;
246
 
247
- int64_t t_load_us = 0;
248
- int64_t t_start_us = 0;
249
  bool has_evaluated_once = false;
250
 
251
  int64_t t_sample_us = 0;
@@ -256,8 +262,16 @@ struct llama_context {
256
  int32_t n_eval = 0; // number of eval calls
257
  int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
258
 
259
- llama_model model;
260
- llama_vocab vocab;
 
 
 
 
 
 
 
 
261
 
262
  size_t mem_per_token = 0;
263
 
@@ -886,6 +900,7 @@ static bool kv_cache_init(
886
  const int64_t n_elements = n_embd*n_mem;
887
 
888
  cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
 
889
 
890
  struct ggml_init_params params;
891
  params.mem_size = cache.buf.size;
@@ -904,6 +919,7 @@ static bool kv_cache_init(
904
  ggml_set_name(cache.k, "cache_k");
905
  ggml_set_name(cache.v, "cache_v");
906
 
 
907
  #ifdef GGML_USE_CUBLAS
908
  if (n_gpu_layers > n_layer + 1) {
909
  ggml_cuda_assign_buffers_no_scratch(cache.v);
@@ -918,21 +934,21 @@ static bool kv_cache_init(
918
 
919
  struct llama_context_params llama_context_default_params() {
920
  struct llama_context_params result = {
 
921
  /*.n_ctx =*/ 512,
922
  /*.n_batch =*/ 512,
923
  /*.gpu_layers =*/ 0,
924
  /*.main_gpu =*/ 0,
925
  /*.tensor_split =*/ {0},
 
 
926
  /*.low_vram =*/ false,
927
- /*.seed =*/ -1,
928
  /*.f16_kv =*/ true,
929
  /*.logits_all =*/ false,
930
  /*.vocab_only =*/ false,
931
  /*.use_mmap =*/ true,
932
  /*.use_mlock =*/ false,
933
  /*.embedding =*/ false,
934
- /*.progress_callback =*/ nullptr,
935
- /*.progress_callback_user_data =*/ nullptr,
936
  };
937
 
938
  return result;
@@ -1026,7 +1042,8 @@ static const char *llama_model_type_name(e_model type) {
1026
 
1027
  static void llama_model_load_internal(
1028
  const std::string & fname,
1029
- llama_context & lctx,
 
1030
  int n_ctx,
1031
  int n_batch,
1032
  int n_gpu_layers,
@@ -1040,12 +1057,11 @@ static void llama_model_load_internal(
1040
  llama_progress_callback progress_callback,
1041
  void * progress_callback_user_data) {
1042
 
1043
- lctx.t_start_us = ggml_time_us();
1044
 
1045
  std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname, use_mmap, vocab_only));
1046
 
1047
- lctx.vocab = std::move(ml->file_loaders.at(0)->vocab);
1048
- auto & model = lctx.model;
1049
  model.hparams = ml->file_loaders.at(0)->hparams;
1050
  model.n_gpu_layers = n_gpu_layers;
1051
  llama_file_version file_version = ml->file_loaders.at(0)->file_version;
@@ -1115,15 +1131,15 @@ static void llama_model_load_internal(
1115
 
1116
  // create the ggml context
1117
  {
1118
- lctx.model.buf.resize(ctx_size);
1119
  if (use_mlock) {
1120
- lctx.model.mlock_buf.init(lctx.model.buf.addr);
1121
- lctx.model.mlock_buf.grow_to(lctx.model.buf.size);
1122
  }
1123
 
1124
  struct ggml_init_params params = {
1125
- /*.mem_size =*/ lctx.model.buf.size,
1126
- /*.mem_buffer =*/ lctx.model.buf.addr,
1127
  /*.no_alloc =*/ ml->use_mmap,
1128
  };
1129
 
@@ -1253,7 +1269,7 @@ static void llama_model_load_internal(
1253
  vram_scratch = n_batch * MB;
1254
  ggml_cuda_set_scratch_size(vram_scratch);
1255
  if (n_gpu_layers > 0) {
1256
- fprintf(stderr, "%s: allocating batch_size x 1 MB = %ld MB VRAM for the scratch buffer\n",
1257
  __func__, vram_scratch / MB);
1258
  }
1259
  }
@@ -1304,7 +1320,7 @@ static void llama_model_load_internal(
1304
  }
1305
  #endif
1306
 
1307
- ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
1308
 
1309
  if (progress_callback) {
1310
  progress_callback(1.0f, progress_callback_user_data);
@@ -1314,12 +1330,13 @@ static void llama_model_load_internal(
1314
 
1315
  // loading time will be recalculate after the first eval, so
1316
  // we take page faults deferred by mmap() into consideration
1317
- lctx.t_load_us = ggml_time_us() - lctx.t_start_us;
1318
  }
1319
 
1320
  static bool llama_model_load(
1321
  const std::string & fname,
1322
- llama_context & lctx,
 
1323
  int n_ctx,
1324
  int n_batch,
1325
  int n_gpu_layers,
@@ -1333,7 +1350,7 @@ static bool llama_model_load(
1333
  llama_progress_callback progress_callback,
1334
  void *progress_callback_user_data) {
1335
  try {
1336
- llama_model_load_internal(fname, lctx, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, low_vram, memory_type,
1337
  use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
1338
  return true;
1339
  } catch (const std::exception & err) {
@@ -1371,7 +1388,7 @@ static bool llama_eval_internal(
1371
  const auto & model = lctx.model;
1372
  const auto & hparams = model.hparams;
1373
 
1374
- const auto & kv_self = model.kv_self;
1375
 
1376
  LLAMA_ASSERT(!!kv_self.ctx);
1377
 
@@ -1613,7 +1630,7 @@ static bool llama_eval_internal(
1613
  model.layers[il].w1,
1614
  cur);
1615
  offload_func(cur);
1616
- ggml_set_name(cur, "result_w2");
1617
 
1618
  // SILU activation
1619
  cur = ggml_silu(ctx0, cur);
@@ -1650,11 +1667,7 @@ static bool llama_eval_internal(
1650
  {
1651
  cur = ggml_rms_norm(ctx0, inpL);
1652
  offload_func_nr(cur);
1653
- ggml_set_name(cur, "rms_norm_inpL");
1654
-
1655
- cur = ggml_rms_norm(ctx0, cur);
1656
- offload_func_nr(cur);
1657
- ggml_set_name(cur, "rms_norm_after");
1658
 
1659
  // cur = cur*norm(broadcasted)
1660
  cur = ggml_mul(ctx0, cur, model.norm);
@@ -1723,7 +1736,7 @@ static bool llama_eval_internal(
1723
  //memcpy(embd_w.data(), ggml_get_data(cur), sizeof(float)*n_vocab*N);
1724
 
1725
  // update kv token count
1726
- lctx.model.kv_self.n = n_past + N;
1727
 
1728
  // extract logits
1729
  {
@@ -2002,9 +2015,10 @@ void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * can
2002
  for (size_t i = 0; i < candidates->size; ++i) {
2003
  cum_sum += candidates->data[i].p;
2004
 
2005
- // Check if the running sum is greater than p or if we have kept at least min_keep tokens
2006
- if (cum_sum > p && i >= min_keep) {
2007
- last_idx = i;
 
2008
  break;
2009
  }
2010
  }
@@ -2489,8 +2503,23 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
2489
  } else {
2490
  new_type = quantized_type;
2491
  #ifdef GGML_USE_K_QUANTS
 
 
 
 
 
 
 
 
 
 
 
2492
  if (tensor.name == "output.weight") {
2493
- new_type = GGML_TYPE_Q6_K;
 
 
 
 
2494
  } else if (tensor.name.find("attention.wv.weight") != std::string::npos) {
2495
  if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
2496
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
@@ -2616,12 +2645,39 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
2616
  // interface implementation
2617
  //
2618
 
2619
- struct llama_context * llama_init_from_file(
2620
  const char * path_model,
2621
  struct llama_context_params params) {
2622
  ggml_time_init();
2623
 
2624
- llama_context * ctx = new llama_context;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2625
 
2626
  if (params.seed < 0) {
2627
  params.seed = time(NULL);
@@ -2649,24 +2705,16 @@ struct llama_context * llama_init_from_file(
2649
 
2650
  ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
2651
 
2652
- if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_batch, params.n_gpu_layers, params.main_gpu,
2653
- params.tensor_split, params.low_vram, memory_type, params.use_mmap, params.use_mlock,
2654
- params.vocab_only, params.progress_callback, params.progress_callback_user_data)) {
2655
- fprintf(stderr, "%s: failed to load model\n", __func__);
2656
- llama_free(ctx);
2657
- return nullptr;
2658
- }
2659
-
2660
  // reserve memory for context buffers
2661
  if (!params.vocab_only) {
2662
- if (!kv_cache_init(ctx->model.hparams, ctx->model.kv_self, memory_type, ctx->model.hparams.n_ctx, params.n_gpu_layers)) {
2663
  fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__);
2664
  llama_free(ctx);
2665
  return nullptr;
2666
  }
2667
 
2668
  {
2669
- const size_t memory_size = ggml_nbytes(ctx->model.kv_self.k) + ggml_nbytes(ctx->model.kv_self.v);
2670
  fprintf(stderr, "%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
2671
  }
2672
 
@@ -2694,16 +2742,21 @@ struct llama_context * llama_init_from_file(
2694
  // this allocates all Metal resources and memory buffers
2695
  ctx->ctx_metal = ggml_metal_init();
2696
 
2697
- void *data_ptr = NULL;
2698
  size_t data_size = 0;
 
2699
  if (params.use_mmap) {
2700
- data_ptr = ctx->model.mapping->addr;
2701
- data_size= ctx->model.mapping->size;
2702
  } else {
2703
- data_ptr = ggml_get_mem_buffer(ctx->model.ctx);
2704
- data_size= ggml_get_mem_size(ctx->model.ctx);
2705
  }
2706
 
 
 
 
 
2707
  #define LLAMA_METAL_CHECK_BUF(result) \
2708
  if (!(result)) { \
2709
  fprintf(stderr, "%s: failed to add buffer\n", __func__); \
@@ -2711,12 +2764,13 @@ struct llama_context * llama_init_from_file(
2711
  return NULL; \
2712
  }
2713
 
2714
- LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size));
2715
- LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size));
 
 
2716
 
2717
- LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->model.kv_self.buf.addr, ctx->model.kv_self.buf.size));
2718
- LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size));
2719
- LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size));
2720
  #undef LLAMA_METAL_CHECK_BUF
2721
  }
2722
  #endif
@@ -2724,7 +2778,23 @@ struct llama_context * llama_init_from_file(
2724
  return ctx;
2725
  }
2726
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2727
  void llama_free(struct llama_context * ctx) {
 
 
 
2728
  delete ctx;
2729
  }
2730
 
@@ -2741,11 +2811,9 @@ int llama_model_quantize(
2741
  }
2742
  }
2743
 
2744
- int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
2745
  fprintf(stderr, "%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);
2746
 
2747
- auto & model = ctx->model;
2748
-
2749
  const int64_t t_start_lora_us = ggml_time_us();
2750
 
2751
  auto fin = std::ifstream(path_lora, std::ios::binary);
@@ -2988,7 +3056,16 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
2988
 
2989
  int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
2990
  try {
2991
- return llama_apply_lora_from_file_internal(ctx, path_lora, path_base_model, n_threads);
 
 
 
 
 
 
 
 
 
2992
  } catch (const std::exception & err) {
2993
  fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.what());
2994
  return 1;
@@ -2996,7 +3073,7 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor
2996
  }
2997
 
2998
  int llama_get_kv_cache_token_count(const struct llama_context * ctx) {
2999
- return ctx->model.kv_self.n;
3000
  }
3001
 
3002
  #define LLAMA_MAX_RNG_STATE (64*1024)
@@ -3021,7 +3098,7 @@ size_t llama_get_state_size(const struct llama_context * ctx) {
3021
  const size_t s_embedding = ctx->embedding.size() * sizeof(float);
3022
  const size_t s_kv_size = sizeof(size_t);
3023
  const size_t s_kv_ntok = sizeof(int);
3024
- const size_t s_kv = ctx->model.kv_self.buf.size;
3025
 
3026
  const size_t s_total = (
3027
  + s_rng_size
@@ -3087,7 +3164,7 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
3087
 
3088
  // copy kv cache
3089
  {
3090
- const auto & kv_self = ctx->model.kv_self;
3091
  const auto & hparams = ctx->model.hparams;
3092
  const int n_layer = hparams.n_layer;
3093
  const int n_embd = hparams.n_embd;
@@ -3102,9 +3179,7 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
3102
  if (kv_size) {
3103
  const size_t elt_size = ggml_element_size(kv_self.k);
3104
 
3105
- char buffer[4096];
3106
-
3107
- ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
3108
  ggml_cgraph gf{};
3109
  gf.n_threads = 1;
3110
 
@@ -3193,7 +3268,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
3193
 
3194
  // set kv cache
3195
  {
3196
- const auto & kv_self = ctx->model.kv_self;
3197
  const auto & hparams = ctx->model.hparams;
3198
  const int n_layer = hparams.n_layer;
3199
  const int n_embd = hparams.n_embd;
@@ -3210,9 +3285,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
3210
 
3211
  const size_t elt_size = ggml_element_size(kv_self.k);
3212
 
3213
- char buffer[4096];
3214
-
3215
- ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
3216
  ggml_cgraph gf{};
3217
  gf.n_threads = 1;
3218
 
@@ -3239,7 +3312,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
3239
  ggml_free(cpy_ctx);
3240
  }
3241
 
3242
- ctx->model.kv_self.n = kv_ntok;
3243
  }
3244
 
3245
  const size_t nread = inp - src;
@@ -3447,9 +3520,12 @@ void llama_print_timings(struct llama_context * ctx) {
3447
 
3448
  fprintf(stderr, "\n");
3449
  fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0);
3450
- fprintf(stderr, "%s: sample time = %8.2f ms / %5d runs (%8.2f ms per token)\n", __func__, 1e-3 * ctx->t_sample_us, n_sample, 1e-3 * ctx->t_sample_us / n_sample);
3451
- fprintf(stderr, "%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token)\n", __func__, 1e-3 * ctx->t_p_eval_us, n_p_eval, 1e-3 * ctx->t_p_eval_us / n_p_eval);
3452
- fprintf(stderr, "%s: eval time = %8.2f ms / %5d runs (%8.2f ms per token)\n", __func__, 1e-3 * ctx->t_eval_us, n_eval, 1e-3 * ctx->t_eval_us / n_eval);
 
 
 
3453
  fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0);
3454
  }
3455
 
@@ -3483,6 +3559,6 @@ const char * llama_print_system_info(void) {
3483
  }
3484
 
3485
  // For internal test use
3486
- std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
3487
  return ctx->model.tensors_by_name;
3488
  }
 
19
  #ifdef GGML_USE_METAL
20
  #include "ggml-metal.h"
21
  #endif
22
+ #ifdef GGML_USE_K_QUANTS
23
+ #ifndef QK_K
24
+ #define QK_K 256
25
+ #endif
26
+ #endif
27
 
28
  #include <array>
29
  #include <ctime>
 
80
  { MODEL_3B, 256ull * MB },
81
  { MODEL_7B, 512ull * MB },
82
  { MODEL_13B, 512ull * MB },
83
+ { MODEL_30B, 640ull * MB },
84
  { MODEL_65B, 1024ull * MB },
85
  };
86
  return k_sizes;
 
92
  { MODEL_3B, 256ull * MB },
93
  { MODEL_7B, 512ull * MB },
94
  { MODEL_13B, 512ull * MB },
95
+ { MODEL_30B, 640ull * MB },
96
  { MODEL_65B, 1024ull * MB },
97
  };
98
  return k_sizes;
 
105
  { MODEL_3B, 682ull * MB },
106
  { MODEL_7B, 1026ull * MB },
107
  { MODEL_13B, 1608ull * MB },
108
+ { MODEL_30B, 3224ull * MB },
109
  { MODEL_65B, 5120ull * MB },
110
  };
111
  return k_sizes;
 
119
  { MODEL_3B, 512ull * MB },
120
  { MODEL_7B, 800ull * MB },
121
  { MODEL_13B, 1024ull * MB },
122
+ { MODEL_30B, 1380ull * MB },
123
  { MODEL_65B, 1536ull * MB },
124
  };
125
  return k_sizes;
 
182
  }
183
  };
184
 
185
// Vocabulary data. It is stored inside llama_model and referenced (not copied)
// by every llama_context created from that model.
struct llama_vocab {
    typedef int32_t     id;    // numeric token identifier
    typedef std::string token; // token text

    // One vocabulary entry: the token text together with its score.
    struct token_score {
        token tok;
        float score;
    };

    // lookup from token text to token id
    std::unordered_map<token, id> token_to_id;
    // entries holding text + score (presumably indexed by token id - confirm against the loader)
    std::vector<token_score> id_to_token;
};
197
+
198
  struct llama_model {
199
  e_model type = MODEL_UNKNOWN;
200
 
 
211
  // context
212
  struct ggml_context * ctx = NULL;
213
 
 
 
 
 
214
  // the model memory buffer
215
  llama_ctx_buffer buf;
216
 
 
224
  // for quantize-stats only
225
  std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;
226
 
227
+ int64_t t_load_us = 0;
228
+ int64_t t_start_us = 0;
229
+
230
+ llama_vocab vocab;
231
+
232
  ~llama_model() {
233
  if (ctx) {
234
  ggml_free(ctx);
 
247
  }
248
  };
249
 
 
 
 
 
 
 
 
 
 
 
 
 
 
250
  struct llama_context {
251
+ llama_context(const llama_model & model, const llama_vocab & vocab) : model(model), vocab(vocab), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
252
+
253
  std::mt19937 rng;
254
 
 
 
255
  bool has_evaluated_once = false;
256
 
257
  int64_t t_sample_us = 0;
 
262
  int32_t n_eval = 0; // number of eval calls
263
  int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
264
 
265
+ const llama_model & model;
266
+ const llama_vocab & vocab;
267
+
268
+ bool model_owner = false;
269
+
270
+ int64_t t_load_us;
271
+ int64_t t_start_us;
272
+
273
+ // key + value cache for the self attention
274
+ struct llama_kv_cache kv_self;
275
 
276
  size_t mem_per_token = 0;
277
 
 
900
  const int64_t n_elements = n_embd*n_mem;
901
 
902
  cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
903
+ cache.n = 0;
904
 
905
  struct ggml_init_params params;
906
  params.mem_size = cache.buf.size;
 
919
  ggml_set_name(cache.k, "cache_k");
920
  ggml_set_name(cache.v, "cache_v");
921
 
922
+ (void) n_gpu_layers;
923
  #ifdef GGML_USE_CUBLAS
924
  if (n_gpu_layers > n_layer + 1) {
925
  ggml_cuda_assign_buffers_no_scratch(cache.v);
 
934
 
935
  struct llama_context_params llama_context_default_params() {
936
  struct llama_context_params result = {
937
+ /*.seed =*/ -1,
938
  /*.n_ctx =*/ 512,
939
  /*.n_batch =*/ 512,
940
  /*.gpu_layers =*/ 0,
941
  /*.main_gpu =*/ 0,
942
  /*.tensor_split =*/ {0},
943
+ /*.progress_callback =*/ nullptr,
944
+ /*.progress_callback_user_data =*/ nullptr,
945
  /*.low_vram =*/ false,
 
946
  /*.f16_kv =*/ true,
947
  /*.logits_all =*/ false,
948
  /*.vocab_only =*/ false,
949
  /*.use_mmap =*/ true,
950
  /*.use_mlock =*/ false,
951
  /*.embedding =*/ false,
 
 
952
  };
953
 
954
  return result;
 
1042
 
1043
  static void llama_model_load_internal(
1044
  const std::string & fname,
1045
+ llama_model & model,
1046
+ llama_vocab & vocab,
1047
  int n_ctx,
1048
  int n_batch,
1049
  int n_gpu_layers,
 
1057
  llama_progress_callback progress_callback,
1058
  void * progress_callback_user_data) {
1059
 
1060
+ model.t_start_us = ggml_time_us();
1061
 
1062
  std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname, use_mmap, vocab_only));
1063
 
1064
+ vocab = std::move(ml->file_loaders.at(0)->vocab);
 
1065
  model.hparams = ml->file_loaders.at(0)->hparams;
1066
  model.n_gpu_layers = n_gpu_layers;
1067
  llama_file_version file_version = ml->file_loaders.at(0)->file_version;
 
1131
 
1132
  // create the ggml context
1133
  {
1134
+ model.buf.resize(ctx_size);
1135
  if (use_mlock) {
1136
+ model.mlock_buf.init(model.buf.addr);
1137
+ model.mlock_buf.grow_to(model.buf.size);
1138
  }
1139
 
1140
  struct ggml_init_params params = {
1141
+ /*.mem_size =*/ model.buf.size,
1142
+ /*.mem_buffer =*/ model.buf.addr,
1143
  /*.no_alloc =*/ ml->use_mmap,
1144
  };
1145
 
 
1269
  vram_scratch = n_batch * MB;
1270
  ggml_cuda_set_scratch_size(vram_scratch);
1271
  if (n_gpu_layers > 0) {
1272
+ fprintf(stderr, "%s: allocating batch_size x 1 MB = %zd MB VRAM for the scratch buffer\n",
1273
  __func__, vram_scratch / MB);
1274
  }
1275
  }
 
1320
  }
1321
  #endif
1322
 
1323
+ ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &model.mlock_mmap : NULL);
1324
 
1325
  if (progress_callback) {
1326
  progress_callback(1.0f, progress_callback_user_data);
 
1330
 
1331
  // loading time will be recalculate after the first eval, so
1332
  // we take page faults deferred by mmap() into consideration
1333
+ model.t_load_us = ggml_time_us() - model.t_start_us;
1334
  }
1335
 
1336
  static bool llama_model_load(
1337
  const std::string & fname,
1338
+ llama_model & model,
1339
+ llama_vocab & vocab,
1340
  int n_ctx,
1341
  int n_batch,
1342
  int n_gpu_layers,
 
1350
  llama_progress_callback progress_callback,
1351
  void *progress_callback_user_data) {
1352
  try {
1353
+ llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, low_vram, memory_type,
1354
  use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
1355
  return true;
1356
  } catch (const std::exception & err) {
 
1388
  const auto & model = lctx.model;
1389
  const auto & hparams = model.hparams;
1390
 
1391
+ const auto & kv_self = lctx.kv_self;
1392
 
1393
  LLAMA_ASSERT(!!kv_self.ctx);
1394
 
 
1630
  model.layers[il].w1,
1631
  cur);
1632
  offload_func(cur);
1633
+ ggml_set_name(cur, "result_w1");
1634
 
1635
  // SILU activation
1636
  cur = ggml_silu(ctx0, cur);
 
1667
  {
1668
  cur = ggml_rms_norm(ctx0, inpL);
1669
  offload_func_nr(cur);
1670
+ ggml_set_name(cur, "rms_norm_2");
 
 
 
 
1671
 
1672
  // cur = cur*norm(broadcasted)
1673
  cur = ggml_mul(ctx0, cur, model.norm);
 
1736
  //memcpy(embd_w.data(), ggml_get_data(cur), sizeof(float)*n_vocab*N);
1737
 
1738
  // update kv token count
1739
+ lctx.kv_self.n = n_past + N;
1740
 
1741
  // extract logits
1742
  {
 
2015
  for (size_t i = 0; i < candidates->size; ++i) {
2016
  cum_sum += candidates->data[i].p;
2017
 
2018
+ // Check if the running sum is at least p or if we have kept at least min_keep tokens
2019
+ // we set the last index to i+1 to indicate that the current iterate should be included in the set
2020
+ if (cum_sum >= p && i + 1 >= min_keep) {
2021
+ last_idx = i + 1;
2022
  break;
2023
  }
2024
  }
 
2503
  } else {
2504
  new_type = quantized_type;
2505
  #ifdef GGML_USE_K_QUANTS
2506
+ if (quantized_type == GGML_TYPE_Q2_K || quantized_type == GGML_TYPE_Q3_K || quantized_type == GGML_TYPE_Q4_K ||
2507
+ quantized_type == GGML_TYPE_Q5_K || quantized_type == GGML_TYPE_Q6_K) {
2508
+ int nx = tensor.ne.at(0);
2509
+ int ny = tensor.ne.at(1);
2510
+ if (nx % QK_K != 0 || ny % QK_K != 0) {
2511
+ fprintf(stderr, "\n\n========================= Tensor sizes %d x %d are not divisible by %d\n",nx,ny,QK_K);
2512
+ fprintf(stderr, "This is required to be able to use k-quants for now!\n");
2513
+ fprintf(stderr, "========================================================================================\n\n");
2514
+ throw std::runtime_error("Unsupported tensor size encountered\n");
2515
+ }
2516
+ }
2517
  if (tensor.name == "output.weight") {
2518
+ int nx = tensor.ne.at(0);
2519
+ int ny = tensor.ne.at(1);
2520
+ if (nx % QK_K == 0 && ny % QK_K == 0) {
2521
+ new_type = GGML_TYPE_Q6_K;
2522
+ }
2523
  } else if (tensor.name.find("attention.wv.weight") != std::string::npos) {
2524
  if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
2525
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
 
2645
  // interface implementation
2646
  //
2647
 
2648
+ struct llama_model * llama_load_model_from_file(
2649
  const char * path_model,
2650
  struct llama_context_params params) {
2651
  ggml_time_init();
2652
 
2653
+ llama_model * model = new llama_model;
2654
+
2655
+ ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
2656
+
2657
+ if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gpu_layers,
2658
+ params.main_gpu, params.tensor_split, params.low_vram, memory_type, params.use_mmap, params.use_mlock,
2659
+ params.vocab_only, params.progress_callback, params.progress_callback_user_data)) {
2660
+ delete model;
2661
+ fprintf(stderr, "%s: failed to load model\n", __func__);
2662
+ return nullptr;
2663
+ }
2664
+
2665
+ return model;
2666
+ }
2667
+
2668
// Destroys a model obtained from llama_load_model_from_file().
void llama_free_model(struct llama_model * model) {
    delete model; // deleting a null pointer is a no-op
}
2671
+
2672
+ struct llama_context * llama_new_context_with_model(
2673
+ struct llama_model * model,
2674
+ struct llama_context_params params) {
2675
+
2676
+ if (!model) {
2677
+ return nullptr;
2678
+ }
2679
+
2680
+ llama_context * ctx = new llama_context(*model, model->vocab);
2681
 
2682
  if (params.seed < 0) {
2683
  params.seed = time(NULL);
 
2705
 
2706
  ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
2707
 
 
 
 
 
 
 
 
 
2708
  // reserve memory for context buffers
2709
  if (!params.vocab_only) {
2710
+ if (!kv_cache_init(ctx->model.hparams, ctx->kv_self, memory_type, ctx->model.hparams.n_ctx, params.n_gpu_layers)) {
2711
  fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__);
2712
  llama_free(ctx);
2713
  return nullptr;
2714
  }
2715
 
2716
  {
2717
+ const size_t memory_size = ggml_nbytes(ctx->kv_self.k) + ggml_nbytes(ctx->kv_self.v);
2718
  fprintf(stderr, "%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
2719
  }
2720
 
 
2742
  // this allocates all Metal resources and memory buffers
2743
  ctx->ctx_metal = ggml_metal_init();
2744
 
2745
+ void * data_ptr = NULL;
2746
  size_t data_size = 0;
2747
+
2748
  if (params.use_mmap) {
2749
+ data_ptr = ctx->model.mapping->addr;
2750
+ data_size = ctx->model.mapping->size;
2751
  } else {
2752
+ data_ptr = ggml_get_mem_buffer(ctx->model.ctx);
2753
+ data_size = ggml_get_mem_size (ctx->model.ctx);
2754
  }
2755
 
2756
+ const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);
2757
+
2758
+ printf("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);
2759
+
2760
  #define LLAMA_METAL_CHECK_BUF(result) \
2761
  if (!(result)) { \
2762
  fprintf(stderr, "%s: failed to add buffer\n", __func__); \
 
2764
  return NULL; \
2765
  }
2766
 
2767
+ LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size));
2768
+
2769
+ LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size, 0));
2770
+ LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->kv_self.buf.addr, ctx->kv_self.buf.size, 0));
2771
 
2772
+ LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size, 0));
2773
+ LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size, 0));
 
2774
  #undef LLAMA_METAL_CHECK_BUF
2775
  }
2776
  #endif
 
2778
  return ctx;
2779
  }
2780
 
2781
+ struct llama_context * llama_init_from_file(
2782
+ const char * path_model,
2783
+ struct llama_context_params params) {
2784
+
2785
+ struct llama_model * model = llama_load_model_from_file(path_model, params);
2786
+ if (!model) {
2787
+ return nullptr;
2788
+ }
2789
+ struct llama_context * ctx = llama_new_context_with_model(model, params);
2790
+ ctx->model_owner = true;
2791
+ return ctx;
2792
+ }
2793
+
2794
  void llama_free(struct llama_context * ctx) {
2795
+ if (ctx->model_owner) {
2796
+ delete &ctx->model;
2797
+ }
2798
  delete ctx;
2799
  }
2800
 
 
2811
  }
2812
  }
2813
 
2814
+ int llama_apply_lora_from_file_internal(const struct llama_model & model, const char * path_lora, const char * path_base_model, int n_threads) {
2815
  fprintf(stderr, "%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);
2816
 
 
 
2817
  const int64_t t_start_lora_us = ggml_time_us();
2818
 
2819
  auto fin = std::ifstream(path_lora, std::ios::binary);
 
3056
 
3057
  int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
3058
  try {
3059
+ return llama_apply_lora_from_file_internal(ctx->model, path_lora, path_base_model, n_threads);
3060
+ } catch (const std::exception & err) {
3061
+ fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.what());
3062
+ return 1;
3063
+ }
3064
+ }
3065
+
3066
+ int llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, const char * path_base_model, int n_threads) {
3067
+ try {
3068
+ return llama_apply_lora_from_file_internal(*model, path_lora, path_base_model, n_threads);
3069
  } catch (const std::exception & err) {
3070
  fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.what());
3071
  return 1;
 
3073
  }
3074
 
3075
  int llama_get_kv_cache_token_count(const struct llama_context * ctx) {
3076
+ return ctx->kv_self.n;
3077
  }
3078
 
3079
  #define LLAMA_MAX_RNG_STATE (64*1024)
 
3098
  const size_t s_embedding = ctx->embedding.size() * sizeof(float);
3099
  const size_t s_kv_size = sizeof(size_t);
3100
  const size_t s_kv_ntok = sizeof(int);
3101
+ const size_t s_kv = ctx->kv_self.buf.size;
3102
 
3103
  const size_t s_total = (
3104
  + s_rng_size
 
3164
 
3165
  // copy kv cache
3166
  {
3167
+ const auto & kv_self = ctx->kv_self;
3168
  const auto & hparams = ctx->model.hparams;
3169
  const int n_layer = hparams.n_layer;
3170
  const int n_embd = hparams.n_embd;
 
3179
  if (kv_size) {
3180
  const size_t elt_size = ggml_element_size(kv_self.k);
3181
 
3182
+ ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
 
 
3183
  ggml_cgraph gf{};
3184
  gf.n_threads = 1;
3185
 
 
3268
 
3269
  // set kv cache
3270
  {
3271
+ const auto & kv_self = ctx->kv_self;
3272
  const auto & hparams = ctx->model.hparams;
3273
  const int n_layer = hparams.n_layer;
3274
  const int n_embd = hparams.n_embd;
 
3285
 
3286
  const size_t elt_size = ggml_element_size(kv_self.k);
3287
 
3288
+ ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
 
 
3289
  ggml_cgraph gf{};
3290
  gf.n_threads = 1;
3291
 
 
3312
  ggml_free(cpy_ctx);
3313
  }
3314
 
3315
+ ctx->kv_self.n = kv_ntok;
3316
  }
3317
 
3318
  const size_t nread = inp - src;
 
3520
 
3521
  fprintf(stderr, "\n");
3522
  fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0);
3523
+ fprintf(stderr, "%s: sample time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
3524
+ __func__, 1e-3 * ctx->t_sample_us, n_sample, 1e-3 * ctx->t_sample_us / n_sample, 1e6 / ctx->t_sample_us * n_sample);
3525
+ fprintf(stderr, "%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
3526
+ __func__, 1e-3 * ctx->t_p_eval_us, n_p_eval, 1e-3 * ctx->t_p_eval_us / n_p_eval, 1e6 / ctx->t_p_eval_us * n_p_eval);
3527
+ fprintf(stderr, "%s: eval time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
3528
+ __func__, 1e-3 * ctx->t_eval_us, n_eval, 1e-3 * ctx->t_eval_us / n_eval, 1e6 / ctx->t_eval_us * n_eval);
3529
  fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0);
3530
  }
3531
 
 
3559
  }
3560
 
3561
  // For internal test use
3562
+ const std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
3563
  return ctx->model.tensors_by_name;
3564
  }
llama.h CHANGED
@@ -26,6 +26,14 @@
26
  # define LLAMA_API
27
  #endif
28
 
 
 
 
 
 
 
 
 
29
  #define LLAMA_FILE_MAGIC_GGJT 0x67676a74u // 'ggjt'
30
  #define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla'
31
  #define LLAMA_FILE_MAGIC_GGMF 0x67676d66u // 'ggmf'
@@ -53,6 +61,7 @@ extern "C" {
53
  // TODO: show sample usage
54
  //
55
 
 
56
  struct llama_context;
57
 
58
  typedef int llama_token;
@@ -71,28 +80,27 @@ extern "C" {
71
 
72
  typedef void (*llama_progress_callback)(float progress, void *ctx);
73
 
74
- struct llama_context_params {
 
75
  int n_ctx; // text context
76
  int n_batch; // prompt processing batch size
77
  int n_gpu_layers; // number of layers to store in VRAM
78
  int main_gpu; // the GPU that is used for scratch and small tensors
79
  float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs
80
- bool low_vram; // if true, reduce VRAM usage at the cost of performance
81
- int seed; // RNG seed, -1 for random
 
 
82
 
 
 
83
  bool f16_kv; // use fp16 for KV cache
84
  bool logits_all; // the llama_eval() call computes all logits, not just the last one
85
  bool vocab_only; // only load the vocabulary, no weights
86
  bool use_mmap; // use mmap if possible
87
  bool use_mlock; // force system to keep model in RAM
88
  bool embedding; // embedding mode only
89
-
90
- // called with a progress value between 0 and 1, pass NULL to disable
91
- llama_progress_callback progress_callback;
92
- // context pointer passed to the progress callback
93
- void * progress_callback_user_data;
94
  };
95
-
96
  // model file types
97
  enum llama_ftype {
98
  LLAMA_FTYPE_ALL_F32 = 0,
@@ -137,12 +145,23 @@ extern "C" {
137
 
138
  LLAMA_API int64_t llama_time_us();
139
 
 
 
 
 
 
 
 
 
 
 
140
  // Various functions for loading a ggml llama model.
141
  // Allocate (almost) all memory needed for the model.
142
  // Return NULL on failure
143
- LLAMA_API struct llama_context * llama_init_from_file(
144
  const char * path_model,
145
- struct llama_context_params params);
 
146
 
147
  // Frees all allocated memory
148
  LLAMA_API void llama_free(struct llama_context * ctx);
@@ -159,8 +178,15 @@ extern "C" {
159
  // The model needs to be reloaded before applying a new adapter, otherwise the adapter
160
  // will be applied on top of the previous one
161
  // Returns 0 on success
162
- LLAMA_API int llama_apply_lora_from_file(
163
  struct llama_context * ctx,
 
 
 
 
 
 
 
164
  const char * path_lora,
165
  const char * path_base_model,
166
  int n_threads);
@@ -311,7 +337,7 @@ extern "C" {
311
  #include <string>
312
  struct ggml_tensor;
313
 
314
- std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx);
315
 
316
  #endif
317
 
 
26
  # define LLAMA_API
27
  #endif
28
 
29
+ #ifdef __GNUC__
30
+ # define DEPRECATED(func, hint) func __attribute__((deprecated(hint)))
31
+ #elif defined(_MSC_VER)
32
+ # define DEPRECATED(func, hint) __declspec(deprecated(hint)) func
33
+ #else
34
+ # define DEPRECATED(func, hint) func
35
+ #endif
36
+
37
  #define LLAMA_FILE_MAGIC_GGJT 0x67676a74u // 'ggjt'
38
  #define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla'
39
  #define LLAMA_FILE_MAGIC_GGMF 0x67676d66u // 'ggmf'
 
61
  // TODO: show sample usage
62
  //
63
 
64
+ struct llama_model;
65
  struct llama_context;
66
 
67
  typedef int llama_token;
 
80
 
81
  typedef void (*llama_progress_callback)(float progress, void *ctx);
82
 
83
+ struct llama_context_params {
84
+ int seed; // RNG seed, -1 for random
85
  int n_ctx; // text context
86
  int n_batch; // prompt processing batch size
87
  int n_gpu_layers; // number of layers to store in VRAM
88
  int main_gpu; // the GPU that is used for scratch and small tensors
89
  float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs
90
+ // called with a progress value between 0 and 1, pass NULL to disable
91
+ llama_progress_callback progress_callback;
92
+ // context pointer passed to the progress callback
93
+ void * progress_callback_user_data;
94
 
95
+ // Keep the booleans together to avoid misalignment during copy-by-value.
96
+ bool low_vram; // if true, reduce VRAM usage at the cost of performance
97
  bool f16_kv; // use fp16 for KV cache
98
  bool logits_all; // the llama_eval() call computes all logits, not just the last one
99
  bool vocab_only; // only load the vocabulary, no weights
100
  bool use_mmap; // use mmap if possible
101
  bool use_mlock; // force system to keep model in RAM
102
  bool embedding; // embedding mode only
 
 
 
 
 
103
  };
 
104
  // model file types
105
  enum llama_ftype {
106
  LLAMA_FTYPE_ALL_F32 = 0,
 
145
 
146
  LLAMA_API int64_t llama_time_us();
147
 
148
+ LLAMA_API struct llama_model * llama_load_model_from_file(
149
+ const char * path_model,
150
+ struct llama_context_params params);
151
+
152
+ LLAMA_API void llama_free_model(struct llama_model * model);
153
+
154
+ LLAMA_API struct llama_context * llama_new_context_with_model(
155
+ struct llama_model * model,
156
+ struct llama_context_params params);
157
+
158
  // Various functions for loading a ggml llama model.
159
  // Allocate (almost) all memory needed for the model.
160
  // Return NULL on failure
161
+ LLAMA_API DEPRECATED(struct llama_context * llama_init_from_file(
162
  const char * path_model,
163
+ struct llama_context_params params),
164
+ "please use llama_load_model_from_file combined with llama_new_context_with_model instead");
165
 
166
  // Frees all allocated memory
167
  LLAMA_API void llama_free(struct llama_context * ctx);
 
178
  // The model needs to be reloaded before applying a new adapter, otherwise the adapter
179
  // will be applied on top of the previous one
180
  // Returns 0 on success
181
+ LLAMA_API DEPRECATED(int llama_apply_lora_from_file(
182
  struct llama_context * ctx,
183
+ const char * path_lora,
184
+ const char * path_base_model,
185
+ int n_threads),
186
+ "please use llama_model_apply_lora_from_file instead");
187
+
188
+ LLAMA_API int llama_model_apply_lora_from_file(
189
+ const struct llama_model * model,
190
  const char * path_lora,
191
  const char * path_base_model,
192
  int n_threads);
 
337
  #include <string>
338
  struct ggml_tensor;
339
 
340
+ const std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx);
341
 
342
  #endif
343
 
model_adapter.cpp CHANGED
@@ -98,7 +98,7 @@ void print_tok_vec(std::vector<float> &embd)
98
  //we need to read more to determine
99
  int32_t vocabsiz = 0;
100
  fin.read((char *) &vocabsiz, sizeof(int32_t));
101
- if(vocabsiz==4096) //actually the d_model for mpt
102
  {
103
  fileformat = FileFormat::MPT_1;
104
  }
 
98
  //we need to read more to determine
99
  int32_t vocabsiz = 0;
100
  fin.read((char *) &vocabsiz, sizeof(int32_t));
101
+ if(vocabsiz==4096 || vocabsiz==7168) //actually the d_model for mpt
102
  {
103
  fileformat = FileFormat::MPT_1;
104
  }
otherarch/gpt2_v3.cpp CHANGED
@@ -12,6 +12,7 @@
12
  #include <string>
13
  #include <vector>
14
  #include <iostream>
 
15
 
16
  #include "model_adapter.h"
17
 
@@ -39,6 +40,8 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g
39
  }
40
  }
41
 
 
 
42
  // load hparams
43
  {
44
  auto & hparams = model.hparams;
@@ -53,7 +56,7 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g
53
  const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
54
 
55
  printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
56
- printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
57
  printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
58
  printf("%s: n_head = %d\n", __func__, hparams.n_head);
59
  printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
@@ -90,9 +93,19 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g
90
 
91
  // if (i < 10) fprintf(stderr, "%.s: vocab[%d] = '%s'\n", __func__, i, word.c_str());
92
  }
93
- }
94
 
95
- auto memory_type = GGML_TYPE_F16;
 
 
 
 
 
 
 
 
 
 
 
96
 
97
  // for the big tensors, we have the option to store the data in 16-bit floats or quantized
98
  // in order to save memory and also to speed up the computation
@@ -144,10 +157,10 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g
144
  ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w
145
  ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b
146
 
147
- ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(memory_type); // memory_k
148
- ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(memory_type); // memory_v
149
 
150
- ctx_size += (6 + 12*n_layer)*512; // object overhead
151
 
152
  printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
153
  }
@@ -158,7 +171,6 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g
158
  params.mem_size = ctx_size;
159
  params.mem_buffer = NULL;
160
  params.no_alloc = false;
161
-
162
 
163
  model.ctx = ggml_init(params);
164
  if (!model.ctx) {
@@ -247,11 +259,11 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g
247
  const int n_layer = hparams.n_layer;
248
  const int n_ctx = hparams.n_ctx;
249
 
250
- const int n_mem = n_layer*n_ctx;
251
  const int n_elements = n_embd*n_mem;
252
 
253
- model.memory_k = ggml_new_tensor_1d(ctx, memory_type, n_elements);
254
- model.memory_v = ggml_new_tensor_1d(ctx, memory_type, n_elements);
255
 
256
  const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v);
257
 
@@ -293,14 +305,14 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g
293
  }
294
 
295
  auto tensor = model.tensors[name.data()];
296
- if (ggml_nelements(tensor) != nelements) {
297
- fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
 
298
  return ModelLoadResult::FAIL;
299
  }
300
-
301
- if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
302
- fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%lld, %lld], expected [%lld, %lld]\n",
303
- __func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]);
304
  return ModelLoadResult::FAIL;
305
  }
306
 
@@ -336,6 +348,28 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g
336
 
337
  fin.close();
338
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
339
 
340
  return ModelLoadResult::SUCCESS;
341
  }
@@ -355,7 +389,7 @@ bool gpt2_eval(
355
  const std::vector<gpt_vocab::id> & embd_inp,
356
  std::vector<float> & embd_w,
357
  size_t & mem_per_token,
358
- FileFormat file_format) {
359
  const int N = embd_inp.size();
360
 
361
  const auto & hparams = model.hparams;
@@ -369,8 +403,16 @@ bool gpt2_eval(
369
  static size_t buf_size = 256u*1024*1024;
370
  static void * buf = malloc(buf_size);
371
 
 
 
 
 
 
 
 
 
372
  if (mem_per_token > 0 && (mem_per_token*N*2 + 64u*1024*1024) > buf_size) {
373
- const size_t buf_size_new = 320u*1024*1024 + 1.6*(mem_per_token*N); // add 10% to account for ggml object overhead
374
  //printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
375
 
376
  // reallocate
@@ -380,7 +422,7 @@ bool gpt2_eval(
380
  buf = realloc(buf, buf_size);
381
  if (buf == nullptr)
382
  {
383
- fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size);
384
  return false;
385
  }
386
  }
@@ -390,7 +432,7 @@ bool gpt2_eval(
390
  params.mem_size = buf_size;
391
  params.mem_buffer = buf;
392
  params.no_alloc = false;
393
-
394
 
395
  struct ggml_context * ctx0 = ggml_init(params);
396
  struct ggml_cgraph gf = {};
@@ -413,6 +455,10 @@ bool gpt2_eval(
413
  for (int il = 0; il < n_layer; ++il) {
414
  struct ggml_tensor * cur;
415
 
 
 
 
 
416
  // norm
417
  {
418
  // [ 768, N]
@@ -559,6 +605,10 @@ bool gpt2_eval(
559
 
560
  struct ggml_tensor * inpFF = cur;
561
 
 
 
 
 
562
  // feed-forward network
563
  {
564
  // norm
@@ -615,6 +665,10 @@ bool gpt2_eval(
615
  inpL = ggml_add(ctx0, cur, inpFF);
616
  }
617
 
 
 
 
 
618
  // norm
619
  {
620
  // [ 768, N]
@@ -629,6 +683,10 @@ bool gpt2_eval(
629
  ggml_repeat(ctx0, model.ln_f_b, inpL));
630
  }
631
 
 
 
 
 
632
  // inpL = WTE * inpL
633
  // [ 768, 50257] - model.lm_head
634
  // [ 768, N] - inpL
 
12
  #include <string>
13
  #include <vector>
14
  #include <iostream>
15
+ #include <algorithm>
16
 
17
  #include "model_adapter.h"
18
 
 
40
  }
41
  }
42
 
43
+ int32_t origmaxctx = model.hparams.n_ctx;
44
+
45
  // load hparams
46
  {
47
  auto & hparams = model.hparams;
 
56
  const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
57
 
58
  printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
59
+ printf("%s: n_ctx = %d (%d)\n", __func__, hparams.n_ctx,origmaxctx);
60
  printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
61
  printf("%s: n_head = %d\n", __func__, hparams.n_head);
62
  printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
 
93
 
94
  // if (i < 10) fprintf(stderr, "%.s: vocab[%d] = '%s'\n", __func__, i, word.c_str());
95
  }
 
96
 
97
+ // Add StarChat special tokens.
98
+ for (const std::string & token : {
99
+ "<|system|>",
100
+ "<|user|>",
101
+ "<|assistant|>",
102
+ "<|end|>",
103
+ }) {
104
+ if (vocab.token_to_id.find(token) != vocab.token_to_id.end()) {
105
+ vocab.add_special_token(token);
106
+ }
107
+ }
108
+ }
109
 
110
  // for the big tensors, we have the option to store the data in 16-bit floats or quantized
111
  // in order to save memory and also to speed up the computation
 
157
  ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w
158
  ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b
159
 
160
+ ctx_size += std::max(origmaxctx,n_ctx)*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F16); // memory_k
161
+ ctx_size += std::max(origmaxctx,n_ctx)*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F16); // memory_v
162
 
163
+ ctx_size += (6 + 12*n_layer)*1024; // object overhead
164
 
165
  printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
166
  }
 
171
  params.mem_size = ctx_size;
172
  params.mem_buffer = NULL;
173
  params.no_alloc = false;
 
174
 
175
  model.ctx = ggml_init(params);
176
  if (!model.ctx) {
 
259
  const int n_layer = hparams.n_layer;
260
  const int n_ctx = hparams.n_ctx;
261
 
262
+ const int n_mem = n_layer*std::max(origmaxctx,n_ctx);
263
  const int n_elements = n_embd*n_mem;
264
 
265
+ model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements);
266
+ model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements);
267
 
268
  const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v);
269
 
 
305
  }
306
 
307
  auto tensor = model.tensors[name.data()];
308
+ if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
309
+ fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
310
+ __func__, name.data(), (int) tensor->ne[0], (int) tensor->ne[1], ne[0], ne[1]);
311
  return ModelLoadResult::FAIL;
312
  }
313
+ if (ggml_nelements(tensor) != nelements) {
314
+ fprintf(stderr, "%s: tensor '%s' has wrong size in model file. got %d, expected %d\n",
315
+ __func__, name.data(), (int) ggml_nelements(tensor), nelements);
 
316
  return ModelLoadResult::FAIL;
317
  }
318
 
 
348
 
349
  fin.close();
350
 
351
+ //gpu offload
352
+ #if defined(GGML_USE_CLBLAST)
353
+ if(gpulayers>0)
354
+ {
355
+ const auto & hparams = model.hparams;
356
+ size_t vram_total = 0;
357
+ const int n_gpu = std::min(gpulayers, int(hparams.n_layer));
358
+ fprintf(stderr, "%s: [opencl] offloading %d layers to GPU\n", __func__, n_gpu);
359
+ for (int i = 0; i < n_gpu; ++i) {
360
+ const auto & layer = model.layers[i];
361
+ layer.c_attn_attn_w->backend = GGML_BACKEND_GPU;
362
+ layer.c_attn_proj_w->backend = GGML_BACKEND_GPU;
363
+ layer.c_mlp_fc_w->backend = GGML_BACKEND_GPU;
364
+ layer.c_mlp_proj_w->backend = GGML_BACKEND_GPU;
365
+ ggml_cl_transform_tensor(layer.c_attn_attn_w->data,layer.c_attn_attn_w); vram_total += ggml_nbytes(layer.c_attn_attn_w);
366
+ ggml_cl_transform_tensor(layer.c_attn_proj_w->data,layer.c_attn_proj_w); vram_total += ggml_nbytes(layer.c_attn_proj_w);
367
+ ggml_cl_transform_tensor(layer.c_mlp_fc_w->data,layer.c_mlp_fc_w); vram_total += ggml_nbytes(layer.c_mlp_fc_w);
368
+ ggml_cl_transform_tensor(layer.c_mlp_proj_w->data,layer.c_mlp_proj_w); vram_total += ggml_nbytes(layer.c_mlp_proj_w);
369
+ }
370
+ fprintf(stderr, "%s: [opencl] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
371
+ }
372
+ #endif
373
 
374
  return ModelLoadResult::SUCCESS;
375
  }
 
389
  const std::vector<gpt_vocab::id> & embd_inp,
390
  std::vector<float> & embd_w,
391
  size_t & mem_per_token,
392
+ bool use_scratch) {
393
  const int N = embd_inp.size();
394
 
395
  const auto & hparams = model.hparams;
 
403
  static size_t buf_size = 256u*1024*1024;
404
  static void * buf = malloc(buf_size);
405
 
406
+ // use 2 scratch buffers
407
+ // TODO: very hacky solution - reimplement in a more elegant way
408
+ static size_t scr0_size = (n_embd>2400?512u:256u)*1024*1024;
409
+ static size_t scr1_size = (n_embd>2400?512u:256u)*1024*1024;
410
+
411
+ static void * scr0 = malloc(scr0_size);
412
+ static void * scr1 = malloc(scr1_size);
413
+
414
  if (mem_per_token > 0 && (mem_per_token*N*2 + 64u*1024*1024) > buf_size) {
415
+ const size_t buf_size_new = 320u*1024*1024 + 1.2*(mem_per_token*N); // add 10% to account for ggml object overhead
416
  //printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
417
 
418
  // reallocate
 
422
  buf = realloc(buf, buf_size);
423
  if (buf == nullptr)
424
  {
425
+ fprintf(stderr, "%s: failed to allocate %zu bytes. Try reducing batch size.\n", __func__, buf_size);
426
  return false;
427
  }
428
  }
 
432
  params.mem_size = buf_size;
433
  params.mem_buffer = buf;
434
  params.no_alloc = false;
435
+
436
 
437
  struct ggml_context * ctx0 = ggml_init(params);
438
  struct ggml_cgraph gf = {};
 
455
  for (int il = 0; il < n_layer; ++il) {
456
  struct ggml_tensor * cur;
457
 
458
+ if(use_scratch){
459
+ ggml_set_scratch(ctx0, { 0, scr0_size, scr0, });
460
+ }
461
+
462
  // norm
463
  {
464
  // [ 768, N]
 
605
 
606
  struct ggml_tensor * inpFF = cur;
607
 
608
+ if(use_scratch){
609
+ ggml_set_scratch(ctx0, { 0, scr1_size, scr1, });
610
+ }
611
+
612
  // feed-forward network
613
  {
614
  // norm
 
665
  inpL = ggml_add(ctx0, cur, inpFF);
666
  }
667
 
668
+ if(use_scratch){
669
+ ggml_set_scratch(ctx0, { 0, scr0_size, scr0, });
670
+ }
671
+
672
  // norm
673
  {
674
  // [ 768, N]
 
683
  ggml_repeat(ctx0, model.ln_f_b, inpL));
684
  }
685
 
686
+ if(use_scratch){
687
+ ggml_set_scratch(ctx0, { 0, 0, nullptr, });
688
+ }
689
+
690
  // inpL = WTE * inpL
691
  // [ 768, 50257] - model.lm_head
692
  // [ 768, N] - inpL
otherarch/gptj_v3.cpp CHANGED
@@ -12,10 +12,13 @@
12
  #include <string>
13
  #include <vector>
14
  #include <iostream>
 
15
 
16
  #include "model_adapter.h"
17
 
18
-
 
 
19
 
20
  // load the model's weights from a file
21
  ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, gpt_vocab & vocab, int gpulayers) {
@@ -37,6 +40,8 @@ ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, g
37
  }
38
  }
39
 
 
 
40
  // load hparams
41
  {
42
  auto & hparams = model.hparams;
@@ -52,7 +57,7 @@ ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, g
52
  const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
53
 
54
  printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
55
- printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
56
  printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
57
  printf("%s: n_head = %d\n", __func__, hparams.n_head);
58
  printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
@@ -136,8 +141,8 @@ ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, g
136
  ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w
137
  ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b
138
 
139
- ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(memory_type); // memory_k
140
- ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(memory_type); // memory_v
141
 
142
  ctx_size += (5 + 10*n_layer)*512; // object overhead
143
 
@@ -150,7 +155,7 @@ ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, g
150
  params.mem_size = ctx_size;
151
  params.mem_buffer = NULL;
152
  params.no_alloc = false;
153
-
154
 
155
  model.ctx = ggml_init(params);
156
  if (!model.ctx) {
@@ -230,7 +235,7 @@ ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, g
230
  const int n_layer = hparams.n_layer;
231
  const int n_ctx = hparams.n_ctx;
232
 
233
- const int n_mem = n_layer*n_ctx;
234
  const int n_elements = n_embd*n_mem;
235
 
236
  model.memory_k = ggml_new_tensor_1d(ctx, memory_type, n_elements);
@@ -281,7 +286,7 @@ ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, g
281
  fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
282
  return ModelLoadResult::FAIL;
283
  }
284
-
285
 
286
  if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
287
 
@@ -298,7 +303,7 @@ ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, g
298
  __func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]);
299
  return ModelLoadResult::FAIL;
300
  }
301
-
302
  }
303
 
304
  // for debugging
@@ -331,7 +336,32 @@ ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, g
331
 
332
  fin.close();
333
 
334
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
335
 
336
  return ModelLoadResult::SUCCESS;
337
  }
@@ -352,7 +382,8 @@ bool gptj_eval(
352
  const int n_past,
353
  const std::vector<gpt_vocab::id> & embd_inp,
354
  std::vector<float> & embd_w,
355
- size_t & mem_per_token) {
 
356
  const int N = embd_inp.size();
357
 
358
  const auto & hparams = model.hparams;
@@ -367,8 +398,16 @@ bool gptj_eval(
367
  static size_t buf_size = 256u*1024*1024;
368
  static void * buf = malloc(buf_size);
369
 
 
 
 
 
 
 
 
 
370
  if (mem_per_token > 0 && (mem_per_token*N*2 + 64u*1024*1024) > buf_size) {
371
- const size_t buf_size_new = 320u*1024*1024 + 1.6*(mem_per_token*N); // add 10% to account for ggml object overhead
372
  //printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
373
 
374
  // reallocate
@@ -378,7 +417,7 @@ bool gptj_eval(
378
  buf = realloc(buf, buf_size);
379
  if (buf == nullptr)
380
  {
381
- fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size);
382
  return false;
383
  }
384
  }
@@ -388,7 +427,7 @@ bool gptj_eval(
388
  params.mem_size = buf_size;
389
  params.mem_buffer = buf;
390
  params.no_alloc = false;
391
-
392
 
393
  struct ggml_context * ctx0 = ggml_init(params);
394
  struct ggml_cgraph gf = {};
@@ -403,6 +442,10 @@ bool gptj_eval(
403
  for (int il = 0; il < n_layer; ++il) {
404
  struct ggml_tensor * cur;
405
 
 
 
 
 
406
  // norm
407
  {
408
  cur = ggml_norm(ctx0, inpL);
@@ -490,6 +533,10 @@ bool gptj_eval(
490
  cur);
491
  }
492
 
 
 
 
 
493
  struct ggml_tensor * inpFF = cur;
494
 
495
  // feed-forward network
@@ -525,6 +572,10 @@ bool gptj_eval(
525
  inpL = ggml_add(ctx0, cur, inpL);
526
  }
527
 
 
 
 
 
528
  // norm
529
  {
530
  inpL = ggml_norm(ctx0, inpL);
@@ -537,6 +588,10 @@ bool gptj_eval(
537
  ggml_repeat(ctx0, model.ln_f_b, inpL));
538
  }
539
 
 
 
 
 
540
  // lm_head
541
  {
542
  inpL = ggml_mul_mat(ctx0, model.lmh_g, inpL);
 
12
  #include <string>
13
  #include <vector>
14
  #include <iostream>
15
+ #include <algorithm>
16
 
17
  #include "model_adapter.h"
18
 
19
+ #if defined(GGML_USE_CLBLAST)
20
+ #include "ggml-opencl.h"
21
+ #endif
22
 
23
  // load the model's weights from a file
24
  ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, gpt_vocab & vocab, int gpulayers) {
 
40
  }
41
  }
42
 
43
+ int32_t origmaxctx = model.hparams.n_ctx;
44
+
45
  // load hparams
46
  {
47
  auto & hparams = model.hparams;
 
57
  const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
58
 
59
  printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
60
+ printf("%s: n_ctx = %d (%d)\n", __func__, hparams.n_ctx,origmaxctx);
61
  printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
62
  printf("%s: n_head = %d\n", __func__, hparams.n_head);
63
  printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
 
141
  ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w
142
  ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b
143
 
144
+ ctx_size += std::max(origmaxctx,n_ctx)*n_layer*n_embd*ggml_type_sizef(memory_type); // memory_k
145
+ ctx_size += std::max(origmaxctx,n_ctx)*n_layer*n_embd*ggml_type_sizef(memory_type); // memory_v
146
 
147
  ctx_size += (5 + 10*n_layer)*512; // object overhead
148
 
 
155
  params.mem_size = ctx_size;
156
  params.mem_buffer = NULL;
157
  params.no_alloc = false;
158
+
159
 
160
  model.ctx = ggml_init(params);
161
  if (!model.ctx) {
 
235
  const int n_layer = hparams.n_layer;
236
  const int n_ctx = hparams.n_ctx;
237
 
238
+ const int n_mem = n_layer*std::max(origmaxctx,n_ctx);
239
  const int n_elements = n_embd*n_mem;
240
 
241
  model.memory_k = ggml_new_tensor_1d(ctx, memory_type, n_elements);
 
286
  fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
287
  return ModelLoadResult::FAIL;
288
  }
289
+
290
 
291
  if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
292
 
 
303
  __func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]);
304
  return ModelLoadResult::FAIL;
305
  }
306
+
307
  }
308
 
309
  // for debugging
 
336
 
337
  fin.close();
338
 
339
+ //gpu offload
340
+ #if defined(GGML_USE_CLBLAST)
341
+ if(gpulayers>0)
342
+ {
343
+ const auto & hparams = model.hparams;
344
+ size_t vram_total = 0;
345
+ const int n_gpu = std::min(gpulayers, int(hparams.n_layer));
346
+ fprintf(stderr, "%s: [opencl] offloading %d layers to GPU\n", __func__, n_gpu);
347
+ for (int i = 0; i < n_gpu; ++i) {
348
+ const auto & layer = model.layers[i];
349
+ layer.c_attn_q_proj_w->backend = GGML_BACKEND_GPU;
350
+ layer.c_attn_k_proj_w->backend = GGML_BACKEND_GPU;
351
+ layer.c_attn_v_proj_w->backend = GGML_BACKEND_GPU;
352
+ layer.c_attn_proj_w->backend = GGML_BACKEND_GPU;
353
+ layer.c_mlp_fc_w->backend = GGML_BACKEND_GPU;
354
+ layer.c_mlp_proj_w->backend = GGML_BACKEND_GPU;
355
+ ggml_cl_transform_tensor(layer.c_attn_q_proj_w->data,layer.c_attn_q_proj_w); vram_total += ggml_nbytes(layer.c_attn_q_proj_w);
356
+ ggml_cl_transform_tensor(layer.c_attn_k_proj_w->data,layer.c_attn_k_proj_w); vram_total += ggml_nbytes(layer.c_attn_k_proj_w);
357
+ ggml_cl_transform_tensor(layer.c_attn_v_proj_w->data,layer.c_attn_v_proj_w); vram_total += ggml_nbytes(layer.c_attn_v_proj_w);
358
+ ggml_cl_transform_tensor(layer.c_attn_proj_w->data,layer.c_attn_proj_w); vram_total += ggml_nbytes(layer.c_attn_proj_w);
359
+ ggml_cl_transform_tensor(layer.c_mlp_fc_w->data,layer.c_mlp_fc_w); vram_total += ggml_nbytes(layer.c_mlp_fc_w);
360
+ ggml_cl_transform_tensor(layer.c_mlp_proj_w->data,layer.c_mlp_proj_w); vram_total += ggml_nbytes(layer.c_mlp_proj_w);
361
+ }
362
+ fprintf(stderr, "%s: [opencl] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
363
+ }
364
+ #endif
365
 
366
  return ModelLoadResult::SUCCESS;
367
  }
 
382
  const int n_past,
383
  const std::vector<gpt_vocab::id> & embd_inp,
384
  std::vector<float> & embd_w,
385
+ size_t & mem_per_token,
386
+ bool use_scratch) {
387
  const int N = embd_inp.size();
388
 
389
  const auto & hparams = model.hparams;
 
398
  static size_t buf_size = 256u*1024*1024;
399
  static void * buf = malloc(buf_size);
400
 
401
+ // use 2 scratch buffers
402
+ // TODO: very hacky solution - reimplement in a more elegant way
403
+ static size_t scr0_size = 512u*1024*1024;
404
+ static size_t scr1_size = 512u*1024*1024;
405
+
406
+ static void * scr0 = malloc(scr0_size);
407
+ static void * scr1 = malloc(scr1_size);
408
+
409
  if (mem_per_token > 0 && (mem_per_token*N*2 + 64u*1024*1024) > buf_size) {
410
+ const size_t buf_size_new = 320u*1024*1024 + 1.2*(mem_per_token*N); // add 10% to account for ggml object overhead
411
  //printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
412
 
413
  // reallocate
 
417
  buf = realloc(buf, buf_size);
418
  if (buf == nullptr)
419
  {
420
+ fprintf(stderr, "%s: failed to allocate %zu bytes. Try reducing batch size.\n", __func__, buf_size);
421
  return false;
422
  }
423
  }
 
427
  params.mem_size = buf_size;
428
  params.mem_buffer = buf;
429
  params.no_alloc = false;
430
+
431
 
432
  struct ggml_context * ctx0 = ggml_init(params);
433
  struct ggml_cgraph gf = {};
 
442
  for (int il = 0; il < n_layer; ++il) {
443
  struct ggml_tensor * cur;
444
 
445
+ if(use_scratch){
446
+ ggml_set_scratch(ctx0, { 0, scr0_size, scr0, });
447
+ }
448
+
449
  // norm
450
  {
451
  cur = ggml_norm(ctx0, inpL);
 
533
  cur);
534
  }
535
 
536
+ if(use_scratch){
537
+ ggml_set_scratch(ctx0, { 0, scr1_size, scr1, });
538
+ }
539
+
540
  struct ggml_tensor * inpFF = cur;
541
 
542
  // feed-forward network
 
572
  inpL = ggml_add(ctx0, cur, inpL);
573
  }
574
 
575
+ if(use_scratch){
576
+ ggml_set_scratch(ctx0, { 0, scr0_size, scr0, });
577
+ }
578
+
579
  // norm
580
  {
581
  inpL = ggml_norm(ctx0, inpL);
 
588
  ggml_repeat(ctx0, model.ln_f_b, inpL));
589
  }
590
 
591
+ if(use_scratch){
592
+ ggml_set_scratch(ctx0, { 0, 0, nullptr, });
593
+ }
594
+
595
  // lm_head
596
  {
597
  inpL = ggml_mul_mat(ctx0, model.lmh_g, inpL);
otherarch/llama_v2.cpp CHANGED
@@ -59,7 +59,7 @@ static const std::map<e_model2, size_t> & MEM_REQ_SCRATCH0_2()
59
  { MODEL_UNKNOWN_2, 512ull * MB_2 },
60
  { MODEL_7B_2, 512ull * MB_2 },
61
  { MODEL_13B_2, 512ull * MB_2 },
62
- { MODEL_30B_2, 512ull * MB_2 },
63
  { MODEL_65B_2, 1024ull * MB_2 },
64
  };
65
  return k_sizes;
@@ -71,7 +71,7 @@ static const std::map<e_model2, size_t> & MEM_REQ_SCRATCH1_2()
71
  { MODEL_UNKNOWN_2, 512ull * MB_2 },
72
  { MODEL_7B_2, 512ull * MB_2 },
73
  { MODEL_13B_2, 512ull * MB_2 },
74
- { MODEL_30B_2, 512ull * MB_2 },
75
  { MODEL_65B_2, 1024ull * MB_2 },
76
  };
77
  return k_sizes;
 
59
  { MODEL_UNKNOWN_2, 512ull * MB_2 },
60
  { MODEL_7B_2, 512ull * MB_2 },
61
  { MODEL_13B_2, 512ull * MB_2 },
62
+ { MODEL_30B_2, 640ull * MB_2 },
63
  { MODEL_65B_2, 1024ull * MB_2 },
64
  };
65
  return k_sizes;
 
71
  { MODEL_UNKNOWN_2, 512ull * MB_2 },
72
  { MODEL_7B_2, 512ull * MB_2 },
73
  { MODEL_13B_2, 512ull * MB_2 },
74
+ { MODEL_30B_2, 640ull * MB_2 },
75
  { MODEL_65B_2, 1024ull * MB_2 },
76
  };
77
  return k_sizes;
otherarch/mpt_v3.cpp CHANGED
@@ -12,13 +12,16 @@
12
  #include <string>
13
  #include <vector>
14
  #include <iostream>
 
15
 
16
  #include "model_adapter.h"
17
 
18
-
 
 
19
 
20
  // load the model's weights from a file
21
- bool mpt_model_load(const std::string & fname, mpt_model & model, gpt_vocab & vocab) {
22
  printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
23
 
24
  auto fin = std::ifstream(fname, std::ios::binary);
@@ -75,7 +78,7 @@ bool mpt_model_load(const std::string & fname, mpt_model & model, gpt_vocab & vo
75
  std::string word;
76
  std::vector<char> buf(128);
77
 
78
- for (int i = 0; i < n_vocab; i++) {
79
  uint32_t len;
80
  fin.read((char *) &len, sizeof(len));
81
 
@@ -83,6 +86,16 @@ bool mpt_model_load(const std::string & fname, mpt_model & model, gpt_vocab & vo
83
  fin.read((char *) buf.data(), len);
84
  word.assign(buf.data(), len);
85
 
 
 
 
 
 
 
 
 
 
 
86
  vocab.token_to_id[word] = i;
87
  vocab.id_to_token[i] = word;
88
  }
@@ -120,8 +133,8 @@ bool mpt_model_load(const std::string & fname, mpt_model & model, gpt_vocab & vo
120
  ctx_size += n_layer * (4 * n_embd * n_embd * ggml_type_sizef(wtype)); // mlp_mlp_up_weight
121
  ctx_size += n_layer * (n_embd * n_embd * 4 * ggml_type_sizef(wtype)); // mlp_mlp_down_weight
122
 
123
- ctx_size += (n_ctx * n_layer * n_embd * ggml_type_sizef(GGML_TYPE_F16)); // memory_k
124
- ctx_size += (n_ctx * n_layer * n_embd * ggml_type_sizef(GGML_TYPE_F16)); // memory_v
125
 
126
  ctx_size += (6 + 6 * n_layer) * 512; // object overhead
127
 
@@ -278,6 +291,29 @@ bool mpt_model_load(const std::string & fname, mpt_model & model, gpt_vocab & vo
278
 
279
  fin.close();
280
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
281
  return true;
282
  }
283
 
@@ -290,7 +326,8 @@ bool mpt_model_load(const std::string & fname, mpt_model & model, gpt_vocab & vo
290
  // - embd_w: the predicted logits for the next token
291
  //
292
  bool mpt_eval(const mpt_model & model, const int n_threads, const int n_past,
293
- const std::vector<gpt_vocab::id> & embd_inp, std::vector<float> & embd_w, bool logits_all, size_t & mem_per_token) {
 
294
  const int N = embd_inp.size();
295
 
296
  const auto & hparams = model.hparams;
@@ -306,22 +343,26 @@ bool mpt_eval(const mpt_model & model, const int n_threads, const int n_past,
306
 
307
  // use 2 scratch buffers
308
  // TODO: very hacky solution - reimplement in a more elegant way
309
- static size_t scr0_size = (n_ctx>2048?1024u:512u)*1024*1024;
310
- static void * scr0 = malloc(scr0_size);
 
311
 
312
- static size_t scr1_size = (n_ctx>2048?1024u:512u)*1024*1024;
313
  static void * scr1 = malloc(scr1_size);
314
 
315
- if (mem_per_token > 0 && mem_per_token * N > buf_size) {
316
- const size_t buf_size_new = 1.1 * (mem_per_token * N); // add 10% to account for ggml object overhead
317
  // printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__,
318
  // buf_size, buf_size_new);
319
  // reallocate
320
- buf_size = buf_size_new;
321
- buf = realloc(buf, buf_size);
322
- if (buf == nullptr) {
323
- fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size);
324
- return false;
 
 
 
325
  }
326
  }
327
 
@@ -343,7 +384,9 @@ bool mpt_eval(const mpt_model & model, const int n_threads, const int n_past,
343
 
344
  struct ggml_tensor * cur;
345
 
 
346
  ggml_set_scratch(ctx0, { 0, scr0_size, scr0, });
 
347
 
348
  // a = self.ln_1(x)
349
  {
@@ -439,7 +482,9 @@ bool mpt_eval(const mpt_model & model, const int n_threads, const int n_past,
439
 
440
  inpL = ggml_add(ctx0, inpL, cur);
441
 
 
442
  ggml_set_scratch(ctx0, { 0, scr1_size, scr1, });
 
443
 
444
  // m = self.ln_2(x)
445
  {
@@ -465,7 +510,9 @@ bool mpt_eval(const mpt_model & model, const int n_threads, const int n_past,
465
  inpL = ggml_add(ctx0, inpL, cur);
466
  }
467
 
 
468
  ggml_set_scratch(ctx0, { 0, scr0_size, scr0, });
 
469
 
470
  // norm
471
  {
@@ -474,7 +521,9 @@ bool mpt_eval(const mpt_model & model, const int n_threads, const int n_past,
474
  inpL = ggml_mul(ctx0, ggml_repeat(ctx0, model.norm_f_weight, inpL), inpL);
475
  }
476
 
 
477
  ggml_set_scratch(ctx0, { 0, 0, nullptr, });
 
478
 
479
  // output embedding weight tied to input embedding
480
  inpL = ggml_mul_mat(ctx0, model.wte_weight, inpL);
 
12
  #include <string>
13
  #include <vector>
14
  #include <iostream>
15
+ #include <algorithm>
16
 
17
  #include "model_adapter.h"
18
 
19
+ #if defined(GGML_USE_CLBLAST)
20
+ #include "ggml-opencl.h"
21
+ #endif
22
 
23
  // load the model's weights from a file
24
+ bool mpt_model_load(const std::string & fname, mpt_model & model, gpt_vocab & vocab, int gpulayers) {
25
  printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
26
 
27
  auto fin = std::ifstream(fname, std::ios::binary);
 
78
  std::string word;
79
  std::vector<char> buf(128);
80
 
81
+ for (int i = 0; i < n_vocab; i++) {
82
  uint32_t len;
83
  fin.read((char *) &len, sizeof(len));
84
 
 
86
  fin.read((char *) buf.data(), len);
87
  word.assign(buf.data(), len);
88
 
89
+ // Convert token from utf-8
90
+ // std::wstring word_multibytes = convert_to_wstring(word);
91
+ // if(word_multibytes!=L"")
92
+ // {
93
+ // word.resize(word_multibytes.size());
94
+ // for (int w = 0; w < word_multibytes.size(); w++) {
95
+ // word[w] = uint8_t(word_multibytes[w]);
96
+ // }
97
+ // }
98
+
99
  vocab.token_to_id[word] = i;
100
  vocab.id_to_token[i] = word;
101
  }
 
133
  ctx_size += n_layer * (4 * n_embd * n_embd * ggml_type_sizef(wtype)); // mlp_mlp_up_weight
134
  ctx_size += n_layer * (n_embd * n_embd * 4 * ggml_type_sizef(wtype)); // mlp_mlp_down_weight
135
 
136
+ ctx_size += n_ctx * n_layer * n_embd * ggml_type_sizef(GGML_TYPE_F16); // memory_k
137
+ ctx_size += n_ctx * n_layer * n_embd * ggml_type_sizef(GGML_TYPE_F16); // memory_v
138
 
139
  ctx_size += (6 + 6 * n_layer) * 512; // object overhead
140
 
 
291
 
292
  fin.close();
293
 
294
+ //gpu offload
295
+ #if defined(GGML_USE_CLBLAST)
296
+ if(gpulayers>0)
297
+ {
298
+ const auto & hparams = model.hparams;
299
+ size_t vram_total = 0;
300
+ const int n_gpu = std::min(gpulayers, int(hparams.n_layers));
301
+ fprintf(stderr, "%s: [opencl] offloading %d layers to GPU\n", __func__, n_gpu);
302
+ for (int i = 0; i < n_gpu; ++i) {
303
+ const auto & layer = model.layers[i];
304
+ layer.ffn_up_proj->backend = GGML_BACKEND_GPU;
305
+ layer.ffn_down_proj->backend = GGML_BACKEND_GPU;
306
+ layer.c_attn_wqkv_weight->backend = GGML_BACKEND_GPU;
307
+ layer.c_attn_out_proj_weight->backend = GGML_BACKEND_GPU;
308
+ ggml_cl_transform_tensor(layer.ffn_up_proj->data,layer.ffn_up_proj); vram_total += ggml_nbytes(layer.ffn_up_proj);
309
+ ggml_cl_transform_tensor(layer.ffn_down_proj->data,layer.ffn_down_proj); vram_total += ggml_nbytes(layer.ffn_down_proj);
310
+ ggml_cl_transform_tensor(layer.c_attn_wqkv_weight->data,layer.c_attn_wqkv_weight); vram_total += ggml_nbytes(layer.c_attn_wqkv_weight);
311
+ ggml_cl_transform_tensor(layer.c_attn_out_proj_weight->data,layer.c_attn_out_proj_weight); vram_total += ggml_nbytes(layer.c_attn_out_proj_weight);
312
+ }
313
+ fprintf(stderr, "%s: [opencl] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
314
+ }
315
+ #endif
316
+
317
  return true;
318
  }
319
 
 
326
  // - embd_w: the predicted logits for the next token
327
  //
328
  bool mpt_eval(const mpt_model & model, const int n_threads, const int n_past,
329
+ const std::vector<gpt_vocab::id> & embd_inp, std::vector<float> & embd_w,
330
+ bool logits_all, size_t & mem_per_token, bool use_scratch) {
331
  const int N = embd_inp.size();
332
 
333
  const auto & hparams = model.hparams;
 
343
 
344
  // use 2 scratch buffers
345
  // TODO: very hacky solution - reimplement in a more elegant way
346
+ //MPT 30B needs more scratch memory
347
+ static size_t scr0_size = (n_embd>=7168?2048u:1024u)*1024*1024;
348
+ static size_t scr1_size = (n_embd>=7168?2048u:1024u)*1024*1024;
349
 
350
+ static void * scr0 = malloc(scr0_size);
351
  static void * scr1 = malloc(scr1_size);
352
 
353
+ if (mem_per_token > 0 && (mem_per_token*N*2 + 64u*1024*1024) > buf_size) {
354
+ const size_t buf_size_new = 320u*1024*1024 + 1.2*(mem_per_token*N); // add 10% to account for ggml object overhead
355
  // printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__,
356
  // buf_size, buf_size_new);
357
  // reallocate
358
+ if (buf_size_new > buf_size)
359
+ {
360
+ buf_size = buf_size_new;
361
+ buf = realloc(buf, buf_size);
362
+ if (buf == nullptr) {
363
+ fprintf(stderr, "%s: failed to allocate %zu bytes. Try reducing batch size.\n", __func__, buf_size);
364
+ return false;
365
+ }
366
  }
367
  }
368
 
 
384
 
385
  struct ggml_tensor * cur;
386
 
387
+ if(use_scratch){
388
  ggml_set_scratch(ctx0, { 0, scr0_size, scr0, });
389
+ }
390
 
391
  // a = self.ln_1(x)
392
  {
 
482
 
483
  inpL = ggml_add(ctx0, inpL, cur);
484
 
485
+ if(use_scratch){
486
  ggml_set_scratch(ctx0, { 0, scr1_size, scr1, });
487
+ }
488
 
489
  // m = self.ln_2(x)
490
  {
 
510
  inpL = ggml_add(ctx0, inpL, cur);
511
  }
512
 
513
+ if(use_scratch){
514
  ggml_set_scratch(ctx0, { 0, scr0_size, scr0, });
515
+ }
516
 
517
  // norm
518
  {
 
521
  inpL = ggml_mul(ctx0, ggml_repeat(ctx0, model.norm_f_weight, inpL), inpL);
522
  }
523
 
524
+ if(use_scratch){
525
  ggml_set_scratch(ctx0, { 0, 0, nullptr, });
526
+ }
527
 
528
  // output embedding weight tied to input embedding
529
  inpL = ggml_mul_mat(ctx0, model.wte_weight, inpL);
otherarch/neox_v3.cpp CHANGED
@@ -12,11 +12,14 @@
12
  #include <string>
13
  #include <vector>
14
  #include <iostream>
 
15
 
16
-
 
 
17
 
18
  // load the model's weights from a file
19
- ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt_vocab & vocab, FileFormat file_format) {
20
  printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
21
 
22
  auto fin = std::ifstream(fname, std::ios::binary);
@@ -35,30 +38,25 @@ ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model &
35
  }
36
  }
37
 
 
 
38
  // load hparams
39
  {
40
  auto & hparams = model.hparams;
41
- hparams.par_res = 1; //true
42
  fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
43
  fin.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
44
  fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd));
45
  fin.read((char *) &hparams.n_head, sizeof(hparams.n_head));
46
  fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
47
  fin.read((char *) &hparams.n_rot, sizeof(hparams.n_rot));
48
- if(file_format!=FileFormat::NEOX_1 && file_format!=FileFormat::NEOX_2 && file_format!=FileFormat::NEOX_3)
49
- {
50
- fin.read((char *) &hparams.par_res, sizeof(hparams.par_res));
51
- }
52
- if(file_format==FileFormat::NEOX_3)
53
- {
54
- hparams.par_res = 0;
55
- }
56
  fin.read((char *) &hparams.ftype, sizeof(hparams.ftype));
57
 
58
  const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
59
 
60
  printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
61
- printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
62
  printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
63
  printf("%s: n_head = %d\n", __func__, hparams.n_head);
64
  printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
@@ -107,10 +105,10 @@ ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model &
107
  {
108
  const auto & hparams = model.hparams;
109
 
110
- const int n_embd = hparams.n_embd;
111
- const int n_layer = hparams.n_layer;
112
- const int n_ctx = hparams.n_ctx;
113
- const int n_vocab = hparams.n_vocab;
114
 
115
  ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g
116
  ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b
@@ -138,10 +136,10 @@ ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model &
138
  ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w
139
  ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b
140
 
141
- ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F16); // memory_k
142
- ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F16); // memory_v
143
 
144
- ctx_size += (6 + 16*n_layer)*512; // object overhead
145
 
146
  printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
147
  }
@@ -152,7 +150,7 @@ ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model &
152
  params.mem_size = ctx_size;
153
  params.mem_buffer = NULL;
154
  params.no_alloc = false;
155
-
156
  model.ctx = ggml_init(params);
157
  if (!model.ctx) {
158
  fprintf(stderr, "%s: ggml_init() failed\n", __func__);
@@ -237,7 +235,7 @@ ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model &
237
  const int n_layer = hparams.n_layer;
238
  const int n_ctx = hparams.n_ctx;
239
 
240
- const int64_t n_mem = n_layer*n_ctx;
241
  const int64_t n_elements = n_embd*n_mem;
242
 
243
  model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements);
@@ -300,22 +298,7 @@ ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model &
300
  printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
301
  }
302
 
303
- size_t bpe = ggml_type_size(ggml_type(ttype));
304
-
305
- if(file_format==FileFormat::NEOX_1)
306
- {
307
- switch (ttype) {
308
- case 0: bpe = ggml_type_size(GGML_TYPE_F32); break;
309
- case 1: bpe = ggml_type_size(GGML_TYPE_F16); break;
310
- case 2: bpe = ggml_type_size(GGML_TYPE_Q4_0); assert(ne[0] % 64 == 0); break;
311
- case 3: bpe = ggml_type_size(GGML_TYPE_Q4_1); assert(ne[0] % 64 == 0); break;
312
- default:
313
- {
314
- fprintf(stderr, "%s: unknown ftype %d in model file\n", __func__, ttype);
315
- return ModelLoadResult::FAIL;
316
- }
317
- };
318
- }
319
 
320
  if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
321
  fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
@@ -340,6 +323,29 @@ ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model &
340
 
341
  fin.close();
342
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
343
  return ModelLoadResult::SUCCESS;
344
  }
345
 
@@ -394,7 +400,8 @@ bool gpt_neox_eval(
394
  const int n_past,
395
  const std::vector<gpt_vocab::id> & embd_inp,
396
  std::vector<float> & embd_w,
397
- size_t & mem_per_token) {
 
398
  const int N = embd_inp.size();
399
 
400
  const auto & hparams = model.hparams;
@@ -409,8 +416,16 @@ bool gpt_neox_eval(
409
  static size_t buf_size = 256u*1024*1024;
410
  static void * buf = malloc(buf_size);
411
 
 
 
 
 
 
 
 
 
412
  if (mem_per_token > 0 && (mem_per_token*N*2 + 64u*1024*1024) > buf_size) {
413
- const size_t buf_size_new = 360u*1024*1024 + 1.6*(mem_per_token*N); // add 10% to account for ggml object overhead
414
  //printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
415
 
416
  // reallocate
@@ -420,7 +435,7 @@ bool gpt_neox_eval(
420
  buf = realloc(buf, buf_size);
421
  if (buf == nullptr)
422
  {
423
- fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size);
424
  return false;
425
  }
426
  }
@@ -430,7 +445,7 @@ bool gpt_neox_eval(
430
  params.mem_size = buf_size;
431
  params.mem_buffer = buf;
432
  params.no_alloc = false;
433
-
434
 
435
  struct ggml_context * ctx0 = ggml_init(params);
436
  struct ggml_cgraph gf = {};
@@ -445,6 +460,10 @@ bool gpt_neox_eval(
445
  for (int il = 0; il < n_layer; ++il) {
446
  struct ggml_tensor * cur;
447
 
 
 
 
 
448
  // self-attention
449
  {
450
  {
@@ -548,6 +567,10 @@ bool gpt_neox_eval(
548
  }
549
  }
550
 
 
 
 
 
551
  if (hparams.par_res == 0) {
552
  struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpL);
553
 
@@ -570,6 +593,10 @@ bool gpt_neox_eval(
570
  }
571
  }
572
 
 
 
 
 
573
  // norm
574
  {
575
  inpL = ggml_norm(ctx0, inpL);
@@ -582,6 +609,10 @@ bool gpt_neox_eval(
582
  ggml_repeat(ctx0, model.ln_f_b, inpL));
583
  }
584
 
 
 
 
 
585
  // lm_head
586
  {
587
  inpL = ggml_mul_mat(ctx0, model.lmh_g, inpL);
 
12
  #include <string>
13
  #include <vector>
14
  #include <iostream>
15
+ #include <algorithm>
16
 
17
+ #if defined(GGML_USE_CLBLAST)
18
+ #include "ggml-opencl.h"
19
+ #endif
20
 
21
  // load the model's weights from a file
22
+ ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt_vocab & vocab, FileFormat file_format, int gpulayers) {
23
  printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
24
 
25
  auto fin = std::ifstream(fname, std::ios::binary);
 
38
  }
39
  }
40
 
41
+ int32_t origmaxctx = model.hparams.n_ctx;
42
+
43
  // load hparams
44
  {
45
  auto & hparams = model.hparams;
46
+
47
  fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
48
  fin.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
49
  fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd));
50
  fin.read((char *) &hparams.n_head, sizeof(hparams.n_head));
51
  fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
52
  fin.read((char *) &hparams.n_rot, sizeof(hparams.n_rot));
53
+ fin.read((char *) &hparams.par_res, sizeof(hparams.par_res));
 
 
 
 
 
 
 
54
  fin.read((char *) &hparams.ftype, sizeof(hparams.ftype));
55
 
56
  const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
57
 
58
  printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
59
+ printf("%s: n_ctx = %d (%d)\n", __func__, hparams.n_ctx,origmaxctx);
60
  printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
61
  printf("%s: n_head = %d\n", __func__, hparams.n_head);
62
  printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
 
105
  {
106
  const auto & hparams = model.hparams;
107
 
108
+ const size_t n_embd = hparams.n_embd;
109
+ const size_t n_layer = hparams.n_layer;
110
+ const size_t n_ctx = hparams.n_ctx;
111
+ const size_t n_vocab = hparams.n_vocab;
112
 
113
  ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g
114
  ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b
 
136
  ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w
137
  ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b
138
 
139
+ ctx_size += std::max((size_t)origmaxctx,n_ctx)*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F16); // memory_k
140
+ ctx_size += std::max((size_t)origmaxctx,n_ctx)*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F16); // memory_v
141
 
142
+ ctx_size += (6 + 16*n_layer)*1024; // object overhead
143
 
144
  printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
145
  }
 
150
  params.mem_size = ctx_size;
151
  params.mem_buffer = NULL;
152
  params.no_alloc = false;
153
+
154
  model.ctx = ggml_init(params);
155
  if (!model.ctx) {
156
  fprintf(stderr, "%s: ggml_init() failed\n", __func__);
 
235
  const int n_layer = hparams.n_layer;
236
  const int n_ctx = hparams.n_ctx;
237
 
238
+ const int64_t n_mem = n_layer*std::max(origmaxctx,n_ctx);
239
  const int64_t n_elements = n_embd*n_mem;
240
 
241
  model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements);
 
298
  printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
299
  }
300
 
301
+ const size_t bpe = ggml_type_size(ggml_type(ttype));
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
302
 
303
  if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
304
  fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
 
323
 
324
  fin.close();
325
 
326
+ //gpu offload
327
+ #if defined(GGML_USE_CLBLAST)
328
+ if(gpulayers>0)
329
+ {
330
+ const auto & hparams = model.hparams;
331
+ size_t vram_total = 0;
332
+ const int n_gpu = std::min(gpulayers, int(hparams.n_layer));
333
+ fprintf(stderr, "%s: [opencl] offloading %d layers to GPU\n", __func__, n_gpu);
334
+ for (int i = 0; i < n_gpu; ++i) {
335
+ const auto & layer = model.layers[i];
336
+ layer.c_attn_attn_w->backend = GGML_BACKEND_GPU;
337
+ layer.c_attn_proj_w->backend = GGML_BACKEND_GPU;
338
+ layer.c_mlp_fc_w->backend = GGML_BACKEND_GPU;
339
+ layer.c_mlp_proj_w->backend = GGML_BACKEND_GPU;
340
+ ggml_cl_transform_tensor(layer.c_attn_attn_w->data,layer.c_attn_attn_w); vram_total += ggml_nbytes(layer.c_attn_attn_w);
341
+ ggml_cl_transform_tensor(layer.c_attn_proj_w->data,layer.c_attn_proj_w); vram_total += ggml_nbytes(layer.c_attn_proj_w);
342
+ ggml_cl_transform_tensor(layer.c_mlp_fc_w->data,layer.c_mlp_fc_w); vram_total += ggml_nbytes(layer.c_mlp_fc_w);
343
+ ggml_cl_transform_tensor(layer.c_mlp_proj_w->data,layer.c_mlp_proj_w); vram_total += ggml_nbytes(layer.c_mlp_proj_w);
344
+ }
345
+ fprintf(stderr, "%s: [opencl] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
346
+ }
347
+ #endif
348
+
349
  return ModelLoadResult::SUCCESS;
350
  }
351
 
 
400
  const int n_past,
401
  const std::vector<gpt_vocab::id> & embd_inp,
402
  std::vector<float> & embd_w,
403
+ size_t & mem_per_token,
404
+ bool use_scratch) {
405
  const int N = embd_inp.size();
406
 
407
  const auto & hparams = model.hparams;
 
416
  static size_t buf_size = 256u*1024*1024;
417
  static void * buf = malloc(buf_size);
418
 
419
+ // use 2 scratch buffers
420
+ // TODO: very hacky solution - reimplement in a more elegant way
421
+ static size_t scr0_size = (n_embd>2400?512u:256u)*1024*1024;
422
+ static size_t scr1_size = (n_embd>2400?512u:256u)*1024*1024;
423
+
424
+ static void * scr0 = malloc(scr0_size);
425
+ static void * scr1 = malloc(scr1_size);
426
+
427
  if (mem_per_token > 0 && (mem_per_token*N*2 + 64u*1024*1024) > buf_size) {
428
+ const size_t buf_size_new = 360u*1024*1024 + 1.2*(mem_per_token*N); // add 10% to account for ggml object overhead
429
  //printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
430
 
431
  // reallocate
 
435
  buf = realloc(buf, buf_size);
436
  if (buf == nullptr)
437
  {
438
+ fprintf(stderr, "%s: failed to allocate %zu bytes. Try reducing batch size.\n", __func__, buf_size);
439
  return false;
440
  }
441
  }
 
445
  params.mem_size = buf_size;
446
  params.mem_buffer = buf;
447
  params.no_alloc = false;
448
+
449
 
450
  struct ggml_context * ctx0 = ggml_init(params);
451
  struct ggml_cgraph gf = {};
 
460
  for (int il = 0; il < n_layer; ++il) {
461
  struct ggml_tensor * cur;
462
 
463
+ if(use_scratch){
464
+ ggml_set_scratch(ctx0, { 0, scr0_size, scr0, });
465
+ }
466
+
467
  // self-attention
468
  {
469
  {
 
567
  }
568
  }
569
 
570
+ if(use_scratch){
571
+ ggml_set_scratch(ctx0, { 0, scr1_size, scr1, });
572
+ }
573
+
574
  if (hparams.par_res == 0) {
575
  struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpL);
576
 
 
593
  }
594
  }
595
 
596
+ if(use_scratch){
597
+ ggml_set_scratch(ctx0, { 0, scr0_size, scr0, });
598
+ }
599
+
600
  // norm
601
  {
602
  inpL = ggml_norm(ctx0, inpL);
 
609
  ggml_repeat(ctx0, model.ln_f_b, inpL));
610
  }
611
 
612
+ if(use_scratch){
613
+ ggml_set_scratch(ctx0, { 0, 0, nullptr, });
614
+ }
615
+
616
  // lm_head
617
  {
618
  inpL = ggml_mul_mat(ctx0, model.lmh_g, inpL);
otherarch/otherarch.h CHANGED
@@ -43,7 +43,6 @@ struct gptj_layer {
43
  struct ggml_tensor * c_mlp_fc_b;
44
 
45
  struct ggml_tensor * c_mlp_proj_w;
46
- struct ggml_tensor * c_mlp_proj_w_trans; //for backwards compatibility
47
  struct ggml_tensor * c_mlp_proj_b;
48
  };
49
  struct gptj_layer_v2 {
 
43
  struct ggml_tensor * c_mlp_fc_b;
44
 
45
  struct ggml_tensor * c_mlp_proj_w;
 
46
  struct ggml_tensor * c_mlp_proj_b;
47
  };
48
  struct gptj_layer_v2 {
otherarch/utils.cpp CHANGED
@@ -122,8 +122,27 @@ std::string convert_to_utf8(const std::wstring & input) {
122
 
123
 
124
  std::wstring convert_to_wstring(const std::string & input) {
125
- std::wstring_convert<std::codecvt_utf8<wchar_t>> converter;
126
- return converter.from_bytes(input);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
127
  }
128
 
129
  std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text) {
@@ -132,31 +151,34 @@ std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::stri
132
  // first split the text into words
133
  {
134
  std::string str = text;
135
- std::string pat = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)";
136
 
137
  // Generate the subpattern from the special_tokens vector if it's not empty
138
  if (!vocab.special_tokens.empty()) {
 
139
  std::string special_tokens_subpattern;
140
  for (const auto & token : vocab.special_tokens) {
141
  if (!special_tokens_subpattern.empty()) {
142
  special_tokens_subpattern += "|";
143
  }
144
- special_tokens_subpattern += token;
145
  }
146
 
147
- // Modify the regex pattern with the generated special tokens subpattern
148
- pat = special_tokens_subpattern + "|" + pat;
149
- }
150
-
151
- std::regex re(pat);
152
- std::smatch m;
153
-
154
- while (std::regex_search(str, m, re)) {
155
- for (auto x : m) {
156
- words.push_back(x);
 
157
  }
158
- str = m.suffix();
159
  }
 
 
160
  }
161
 
162
  // find the longest token that forms each word in words:
@@ -185,15 +207,15 @@ std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::stri
185
 
186
  bool should_transpose_layer(std::string name)
187
  {
188
-
189
- if(name.find(".mlp.fc_in.weight")!=std::string::npos ||
190
- name.find(".attn.out_proj.weight")!=std::string::npos ||
191
- name.find(".attn.q_proj.weight")!=std::string::npos ||
192
- name.find(".attn.k_proj.weight")!=std::string::npos ||
193
  name.find(".attn.v_proj.weight")!=std::string::npos ||
194
- name.find("/attn/c_attn/w")!=std::string::npos ||
195
- name.find("/attn/c_proj/w")!=std::string::npos ||
196
- name.find("/mlp/c_fc/w")!=std::string::npos ||
197
  name.find("/mlp/c_proj/w")!=std::string::npos)
198
  {
199
  return true;
 
122
 
123
 
124
  // Convert a UTF-8 encoded std::string into a std::wstring.
  //
  // The conversion is done with std::wstring_convert over
  // std::codecvt_utf8<wchar_t>. Any exception raised while converting
  // (std::range_error or anything else) is swallowed and an empty wide
  // string is returned instead, so a caller cannot tell a failed
  // conversion apart from an empty input.
  //
  // NOTE(review): both handlers return the same value and `e` is unused,
  // so the catch (...) handler alone would give the same result.
  // NOTE(review): std::wstring_convert and std::codecvt_utf8 are deprecated
  // since C++17 - confirm the language standard this file is built with.
  std::wstring convert_to_wstring(const std::string & input) {
125
+ try {
126
+ std::wstring_convert<std::codecvt_utf8<wchar_t>> converter;
127
+ return converter.from_bytes(input);
128
+ } catch (const std::range_error& e) {
129
+ return L"";
130
+ } catch (...) {
131
+ return L"";
132
+ }
133
+ }
134
+
135
// Split `str` into word-like pieces and append them, in order, to `words`.
//
// `str` is taken by value, so the caller's string is not modified; `words`
// is only appended to, never cleared. The function repeatedly searches for
// the next regex match, appends it, and continues with the text after the
// match. Text that precedes a match (m.prefix()) is not appended.
+ void gpt_split_words(std::string str, std::vector<std::string>& words) {
136
// Alternatives, in order: the contraction suffixes 's 't 're 've 'm 'll 'd,
// an optional space followed by letters, an optional space followed by
// digits, an optional space followed by other non-whitespace characters,
// a whitespace run not followed by a non-whitespace character, and finally
// any whitespace run.
+ const std::string pattern = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)";
137
// The regex is constructed on every call.
+ const std::regex re(pattern);
138
+ std::smatch m;
139
+
140
+ while (std::regex_search(str, m, re)) {
141
// Iterates every sub-match of the result; the pattern above defines no
// capture groups, so this appends just the whole match.
+ for (auto x : m) {
142
+ words.push_back(x);
143
+ }
144
// Continue with whatever follows the current match.
+ str = m.suffix();
145
+ }
146
  }
147
 
148
  std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text) {
 
151
  // first split the text into words
152
  {
153
  std::string str = text;
 
154
 
155
  // Generate the subpattern from the special_tokens vector if it's not empty
156
  if (!vocab.special_tokens.empty()) {
157
+ const std::regex escape(R"([\[\\\^\$\.\|\?\*\+\(\)\{\}])");
158
  std::string special_tokens_subpattern;
159
  for (const auto & token : vocab.special_tokens) {
160
  if (!special_tokens_subpattern.empty()) {
161
  special_tokens_subpattern += "|";
162
  }
163
+ special_tokens_subpattern += std::regex_replace(token, escape, R"(\$&)");
164
  }
165
 
166
+ std::regex re(special_tokens_subpattern);
167
+ std::smatch m;
168
+ // Split the text by special tokens.
169
+ while (std::regex_search(str, m, re)) {
170
+ // Split the substrings in-between special tokens into words.
171
+ gpt_split_words(m.prefix(), words);
172
+ // Add matched special tokens as words.
173
+ for (auto x : m) {
174
+ words.push_back(x);
175
+ }
176
+ str = m.suffix();
177
  }
178
+ // Remaining text without special tokens will be handled below.
179
  }
180
+
181
+ gpt_split_words(str, words);
182
  }
183
 
184
  // find the longest token that forms each word in words:
 
207
 
208
  bool should_transpose_layer(std::string name)
209
  {
210
+
211
+ if(name.find(".mlp.fc_in.weight")!=std::string::npos ||
212
+ name.find(".attn.out_proj.weight")!=std::string::npos ||
213
+ name.find(".attn.q_proj.weight")!=std::string::npos ||
214
+ name.find(".attn.k_proj.weight")!=std::string::npos ||
215
  name.find(".attn.v_proj.weight")!=std::string::npos ||
216
+ name.find("/attn/c_attn/w")!=std::string::npos ||
217
+ name.find("/attn/c_proj/w")!=std::string::npos ||
218
+ name.find("/mlp/c_fc/w")!=std::string::npos ||
219
  name.find("/mlp/c_proj/w")!=std::string::npos)
220
  {
221
  return true;
otherarch/utils.h CHANGED
@@ -34,6 +34,12 @@ void utreplace(std::string & str, const std::string & needle, const std::string
34
  // poor-man's JSON parsing
35
  std::map<std::string, int32_t> json_parse(const std::string & fname);
36
 
 
 
 
 
 
 
37
  // split text into tokens
38
  //
39
  // ref: https://github.com/openai/gpt-2/blob/a74da5d99abaaba920de8131d64da2862a8f213b/src/encoder.py#L53
 
34
  // poor-man's JSON parsing
35
  std::map<std::string, int32_t> json_parse(const std::string & fname);
36
 
37
+ std::string convert_to_utf8(const std::wstring & input);
38
+
39
+ std::wstring convert_to_wstring(const std::string & input);
40
+
41
+ void gpt_split_words(std::string str, std::vector<std::string>& words);
42
+
43
  // split text into tokens
44
  //
45
  // ref: https://github.com/openai/gpt-2/blob/a74da5d99abaaba920de8131d64da2862a8f213b/src/encoder.py#L53
spm-headers/ggml.h CHANGED
@@ -303,6 +303,7 @@ extern "C" {
303
  GGML_OP_STEP,
304
  GGML_OP_RELU,
305
  GGML_OP_GELU,
 
306
  GGML_OP_SILU,
307
  GGML_OP_SILU_BACK,
308
  GGML_OP_NORM, // normalize
@@ -331,12 +332,15 @@ extern "C" {
331
  GGML_OP_ROPE_BACK,
332
  GGML_OP_ALIBI,
333
  GGML_OP_CLAMP,
334
- GGML_OP_CONV_1D_1S,
335
- GGML_OP_CONV_1D_2S,
 
336
 
337
  GGML_OP_FLASH_ATTN,
338
  GGML_OP_FLASH_FF,
339
  GGML_OP_FLASH_ATTN_BACK,
 
 
340
 
341
  GGML_OP_MAP_UNARY,
342
  GGML_OP_MAP_BINARY,
@@ -500,8 +504,9 @@ extern "C" {
500
  GGML_API size_t ggml_set_scratch (struct ggml_context * ctx, struct ggml_scratch scratch);
501
  GGML_API void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc);
502
 
503
- GGML_API void * ggml_get_mem_buffer(struct ggml_context * ctx);
504
- GGML_API size_t ggml_get_mem_size (struct ggml_context * ctx);
 
505
 
506
  GGML_API struct ggml_tensor * ggml_new_tensor(
507
  struct ggml_context * ctx,
@@ -556,8 +561,9 @@ extern "C" {
556
  GGML_API void * ggml_get_data (const struct ggml_tensor * tensor);
557
  GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
558
 
559
- GGML_API const char * ggml_get_name(const struct ggml_tensor * tensor);
560
- GGML_API void ggml_set_name(struct ggml_tensor * tensor, const char * name);
 
561
 
562
  //
563
  // operations on tensors with backpropagation
@@ -610,24 +616,47 @@ extern "C" {
610
  struct ggml_tensor * a,
611
  struct ggml_tensor * b);
612
 
 
 
 
 
 
613
  GGML_API struct ggml_tensor * ggml_mul(
614
  struct ggml_context * ctx,
615
  struct ggml_tensor * a,
616
  struct ggml_tensor * b);
617
 
 
 
 
 
 
618
  GGML_API struct ggml_tensor * ggml_div(
619
  struct ggml_context * ctx,
620
  struct ggml_tensor * a,
621
  struct ggml_tensor * b);
622
 
 
 
 
 
 
623
  GGML_API struct ggml_tensor * ggml_sqr(
624
  struct ggml_context * ctx,
625
  struct ggml_tensor * a);
626
 
 
 
 
 
627
  GGML_API struct ggml_tensor * ggml_sqrt(
628
  struct ggml_context * ctx,
629
  struct ggml_tensor * a);
630
 
 
 
 
 
631
  GGML_API struct ggml_tensor * ggml_log(
632
  struct ggml_context * ctx,
633
  struct ggml_tensor * a);
@@ -667,31 +696,67 @@ extern "C" {
667
  struct ggml_context * ctx,
668
  struct ggml_tensor * a);
669
 
 
 
 
 
670
  GGML_API struct ggml_tensor * ggml_sgn(
671
  struct ggml_context * ctx,
672
  struct ggml_tensor * a);
673
 
 
 
 
 
674
  GGML_API struct ggml_tensor * ggml_neg(
675
  struct ggml_context * ctx,
676
  struct ggml_tensor * a);
677
 
 
 
 
 
678
  GGML_API struct ggml_tensor * ggml_step(
679
  struct ggml_context * ctx,
680
  struct ggml_tensor * a);
681
 
 
 
 
 
682
  GGML_API struct ggml_tensor * ggml_relu(
683
  struct ggml_context * ctx,
684
  struct ggml_tensor * a);
685
 
 
 
 
 
686
  // TODO: double-check this computation is correct
687
  GGML_API struct ggml_tensor * ggml_gelu(
688
  struct ggml_context * ctx,
689
  struct ggml_tensor * a);
690
 
 
 
 
 
 
 
 
 
 
 
 
 
691
  GGML_API struct ggml_tensor * ggml_silu(
692
  struct ggml_context * ctx,
693
  struct ggml_tensor * a);
694
 
 
 
 
 
695
  // a - x
696
  // b - dy
697
  GGML_API struct ggml_tensor * ggml_silu_back(
@@ -705,10 +770,18 @@ extern "C" {
705
  struct ggml_context * ctx,
706
  struct ggml_tensor * a);
707
 
 
 
 
 
708
  GGML_API struct ggml_tensor * ggml_rms_norm(
709
  struct ggml_context * ctx,
710
  struct ggml_tensor * a);
711
 
 
 
 
 
712
  // a - x
713
  // b - dy
714
  GGML_API struct ggml_tensor * ggml_rms_norm_back(
@@ -998,16 +1071,55 @@ extern "C" {
998
  float min,
999
  float max);
1000
 
1001
- // padding = 1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1002
  // TODO: we don't support extra parameters for now
1003
  // that's why we are hard-coding the stride, padding, and dilation
1004
  // not great ..
1005
- GGML_API struct ggml_tensor * ggml_conv_1d_1s(
 
 
 
 
 
1006
  struct ggml_context * ctx,
1007
  struct ggml_tensor * a,
1008
  struct ggml_tensor * b);
1009
 
1010
- GGML_API struct ggml_tensor * ggml_conv_1d_2s(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1011
  struct ggml_context * ctx,
1012
  struct ggml_tensor * a,
1013
  struct ggml_tensor * b);
@@ -1035,6 +1147,26 @@ extern "C" {
1035
  struct ggml_tensor * c0,
1036
  struct ggml_tensor * c1);
1037
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1038
  // Mapping operations
1039
  typedef void (*ggml_unary_op_f32_t)(const int, float *, const float *);
1040
  typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
 
303
  GGML_OP_STEP,
304
  GGML_OP_RELU,
305
  GGML_OP_GELU,
306
+ GGML_OP_GELU_QUICK,
307
  GGML_OP_SILU,
308
  GGML_OP_SILU_BACK,
309
  GGML_OP_NORM, // normalize
 
332
  GGML_OP_ROPE_BACK,
333
  GGML_OP_ALIBI,
334
  GGML_OP_CLAMP,
335
+ GGML_OP_CONV_1D_S1_PH,
336
+ GGML_OP_CONV_1D_S2_PH,
337
+ GGML_OP_CONV_2D_SK_P0,
338
 
339
  GGML_OP_FLASH_ATTN,
340
  GGML_OP_FLASH_FF,
341
  GGML_OP_FLASH_ATTN_BACK,
342
+ GGML_OP_WIN_PART,
343
+ GGML_OP_WIN_UNPART,
344
 
345
  GGML_OP_MAP_UNARY,
346
  GGML_OP_MAP_BINARY,
 
504
  GGML_API size_t ggml_set_scratch (struct ggml_context * ctx, struct ggml_scratch scratch);
505
  GGML_API void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc);
506
 
507
+ GGML_API void * ggml_get_mem_buffer (const struct ggml_context * ctx);
508
+ GGML_API size_t ggml_get_mem_size (const struct ggml_context * ctx);
509
+ GGML_API size_t ggml_get_max_tensor_size(const struct ggml_context * ctx);
510
 
511
  GGML_API struct ggml_tensor * ggml_new_tensor(
512
  struct ggml_context * ctx,
 
561
  GGML_API void * ggml_get_data (const struct ggml_tensor * tensor);
562
  GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
563
 
564
+ GGML_API const char * ggml_get_name(const struct ggml_tensor * tensor);
565
+ GGML_API struct ggml_tensor * ggml_set_name(struct ggml_tensor * tensor, const char * name);
566
+ GGML_API struct ggml_tensor * ggml_format_name(struct ggml_tensor * tensor, const char * fmt, ...);
567
 
568
  //
569
  // operations on tensors with backpropagation
 
616
  struct ggml_tensor * a,
617
  struct ggml_tensor * b);
618
 
619
+ GGML_API struct ggml_tensor * ggml_sub_inplace(
620
+ struct ggml_context * ctx,
621
+ struct ggml_tensor * a,
622
+ struct ggml_tensor * b);
623
+
624
  GGML_API struct ggml_tensor * ggml_mul(
625
  struct ggml_context * ctx,
626
  struct ggml_tensor * a,
627
  struct ggml_tensor * b);
628
 
629
+ GGML_API struct ggml_tensor * ggml_mul_inplace(
630
+ struct ggml_context * ctx,
631
+ struct ggml_tensor * a,
632
+ struct ggml_tensor * b);
633
+
634
  GGML_API struct ggml_tensor * ggml_div(
635
  struct ggml_context * ctx,
636
  struct ggml_tensor * a,
637
  struct ggml_tensor * b);
638
 
639
+ GGML_API struct ggml_tensor * ggml_div_inplace(
640
+ struct ggml_context * ctx,
641
+ struct ggml_tensor * a,
642
+ struct ggml_tensor * b);
643
+
644
  GGML_API struct ggml_tensor * ggml_sqr(
645
  struct ggml_context * ctx,
646
  struct ggml_tensor * a);
647
 
648
+ GGML_API struct ggml_tensor * ggml_sqr_inplace(
649
+ struct ggml_context * ctx,
650
+ struct ggml_tensor * a);
651
+
652
  GGML_API struct ggml_tensor * ggml_sqrt(
653
  struct ggml_context * ctx,
654
  struct ggml_tensor * a);
655
 
656
+ GGML_API struct ggml_tensor * ggml_sqrt_inplace(
657
+ struct ggml_context * ctx,
658
+ struct ggml_tensor * a);
659
+
660
  GGML_API struct ggml_tensor * ggml_log(
661
  struct ggml_context * ctx,
662
  struct ggml_tensor * a);
 
696
  struct ggml_context * ctx,
697
  struct ggml_tensor * a);
698
 
699
+ GGML_API struct ggml_tensor * ggml_abs_inplace(
700
+ struct ggml_context * ctx,
701
+ struct ggml_tensor * a);
702
+
703
  GGML_API struct ggml_tensor * ggml_sgn(
704
  struct ggml_context * ctx,
705
  struct ggml_tensor * a);
706
 
707
+ GGML_API struct ggml_tensor * ggml_sgn_inplace(
708
+ struct ggml_context * ctx,
709
+ struct ggml_tensor * a);
710
+
711
  GGML_API struct ggml_tensor * ggml_neg(
712
  struct ggml_context * ctx,
713
  struct ggml_tensor * a);
714
 
715
+ GGML_API struct ggml_tensor * ggml_neg_inplace(
716
+ struct ggml_context * ctx,
717
+ struct ggml_tensor * a);
718
+
719
  GGML_API struct ggml_tensor * ggml_step(
720
  struct ggml_context * ctx,
721
  struct ggml_tensor * a);
722
 
723
+ GGML_API struct ggml_tensor * ggml_step_inplace(
724
+ struct ggml_context * ctx,
725
+ struct ggml_tensor * a);
726
+
727
  GGML_API struct ggml_tensor * ggml_relu(
728
  struct ggml_context * ctx,
729
  struct ggml_tensor * a);
730
 
731
+ GGML_API struct ggml_tensor * ggml_relu_inplace(
732
+ struct ggml_context * ctx,
733
+ struct ggml_tensor * a);
734
+
735
  // TODO: double-check this computation is correct
736
  GGML_API struct ggml_tensor * ggml_gelu(
737
  struct ggml_context * ctx,
738
  struct ggml_tensor * a);
739
 
740
+ GGML_API struct ggml_tensor * ggml_gelu_inplace(
741
+ struct ggml_context * ctx,
742
+ struct ggml_tensor * a);
743
+
744
+ GGML_API struct ggml_tensor * ggml_gelu_quick(
745
+ struct ggml_context * ctx,
746
+ struct ggml_tensor * a);
747
+
748
+ GGML_API struct ggml_tensor * ggml_gelu_quick_inplace(
749
+ struct ggml_context * ctx,
750
+ struct ggml_tensor * a);
751
+
752
  GGML_API struct ggml_tensor * ggml_silu(
753
  struct ggml_context * ctx,
754
  struct ggml_tensor * a);
755
 
756
+ GGML_API struct ggml_tensor * ggml_silu_inplace(
757
+ struct ggml_context * ctx,
758
+ struct ggml_tensor * a);
759
+
760
  // a - x
761
  // b - dy
762
  GGML_API struct ggml_tensor * ggml_silu_back(
 
770
  struct ggml_context * ctx,
771
  struct ggml_tensor * a);
772
 
773
+ GGML_API struct ggml_tensor * ggml_norm_inplace(
774
+ struct ggml_context * ctx,
775
+ struct ggml_tensor * a);
776
+
777
  GGML_API struct ggml_tensor * ggml_rms_norm(
778
  struct ggml_context * ctx,
779
  struct ggml_tensor * a);
780
 
781
+ GGML_API struct ggml_tensor * ggml_rms_norm_inplace(
782
+ struct ggml_context * ctx,
783
+ struct ggml_tensor * a);
784
+
785
  // a - x
786
  // b - dy
787
  GGML_API struct ggml_tensor * ggml_rms_norm_back(
 
1071
  float min,
1072
  float max);
1073
 
1074
+ // TODO: implement general-purpose convolutions
1075
+ // GGML_API struct ggml_tensor * ggml_conv_1d(
1076
+ // struct ggml_context * ctx,
1077
+ // struct ggml_tensor * a,
1078
+ // struct ggml_tensor * b,
1079
+ // int s0
1080
+ // int p0,
1081
+ // int d0);
1082
+ //
1083
+ // GGML_API struct ggml_tensor * ggml_conv_2d(
1084
+ // struct ggml_context * ctx,
1085
+ // struct ggml_tensor * a,
1086
+ // struct ggml_tensor * b,
1087
+ // int s0,
1088
+ // int s1,
1089
+ // int p0,
1090
+ // int p1,
1091
+ // int d0,
1092
+ // int d1);
1093
+
1094
+ // padding = half
1095
  // TODO: we don't support extra parameters for now
1096
  // that's why we are hard-coding the stride, padding, and dilation
1097
  // not great ..
1098
+ // example:
1099
+ // a: 3 80 768 1
1100
+ // b: 3000 80 1 1
1101
+ // res: 3000 768 1 1
1102
+ // used in whisper
1103
+ GGML_API struct ggml_tensor * ggml_conv_1d_s1_ph(
1104
  struct ggml_context * ctx,
1105
  struct ggml_tensor * a,
1106
  struct ggml_tensor * b);
1107
 
1108
+ // used in whisper
1109
+ GGML_API struct ggml_tensor * ggml_conv_1d_s2_ph(
1110
+ struct ggml_context * ctx,
1111
+ struct ggml_tensor * a,
1112
+ struct ggml_tensor * b);
1113
+
1114
+ // kernel size is a->ne[0] x a->ne[1]
1115
+ // stride is equal to kernel size
1116
+ // padding is zero
1117
+ // example:
1118
+ // a: 16 16 3 768
1119
+ // b: 1024 1024 3 1
1120
+ // res: 64 64 768 1
1121
+ // used in sam
1122
+ GGML_API struct ggml_tensor * ggml_conv_2d_sk_p0(
1123
  struct ggml_context * ctx,
1124
  struct ggml_tensor * a,
1125
  struct ggml_tensor * b);
 
1147
  struct ggml_tensor * c0,
1148
  struct ggml_tensor * c1);
1149
 
1150
+ // partition into non-overlapping windows with padding if needed
1151
+ // example:
1152
+ // a: 768 64 64 1
1153
+ // w: 14
1154
+ // res: 768 14 14 25
1155
+ // used in sam
1156
+ GGML_API struct ggml_tensor * ggml_win_part(
1157
+ struct ggml_context * ctx,
1158
+ struct ggml_tensor * a,
1159
+ int w);
1160
+
1161
+ // reverse of ggml_win_part
1162
+ // used in sam
1163
+ GGML_API struct ggml_tensor * ggml_win_unpart(
1164
+ struct ggml_context * ctx,
1165
+ struct ggml_tensor * a,
1166
+ int w0,
1167
+ int h0,
1168
+ int w);
1169
+
1170
  // Mapping operations
1171
  typedef void (*ggml_unary_op_f32_t)(const int, float *, const float *);
1172
  typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);