Spaces: Build error
Illumotion committed · Commit 1e081f1
Parent(s): 9f305d5
Upload folder using huggingface_hub

Files changed:
- CMakeLists.txt +0 -5
- Dockerfile +1 -1
- convert.py +36 -5
- examples/embd-input/embd-input-lib.cpp +6 -3
- examples/embd-input/embd-input.h +1 -3
- examples/train-text-from-scratch/train-text-from-scratch.cpp +2 -1
- ggml-cuda.cu +48 -19
- ggml-cuda.h +0 -4
- ggml-metal.m +3 -1
- ggml.c +74 -25
- ggml.h +5 -6
- gpttype_adapter.cpp +16 -9
- koboldcpp.py +1 -1
- llama.cpp +19 -4
- otherarch/gptj_v3.cpp +4 -2
- otherarch/llama_v2.cpp +2 -2
- otherarch/neox_v3.cpp +4 -2
- otherarch/rwkv_v3.cpp +725 -344
- otherarch/rwkv_v3.h +45 -20
- spm-headers/ggml.h +5 -6
CMakeLists.txt
CHANGED
@@ -167,11 +167,6 @@ if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES
    if (MSVC)
        # TODO: arm msvc?
    else()
-       if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64")
-           # Apple M1, M2, etc.
-           # Raspberry Pi 3, 4, Zero 2 (64-bit)
-           add_compile_options(-mcpu=native)
-       endif()
        if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv6")
            # Raspberry Pi 1, Zero
            add_compile_options(-mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access)
Dockerfile
CHANGED
@@ -7,4 +7,4 @@ RUN apt update \
    && wget https://huggingface.co/notstoic/pygmalion-13b-ggml/resolve/main/pygmalion-13b-ggml-q4_0.bin \
    && apt remove build-essential wget make -y

-ENTRYPOINT ["python", "koboldcpp.py", "pygmalion-
+ENTRYPOINT ["python", "koboldcpp.py", "pygmalion-7b-q5_K_M.bin", "--port", "7860"]
convert.py
CHANGED
@@ -136,7 +136,7 @@ def find_n_mult(n_ff: int, n_embd: int) -> int:
     calc_ff = (((8*n_embd) // 3 + n_mult - 1) // n_mult)*n_mult
     if calc_ff == n_ff:
         return n_mult
-
+    raise Exception(f"failed to find n_mult for (n_ff={n_ff}, n_embd={n_embd}).")

 @dataclass
 class Params:
@@ -321,6 +321,10 @@ class Tensor(metaclass=ABCMeta):
     @abstractmethod
     def permute(self, n_head: int) -> 'Tensor': ...
     @abstractmethod
+    def permute_part(self, n_part: int, n_head: int) -> 'UnquantizedTensor': ...
+    @abstractmethod
+    def part(self, n_part: int) -> 'UnquantizedTensor': ...
+    @abstractmethod
     def to_ggml(self) -> 'GGMLCompatibleTensor': ...


@@ -345,6 +349,14 @@ class UnquantizedTensor(Tensor):
     def to_ggml(self) -> 'UnquantizedTensor':
         return self

+    def permute_part(self, n_part: int, n_head: int) -> 'UnquantizedTensor':
+        r = self.ndarray.shape[0] // 3
+        return UnquantizedTensor(permute(self.ndarray[r * n_part : r * n_part + r, ...], n_head))
+
+    def part(self, n_part: int) -> 'UnquantizedTensor':
+        r = self.ndarray.shape[0] // 3
+        return UnquantizedTensor(self.ndarray[r * n_part : r * n_part + r, ...])
+
     def permute(self, n_head: int) -> 'UnquantizedTensor':
         return UnquantizedTensor(permute(self.ndarray, n_head))

@@ -642,6 +654,19 @@ def permute_lazy(lazy_tensor: LazyTensor, n_head: int) -> LazyTensor:
         return lazy_tensor.load().permute(n_head)
     return LazyTensor(load, lazy_tensor.shape, lazy_tensor.data_type, f'permute({n_head}) ' + lazy_tensor.description)

+def permute_part_lazy(lazy_tensor: LazyTensor, n_part: int, n_head: int) -> LazyTensor:
+    def load() -> Tensor:
+        return lazy_tensor.load().permute_part(n_part, n_head)
+    s = lazy_tensor.shape.copy()
+    s[0] = s[0] // 3
+    return LazyTensor(load, s, lazy_tensor.data_type, f'permute({n_head}) ' + lazy_tensor.description)
+
+def part_lazy(lazy_tensor: LazyTensor, n_part: int) -> LazyTensor:
+    def load() -> Tensor:
+        return lazy_tensor.load().part(n_part)
+    s = lazy_tensor.shape.copy()
+    s[0] = s[0] // 3
+    return LazyTensor(load, s, lazy_tensor.data_type, 'part ' + lazy_tensor.description)

 def convert_transformers_to_orig(model: LazyModel, params: Params) -> LazyModel:
     out: LazyModel = {}
@@ -650,11 +675,17 @@ def convert_transformers_to_orig(model: LazyModel, params: Params) -> LazyModel:
     out["output.weight"] = model["lm_head.weight"]

     for i in itertools.count():
-        if f"model.layers.{i}.self_attn.q_proj.weight"
+        if f"model.layers.{i}.self_attn.q_proj.weight" in model:
+            out[f"layers.{i}.attention.wq.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head)
+            out[f"layers.{i}.attention.wk.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head)
+            out[f"layers.{i}.attention.wv.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"]
+        elif f"model.layers.{i}.self_attn.W_pack.weight" in model:
+            out[f"layers.{i}.attention.wq.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 0, params.n_head)
+            out[f"layers.{i}.attention.wk.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 1, params.n_head)
+            out[f"layers.{i}.attention.wv.weight"] = part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 2)
+        else:
             break
-
-        out[f"layers.{i}.attention.wk.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head)
-        out[f"layers.{i}.attention.wv.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"]
+
         out[f"layers.{i}.attention.wo.weight"] = model[f"model.layers.{i}.self_attn.o_proj.weight"]

         out[f"layers.{i}.feed_forward.w1.weight"] = model[f"model.layers.{i}.mlp.gate_proj.weight"]
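The new permute_part/part helpers slice a fused attention weight (such as W_pack) into equal thirds along its first dimension. Below is a minimal C++ sketch of that same row-block arithmetic for a row-major matrix; the function name and types are illustrative only and not part of the converter.

    #include <cstddef>
    #include <vector>

    // Return rows [r*n_part, r*n_part + r) of a row-major (rows x cols) matrix,
    // where r = rows / 3 -- the same slicing convert.py applies to a packed QKV weight.
    std::vector<float> take_part(const std::vector<float> & w, size_t rows, size_t cols, size_t n_part) {
        const size_t r = rows / 3; // Q, K and V each occupy one third of the rows
        return std::vector<float>(w.begin() + r * n_part * cols,
                                  w.begin() + (r * n_part + r) * cols);
    }

Part 0 corresponds to the Q block, part 1 to K, and part 2 to V, matching the part_lazy(..., 2) call in the diff above.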
examples/embd-input/embd-input-lib.cpp
CHANGED
@@ -210,9 +210,12 @@ llama_token sampling_id(struct MyModel* mymodel) {
 const char * sampling(struct MyModel * mymodel) {
     llama_context * ctx = mymodel->ctx;
     int id = sampling_id(mymodel);
-    std::string ret;
-    if (id == llama_token_eos())
-
+    static std::string ret;
+    if (id == llama_token_eos()) {
+        ret = "</s>";
+    } else {
+        ret = llama_token_to_str(ctx, id);
+    }
     eval_id(mymodel, id);
     return ret.c_str();
 }
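The switch from a local std::string to a static one matters because c_str() is only valid while the owning string is alive; returning it from a function-local string yields a dangling pointer. A small illustrative sketch of the two variants (not the example program itself):

    #include <string>

    const char * bad() {
        std::string ret = "hello";
        return ret.c_str();       // dangling: ret is destroyed when the function returns
    }

    const char * ok() {
        static std::string ret;   // lives for the whole program, as in the patched sampling()
        ret = "hello";
        return ret.c_str();       // valid until the next call overwrites ret
    }

The trade-off of the static approach is that the returned pointer is overwritten by the next call and is not thread-safe, which is acceptable for this single-threaded example.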
examples/embd-input/embd-input.h
CHANGED
@@ -5,7 +5,6 @@
 #include "llama.h"
 #include "build-info.h"

-
 extern "C" {

 typedef struct MyModel {
@@ -14,14 +13,13 @@ typedef struct MyModel {
     int n_past = 0;
 } MyModel;

-
 struct MyModel* create_mymodel(int argc, char ** argv);

 bool eval_float(void* model, float* input, int N);
 bool eval_tokens(void* model, std::vector<llama_token> tokens);
 bool eval_id(struct MyModel* mymodel, int id);
 bool eval_string(struct MyModel* mymodel, const char* str);
-const char* sampling(struct MyModel* mymodel);
+const char * sampling(struct MyModel* mymodel);
 llama_token sampling_id(struct MyModel* mymodel);
 void free_mymodel(struct MyModel* mymodel);

examples/train-text-from-scratch/train-text-from-scratch.cpp
CHANGED
@@ -2671,7 +2671,8 @@ struct train_params {
     const char * fn_checkpoint_out;
     const char * fn_model_out;

-
+    uint32_t seed;
+
     int n_ctx;
     int n_embd;
     int n_mult;
ggml-cuda.cu
CHANGED
@@ -214,6 +214,11 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
 static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2");
 #endif

+struct ggml_tensor_extra_gpu {
+    void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
+    cudaEvent_t events[GGML_CUDA_MAX_DEVICES]; // events for synchronizing multiple GPUs
+};
+
 static __global__ void add_f32(const float * x, const float * y, float * dst, const int k) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;

@@ -1995,7 +2000,6 @@ inline void ggml_cuda_op_add(
     } else {
         GGML_ASSERT(false);
     }
-    CUDA_CHECK(cudaGetLastError());

     (void) src1;
     (void) dst;
@@ -2027,7 +2031,6 @@ inline void ggml_cuda_op_mul(

     // compute
     mul_f32_cuda(src0_ddf_i01, src1_ddf_i01, dst_ddf_i01, ne00, ne10, cudaStream_main);
-    CUDA_CHECK(cudaGetLastError());
     }

     (void) dst;
@@ -2048,7 +2051,6 @@ inline void ggml_cuda_op_silu(

     // compute
     silu_f32_cuda(src0_ddf_i, dst_ddf_i, ne00*i01_diff, cudaStream_main);
-    CUDA_CHECK(cudaGetLastError());

     (void) src1;
     (void) dst;
@@ -2071,7 +2073,6 @@ inline void ggml_cuda_op_rms_norm(

     // compute
     rms_norm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main);
-    CUDA_CHECK(cudaGetLastError());

     (void) src1;
     (void) dst;
@@ -2150,7 +2151,6 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec(
         GGML_ASSERT(false);
         break;
     }
-    CUDA_CHECK(cudaGetLastError());

 #ifdef GGML_CUDA_DMMV_F16
     if (src1_convert_f16) {
@@ -2223,14 +2223,13 @@ inline void ggml_cuda_op_rope(
     const int n_ctx = ((int32_t *) src1->data)[3];
     GGML_ASSERT(mode == 0);

-    const float theta_scale =
+    const float theta_scale = get_theta_scale(n_dims,n_past,n_ctx);
     const float p0 = ((mode & 1) == 0 ? n_past + i02 : i02);

-    const float p =
+    const float p = p0;

     // compute
     rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p, theta_scale, cudaStream_main);
-    CUDA_CHECK(cudaGetLastError());

     (void) dst;
     (void) src0_ddq_i;
@@ -2254,7 +2253,6 @@ inline void ggml_cuda_op_diag_mask_inf(

     // compute
     diag_mask_inf_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, ne01, n_past, cudaStream_main);
-    CUDA_CHECK(cudaGetLastError());

     (void) dst;
     (void) src0_ddq_i;
@@ -2276,7 +2274,6 @@ inline void ggml_cuda_op_soft_max(

     // compute
     soft_max_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, cudaStream_main);
-    CUDA_CHECK(cudaGetLastError());

     (void) src1;
     (void) dst;
@@ -2372,10 +2369,11 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
     size_t src1_asf[GGML_CUDA_MAX_DEVICES] = {0};
     size_t dst_asf[GGML_CUDA_MAX_DEVICES] = {0};

-    // if multiple
+    // if multiple devices are used they need to wait for the main device
+    // here an event is recorded that signifies that the main device has finished calculating the input data
     if (split && g_device_count > 1) {
         CUDA_CHECK(cudaSetDevice(g_main_device));
-        CUDA_CHECK(
+        CUDA_CHECK(cudaEventRecord(src0_extra->events[g_main_device], g_cudaStreams_main[g_main_device]));
     }

     for (int id = 0; id < g_device_count; ++id) {
@@ -2401,6 +2399,12 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
     int64_t row_diff = row_high - row_low;

     cudaSetDevice(id);
+    cudaStream_t cudaStream_main = g_cudaStreams_main[id];
+
+    // wait for main GPU data if necessary
+    if (split && id != g_main_device) {
+        CUDA_CHECK(cudaStreamWaitEvent(cudaStream_main, src0_extra->events[g_main_device]));
+    }

     if (src0_on_device && src0_is_contiguous) {
         if (src0_is_f32) {
@@ -2476,8 +2480,6 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
     }
     const int64_t i11 = i13*ne12 + i12;

-    cudaStream_t cudaStream_main = g_cudaStreams_main[id];
-
     // for split tensors the data begins at i0 == i0_offset_low
     char * src0_ddq_i = src0_ddq[id] + (i0 - i0_offset_low)*src0_stride*src0_ts/src0_bs;
     float * src0_ddf_i = src0_ddf[id] + (i0 - i0_offset_low)*src0_stride;
@@ -2537,6 +2539,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm

     // do the computation
     op(src0, src1, dst, src0_ddq_i, src0_ddf_i, src1_ddf_i, dst_ddf_i, i02, i01_low, i01_high, i11, cudaStream_main);
+    CUDA_CHECK(cudaGetLastError());

     // copy dst to host or other device if necessary
     if (!dst_on_device) {
@@ -2566,6 +2569,11 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
         CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_ddf_i, dst_stride*sizeof(float), kind, cudaStream_main));
         }
     }
+
+    // signify to main device that other device is done
+    if (split && g_device_count > 1 && id != g_main_device) {
+        CUDA_CHECK(cudaEventRecord(src0_extra->events[id], cudaStream_main));
+    }
     }
     }
     }
@@ -2577,7 +2585,6 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
     }

     CUDA_CHECK(cudaSetDevice(id));
-    CUDA_CHECK(cudaDeviceSynchronize());

     if (src0_asq[id] > 0) {
         ggml_cuda_pool_free(src0_ddq[id], src0_asq[id]);
@@ -2592,6 +2599,21 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
         ggml_cuda_pool_free(dst_ddf[id], dst_asf[id]);
     }
     }
+
+    // main device waits for all other devices to be finished
+    if (split && g_device_count > 1) {
+        CUDA_CHECK(cudaSetDevice(g_main_device));
+        for (int id = 0; id < g_device_count; ++id) {
+            if (id != g_main_device) {
+                CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams_main[g_main_device], src0_extra->events[id]));
+            }
+        }
+    }
+
+    if (dst->backend == GGML_BACKEND_CPU) {
+        CUDA_CHECK(cudaSetDevice(g_main_device));
+        CUDA_CHECK(cudaDeviceSynchronize());
+    }
 }

 void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -2831,6 +2853,10 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
     cudaMemcpy(buf, buf_host, size, cudaMemcpyHostToDevice);

     extra->data_device[id] = buf;
+
+    if (backend == GGML_BACKEND_GPU_SPLIT) {
+        CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id], cudaEventDisableTiming));
+    }
 }

 tensor->extra = extra;
@@ -2844,12 +2870,15 @@ void ggml_cuda_free_data(struct ggml_tensor * tensor) {
     ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;

     for (int id = 0; id < g_device_count; ++id) {
-        if (extra->data_device[id]
-
+        if (extra->data_device[id] != nullptr) {
+            CUDA_CHECK(cudaSetDevice(id));
+            CUDA_CHECK(cudaFree(extra->data_device[id]));
         }

-
-
+        if (extra->events[id] != nullptr) {
+            CUDA_CHECK(cudaSetDevice(id));
+            CUDA_CHECK(cudaEventDestroy(extra->events[id]));
+        }
     }

     delete extra;
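The multi-GPU change above replaces a blanket cudaDeviceSynchronize() with event-based ordering: the main device records an event once the shared input is ready, every other device's stream waits on that event, and the main device in turn waits on each worker's completion event. A stripped-down sketch of the same record/wait pattern, with the per-device stream and event bookkeeping simplified and error checking omitted (this is not the ggml_cuda_op code itself):

    // Assumes streams[d] and events[d] were created per device
    // (events with cudaEventDisableTiming, as in the diff).
    void broadcast_then_gather(int main_dev, int n_dev, cudaStream_t * streams, cudaEvent_t * events) {
        // main device signals that the shared input is ready
        cudaSetDevice(main_dev);
        cudaEventRecord(events[main_dev], streams[main_dev]);

        for (int d = 0; d < n_dev; ++d) {
            if (d == main_dev) continue;
            cudaSetDevice(d);
            cudaStreamWaitEvent(streams[d], events[main_dev], 0); // wait for main device's data
            // ... enqueue this device's share of the work on streams[d] ...
            cudaEventRecord(events[d], streams[d]);               // signal completion
        }

        // main device waits only on the worker events instead of a full device sync
        cudaSetDevice(main_dev);
        for (int d = 0; d < n_dev; ++d) {
            if (d != main_dev) cudaStreamWaitEvent(streams[main_dev], events[d], 0);
        }
    }

The full device synchronization is then only needed at the boundary where results must be visible to the CPU, which is exactly where the diff keeps cudaDeviceSynchronize().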
ggml-cuda.h
CHANGED
@@ -8,10 +8,6 @@ extern "C" {

 #define GGML_CUDA_MAX_DEVICES 16

-struct ggml_tensor_extra_gpu {
-    void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
-};
-
 void ggml_init_cublas(void);
 void ggml_cuda_set_tensor_split(const float * tensor_split);

ggml-metal.m
CHANGED
@@ -202,7 +202,9 @@ struct ggml_metal_context * ggml_metal_init(void) {

 void ggml_metal_free(struct ggml_metal_context * ctx) {
     fprintf(stderr, "%s: deallocating\n", __func__);
-
+    for (int i = 0; i < ctx->n_buffers; ++i) {
+        [ctx->buffers[i].metal release];
+    }
     free(ctx);
 }

ggml.c
CHANGED
@@ -3846,6 +3846,41 @@ static_assert(GGML_OP_COUNT == 64, "GGML_OP_COUNT != 64");
 static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
 static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");

+// WARN:
+// Misconfiguration can lead to problems that are hard to reason about:
+// * At best it crashes or talks nonsense.
+// * At worst it talks slightly differently, which is hard to perceive.
+//
+// An op has to enable INIT or FINALIZE when any of its branches needs that pass.
+// Take care about compile options (e.g., GGML_USE_xxx).
+static bool GGML_OP_HAS_INIT    [GGML_OP_COUNT] = { 0 };
+static bool GGML_OP_HAS_FINALIZE[GGML_OP_COUNT] = { 0 };
+
+static void ggml_setup_op_has_task_pass(void) {
+    {   // INIT
+        bool * p = GGML_OP_HAS_INIT;
+
+        p[GGML_OP_ACC                ] = true;
+        p[GGML_OP_MUL_MAT            ] = true;
+        p[GGML_OP_OUT_PROD           ] = true;
+        p[GGML_OP_SET                ] = true;
+        p[GGML_OP_GET_ROWS_BACK      ] = true;
+        p[GGML_OP_DIAG_MASK_INF      ] = true;
+        p[GGML_OP_DIAG_MASK_ZERO     ] = true;
+        p[GGML_OP_CONV_1D_S1_PH      ] = true;
+        p[GGML_OP_CONV_1D_S2_PH      ] = true;
+        p[GGML_OP_CONV_2D_SK_P0      ] = true;
+        p[GGML_OP_FLASH_ATTN_BACK    ] = true;
+        p[GGML_OP_CROSS_ENTROPY_LOSS ] = true;
+    }
+
+    {   // FINALIZE
+        bool * p = GGML_OP_HAS_FINALIZE;
+
+        p[GGML_OP_CROSS_ENTROPY_LOSS ] = true;
+    }
+}
+
 //
 // ggml context
 //
@@ -4208,6 +4243,22 @@ static inline int ggml_up(int n, int m) {
 #define ggml_assert_aligned(ptr) \
     GGML_ASSERT(((uintptr_t) (ptr))%GGML_MEM_ALIGN == 0)

+float get_theta_scale(int n_dims,int n_past,int n_ctx)
+{
+    if(n_ctx<=2048) //normie mode
+    {
+        return powf(10000.0, -2.0f/n_dims);
+    }
+    else
+    {
+        //using scaled NTK aware ctx
+        float a = (n_ctx<=4096?4.0:8.0);
+        float m = powf(a, n_dims / (n_dims - 2.0));
+        float s = powf(10000.0 * m, -2.0f/n_dims);
+        return s;
+    }
+}
+
 ////////////////////////////////////////////////////////////////////////////////

 struct ggml_context * ggml_init(struct ggml_init_params params) {
@@ -4267,6 +4318,8 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
         ggml_cl_init();
 #endif

+        ggml_setup_op_has_task_pass();
+
         is_first_call = false;
     }

@@ -12495,7 +12548,7 @@ static void ggml_compute_forward_rope_f32(
     // row index used to determine which thread to use
     int ir = 0;

-    const float theta_scale =
+    const float theta_scale = get_theta_scale(n_dims,n_past,n_ctx);

     const bool is_neox = mode & 2;
     const bool is_glm = mode & 4;
@@ -12535,9 +12588,7 @@ static void ggml_compute_forward_rope_f32(
         dst_data[n_dims/2*3] = x2*sin_block_theta + x3*cos_block_theta;
     }
     } else if (!is_neox) {
-
-        theta = theta * GGML_TRAINING_CTX / n_ctx;
-    }
+
     for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
         const float cos_theta = cosf(theta);
         const float sin_theta = sinf(theta);
@@ -12638,7 +12689,7 @@ static void ggml_compute_forward_rope_f16(
     // row index used to determine which thread to use
     int ir = 0;

-    const float theta_scale =
+    const float theta_scale = get_theta_scale(n_dims,n_past,n_ctx);

     const bool is_neox = mode & 2;
     const bool is_glm = mode & 4;
@@ -12678,9 +12729,6 @@ static void ggml_compute_forward_rope_f16(
         dst_data[n_dims/2*3] = GGML_FP32_TO_FP16(x2*sin_block_theta + x3*cos_block_theta);
     }
     } if (!is_neox) {
-    if (n_ctx > GGML_TRAINING_CTX) {
-        theta = theta * GGML_TRAINING_CTX / n_ctx;
-    }
     for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
         const float cos_theta = cosf(theta);
         const float sin_theta = sinf(theta);
@@ -12806,7 +12854,7 @@ static void ggml_compute_forward_rope_back_f32(
     // row index used to determine which thread to use
     int ir = 0;

-    const float theta_scale =
+    const float theta_scale = get_theta_scale(n_dims,n_past,n_ctx);

     const bool is_neox = mode & 2;

@@ -12820,9 +12868,6 @@ static void ggml_compute_forward_rope_back_f32(
     float theta = (float)p;

     if (!is_neox) {
-    if (n_ctx > GGML_TRAINING_CTX) {
-        theta = theta * GGML_TRAINING_CTX / n_ctx;
-    }
     for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
         const float cos_theta = cosf(theta);
         const float sin_theta = sinf(theta);
@@ -12923,7 +12968,7 @@ static void ggml_compute_forward_rope_back_f16(
     // row index used to determine which thread to use
     int ir = 0;

-    const float theta_scale =
+    const float theta_scale = get_theta_scale(n_dims,n_past,n_ctx);

     const bool is_neox = mode & 2;

@@ -12937,9 +12982,6 @@ static void ggml_compute_forward_rope_back_f16(
     float theta = (float)p;

     if (!is_neox) {
-    if (n_ctx > GGML_TRAINING_CTX) {
-        theta = theta * GGML_TRAINING_CTX / n_ctx;
-    }
     for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
         const float cos_theta = cosf(theta);
         const float sin_theta = sinf(theta);
@@ -16805,9 +16847,11 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
     if (node_n != -1) {
         /* FINALIZE */
         struct ggml_tensor * node = state->shared->cgraph->nodes[node_n];
-
-
-
+        if (GGML_OP_HAS_FINALIZE[node->op]) {
+            params.nth = node->n_tasks;
+            ggml_compute_forward(&params, node);
+            ggml_graph_compute_perf_stats_node(node, state->shared);
+        }
     }

     // distribute new work or execute it direct if 1T
@@ -16819,10 +16863,13 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
     state->shared->perf_node_start_cycles  = ggml_perf_cycles();
     state->shared->perf_node_start_time_us = ggml_perf_time_us();

+    params.nth = node->n_tasks;
+
     /* INIT */
-
-
-
+    if (GGML_OP_HAS_INIT[node->op]) {
+        params.type = GGML_TASK_INIT;
+        ggml_compute_forward(&params, node);
+    }

     if (node->n_tasks == 1) {
         // TODO: maybe push node_n to the atomic but if other threads see n_tasks is 1,
@@ -16830,9 +16877,11 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
         params.type = GGML_TASK_COMPUTE;
         ggml_compute_forward(&params, node);

-
-
-
+        if (GGML_OP_HAS_FINALIZE[node->op]) {
+            params.type = GGML_TASK_FINALIZE;
+            ggml_compute_forward(&params, node);
+            ggml_graph_compute_perf_stats_node(node, state->shared);
+        }
     } else {
         break;
     }
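In math form, get_theta_scale keeps the standard RoPE base for contexts up to 2048 tokens and switches to an NTK-aware scaled base beyond that; the following is a restatement of the function above with d = n_dims, not an independent derivation:

\theta_{\text{scale}} =
\begin{cases}
10000^{-2/d} & n_{\text{ctx}} \le 2048 \\
\left(10000\,\alpha^{\,d/(d-2)}\right)^{-2/d} & n_{\text{ctx}} > 2048,\quad
\alpha = \begin{cases} 4 & n_{\text{ctx}} \le 4096 \\ 8 & n_{\text{ctx}} > 4096 \end{cases}
\end{cases}

Larger effective bases shrink the per-position rotation angle, which is how the longer contexts enabled elsewhere in this commit (up to 8192) reuse the model's trained 2048-token RoPE without the old GGML_TRAINING_CTX rescaling.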
ggml.h
CHANGED
@@ -201,12 +201,6 @@
 #define GGML_MAX_NAME 48
 #define GGML_DEFAULT_N_THREADS 4

-// Maximum training context of the model in use
-// For the LLaMA models this is normally 2048, but somehow "stepping out" by 128 gives better results (tested at 7B and 13B)
-#ifndef GGML_TRAINING_CTX
-#define GGML_TRAINING_CTX 2176
-#endif
-
 #define GGML_ASSERT(x) \
     do { \
         if (!(x)) { \
@@ -450,6 +444,9 @@ extern "C" {


     // compute types
+
+    // NOTE: the INIT or FINALIZE pass is not scheduled unless explicitly enabled.
+    // This behavior was changed since https://github.com/ggerganov/llama.cpp/pull/1995.
     enum ggml_task_type {
         GGML_TASK_INIT = 0,
         GGML_TASK_COMPUTE,
@@ -507,6 +504,8 @@ extern "C" {
     // use this to compute the memory overhead of a tensor
     GGML_API size_t ggml_tensor_overhead(void);

+    GGML_API float get_theta_scale(int n_dims,int n_past,int n_ctx);
+
     // main

     GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
gpttype_adapter.cpp
CHANGED
@@ -431,6 +431,12 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
     else //rwkv_2
     {
         rwkv_ctx_v3 = rwkv_init_from_file(modelname.c_str(), n_threads);
+
+        // if(inputs.gpulayers>0)
+        // {
+        //     rwkv_gpu_offload_layers(rwkv_ctx_v3,inputs.gpulayers);
+        // }
+
         const struct rwkv_file_header & header = rwkv_ctx_v3->instance->model.header;
         const size_t n_vocab = header.n_vocab;
         printf("\nDetected Vocab: %d",n_vocab);
@@ -811,7 +817,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
     {
         params.top_k = 120; //to disable top_k we actually need to increase this value to a very high number
     }
-    if (params.seed <= 0)
+    if (params.seed <= 0 || params.seed==0xFFFFFFFF)
     {
         params.seed = time(NULL);
     }
@@ -1060,14 +1066,15 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
     }
     else
     {
-        if(embd.size()>1)
-        {
-
-        }
-        else
-        {
-
-
+        // if(embd.size()>1)
+        // {
+        //     evalres = rwkv_eval_sequence(rwkv_ctx_v3, (uint32_t*)embd.data(), embd.size(), rwkv_ctx_v3->state_in, rwkv_ctx_v3->state_out, rwkv_ctx_v3->logits_out);
+        // }
+        // else
+        // {
+        bool ignoreLogits = (!startedsampling && ((int)embd_inp.size() > input_consumed + 2));
+        evalres = rwkv_eval(rwkv_ctx_v3, embd[0], rwkv_ctx_v3->state_in, rwkv_ctx_v3->state_out, ignoreLogits?nullptr:rwkv_ctx_v3->logits_out);
+        //}

         memcpy(logits.data(), rwkv_ctx_v3->logits_out, sizeof(float) * rwkv_vocab.size());
         rwkv_ctx_v3->state_in = rwkv_ctx_v3->state_out;
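The extra params.seed==0xFFFFFFFF check presumably handles a seed that travels through an unsigned 32-bit field: a caller's -1 ("randomize") arrives as 0xFFFFFFFF, and a plain "seed <= 0" test on an unsigned value only catches 0. A small illustration with hypothetical variable names (the actual type of params.seed is not shown in this diff):

    #include <cstdint>
    #include <cstdio>

    int main() {
        int32_t  requested = -1;                    // "pick a random seed for me"
        uint32_t wire_seed = (uint32_t) requested;  // becomes 0xFFFFFFFF when stored unsigned
        // an unsigned value is never negative, so only the explicit sentinel check fires
        bool needs_random = (wire_seed == 0xFFFFFFFFu);
        printf("needs_random = %d\n", needs_random); // prints 1
        return 0;
    }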
koboldcpp.py
CHANGED
@@ -859,7 +859,7 @@ if __name__ == '__main__':
     parser.add_argument("--blasthreads", help="Use a different number of threads during BLAS if specified. Otherwise, has the same value as --threads",metavar=('[threads]'), type=int, default=0)
     parser.add_argument("--psutil_set_threads", help="Experimental flag. If set, uses psutils to determine thread count based on physical cores.", action='store_true')
     parser.add_argument("--highpriority", help="Experimental flag. If set, increases the process CPU priority, potentially speeding up generation. Use caution.", action='store_true')
-    parser.add_argument("--contextsize", help="Controls the memory allocated for maximum context size, only change if you need more RAM for big contexts. (default 2048)", type=int,choices=[512,1024,2048,4096,8192], default=2048)
+    parser.add_argument("--contextsize", help="Controls the memory allocated for maximum context size, only change if you need more RAM for big contexts. (default 2048)", type=int,choices=[512,1024,2048,3072,4096,6144,8192], default=2048)
     parser.add_argument("--blasbatchsize", help="Sets the batch size used in BLAS processing (default 512). Setting it to -1 disables BLAS mode, but keeps other benefits like GPU offload.", type=int,choices=[-1,32,64,128,256,512,1024], default=512)
     parser.add_argument("--stream", help="Uses streaming when generating tokens. Only for the Kobold Lite UI.", action='store_true')
     parser.add_argument("--smartcontext", help="Reserving a portion of context to try processing less frequently.", action='store_true')
llama.cpp
CHANGED
@@ -283,7 +283,13 @@ struct llama_model {

 struct llama_context {
     llama_context(const llama_model & model, const llama_vocab & vocab) : model(model), vocab(vocab), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
-
+#ifdef GGML_USE_METAL
+    ~llama_context() {
+        if (ctx_metal) {
+            ggml_metal_free(ctx_metal);
+        }
+    }
+#endif
     std::mt19937 rng;

     bool has_evaluated_once = false;
@@ -1121,7 +1127,7 @@ static void llama_model_load_internal(
     const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;

     // this is the total memory required to run the inference
-    const size_t bigctxmul = (hparams.n_ctx>2048?2:1);
+    const size_t bigctxmul = (hparams.n_ctx>4096?3:(hparams.n_ctx>2048?2:1));
     const size_t mem_required =
         ctx_size +
         mmapped_size - vram_weights + // weights in VRAM not in memory
@@ -2627,7 +2633,7 @@ struct llama_context * llama_new_context_with_model(

         ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type));

-        const size_t bigctxmul = (hparams.n_ctx>2048?2:1);
+        const size_t bigctxmul = (hparams.n_ctx>4096?3:(hparams.n_ctx>2048?2:1));
         ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0().at(ctx->model.type)*bigctxmul);
         ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type)*bigctxmul);
     }
@@ -3252,7 +3258,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
     return nread;
 }

-bool
+static bool llama_load_session_file_internal(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
     llama_file file(path_session, "rb");

     // sanity checks
@@ -3306,6 +3312,15 @@ bool llama_load_session_file(struct llama_context * ctx, const char * path_sessi
     return true;
 }

+bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
+    try {
+        return llama_load_session_file_internal(ctx, path_session, tokens_out, n_token_capacity, n_token_count_out);
+    } catch (const std::exception & err) {
+        fprintf(stderr, "error loading session file: %s\n", err.what());
+        return false;
+    }
+}
+
 bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
     llama_file file(path_session, "wb");

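The session-loading change splits the existing loader into an internal function plus a catch-all wrapper, so that file errors surface as a false return across the C boundary instead of an exception escaping the library. The same wrapper shape in isolation, with placeholder names rather than the llama.cpp API:

    #include <cstdio>
    #include <stdexcept>

    static bool do_load_internal(const char * path) {
        if (path == nullptr) {
            throw std::runtime_error("no path given");   // internal code is free to throw
        }
        // ... actual loading work would go here ...
        return true;
    }

    // boundary function: exceptions are converted to a boolean result
    bool do_load(const char * path) {
        try {
            return do_load_internal(path);
        } catch (const std::exception & err) {
            fprintf(stderr, "error loading file: %s\n", err.what());
            return false;
        }
    }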
otherarch/gptj_v3.cpp
CHANGED
@@ -68,6 +68,8 @@ ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, g
     printf("%s: ftype = %d\n", __func__, hparams.ftype);
     printf("%s: qntvr = %d\n", __func__, qntvr);

+    hparams.n_ctx = std::max(origmaxctx,hparams.n_ctx);
+
     hparams.ftype %= GGML_QNT_VERSION_FACTOR;
 }

@@ -474,8 +476,8 @@ bool gptj_eval(

     // self-attention
     {
-        struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].c_attn_q_proj_w, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0,
-        struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].c_attn_k_proj_w, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0,
+        struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].c_attn_q_proj_w, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0, n_ctx);
+        struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].c_attn_k_proj_w, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0, n_ctx);

         // store key and value to memory
         {
otherarch/llama_v2.cpp
CHANGED
@@ -2204,7 +2204,7 @@ struct llama_v2_context * llama_v2_init_from_file(

     llama_v2_context * ctx = new llama_v2_context;

-    if (params.seed < 0) {
+    if (params.seed < 0 || params.seed==0xFFFFFFFF) {
         params.seed = time(NULL);
     }

@@ -2552,7 +2552,7 @@ int llama_v2_get_kv_cache_token_count(const struct llama_v2_context * ctx) {
 #define LLAMA_V2_MAX_RNG_STATE (64*1024)

 void llama_v2_set_rng_seed(struct llama_v2_context * ctx, int seed) {
-    if (seed < 0) {
+    if (seed < 0 || seed==0xFFFFFFFF) {
         seed = time(NULL);
     }
     ctx->rng.seed(seed);
otherarch/neox_v3.cpp
CHANGED
@@ -68,6 +68,8 @@ ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model &
     printf("%s: ftype = %d\n", __func__, hparams.ftype);
     printf("%s: qntvr = %d\n", __func__, qntvr);

+    hparams.n_ctx = std::max(origmaxctx,hparams.n_ctx);
+
     hparams.ftype %= GGML_QNT_VERSION_FACTOR;
 }

@@ -502,8 +504,8 @@ bool gpt_neox_eval(
     struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd/n_head, n_head, N, cur->nb[1]/n_head, cur->nb[1], 2*sizeof(float)*n_embd/n_head));

     // using mode = 2 for GPT-NeoX mode
-    Qcur = ggml_rope_inplace(ctx0, Qcur, n_past, n_rot, 2,
-    Kcur = ggml_rope_inplace(ctx0, Kcur, n_past, n_rot, 2,
+    Qcur = ggml_rope_inplace(ctx0, Qcur, n_past, n_rot, 2, n_ctx);
+    Kcur = ggml_rope_inplace(ctx0, Kcur, n_past, n_rot, 2, n_ctx);

     // store key and value to memory
     {
otherarch/rwkv_v3.cpp
CHANGED
@@ -17,6 +17,7 @@
|
|
17 |
#include <utility>
|
18 |
|
19 |
#define _FILE_OFFSET_BITS 64
|
|
|
20 |
#define RWKV_MAYBE_BREAK
|
21 |
|
22 |
#include <sys/stat.h>
|
@@ -38,9 +39,6 @@
|
|
38 |
#endif
|
39 |
#endif
|
40 |
|
41 |
-
// static_assert(sizeof(stat::st_size) >= 8, "File offsets should be 64-bit or else rwkv.cpp will not be able to load model files over 2GB");
|
42 |
-
// static_assert(sizeof(decltype(ftell(NULL))) >= 8, "File offsets should be 64-bit or else rwkv.cpp will not be able to load model files over 2GB");
|
43 |
-
|
44 |
// --- Error handling ---
|
45 |
|
46 |
thread_local enum rwkv_error_flags global_last_error = RWKV_ERROR_NONE;
|
@@ -124,20 +122,17 @@ inline enum rwkv_error_flags operator|=(enum rwkv_error_flags & a, enum rwkv_err
|
|
124 |
|
125 |
#define RWKV_ASSERT_FALSE_MSG(ERR_VAL, x, ...) RWKV_ASSERT_MSG(ERR_VAL, false, x, __VA_ARGS__)
|
126 |
#define RWKV_ASSERT_NULL_MSG(ERR_VAL, x, ...) RWKV_ASSERT_MSG(ERR_VAL, NULL, x, __VA_ARGS__)
|
|
|
127 |
#define RWKV_CTX_ASSERT_FALSE_MSG(ctx, ERR_VAL, x, ...) RWKV_CTX_ASSERT_MSG(ctx, ERR_VAL, false, x, __VA_ARGS__)
|
128 |
-
#define RWKV_CTX_ASSERT_NULL_MSG(ctx, ERR_VAL, x, ...) RWKV_CTX_ASSERT_MSG(ctx, ERR_VAL, NULL, x, __VA_ARGS__)
|
129 |
|
130 |
#define RWKV_ASSERT_FALSE(ERR_VAL, x) RWKV_ASSERT(ERR_VAL, false, x)
|
131 |
#define RWKV_ASSERT_NULL(ERR_VAL, x) RWKV_ASSERT(ERR_VAL, NULL, x)
|
|
|
132 |
#define RWKV_CTX_ASSERT_FALSE(ctx, ERR_VAL, x) RWKV_CTX_ASSERT(ctx, ERR_VAL, false, x)
|
133 |
-
#define RWKV_CTX_ASSERT_NULL(ctx, ERR_VAL, x) RWKV_CTX_ASSERT(ctx, ERR_VAL, NULL, x)
|
134 |
|
135 |
#define RWKV_ENSURE_OR_FALSE(x) RWKV_ENSURE(false, x)
|
136 |
#define RWKV_ENSURE_OR_NULL(x) RWKV_ENSURE(NULL, x)
|
137 |
#define RWKV_ENSURE_OR_FALSE_MSG(x, ...) RWKV_ENSURE_MSG(false, x, __VA_ARGS__)
|
138 |
-
#define RWKV_ENSURE_OR_NULL_MSG(x, ...) RWKV_ENSURE_MSG(NULL, x, __VA_ARGS__)
|
139 |
-
#define RWKV_CTX_ENSURE_OR_FALSE_MSG(ctx, x, ...) RWKV_CTX_ENSURE_MSG(ctx, false, x, __VA_ARGS__)
|
140 |
-
#define RWKV_CTX_ENSURE_OR_NULL_MSG(ctx, x, ...) RWKV_CTX_ENSURE_MSG(ctx, NULL, x, __VA_ARGS__)
|
141 |
|
142 |
// --- Utilities ---
|
143 |
|
@@ -172,13 +167,13 @@ bool rwkv_fwrite_data(FILE * file, const void * data, const size_t length) {
|
|
172 |
return fwrite(data, length, 1, file) == 1;
|
173 |
}
|
174 |
|
175 |
-
// --- File
|
176 |
|
177 |
#define TYPE_UNKNOWN TYPE_COUNT
|
178 |
|
179 |
enum rwkv_type {
|
180 |
-
|
181 |
-
|
182 |
TYPE_Q4_0,
|
183 |
TYPE_Q4_1,
|
184 |
TYPE_Q4_1_O, // Unsupported
|
@@ -193,8 +188,8 @@ enum rwkv_type {
|
|
193 |
#define GGML_TYPE_UNKNOWN GGML_TYPE_COUNT
|
194 |
|
195 |
extern const enum ggml_type rwkv_type_to_ggml[TYPE_COUNT + 1] = {
|
196 |
-
GGML_TYPE_F32, /*
|
197 |
-
GGML_TYPE_F16, /*
|
198 |
GGML_TYPE_Q4_0, /* Q4_0 */
|
199 |
GGML_TYPE_Q4_1, /* Q4_1 */
|
200 |
GGML_TYPE_UNKNOWN, /* Q4_1_O */
|
@@ -207,8 +202,8 @@ extern const enum ggml_type rwkv_type_to_ggml[TYPE_COUNT + 1] = {
|
|
207 |
};
|
208 |
|
209 |
extern const enum rwkv_type rwkv_type_from_ggml[GGML_TYPE_COUNT + 1] = {
|
210 |
-
|
211 |
-
|
212 |
TYPE_Q4_0, /* Q4_0 */
|
213 |
TYPE_Q4_1, /* Q4_1 */
|
214 |
TYPE_Q4_2, /* Q4_2 */
|
@@ -223,7 +218,7 @@ extern const enum rwkv_type rwkv_type_from_ggml[GGML_TYPE_COUNT + 1] = {
|
|
223 |
TYPE_COUNT, /* COUNT */
|
224 |
};
|
225 |
|
226 |
-
extern const char * rwkv_type_to_string[TYPE_COUNT + 1] = {"
|
227 |
|
228 |
enum rwkv_type rwkv_type_from_string(const char * str) {
|
229 |
for (int ord = 0; ord < TYPE_COUNT; ord++) {
|
@@ -290,6 +285,8 @@ struct rwkv_tensor_header {
|
|
290 |
uint32_t data_type;
|
291 |
uint32_t width;
|
292 |
uint32_t height;
|
|
|
|
|
293 |
};
|
294 |
|
295 |
struct rwkv_tensor {
|
@@ -303,7 +300,12 @@ bool rwkv_fread_tensor_header(FILE * file, struct rwkv_tensor_header & header) {
|
|
303 |
header.height = 1;
|
304 |
RWKV_ASSERT_FALSE_MSG(RWKV_ERROR_SHAPE, header.dim_count == 1 || header.dim_count == 2, "Tensor has an invalid shape (%" PRId32 " dimensions)", header.dim_count);
|
305 |
RWKV_ASSERT_FALSE_MSG(RWKV_ERROR_DATA_TYPE, header.data_type < TYPE_COUNT, "Tensor data type out of range (%" PRId32 " > %" PRId32 ")", header.data_type, TYPE_COUNT - 1);
|
306 |
-
RWKV_ASSERT_FALSE_MSG(
|
|
|
|
|
|
|
|
|
|
|
307 |
|
308 |
if (header.dim_count == 2) {
|
309 |
RWKV_ASSERT_FALSE(RWKV_ERROR_FILE_READ, rwkv_fread_uint32(file, header.height));
|
@@ -317,22 +319,8 @@ bool rwkv_fwrite_tensor_header(FILE * file, const struct rwkv_tensor_header & he
|
|
317 |
return true;
|
318 |
}
|
319 |
|
320 |
-
size_t rwkv_tensor_size(enum ggml_type type, const int64_t width, const int64_t height = 1) {
|
321 |
-
struct ggml_tensor decoy {};
|
322 |
-
decoy.type = type;
|
323 |
-
decoy.ne[0] = width;
|
324 |
-
decoy.ne[1] = height;
|
325 |
-
decoy.ne[2] = 1;
|
326 |
-
decoy.ne[3] = 1;
|
327 |
-
return ggml_nbytes(&decoy);
|
328 |
-
}
|
329 |
-
|
330 |
-
size_t rwkv_tensor_size(const struct rwkv_tensor_header & header) {
|
331 |
-
return rwkv_tensor_size(rwkv_type_to_ggml[header.data_type], header.width, header.height);
|
332 |
-
}
|
333 |
-
|
334 |
bool rwkv_fskip_tensor_data(FILE * file, const struct rwkv_tensor_header & header) {
|
335 |
-
return fseek(file, header.key_length +
|
336 |
}
|
337 |
|
338 |
bool rwkv_fread_tensor_header_and_skip(FILE * file, struct rwkv_tensor_header & header) {
|
@@ -342,7 +330,7 @@ bool rwkv_fread_tensor_header_and_skip(FILE * file, struct rwkv_tensor_header &
|
|
342 |
}
|
343 |
|
344 |
bool rwkv_fread_tensor_data(FILE * file, struct rwkv_tensor & output, void * buffer = NULL) {
|
345 |
-
size_t data_size =
|
346 |
RWKV_ASSERT_FALSE(RWKV_ERROR_FILE_READ, rwkv_fread_string(file, output.header.key_length, output.name));
|
347 |
|
348 |
if (buffer) {
|
@@ -361,10 +349,33 @@ bool rwkv_fread_tensor(FILE * file, struct rwkv_tensor & output, void * buffer =
|
|
361 |
return true;
|
362 |
}
|
363 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
364 |
bool rwkv_fwrite_tensor(FILE * file, const struct rwkv_tensor & tensor) {
    RWKV_ENSURE_OR_FALSE(rwkv_fwrite_tensor_header(file, tensor.header));
    RWKV_ENSURE_OR_FALSE(rwkv_fwrite_string(file, tensor.name));
-   RWKV_ENSURE_OR_FALSE(rwkv_fwrite_data(file, tensor.data,
    return true;
}

@@ -404,7 +415,7 @@ struct rwkv_model {
    struct ggml_tensor * ln0_weight;
    struct ggml_tensor * ln0_bias;

-   std::unique_ptr<struct rwkv_layer

    struct ggml_tensor * ln_out_weight;
    struct ggml_tensor * ln_out_bias;
@@ -457,28 +468,153 @@ struct ggml_tensor * rwkv_max(ggml_context * ctx, struct ggml_tensor * x, struct
struct ggml_tensor * rwkv_layer_norm(ggml_context * ctx, struct ggml_tensor * x, struct ggml_tensor * weight, struct ggml_tensor * bias) {
    // LayerNorm in RWKV is `x = (x - mean(x)) / sqrt(variance(x) + 1e-5) * weight + bias`
    // Looks like ggml_norm does the first part, we only need to apply weight & bias.
-   return ggml_add_inplace(ctx,
}

// --- Implementation ---

-// Used
-
    size_t objects_count = 0;
-   size_t
    size_t scratch_size = 0;
};
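The layer-norm comment above spells out the exact formula the graph implements. A scalar reference version (a hedged sketch in plain C++, not part of this tree) can be handy for checking a ggml graph's output against the formula:

#include <cmath>
#include <vector>

// Reference LayerNorm matching the comment above:
// y[i] = (x[i] - mean(x)) / sqrt(variance(x) + 1e-5) * weight[i] + bias[i]
std::vector<float> layer_norm_ref(const std::vector<float> & x,
                                  const std::vector<float> & weight,
                                  const std::vector<float> & bias) {
    const size_t n = x.size();
    float mean = 0.0f;
    for (float v : x) mean += v;
    mean /= n;

    float var = 0.0f;
    for (float v : x) var += (v - mean) * (v - mean);
    var /= n;

    const float inv_std = 1.0f / std::sqrt(var + 1e-5f);

    std::vector<float> y(n);
    for (size_t i = 0; i < n; i++) {
        y[i] = (x[i] - mean) * inv_std * weight[i] + bias[i];
    }
    return y;
}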
struct rwkv_ggml_context {
-   std::unique_ptr<uint8_t
    struct ggml_context * ctx;

    rwkv_ggml_context(): ctx(NULL) {}

-   rwkv_ggml_context(struct
-       scratch.reset(new(std::nothrow) uint8_t

        if (!scratch) {
            return;
@@ -487,13 +623,13 @@ struct rwkv_ggml_context {
        const size_t memory_required_overhead = size_t(128) * 1024 * 1024;
        const size_t memory_required_overhead_sc = size_t(64) * 1024 * 1024;

-       ctx = ggml_init({

        if (!ctx) {
            return;
        }

-       ggml_set_scratch(ctx, { 0, memory_required_overhead_sc +
    }

    struct rwkv_ggml_context & operator=(struct rwkv_ggml_context && source) {
@@ -516,7 +652,7 @@ struct rwkv_instance {
    struct rwkv_ggml_context ctx;
    struct rwkv_model model;

-   // TODO
    // The ggml_cgraph allocates a "work tensor" the first time it is used.
    // Currently, the height of blocks.0.ffn.key.weight is the bottleneck in our implementation of RWKV.
    // Since it is the largest dimension used in any matrix multiply, it is the size used for the "work tensor".
@@ -528,8 +664,8 @@ struct rwkv_instance {

// The hidden state of a single RWKV layer.
// These are mostly used for dividing up the input state, and writing portions of the output state.
-// But they're also used in building the computation graphs
-// (operating "in place" on a rwkv_layer_state).
struct rwkv_layer_state {
    struct ggml_tensor * ffn_xx;
    struct ggml_tensor * att_xx;
@@ -538,7 +674,7 @@ struct rwkv_layer_state {
    struct ggml_tensor * att_pp;
};

-// Holds a single computation graph and its
// Graphs each have their own context so that they can be individually freed and rebuilt.
// Graphs read hidden state from the rwkv_context and then write it back to the rwkv_context.
// (see rwkv_context.input_layers and rwkv_context.output_layers)
@@ -548,6 +684,11 @@ struct rwkv_graph {

    // ggml_cgraph is so large that it can cause stack overflows if not stored on the heap
    std::unique_ptr<struct ggml_cgraph> cgraph;
};

// RWKV context for a specific instance.
@@ -558,9 +699,9 @@ struct rwkv_context {
    // Reused by all graphs.
    struct rwkv_ggml_context ctx;
    struct ggml_tensor * input_state;
-   std::unique_ptr<struct rwkv_layer_state
    struct ggml_tensor * output_state;
-   std::unique_ptr<struct rwkv_layer_state
    struct ggml_tensor * logits;

    uint32_t n_threads;
@@ -581,40 +722,17 @@ struct rwkv_context {
    float * logits_out = 0; //stores address of output logit buffer

    size_t gpu_layers;
-   size_t vram_total;
};

-
-   enum ggml_type ggml_type = rwkv_type_to_ggml[header.data_type];
-   RWKV_ASSERT_FALSE_MSG(RWKV_ERROR_UNSUPPORTED, ggml_type != GGML_TYPE_UNKNOWN, "Unsupported tensor data type %s from %s", rwkv_type_to_string[header.data_type], name.c_str());
-
-   tensor = header.dim_count == 1
-       ? ggml_new_tensor_1d(ctx, ggml_type, header.width)
-       : ggml_new_tensor_2d(ctx, ggml_type, header.width, header.height);
-
-   RWKV_ASSERT_FALSE_MSG(RWKV_ERROR_ALLOC, tensor, "Failed to allocate tensor");
-   ggml_set_name(tensor, name.c_str());
-
-   RWKV_ASSERT_FALSE_MSG(RWKV_ERROR_FILE_READ, rwkv_fread_data(file, ggml_nbytes(tensor), tensor->data), "Failed to read tensor data from %s", name.c_str());
-   return true;
-}
-
-bool rwkv_fread_ggml_tensor(FILE * file, struct ggml_context * ctx, std::string & name, struct ggml_tensor *& tensor) {
-   struct rwkv_tensor_header header;
-   RWKV_ENSURE_OR_FALSE_MSG(rwkv_fread_tensor_header(file, header), "Invalid tensor header");
-   return rwkv_fread_ggml_tensor_data(file, header, ctx, name, tensor);
-}
-
-template<typename F> // https://stackoverflow.com/a/6458689
bool rwkv_set_params(struct rwkv_model & model, F callback) {
    RWKV_ENSURE_OR_FALSE(callback("emb.weight", model.emb));
    RWKV_ENSURE_OR_FALSE(callback("blocks.0.ln0.weight", model.ln0_weight));
    RWKV_ENSURE_OR_FALSE(callback("blocks.0.ln0.bias", model.ln0_bias));

    uint32_t n_layer = model.header.n_layer;
-   std::unique_ptr<struct rwkv_layer
    RWKV_ASSERT_FALSE_MSG(RWKV_ERROR_ALLOC, layers.get(), "Failed to allocate model layers");
    model.layers = std::move(layers);

@@ -652,121 +770,108 @@ bool rwkv_set_params(struct rwkv_model & model, F callback) {
    return true;
}

-void
-
-   rwkv_ctx_size_add_scratch(ctx_size, scratch, scratches);
-}
-
-void rwkv_ctx_size_add(struct rwkv_ctx_size & ctx_size, size_t count, const struct rwkv_ctx_size & other) {
-   ctx_size.objects_count += other.objects_count * count;
-   ctx_size.objects_size += other.objects_size * count;
-   ctx_size.scratch_size += other.scratch_size * count;
-}
-
-void rwkv_ctx_size_add_tensor(struct rwkv_ctx_size & ctx_size, const uint64_t tensors, const uint64_t views, const enum ggml_type type, const uint64_t width, const uint64_t height = 1) {
-   rwkv_ctx_size_add_objects(ctx_size, tensors + views);
-   rwkv_ctx_size_add_scratch(ctx_size, rwkv_tensor_size(type, width, height), tensors);
-}
-
-void rwkv_ctx_size_add_tensor(struct rwkv_ctx_size & size, const uint64_t tensors, const uint64_t views, const struct rwkv_tensor_header & header) {
-   rwkv_ctx_size_add_tensor(size, tensors, views, rwkv_type_to_ggml[header.data_type], header.width, header.height);
-}
-
-struct rwkv_ctx_size rwkv_xx_size(const size_t n_embed = 0, const size_t sequence_len = 1) {
-   struct rwkv_ctx_size ctx_size;
-
-   if (sequence_len == 1) {
-       /* x0 */ rwkv_ctx_size_add_tensor(ctx_size, 2, 1, GGML_TYPE_F32, n_embed);
    } else {
-
    }
-
-   return ctx_size;
}

-void
-
    if (sequence_len == 1) {
        // self.layer_norm(x, self.w.blocks[i].ln2)
        x = rwkv_layer_norm(ctx, x, weight, bias);

        // xx = state[5*i+0]
-
        // state[5*i+0] = x
-
    } else {
        // self.layer_norm(x, self.w.blocks[i].ln2)
        x = rwkv_layer_norm(ctx, x, ggml_repeat(ctx, weight, x), ggml_repeat(ctx, bias, x));

        // xx = torch.cat((state[5*i+0].to(dtype=self.FLOAT_MODE).unsqueeze(0), x[:-1,:]))
-
        // state[5*i+0] = x[-1,:]
-
    }
}

-struct
-
-   /* r */ rwkv_ctx_size_add_tensor(ctx_size, 2, 0, GGML_TYPE_F32, n_embed, sequence_len);
-   /* r */ rwkv_ctx_size_add_tensor(ctx_size, 1, 0, GGML_TYPE_I32, ptr_nelem);
-   /* k */ rwkv_ctx_size_add_tensor(ctx_size, 1, 0, GGML_TYPE_F32, n_embed, sequence_len);
-   /* v */ rwkv_ctx_size_add_tensor(ctx_size, 1, 0, GGML_TYPE_F32, n_embed, sequence_len);
-
}

-void rwkv_att_rkv(
    // xk = x * time_mix_k + state[5 * i + 1] * (1 - time_mix_k)
    struct ggml_tensor * xk = ggml_add_inplace(ctx,
-       ggml_mul(ctx,
-       ggml_mul(ctx,
    );

    // xv = x * time_mix_v + state[5 * i + 1] * (1 - time_mix_v)
    struct ggml_tensor * xv = ggml_add_inplace(ctx,
-       ggml_mul(ctx,
-       ggml_mul(ctx,
    );

    // xr = x * time_mix_r + state[5 * i + 1] * (1 - time_mix_r)
    struct ggml_tensor * xr = ggml_add_inplace(ctx,
-       ggml_mul(ctx,
-       ggml_mul(ctx,
    );

    // r = torch.sigmoid(rw @ xr)
@@ -777,39 +882,47 @@ void rwkv_att_rkv(struct ggml_context * ctx, struct rwkv_layer layer, struct ggm
    v = ggml_mul_mat(ctx, layer.att_value, xv);
}

-struct
-
-struct
-
-   /* wkv */ rwkv_ctx_size_add_tensor(ctx_size, 1, 0, GGML_TYPE_F32, n_embed);
-
}

-struct ggml_tensor * rwkv_att_wkv(
    // ww = time_first + k
    struct ggml_tensor * ww = ggml_add(ctx, att_time_first, k);
    // qq = torch.maximum(pp, ww)
@@ -844,24 +957,42 @@ struct ggml_tensor * rwkv_att_wkv(struct ggml_context * ctx, struct ggml_tensor
    return ggml_div(ctx, a, b);
}

-struct rwkv_ctx_size rwkv_att_size(const size_t n_embed = 0) {
-   size_t ptr_nelem = sizeof(void *) / sizeof(uint32_t);
-
-   return
}
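The ww/qq/e1/e2 comments in the attention hunk above describe the numerically stable WKV recurrence: a and b accumulate an exp-weighted numerator and denominator while pp tracks the running maximum exponent, so only differences of exponents reach exp(). A per-channel scalar sketch of that recurrence (names and the wkv_state struct are chosen for illustration, and time_decay is assumed to already be the per-channel decay term stored in the model, as in this file):

#include <algorithm>
#include <cmath>

struct wkv_state { float aa, bb, pp; };

// One WKV step for a single channel, following the comments above.
float wkv_step(wkv_state & s, float time_first, float time_decay, float k, float v) {
    // ww = time_first + k; qq = max(pp, ww)
    float ww = time_first + k;
    float qq = std::max(s.pp, ww);
    float e1 = std::exp(s.pp - qq);
    float e2 = std::exp(ww - qq);
    // wkv = (e1*aa + e2*v) / (e1*bb + e2)
    float wkv = (e1 * s.aa + e2 * v) / (e1 * s.bb + e2);

    // decay the state: ww = pp + time_decay; qq = max(ww, k)
    ww = s.pp + time_decay;
    qq = std::max(ww, k);
    e1 = std::exp(ww - qq);
    e2 = std::exp(k - qq);
    s.aa = e1 * s.aa + e2 * v;
    s.bb = e1 * s.bb + e2;
    s.pp = qq;
    return wkv;
}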
struct ggml_tensor * rwkv_att(struct ggml_context * ctx, struct ggml_tensor * x, struct rwkv_layer layer, struct rwkv_layer_state & state) {
-   struct ggml_tensor *
-
    struct ggml_tensor * r, * k, * v;
-   rwkv_att_rkv(ctx, layer,

    struct ggml_tensor * wkv = rwkv_att_wkv(ctx, layer.att_time_first, layer.att_time_decay, k, v, state.att_aa, state.att_bb, state.att_pp);

@@ -869,74 +1000,129 @@ struct ggml_tensor * rwkv_att(struct ggml_context * ctx, struct ggml_tensor * x,
    return ggml_mul_mat(ctx, layer.att_output, ggml_mul(ctx, r, wkv));
}

-struct
-
-struct
-
-   /* k */ rwkv_ctx_size_add_tensor(ctx_size, 3, 0, GGML_TYPE_F32, ffn_key, sequence_len);
-
-   return
}

struct ggml_tensor * rwkv_ffn(struct ggml_context * ctx, struct ggml_tensor * x, struct rwkv_layer layer, struct rwkv_layer_state & state) {
-   struct ggml_tensor *
-
    // xk = x * time_mix_k + state[5 * i + 1] * (1 - time_mix_k)
    // xk = x * time_mix_k + state[5 * i + 0] * (1 - time_mix_k)
    struct ggml_tensor * xk = ggml_add_inplace(
        ctx,
-       ggml_mul(ctx,
-       ggml_mul(ctx,
    );

    // xr = x * time_mix_r + state[5 * i + 0] * (1 - time_mix_r)
    struct ggml_tensor * xr = ggml_add_inplace(
        ctx,
-       ggml_mul(ctx,
-       ggml_mul(ctx,
    );

    // r = torch.sigmoid(rw @ xr)
    struct ggml_tensor * r = rwkv_sigmoid(ctx, ggml_mul_mat(ctx, layer.ffn_receptance, xr));

    // k = torch.square(torch.relu(kw @ xk))
-   struct ggml_tensor * k =

    // r * (vw @ k)
-   return
}

-struct
-
-   /* logits */ rwkv_ctx_size_add_tensor(ctx_size, 1, 1, GGML_TYPE_F32, n_vocab);

-   return
}
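The channel-mix comments in rwkv_ffn above give the formula k = square(relu(Wk @ xk)) and out = sigmoid(Wr @ xr) * (Wv @ k). A scalar sketch of that computation, with matvec() as a placeholder dense matrix-vector product (all names here are illustrative, not from this tree):

#include <algorithm>
#include <cmath>
#include <vector>

static std::vector<float> matvec(const std::vector<std::vector<float>> & w, const std::vector<float> & x) {
    std::vector<float> y(w.size(), 0.0f);
    for (size_t i = 0; i < w.size(); i++)
        for (size_t j = 0; j < x.size(); j++)
            y[i] += w[i][j] * x[j];
    return y;
}

// Channel-mix: square(relu(Wk @ xk)) gated by sigmoid(Wr @ xr).
std::vector<float> channel_mix(const std::vector<std::vector<float>> & wk,
                               const std::vector<std::vector<float>> & wv,
                               const std::vector<std::vector<float>> & wr,
                               const std::vector<float> & xk,
                               const std::vector<float> & xr) {
    std::vector<float> k = matvec(wk, xk);
    for (float & v : k) { v = std::max(v, 0.0f); v = v * v; }

    std::vector<float> r  = matvec(wr, xr);
    std::vector<float> vk = matvec(wv, k);

    std::vector<float> out(r.size());
    for (size_t i = 0; i < r.size(); i++)
        out[i] = (1.0f / (1.0f + std::exp(-r[i]))) * vk[i];
    return out;
}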
bool rwkv_build_serial_graph(
@@ -946,10 +1132,13 @@ bool rwkv_build_serial_graph(
    struct rwkv_layer_state * inputs,
    struct rwkv_layer_state * outputs,
    struct ggml_tensor * logits,
-   struct ggml_cgraph * cgraph
-) {
-   size_t n_embed = model.header.n_embed;

    // x = self.w.emb.weight[token]
    struct ggml_tensor * x = ggml_get_rows(ctx, model.emb, tokens);

@@ -971,40 +1160,93 @@ bool rwkv_build_serial_graph(
        ggml_build_forward_expand(cgraph, ggml_cpy(ctx, state.att_pp, output.att_pp));
    }

    // x = self.layer_norm(x[-1,:], self.w.ln_out)
    x = rwkv_layer_norm(ctx, x, model.ln_out_weight, model.ln_out_bias);

    // x = (self.w.head.weight @ x).float()
    ggml_build_forward_expand(cgraph, ggml_cpy(ctx, ggml_mul_mat(ctx, model.head, x), logits));

    return true;
}

-struct
-struct
-
-   /* xt */ rwkv_ctx_size_add_tensor(ctx_size, 0, n_layer * sequence_len, GGML_TYPE_F32, n_embed);
-   /* wkv */ rwkv_ctx_size_add(ctx_size, n_layer * sequence_len, rwkv_att_wkv_size(n_embed));
-   /* xt */ rwkv_ctx_size_add_tensor(ctx_size, 0, n_layer * sequence_len, GGML_TYPE_F32, n_embed);
-   /* x */ rwkv_ctx_size_add_tensor(ctx_size, n_layer * 2, 0, GGML_TYPE_F32, n_embed, sequence_len);
-
-   /* logits */ rwkv_ctx_size_add_tensor(ctx_size, 1, 1, GGML_TYPE_F32, n_vocab);

-   return
}
bool rwkv_build_sequence_graph(
@@ -1014,7 +1256,12 @@ bool rwkv_build_sequence_graph(
    struct rwkv_layer_state * inputs,
    struct rwkv_layer_state * outputs,
    struct ggml_tensor * logits,
-   struct ggml_cgraph * cgraph
) {
    const uint32_t n_embed = model.header.n_embed;
    const size_t sequence_len = tokens->ne[0];
@@ -1026,23 +1273,23 @@ bool rwkv_build_sequence_graph(
        struct rwkv_layer & layer = model.layers[i];
        struct rwkv_layer_state state = inputs[i];

-       struct ggml_tensor * x0 = x, *
-
        struct ggml_tensor * r, * k, * v;
-       rwkv_att_rkv(ctx, layer, x0,

        ggml_build_forward_expand(cgraph, r);

        for (uint32_t t = 0; t < sequence_len; t++) {
            struct ggml_tensor * kt = ggml_view_1d(ctx, k, n_embed, n_embed * sizeof(float) * t);
            struct ggml_tensor * vt = ggml_view_1d(ctx, v, n_embed, n_embed * sizeof(float) * t);
-           struct ggml_tensor * xt = ggml_view_1d(ctx,
            struct ggml_tensor * wkv = rwkv_att_wkv(ctx, layer.att_time_first, layer.att_time_decay, kt, vt, state.att_aa, state.att_bb, state.att_pp);
            ggml_build_forward_expand(cgraph, ggml_cpy(ctx, wkv, xt));
        }

-       x = ggml_add_inplace(ctx, x, ggml_mul_mat(ctx, layer.att_output, ggml_mul(ctx, r,
        x = ggml_add_inplace(ctx, x, rwkv_ffn(ctx, x, layer, state));

        struct rwkv_layer_state & output = outputs[i];
@@ -1053,33 +1300,21 @@ bool rwkv_build_sequence_graph(
        ggml_build_forward_expand(cgraph, ggml_cpy(ctx, state.att_pp, output.att_pp));
    }

    // x = self.layer_norm(x[-1,:], self.w.ln_out)
    x = rwkv_layer_norm(ctx, ggml_view_1d(ctx, x, n_embed, n_embed * sizeof(float) * (sequence_len - 1)), model.ln_out_weight, model.ln_out_bias);

    // x = (self.w.head.weight @ x).float()
    ggml_build_forward_expand(cgraph, ggml_cpy(ctx, ggml_mul_mat(ctx, model.head, x), logits));

-
-size_t rwkv_estimate_graph_work(const enum ggml_type type, const size_t ffn_key_size, const uint32_t n_threads, const size_t sequence_len = 1) {
-
-   return rwkv_tensor_size(GGML_TYPE_I8, rwkv_tensor_size(mul_mat_type, ffn_key_size, sequence_len) * n_threads + 64 * (n_threads - 1));
}

-struct rwkv_file {
-   FILE * file;
-
-   rwkv_file(FILE * file): file(file) {}
-
-   ~rwkv_file() {
-       if (file) {
-           fclose(file);
-       }
-   }
-};
-
void rwkv_set_print_errors(struct rwkv_context * ctx, bool print_errors) {
    bool * ptr = ctx ? &ctx->print_errors : &global_print_errors;
    *ptr = print_errors;
@@ -1096,6 +1331,18 @@ enum rwkv_error_flags rwkv_get_last_error(struct rwkv_context * ctx) {
    return value;
}
bool rwkv_instance_from_file(const char * file_path, struct rwkv_instance & instance) {
    struct stat file_stat;
    struct rwkv_model model;
@@ -1114,14 +1361,14 @@ bool rwkv_instance_from_file(const char * file_path, struct rwkv_instance & inst
    struct rwkv_tensor_header tensor_header;
    std::string name;
-   struct

    while ((size_t) ftell(file.file) < (size_t) file_stat.st_size) {
        RWKV_ASSERT_NULL_MSG(RWKV_ERROR_MODEL_PARAMS, rwkv_fread_tensor_header(file.file, tensor_header), "Invalid tensor header");
        RWKV_ASSERT_NULL_MSG(RWKV_ERROR_MODEL_PARAMS, rwkv_fread_string(file.file, tensor_header.key_length, name), "Failed to read tensor name");
-       RWKV_ASSERT_NULL_MSG(RWKV_ERROR_FILE | RWKV_ERROR_FILE_READ, fseek(file.file,
-
        if (ffn_key_size == 0 && name == "blocks.0.ffn.key.weight") {
            ffn_key_size = tensor_header.height;
@@ -1131,7 +1378,7 @@ bool rwkv_instance_from_file(const char * file_path, struct rwkv_instance & inst
    RWKV_ASSERT_NULL_MSG(RWKV_ERROR_MODEL_PARAMS | RWKV_ERROR_PARAM_MISSING, ffn_key_size, "Model is missing parameter blocks.0.ffn.key.weight");
    RWKV_ASSERT_NULL_MSG(RWKV_ERROR_FILE | RWKV_ERROR_FILE_READ, fseek(file.file, sizeof(struct rwkv_file_header), SEEK_SET) == 0, "Failed to seek in file");

-   ctx =
    RWKV_ASSERT_NULL_MSG(RWKV_ERROR_CTX | RWKV_ERROR_ALLOC, ctx.ctx, "Failed to allocate model context");

    struct ggml_tensor * tensor;
@@ -1170,25 +1417,31 @@ struct rwkv_context * rwkv_new_context_impl(std::shared_ptr<struct rwkv_instance
    const size_t n_embed = header.n_embed;
    const size_t n_layer = header.n_layer;

-   struct
-
-   struct rwkv_ggml_context ctx(
    RWKV_ASSERT_NULL_MSG(RWKV_ERROR_CTX | RWKV_ERROR_ALLOC, ctx.ctx, "Failed to allocate model context");

    struct ggml_tensor * input = ggml_new_tensor_1d(ctx.ctx, GGML_TYPE_F32, n_embed * 5 * n_layer);
    struct ggml_tensor * output = ggml_new_tensor_1d(ctx.ctx, GGML_TYPE_F32, n_embed * 5 * n_layer);

    // We collect parts of input state here. Each part is (n_embed) vector.
-   std::unique_ptr<struct rwkv_layer_state
    RWKV_ASSERT_NULL_MSG(RWKV_ERROR_ALLOC, inputs.get(), "Failed to allocate input state parts");

    // We collect parts of output state here. Each part is (n_embed) vector.
-   std::unique_ptr<struct rwkv_layer_state
    RWKV_ASSERT_NULL_MSG(RWKV_ERROR_ALLOC, outputs.get(), "Failed to allocate output state parts");

    for (size_t i = 0; i < n_layer; i++) {
@@ -1209,19 +1462,52 @@ struct rwkv_context * rwkv_new_context_impl(std::shared_ptr<struct rwkv_instance
    struct ggml_tensor * logits = ggml_new_tensor_1d(ctx.ctx, GGML_TYPE_F32, n_vocab);

-   struct
-
    struct rwkv_graph serial_graph;
-   serial_graph.ctx =
    RWKV_ASSERT_NULL_MSG(RWKV_ERROR_CTX | RWKV_ERROR_ALLOC, serial_graph.ctx.ctx, "Failed to allocate serial graph context");
    serial_graph.tokens = ggml_new_i32(serial_graph.ctx.ctx, 0);
    serial_graph.cgraph.reset(new(std::nothrow) struct ggml_cgraph());
    RWKV_ASSERT_NULL_MSG(RWKV_ERROR_ALLOC, serial_graph.cgraph, "Failed to allocate serial graph");
    serial_graph.cgraph->n_threads = n_threads;

    std::unique_ptr<struct rwkv_context> rwkv_ctx(new(std::nothrow) struct rwkv_context());
    RWKV_ASSERT_NULL_MSG(RWKV_ERROR_CTX | RWKV_ERROR_ALLOC, rwkv_ctx, "Failed to allocate rwkv_context");
@@ -1258,20 +1544,16 @@ struct rwkv_context * rwkv_clone_context(struct rwkv_context * ctx, const uint32
    return clone;
}

-bool rwkv_gpu_offload_layers(
-   return
}
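The functions above build a context from a model file and evaluate it through the public API declared in otherarch/rwkv_v3.h. The following is only a hedged usage sketch: the names rwkv_init_from_file, rwkv_get_state_buffer_element_count, rwkv_get_logits_buffer_element_count, rwkv_eval and rwkv_free follow the upstream rwkv.cpp signatures and should be checked against the header in this tree before use.

#include <vector>
#include <cstdio>
#include "otherarch/rwkv_v3.h"

int main() {
    struct rwkv_context * ctx = rwkv_init_from_file("model.bin", 4 /* threads */);
    if (!ctx) return 1;

    std::vector<float> state(rwkv_get_state_buffer_element_count(ctx));
    std::vector<float> logits(rwkv_get_logits_buffer_element_count(ctx));

    // First token: pass NULL as state_in so the state is initialized by the library.
    rwkv_eval(ctx, 0 /* token id */, NULL, state.data(), logits.data());
    // Subsequent tokens reuse the same buffer as both input and output state.
    rwkv_eval(ctx, 1, state.data(), state.data(), logits.data());

    printf("logits[0] = %f\n", logits[0]);
    rwkv_free(ctx);
    return 0;
}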
void rwkv_set_inputs(const struct rwkv_context * ctx, const float * state_in) {
    if (state_in) {
        memcpy(ctx->input_state->data, state_in, ggml_nbytes(ctx->input_state));
    } else {
-
-       for (size_t i = 0; i < ctx->instance->model.header.n_layer; i++) {
-           ggml_set_f32(ctx->input_layers[i].att_pp, -1e30F);
-       }
    }
}

@@ -1285,23 +1567,33 @@ void rwkv_get_outputs(const struct rwkv_context * ctx, float * state_out, float
    }
}

-bool rwkv_eval(
    const struct rwkv_file_header & header = ctx->instance->model.header;
    const size_t n_vocab = header.n_vocab;
-   RWKV_CTX_ASSERT_FALSE_MSG(ctx, RWKV_ERROR_ARGS, token < n_vocab, "Token (%" PRId32 ") is out of range (0

    rwkv_set_inputs(ctx, state_in);
    ggml_set_i32(ctx->serial_graph.tokens, token);

    ggml_graph_compute(ctx->serial_graph.ctx.ctx, ctx->serial_graph.cgraph.get());
    rwkv_get_outputs(ctx, state_out, logits_out);

    return true;
}

-bool rwkv_eval_sequence(
    const struct rwkv_file_header & header = ctx->instance->model.header;
    const size_t n_vocab = header.n_vocab;
@@ -1311,34 +1603,78 @@ bool rwkv_eval_sequence(const struct rwkv_context * ctx, const uint32_t * sequen
    if (sequence) {
        for (size_t i = 0; i < sequence_len; i++) {
            const uint32_t token = sequence[i];
-           RWKV_CTX_ASSERT_FALSE_MSG(ctx, RWKV_ERROR_ARGS, token < n_vocab, "
        }
    }

    if (ctx->sequence_len != sequence_len) {
        // Build new sequence graph
-
-       struct
-
    }

    // Allow building the sequence graph without actually evaluating, by specifying sequence = NULL.
    if (sequence) {
        rwkv_set_inputs(ctx, state_in);
        memcpy(ctx->sequence_graph.tokens->data, sequence, sequence_len * sizeof(uint32_t));

        ggml_graph_compute(ctx->sequence_graph.ctx.ctx, ctx->sequence_graph.cgraph.get());
        rwkv_get_outputs(ctx, state_out, logits_out);
    }
@@ -1346,12 +1682,52 @@ bool rwkv_eval_sequence(const struct rwkv_context * ctx, const uint32_t * sequen
    return true;
}

-
}

-
-   return ctx->instance->model.header.
}
void rwkv_free(struct rwkv_context * ctx) {
@@ -1381,7 +1757,12 @@ bool rwkv_quantize_model_file(const char * in_path, const char * out_path, const
    RWKV_ASSERT_FALSE_MSG(RWKV_ERROR_FILE, rwkv_fread_file_header(in_file.file, in_header), "Invalid file header");

    enum ggml_type in_type = rwkv_type_to_ggml[in_header.data_type];
-   RWKV_ASSERT_FALSE_MSG(

    struct rwkv_file_header out_header = in_header;
    out_header.version = RWKV_FILE_VERSION;
@@ -1392,7 +1773,7 @@ bool rwkv_quantize_model_file(const char * in_path, const char * out_path, const
    size_t orig_total_size = 0;
    size_t new_total_size = 0;

-   // Required to init the
    // Doesn't crash if ggml_init fails
    ggml_free(ggml_init({ 0, NULL, true }));

@@ -1404,26 +1785,26 @@ bool rwkv_quantize_model_file(const char * in_path, const char * out_path, const
        struct rwkv_tensor_header header;
        RWKV_ASSERT_FALSE(RWKV_ERROR_FILE, rwkv_fread_tensor_header_and_skip(in_file.file, header));

-       size_t in_size =

        if (in_size > max_in_size) {
            max_in_size = in_size;
        }

        // f16 type tensors get relocated to out and then converted into f32 at in
-       if (header.data_type ==
            if (in_size > max_out_size) {
                max_out_size = in_size;
            }

-           size_t f32_size =

            if (f32_size > max_in_size) {
                max_in_size = f32_size;
            }
        }

-       size_t out_size =

        if (out_size > max_out_size) {
            max_out_size = out_size;
@@ -1439,7 +1820,7 @@ bool rwkv_quantize_model_file(const char * in_path, const char * out_path, const
    // This is a histogram of quantized values. If it shows single 1.0, then all 0.0, something went very wrong!
    int64_t hist_all[16] {};

-   std::unique_ptr<uint8_t
    RWKV_ASSERT_FALSE_MSG(RWKV_ERROR_ALLOC, scratch.get(), "Failed to allocate buffer");

    uint8_t * in_buf = scratch.get();
@@ -1457,19 +1838,19 @@ bool rwkv_quantize_model_file(const char * in_path, const char * out_path, const
        const char * name_str = name.c_str();
        RWKV_MSG("%*s - [%5" PRId32 ", %5" PRId32 "], type = %6s ", (int) max_key_length, name_str, header.width, header.height, rwkv_type_to_string[header.data_type]);

-       data = header.data_type ==
-       size_t orig_size =
        RWKV_ASSERT_FALSE_MSG(RWKV_ERROR_MODEL_PARAMS, rwkv_fread_data(in_file.file, orig_size, data), "\nFailed to read tensor data of %s", name_str);

        // Quantize only 2D tensors, except embedding and head matrices.
        // Embedding and head take not too much space, especially in bigger models;
        // but they significantly increase perplexity when quantized.
-       if ((header.data_type ==
            RWKV_MSG("quantizing... ");

            size_t nelements = (size_t) header.width * (size_t) header.height;

-           if (header.data_type ==
                ggml_fp16_to_fp32_row((const ggml_fp16_t *) out_buf, (float *) in_buf, nelements);
            }
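The quantizer above is exposed as rwkv_quantize_model_file. A hedged usage sketch follows: the first two arguments are visible in the signature, and the third is assumed to be the target format name string ("Q4_0", "Q5_1", ...) as in upstream rwkv.cpp; verify against rwkv_v3.h in this tree.

#include <cstdio>
#include "otherarch/rwkv_v3.h"

int main() {
    // Convert an FP16 model file into Q5_1 (assumed format-name argument).
    bool ok = rwkv_quantize_model_file("rwkv-f16.bin", "rwkv-q5_1.bin", "Q5_1");
    if (!ok) {
        // NULL context queries the thread-local global error flags.
        fprintf(stderr, "quantization failed: error flags 0x%x\n", (unsigned) rwkv_get_last_error(NULL));
        return 1;
    }
    return 0;
}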
#include <utility>

#define _FILE_OFFSET_BITS 64
+// Puts an optional break point, if debug is enabled.
#define RWKV_MAYBE_BREAK

#include <sys/stat.h>

#endif
#endif

// --- Error handling ---

thread_local enum rwkv_error_flags global_last_error = RWKV_ERROR_NONE;

#define RWKV_ASSERT_FALSE_MSG(ERR_VAL, x, ...) RWKV_ASSERT_MSG(ERR_VAL, false, x, __VA_ARGS__)
#define RWKV_ASSERT_NULL_MSG(ERR_VAL, x, ...) RWKV_ASSERT_MSG(ERR_VAL, NULL, x, __VA_ARGS__)
+
#define RWKV_CTX_ASSERT_FALSE_MSG(ctx, ERR_VAL, x, ...) RWKV_CTX_ASSERT_MSG(ctx, ERR_VAL, false, x, __VA_ARGS__)

#define RWKV_ASSERT_FALSE(ERR_VAL, x) RWKV_ASSERT(ERR_VAL, false, x)
#define RWKV_ASSERT_NULL(ERR_VAL, x) RWKV_ASSERT(ERR_VAL, NULL, x)
+
#define RWKV_CTX_ASSERT_FALSE(ctx, ERR_VAL, x) RWKV_CTX_ASSERT(ctx, ERR_VAL, false, x)

#define RWKV_ENSURE_OR_FALSE(x) RWKV_ENSURE(false, x)
#define RWKV_ENSURE_OR_NULL(x) RWKV_ENSURE(NULL, x)
#define RWKV_ENSURE_OR_FALSE_MSG(x, ...) RWKV_ENSURE_MSG(false, x, __VA_ARGS__)

// --- Utilities ---

    return fwrite(data, length, 1, file) == 1;
}

+// --- File handling ---

#define TYPE_UNKNOWN TYPE_COUNT

enum rwkv_type {
+    TYPE_FP32,
+    TYPE_FP16,
    TYPE_Q4_0,
    TYPE_Q4_1,
    TYPE_Q4_1_O, // Unsupported

#define GGML_TYPE_UNKNOWN GGML_TYPE_COUNT

extern const enum ggml_type rwkv_type_to_ggml[TYPE_COUNT + 1] = {
+    GGML_TYPE_F32,     /* FP32   */
+    GGML_TYPE_F16,     /* FP16   */
    GGML_TYPE_Q4_0,    /* Q4_0   */
    GGML_TYPE_Q4_1,    /* Q4_1   */
    GGML_TYPE_UNKNOWN, /* Q4_1_O */

};

extern const enum rwkv_type rwkv_type_from_ggml[GGML_TYPE_COUNT + 1] = {
+    TYPE_FP32, /* FP32 */
+    TYPE_FP16, /* FP16 */
    TYPE_Q4_0, /* Q4_0 */
    TYPE_Q4_1, /* Q4_1 */
    TYPE_Q4_2, /* Q4_2 */

    TYPE_COUNT, /* COUNT */
};

+extern const char * rwkv_type_to_string[TYPE_COUNT + 1] = {"FP32", "FP16", "Q4_0", "Q4_1", "Q4_1_O", "Q4_2", "Q4_3", "Q5_0", "Q5_1", "Q8_0", "unknown"};

enum rwkv_type rwkv_type_from_string(const char * str) {
    for (int ord = 0; ord < TYPE_COUNT; ord++) {

    uint32_t data_type;
    uint32_t width;
    uint32_t height;
+
+    const size_t size() const;
};

struct rwkv_tensor {

    header.height = 1;
    RWKV_ASSERT_FALSE_MSG(RWKV_ERROR_SHAPE, header.dim_count == 1 || header.dim_count == 2, "Tensor has an invalid shape (%" PRId32 " dimensions)", header.dim_count);
    RWKV_ASSERT_FALSE_MSG(RWKV_ERROR_DATA_TYPE, header.data_type < TYPE_COUNT, "Tensor data type out of range (%" PRId32 " > %" PRId32 ")", header.data_type, TYPE_COUNT - 1);
+    RWKV_ASSERT_FALSE_MSG(
+        RWKV_ERROR_DATA_TYPE,
+        rwkv_type_to_ggml[header.data_type] != GGML_TYPE_UNKNOWN,
+        "Tensor data type (%s) is no longer supported",
+        rwkv_type_to_string[header.data_type]
+    );

    if (header.dim_count == 2) {
        RWKV_ASSERT_FALSE(RWKV_ERROR_FILE_READ, rwkv_fread_uint32(file, header.height));

    return true;
}

bool rwkv_fskip_tensor_data(FILE * file, const struct rwkv_tensor_header & header) {
+    return fseek(file, header.key_length + header.size(), SEEK_CUR) == 0;
}

bool rwkv_fread_tensor_header_and_skip(FILE * file, struct rwkv_tensor_header & header) {
}

bool rwkv_fread_tensor_data(FILE * file, struct rwkv_tensor & output, void * buffer = NULL) {
+    size_t data_size = output.header.size();
    RWKV_ASSERT_FALSE(RWKV_ERROR_FILE_READ, rwkv_fread_string(file, output.header.key_length, output.name));

    if (buffer) {

    return true;
}

+bool rwkv_fread_ggml_tensor_data(FILE * file, const struct rwkv_tensor_header & header, struct ggml_context * ctx, std::string & name, struct ggml_tensor *& tensor) {
+    RWKV_ASSERT_FALSE_MSG(RWKV_ERROR_FILE_READ, rwkv_fread_string(file, header.key_length, name), "Failed to read tensor name");
+
+    enum ggml_type ggml_type = rwkv_type_to_ggml[header.data_type];
+    RWKV_ASSERT_FALSE_MSG(RWKV_ERROR_UNSUPPORTED, ggml_type != GGML_TYPE_UNKNOWN, "Unsupported tensor data type %s from %s", rwkv_type_to_string[header.data_type], name.c_str());
+
+    tensor = header.dim_count == 1
+        ? ggml_new_tensor_1d(ctx, ggml_type, header.width)
+        : ggml_new_tensor_2d(ctx, ggml_type, header.width, header.height);
+
+    RWKV_ASSERT_FALSE_MSG(RWKV_ERROR_ALLOC, tensor, "Failed to allocate tensor");
+    ggml_set_name(tensor, name.c_str());
+
+    RWKV_ASSERT_FALSE_MSG(RWKV_ERROR_FILE_READ, rwkv_fread_data(file, ggml_nbytes(tensor), tensor->data), "Failed to read tensor data from %s", name.c_str());
+    return true;
+}
+
+bool rwkv_fread_ggml_tensor(FILE * file, struct ggml_context * ctx, std::string & name, struct ggml_tensor *& tensor) {
+    struct rwkv_tensor_header header;
+    RWKV_ENSURE_OR_FALSE_MSG(rwkv_fread_tensor_header(file, header), "Invalid tensor header");
+    return rwkv_fread_ggml_tensor_data(file, header, ctx, name, tensor);
+}
+
bool rwkv_fwrite_tensor(FILE * file, const struct rwkv_tensor & tensor) {
    RWKV_ENSURE_OR_FALSE(rwkv_fwrite_tensor_header(file, tensor.header));
    RWKV_ENSURE_OR_FALSE(rwkv_fwrite_string(file, tensor.name));
+    RWKV_ENSURE_OR_FALSE(rwkv_fwrite_data(file, tensor.data, tensor.header.size()));
    return true;
}

    struct ggml_tensor * ln0_weight;
    struct ggml_tensor * ln0_bias;

+    std::unique_ptr<struct rwkv_layer[]> layers;

    struct ggml_tensor * ln_out_weight;
    struct ggml_tensor * ln_out_bias;

struct ggml_tensor * rwkv_layer_norm(ggml_context * ctx, struct ggml_tensor * x, struct ggml_tensor * weight, struct ggml_tensor * bias) {
    // LayerNorm in RWKV is `x = (x - mean(x)) / sqrt(variance(x) + 1e-5) * weight + bias`
    // Looks like ggml_norm does the first part, we only need to apply weight & bias.
+    return ggml_add_inplace(ctx, ggml_mul_inplace(ctx, ggml_norm(ctx, x), weight), bias);
}

// --- Implementation ---

+// Used as a helper during rwkv_ctx_size calculation.
+struct rwkv_future_tensor;
+
+// Used to calculate the memory usage of ggml contexts before allocating them.
+// Since ggml uses an internal bump allocator that can't be grown at runtime, we need to ensure we have enough space,
+// while at the same time not using more memory than necessary.
+struct rwkv_future_ctx {
    size_t objects_count = 0;
+    size_t memory_size = 0;
    size_t scratch_size = 0;
+
+    // Align to GGML_MEM_ALIGN, which can currently be up to 16
+    static const size_t align(const size_t size) {
+        return ((size + 15) & ~15);
+    }
+
+    void add_objects(const size_t size, const size_t count = 1) {
+        this->objects_count += count;
+
+        if (size && count) {
+            this->add_memory(size, count);
+        }
+    }
+
+    void add_memory(const size_t size, const size_t count = 1) {
+        this->memory_size += this->align(size) * count;
+    }
+
+    void add_scratch(const size_t size, const size_t count = 1) {
+        this->scratch_size += this->align(size) * count;
+    }
+
+    void add_data(const bool use_scratch, const size_t size, const size_t count = 1) {
+        if (use_scratch) {
+            this->add_scratch(size, count);
+        } else {
+            this->add_memory(size, count);
+        }
+    }
+
+    struct rwkv_future_tensor declare(const enum ggml_type type, const uint64_t width, const uint64_t height = 1);
+
+    struct rwkv_future_tensor alloc(const enum ggml_type type, const uint64_t width, const uint64_t height = 1, const bool use_scratch = true);
+};
+
+struct rwkv_future_tensor {
+    enum ggml_type type = GGML_TYPE_COUNT;
+    uint64_t width = 0;
+    uint64_t height = 0;
+
+    static const size_t size(const enum ggml_type type, const uint64_t width, const uint64_t height) {
+        struct ggml_tensor decoy {};
+        decoy.type = type;
+        decoy.ne[0] = width;
+        decoy.ne[1] = height;
+        decoy.ne[2] = 1;
+        decoy.ne[3] = 1;
+        return ggml_nbytes(&decoy);
+    }
+
+    rwkv_future_tensor() {}
+    rwkv_future_tensor(const enum ggml_type type, const uint64_t width, const uint64_t height = 1): type(type), width(width), height(height) {}
+    rwkv_future_tensor(const struct ggml_tensor * ref): type(ref->type), width(ref->ne[0]), height(ref->ne[1]) {}
+
+    struct rwkv_future_tensor alloc(struct rwkv_future_ctx & ctx, const bool use_scratch = true) const {
+        ctx.add_objects(sizeof(struct ggml_tensor));
+        ctx.add_data(use_scratch, rwkv_future_tensor::size(type, width, height));
+        return *this;
+    }
+
+    struct rwkv_future_tensor view(struct rwkv_future_ctx & ctx) const {
+        ctx.add_objects(sizeof(struct ggml_tensor));
+        return *this;
+    }
+
+    struct rwkv_future_tensor subview(struct rwkv_future_ctx & ctx, const uint32_t width, const uint32_t height = 1) const {
+        ctx.add_objects(sizeof(struct ggml_tensor), 2);
+        ctx.add_memory(sizeof(uint32_t) * 2);
+        return rwkv_future_tensor(type, width, height);
+    }
+
+    struct rwkv_future_tensor dup(struct rwkv_future_ctx & ctx) const {
+        return this->alloc(ctx);
+    }
+
+    struct rwkv_future_tensor layer_norm(struct rwkv_future_ctx & ctx, const struct rwkv_future_tensor & weight, const struct rwkv_future_tensor & bias) const {
+        return this->dup(ctx).view(ctx).view(ctx);
+    }
+
+    struct rwkv_future_tensor repeat(struct rwkv_future_ctx & ctx, const struct rwkv_future_tensor reference) const {
+        return reference.dup(ctx);
+    }
+
+    struct rwkv_future_tensor set_inplace(struct rwkv_future_ctx & ctx, const struct rwkv_future_tensor src) {
+        ctx.add_objects(sizeof(struct ggml_tensor));
+        ctx.add_memory(sizeof(uint32_t) * 5);
+        return this->view(ctx);
+    }
+
+    struct rwkv_future_tensor consume(struct rwkv_future_ctx & ctx, const struct rwkv_future_tensor & other) {
+        return this->view(ctx);
+    }
+
+    struct rwkv_future_tensor combine(struct rwkv_future_ctx & ctx, const struct rwkv_future_tensor & other) const {
+        return this->dup(ctx);
+    }
+
+    struct rwkv_future_tensor fn(struct rwkv_future_ctx & ctx) const {
+        ctx.add_objects(sizeof(struct ggml_tensor));
+        ctx.add_memory(sizeof(void *) / sizeof(uint32_t));
+        return this->dup(ctx);
+    }
+
+    struct rwkv_future_tensor mul_mat(struct rwkv_future_ctx & ctx, const struct rwkv_future_tensor & other) const {
+        return ctx.alloc(GGML_TYPE_F32, this->height, other.height);
+    }
+
+    struct rwkv_future_tensor get_rows(struct rwkv_future_ctx & ctx, const struct rwkv_future_tensor & other) const {
+        return ctx.alloc(GGML_TYPE_F32, this->width, other.width);
+    }
};

+const size_t rwkv_tensor_header::size() const {
+    return rwkv_future_tensor::size(rwkv_type_to_ggml[this->data_type], this->width, this->height);
+}
+
+struct rwkv_future_tensor rwkv_future_ctx::declare(const enum ggml_type type, const uint64_t width, const uint64_t height) {
+    return rwkv_future_tensor(type, width, height);
+}
+
+struct rwkv_future_tensor rwkv_future_ctx::alloc(const enum ggml_type type, const uint64_t width, const uint64_t height, const bool use_scratch) {
+    return this->declare(type, width, height).alloc(*this, use_scratch);
+}
+
struct rwkv_ggml_context {
+    std::unique_ptr<uint8_t[]> scratch;
    struct ggml_context * ctx;

    rwkv_ggml_context(): ctx(NULL) {}

+    rwkv_ggml_context(const struct rwkv_future_ctx future_ctx): ctx(NULL) {
+        scratch.reset(new(std::nothrow) uint8_t[future_ctx.scratch_size]);

        if (!scratch) {
            return;

        const size_t memory_required_overhead = size_t(128) * 1024 * 1024;
        const size_t memory_required_overhead_sc = size_t(64) * 1024 * 1024;

+        ctx = ggml_init({ future_ctx.objects_count * GGML_OBJECT_SIZE + future_ctx.memory_size + memory_required_overhead, NULL, false});

        if (!ctx) {
            return;
        }

+        ggml_set_scratch(ctx, { 0, memory_required_overhead_sc + future_ctx.scratch_size, scratch.get() });
    }

    struct rwkv_ggml_context & operator=(struct rwkv_ggml_context && source) {

    struct rwkv_ggml_context ctx;
    struct rwkv_model model;

+    // TODO Come up with a better solution to estimate "work tensor" size
    // The ggml_cgraph allocates a "work tensor" the first time it is used.
    // Currently, the height of blocks.0.ffn.key.weight is the bottleneck in our implementation of RWKV.
    // Since it is the largest dimension used in any matrix multiply, it is the size used for the "work tensor".

// The hidden state of a single RWKV layer.
// These are mostly used for dividing up the input state, and writing portions of the output state.
+// But they're also used in building the computation graphs to represent the operations
+// used from input->output (operating "in place" on a rwkv_layer_state).
struct rwkv_layer_state {
    struct ggml_tensor * ffn_xx;
    struct ggml_tensor * att_xx;

    struct ggml_tensor * att_pp;
};

+// Holds a single computation graph and its ggml context.
// Graphs each have their own context so that they can be individually freed and rebuilt.
// Graphs read hidden state from the rwkv_context and then write it back to the rwkv_context.
// (see rwkv_context.input_layers and rwkv_context.output_layers)

    // ggml_cgraph is so large that it can cause stack overflows if not stored on the heap
    std::unique_ptr<struct ggml_cgraph> cgraph;
+
+    size_t pre_logits_nodes;
+    size_t pre_logits_leafs;
+    size_t post_logits_nodes;
+    size_t post_logits_leafs;
};

// RWKV context for a specific instance.

    // Reused by all graphs.
    struct rwkv_ggml_context ctx;
    struct ggml_tensor * input_state;
+    std::unique_ptr<struct rwkv_layer_state[]> input_layers;
    struct ggml_tensor * output_state;
+    std::unique_ptr<struct rwkv_layer_state[]> output_layers;
    struct ggml_tensor * logits;

    uint32_t n_threads;

    float * logits_out = 0; //stores address of output logit buffer

    size_t gpu_layers;
};

+// https://stackoverflow.com/a/6458689
+template<typename F>
bool rwkv_set_params(struct rwkv_model & model, F callback) {
    RWKV_ENSURE_OR_FALSE(callback("emb.weight", model.emb));
    RWKV_ENSURE_OR_FALSE(callback("blocks.0.ln0.weight", model.ln0_weight));
    RWKV_ENSURE_OR_FALSE(callback("blocks.0.ln0.bias", model.ln0_bias));

    uint32_t n_layer = model.header.n_layer;
+    std::unique_ptr<struct rwkv_layer[]> layers(new(std::nothrow) struct rwkv_layer[n_layer]);
    RWKV_ASSERT_FALSE_MSG(RWKV_ERROR_ALLOC, layers.get(), "Failed to allocate model layers");
    model.layers = std::move(layers);

    return true;
}
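The rwkv_future_ctx / rwkv_future_tensor machinery above does a dry run of the graph to total up object overhead and tensor data before the real ggml context is created with exactly that much memory plus a fixed overhead. A generic hedged sketch of the same estimate-then-allocate pattern, with made-up sizes, assuming the ggml vintage used in this tree (where tensors created after ggml_set_scratch place their data in the scratch buffer):

#include <memory>
#include "ggml.h"

int main() {
    const size_t n_tensors    = 2;
    const size_t tensor_bytes = 1024 * 1024 * sizeof(float);   // illustrative per-tensor data size
    const size_t objects      = n_tensors * (GGML_OBJECT_SIZE + sizeof(struct ggml_tensor));

    // Scratch buffer owned outside of ggml, sized from the estimate.
    std::unique_ptr<uint8_t[]> scratch(new uint8_t[n_tensors * tensor_bytes]);

    struct ggml_context * ctx = ggml_init({ objects + 1024 * 1024, NULL, false });
    ggml_set_scratch(ctx, { 0, n_tensors * tensor_bytes, scratch.get() });

    // Tensor data now lands in the scratch buffer instead of the main pool.
    ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1024 * 1024);
    ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1024 * 1024);

    ggml_free(ctx);
    return 0;
}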
+void rwkv_future_carry_x(struct rwkv_future_ctx & ctx,
+    const struct rwkv_future_tensor weight,
+    const struct rwkv_future_tensor bias,
+    struct rwkv_future_tensor & x,
+    struct rwkv_future_tensor & x_prev,
+    struct rwkv_future_tensor & carry
+) {
+    if (x.height == 1) {
+        x = x.layer_norm(ctx, weight, bias);
+        x_prev = carry;
+        carry = x;
    } else {
+        x = x.layer_norm(ctx, weight.repeat(ctx, x), bias.repeat(ctx, x));
+
+        x_prev = x.dup(ctx)
+            .set_inplace(ctx, carry)
+            .set_inplace(ctx, x.subview(ctx, x.width, x.height - 1));
+
+        carry = x.subview(ctx, x.width);
    }
}

+void rwkv_carry_x(struct ggml_context * ctx,
+    struct ggml_tensor * weight,
+    struct ggml_tensor * bias,
+    struct ggml_tensor *& x,
+    struct ggml_tensor *& x_prev,
+    struct ggml_tensor *& carry
+) {
+    const size_t n_embed = x->ne[0];
+    const size_t sequence_len = x->ne[1];

    if (sequence_len == 1) {
        // self.layer_norm(x, self.w.blocks[i].ln2)
        x = rwkv_layer_norm(ctx, x, weight, bias);

        // xx = state[5*i+0]
+        x_prev = carry;

        // state[5*i+0] = x
+        carry = x;
    } else {
        // self.layer_norm(x, self.w.blocks[i].ln2)
        x = rwkv_layer_norm(ctx, x, ggml_repeat(ctx, weight, x), ggml_repeat(ctx, bias, x));

        // xx = torch.cat((state[5*i+0].to(dtype=self.FLOAT_MODE).unsqueeze(0), x[:-1,:]))
+        x_prev = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embed, sequence_len);
+        x_prev = ggml_set_1d_inplace(ctx, x_prev, carry, 0);
+        x_prev = ggml_set_1d_inplace(ctx, x_prev, ggml_view_1d(ctx, x, n_embed * (sequence_len - 1), 0), n_embed * sizeof(float));

        // state[5*i+0] = x[-1,:]
+        carry = ggml_view_1d(ctx, x, n_embed, n_embed * (sequence_len - 1) * sizeof(float));
    }
}

+void rwkv_future_att_rkv(struct rwkv_future_ctx & ctx,
+    const struct rwkv_future_tensor time_mix_k,
+    const struct rwkv_future_tensor time_mix_v,
+    const struct rwkv_future_tensor time_mix_r,
+    const struct rwkv_future_tensor x,
+    const struct rwkv_future_tensor x_prev,
+    const struct rwkv_future_tensor att_r,
+    const struct rwkv_future_tensor att_k,
+    const struct rwkv_future_tensor att_v,
+    struct rwkv_future_tensor & r,
+    struct rwkv_future_tensor & k,
+    struct rwkv_future_tensor & v
+) {
+    const struct rwkv_future_tensor xk = x.combine(ctx, time_mix_k).consume(ctx, x_prev.combine(ctx, time_mix_k.fn(ctx)));
+    const struct rwkv_future_tensor xv = x.combine(ctx, time_mix_v).consume(ctx, x_prev.combine(ctx, time_mix_v.fn(ctx)));
+    const struct rwkv_future_tensor xr = x.combine(ctx, time_mix_r).consume(ctx, x_prev.combine(ctx, time_mix_r.fn(ctx)));

+    r = att_r.mul_mat(ctx, xr).fn(ctx);
+    k = att_k.mul_mat(ctx, xk);
+    v = att_v.mul_mat(ctx, xv);
}

+void rwkv_att_rkv(
+    struct ggml_context * ctx,
+    struct rwkv_layer layer,
+    struct ggml_tensor * x,
+    struct ggml_tensor * x_prev,
+    struct ggml_tensor *& r,
+    struct ggml_tensor *& k,
+    struct ggml_tensor *& v
+) {
    // xk = x * time_mix_k + state[5 * i + 1] * (1 - time_mix_k)
    struct ggml_tensor * xk = ggml_add_inplace(ctx,
+        ggml_mul(ctx, x, layer.att_time_mix_k),
+        ggml_mul(ctx, x_prev, rwkv_1_minus_x(ctx, layer.att_time_mix_k))
    );

    // xv = x * time_mix_v + state[5 * i + 1] * (1 - time_mix_v)
    struct ggml_tensor * xv = ggml_add_inplace(ctx,
+        ggml_mul(ctx, x, layer.att_time_mix_v),
+        ggml_mul(ctx, x_prev, rwkv_1_minus_x(ctx, layer.att_time_mix_v))
    );

    // xr = x * time_mix_r + state[5 * i + 1] * (1 - time_mix_r)
    struct ggml_tensor * xr = ggml_add_inplace(ctx,
+        ggml_mul(ctx, x, layer.att_time_mix_r),
+        ggml_mul(ctx, x_prev, rwkv_1_minus_x(ctx, layer.att_time_mix_r))
    );

    // r = torch.sigmoid(rw @ xr)

    v = ggml_mul_mat(ctx, layer.att_value, xv);
}
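rwkv_carry_x and rwkv_att_rkv above implement the token shift: each of xk, xv and xr blends the current input x with the previous token's input x_prev using a learned per-channel mix. A scalar sketch of that interpolation (the mixed struct and function names are illustrative only):

#include <vector>

struct mixed { std::vector<float> xk, xv, xr; };

// xk/xv/xr = x * mix + x_prev * (1 - mix), per channel.
mixed time_mix(const std::vector<float> & x, const std::vector<float> & x_prev,
               const std::vector<float> & mix_k, const std::vector<float> & mix_v,
               const std::vector<float> & mix_r) {
    const size_t n = x.size();
    mixed m { std::vector<float>(n), std::vector<float>(n), std::vector<float>(n) };
    for (size_t i = 0; i < n; i++) {
        m.xk[i] = x[i] * mix_k[i] + x_prev[i] * (1.0f - mix_k[i]);
        m.xv[i] = x[i] * mix_v[i] + x_prev[i] * (1.0f - mix_v[i]);
        m.xr[i] = x[i] * mix_r[i] + x_prev[i] * (1.0f - mix_r[i]);
    }
    return m;
}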
+struct rwkv_future_tensor rwkv_future_att_wkv(struct rwkv_future_ctx & ctx,
+    const struct rwkv_future_tensor time_first,
+    const struct rwkv_future_tensor time_decay,
+    struct rwkv_future_tensor & aa,
+    struct rwkv_future_tensor & bb,
+    struct rwkv_future_tensor & pp,
+    const struct rwkv_future_tensor k,
+    const struct rwkv_future_tensor v
+) {
+    struct rwkv_future_tensor ww = time_first.combine(ctx, k);
+    struct rwkv_future_tensor qq = pp.fn(ctx);
+    struct rwkv_future_tensor e1 = pp.combine(ctx, qq).fn(ctx);
+    struct rwkv_future_tensor e2 = ww.combine(ctx, qq).fn(ctx);
+
+    struct rwkv_future_tensor a = e1.combine(ctx, aa).consume(ctx, e2.combine(ctx, v));
+    struct rwkv_future_tensor b = e1.combine(ctx, bb).consume(ctx, e2);
+
+    ww = pp.combine(ctx, time_decay);
+    qq = ww.fn(ctx);
+    e1 = ww.combine(ctx, qq).fn(ctx);
+    e2 = k.combine(ctx, qq).fn(ctx);
+
+    // aa, bb
+    aa = e1.combine(ctx, aa).consume(ctx, e2.combine(ctx, v));
+    bb = e1.combine(ctx, bb).consume(ctx, e2);
+    pp = qq;

+    // wkv
+    return a.combine(ctx, b);
}

+struct ggml_tensor * rwkv_att_wkv(
+    struct ggml_context * ctx,
+    struct ggml_tensor * att_time_first,
+    struct ggml_tensor * att_time_decay,
+    struct ggml_tensor * k,
+    struct ggml_tensor * v,
+    struct ggml_tensor *& aa,
+    struct ggml_tensor *& bb,
+    struct ggml_tensor *& pp
+) {
    // ww = time_first + k
    struct ggml_tensor * ww = ggml_add(ctx, att_time_first, k);
    // qq = torch.maximum(pp, ww)

    return ggml_div(ctx, a, b);
}

+struct rwkv_future_tensor rwkv_future_att(struct rwkv_future_ctx & ctx,
+    const struct rwkv_future_tensor ln1_weight,
+    const struct rwkv_future_tensor ln1_bias,
+    const struct rwkv_future_tensor time_mix_k,
+    const struct rwkv_future_tensor time_mix_v,
+    const struct rwkv_future_tensor time_mix_r,
+    const struct rwkv_future_tensor time_first,
+    const struct rwkv_future_tensor time_decay,
+    const struct rwkv_future_tensor att_r,
+    const struct rwkv_future_tensor att_k,
+    const struct rwkv_future_tensor att_v,
+    const struct rwkv_future_tensor att_output,
+    struct rwkv_future_tensor x,
+    struct rwkv_future_tensor & att_xx,
+    struct rwkv_future_tensor & att_aa,
+    struct rwkv_future_tensor & att_bb,
+    struct rwkv_future_tensor & att_pp
+) {
+    struct rwkv_future_tensor x_prev;
+    rwkv_future_carry_x(ctx, ln1_weight, ln1_bias, x, x_prev, att_xx);
+
+    struct rwkv_future_tensor r, k, v;
+    rwkv_future_att_rkv(ctx, time_mix_k, time_mix_v, time_mix_r, x, x_prev, att_r, att_k, att_v, r, k, v);
+
+    struct rwkv_future_tensor wkv = rwkv_future_att_wkv(ctx, time_first, time_decay, att_aa, att_bb, att_pp, k, v);

+    return att_output.mul_mat(ctx, r.combine(ctx, wkv));
}

struct ggml_tensor * rwkv_att(struct ggml_context * ctx, struct ggml_tensor * x, struct rwkv_layer layer, struct rwkv_layer_state & state) {
+    struct ggml_tensor * x_prev;
+    rwkv_carry_x(ctx, layer.ln1_weight, layer.ln1_bias, x, x_prev, state.att_xx);

    struct ggml_tensor * r, * k, * v;
+    rwkv_att_rkv(ctx, layer, x, x_prev, r, k, v);

    struct ggml_tensor * wkv = rwkv_att_wkv(ctx, layer.att_time_first, layer.att_time_decay, k, v, state.att_aa, state.att_bb, state.att_pp);

    return ggml_mul_mat(ctx, layer.att_output, ggml_mul(ctx, r, wkv));
}

+struct rwkv_future_tensor rwkv_future_ffn(struct rwkv_future_ctx & ctx,
+    const struct rwkv_future_tensor ln2_weight,
+    const struct rwkv_future_tensor ln2_bias,
+    const struct rwkv_future_tensor time_mix_k,
+    const struct rwkv_future_tensor time_mix_r,
+    const struct rwkv_future_tensor ffn_k,
+    const struct rwkv_future_tensor ffn_v,
+    const struct rwkv_future_tensor ffn_r,
+    struct rwkv_future_tensor x,
+    struct rwkv_future_tensor & ffn_xx
+) {
+    struct rwkv_future_tensor x_prev;
+    rwkv_future_carry_x(ctx, ln2_weight, ln2_bias, x, x_prev, ffn_xx);

+    struct rwkv_future_tensor xk = x.combine(ctx, time_mix_k).consume(ctx, x_prev.combine(ctx, time_mix_k.fn(ctx)));
+    struct rwkv_future_tensor xr = x.combine(ctx, time_mix_r).consume(ctx, x_prev.combine(ctx, time_mix_r.fn(ctx)));

+    struct rwkv_future_tensor r = ffn_r.mul_mat(ctx, xr).fn(ctx);
+    struct rwkv_future_tensor k = ffn_k.mul_mat(ctx, xk).view(ctx).view(ctx);

+    return r.consume(ctx, ffn_v.mul_mat(ctx, k));
}

struct ggml_tensor * rwkv_ffn(struct ggml_context * ctx, struct ggml_tensor * x, struct rwkv_layer layer, struct rwkv_layer_state & state) {
+    struct ggml_tensor * x_prev;
+    rwkv_carry_x(ctx, layer.ln2_weight, layer.ln2_bias, x, x_prev, state.ffn_xx);

    // xk = x * time_mix_k + state[5 * i + 1] * (1 - time_mix_k)
    // xk = x * time_mix_k + state[5 * i + 0] * (1 - time_mix_k)
    struct ggml_tensor * xk = ggml_add_inplace(
        ctx,
+        ggml_mul(ctx, x, layer.ffn_time_mix_k),
+        ggml_mul(ctx, x_prev, rwkv_1_minus_x(ctx, layer.ffn_time_mix_k))
    );

    // xr = x * time_mix_r + state[5 * i + 0] * (1 - time_mix_r)
    struct ggml_tensor * xr = ggml_add_inplace(
        ctx,
+        ggml_mul(ctx, x, layer.ffn_time_mix_r),
+        ggml_mul(ctx, x_prev, rwkv_1_minus_x(ctx, layer.ffn_time_mix_r))
    );

    // r = torch.sigmoid(rw @ xr)
    struct ggml_tensor * r = rwkv_sigmoid(ctx, ggml_mul_mat(ctx, layer.ffn_receptance, xr));

    // k = torch.square(torch.relu(kw @ xk))
+    struct ggml_tensor * k = ggml_sqr_inplace(ctx, ggml_relu_inplace(ctx, ggml_mul_mat(ctx, layer.ffn_key, xk)));

    // r * (vw @ k)
+    return ggml_mul_inplace(ctx, r, ggml_mul_mat(ctx, layer.ffn_value, k));
}
struct rwkv_future_tensor rwkv_future_graph_work(struct rwkv_future_ctx & ctx,
|
1056 |
+
const enum ggml_type type,
|
1057 |
+
const size_t ffn_key_height,
|
1058 |
+
const size_t n_threads,
|
1059 |
+
const size_t sequence_len = 1
|
1060 |
+
) {
|
1061 |
+
enum ggml_type mul_mat_type = ggml_is_quantized(type) ? GGML_TYPE_Q8_1 : type;
|
1062 |
+
return ctx.alloc(GGML_TYPE_I8, rwkv_future_tensor::size(mul_mat_type, ffn_key_height, sequence_len) * n_threads + 64 * (n_threads - 1));
|
1063 |
+
}
|
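The rwkv_future_* functions above implement a dry-run pass: graph construction is replayed against a context that only records how many tensors will be created and how many bytes they will need, and the real ggml_context is then allocated to exactly that size. A reduced stand-in for the idea is sketched below (not part of the diff; the real rwkv_future_ctx/rwkv_future_tensor also model shapes, views and scratch space).

#include <cstddef>

// Simplified size-accounting context; alloc() is called wherever the real code
// would create a ggml tensor, so `objects` and `bytes` end up describing the
// memory the eventual ggml_init() call must provide.
struct dry_run_ctx {
    size_t objects = 0;
    size_t bytes   = 0;

    void alloc(size_t nbytes) {
        objects += 1;
        bytes   += nbytes;
    }

    // Total to request from ggml_init(): payload plus per-tensor bookkeeping
    // (ggml_tensor_overhead() in the real code).
    size_t required(size_t per_tensor_overhead) const {
        return bytes + objects * per_tensor_overhead;
    }
};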
1064 |
+
|
1065 |
+
struct rwkv_future_tensor rwkv_future_serial_graph(struct rwkv_future_ctx & ctx,
|
1066 |
+
const struct rwkv_future_tensor tokens,
|
1067 |
+
const size_t n_threads,
|
1068 |
+
|
1069 |
+
const struct rwkv_future_tensor emb,
|
1070 |
+
const struct rwkv_future_tensor ln0_weight,
|
1071 |
+
const struct rwkv_future_tensor ln0_bias,
|
1072 |
+
|
1073 |
+
const size_t n_layer,
|
1074 |
+
|
1075 |
+
const struct rwkv_future_tensor ln1_weight,
|
1076 |
+
const struct rwkv_future_tensor ln1_bias,
|
1077 |
+
const struct rwkv_future_tensor att_time_mix_k,
|
1078 |
+
const struct rwkv_future_tensor att_time_mix_v,
|
1079 |
+
const struct rwkv_future_tensor att_time_mix_r,
|
1080 |
+
const struct rwkv_future_tensor att_time_first,
|
1081 |
+
const struct rwkv_future_tensor att_time_decay,
|
1082 |
+
const struct rwkv_future_tensor att_r,
|
1083 |
+
const struct rwkv_future_tensor att_k,
|
1084 |
+
const struct rwkv_future_tensor att_v,
|
1085 |
+
const struct rwkv_future_tensor att_output,
|
1086 |
+
struct rwkv_future_tensor & att_xx,
|
1087 |
+
struct rwkv_future_tensor & att_aa,
|
1088 |
+
struct rwkv_future_tensor & att_bb,
|
1089 |
+
struct rwkv_future_tensor & att_pp,
|
1090 |
+
|
1091 |
+
const struct rwkv_future_tensor ln2_weight,
|
1092 |
+
const struct rwkv_future_tensor ln2_bias,
|
1093 |
+
const struct rwkv_future_tensor ffn_time_mix_k,
|
1094 |
+
const struct rwkv_future_tensor ffn_time_mix_r,
|
1095 |
+
const struct rwkv_future_tensor ffn_k,
|
1096 |
+
const struct rwkv_future_tensor ffn_v,
|
1097 |
+
const struct rwkv_future_tensor ffn_r,
|
1098 |
+
struct rwkv_future_tensor & ffn_xx,
|
1099 |
+
|
1100 |
+
const struct rwkv_future_tensor ln_out_weight,
|
1101 |
+
const struct rwkv_future_tensor ln_out_bias,
|
1102 |
+
const struct rwkv_future_tensor head
|
1103 |
+
) {
|
1104 |
+
struct rwkv_future_tensor x = emb.get_rows(ctx, tokens).layer_norm(ctx, ln0_weight, ln0_bias);
|
1105 |
|
1106 |
+
for (size_t i = 0; i < n_layer; i++) {
|
1107 |
+
x = x.consume(ctx, rwkv_future_att(ctx,
|
1108 |
+
ln1_weight, ln1_bias, att_time_mix_k, att_time_mix_v, att_time_mix_r, att_time_first, att_time_decay,
|
1109 |
+
att_r, att_k, att_v, att_output, x, att_xx, att_aa, att_bb, att_pp));
|
1110 |
+
|
1111 |
+
x = x.consume(ctx, rwkv_future_ffn(ctx,
|
1112 |
+
ln2_weight, ln2_bias, ffn_time_mix_k, ffn_time_mix_r, ffn_k, ffn_v, ffn_r, x, ffn_xx));
|
1113 |
+
|
1114 |
+
ffn_xx.view(ctx);
|
1115 |
+
att_xx.view(ctx);
|
1116 |
+
att_aa.view(ctx);
|
1117 |
+
att_bb.view(ctx);
|
1118 |
+
att_pp.view(ctx);
|
1119 |
+
}
|
1120 |
|
1121 |
+
x = x.layer_norm(ctx, ln_out_weight, ln_out_bias);
|
1122 |
|
1123 |
+
rwkv_future_graph_work(ctx, ffn_k.type, ffn_k.height, n_threads, tokens.width);
|
|
|
1124 |
|
1125 |
+
return head.mul_mat(ctx, x).view(ctx);
|
1126 |
}
|
1127 |
|
1128 |
bool rwkv_build_serial_graph(
|
|
|
1132 |
struct rwkv_layer_state * inputs,
|
1133 |
struct rwkv_layer_state * outputs,
|
1134 |
struct ggml_tensor * logits,
|
1135 |
+
struct ggml_cgraph * cgraph,
|
|
|
|
|
1136 |
|
1137 |
+
size_t * const pre_logits_nodes,
|
1138 |
+
size_t * const pre_logits_leafs,
|
1139 |
+
size_t * const post_logits_nodes,
|
1140 |
+
size_t * const post_logits_leafs
|
1141 |
+
) {
|
1142 |
// x = self.w.emb.weight[token]
|
1143 |
struct ggml_tensor * x = ggml_get_rows(ctx, model.emb, tokens);
|
1144 |
|
|
|
1160 |
ggml_build_forward_expand(cgraph, ggml_cpy(ctx, state.att_pp, output.att_pp));
|
1161 |
}
|
1162 |
|
1163 |
+
*pre_logits_nodes = cgraph->n_nodes;
|
1164 |
+
*pre_logits_leafs = cgraph->n_leafs;
|
1165 |
+
|
1166 |
// x = self.layer_norm(x[-1,:], self.w.ln_out)
|
1167 |
x = rwkv_layer_norm(ctx, x, model.ln_out_weight, model.ln_out_bias);
|
1168 |
|
1169 |
// x = (self.w.head.weight @ x).float()
|
1170 |
ggml_build_forward_expand(cgraph, ggml_cpy(ctx, ggml_mul_mat(ctx, model.head, x), logits));
|
1171 |
|
1172 |
+
*post_logits_nodes = cgraph->n_nodes;
|
1173 |
+
*post_logits_leafs = cgraph->n_leafs;
|
1174 |
+
|
1175 |
return true;
|
1176 |
}
|
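The pre_logits_* / post_logits_* counters recorded above are the whole trick behind the logits short-circuit used later in rwkv_eval: the cached graph is never rebuilt, its node and leaf counts are simply lowered back to the values captured before ln_out and the head matmul were added. A tiny illustrative helper (not part of the diff):

#include "ggml.h" // path assumed; provides struct ggml_cgraph

// Truncate a cached graph at the recorded pre-logits position so that
// ggml_graph_compute stops before the output head; restoring the post-logits
// counts re-enables the full graph.
static void set_graph_cutoff(struct ggml_cgraph * graph, size_t nodes, size_t leafs) {
    graph->n_nodes = (int) nodes;
    graph->n_leafs = (int) leafs;
}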
1177 |
|
1178 |
+
struct rwkv_future_tensor rwkv_future_sequence_graph(struct rwkv_future_ctx & ctx,
|
1179 |
+
const struct rwkv_future_tensor tokens,
|
1180 |
+
const size_t n_threads,
|
1181 |
+
|
1182 |
+
const struct rwkv_future_tensor emb,
|
1183 |
+
const struct rwkv_future_tensor ln0_weight,
|
1184 |
+
const struct rwkv_future_tensor ln0_bias,
|
1185 |
+
|
1186 |
+
const size_t n_layer,
|
1187 |
+
|
1188 |
+
const struct rwkv_future_tensor ln1_weight,
|
1189 |
+
const struct rwkv_future_tensor ln1_bias,
|
1190 |
+
const struct rwkv_future_tensor att_time_mix_k,
|
1191 |
+
const struct rwkv_future_tensor att_time_mix_v,
|
1192 |
+
const struct rwkv_future_tensor att_time_mix_r,
|
1193 |
+
const struct rwkv_future_tensor att_time_first,
|
1194 |
+
const struct rwkv_future_tensor att_time_decay,
|
1195 |
+
const struct rwkv_future_tensor att_r,
|
1196 |
+
const struct rwkv_future_tensor att_k,
|
1197 |
+
const struct rwkv_future_tensor att_v,
|
1198 |
+
const struct rwkv_future_tensor att_output,
|
1199 |
+
struct rwkv_future_tensor & att_xx,
|
1200 |
+
struct rwkv_future_tensor & att_aa,
|
1201 |
+
struct rwkv_future_tensor & att_bb,
|
1202 |
+
struct rwkv_future_tensor & att_pp,
|
1203 |
+
|
1204 |
+
const struct rwkv_future_tensor ln2_weight,
|
1205 |
+
const struct rwkv_future_tensor ln2_bias,
|
1206 |
+
const struct rwkv_future_tensor ffn_time_mix_k,
|
1207 |
+
const struct rwkv_future_tensor ffn_time_mix_r,
|
1208 |
+
const struct rwkv_future_tensor ffn_k,
|
1209 |
+
const struct rwkv_future_tensor ffn_v,
|
1210 |
+
const struct rwkv_future_tensor ffn_r,
|
1211 |
+
struct rwkv_future_tensor & ffn_xx,
|
1212 |
+
|
1213 |
+
const struct rwkv_future_tensor ln_out_weight,
|
1214 |
+
const struct rwkv_future_tensor ln_out_bias,
|
1215 |
+
const struct rwkv_future_tensor head
|
1216 |
+
) {
|
1217 |
+
struct rwkv_future_tensor x = emb.get_rows(ctx, tokens);
|
1218 |
+
x = x.layer_norm(ctx, ln0_weight.repeat(ctx, x), ln0_bias.repeat(ctx, x));
|
1219 |
|
1220 |
+
for (size_t i = 0; i < n_layer; i++) {
|
1221 |
+
struct rwkv_future_tensor x0 = x, x_prev;
|
1222 |
+
rwkv_future_carry_x(ctx, ln1_weight, ln1_bias, x0, x_prev, att_xx);
|
1223 |
+
|
1224 |
+
struct rwkv_future_tensor r, k, v;
|
1225 |
+
rwkv_future_att_rkv(ctx, att_time_mix_k, att_time_mix_v, att_time_mix_r, x0, x_prev, att_r, att_k, att_v, r, k, v);
|
1226 |
+
|
1227 |
+
for (size_t i = 0; i < tokens.width; i++) {
|
1228 |
+
struct rwkv_future_tensor kt = k.subview(ctx, emb.width);
|
1229 |
+
struct rwkv_future_tensor vt = v.subview(ctx, emb.width);
|
1230 |
+
struct rwkv_future_tensor xt = x_prev.subview(ctx, emb.width);
|
1231 |
+
struct rwkv_future_tensor wkv = rwkv_future_att_wkv(ctx, att_time_first, att_time_decay, att_aa, att_bb, att_pp, k, v);
|
1232 |
+
wkv.view(ctx);
|
1233 |
+
}
|
1234 |
|
1235 |
+
x = x.consume(ctx, att_output.mul_mat(ctx, r.combine(ctx, x_prev)));
|
1236 |
+
x = x.consume(ctx, rwkv_future_ffn(ctx, ln2_weight, ln2_bias, ffn_time_mix_k, ffn_time_mix_r, ffn_k, ffn_v, ffn_r, x, ffn_xx));
|
|
1237 |
|
1238 |
+
ffn_xx.view(ctx);
|
1239 |
+
att_xx.view(ctx);
|
1240 |
+
att_aa.view(ctx);
|
1241 |
+
att_bb.view(ctx);
|
1242 |
+
att_pp.view(ctx);
|
1243 |
+
}
|
1244 |
|
1245 |
+
x = x.subview(ctx, emb.width).layer_norm(ctx, ln_out_weight, ln_out_bias);
|
1246 |
|
1247 |
+
rwkv_future_graph_work(ctx, ffn_k.type, ffn_k.height, n_threads, tokens.width);
|
|
|
1248 |
|
1249 |
+
return head.mul_mat(ctx, x).view(ctx);
|
1250 |
}
|
1251 |
|
1252 |
bool rwkv_build_sequence_graph(
|
|
|
1256 |
struct rwkv_layer_state * inputs,
|
1257 |
struct rwkv_layer_state * outputs,
|
1258 |
struct ggml_tensor * logits,
|
1259 |
+
struct ggml_cgraph * cgraph,
|
1260 |
+
|
1261 |
+
size_t * const pre_logits_nodes,
|
1262 |
+
size_t * const pre_logits_leafs,
|
1263 |
+
size_t * const post_logits_nodes,
|
1264 |
+
size_t * const post_logits_leafs
|
1265 |
) {
|
1266 |
const uint32_t n_embed = model.header.n_embed;
|
1267 |
const size_t sequence_len = tokens->ne[0];
|
|
|
1273 |
struct rwkv_layer & layer = model.layers[i];
|
1274 |
struct rwkv_layer_state state = inputs[i];
|
1275 |
|
1276 |
+
struct ggml_tensor * x0 = x, * x_prev;
|
1277 |
+
rwkv_carry_x(ctx, layer.ln1_weight, layer.ln1_bias, x0, x_prev, state.att_xx);
|
1278 |
|
1279 |
struct ggml_tensor * r, * k, * v;
|
1280 |
+
rwkv_att_rkv(ctx, layer, x0, x_prev, r, k, v);
|
1281 |
|
1282 |
ggml_build_forward_expand(cgraph, r);
|
1283 |
|
1284 |
for (uint32_t t = 0; t < sequence_len; t++) {
|
1285 |
struct ggml_tensor * kt = ggml_view_1d(ctx, k, n_embed, n_embed * sizeof(float) * t);
|
1286 |
struct ggml_tensor * vt = ggml_view_1d(ctx, v, n_embed, n_embed * sizeof(float) * t);
|
1287 |
+
struct ggml_tensor * xt = ggml_view_1d(ctx, x_prev, n_embed, n_embed * sizeof(float) * t);
|
1288 |
struct ggml_tensor * wkv = rwkv_att_wkv(ctx, layer.att_time_first, layer.att_time_decay, kt, vt, state.att_aa, state.att_bb, state.att_pp);
|
1289 |
ggml_build_forward_expand(cgraph, ggml_cpy(ctx, wkv, xt));
|
1290 |
}
|
1291 |
|
1292 |
+
x = ggml_add_inplace(ctx, x, ggml_mul_mat(ctx, layer.att_output, ggml_mul(ctx, r, x_prev)));
|
1293 |
x = ggml_add_inplace(ctx, x, rwkv_ffn(ctx, x, layer, state));
|
1294 |
|
1295 |
struct rwkv_layer_state & output = outputs[i];
|
|
|
1300 |
ggml_build_forward_expand(cgraph, ggml_cpy(ctx, state.att_pp, output.att_pp));
|
1301 |
}
|
1302 |
|
1303 |
+
*pre_logits_nodes = cgraph->n_nodes;
|
1304 |
+
*pre_logits_leafs = cgraph->n_leafs;
|
1305 |
+
|
1306 |
// x = self.layer_norm(x[-1,:], self.w.ln_out)
|
1307 |
x = rwkv_layer_norm(ctx, ggml_view_1d(ctx, x, n_embed, n_embed * sizeof(float) * (sequence_len - 1)), model.ln_out_weight, model.ln_out_bias);
|
1308 |
|
1309 |
// x = (self.w.head.weight @ x).float()
|
1310 |
ggml_build_forward_expand(cgraph, ggml_cpy(ctx, ggml_mul_mat(ctx, model.head, x), logits));
|
1311 |
|
1312 |
+
*post_logits_nodes = cgraph->n_nodes;
|
1313 |
+
*post_logits_leafs = cgraph->n_leafs;
|
|
|
|
|
1314 |
|
1315 |
+
return true;
|
|
|
1316 |
}
|
1317 |
|
|
1318 |
void rwkv_set_print_errors(struct rwkv_context * ctx, bool print_errors) {
|
1319 |
bool * ptr = ctx ? &ctx->print_errors : &global_print_errors;
|
1320 |
*ptr = print_errors;
|
|
|
1331 |
return value;
|
1332 |
}
|
1333 |
|
1334 |
+
struct rwkv_file {
|
1335 |
+
FILE * file;
|
1336 |
+
|
1337 |
+
rwkv_file(FILE * file): file(file) {}
|
1338 |
+
|
1339 |
+
~rwkv_file() {
|
1340 |
+
if (file) {
|
1341 |
+
fclose(file);
|
1342 |
+
}
|
1343 |
+
}
|
1344 |
+
};
|
1345 |
+
|
1346 |
bool rwkv_instance_from_file(const char * file_path, struct rwkv_instance & instance) {
|
1347 |
struct stat file_stat;
|
1348 |
struct rwkv_model model;
|
|
|
1361 |
|
1362 |
struct rwkv_tensor_header tensor_header;
|
1363 |
std::string name;
|
1364 |
+
struct rwkv_future_ctx future_ctx;
|
1365 |
|
1366 |
while ((size_t) ftell(file.file) < (size_t) file_stat.st_size) {
|
1367 |
RWKV_ASSERT_NULL_MSG(RWKV_ERROR_MODEL_PARAMS, rwkv_fread_tensor_header(file.file, tensor_header), "Invalid tensor header");
|
1368 |
RWKV_ASSERT_NULL_MSG(RWKV_ERROR_MODEL_PARAMS, rwkv_fread_string(file.file, tensor_header.key_length, name), "Failed to read tensor name");
|
1369 |
+
RWKV_ASSERT_NULL_MSG(RWKV_ERROR_FILE | RWKV_ERROR_FILE_READ, fseek(file.file, tensor_header.size(), SEEK_CUR) == 0, "Failed to read tensor data");
|
1370 |
|
1371 |
+
future_ctx.alloc(rwkv_type_to_ggml[tensor_header.data_type], tensor_header.width, tensor_header.height);
|
1372 |
|
1373 |
if (ffn_key_size == 0 && name == "blocks.0.ffn.key.weight") {
|
1374 |
ffn_key_size = tensor_header.height;
|
|
|
1378 |
RWKV_ASSERT_NULL_MSG(RWKV_ERROR_MODEL_PARAMS | RWKV_ERROR_PARAM_MISSING, ffn_key_size, "Model is missing parameter blocks.0.ffn.key.weight");
|
1379 |
RWKV_ASSERT_NULL_MSG(RWKV_ERROR_FILE | RWKV_ERROR_FILE_READ, fseek(file.file, sizeof(struct rwkv_file_header), SEEK_SET) == 0, "Failed to seek in file");
|
1380 |
|
1381 |
+
ctx = future_ctx;
|
1382 |
RWKV_ASSERT_NULL_MSG(RWKV_ERROR_CTX | RWKV_ERROR_ALLOC, ctx.ctx, "Failed to allocate model context");
|
1383 |
|
1384 |
struct ggml_tensor * tensor;
|
|
|
1417 |
const size_t n_embed = header.n_embed;
|
1418 |
const size_t n_layer = header.n_layer;
|
1419 |
|
1420 |
+
struct rwkv_future_ctx future_ctx;
|
1421 |
+
const struct rwkv_future_tensor future_input = future_ctx.alloc(GGML_TYPE_F32, n_embed * 5 * n_layer);
|
1422 |
+
const struct rwkv_future_tensor future_output = future_ctx.alloc(GGML_TYPE_F32, n_embed * 5 * n_layer);
|
1423 |
+
const struct rwkv_future_tensor future_logits = future_ctx.alloc(GGML_TYPE_F32, n_vocab);
|
1424 |
+
|
1425 |
+
for (size_t i = 0; i < n_layer; i++) {
|
1426 |
+
/* ffn_xx */ future_input.subview(future_ctx, n_embed); future_output.subview(future_ctx, n_embed);
|
1427 |
+
/* att_xx */ future_input.subview(future_ctx, n_embed); future_output.subview(future_ctx, n_embed);
|
1428 |
+
/* att_aa */ future_input.subview(future_ctx, n_embed); future_output.subview(future_ctx, n_embed);
|
1429 |
+
/* att_bb */ future_input.subview(future_ctx, n_embed); future_output.subview(future_ctx, n_embed);
|
1430 |
+
/* att_pp */ future_input.subview(future_ctx, n_embed); future_output.subview(future_ctx, n_embed);
|
1431 |
+
}
|
1432 |
|
1433 |
+
struct rwkv_ggml_context ctx(future_ctx);
|
1434 |
RWKV_ASSERT_NULL_MSG(RWKV_ERROR_CTX | RWKV_ERROR_ALLOC, ctx.ctx, "Failed to allocate model context");
|
1435 |
|
1436 |
struct ggml_tensor * input = ggml_new_tensor_1d(ctx.ctx, GGML_TYPE_F32, n_embed * 5 * n_layer);
|
1437 |
struct ggml_tensor * output = ggml_new_tensor_1d(ctx.ctx, GGML_TYPE_F32, n_embed * 5 * n_layer);
|
1438 |
|
1439 |
// We collect parts of input state here. Each part is (n_embed) vector.
|
1440 |
+
std::unique_ptr<struct rwkv_layer_state[]> inputs(new(std::nothrow) struct rwkv_layer_state[n_layer]);
|
1441 |
RWKV_ASSERT_NULL_MSG(RWKV_ERROR_ALLOC, inputs.get(), "Failed to allocate input state parts");
|
1442 |
|
1443 |
// We collect parts of output state here. Each part is (n_embed) vector.
|
1444 |
+
std::unique_ptr<struct rwkv_layer_state[]> outputs(new(std::nothrow) struct rwkv_layer_state[n_layer]);
|
1445 |
RWKV_ASSERT_NULL_MSG(RWKV_ERROR_ALLOC, outputs.get(), "Failed to allocate output state parts");
|
1446 |
|
1447 |
for (size_t i = 0; i < n_layer; i++) {
|
|
|
1462 |
|
1463 |
struct ggml_tensor * logits = ggml_new_tensor_1d(ctx.ctx, GGML_TYPE_F32, n_vocab);
|
1464 |
|
1465 |
+
struct rwkv_future_ctx graph_future_ctx;
|
1466 |
+
const struct rwkv_future_tensor future_token = graph_future_ctx.alloc(GGML_TYPE_I32, 1, 1, false);
|
1467 |
+
|
1468 |
+
const struct rwkv_model & model = instance->model;
|
1469 |
+
const struct rwkv_layer & layer = model.layers[0];
|
1470 |
+
const struct rwkv_layer_state & state = inputs[0];
|
1471 |
+
struct rwkv_future_tensor ffn_xx = state.ffn_xx;
|
1472 |
+
struct rwkv_future_tensor att_xx = state.att_xx;
|
1473 |
+
struct rwkv_future_tensor att_aa = state.att_aa;
|
1474 |
+
struct rwkv_future_tensor att_bb = state.att_bb;
|
1475 |
+
struct rwkv_future_tensor att_pp = state.att_pp;
|
1476 |
+
|
1477 |
+
const struct rwkv_future_tensor future_graph = rwkv_future_serial_graph(graph_future_ctx, future_token, n_threads,
|
1478 |
+
model.emb,
|
1479 |
+
model.ln0_weight, model.ln0_bias,
|
1480 |
+
|
1481 |
+
n_layer,
|
1482 |
+
layer.ln1_weight, layer.ln1_bias,
|
1483 |
+
layer.att_time_mix_k, layer.att_time_mix_v, layer.att_time_mix_r,
|
1484 |
+
layer.att_time_first, layer.att_time_decay,
|
1485 |
+
layer.att_receptance, layer.att_key, layer.att_value, layer.att_output,
|
1486 |
+
att_xx, att_aa, att_bb, att_pp,
|
1487 |
+
|
1488 |
+
layer.ln2_weight, layer.ln2_bias,
|
1489 |
+
layer.ffn_time_mix_k, layer.ffn_time_mix_r,
|
1490 |
+
layer.ffn_key, layer.ffn_value, layer.ffn_receptance,
|
1491 |
+
ffn_xx,
|
1492 |
+
|
1493 |
+
model.ln_out_weight, model.ln_out_bias,
|
1494 |
+
model.head
|
1495 |
+
);
|
1496 |
|
1497 |
struct rwkv_graph serial_graph;
|
1498 |
+
serial_graph.ctx = graph_future_ctx;
|
1499 |
RWKV_ASSERT_NULL_MSG(RWKV_ERROR_CTX | RWKV_ERROR_ALLOC, serial_graph.ctx.ctx, "Failed to allocate serial graph context");
|
1500 |
serial_graph.tokens = ggml_new_i32(serial_graph.ctx.ctx, 0);
|
1501 |
serial_graph.cgraph.reset(new(std::nothrow) struct ggml_cgraph());
|
1502 |
RWKV_ASSERT_NULL_MSG(RWKV_ERROR_ALLOC, serial_graph.cgraph, "Failed to allocate serial graph");
|
1503 |
serial_graph.cgraph->n_threads = n_threads;
|
1504 |
+
|
1505 |
+
RWKV_ASSERT_NULL(RWKV_ERROR_GRAPH, rwkv_build_serial_graph(
|
1506 |
+
serial_graph.ctx.ctx, instance->model,
|
1507 |
+
serial_graph.tokens, inputs.get(), outputs.get(), logits,
|
1508 |
+
serial_graph.cgraph.get(),
|
1509 |
+
&serial_graph.pre_logits_nodes, &serial_graph.pre_logits_leafs, &serial_graph.post_logits_nodes, &serial_graph.post_logits_leafs
|
1510 |
+
));
|
1511 |
|
1512 |
std::unique_ptr<struct rwkv_context> rwkv_ctx(new(std::nothrow) struct rwkv_context());
|
1513 |
RWKV_ASSERT_NULL_MSG(RWKV_ERROR_CTX | RWKV_ERROR_ALLOC, rwkv_ctx, "Failed to allocate rwkv_context");
|
|
|
1544 |
return clone;
|
1545 |
}
|
1546 |
|
1547 |
+
bool rwkv_gpu_offload_layers(struct rwkv_context * ctx, const uint32_t n_layers) {
|
1548 |
|
1549 |
+
return false;
|
1550 |
}
|
1551 |
|
1552 |
void rwkv_set_inputs(const struct rwkv_context * ctx, const float * state_in) {
|
1553 |
if (state_in) {
|
1554 |
memcpy(ctx->input_state->data, state_in, ggml_nbytes(ctx->input_state));
|
1555 |
} else {
|
1556 |
+
rwkv_init_state(ctx, (float *) ctx->input_state->data);
|
|
1557 |
}
|
1558 |
}
|
1559 |
|
|
|
1567 |
}
|
1568 |
}
|
1569 |
|
1570 |
+
bool rwkv_eval(struct rwkv_context * ctx, const uint32_t token, const float * state_in, float * state_out, float * logits_out) {
|
1571 |
+
ctx->last_error = RWKV_ERROR_NONE;
|
1572 |
|
1573 |
const struct rwkv_file_header & header = ctx->instance->model.header;
|
1574 |
const size_t n_vocab = header.n_vocab;
|
1575 |
+
RWKV_CTX_ASSERT_FALSE_MSG(ctx, RWKV_ERROR_ARGS, token < n_vocab, "Token (%" PRId32 ") is out of range (0 .. %zu)", token, n_vocab - 1);
|
1576 |
|
1577 |
rwkv_set_inputs(ctx, state_in);
|
1578 |
ggml_set_i32(ctx->serial_graph.tokens, token);
|
1579 |
+
|
1580 |
+
// Short circuit computation of logits if nobody actually cares
|
1581 |
+
if (!logits_out) {
|
1582 |
+
ctx->serial_graph.cgraph->n_nodes = ctx->serial_graph.pre_logits_nodes;
|
1583 |
+
ctx->serial_graph.cgraph->n_leafs = ctx->serial_graph.pre_logits_leafs;
|
1584 |
+
} else {
|
1585 |
+
ctx->serial_graph.cgraph->n_nodes = ctx->serial_graph.post_logits_nodes;
|
1586 |
+
ctx->serial_graph.cgraph->n_leafs = ctx->serial_graph.post_logits_leafs;
|
1587 |
+
}
|
1588 |
+
|
1589 |
ggml_graph_compute(ctx->serial_graph.ctx.ctx, ctx->serial_graph.cgraph.get());
|
1590 |
rwkv_get_outputs(ctx, state_out, logits_out);
|
1591 |
|
1592 |
return true;
|
1593 |
}
|
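Illustrative usage of the short-circuit above (not part of the diff): when feeding a prompt token by token, logits are only needed for the last position, so passing NULL for every earlier call trims the graph at pre_logits_nodes and skips the head matmul. The rwkv_v3.h header is assumed to be included; ctx, prompt and prompt_len come from the caller.

#include <cstddef>
#include <cstdint>
#include <vector>

static bool feed_prompt_serial(struct rwkv_context * ctx,
                               const uint32_t * prompt, size_t prompt_len,
                               std::vector<float> & state, std::vector<float> & logits) {
    state.resize(rwkv_get_state_len(ctx));
    logits.resize(rwkv_get_logits_len(ctx));
    rwkv_init_state(ctx, state.data()); // equivalent to passing NULL as state_in once

    for (size_t i = 0; i < prompt_len; i++) {
        const bool last = (i + 1 == prompt_len);
        // Logits are requested only where they are actually used.
        if (!rwkv_eval(ctx, prompt[i], state.data(), state.data(),
                       last ? logits.data() : NULL)) {
            return false;
        }
    }
    return true;
}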
1594 |
|
1595 |
+
bool rwkv_eval_sequence(struct rwkv_context * ctx, const uint32_t * sequence, const size_t sequence_len, const float * state_in, float * state_out, float * logits_out) {
|
1596 |
+
ctx->last_error = RWKV_ERROR_NONE;
|
1597 |
|
1598 |
const struct rwkv_file_header & header = ctx->instance->model.header;
|
1599 |
const size_t n_vocab = header.n_vocab;
|
|
|
1603 |
if (sequence) {
|
1604 |
for (size_t i = 0; i < sequence_len; i++) {
|
1605 |
const uint32_t token = sequence[i];
|
1606 |
+
RWKV_CTX_ASSERT_FALSE_MSG(ctx, RWKV_ERROR_ARGS, token < n_vocab, "Token at index %zu (%" PRId32 ") is out of range (0 .. %zu)", i, token, n_vocab - 1);
|
1607 |
}
|
1608 |
}
|
1609 |
|
1610 |
if (ctx->sequence_len != sequence_len) {
|
1611 |
// Build new sequence graph
|
1612 |
+
|
1613 |
+
struct rwkv_future_ctx graph_future_ctx;
|
1614 |
+
const struct rwkv_future_tensor future_tokens = graph_future_ctx.alloc(GGML_TYPE_I32, sequence_len);
|
1615 |
+
|
1616 |
+
const struct rwkv_model & model = ctx->instance->model;
|
1617 |
+
const struct rwkv_layer & layer = model.layers[0];
|
1618 |
+
const struct rwkv_layer_state & state = ctx->input_layers[0];
|
1619 |
+
struct rwkv_future_tensor ffn_xx = state.ffn_xx;
|
1620 |
+
struct rwkv_future_tensor att_xx = state.att_xx;
|
1621 |
+
struct rwkv_future_tensor att_aa = state.att_aa;
|
1622 |
+
struct rwkv_future_tensor att_bb = state.att_bb;
|
1623 |
+
struct rwkv_future_tensor att_pp = state.att_pp;
|
1624 |
+
|
1625 |
+
const struct rwkv_future_tensor future_graph = rwkv_future_sequence_graph(graph_future_ctx, future_tokens, 1,
|
1626 |
+
model.emb,
|
1627 |
+
model.ln0_weight, model.ln0_bias,
|
1628 |
+
|
1629 |
+
n_layer,
|
1630 |
+
layer.ln1_weight, layer.ln1_bias,
|
1631 |
+
layer.att_time_mix_k, layer.att_time_mix_v, layer.att_time_mix_r,
|
1632 |
+
layer.att_time_first, layer.att_time_decay,
|
1633 |
+
layer.att_receptance, layer.att_key, layer.att_value, layer.att_output,
|
1634 |
+
att_xx, att_aa, att_bb, att_pp,
|
1635 |
+
|
1636 |
+
layer.ln2_weight, layer.ln2_bias,
|
1637 |
+
layer.ffn_time_mix_k, layer.ffn_time_mix_r,
|
1638 |
+
layer.ffn_key, layer.ffn_value, layer.ffn_receptance,
|
1639 |
+
ffn_xx,
|
1640 |
+
|
1641 |
+
model.ln_out_weight, model.ln_out_bias,
|
1642 |
+
model.head
|
1643 |
+
);
|
1644 |
+
|
1645 |
+
struct rwkv_graph sequence_graph;
|
1646 |
+
sequence_graph.ctx = graph_future_ctx;
|
1647 |
+
RWKV_ASSERT_FALSE_MSG(RWKV_ERROR_CTX | RWKV_ERROR_ALLOC, sequence_graph.ctx.ctx, "Failed to allocate sequence graph context");
|
1648 |
+
sequence_graph.tokens = ggml_new_tensor_1d(sequence_graph.ctx.ctx, GGML_TYPE_I32, sequence_len);
|
1649 |
+
sequence_graph.cgraph.reset(new(std::nothrow) struct ggml_cgraph());
|
1650 |
+
RWKV_ASSERT_FALSE_MSG(RWKV_ERROR_ALLOC, sequence_graph.cgraph, "Failed to allocate sequence graph");
|
1651 |
+
sequence_graph.cgraph->n_threads = 1;
|
1652 |
+
|
1653 |
+
RWKV_ASSERT_FALSE(RWKV_ERROR_GRAPH, rwkv_build_sequence_graph(
|
1654 |
+
sequence_graph.ctx.ctx, ctx->instance->model,
|
1655 |
+
sequence_graph.tokens, ctx->input_layers.get(), ctx->output_layers.get(), ctx->logits,
|
1656 |
+
sequence_graph.cgraph.get(),
|
1657 |
+
&sequence_graph.pre_logits_nodes, &sequence_graph.pre_logits_leafs, &sequence_graph.post_logits_nodes, &sequence_graph.post_logits_leafs
|
1658 |
+
));
|
1659 |
+
|
1660 |
+
ctx->sequence_len = sequence_len;
|
1661 |
+
ctx->sequence_graph = std::move(sequence_graph);
|
1662 |
}
|
1663 |
|
1664 |
// Allow building the sequence graph without actually evaluating, by specifying sequence = NULL.
|
1665 |
if (sequence) {
|
1666 |
rwkv_set_inputs(ctx, state_in);
|
1667 |
memcpy(ctx->sequence_graph.tokens->data, sequence, sequence_len * sizeof(uint32_t));
|
1668 |
+
|
1669 |
+
// Short circuit computation of logits if nobody actually cares
|
1670 |
+
if (!logits_out) {
|
1671 |
+
ctx->sequence_graph.cgraph->n_nodes = ctx->sequence_graph.pre_logits_nodes;
|
1672 |
+
ctx->sequence_graph.cgraph->n_leafs = ctx->sequence_graph.pre_logits_leafs;
|
1673 |
+
} else {
|
1674 |
+
ctx->sequence_graph.cgraph->n_nodes = ctx->sequence_graph.post_logits_nodes;
|
1675 |
+
ctx->sequence_graph.cgraph->n_leafs = ctx->sequence_graph.post_logits_leafs;
|
1676 |
+
}
|
1677 |
+
|
1678 |
ggml_graph_compute(ctx->sequence_graph.ctx.ctx, ctx->sequence_graph.cgraph.get());
|
1679 |
rwkv_get_outputs(ctx, state_out, logits_out);
|
1680 |
}
|
|
|
1682 |
return true;
|
1683 |
}
|
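Illustrative usage (not part of the diff): because the sequence graph is cached per sequence length, long prompts are best fed in fixed-size chunks so the same graph is reused; only a final partial chunk, if any, triggers one extra graph build. ctx, state and logits are assumed to be set up as in the serial example above.

static bool feed_prompt_chunked(struct rwkv_context * ctx,
                                const uint32_t * prompt, size_t prompt_len,
                                float * state, float * logits) {
    const size_t chunk = 64; // matches the batch-size guidance in rwkv_v3.h
    for (size_t i = 0; i < prompt_len; ) {
        const size_t n = prompt_len - i < chunk ? prompt_len - i : chunk;
        const bool last = (i + n == prompt_len);
        // state_in == state_out is fine: inputs are copied before the graph runs.
        if (!rwkv_eval_sequence(ctx, prompt + i, n, state, state,
                                last ? logits : NULL)) {
            return false;
        }
        i += n;
    }
    return true;
}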
1684 |
|
1685 |
+
// Provided for compatibility.
|
1686 |
+
extern "C" RWKV_API uint32_t rwkv_get_state_buffer_element_count(const struct rwkv_context * ctx) {
|
1687 |
+
return rwkv_get_state_len(ctx);
|
1688 |
+
}
|
1689 |
+
|
1690 |
+
// Provided for compatibility.
|
1691 |
+
extern "C" RWKV_API uint32_t rwkv_get_logits_buffer_element_count(const struct rwkv_context * ctx) {
|
1692 |
+
return rwkv_get_logits_len(ctx);
|
1693 |
+
}
|
1694 |
+
|
1695 |
+
size_t rwkv_get_n_vocab(const struct rwkv_context * ctx) {
|
1696 |
+
return (size_t) ctx->instance->model.header.n_vocab;
|
1697 |
+
}
|
1698 |
+
|
1699 |
+
size_t rwkv_get_n_embed(const struct rwkv_context * ctx) {
|
1700 |
+
return (size_t) ctx->instance->model.header.n_embed;
|
1701 |
}
|
1702 |
|
1703 |
+
size_t rwkv_get_n_layer(const struct rwkv_context * ctx) {
|
1704 |
+
return (size_t) ctx->instance->model.header.n_layer;
|
1705 |
+
}
|
1706 |
+
|
1707 |
+
size_t rwkv_get_state_len(const struct rwkv_context * ctx) {
|
1708 |
+
const struct rwkv_file_header & header = ctx->instance->model.header;
|
1709 |
+
return (size_t) header.n_embed * 5 * (size_t) header.n_layer;
|
1710 |
+
}
|
1711 |
+
|
1712 |
+
size_t rwkv_get_logits_len(const struct rwkv_context * ctx) {
|
1713 |
+
return (size_t) ctx->instance->model.header.n_vocab;
|
1714 |
+
}
|
1715 |
+
|
1716 |
+
void rwkv_init_state(const struct rwkv_context * ctx, float * state) {
|
1717 |
+
const struct rwkv_file_header & header = ctx->instance->model.header;
|
1718 |
+
const size_t layer_size = (size_t) header.n_embed * 5;
|
1719 |
+
const size_t layer_zero = (size_t) header.n_embed * 4;
|
1720 |
+
const size_t layers_size = (size_t) header.n_layer * layer_size;
|
1721 |
+
|
1722 |
+
for (size_t start = 0; start < layers_size; start += layer_size) {
|
1723 |
+
for (size_t i = 0; i < layer_zero; i++) {
|
1724 |
+
state[start + i] = 0.0F;
|
1725 |
+
}
|
1726 |
+
|
1727 |
+
for (size_t i = layer_zero; i < layer_size; i++) {
|
1728 |
+
state[start + i] = -1e30F;
|
1729 |
+
}
|
1730 |
+
}
|
1731 |
}
|
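A small helper sketch (not part of the diff) showing how the flat state buffer initialized above is laid out: each layer owns 5 * n_embed floats in the order ffn_xx, att_xx, att_aa, att_bb, att_pp, which is why the last n_embed slice of every layer starts at -1e30 (acting as -infinity for the max-trick in the WKV recurrence).

// part: 0 = ffn_xx, 1 = att_xx, 2 = att_aa, 3 = att_bb, 4 = att_pp
static float * state_part(float * state, size_t n_embed, size_t layer, size_t part) {
    return state + layer * 5 * n_embed + part * n_embed;
}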
1732 |
|
1733 |
void rwkv_free(struct rwkv_context * ctx) {
|
|
|
1757 |
RWKV_ASSERT_FALSE_MSG(RWKV_ERROR_FILE, rwkv_fread_file_header(in_file.file, in_header), "Invalid file header");
|
1758 |
|
1759 |
enum ggml_type in_type = rwkv_type_to_ggml[in_header.data_type];
|
1760 |
+
RWKV_ASSERT_FALSE_MSG(
|
1761 |
+
RWKV_ERROR_FILE,
|
1762 |
+
in_type == GGML_TYPE_F32 || in_type == GGML_TYPE_F16,
|
1763 |
+
"Unsupported input data type (%s); needs to be FP32 or FP16",
|
1764 |
+
rwkv_type_to_string[rwkv_type_from_ggml[in_type]]
|
1765 |
+
);
|
1766 |
|
1767 |
struct rwkv_file_header out_header = in_header;
|
1768 |
out_header.version = RWKV_FILE_VERSION;
|
|
|
1773 |
size_t orig_total_size = 0;
|
1774 |
size_t new_total_size = 0;
|
1775 |
|
1776 |
+
// Required to init the F16 tables
|
1777 |
// Doesn't crash if ggml_init fails
|
1778 |
ggml_free(ggml_init({ 0, NULL, true }));
|
1779 |
|
|
|
1785 |
struct rwkv_tensor_header header;
|
1786 |
RWKV_ASSERT_FALSE(RWKV_ERROR_FILE, rwkv_fread_tensor_header_and_skip(in_file.file, header));
|
1787 |
|
1788 |
+
size_t in_size = header.size();
|
1789 |
|
1790 |
if (in_size > max_in_size) {
|
1791 |
max_in_size = in_size;
|
1792 |
}
|
1793 |
|
1794 |
// FP16 tensors are read into the out buffer first, then converted to FP32 into the in buffer
|
1795 |
+
if (header.data_type == TYPE_FP16) {
|
1796 |
if (in_size > max_out_size) {
|
1797 |
max_out_size = in_size;
|
1798 |
}
|
1799 |
|
1800 |
+
size_t f32_size = rwkv_future_tensor::size(GGML_TYPE_F32, header.width, header.height);
|
1801 |
|
1802 |
if (f32_size > max_in_size) {
|
1803 |
max_in_size = f32_size;
|
1804 |
}
|
1805 |
}
|
1806 |
|
1807 |
+
size_t out_size = rwkv_future_tensor::size(out_type, header.width, header.height);
|
1808 |
|
1809 |
if (out_size > max_out_size) {
|
1810 |
max_out_size = out_size;
|
|
|
1820 |
// This is a histogram of quantized values. If it shows a single 1.0 followed by all 0.0, something went very wrong!
|
1821 |
int64_t hist_all[16] {};
|
1822 |
|
1823 |
+
std::unique_ptr<uint8_t[]> scratch(new(std::nothrow) uint8_t[max_in_size + max_out_size]);
|
1824 |
RWKV_ASSERT_FALSE_MSG(RWKV_ERROR_ALLOC, scratch.get(), "Failed to allocate buffer");
|
1825 |
|
1826 |
uint8_t * in_buf = scratch.get();
|
|
|
1838 |
const char * name_str = name.c_str();
|
1839 |
RWKV_MSG("%*s - [%5" PRId32 ", %5" PRId32 "], type = %6s ", (int) max_key_length, name_str, header.width, header.height, rwkv_type_to_string[header.data_type]);
|
1840 |
|
1841 |
+
data = header.data_type == TYPE_FP16 ? out_buf : in_buf;
|
1842 |
+
size_t orig_size = header.size(), new_size = orig_size;
|
1843 |
RWKV_ASSERT_FALSE_MSG(RWKV_ERROR_MODEL_PARAMS, rwkv_fread_data(in_file.file, orig_size, data), "\nFailed to read tensor data of %s", name_str);
|
1844 |
|
1845 |
// Quantize only 2D tensors, except embedding and head matrices.
|
1846 |
// Embedding and head do not take much space, especially in bigger models;
|
1847 |
// but they significantly increase perplexity when quantized.
|
1848 |
+
if ((header.data_type == TYPE_FP32 || header.data_type == TYPE_FP16) && header.dim_count == 2 && name != "emb.weight" && name != "head.weight") {
|
1849 |
RWKV_MSG("quantizing... ");
|
1850 |
|
1851 |
size_t nelements = (size_t) header.width * (size_t) header.height;
|
1852 |
|
1853 |
+
if (header.data_type == TYPE_FP16) {
|
1854 |
ggml_fp16_to_fp32_row((const ggml_fp16_t *) out_buf, (float *) in_buf, nelements);
|
1855 |
}
|
1856 |
|
otherarch/rwkv_v3.h
CHANGED
@@ -84,7 +84,7 @@ extern "C" {
|
|
84 |
RWKV_API enum rwkv_error_flags rwkv_get_last_error(struct rwkv_context * ctx);
|
85 |
|
86 |
// Loads the model from a file and prepares it for inference.
|
87 |
-
// Returns NULL on any error.
|
88 |
// - model_file_path: path to model file in ggml format.
|
89 |
// - n_threads: count of threads to use, must be positive.
|
90 |
RWKV_API struct rwkv_context * rwkv_init_from_file(const char * model_file_path, const uint32_t n_threads);
|
@@ -97,39 +97,64 @@ extern "C" {
|
|
97 |
// - n_threads: count of threads to use, must be positive.
|
98 |
RWKV_API struct rwkv_context * rwkv_clone_context(struct rwkv_context * ctx, const uint32_t n_threads);
|
99 |
|
100 |
-
// Offloads specified
|
101 |
-
//
|
102 |
-
|
|
|
103 |
|
104 |
// Evaluates the model for a single token.
|
105 |
// Not thread-safe. For parallel inference, call rwkv_clone_context to create one rwkv_context for each thread.
|
106 |
-
// Returns false on any error.
|
|
|
|
|
107 |
// - token: next token index, in range 0 <= token < n_vocab.
|
108 |
-
// - state_in: FP32 buffer of size
|
109 |
-
// - state_out: FP32 buffer of size
|
110 |
-
// - logits_out: FP32 buffer of size
|
111 |
-
RWKV_API bool rwkv_eval(
|
112 |
|
113 |
// Evaluates the model for a sequence of tokens.
|
114 |
// Uses a faster algorithm than rwkv_eval if you do not need the state and logits for every token. Best used with batch sizes of 64 or so.
|
115 |
// Has to build a computation graph on the first call for a given sequence, but will use this cached graph for subsequent calls of the same sequence length.
|
116 |
-
// - tokens: pointer to an array of tokens. If NULL, the graph will be built and cached, but not executed. (Useful for initialization.)
|
117 |
// Not thread-safe. For parallel inference, call rwkv_clone_context to create one rwkv_context for each thread.
|
118 |
-
// Returns false on any error.
|
|
|
|
|
|
|
119 |
// - sequence_len: number of tokens to read from the array.
|
120 |
-
// - state_in: FP32 buffer of size
|
121 |
-
// - state_out: FP32 buffer of size
|
122 |
-
// - logits_out: FP32 buffer of size
|
123 |
-
RWKV_API bool rwkv_eval_sequence(
|
|
124 |
|
125 |
-
// Returns
|
126 |
-
|
|
|
127 |
|
128 |
-
//
|
129 |
-
|
|
|
|
|
|
|
130 |
|
131 |
// Frees all allocated memory and the context.
|
132 |
-
// Does not need to be the same thread that created the rwkv_context.
|
133 |
RWKV_API void rwkv_free(struct rwkv_context * ctx);
|
134 |
|
135 |
// Quantizes FP32 or FP16 model to one of quantized formats.
|
|
|
84 |
RWKV_API enum rwkv_error_flags rwkv_get_last_error(struct rwkv_context * ctx);
|
85 |
|
86 |
// Loads the model from a file and prepares it for inference.
|
87 |
+
// Returns NULL on any error.
|
88 |
// - model_file_path: path to model file in ggml format.
|
89 |
// - n_threads: count of threads to use, must be positive.
|
90 |
RWKV_API struct rwkv_context * rwkv_init_from_file(const char * model_file_path, const uint32_t n_threads);
|
|
|
97 |
// - n_threads: count of threads to use, must be positive.
|
98 |
RWKV_API struct rwkv_context * rwkv_clone_context(struct rwkv_context * ctx, const uint32_t n_threads);
|
99 |
|
100 |
+
// Offloads specified count of model layers onto the GPU. Offloaded layers are evaluated using cuBLAS.
|
101 |
+
// Returns true if at least one layer was offloaded.
|
102 |
+
// If rwkv.cpp was compiled without cuBLAS support, this function is a no-op and always returns false.
|
103 |
+
RWKV_API bool rwkv_gpu_offload_layers(struct rwkv_context * ctx, const uint32_t n_layers);
|
104 |
|
105 |
// Evaluates the model for a single token.
|
106 |
// Not thread-safe. For parallel inference, call rwkv_clone_context to create one rwkv_context for each thread.
|
107 |
+
// Returns false on any error.
|
108 |
+
// You can pass NULL to logits_out whenever logits are not needed. This can improve speed by ~10ms per iteration
|
109 |
+
// in which logits are not calculated.
|
110 |
// - token: next token index, in range 0 <= token < n_vocab.
|
111 |
+
// - state_in: FP32 buffer of size rwkv_get_state_len(); or NULL, if this is a first pass.
|
112 |
+
// - state_out: FP32 buffer of size rwkv_get_state_len(). This buffer will be written to if non-NULL.
|
113 |
+
// - logits_out: FP32 buffer of size rwkv_get_logits_len(). This buffer will be written to if non-NULL.
|
114 |
+
RWKV_API bool rwkv_eval(struct rwkv_context * ctx, const uint32_t token, const float * state_in, float * state_out, float * logits_out);
|
115 |
|
116 |
// Evaluates the model for a sequence of tokens.
|
117 |
// Uses a faster algorithm than rwkv_eval if you do not need the state and logits for every token. Best used with batch sizes of 64 or so.
|
118 |
// Has to build a computation graph on the first call for a given sequence, but will use this cached graph for subsequent calls of the same sequence length.
|
|
|
119 |
// Not thread-safe. For parallel inference, call rwkv_clone_context to create one rwkv_context for each thread.
|
120 |
+
// Returns false on any error.
|
121 |
+
// You can pass NULL to logits_out whenever logits are not needed. This can improve speed by ~10ms per iteration
|
122 |
+
// in which logits are not calculated.
|
123 |
+
// - tokens: pointer to an array of tokens. If NULL, the graph will be built and cached, but not executed: this can be useful for initialization.
|
124 |
// - sequence_len: number of tokens to read from the array.
|
125 |
+
// - state_in: FP32 buffer of size rwkv_get_state_len(), or NULL if this is a first pass.
|
126 |
+
// - state_out: FP32 buffer of size rwkv_get_state_len(). This buffer will be written to if non-NULL.
|
127 |
+
// - logits_out: FP32 buffer of size rwkv_get_logits_len(). This buffer will be written to if non-NULL.
|
128 |
+
RWKV_API bool rwkv_eval_sequence(struct rwkv_context * ctx, const uint32_t * tokens, size_t sequence_len, const float * state_in, float * state_out, float * logits_out);
|
129 |
+
|
130 |
+
// Returns the number of tokens in the given model's vocabulary.
|
131 |
+
// Useful for telling 20B_tokenizer models (n_vocab = 50277) apart from World models (n_vocab = 65536).
|
132 |
+
RWKV_API size_t rwkv_get_n_vocab(const struct rwkv_context * ctx);
|
133 |
+
|
134 |
+
// Returns the number of elements in the given model's embedding.
|
135 |
+
// Useful for reading individual fields of a model's hidden state.
|
136 |
+
RWKV_API size_t rwkv_get_n_embed(const struct rwkv_context * ctx);
|
137 |
+
|
138 |
+
// Returns the number of layers in the given model.
|
139 |
+
// Useful for always offloading the entire model to GPU.
|
140 |
+
RWKV_API size_t rwkv_get_n_layer(const struct rwkv_context * ctx);
|
141 |
+
|
142 |
+
// Returns the number of float elements in a complete state for the given model.
|
143 |
+
// This is the number of elements you'll need to allocate for a call to rwkv_eval, rwkv_eval_sequence, or rwkv_init_state.
|
144 |
+
RWKV_API size_t rwkv_get_state_len(const struct rwkv_context * ctx);
|
145 |
|
146 |
+
// Returns the number of float elements in the logits output of a given model.
|
147 |
+
// This is currently always identical to n_vocab.
|
148 |
+
RWKV_API size_t rwkv_get_logits_len(const struct rwkv_context * ctx);
|
149 |
|
150 |
+
// Initializes the given state so that passing it to rwkv_eval or rwkv_eval_sequence would be identical to passing NULL.
|
151 |
+
// Useful in cases where tracking the first call to these functions may be annoying or expensive.
|
152 |
+
// State must be initialized for behavior to be defined; passing a zeroed state to rwkv.cpp functions will result in NaNs.
|
153 |
+
// - state: FP32 buffer of size rwkv_get_state_len() to initialize
|
154 |
+
RWKV_API void rwkv_init_state(const struct rwkv_context * ctx, float * state);
|
155 |
|
156 |
// Frees all allocated memory and the context.
|
157 |
+
// Does not need to be called on the same thread that created the rwkv_context.
|
158 |
RWKV_API void rwkv_free(struct rwkv_context * ctx);
|
159 |
|
160 |
// Quantizes FP32 or FP16 model to one of quantized formats.
|
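To tie the declarations above together, here is a minimal end-to-end usage sketch (not part of the diff). The model path, token ids and thread count are placeholders, the include path assumes rwkv_v3.h is on the include path, and error handling is reduced to early returns.

#include <cstdint>
#include <cstdio>
#include <vector>
#include "rwkv_v3.h" // adjust to wherever the header lives in your build

int main() {
    struct rwkv_context * ctx = rwkv_init_from_file("model-f16.bin", 8);
    if (!ctx) return 1;

    std::vector<float> state(rwkv_get_state_len(ctx));
    std::vector<float> logits(rwkv_get_logits_len(ctx));
    rwkv_init_state(ctx, state.data());

    const uint32_t prompt[] = { 1, 2, 3 };            // placeholder token ids
    rwkv_eval_sequence(ctx, prompt, 3, state.data(), state.data(), logits.data());

    // Continue generation one token at a time, reusing the state in place.
    rwkv_eval(ctx, prompt[2], state.data(), state.data(), logits.data());

    std::printf("n_vocab = %zu, logits[0] = %f\n", rwkv_get_n_vocab(ctx), logits[0]);
    rwkv_free(ctx);
    return 0;
}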
spm-headers/ggml.h
CHANGED
@@ -201,12 +201,6 @@
|
|
201 |
#define GGML_MAX_NAME 48
|
202 |
#define GGML_DEFAULT_N_THREADS 4
|
203 |
|
204 |
-
// Maximum training context of the model in use
|
205 |
-
// For the LLaMA models this is normally 2048, but somehow "stepping out" by 128 gives better results (tested at 7B and 13B)
|
206 |
-
#ifndef GGML_TRAINING_CTX
|
207 |
-
#define GGML_TRAINING_CTX 2176
|
208 |
-
#endif
|
209 |
-
|
210 |
#define GGML_ASSERT(x) \
|
211 |
do { \
|
212 |
if (!(x)) { \
|
@@ -450,6 +444,9 @@ extern "C" {
|
|
450 |
|
451 |
|
452 |
// compute types
|
|
|
|
|
|
|
453 |
enum ggml_task_type {
|
454 |
GGML_TASK_INIT = 0,
|
455 |
GGML_TASK_COMPUTE,
|
@@ -507,6 +504,8 @@ extern "C" {
|
|
507 |
// use this to compute the memory overhead of a tensor
|
508 |
GGML_API size_t ggml_tensor_overhead(void);
|
509 |
|
|
|
|
|
510 |
// main
|
511 |
|
512 |
GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
|
|
|
201 |
#define GGML_MAX_NAME 48
|
202 |
#define GGML_DEFAULT_N_THREADS 4
|
203 |
|
|
204 |
#define GGML_ASSERT(x) \
|
205 |
do { \
|
206 |
if (!(x)) { \
|
|
|
444 |
|
445 |
|
446 |
// compute types
|
447 |
+
|
448 |
+
// NOTE: the INIT or FINALIZE pass is not scheduled unless explicitly enabled.
|
449 |
+
// This behavior was changed since https://github.com/ggerganov/llama.cpp/pull/1995.
|
450 |
enum ggml_task_type {
|
451 |
GGML_TASK_INIT = 0,
|
452 |
GGML_TASK_COMPUTE,
|
|
|
504 |
// use this to compute the memory overhead of a tensor
|
505 |
GGML_API size_t ggml_tensor_overhead(void);
|
506 |
|
507 |
+
GGML_API float get_theta_scale(int n_dims,int n_past,int n_ctx);
|
508 |
+
|
509 |
// main
|
510 |
|
511 |
GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
|