{"metadata":{"colab":{"provenance":[],"authorship_tag":"ABX9TyOgngB7a+G/dFQHszPkGRIV"},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python","version":"3.10.14","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"},"kaggle":{"accelerator":"none","dataSources":[],"dockerImageVersionId":30761,"isInternetEnabled":true,"language":"python","sourceType":"notebook","isGpuEnabled":false}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"code","source":"!pip -q install huggingface_hub\n!pip -q install transformers sentencepiece\nfrom huggingface_hub import upload_file, create_repo\n\nfrom kaggle_secrets import UserSecretsClient\nuser_secrets = UserSecretsClient()\nHF_TOKEN = user_secrets.get_secret(\"HF_TOKEN\")","metadata":{"id":"oCMMoIiOHXdL","executionInfo":{"status":"ok","timestamp":1726501025234,"user_tz":-60,"elapsed":17496,"user":{"displayName":"Lyte","userId":"00368277356076556155"}},"execution":{"iopub.status.busy":"2024-09-16T15:43:11.615446Z","iopub.execute_input":"2024-09-16T15:43:11.616891Z","iopub.status.idle":"2024-09-16T15:43:44.237618Z","shell.execute_reply.started":"2024-09-16T15:43:11.616834Z","shell.execute_reply":"2024-09-16T15:43:44.236028Z"},"trusted":true},"execution_count":2,"outputs":[]},{"cell_type":"code","source":"!git clone https://github.com/ggerganov/llama.cpp\n%cd llama.cpp\n!make","metadata":{"id":"SO0QBqij-kSj","colab":{"base_uri":"https://localhost:8080/"},"outputId":"59059c96-5118-4278-ad0c-52390b135f10","scrolled":true,"execution":{"iopub.status.busy":"2024-09-16T15:43:44.241022Z","iopub.execute_input":"2024-09-16T15:43:44.241624Z","iopub.status.idle":"2024-09-16T15:54:35.487734Z","shell.execute_reply.started":"2024-09-16T15:43:44.241559Z","shell.execute_reply":"2024-09-16T15:54:35.485038Z"},"trusted":true},"execution_count":3,"outputs":[{"name":"stdout","text":"Cloning into 'llama.cpp'...\nremote: Enumerating objects: 34149, done.\u001b[K\nremote: Counting objects: 100% (7801/7801), done.\u001b[K\nremote: Compressing objects: 100% (691/691), done.\u001b[K\nremote: Total 34149 (delta 7499), reused 7183 (delta 7105), pack-reused 26348 (from 1)\u001b[K\nReceiving objects: 100% (34149/34149), 57.58 MiB | 22.22 MiB/s, done.\nResolving deltas: 100% (24732/24732), done.\n/kaggle/working/llama.cpp\nI ccache not found. 
Consider installing it for faster compilation.\nI llama.cpp build info: \nI UNAME_S:   Linux\nI UNAME_P:   x86_64\nI UNAME_M:   x86_64\nI CFLAGS:    -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  -std=c11   -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int -Werror=implicit-function-declaration -pthread -march=native -mtune=native -fopenmp -Wdouble-promotion \nI CXXFLAGS:  -std=c++11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE \nI NVCCFLAGS: -std=c++11 -O3 -g \nI LDFLAGS:    \nI CC:        cc (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0\nI CXX:       c++ (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0\n\nc++ -std=c++11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  -c ggml/src/llamafile/sgemm.cpp -o ggml/src/llamafile/sgemm.o\ncc  -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  -std=c11   -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int -Werror=implicit-function-declaration -pthread -march=native -mtune=native -fopenmp -Wdouble-promotion    -c ggml/src/ggml.c -o ggml/src/ggml.o\ncc  -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  -std=c11   -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int -Werror=implicit-function-declaration -pthread -march=native -mtune=native -fopenmp -Wdouble-promotion    -c ggml/src/ggml-alloc.c -o ggml/src/ggml-alloc.o\ncc  -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  -std=c11   -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int -Werror=implicit-function-declaration -pthread -march=native -mtune=native -fopenmp -Wdouble-promotion    -c ggml/src/ggml-backend.c -o ggml/src/ggml-backend.o\ncc -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  -std=c11   -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int -Werror=implicit-function-declaration -pthread -march=native -mtune=native -fopenmp -Wdouble-promotion     -c ggml/src/ggml-quants.c -o ggml/src/ggml-quants.o\ncc -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  -std=c11   -fPIC -O3 -g -Wall -Wextra -Wpedantic 
-Wcast-qual -Wno-unused-function -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int -Werror=implicit-function-declaration -pthread -march=native -mtune=native -fopenmp -Wdouble-promotion     -c ggml/src/ggml-aarch64.c -o ggml/src/ggml-aarch64.o\nc++ -std=c++11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  -c src/llama.cpp -o src/llama.o\nc++ -std=c++11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  -c src/llama-vocab.cpp -o src/llama-vocab.o\nc++ -std=c++11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  -c src/llama-grammar.cpp -o src/llama-grammar.o\nc++ -std=c++11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  -c src/llama-sampling.cpp -o src/llama-sampling.o\nc++ -std=c++11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  -c src/unicode.cpp -o src/unicode.o\nc++ -std=c++11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  -c src/unicode-data.cpp -o src/unicode-data.o\nc++ -std=c++11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  -c common/common.cpp -o common/common.o\nc++ -std=c++11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  -c common/arg.cpp -o common/arg.o\nc++ -std=c++11 -fPIC -O3 -g 
-Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  -c common/log.cpp -o common/log.o\nc++ -std=c++11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  -c common/console.cpp -o common/console.o\nc++ -std=c++11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  -c common/ngram-cache.cpp -o common/ngram-cache.o\nc++ -std=c++11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  -c common/sampling.cpp -o common/sampling.o\nc++ -std=c++11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  -c common/train.cpp -o common/train.o\nc++ -std=c++11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  -c common/build-info.cpp -o common/build-info.o\nc++ -std=c++11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  -c common/json-schema-to-grammar.cpp -o common/json-schema-to-grammar.o\nc++ -std=c++11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  -static -fPIC -c examples/llava/llava.cpp -o libllava.a -Wno-cast-qual\nc++ -std=c++11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Iggml/include -Iggml/src -Iinclude -Isrc 
-Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  -c examples/baby-llama/baby-llama.cpp -o examples/baby-llama/baby-llama.o\nc++ -std=c++11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  ggml/src/llamafile/sgemm.o ggml/src/ggml.o ggml/src/ggml-alloc.o ggml/src/ggml-backend.o ggml/src/ggml-quants.o ggml/src/ggml-aarch64.o src/llama.o src/llama-vocab.o src/llama-grammar.o src/llama-sampling.o src/unicode.o src/unicode-data.o common/common.o common/arg.o common/log.o common/console.o common/ngram-cache.o common/sampling.o common/train.o common/build-info.o common/json-schema-to-grammar.o examples/baby-llama/baby-llama.o -o llama-baby-llama  \nc++ -std=c++11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  -c examples/batched/batched.cpp -o examples/batched/batched.o\nc++ -std=c++11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  ggml/src/llamafile/sgemm.o ggml/src/ggml.o ggml/src/ggml-alloc.o ggml/src/ggml-backend.o ggml/src/ggml-quants.o ggml/src/ggml-aarch64.o src/llama.o src/llama-vocab.o src/llama-grammar.o src/llama-sampling.o src/unicode.o src/unicode-data.o common/common.o common/arg.o common/log.o common/console.o common/ngram-cache.o common/sampling.o common/train.o common/build-info.o common/json-schema-to-grammar.o examples/batched/batched.o -o llama-batched  \nc++ -std=c++11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  -c examples/batched-bench/batched-bench.cpp -o examples/batched-bench/batched-bench.o\nc++ -std=c++11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  ggml/src/llamafile/sgemm.o ggml/src/ggml.o ggml/src/ggml-alloc.o ggml/src/ggml-backend.o ggml/src/ggml-quants.o ggml/src/ggml-aarch64.o src/llama.o src/llama-vocab.o src/llama-grammar.o src/llama-sampling.o src/unicode.o src/unicode-data.o common/common.o common/arg.o common/log.o common/console.o common/ngram-cache.o common/sampling.o common/train.o common/build-info.o common/json-schema-to-grammar.o examples/batched-bench/batched-bench.o -o llama-batched-bench  \nc++ -std=c++11 -fPIC -O3 -g -Wall 
-Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  -c examples/llama-bench/llama-bench.cpp -o examples/llama-bench/llama-bench.o\nc++ -std=c++11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  ggml/src/llamafile/sgemm.o ggml/src/ggml.o ggml/src/ggml-alloc.o ggml/src/ggml-backend.o ggml/src/ggml-quants.o ggml/src/ggml-aarch64.o src/llama.o src/llama-vocab.o src/llama-grammar.o src/llama-sampling.o src/unicode.o src/unicode-data.o common/common.o common/arg.o common/log.o common/console.o common/ngram-cache.o common/sampling.o common/train.o common/build-info.o common/json-schema-to-grammar.o examples/llama-bench/llama-bench.o -o llama-bench  \nc++ -std=c++11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  -c examples/benchmark/benchmark-matmult.cpp -o examples/benchmark/benchmark-matmult.o\nc++ -std=c++11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  ggml/src/llamafile/sgemm.o ggml/src/ggml.o ggml/src/ggml-alloc.o ggml/src/ggml-backend.o ggml/src/ggml-quants.o ggml/src/ggml-aarch64.o common/build-info.o examples/benchmark/benchmark-matmult.o -o llama-benchmark-matmult  \nc++ -std=c++11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  -c examples/main/main.cpp -o examples/main/main.o\nc++ -std=c++11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  ggml/src/llamafile/sgemm.o ggml/src/ggml.o ggml/src/ggml-alloc.o ggml/src/ggml-backend.o ggml/src/ggml-quants.o ggml/src/ggml-aarch64.o src/llama.o src/llama-vocab.o src/llama-grammar.o src/llama-sampling.o src/unicode.o src/unicode-data.o common/common.o common/arg.o common/log.o common/console.o common/ngram-cache.o common/sampling.o common/train.o common/build-info.o common/json-schema-to-grammar.o examples/main/main.o -o llama-cli  \n\n====  Run ./llama-cli -h for help.  
====\n\nc++ -std=c++11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  -c examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp -o examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.o\nc++ -std=c++11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  ggml/src/llamafile/sgemm.o ggml/src/ggml.o ggml/src/ggml-alloc.o ggml/src/ggml-backend.o ggml/src/ggml-quants.o ggml/src/ggml-aarch64.o src/llama.o src/llama-vocab.o src/llama-grammar.o src/llama-sampling.o src/unicode.o src/unicode-data.o common/common.o common/arg.o common/log.o common/console.o common/ngram-cache.o common/sampling.o common/train.o common/build-info.o common/json-schema-to-grammar.o examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.o -o llama-convert-llama2c-to-ggml  \nc++ -std=c++11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  -c examples/embedding/embedding.cpp -o examples/embedding/embedding.o\nc++ -std=c++11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  ggml/src/llamafile/sgemm.o ggml/src/ggml.o ggml/src/ggml-alloc.o ggml/src/ggml-backend.o ggml/src/ggml-quants.o ggml/src/ggml-aarch64.o src/llama.o src/llama-vocab.o src/llama-grammar.o src/llama-sampling.o src/unicode.o src/unicode-data.o common/common.o common/arg.o common/log.o common/console.o common/ngram-cache.o common/sampling.o common/train.o common/build-info.o common/json-schema-to-grammar.o examples/embedding/embedding.o -o llama-embedding  \nc++ -std=c++11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  -c examples/eval-callback/eval-callback.cpp -o examples/eval-callback/eval-callback.o\nc++ -std=c++11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  ggml/src/llamafile/sgemm.o ggml/src/ggml.o ggml/src/ggml-alloc.o ggml/src/ggml-backend.o ggml/src/ggml-quants.o 
ggml/src/ggml-aarch64.o src/llama.o src/llama-vocab.o src/llama-grammar.o src/llama-sampling.o src/unicode.o src/unicode-data.o common/common.o common/arg.o common/log.o common/console.o common/ngram-cache.o common/sampling.o common/train.o common/build-info.o common/json-schema-to-grammar.o examples/eval-callback/eval-callback.o -o llama-eval-callback  \nc++ -std=c++11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  -c examples/export-lora/export-lora.cpp -o examples/export-lora/export-lora.o\nc++ -std=c++11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  ggml/src/llamafile/sgemm.o ggml/src/ggml.o ggml/src/ggml-alloc.o ggml/src/ggml-backend.o ggml/src/ggml-quants.o ggml/src/ggml-aarch64.o src/llama.o src/llama-vocab.o src/llama-grammar.o src/llama-sampling.o src/unicode.o src/unicode-data.o common/common.o common/arg.o common/log.o common/console.o common/ngram-cache.o common/sampling.o common/train.o common/build-info.o common/json-schema-to-grammar.o examples/export-lora/export-lora.o -o llama-export-lora  \nc++ -std=c++11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  -c examples/gbnf-validator/gbnf-validator.cpp -o examples/gbnf-validator/gbnf-validator.o\nc++ -std=c++11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  ggml/src/llamafile/sgemm.o ggml/src/ggml.o ggml/src/ggml-alloc.o ggml/src/ggml-backend.o ggml/src/ggml-quants.o ggml/src/ggml-aarch64.o src/llama.o src/llama-vocab.o src/llama-grammar.o src/llama-sampling.o src/unicode.o src/unicode-data.o common/common.o common/arg.o common/log.o common/console.o common/ngram-cache.o common/sampling.o common/train.o common/build-info.o common/json-schema-to-grammar.o examples/gbnf-validator/gbnf-validator.o -o llama-gbnf-validator  \nc++ -std=c++11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  -c examples/gguf/gguf.cpp -o examples/gguf/gguf.o\nc++ -std=c++11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation 
-Wextra-semi -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  ggml/src/llamafile/sgemm.o ggml/src/ggml.o ggml/src/ggml-alloc.o ggml/src/ggml-backend.o ggml/src/ggml-quants.o ggml/src/ggml-aarch64.o examples/gguf/gguf.o -o llama-gguf  \ncc -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  -std=c11   -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int -Werror=implicit-function-declaration -pthread -march=native -mtune=native -fopenmp -Wdouble-promotion  -Iexamples/gguf-hash/deps -c examples/gguf-hash/deps/sha1/sha1.c -o examples/gguf-hash/deps/sha1/sha1.o\ncc -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  -std=c11   -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int -Werror=implicit-function-declaration -pthread -march=native -mtune=native -fopenmp -Wdouble-promotion  -Iexamples/gguf-hash/deps -c examples/gguf-hash/deps/xxhash/xxhash.c -o examples/gguf-hash/deps/xxhash/xxhash.o\ncc -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  -std=c11   -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int -Werror=implicit-function-declaration -pthread -march=native -mtune=native -fopenmp -Wdouble-promotion  -Iexamples/gguf-hash/deps -c examples/gguf-hash/deps/sha256/sha256.c -o examples/gguf-hash/deps/sha256/sha256.o\nc++ -std=c++11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  -Iexamples/gguf-hash/deps -c examples/gguf-hash/gguf-hash.cpp -o examples/gguf-hash/gguf-hash.o\nc++ -std=c++11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  examples/gguf-hash/deps/sha1/sha1.o examples/gguf-hash/deps/xxhash/xxhash.o examples/gguf-hash/deps/sha256/sha256.o ggml/src/llamafile/sgemm.o ggml/src/ggml.o ggml/src/ggml-alloc.o ggml/src/ggml-backend.o ggml/src/ggml-quants.o ggml/src/ggml-aarch64.o src/llama.o src/llama-vocab.o src/llama-grammar.o src/llama-sampling.o src/unicode.o src/unicode-data.o common/common.o common/arg.o common/log.o common/console.o common/ngram-cache.o common/sampling.o common/train.o common/build-info.o common/json-schema-to-grammar.o examples/gguf-hash/gguf-hash.o -o llama-gguf-hash  \nc++ -std=c++11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Iggml/include -Iggml/src -Iinclude -Isrc 
-Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  -c examples/gguf-split/gguf-split.cpp -o examples/gguf-split/gguf-split.o\nc++ -std=c++11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  ggml/src/llamafile/sgemm.o ggml/src/ggml.o ggml/src/ggml-alloc.o ggml/src/ggml-backend.o ggml/src/ggml-quants.o ggml/src/ggml-aarch64.o src/llama.o src/llama-vocab.o src/llama-grammar.o src/llama-sampling.o src/unicode.o src/unicode-data.o common/common.o common/arg.o common/log.o common/console.o common/ngram-cache.o common/sampling.o common/train.o common/build-info.o common/json-schema-to-grammar.o examples/gguf-split/gguf-split.o -o llama-gguf-split  \nc++ -std=c++11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  -c examples/gritlm/gritlm.cpp -o examples/gritlm/gritlm.o\nc++ -std=c++11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  ggml/src/llamafile/sgemm.o ggml/src/ggml.o ggml/src/ggml-alloc.o ggml/src/ggml-backend.o ggml/src/ggml-quants.o ggml/src/ggml-aarch64.o src/llama.o src/llama-vocab.o src/llama-grammar.o src/llama-sampling.o src/unicode.o src/unicode-data.o common/common.o common/arg.o common/log.o common/console.o common/ngram-cache.o common/sampling.o common/train.o common/build-info.o common/json-schema-to-grammar.o examples/gritlm/gritlm.o -o llama-gritlm  \nc++ -std=c++11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  -c examples/imatrix/imatrix.cpp -o examples/imatrix/imatrix.o\nc++ -std=c++11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  ggml/src/llamafile/sgemm.o ggml/src/ggml.o ggml/src/ggml-alloc.o ggml/src/ggml-backend.o ggml/src/ggml-quants.o ggml/src/ggml-aarch64.o src/llama.o src/llama-vocab.o src/llama-grammar.o src/llama-sampling.o src/unicode.o src/unicode-data.o common/common.o common/arg.o common/log.o common/console.o common/ngram-cache.o common/sampling.o common/train.o common/build-info.o common/json-schema-to-grammar.o examples/imatrix/imatrix.o -o llama-imatrix  \nc++ -std=c++11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function 
-Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  -c examples/infill/infill.cpp -o examples/infill/infill.o\nc++ -std=c++11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  ggml/src/llamafile/sgemm.o ggml/src/ggml.o ggml/src/ggml-alloc.o ggml/src/ggml-backend.o ggml/src/ggml-quants.o ggml/src/ggml-aarch64.o src/llama.o src/llama-vocab.o src/llama-grammar.o src/llama-sampling.o src/unicode.o src/unicode-data.o common/common.o common/arg.o common/log.o common/console.o common/ngram-cache.o common/sampling.o common/train.o common/build-info.o common/json-schema-to-grammar.o examples/infill/infill.o -o llama-infill  \nc++ -std=c++11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  examples/llava/llava-cli.cpp examples/llava/llava.cpp examples/llava/clip.cpp ggml/src/llamafile/sgemm.o ggml/src/ggml.o ggml/src/ggml-alloc.o ggml/src/ggml-backend.o ggml/src/ggml-quants.o ggml/src/ggml-aarch64.o src/llama.o src/llama-vocab.o src/llama-grammar.o src/llama-sampling.o src/unicode.o src/unicode-data.o common/common.o common/arg.o common/log.o common/console.o common/ngram-cache.o common/sampling.o common/train.o common/build-info.o common/json-schema-to-grammar.o -o llama-llava-cli   -Wno-cast-qual\nc++ -std=c++11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  examples/llava/minicpmv-cli.cpp examples/llava/llava.cpp examples/llava/clip.cpp ggml/src/llamafile/sgemm.o ggml/src/ggml.o ggml/src/ggml-alloc.o ggml/src/ggml-backend.o ggml/src/ggml-quants.o ggml/src/ggml-aarch64.o src/llama.o src/llama-vocab.o src/llama-grammar.o src/llama-sampling.o src/unicode.o src/unicode-data.o common/common.o common/arg.o common/log.o common/console.o common/ngram-cache.o common/sampling.o common/train.o common/build-info.o common/json-schema-to-grammar.o -o llama-minicpmv-cli   -Wno-cast-qual\nc++ -std=c++11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  -c examples/lookahead/lookahead.cpp -o examples/lookahead/lookahead.o\nc++ -std=c++11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp  -march=native -mtune=native -Wno-array-bounds 
-Wno-format-truncation -Wextra-semi -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  ggml/src/llamafile/sgemm.o ggml/src/ggml.o ggml/src/ggml-alloc.o ggml/src/ggml-backend.o ggml/src/ggml-quants.o ggml/src/ggml-aarch64.o src/llama.o src/llama-vocab.o src/llama-grammar.o src/llama-sampling.o src/unicode.o src/unicode-data.o common/common.o common/arg.o common/log.o common/console.o common/ngram-cache.o common/sampling.o common/train.o common/build-info.o common/json-schema-to-grammar.o examples/lookahead/lookahead.o -o llama-lookahead  \nc++ -std=c++11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  -c examples/lookup/lookup.cpp -o examples/lookup/lookup.o\nc++ -std=c++11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  ggml/src/llamafile/sgemm.o ggml/src/ggml.o ggml/src/ggml-alloc.o ggml/src/ggml-backend.o ggml/src/ggml-quants.o ggml/src/ggml-aarch64.o src/llama.o src/llama-vocab.o src/llama-grammar.o src/llama-sampling.o src/unicode.o src/unicode-data.o common/common.o common/arg.o common/log.o common/console.o common/ngram-cache.o common/sampling.o common/train.o common/build-info.o common/json-schema-to-grammar.o examples/lookup/lookup.o -o llama-lookup  \nc++ -std=c++11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  -c examples/lookup/lookup-create.cpp -o examples/lookup/lookup-create.o\nc++ -std=c++11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  ggml/src/llamafile/sgemm.o ggml/src/ggml.o ggml/src/ggml-alloc.o ggml/src/ggml-backend.o ggml/src/ggml-quants.o ggml/src/ggml-aarch64.o src/llama.o src/llama-vocab.o src/llama-grammar.o src/llama-sampling.o src/unicode.o src/unicode-data.o common/common.o common/arg.o common/log.o common/console.o common/ngram-cache.o common/sampling.o common/train.o common/build-info.o common/json-schema-to-grammar.o examples/lookup/lookup-create.o -o llama-lookup-create  \nc++ -std=c++11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  -c examples/lookup/lookup-merge.cpp -o 
examples/lookup/lookup-merge.o\nc++ -std=c++11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  ggml/src/llamafile/sgemm.o ggml/src/ggml.o ggml/src/ggml-alloc.o ggml/src/ggml-backend.o ggml/src/ggml-quants.o ggml/src/ggml-aarch64.o src/llama.o src/llama-vocab.o src/llama-grammar.o src/llama-sampling.o src/unicode.o src/unicode-data.o common/common.o common/arg.o common/log.o common/console.o common/ngram-cache.o common/sampling.o common/train.o common/build-info.o common/json-schema-to-grammar.o examples/lookup/lookup-merge.o -o llama-lookup-merge  \nc++ -std=c++11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  -c examples/lookup/lookup-stats.cpp -o examples/lookup/lookup-stats.o\nc++ -std=c++11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  ggml/src/llamafile/sgemm.o ggml/src/ggml.o ggml/src/ggml-alloc.o ggml/src/ggml-backend.o ggml/src/ggml-quants.o ggml/src/ggml-aarch64.o src/llama.o src/llama-vocab.o src/llama-grammar.o src/llama-sampling.o src/unicode.o src/unicode-data.o common/common.o common/arg.o common/log.o common/console.o common/ngram-cache.o common/sampling.o common/train.o common/build-info.o common/json-schema-to-grammar.o examples/lookup/lookup-stats.o -o llama-lookup-stats  \nc++ -std=c++11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  -c examples/parallel/parallel.cpp -o examples/parallel/parallel.o\nc++ -std=c++11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  ggml/src/llamafile/sgemm.o ggml/src/ggml.o ggml/src/ggml-alloc.o ggml/src/ggml-backend.o ggml/src/ggml-quants.o ggml/src/ggml-aarch64.o src/llama.o src/llama-vocab.o src/llama-grammar.o src/llama-sampling.o src/unicode.o src/unicode-data.o common/common.o common/arg.o common/log.o common/console.o common/ngram-cache.o common/sampling.o common/train.o common/build-info.o common/json-schema-to-grammar.o examples/parallel/parallel.o -o llama-parallel  \nc++ -std=c++11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp  -march=native -mtune=native -Wno-array-bounds 
-Wno-format-truncation -Wextra-semi -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  -c examples/passkey/passkey.cpp -o examples/passkey/passkey.o\nc++ -std=c++11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  ggml/src/llamafile/sgemm.o ggml/src/ggml.o ggml/src/ggml-alloc.o ggml/src/ggml-backend.o ggml/src/ggml-quants.o ggml/src/ggml-aarch64.o src/llama.o src/llama-vocab.o src/llama-grammar.o src/llama-sampling.o src/unicode.o src/unicode-data.o common/common.o common/arg.o common/log.o common/console.o common/ngram-cache.o common/sampling.o common/train.o common/build-info.o common/json-schema-to-grammar.o examples/passkey/passkey.o -o llama-passkey  \nc++ -std=c++11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  -c examples/perplexity/perplexity.cpp -o examples/perplexity/perplexity.o\nc++ -std=c++11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  ggml/src/llamafile/sgemm.o ggml/src/ggml.o ggml/src/ggml-alloc.o ggml/src/ggml-backend.o ggml/src/ggml-quants.o ggml/src/ggml-aarch64.o src/llama.o src/llama-vocab.o src/llama-grammar.o src/llama-sampling.o src/unicode.o src/unicode-data.o common/common.o common/arg.o common/log.o common/console.o common/ngram-cache.o common/sampling.o common/train.o common/build-info.o common/json-schema-to-grammar.o examples/perplexity/perplexity.o -o llama-perplexity  \nc++ -std=c++11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  -c pocs/vdot/q8dot.cpp -o pocs/vdot/q8dot.o\nc++ -std=c++11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  ggml/src/ggml.o ggml/src/llamafile/sgemm.o ggml/src/ggml-alloc.o ggml/src/ggml-backend.o ggml/src/ggml-quants.o ggml/src/ggml-aarch64.o pocs/vdot/q8dot.o -o llama-q8dot  \nc++ -std=c++11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 
-D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  -c examples/quantize/quantize.cpp -o examples/quantize/quantize.o\nc++ -std=c++11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  ggml/src/llamafile/sgemm.o ggml/src/ggml.o ggml/src/ggml-alloc.o ggml/src/ggml-backend.o ggml/src/ggml-quants.o ggml/src/ggml-aarch64.o src/llama.o src/llama-vocab.o src/llama-grammar.o src/llama-sampling.o src/unicode.o src/unicode-data.o common/common.o common/arg.o common/log.o common/console.o common/ngram-cache.o common/sampling.o common/train.o common/build-info.o common/json-schema-to-grammar.o examples/quantize/quantize.o -o llama-quantize  \nc++ -std=c++11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  -c examples/quantize-stats/quantize-stats.cpp -o examples/quantize-stats/quantize-stats.o\nc++ -std=c++11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  ggml/src/llamafile/sgemm.o ggml/src/ggml.o ggml/src/ggml-alloc.o ggml/src/ggml-backend.o ggml/src/ggml-quants.o ggml/src/ggml-aarch64.o src/llama.o src/llama-vocab.o src/llama-grammar.o src/llama-sampling.o src/unicode.o src/unicode-data.o common/common.o common/arg.o common/log.o common/console.o common/ngram-cache.o common/sampling.o common/train.o common/build-info.o common/json-schema-to-grammar.o examples/quantize-stats/quantize-stats.o -o llama-quantize-stats  \nc++ -std=c++11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  -c examples/retrieval/retrieval.cpp -o examples/retrieval/retrieval.o\nc++ -std=c++11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  ggml/src/llamafile/sgemm.o ggml/src/ggml.o ggml/src/ggml-alloc.o ggml/src/ggml-backend.o ggml/src/ggml-quants.o ggml/src/ggml-aarch64.o src/llama.o src/llama-vocab.o src/llama-grammar.o src/llama-sampling.o src/unicode.o src/unicode-data.o common/common.o common/arg.o common/log.o common/console.o common/ngram-cache.o common/sampling.o common/train.o common/build-info.o common/json-schema-to-grammar.o examples/retrieval/retrieval.o -o llama-retrieval  \nc++ -std=c++11 -fPIC -O3 -g -Wall -Wextra -Wpedantic 
-Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  -c examples/save-load-state/save-load-state.cpp -o examples/save-load-state/save-load-state.o\nc++ -std=c++11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  ggml/src/llamafile/sgemm.o ggml/src/ggml.o ggml/src/ggml-alloc.o ggml/src/ggml-backend.o ggml/src/ggml-quants.o ggml/src/ggml-aarch64.o src/llama.o src/llama-vocab.o src/llama-grammar.o src/llama-sampling.o src/unicode.o src/unicode-data.o common/common.o common/arg.o common/log.o common/console.o common/ngram-cache.o common/sampling.o common/train.o common/build-info.o common/json-schema-to-grammar.o examples/save-load-state/save-load-state.o -o llama-save-load-state  \nc++ -std=c++11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  -c examples/server/server.cpp -o examples/server/server.o\nc++ -std=c++11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  ggml/src/llamafile/sgemm.o ggml/src/ggml.o ggml/src/ggml-alloc.o ggml/src/ggml-backend.o ggml/src/ggml-quants.o ggml/src/ggml-aarch64.o src/llama.o src/llama-vocab.o src/llama-grammar.o src/llama-sampling.o src/unicode.o src/unicode-data.o common/common.o common/arg.o common/log.o common/console.o common/ngram-cache.o common/sampling.o common/train.o common/build-info.o common/json-schema-to-grammar.o -Iexamples/server examples/server/server.o -o llama-server   \nc++ -std=c++11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  -c examples/simple/simple.cpp -o examples/simple/simple.o\nc++ -std=c++11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  ggml/src/llamafile/sgemm.o ggml/src/ggml.o ggml/src/ggml-alloc.o ggml/src/ggml-backend.o ggml/src/ggml-quants.o ggml/src/ggml-aarch64.o src/llama.o src/llama-vocab.o src/llama-grammar.o src/llama-sampling.o src/unicode.o src/unicode-data.o common/common.o common/arg.o 
common/log.o common/console.o common/ngram-cache.o common/sampling.o common/train.o common/build-info.o common/json-schema-to-grammar.o examples/simple/simple.o -o llama-simple  \nc++ -std=c++11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  -c examples/speculative/speculative.cpp -o examples/speculative/speculative.o\nc++ -std=c++11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  ggml/src/llamafile/sgemm.o ggml/src/ggml.o ggml/src/ggml-alloc.o ggml/src/ggml-backend.o ggml/src/ggml-quants.o ggml/src/ggml-aarch64.o src/llama.o src/llama-vocab.o src/llama-grammar.o src/llama-sampling.o src/unicode.o src/unicode-data.o common/common.o common/arg.o common/log.o common/console.o common/ngram-cache.o common/sampling.o common/train.o common/build-info.o common/json-schema-to-grammar.o examples/speculative/speculative.o -o llama-speculative  \nc++ -std=c++11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  -c examples/tokenize/tokenize.cpp -o examples/tokenize/tokenize.o\nc++ -std=c++11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  ggml/src/llamafile/sgemm.o ggml/src/ggml.o ggml/src/ggml-alloc.o ggml/src/ggml-backend.o ggml/src/ggml-quants.o ggml/src/ggml-aarch64.o src/llama.o src/llama-vocab.o src/llama-grammar.o src/llama-sampling.o src/unicode.o src/unicode-data.o common/common.o common/arg.o common/log.o common/console.o common/ngram-cache.o common/sampling.o common/train.o common/build-info.o common/json-schema-to-grammar.o examples/tokenize/tokenize.o -o llama-tokenize  \nc++ -std=c++11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  -c pocs/vdot/vdot.cpp -o pocs/vdot/vdot.o\nc++ -std=c++11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  ggml/src/ggml.o ggml/src/llamafile/sgemm.o ggml/src/ggml-alloc.o ggml/src/ggml-backend.o 
ggml/src/ggml-quants.o ggml/src/ggml-aarch64.o pocs/vdot/vdot.o -o llama-vdot  \nc++ -std=c++11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  -c examples/cvector-generator/cvector-generator.cpp -o examples/cvector-generator/cvector-generator.o\nc++ -std=c++11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  ggml/src/llamafile/sgemm.o ggml/src/ggml.o ggml/src/ggml-alloc.o ggml/src/ggml-backend.o ggml/src/ggml-quants.o ggml/src/ggml-aarch64.o src/llama.o src/llama-vocab.o src/llama-grammar.o src/llama-sampling.o src/unicode.o src/unicode-data.o common/common.o common/arg.o common/log.o common/console.o common/ngram-cache.o common/sampling.o common/train.o common/build-info.o common/json-schema-to-grammar.o examples/cvector-generator/cvector-generator.o -o llama-cvector-generator  \nc++ -std=c++11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  -c examples/gen-docs/gen-docs.cpp -o examples/gen-docs/gen-docs.o\nc++ -std=c++11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  ggml/src/llamafile/sgemm.o ggml/src/ggml.o ggml/src/ggml-alloc.o ggml/src/ggml-backend.o ggml/src/ggml-quants.o ggml/src/ggml-aarch64.o src/llama.o src/llama-vocab.o src/llama-grammar.o src/llama-sampling.o src/unicode.o src/unicode-data.o common/common.o common/arg.o common/log.o common/console.o common/ngram-cache.o common/sampling.o common/train.o common/build-info.o common/json-schema-to-grammar.o examples/gen-docs/gen-docs.o -o llama-gen-docs  \ncc -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  -std=c11   -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int -Werror=implicit-function-declaration -pthread -march=native -mtune=native -fopenmp -Wdouble-promotion  -c tests/test-c.c -o tests/test-c.o\nc++ -std=c++11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  -c examples/deprecation-warning/deprecation-warning.cpp -o 
examples/deprecation-warning/deprecation-warning.o\nc++ -std=c++11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  examples/deprecation-warning/deprecation-warning.o -o main  \nNOTICE: The 'main' binary is deprecated. Please use 'llama-cli' instead.\nc++ -std=c++11 -fPIC -O3 -g -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread -fopenmp  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_OPENMP -DGGML_USE_LLAMAFILE  examples/deprecation-warning/deprecation-warning.o -o server  \nNOTICE: The 'server' binary is deprecated. Please use 'llama-server' instead.\n","output_type":"stream"}]},{"cell_type":"code","source":"!mkdir model/\n%cd model/\n!wget https://huggingface.co/RWKV/rwkv-6-world-1b6/resolve/main/config.json\n!wget https://huggingface.co/RWKV/rwkv-6-world-1b6/resolve/main/generation_config.json\n!wget https://huggingface.co/RWKV/rwkv-6-world-1b6/resolve/main/added_tokens.json\n!wget https://huggingface.co/RWKV/rwkv-6-world-1b6/resolve/main/special_tokens_map.json\n!wget https://huggingface.co/RWKV/rwkv-6-world-1b6/resolve/main/rwkv_vocab_v20230424.txt\n!wget https://huggingface.co/RWKV/rwkv-6-world-1b6/resolve/main/pytorch_model.bin\n!wget https://huggingface.co/RWKV/rwkv-6-world-1b6/resolve/main/tokenizer_config.json\n!ls\n%cd ..","metadata":{"id":"k-vJrh2cDZmk","scrolled":true,"execution":{"iopub.status.busy":"2024-09-16T16:10:40.690242Z","iopub.execute_input":"2024-09-16T16:10:40.690904Z","iopub.status.idle":"2024-09-16T16:12:31.318947Z","shell.execute_reply.started":"2024-09-16T16:10:40.690849Z","shell.execute_reply":"2024-09-16T16:12:31.317019Z"},"trusted":true},"execution_count":18,"outputs":[{"name":"stdout","text":"mkdir: cannot create directory 'model/': File exists\n/kaggle/working/llama.cpp/model\n--2024-09-16 16:10:43--  https://huggingface.co/RWKV/rwkv-6-world-1b6/resolve/main/config.json\nResolving huggingface.co (huggingface.co)... 3.165.160.59, 3.165.160.12, 3.165.160.11, ...\nConnecting to huggingface.co (huggingface.co)|3.165.160.59|:443... connected.\nHTTP request sent, awaiting response... 200 OK\nLength: 610 [text/plain]\nSaving to: 'config.json'\n\nconfig.json         100%[===================>]     610  --.-KB/s    in 0s      \n\n2024-09-16 16:10:43 (58.0 MB/s) - 'config.json' saved [610/610]\n\n--2024-09-16 16:10:44--  https://huggingface.co/RWKV/rwkv-6-world-1b6/resolve/main/generation_config.json\nResolving huggingface.co (huggingface.co)... 3.165.160.11, 3.165.160.61, 3.165.160.12, ...\nConnecting to huggingface.co (huggingface.co)|3.165.160.11|:443... connected.\nHTTP request sent, awaiting response... 200 OK\nLength: 260 [text/plain]\nSaving to: 'generation_config.json'\n\ngeneration_config.j 100%[===================>]     260  --.-KB/s    in 0s      \n\n2024-09-16 16:10:44 (22.7 MB/s) - 'generation_config.json' saved [260/260]\n\n--2024-09-16 16:10:45--  https://huggingface.co/RWKV/rwkv-6-world-1b6/resolve/main/added_tokens.json\nResolving huggingface.co (huggingface.co)... 
3.165.160.59, 3.165.160.11, 3.165.160.12, ...\nConnecting to huggingface.co (huggingface.co)|3.165.160.59|:443... connected.\nHTTP request sent, awaiting response... 200 OK\nLength: 15 [text/plain]\nSaving to: 'added_tokens.json'\n\nadded_tokens.json   100%[===================>]      15  --.-KB/s    in 0s      \n\n2024-09-16 16:10:45 (1.24 MB/s) - 'added_tokens.json' saved [15/15]\n\n--2024-09-16 16:10:47--  https://huggingface.co/RWKV/rwkv-6-world-1b6/resolve/main/special_tokens_map.json\nResolving huggingface.co (huggingface.co)... 3.165.160.61, 3.165.160.59, 3.165.160.11, ...\nConnecting to huggingface.co (huggingface.co)|3.165.160.61|:443... connected.\nHTTP request sent, awaiting response... 200 OK\nLength: 91 [text/plain]\nSaving to: 'special_tokens_map.json'\n\nspecial_tokens_map. 100%[===================>]      91  --.-KB/s    in 0s      \n\n2024-09-16 16:10:47 (4.87 MB/s) - 'special_tokens_map.json' saved [91/91]\n\n--2024-09-16 16:10:48--  https://huggingface.co/RWKV/rwkv-6-world-1b6/resolve/main/rwkv_vocab_v20230424.txt\nResolving huggingface.co (huggingface.co)... 3.165.160.12, 3.165.160.11, 3.165.160.59, ...\nConnecting to huggingface.co (huggingface.co)|3.165.160.12|:443... connected.\nHTTP request sent, awaiting response... 200 OK\nLength: 1093733 (1.0M) [text/plain]\nSaving to: 'rwkv_vocab_v20230424.txt'\n\nrwkv_vocab_v2023042 100%[===================>]   1.04M  5.37MB/s    in 0.2s    \n\n2024-09-16 16:10:48 (5.37 MB/s) - 'rwkv_vocab_v20230424.txt' saved [1093733/1093733]\n\n--2024-09-16 16:10:49--  https://huggingface.co/RWKV/rwkv-6-world-1b6/resolve/main/pytorch_model.bin\nResolving huggingface.co (huggingface.co)... 3.165.160.61, 3.165.160.11, 3.165.160.59, ...\nConnecting to huggingface.co (huggingface.co)|3.165.160.61|:443... connected.\nHTTP request sent, awaiting response... 
302 Found\nLocation: https://cdn-lfs-us-1.huggingface.co/repos/f6/ff/f6ff4cd2e55f87480652292741ef4ea949af295cd3fee9e7279db09d3ad866d0/609ffca33ff73d53bf059f7336396dc39bfe76764d3b263429ee5f2933688993?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27pytorch_model.bin%3B+filename%3D%22pytorch_model.bin%22%3B&response-content-type=application%2Foctet-stream&Expires=1726762250&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcyNjc2MjI1MH19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmh1Z2dpbmdmYWNlLmNvL3JlcG9zL2Y2L2ZmL2Y2ZmY0Y2QyZTU1Zjg3NDgwNjUyMjkyNzQxZWY0ZWE5NDlhZjI5NWNkM2ZlZTllNzI3OWRiMDlkM2FkODY2ZDAvNjA5ZmZjYTMzZmY3M2Q1M2JmMDU5ZjczMzYzOTZkYzM5YmZlNzY3NjRkM2IyNjM0MjllZTVmMjkzMzY4ODk5Mz9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSomcmVzcG9uc2UtY29udGVudC10eXBlPSoifV19&Signature=OpzRJc69pyjGt9FE7yAP74u0dyWRqQcXYY4pPEmgPyMJj7m6mtLd1rddbB3CruHOu9H%7ELicWaivltQV8EfEeyAZt4KdgMxjfeBvkrQNFkoEb-k6d-qCKbpzVGPZijtnLUMUtVv5LW76393LAtJED47McOKbevcZIUonFPbAyTDv0n3p6i8YItRUAwUo1LS-EJWPNZ3EROxGRANIowpsdGZSRPiu1nBqfuRdpvz9avdarzdO9vNkJnDjPn%7EW57vyDL0VWGSRQ1tfIfoTFok9Ta-ny3cPN779Kj3eZOXRLoqQd0q7D5VGvuLMApccga6IcNClgiV3hcNf-r2xahz9A5w__&Key-Pair-Id=K24J24Z295AEI9 [following]\n--2024-09-16 16:10:50--  https://cdn-lfs-us-1.huggingface.co/repos/f6/ff/f6ff4cd2e55f87480652292741ef4ea949af295cd3fee9e7279db09d3ad866d0/609ffca33ff73d53bf059f7336396dc39bfe76764d3b263429ee5f2933688993?response-content-disposition=inline%3B+filename*%3DUTF-8''pytorch_model.bin%3B+filename%3D%22pytorch_model.bin%22%3B&response-content-type=application%2Foctet-stream&Expires=1726762250&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcyNjc2MjI1MH19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmh1Z2dpbmdmYWNlLmNvL3JlcG9zL2Y2L2ZmL2Y2ZmY0Y2QyZTU1Zjg3NDgwNjUyMjkyNzQxZWY0ZWE5NDlhZjI5NWNkM2ZlZTllNzI3OWRiMDlkM2FkODY2ZDAvNjA5ZmZjYTMzZmY3M2Q1M2JmMDU5ZjczMzYzOTZkYzM5YmZlNzY3NjRkM2IyNjM0MjllZTVmMjkzMzY4ODk5Mz9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSomcmVzcG9uc2UtY29udGVudC10eXBlPSoifV19&Signature=OpzRJc69pyjGt9FE7yAP74u0dyWRqQcXYY4pPEmgPyMJj7m6mtLd1rddbB3CruHOu9H~LicWaivltQV8EfEeyAZt4KdgMxjfeBvkrQNFkoEb-k6d-qCKbpzVGPZijtnLUMUtVv5LW76393LAtJED47McOKbevcZIUonFPbAyTDv0n3p6i8YItRUAwUo1LS-EJWPNZ3EROxGRANIowpsdGZSRPiu1nBqfuRdpvz9avdarzdO9vNkJnDjPn~W57vyDL0VWGSRQ1tfIfoTFok9Ta-ny3cPN779Kj3eZOXRLoqQd0q7D5VGvuLMApccga6IcNClgiV3hcNf-r2xahz9A5w__&Key-Pair-Id=K24J24Z295AEI9\nResolving cdn-lfs-us-1.huggingface.co (cdn-lfs-us-1.huggingface.co)... 3.163.189.28, 3.163.189.20, 3.163.189.91, ...\nConnecting to cdn-lfs-us-1.huggingface.co (cdn-lfs-us-1.huggingface.co)|3.163.189.28|:443... connected.\nHTTP request sent, awaiting response... 200 OK\nLength: 3199826561 (3.0G) [application/octet-stream]\nSaving to: 'pytorch_model.bin'\n\npytorch_model.bin   100%[===================>]   2.98G  32.2MB/s    in 98s     \n\n2024-09-16 16:12:28 (31.1 MB/s) - 'pytorch_model.bin' saved [3199826561/3199826561]\n\n--2024-09-16 16:12:29--  https://huggingface.co/RWKV/rwkv-6-world-1b6/resolve/main/tokenizer_config.json\nResolving huggingface.co (huggingface.co)... 3.165.160.12, 3.165.160.11, 3.165.160.59, ...\nConnecting to huggingface.co (huggingface.co)|3.165.160.12|:443... connected.\nHTTP request sent, awaiting response... 
200 OK\nLength: 234 [text/plain]\nSaving to: 'tokenizer_config.json'\n\ntokenizer_config.js 100%[===================>]     234  --.-KB/s    in 0s      \n\n2024-09-16 16:12:29 (18.6 MB/s) - 'tokenizer_config.json' saved [234/234]\n\nadded_tokens.json\tpytorch_model.bin\t  tokenizer_config.json\nconfig.json\t\trwkv_vocab_v20230424.txt\ngeneration_config.json\tspecial_tokens_map.json\n/kaggle/working/llama.cpp\n","output_type":"stream"}]},{"cell_type":"code","source":"!python convert_hf_to_gguf.py model/","metadata":{"id":"zHtxjcITBSzN","execution":{"iopub.status.busy":"2024-09-16T16:12:31.321369Z","iopub.execute_input":"2024-09-16T16:12:31.321900Z","iopub.status.idle":"2024-09-16T16:12:58.901563Z","shell.execute_reply.started":"2024-09-16T16:12:31.321840Z","shell.execute_reply":"2024-09-16T16:12:58.899537Z"},"trusted":true},"execution_count":19,"outputs":[{"name":"stdout","text":"Writing: 100%|███████████████████████████| 3.25G/3.25G [00:19<00:00, 168Mbyte/s]\n","output_type":"stream"}]},{"cell_type":"code","source":"!ls","metadata":{"id":"VTibFqk_dJAG","execution":{"iopub.status.busy":"2024-09-16T15:58:07.359854Z","iopub.execute_input":"2024-09-16T15:58:07.360380Z","iopub.status.idle":"2024-09-16T15:58:08.541518Z","shell.execute_reply.started":"2024-09-16T15:58:07.360325Z","shell.execute_reply":"2024-09-16T15:58:08.539947Z"},"trusted":true},"execution_count":6,"outputs":[{"name":"stdout","text":"AUTHORS\t\t\t       llama-gritlm\nCMakeLists.txt\t\t       llama-imatrix\nCMakePresets.json\t       llama-infill\nCONTRIBUTING.md\t\t       llama-llava-cli\nLICENSE\t\t\t       llama-lookahead\nMakefile\t\t       llama-lookup\nPackage.swift\t\t       llama-lookup-create\nREADME.md\t\t       llama-lookup-merge\nSECURITY.md\t\t       llama-lookup-stats\nci\t\t\t       llama-minicpmv-cli\ncmake\t\t\t       llama-parallel\ncommon\t\t\t       llama-passkey\nconvert_hf_to_gguf.py\t       llama-perplexity\nconvert_hf_to_gguf_update.py   llama-q8dot\nconvert_llama_ggml_to_gguf.py  llama-quantize\nconvert_lora_to_gguf.py        llama-quantize-stats\ndocs\t\t\t       llama-retrieval\nexamples\t\t       llama-save-load-state\nflake.lock\t\t       llama-server\nflake.nix\t\t       llama-simple\nggml\t\t\t       llama-speculative\ngguf-py\t\t\t       llama-tokenize\ngrammars\t\t       llama-vdot\ninclude\t\t\t       main\nlibllava.a\t\t       media\nllama-baby-llama\t       model\nllama-batched\t\t       models\nllama-batched-bench\t       mypy.ini\nllama-bench\t\t       pocs\nllama-benchmark-matmult        poetry.lock\nllama-cli\t\t       prompts\nllama-convert-llama2c-to-ggml  pyproject.toml\nllama-cvector-generator        pyrightconfig.json\nllama-embedding\t\t       requirements\nllama-eval-callback\t       requirements.txt\nllama-export-lora\t       scripts\nllama-gbnf-validator\t       server\nllama-gen-docs\t\t       spm-headers\nllama-gguf\t\t       src\nllama-gguf-hash\t\t       tests\nllama-gguf-split\n","output_type":"stream"}]},{"cell_type":"code","source":"!./llama-quantize ./model/Model-1.6B-F16.gguf ./model/RWKV-6-World-1.6B-GGUF-Q2_K.gguf Q2_K","metadata":{"id":"bgGOaSYxAO_K","scrolled":true,"execution":{"iopub.status.busy":"2024-09-16T16:25:01.963314Z","iopub.execute_input":"2024-09-16T16:25:01.963850Z","iopub.status.idle":"2024-09-16T16:26:43.499980Z","shell.execute_reply.started":"2024-09-16T16:25:01.963797Z","shell.execute_reply":"2024-09-16T16:26:43.497981Z"},"trusted":true},"execution_count":37,"outputs":[{"name":"stdout","text":"main: build = 3772 (23e0d70b)\nmain: built with cc (Ubuntu 
9.4.0-1ubuntu1~20.04.2) 9.4.0 for x86_64-linux-gnu\nmain: quantizing './model/Model-1.6B-F16.gguf' to './model/RWKV-6-World-1.6B-GGUF-Q2_K.gguf' as Q2_K\nllama_model_loader: loaded meta data with 21 key-value pairs and 678 tensors from ./model/Model-1.6B-F16.gguf (version GGUF V3 (latest))\nllama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\nllama_model_loader: - kv   0:                       general.architecture str              = rwkv6\nllama_model_loader: - kv   1:                               general.type str              = model\nllama_model_loader: - kv   2:                               general.name str              = Model\nllama_model_loader: - kv   3:                         general.size_label str              = 1.6B\nllama_model_loader: - kv   4:                       rwkv6.context_length u32              = 1048576\nllama_model_loader: - kv   5:                     rwkv6.embedding_length u32              = 2048\nllama_model_loader: - kv   6:                          rwkv6.block_count u32              = 24\nllama_model_loader: - kv   7:         rwkv6.attention.layer_norm_epsilon f32              = 0.000010\nllama_model_loader: - kv   8:               rwkv6.rescale_every_n_layers u32              = 6\nllama_model_loader: - kv   9:                        rwkv6.wkv.head_size u32              = 64\nllama_model_loader: - kv  10:                   rwkv6.time_mix_extra_dim u32              = 32\nllama_model_loader: - kv  11:                 rwkv6.time_decay_extra_dim u32              = 64\nllama_model_loader: - kv  12:                  rwkv6.feed_forward_length u32              = 7168\nllama_model_loader: - kv  13:                          general.file_type u32              = 1\nllama_model_loader: - kv  14:                 rwkv6.attention.head_count u32              = 0\nllama_model_loader: - kv  15:                       tokenizer.ggml.model str              = rwkv\nllama_model_loader: - kv  16:                      tokenizer.ggml.tokens arr[str,65536]   = [\"<s>\", \"\\\\x00\", \"\\\\x01\", \"\\\\x02\", \"\\...\nllama_model_loader: - kv  17:                  tokenizer.ggml.token_type arr[i32,65536]   = [3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...\nllama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 0\nllama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 0\nllama_model_loader: - kv  20:               general.quantization_version u32              = 2\nllama_model_loader: - type  f32:  484 tensors\nllama_model_loader: - type  f16:  194 tensors\n[   1/ 678]                    token_embd.weight - [ 2048, 65536,     1,     1], type =    f16, converting to q2_K .. 
size =   256.00 MiB ->    42.00 MiB\n[   2/ 678]               blk.0.attn_norm.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[   3/ 678]                 blk.0.attn_norm.bias - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[   4/ 678]             blk.0.attn_norm_2.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[   5/ 678]               blk.0.attn_norm_2.bias - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[   6/ 678]               token_embd_norm.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[   7/ 678]                 token_embd_norm.bias - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[   8/ 678]         blk.0.time_mix_lerp_x.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[   9/ 678]         blk.0.time_mix_lerp_w.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[  10/ 678]         blk.0.time_mix_lerp_k.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[  11/ 678]         blk.0.time_mix_lerp_v.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[  12/ 678]         blk.0.time_mix_lerp_r.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[  13/ 678]         blk.0.time_mix_lerp_g.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[  14/ 678]             blk.0.time_mix_w1.weight - [ 2048,   160,     1,     1], type =    f32, size =    1.250 MB\n[  15/ 678]             blk.0.time_mix_w2.weight - [   32,  2048,     5,     1], type =    f32, size =    1.250 MB\n[  16/ 678]          blk.0.time_mix_decay.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[  17/ 678]       blk.0.time_mix_decay_w1.weight - [ 2048,    64,     1,     1], type =    f32, size =    0.500 MB\n[  18/ 678]       blk.0.time_mix_decay_w2.weight - [   64,  2048,     1,     1], type =    f32, size =    0.500 MB\n[  19/ 678]          blk.0.time_mix_first.weight - [   64,    32,     1,     1], type =    f32, size =    0.008 MB\n[  20/ 678]     blk.0.time_mix_receptance.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[  21/ 678]            blk.0.time_mix_key.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[  22/ 678]          blk.0.time_mix_value.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[  23/ 678]         blk.0.time_mix_output.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[  24/ 678]           blk.0.time_mix_gate.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[  25/ 678]             blk.0.time_mix_ln.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[  26/ 678]               blk.0.time_mix_ln.bias - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[  27/ 678]      blk.0.channel_mix_lerp_k.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[  28/ 678]      blk.0.channel_mix_lerp_r.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[  29/ 678]         blk.0.channel_mix_key.weight - [ 2048,  7168,     1,     1], type =    f16, converting to q2_K .. 
size =    28.00 MiB ->     4.59 MiB\n[  30/ 678]  blk.0.channel_mix_receptance.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[  31/ 678]       blk.0.channel_mix_value.weight - [ 7168,  2048,     1,     1], type =    f16, converting to q2_K .. size =    28.00 MiB ->     4.59 MiB\n[  32/ 678]               blk.1.attn_norm.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[  33/ 678]                 blk.1.attn_norm.bias - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[  34/ 678]             blk.1.attn_norm_2.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[  35/ 678]               blk.1.attn_norm_2.bias - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[  36/ 678]         blk.1.time_mix_lerp_x.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[  37/ 678]         blk.1.time_mix_lerp_w.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[  38/ 678]         blk.1.time_mix_lerp_k.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[  39/ 678]         blk.1.time_mix_lerp_v.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[  40/ 678]         blk.1.time_mix_lerp_r.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[  41/ 678]         blk.1.time_mix_lerp_g.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[  42/ 678]             blk.1.time_mix_w1.weight - [ 2048,   160,     1,     1], type =    f32, size =    1.250 MB\n[  43/ 678]             blk.1.time_mix_w2.weight - [   32,  2048,     5,     1], type =    f32, size =    1.250 MB\n[  44/ 678]          blk.1.time_mix_decay.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[  45/ 678]       blk.1.time_mix_decay_w1.weight - [ 2048,    64,     1,     1], type =    f32, size =    0.500 MB\n[  46/ 678]       blk.1.time_mix_decay_w2.weight - [   64,  2048,     1,     1], type =    f32, size =    0.500 MB\n[  47/ 678]          blk.1.time_mix_first.weight - [   64,    32,     1,     1], type =    f32, size =    0.008 MB\n[  48/ 678]     blk.1.time_mix_receptance.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[  49/ 678]            blk.1.time_mix_key.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[  50/ 678]          blk.1.time_mix_value.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[  51/ 678]         blk.1.time_mix_output.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[  52/ 678]           blk.1.time_mix_gate.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. 
size =     8.00 MiB ->     1.31 MiB\n[  53/ 678]             blk.1.time_mix_ln.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[  54/ 678]               blk.1.time_mix_ln.bias - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[  55/ 678]      blk.1.channel_mix_lerp_k.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[  56/ 678]      blk.1.channel_mix_lerp_r.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[  57/ 678]         blk.1.channel_mix_key.weight - [ 2048,  7168,     1,     1], type =    f16, converting to q2_K .. size =    28.00 MiB ->     4.59 MiB\n[  58/ 678]  blk.1.channel_mix_receptance.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[  59/ 678]       blk.1.channel_mix_value.weight - [ 7168,  2048,     1,     1], type =    f16, converting to q2_K .. size =    28.00 MiB ->     4.59 MiB\n[  60/ 678]               blk.2.attn_norm.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[  61/ 678]                 blk.2.attn_norm.bias - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[  62/ 678]             blk.2.attn_norm_2.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[  63/ 678]               blk.2.attn_norm_2.bias - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[  64/ 678]         blk.2.time_mix_lerp_x.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[  65/ 678]         blk.2.time_mix_lerp_w.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[  66/ 678]         blk.2.time_mix_lerp_k.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[  67/ 678]         blk.2.time_mix_lerp_v.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[  68/ 678]         blk.2.time_mix_lerp_r.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[  69/ 678]         blk.2.time_mix_lerp_g.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[  70/ 678]             blk.2.time_mix_w1.weight - [ 2048,   160,     1,     1], type =    f32, size =    1.250 MB\n[  71/ 678]             blk.2.time_mix_w2.weight - [   32,  2048,     5,     1], type =    f32, size =    1.250 MB\n[  72/ 678]          blk.2.time_mix_decay.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[  73/ 678]       blk.2.time_mix_decay_w1.weight - [ 2048,    64,     1,     1], type =    f32, size =    0.500 MB\n[  74/ 678]       blk.2.time_mix_decay_w2.weight - [   64,  2048,     1,     1], type =    f32, size =    0.500 MB\n[  75/ 678]          blk.2.time_mix_first.weight - [   64,    32,     1,     1], type =    f32, size =    0.008 MB\n[  76/ 678]     blk.2.time_mix_receptance.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[  77/ 678]            blk.2.time_mix_key.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[  78/ 678]          blk.2.time_mix_value.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[  79/ 678]         blk.2.time_mix_output.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. 
size =     8.00 MiB ->     1.31 MiB\n[  80/ 678]           blk.2.time_mix_gate.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[  81/ 678]             blk.2.time_mix_ln.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[  82/ 678]               blk.2.time_mix_ln.bias - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[  83/ 678]      blk.2.channel_mix_lerp_k.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[  84/ 678]      blk.2.channel_mix_lerp_r.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[  85/ 678]         blk.2.channel_mix_key.weight - [ 2048,  7168,     1,     1], type =    f16, converting to q2_K .. size =    28.00 MiB ->     4.59 MiB\n[  86/ 678]  blk.2.channel_mix_receptance.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[  87/ 678]       blk.2.channel_mix_value.weight - [ 7168,  2048,     1,     1], type =    f16, converting to q2_K .. size =    28.00 MiB ->     4.59 MiB\n[  88/ 678]               blk.3.attn_norm.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[  89/ 678]                 blk.3.attn_norm.bias - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[  90/ 678]             blk.3.attn_norm_2.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[  91/ 678]               blk.3.attn_norm_2.bias - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[  92/ 678]         blk.3.time_mix_lerp_x.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[  93/ 678]         blk.3.time_mix_lerp_w.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[  94/ 678]         blk.3.time_mix_lerp_k.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[  95/ 678]         blk.3.time_mix_lerp_v.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[  96/ 678]         blk.3.time_mix_lerp_r.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[  97/ 678]         blk.3.time_mix_lerp_g.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[  98/ 678]             blk.3.time_mix_w1.weight - [ 2048,   160,     1,     1], type =    f32, size =    1.250 MB\n[  99/ 678]             blk.3.time_mix_w2.weight - [   32,  2048,     5,     1], type =    f32, size =    1.250 MB\n[ 100/ 678]          blk.3.time_mix_decay.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 101/ 678]       blk.3.time_mix_decay_w1.weight - [ 2048,    64,     1,     1], type =    f32, size =    0.500 MB\n[ 102/ 678]       blk.3.time_mix_decay_w2.weight - [   64,  2048,     1,     1], type =    f32, size =    0.500 MB\n[ 103/ 678]          blk.3.time_mix_first.weight - [   64,    32,     1,     1], type =    f32, size =    0.008 MB\n[ 104/ 678]     blk.3.time_mix_receptance.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 105/ 678]            blk.3.time_mix_key.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 106/ 678]          blk.3.time_mix_value.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. 
size =     8.00 MiB ->     1.31 MiB\n[ 107/ 678]         blk.3.time_mix_output.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 108/ 678]           blk.3.time_mix_gate.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 109/ 678]             blk.3.time_mix_ln.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 110/ 678]               blk.3.time_mix_ln.bias - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 111/ 678]      blk.3.channel_mix_lerp_k.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 112/ 678]      blk.3.channel_mix_lerp_r.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 113/ 678]         blk.3.channel_mix_key.weight - [ 2048,  7168,     1,     1], type =    f16, converting to q2_K .. size =    28.00 MiB ->     4.59 MiB\n[ 114/ 678]  blk.3.channel_mix_receptance.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 115/ 678]       blk.3.channel_mix_value.weight - [ 7168,  2048,     1,     1], type =    f16, converting to q2_K .. size =    28.00 MiB ->     4.59 MiB\n[ 116/ 678]               blk.4.attn_norm.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 117/ 678]                 blk.4.attn_norm.bias - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 118/ 678]             blk.4.attn_norm_2.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 119/ 678]               blk.4.attn_norm_2.bias - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 120/ 678]         blk.4.time_mix_lerp_x.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 121/ 678]         blk.4.time_mix_lerp_w.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 122/ 678]         blk.4.time_mix_lerp_k.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 123/ 678]         blk.4.time_mix_lerp_v.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 124/ 678]         blk.4.time_mix_lerp_r.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 125/ 678]         blk.4.time_mix_lerp_g.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 126/ 678]             blk.4.time_mix_w1.weight - [ 2048,   160,     1,     1], type =    f32, size =    1.250 MB\n[ 127/ 678]             blk.4.time_mix_w2.weight - [   32,  2048,     5,     1], type =    f32, size =    1.250 MB\n[ 128/ 678]          blk.4.time_mix_decay.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 129/ 678]       blk.4.time_mix_decay_w1.weight - [ 2048,    64,     1,     1], type =    f32, size =    0.500 MB\n[ 130/ 678]       blk.4.time_mix_decay_w2.weight - [   64,  2048,     1,     1], type =    f32, size =    0.500 MB\n[ 131/ 678]          blk.4.time_mix_first.weight - [   64,    32,     1,     1], type =    f32, size =    0.008 MB\n[ 132/ 678]     blk.4.time_mix_receptance.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 133/ 678]            blk.4.time_mix_key.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. 
size =     8.00 MiB ->     1.31 MiB\n[ 134/ 678]          blk.4.time_mix_value.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 135/ 678]         blk.4.time_mix_output.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 136/ 678]           blk.4.time_mix_gate.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 137/ 678]             blk.4.time_mix_ln.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 138/ 678]               blk.4.time_mix_ln.bias - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 139/ 678]      blk.4.channel_mix_lerp_k.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 140/ 678]      blk.4.channel_mix_lerp_r.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 141/ 678]         blk.4.channel_mix_key.weight - [ 2048,  7168,     1,     1], type =    f16, converting to q2_K .. size =    28.00 MiB ->     4.59 MiB\n[ 142/ 678]  blk.4.channel_mix_receptance.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 143/ 678]       blk.4.channel_mix_value.weight - [ 7168,  2048,     1,     1], type =    f16, converting to q2_K .. size =    28.00 MiB ->     4.59 MiB\n[ 144/ 678]               blk.5.attn_norm.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 145/ 678]                 blk.5.attn_norm.bias - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 146/ 678]             blk.5.attn_norm_2.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 147/ 678]               blk.5.attn_norm_2.bias - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 148/ 678]         blk.5.time_mix_lerp_x.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 149/ 678]         blk.5.time_mix_lerp_w.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 150/ 678]         blk.5.time_mix_lerp_k.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 151/ 678]         blk.5.time_mix_lerp_v.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 152/ 678]         blk.5.time_mix_lerp_r.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 153/ 678]         blk.5.time_mix_lerp_g.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 154/ 678]             blk.5.time_mix_w1.weight - [ 2048,   160,     1,     1], type =    f32, size =    1.250 MB\n[ 155/ 678]             blk.5.time_mix_w2.weight - [   32,  2048,     5,     1], type =    f32, size =    1.250 MB\n[ 156/ 678]          blk.5.time_mix_decay.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 157/ 678]       blk.5.time_mix_decay_w1.weight - [ 2048,    64,     1,     1], type =    f32, size =    0.500 MB\n[ 158/ 678]       blk.5.time_mix_decay_w2.weight - [   64,  2048,     1,     1], type =    f32, size =    0.500 MB\n[ 159/ 678]          blk.5.time_mix_first.weight - [   64,    32,     1,     1], type =    f32, size =    0.008 MB\n[ 160/ 678]     blk.5.time_mix_receptance.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. 
size =     8.00 MiB ->     1.31 MiB\n[ 161/ 678]            blk.5.time_mix_key.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 162/ 678]          blk.5.time_mix_value.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 163/ 678]         blk.5.time_mix_output.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 164/ 678]           blk.5.time_mix_gate.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 165/ 678]             blk.5.time_mix_ln.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 166/ 678]               blk.5.time_mix_ln.bias - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 167/ 678]      blk.5.channel_mix_lerp_k.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 168/ 678]      blk.5.channel_mix_lerp_r.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 169/ 678]         blk.5.channel_mix_key.weight - [ 2048,  7168,     1,     1], type =    f16, converting to q2_K .. size =    28.00 MiB ->     4.59 MiB\n[ 170/ 678]  blk.5.channel_mix_receptance.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 171/ 678]       blk.5.channel_mix_value.weight - [ 7168,  2048,     1,     1], type =    f16, converting to q2_K .. size =    28.00 MiB ->     4.59 MiB\n[ 172/ 678]               blk.6.attn_norm.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 173/ 678]                 blk.6.attn_norm.bias - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 174/ 678]             blk.6.attn_norm_2.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 175/ 678]               blk.6.attn_norm_2.bias - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 176/ 678]         blk.6.time_mix_lerp_x.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 177/ 678]         blk.6.time_mix_lerp_w.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 178/ 678]         blk.6.time_mix_lerp_k.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 179/ 678]         blk.6.time_mix_lerp_v.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 180/ 678]         blk.6.time_mix_lerp_r.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 181/ 678]         blk.6.time_mix_lerp_g.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 182/ 678]             blk.6.time_mix_w1.weight - [ 2048,   160,     1,     1], type =    f32, size =    1.250 MB\n[ 183/ 678]             blk.6.time_mix_w2.weight - [   32,  2048,     5,     1], type =    f32, size =    1.250 MB\n[ 184/ 678]          blk.6.time_mix_decay.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 185/ 678]       blk.6.time_mix_decay_w1.weight - [ 2048,    64,     1,     1], type =    f32, size =    0.500 MB\n[ 186/ 678]       blk.6.time_mix_decay_w2.weight - [   64,  2048,     1,     1], type =    f32, size =    0.500 MB\n[ 187/ 678]          blk.6.time_mix_first.weight - [   64,    32,     1,     1], type =    f32, size =    0.008 MB\n[ 188/ 678]     blk.6.time_mix_receptance.weight - [ 2048,  2048,     1,     1], type =    f16, converting to 
q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 189/ 678]            blk.6.time_mix_key.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 190/ 678]          blk.6.time_mix_value.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 191/ 678]         blk.6.time_mix_output.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 192/ 678]           blk.6.time_mix_gate.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 193/ 678]             blk.6.time_mix_ln.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 194/ 678]               blk.6.time_mix_ln.bias - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 195/ 678]      blk.6.channel_mix_lerp_k.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 196/ 678]      blk.6.channel_mix_lerp_r.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 197/ 678]         blk.6.channel_mix_key.weight - [ 2048,  7168,     1,     1], type =    f16, converting to q2_K .. size =    28.00 MiB ->     4.59 MiB\n[ 198/ 678]  blk.6.channel_mix_receptance.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 199/ 678]       blk.6.channel_mix_value.weight - [ 7168,  2048,     1,     1], type =    f16, converting to q2_K .. size =    28.00 MiB ->     4.59 MiB\n[ 200/ 678]               blk.7.attn_norm.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 201/ 678]                 blk.7.attn_norm.bias - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 202/ 678]             blk.7.attn_norm_2.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 203/ 678]               blk.7.attn_norm_2.bias - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 204/ 678]         blk.7.time_mix_lerp_x.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 205/ 678]         blk.7.time_mix_lerp_w.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 206/ 678]         blk.7.time_mix_lerp_k.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 207/ 678]         blk.7.time_mix_lerp_v.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 208/ 678]         blk.7.time_mix_lerp_r.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 209/ 678]         blk.7.time_mix_lerp_g.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 210/ 678]             blk.7.time_mix_w1.weight - [ 2048,   160,     1,     1], type =    f32, size =    1.250 MB\n[ 211/ 678]             blk.7.time_mix_w2.weight - [   32,  2048,     5,     1], type =    f32, size =    1.250 MB\n[ 212/ 678]          blk.7.time_mix_decay.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 213/ 678]       blk.7.time_mix_decay_w1.weight - [ 2048,    64,     1,     1], type =    f32, size =    0.500 MB\n[ 214/ 678]       blk.7.time_mix_decay_w2.weight - [   64,  2048,     1,     1], type =    f32, size =    0.500 MB\n[ 215/ 678]          blk.7.time_mix_first.weight - [   64,    32,     1,     1], type =    f32, size =    0.008 MB\n[ 216/ 678]     blk.7.time_mix_receptance.weight - [ 2048,  2048,     1,     1], type =    f16, 
converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 217/ 678]            blk.7.time_mix_key.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 218/ 678]          blk.7.time_mix_value.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 219/ 678]         blk.7.time_mix_output.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 220/ 678]           blk.7.time_mix_gate.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 221/ 678]             blk.7.time_mix_ln.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 222/ 678]               blk.7.time_mix_ln.bias - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 223/ 678]      blk.7.channel_mix_lerp_k.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 224/ 678]      blk.7.channel_mix_lerp_r.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 225/ 678]         blk.7.channel_mix_key.weight - [ 2048,  7168,     1,     1], type =    f16, converting to q2_K .. size =    28.00 MiB ->     4.59 MiB\n[ 226/ 678]  blk.7.channel_mix_receptance.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 227/ 678]       blk.7.channel_mix_value.weight - [ 7168,  2048,     1,     1], type =    f16, converting to q2_K .. size =    28.00 MiB ->     4.59 MiB\n[ 228/ 678]               blk.8.attn_norm.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 229/ 678]                 blk.8.attn_norm.bias - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 230/ 678]             blk.8.attn_norm_2.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 231/ 678]               blk.8.attn_norm_2.bias - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 232/ 678]         blk.8.time_mix_lerp_x.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 233/ 678]         blk.8.time_mix_lerp_w.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 234/ 678]         blk.8.time_mix_lerp_k.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 235/ 678]         blk.8.time_mix_lerp_v.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 236/ 678]         blk.8.time_mix_lerp_r.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 237/ 678]         blk.8.time_mix_lerp_g.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 238/ 678]             blk.8.time_mix_w1.weight - [ 2048,   160,     1,     1], type =    f32, size =    1.250 MB\n[ 239/ 678]             blk.8.time_mix_w2.weight - [   32,  2048,     5,     1], type =    f32, size =    1.250 MB\n[ 240/ 678]          blk.8.time_mix_decay.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 241/ 678]       blk.8.time_mix_decay_w1.weight - [ 2048,    64,     1,     1], type =    f32, size =    0.500 MB\n[ 242/ 678]       blk.8.time_mix_decay_w2.weight - [   64,  2048,     1,     1], type =    f32, size =    0.500 MB\n[ 243/ 678]          blk.8.time_mix_first.weight - [   64,    32,     1,     1], type =    f32, size =    0.008 MB\n[ 244/ 678]     blk.8.time_mix_receptance.weight - [ 2048,  2048,     1,     1], type =    
f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 245/ 678]            blk.8.time_mix_key.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 246/ 678]          blk.8.time_mix_value.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 247/ 678]         blk.8.time_mix_output.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 248/ 678]           blk.8.time_mix_gate.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 249/ 678]             blk.8.time_mix_ln.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 250/ 678]               blk.8.time_mix_ln.bias - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 251/ 678]      blk.8.channel_mix_lerp_k.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 252/ 678]      blk.8.channel_mix_lerp_r.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 253/ 678]         blk.8.channel_mix_key.weight - [ 2048,  7168,     1,     1], type =    f16, converting to q2_K .. size =    28.00 MiB ->     4.59 MiB\n[ 254/ 678]  blk.8.channel_mix_receptance.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 255/ 678]       blk.8.channel_mix_value.weight - [ 7168,  2048,     1,     1], type =    f16, converting to q2_K .. size =    28.00 MiB ->     4.59 MiB\n[ 256/ 678]               blk.9.attn_norm.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 257/ 678]                 blk.9.attn_norm.bias - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 258/ 678]             blk.9.attn_norm_2.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 259/ 678]               blk.9.attn_norm_2.bias - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 260/ 678]         blk.9.time_mix_lerp_x.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 261/ 678]         blk.9.time_mix_lerp_w.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 262/ 678]         blk.9.time_mix_lerp_k.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 263/ 678]         blk.9.time_mix_lerp_v.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 264/ 678]         blk.9.time_mix_lerp_r.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 265/ 678]         blk.9.time_mix_lerp_g.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 266/ 678]             blk.9.time_mix_w1.weight - [ 2048,   160,     1,     1], type =    f32, size =    1.250 MB\n[ 267/ 678]             blk.9.time_mix_w2.weight - [   32,  2048,     5,     1], type =    f32, size =    1.250 MB\n[ 268/ 678]          blk.9.time_mix_decay.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 269/ 678]       blk.9.time_mix_decay_w1.weight - [ 2048,    64,     1,     1], type =    f32, size =    0.500 MB\n[ 270/ 678]       blk.9.time_mix_decay_w2.weight - [   64,  2048,     1,     1], type =    f32, size =    0.500 MB\n[ 271/ 678]          blk.9.time_mix_first.weight - [   64,    32,     1,     1], type =    f32, size =    0.008 MB\n[ 272/ 678]     blk.9.time_mix_receptance.weight - [ 2048,  2048,     1,     1], type 
=    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 273/ 678]            blk.9.time_mix_key.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 274/ 678]          blk.9.time_mix_value.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 275/ 678]         blk.9.time_mix_output.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 276/ 678]           blk.9.time_mix_gate.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 277/ 678]             blk.9.time_mix_ln.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 278/ 678]               blk.9.time_mix_ln.bias - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 279/ 678]      blk.9.channel_mix_lerp_k.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 280/ 678]      blk.9.channel_mix_lerp_r.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 281/ 678]         blk.9.channel_mix_key.weight - [ 2048,  7168,     1,     1], type =    f16, converting to q2_K .. size =    28.00 MiB ->     4.59 MiB\n[ 282/ 678]  blk.9.channel_mix_receptance.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 283/ 678]       blk.9.channel_mix_value.weight - [ 7168,  2048,     1,     1], type =    f16, converting to q2_K .. size =    28.00 MiB ->     4.59 MiB\n[ 284/ 678]              blk.10.attn_norm.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 285/ 678]                blk.10.attn_norm.bias - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 286/ 678]            blk.10.attn_norm_2.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 287/ 678]              blk.10.attn_norm_2.bias - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 288/ 678]        blk.10.time_mix_lerp_x.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 289/ 678]        blk.10.time_mix_lerp_w.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 290/ 678]        blk.10.time_mix_lerp_k.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 291/ 678]        blk.10.time_mix_lerp_v.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 292/ 678]        blk.10.time_mix_lerp_r.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 293/ 678]        blk.10.time_mix_lerp_g.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 294/ 678]            blk.10.time_mix_w1.weight - [ 2048,   160,     1,     1], type =    f32, size =    1.250 MB\n[ 295/ 678]            blk.10.time_mix_w2.weight - [   32,  2048,     5,     1], type =    f32, size =    1.250 MB\n[ 296/ 678]         blk.10.time_mix_decay.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 297/ 678]      blk.10.time_mix_decay_w1.weight - [ 2048,    64,     1,     1], type =    f32, size =    0.500 MB\n[ 298/ 678]      blk.10.time_mix_decay_w2.weight - [   64,  2048,     1,     1], type =    f32, size =    0.500 MB\n[ 299/ 678]         blk.10.time_mix_first.weight - [   64,    32,     1,     1], type =    f32, size =    0.008 MB\n[ 300/ 678]    blk.10.time_mix_receptance.weight - [ 2048,  2048,     1,     1], 
type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 301/ 678]           blk.10.time_mix_key.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 302/ 678]         blk.10.time_mix_value.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 303/ 678]        blk.10.time_mix_output.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 304/ 678]          blk.10.time_mix_gate.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 305/ 678]            blk.10.time_mix_ln.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 306/ 678]              blk.10.time_mix_ln.bias - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 307/ 678]     blk.10.channel_mix_lerp_k.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 308/ 678]     blk.10.channel_mix_lerp_r.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 309/ 678]        blk.10.channel_mix_key.weight - [ 2048,  7168,     1,     1], type =    f16, converting to q2_K .. size =    28.00 MiB ->     4.59 MiB\n[ 310/ 678] blk.10.channel_mix_receptance.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 311/ 678]      blk.10.channel_mix_value.weight - [ 7168,  2048,     1,     1], type =    f16, converting to q2_K .. size =    28.00 MiB ->     4.59 MiB\n[ 312/ 678]              blk.11.attn_norm.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 313/ 678]                blk.11.attn_norm.bias - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 314/ 678]            blk.11.attn_norm_2.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 315/ 678]              blk.11.attn_norm_2.bias - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 316/ 678]        blk.11.time_mix_lerp_x.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 317/ 678]        blk.11.time_mix_lerp_w.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 318/ 678]        blk.11.time_mix_lerp_k.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 319/ 678]        blk.11.time_mix_lerp_v.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 320/ 678]        blk.11.time_mix_lerp_r.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 321/ 678]        blk.11.time_mix_lerp_g.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 322/ 678]            blk.11.time_mix_w1.weight - [ 2048,   160,     1,     1], type =    f32, size =    1.250 MB\n[ 323/ 678]            blk.11.time_mix_w2.weight - [   32,  2048,     5,     1], type =    f32, size =    1.250 MB\n[ 324/ 678]         blk.11.time_mix_decay.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 325/ 678]      blk.11.time_mix_decay_w1.weight - [ 2048,    64,     1,     1], type =    f32, size =    0.500 MB\n[ 326/ 678]      blk.11.time_mix_decay_w2.weight - [   64,  2048,     1,     1], type =    f32, size =    0.500 MB\n[ 327/ 678]         blk.11.time_mix_first.weight - [   64,    32,     1,     1], type =    f32, size =    0.008 MB\n[ 328/ 678]    blk.11.time_mix_receptance.weight - [ 2048,  2048,     1,    
 1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 329/ 678]           blk.11.time_mix_key.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 330/ 678]         blk.11.time_mix_value.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 331/ 678]        blk.11.time_mix_output.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 332/ 678]          blk.11.time_mix_gate.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 333/ 678]            blk.11.time_mix_ln.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 334/ 678]              blk.11.time_mix_ln.bias - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 335/ 678]     blk.11.channel_mix_lerp_k.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 336/ 678]     blk.11.channel_mix_lerp_r.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 337/ 678]        blk.11.channel_mix_key.weight - [ 2048,  7168,     1,     1], type =    f16, converting to q2_K .. size =    28.00 MiB ->     4.59 MiB\n[ 338/ 678] blk.11.channel_mix_receptance.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 339/ 678]      blk.11.channel_mix_value.weight - [ 7168,  2048,     1,     1], type =    f16, converting to q2_K .. size =    28.00 MiB ->     4.59 MiB\n[ 340/ 678]              blk.12.attn_norm.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 341/ 678]                blk.12.attn_norm.bias - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 342/ 678]            blk.12.attn_norm_2.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 343/ 678]              blk.12.attn_norm_2.bias - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 344/ 678]        blk.12.time_mix_lerp_x.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 345/ 678]        blk.12.time_mix_lerp_w.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 346/ 678]        blk.12.time_mix_lerp_k.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 347/ 678]        blk.12.time_mix_lerp_v.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 348/ 678]        blk.12.time_mix_lerp_r.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 349/ 678]        blk.12.time_mix_lerp_g.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 350/ 678]            blk.12.time_mix_w1.weight - [ 2048,   160,     1,     1], type =    f32, size =    1.250 MB\n[ 351/ 678]            blk.12.time_mix_w2.weight - [   32,  2048,     5,     1], type =    f32, size =    1.250 MB\n[ 352/ 678]         blk.12.time_mix_decay.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 353/ 678]      blk.12.time_mix_decay_w1.weight - [ 2048,    64,     1,     1], type =    f32, size =    0.500 MB\n[ 354/ 678]      blk.12.time_mix_decay_w2.weight - [   64,  2048,     1,     1], type =    f32, size =    0.500 MB\n[ 355/ 678]         blk.12.time_mix_first.weight - [   64,    32,     1,     1], type =    f32, size =    0.008 MB\n[ 356/ 678]    blk.12.time_mix_receptance.weight - [ 2048,  2048,     
1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 357/ 678]           blk.12.time_mix_key.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 358/ 678]         blk.12.time_mix_value.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 359/ 678]        blk.12.time_mix_output.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 360/ 678]          blk.12.time_mix_gate.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 361/ 678]            blk.12.time_mix_ln.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 362/ 678]              blk.12.time_mix_ln.bias - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 363/ 678]     blk.12.channel_mix_lerp_k.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 364/ 678]     blk.12.channel_mix_lerp_r.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 365/ 678]        blk.12.channel_mix_key.weight - [ 2048,  7168,     1,     1], type =    f16, converting to q2_K .. size =    28.00 MiB ->     4.59 MiB\n[ 366/ 678] blk.12.channel_mix_receptance.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 367/ 678]      blk.12.channel_mix_value.weight - [ 7168,  2048,     1,     1], type =    f16, converting to q2_K .. size =    28.00 MiB ->     4.59 MiB\n[ 368/ 678]              blk.13.attn_norm.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 369/ 678]                blk.13.attn_norm.bias - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 370/ 678]            blk.13.attn_norm_2.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 371/ 678]              blk.13.attn_norm_2.bias - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 372/ 678]        blk.13.time_mix_lerp_x.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 373/ 678]        blk.13.time_mix_lerp_w.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 374/ 678]        blk.13.time_mix_lerp_k.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 375/ 678]        blk.13.time_mix_lerp_v.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 376/ 678]        blk.13.time_mix_lerp_r.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 377/ 678]        blk.13.time_mix_lerp_g.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 378/ 678]            blk.13.time_mix_w1.weight - [ 2048,   160,     1,     1], type =    f32, size =    1.250 MB\n[ 379/ 678]            blk.13.time_mix_w2.weight - [   32,  2048,     5,     1], type =    f32, size =    1.250 MB\n[ 380/ 678]         blk.13.time_mix_decay.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 381/ 678]      blk.13.time_mix_decay_w1.weight - [ 2048,    64,     1,     1], type =    f32, size =    0.500 MB\n[ 382/ 678]      blk.13.time_mix_decay_w2.weight - [   64,  2048,     1,     1], type =    f32, size =    0.500 MB\n[ 383/ 678]         blk.13.time_mix_first.weight - [   64,    32,     1,     1], type =    f32, size =    0.008 MB\n[ 384/ 678]    blk.13.time_mix_receptance.weight - [ 2048,  
2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 385/ 678]           blk.13.time_mix_key.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 386/ 678]         blk.13.time_mix_value.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 387/ 678]        blk.13.time_mix_output.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 388/ 678]          blk.13.time_mix_gate.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 389/ 678]            blk.13.time_mix_ln.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 390/ 678]              blk.13.time_mix_ln.bias - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 391/ 678]     blk.13.channel_mix_lerp_k.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 392/ 678]     blk.13.channel_mix_lerp_r.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 393/ 678]        blk.13.channel_mix_key.weight - [ 2048,  7168,     1,     1], type =    f16, converting to q2_K .. size =    28.00 MiB ->     4.59 MiB\n[ 394/ 678] blk.13.channel_mix_receptance.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 395/ 678]      blk.13.channel_mix_value.weight - [ 7168,  2048,     1,     1], type =    f16, converting to q2_K .. size =    28.00 MiB ->     4.59 MiB\n[ 396/ 678]              blk.14.attn_norm.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 397/ 678]                blk.14.attn_norm.bias - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 398/ 678]            blk.14.attn_norm_2.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 399/ 678]              blk.14.attn_norm_2.bias - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 400/ 678]        blk.14.time_mix_lerp_x.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 401/ 678]        blk.14.time_mix_lerp_w.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 402/ 678]        blk.14.time_mix_lerp_k.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 403/ 678]        blk.14.time_mix_lerp_v.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 404/ 678]        blk.14.time_mix_lerp_r.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 405/ 678]        blk.14.time_mix_lerp_g.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 406/ 678]            blk.14.time_mix_w1.weight - [ 2048,   160,     1,     1], type =    f32, size =    1.250 MB\n[ 407/ 678]            blk.14.time_mix_w2.weight - [   32,  2048,     5,     1], type =    f32, size =    1.250 MB\n[ 408/ 678]         blk.14.time_mix_decay.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 409/ 678]      blk.14.time_mix_decay_w1.weight - [ 2048,    64,     1,     1], type =    f32, size =    0.500 MB\n[ 410/ 678]      blk.14.time_mix_decay_w2.weight - [   64,  2048,     1,     1], type =    f32, size =    0.500 MB\n[ 411/ 678]         blk.14.time_mix_first.weight - [   64,    32,     1,     1], type =    f32, size =    0.008 MB\n[ 412/ 678]    blk.14.time_mix_receptance.weight - [ 
2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 413/ 678]           blk.14.time_mix_key.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 414/ 678]         blk.14.time_mix_value.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 415/ 678]        blk.14.time_mix_output.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 416/ 678]          blk.14.time_mix_gate.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 417/ 678]            blk.14.time_mix_ln.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 418/ 678]              blk.14.time_mix_ln.bias - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 419/ 678]     blk.14.channel_mix_lerp_k.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 420/ 678]     blk.14.channel_mix_lerp_r.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 421/ 678]        blk.14.channel_mix_key.weight - [ 2048,  7168,     1,     1], type =    f16, converting to q2_K .. size =    28.00 MiB ->     4.59 MiB\n[ 422/ 678] blk.14.channel_mix_receptance.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 423/ 678]      blk.14.channel_mix_value.weight - [ 7168,  2048,     1,     1], type =    f16, converting to q2_K .. size =    28.00 MiB ->     4.59 MiB\n[ 424/ 678]              blk.15.attn_norm.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 425/ 678]                blk.15.attn_norm.bias - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 426/ 678]            blk.15.attn_norm_2.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 427/ 678]              blk.15.attn_norm_2.bias - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 428/ 678]        blk.15.time_mix_lerp_x.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 429/ 678]        blk.15.time_mix_lerp_w.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 430/ 678]        blk.15.time_mix_lerp_k.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 431/ 678]        blk.15.time_mix_lerp_v.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 432/ 678]        blk.15.time_mix_lerp_r.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 433/ 678]        blk.15.time_mix_lerp_g.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 434/ 678]            blk.15.time_mix_w1.weight - [ 2048,   160,     1,     1], type =    f32, size =    1.250 MB\n[ 435/ 678]            blk.15.time_mix_w2.weight - [   32,  2048,     5,     1], type =    f32, size =    1.250 MB\n[ 436/ 678]         blk.15.time_mix_decay.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 437/ 678]      blk.15.time_mix_decay_w1.weight - [ 2048,    64,     1,     1], type =    f32, size =    0.500 MB\n[ 438/ 678]      blk.15.time_mix_decay_w2.weight - [   64,  2048,     1,     1], type =    f32, size =    0.500 MB\n[ 439/ 678]         blk.15.time_mix_first.weight - [   64,    32,     1,     1], type =    f32, size =    0.008 MB\n[ 440/ 678]    
blk.15.time_mix_receptance.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 441/ 678]           blk.15.time_mix_key.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 442/ 678]         blk.15.time_mix_value.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 443/ 678]        blk.15.time_mix_output.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 444/ 678]          blk.15.time_mix_gate.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 445/ 678]            blk.15.time_mix_ln.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 446/ 678]              blk.15.time_mix_ln.bias - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 447/ 678]     blk.15.channel_mix_lerp_k.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 448/ 678]     blk.15.channel_mix_lerp_r.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 449/ 678]        blk.15.channel_mix_key.weight - [ 2048,  7168,     1,     1], type =    f16, converting to q2_K .. size =    28.00 MiB ->     4.59 MiB\n[ 450/ 678] blk.15.channel_mix_receptance.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 451/ 678]      blk.15.channel_mix_value.weight - [ 7168,  2048,     1,     1], type =    f16, converting to q2_K .. size =    28.00 MiB ->     4.59 MiB\n[ 452/ 678]              blk.16.attn_norm.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 453/ 678]                blk.16.attn_norm.bias - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 454/ 678]            blk.16.attn_norm_2.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 455/ 678]              blk.16.attn_norm_2.bias - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 456/ 678]        blk.16.time_mix_lerp_x.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 457/ 678]        blk.16.time_mix_lerp_w.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 458/ 678]        blk.16.time_mix_lerp_k.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 459/ 678]        blk.16.time_mix_lerp_v.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 460/ 678]        blk.16.time_mix_lerp_r.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 461/ 678]        blk.16.time_mix_lerp_g.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 462/ 678]            blk.16.time_mix_w1.weight - [ 2048,   160,     1,     1], type =    f32, size =    1.250 MB\n[ 463/ 678]            blk.16.time_mix_w2.weight - [   32,  2048,     5,     1], type =    f32, size =    1.250 MB\n[ 464/ 678]         blk.16.time_mix_decay.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 465/ 678]      blk.16.time_mix_decay_w1.weight - [ 2048,    64,     1,     1], type =    f32, size =    0.500 MB\n[ 466/ 678]      blk.16.time_mix_decay_w2.weight - [   64,  2048,     1,     1], type =    f32, size =    0.500 MB\n[ 467/ 678]         blk.16.time_mix_first.weight - [   64,    32,     1,     1], type =    f32, size =    0.008 MB\n[ 468/ 
678]    blk.16.time_mix_receptance.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 469/ 678]           blk.16.time_mix_key.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 470/ 678]         blk.16.time_mix_value.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 471/ 678]        blk.16.time_mix_output.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 472/ 678]          blk.16.time_mix_gate.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 473/ 678]            blk.16.time_mix_ln.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 474/ 678]              blk.16.time_mix_ln.bias - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 475/ 678]     blk.16.channel_mix_lerp_k.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 476/ 678]     blk.16.channel_mix_lerp_r.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 477/ 678]        blk.16.channel_mix_key.weight - [ 2048,  7168,     1,     1], type =    f16, converting to q2_K .. size =    28.00 MiB ->     4.59 MiB\n[ 478/ 678] blk.16.channel_mix_receptance.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 479/ 678]      blk.16.channel_mix_value.weight - [ 7168,  2048,     1,     1], type =    f16, converting to q2_K .. size =    28.00 MiB ->     4.59 MiB\n[ 480/ 678]              blk.17.attn_norm.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 481/ 678]                blk.17.attn_norm.bias - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 482/ 678]            blk.17.attn_norm_2.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 483/ 678]              blk.17.attn_norm_2.bias - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 484/ 678]        blk.17.time_mix_lerp_x.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 485/ 678]        blk.17.time_mix_lerp_w.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 486/ 678]        blk.17.time_mix_lerp_k.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 487/ 678]        blk.17.time_mix_lerp_v.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 488/ 678]        blk.17.time_mix_lerp_r.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 489/ 678]        blk.17.time_mix_lerp_g.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 490/ 678]            blk.17.time_mix_w1.weight - [ 2048,   160,     1,     1], type =    f32, size =    1.250 MB\n[ 491/ 678]            blk.17.time_mix_w2.weight - [   32,  2048,     5,     1], type =    f32, size =    1.250 MB\n[ 492/ 678]         blk.17.time_mix_decay.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 493/ 678]      blk.17.time_mix_decay_w1.weight - [ 2048,    64,     1,     1], type =    f32, size =    0.500 MB\n[ 494/ 678]      blk.17.time_mix_decay_w2.weight - [   64,  2048,     1,     1], type =    f32, size =    0.500 MB\n[ 495/ 678]         blk.17.time_mix_first.weight - [   64,    32,     1,     1], type =    f32, size =    0.008 MB\n[ 
496/ 678]    blk.17.time_mix_receptance.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 497/ 678]           blk.17.time_mix_key.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 498/ 678]         blk.17.time_mix_value.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 499/ 678]        blk.17.time_mix_output.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 500/ 678]          blk.17.time_mix_gate.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 501/ 678]            blk.17.time_mix_ln.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 502/ 678]              blk.17.time_mix_ln.bias - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 503/ 678]     blk.17.channel_mix_lerp_k.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 504/ 678]     blk.17.channel_mix_lerp_r.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 505/ 678]        blk.17.channel_mix_key.weight - [ 2048,  7168,     1,     1], type =    f16, converting to q2_K .. size =    28.00 MiB ->     4.59 MiB\n[ 506/ 678] blk.17.channel_mix_receptance.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 507/ 678]      blk.17.channel_mix_value.weight - [ 7168,  2048,     1,     1], type =    f16, converting to q2_K .. size =    28.00 MiB ->     4.59 MiB\n[ 508/ 678]              blk.18.attn_norm.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 509/ 678]                blk.18.attn_norm.bias - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 510/ 678]            blk.18.attn_norm_2.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 511/ 678]              blk.18.attn_norm_2.bias - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 512/ 678]        blk.18.time_mix_lerp_x.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 513/ 678]        blk.18.time_mix_lerp_w.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 514/ 678]        blk.18.time_mix_lerp_k.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 515/ 678]        blk.18.time_mix_lerp_v.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 516/ 678]        blk.18.time_mix_lerp_r.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 517/ 678]        blk.18.time_mix_lerp_g.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 518/ 678]            blk.18.time_mix_w1.weight - [ 2048,   160,     1,     1], type =    f32, size =    1.250 MB\n[ 519/ 678]            blk.18.time_mix_w2.weight - [   32,  2048,     5,     1], type =    f32, size =    1.250 MB\n[ 520/ 678]         blk.18.time_mix_decay.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 521/ 678]      blk.18.time_mix_decay_w1.weight - [ 2048,    64,     1,     1], type =    f32, size =    0.500 MB\n[ 522/ 678]      blk.18.time_mix_decay_w2.weight - [   64,  2048,     1,     1], type =    f32, size =    0.500 MB\n[ 523/ 678]         blk.18.time_mix_first.weight - [   64,    32,     1,     1], type =    f32, size =    0.008 
MB\n[ 524/ 678]    blk.18.time_mix_receptance.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 525/ 678]           blk.18.time_mix_key.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 526/ 678]         blk.18.time_mix_value.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 527/ 678]        blk.18.time_mix_output.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 528/ 678]          blk.18.time_mix_gate.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 529/ 678]            blk.18.time_mix_ln.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 530/ 678]              blk.18.time_mix_ln.bias - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 531/ 678]     blk.18.channel_mix_lerp_k.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 532/ 678]     blk.18.channel_mix_lerp_r.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 533/ 678]        blk.18.channel_mix_key.weight - [ 2048,  7168,     1,     1], type =    f16, converting to q2_K .. size =    28.00 MiB ->     4.59 MiB\n[ 534/ 678] blk.18.channel_mix_receptance.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 535/ 678]      blk.18.channel_mix_value.weight - [ 7168,  2048,     1,     1], type =    f16, converting to q2_K .. size =    28.00 MiB ->     4.59 MiB\n[ 536/ 678]              blk.19.attn_norm.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 537/ 678]                blk.19.attn_norm.bias - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 538/ 678]            blk.19.attn_norm_2.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 539/ 678]              blk.19.attn_norm_2.bias - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 540/ 678]        blk.19.time_mix_lerp_x.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 541/ 678]        blk.19.time_mix_lerp_w.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 542/ 678]        blk.19.time_mix_lerp_k.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 543/ 678]        blk.19.time_mix_lerp_v.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 544/ 678]        blk.19.time_mix_lerp_r.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 545/ 678]        blk.19.time_mix_lerp_g.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 546/ 678]            blk.19.time_mix_w1.weight - [ 2048,   160,     1,     1], type =    f32, size =    1.250 MB\n[ 547/ 678]            blk.19.time_mix_w2.weight - [   32,  2048,     5,     1], type =    f32, size =    1.250 MB\n[ 548/ 678]         blk.19.time_mix_decay.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 549/ 678]      blk.19.time_mix_decay_w1.weight - [ 2048,    64,     1,     1], type =    f32, size =    0.500 MB\n[ 550/ 678]      blk.19.time_mix_decay_w2.weight - [   64,  2048,     1,     1], type =    f32, size =    0.500 MB\n[ 551/ 678]         blk.19.time_mix_first.weight - [   64,    32,     1,     1], type =    f32, size =    
0.008 MB\n[ 552/ 678]    blk.19.time_mix_receptance.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 553/ 678]           blk.19.time_mix_key.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 554/ 678]         blk.19.time_mix_value.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 555/ 678]        blk.19.time_mix_output.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 556/ 678]          blk.19.time_mix_gate.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 557/ 678]            blk.19.time_mix_ln.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 558/ 678]              blk.19.time_mix_ln.bias - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 559/ 678]     blk.19.channel_mix_lerp_k.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 560/ 678]     blk.19.channel_mix_lerp_r.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 561/ 678]        blk.19.channel_mix_key.weight - [ 2048,  7168,     1,     1], type =    f16, converting to q2_K .. size =    28.00 MiB ->     4.59 MiB\n[ 562/ 678] blk.19.channel_mix_receptance.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 563/ 678]      blk.19.channel_mix_value.weight - [ 7168,  2048,     1,     1], type =    f16, converting to q2_K .. size =    28.00 MiB ->     4.59 MiB\n[ 564/ 678]              blk.20.attn_norm.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 565/ 678]                blk.20.attn_norm.bias - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 566/ 678]            blk.20.attn_norm_2.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 567/ 678]              blk.20.attn_norm_2.bias - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 568/ 678]        blk.20.time_mix_lerp_x.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 569/ 678]        blk.20.time_mix_lerp_w.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 570/ 678]        blk.20.time_mix_lerp_k.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 571/ 678]        blk.20.time_mix_lerp_v.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 572/ 678]        blk.20.time_mix_lerp_r.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 573/ 678]        blk.20.time_mix_lerp_g.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 574/ 678]            blk.20.time_mix_w1.weight - [ 2048,   160,     1,     1], type =    f32, size =    1.250 MB\n[ 575/ 678]            blk.20.time_mix_w2.weight - [   32,  2048,     5,     1], type =    f32, size =    1.250 MB\n[ 576/ 678]         blk.20.time_mix_decay.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 577/ 678]      blk.20.time_mix_decay_w1.weight - [ 2048,    64,     1,     1], type =    f32, size =    0.500 MB\n[ 578/ 678]      blk.20.time_mix_decay_w2.weight - [   64,  2048,     1,     1], type =    f32, size =    0.500 MB\n[ 579/ 678]         blk.20.time_mix_first.weight - [   64,    32,     1,     1], type =    f32, size 
=    0.008 MB\n[ 580/ 678]    blk.20.time_mix_receptance.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 581/ 678]           blk.20.time_mix_key.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 582/ 678]         blk.20.time_mix_value.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 583/ 678]        blk.20.time_mix_output.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 584/ 678]          blk.20.time_mix_gate.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 585/ 678]            blk.20.time_mix_ln.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 586/ 678]              blk.20.time_mix_ln.bias - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 587/ 678]     blk.20.channel_mix_lerp_k.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 588/ 678]     blk.20.channel_mix_lerp_r.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 589/ 678]        blk.20.channel_mix_key.weight - [ 2048,  7168,     1,     1], type =    f16, converting to q2_K .. size =    28.00 MiB ->     4.59 MiB\n[ 590/ 678] blk.20.channel_mix_receptance.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 591/ 678]      blk.20.channel_mix_value.weight - [ 7168,  2048,     1,     1], type =    f16, converting to q2_K .. size =    28.00 MiB ->     4.59 MiB\n[ 592/ 678]              blk.21.attn_norm.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 593/ 678]                blk.21.attn_norm.bias - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 594/ 678]            blk.21.attn_norm_2.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 595/ 678]              blk.21.attn_norm_2.bias - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 596/ 678]        blk.21.time_mix_lerp_x.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 597/ 678]        blk.21.time_mix_lerp_w.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 598/ 678]        blk.21.time_mix_lerp_k.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 599/ 678]        blk.21.time_mix_lerp_v.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 600/ 678]        blk.21.time_mix_lerp_r.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 601/ 678]        blk.21.time_mix_lerp_g.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 602/ 678]            blk.21.time_mix_w1.weight - [ 2048,   160,     1,     1], type =    f32, size =    1.250 MB\n[ 603/ 678]            blk.21.time_mix_w2.weight - [   32,  2048,     5,     1], type =    f32, size =    1.250 MB\n[ 604/ 678]         blk.21.time_mix_decay.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 605/ 678]      blk.21.time_mix_decay_w1.weight - [ 2048,    64,     1,     1], type =    f32, size =    0.500 MB\n[ 606/ 678]      blk.21.time_mix_decay_w2.weight - [   64,  2048,     1,     1], type =    f32, size =    0.500 MB\n[ 607/ 678]         blk.21.time_mix_first.weight - [   64,    32,     1,     1], type =    f32, 
size =    0.008 MB\n[ 608/ 678]    blk.21.time_mix_receptance.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 609/ 678]           blk.21.time_mix_key.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 610/ 678]         blk.21.time_mix_value.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 611/ 678]        blk.21.time_mix_output.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 612/ 678]          blk.21.time_mix_gate.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 613/ 678]            blk.21.time_mix_ln.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 614/ 678]              blk.21.time_mix_ln.bias - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 615/ 678]     blk.21.channel_mix_lerp_k.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 616/ 678]     blk.21.channel_mix_lerp_r.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 617/ 678]        blk.21.channel_mix_key.weight - [ 2048,  7168,     1,     1], type =    f16, converting to q2_K .. size =    28.00 MiB ->     4.59 MiB\n[ 618/ 678] blk.21.channel_mix_receptance.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 619/ 678]      blk.21.channel_mix_value.weight - [ 7168,  2048,     1,     1], type =    f16, converting to q2_K .. size =    28.00 MiB ->     4.59 MiB\n[ 620/ 678]              blk.22.attn_norm.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 621/ 678]                blk.22.attn_norm.bias - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 622/ 678]            blk.22.attn_norm_2.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 623/ 678]              blk.22.attn_norm_2.bias - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 624/ 678]        blk.22.time_mix_lerp_x.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 625/ 678]        blk.22.time_mix_lerp_w.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 626/ 678]        blk.22.time_mix_lerp_k.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 627/ 678]        blk.22.time_mix_lerp_v.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 628/ 678]        blk.22.time_mix_lerp_r.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 629/ 678]        blk.22.time_mix_lerp_g.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 630/ 678]            blk.22.time_mix_w1.weight - [ 2048,   160,     1,     1], type =    f32, size =    1.250 MB\n[ 631/ 678]            blk.22.time_mix_w2.weight - [   32,  2048,     5,     1], type =    f32, size =    1.250 MB\n[ 632/ 678]         blk.22.time_mix_decay.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 633/ 678]      blk.22.time_mix_decay_w1.weight - [ 2048,    64,     1,     1], type =    f32, size =    0.500 MB\n[ 634/ 678]      blk.22.time_mix_decay_w2.weight - [   64,  2048,     1,     1], type =    f32, size =    0.500 MB\n[ 635/ 678]         blk.22.time_mix_first.weight - [   64,    32,     1,     1], type =    
f32, size =    0.008 MB\n[ 636/ 678]    blk.22.time_mix_receptance.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 637/ 678]           blk.22.time_mix_key.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 638/ 678]         blk.22.time_mix_value.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 639/ 678]        blk.22.time_mix_output.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 640/ 678]          blk.22.time_mix_gate.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 641/ 678]            blk.22.time_mix_ln.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 642/ 678]              blk.22.time_mix_ln.bias - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 643/ 678]     blk.22.channel_mix_lerp_k.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 644/ 678]     blk.22.channel_mix_lerp_r.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 645/ 678]        blk.22.channel_mix_key.weight - [ 2048,  7168,     1,     1], type =    f16, converting to q2_K .. size =    28.00 MiB ->     4.59 MiB\n[ 646/ 678] blk.22.channel_mix_receptance.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 647/ 678]      blk.22.channel_mix_value.weight - [ 7168,  2048,     1,     1], type =    f16, converting to q2_K .. size =    28.00 MiB ->     4.59 MiB\n[ 648/ 678]              blk.23.attn_norm.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 649/ 678]                blk.23.attn_norm.bias - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 650/ 678]            blk.23.attn_norm_2.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 651/ 678]              blk.23.attn_norm_2.bias - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 652/ 678]        blk.23.time_mix_lerp_x.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 653/ 678]        blk.23.time_mix_lerp_w.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 654/ 678]        blk.23.time_mix_lerp_k.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 655/ 678]        blk.23.time_mix_lerp_v.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 656/ 678]        blk.23.time_mix_lerp_r.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 657/ 678]        blk.23.time_mix_lerp_g.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 658/ 678]            blk.23.time_mix_w1.weight - [ 2048,   160,     1,     1], type =    f32, size =    1.250 MB\n[ 659/ 678]            blk.23.time_mix_w2.weight - [   32,  2048,     5,     1], type =    f32, size =    1.250 MB\n[ 660/ 678]         blk.23.time_mix_decay.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 661/ 678]      blk.23.time_mix_decay_w1.weight - [ 2048,    64,     1,     1], type =    f32, size =    0.500 MB\n[ 662/ 678]      blk.23.time_mix_decay_w2.weight - [   64,  2048,     1,     1], type =    f32, size =    0.500 MB\n[ 663/ 678]         blk.23.time_mix_first.weight - [   64,    32,     1,     1], type 
=    f32, size =    0.008 MB\n[ 664/ 678]    blk.23.time_mix_receptance.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 665/ 678]           blk.23.time_mix_key.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 666/ 678]         blk.23.time_mix_value.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 667/ 678]        blk.23.time_mix_output.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 668/ 678]          blk.23.time_mix_gate.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 669/ 678]            blk.23.time_mix_ln.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 670/ 678]              blk.23.time_mix_ln.bias - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 671/ 678]     blk.23.channel_mix_lerp_k.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 672/ 678]     blk.23.channel_mix_lerp_r.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 673/ 678]        blk.23.channel_mix_key.weight - [ 2048,  7168,     1,     1], type =    f16, converting to q2_K .. size =    28.00 MiB ->     4.59 MiB\n[ 674/ 678] blk.23.channel_mix_receptance.weight - [ 2048,  2048,     1,     1], type =    f16, converting to q2_K .. size =     8.00 MiB ->     1.31 MiB\n[ 675/ 678]      blk.23.channel_mix_value.weight - [ 7168,  2048,     1,     1], type =    f16, converting to q2_K .. size =    28.00 MiB ->     4.59 MiB\n[ 676/ 678]                   output_norm.weight - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 677/ 678]                     output_norm.bias - [ 2048,     1,     1,     1], type =    f32, size =    0.008 MB\n[ 678/ 678]                        output.weight - [ 2048, 65536,     1,     1], type =    f16, converting to q6_K .. size =   256.00 MiB ->   105.00 MiB\nllama_model_quantize_internal: model size  =  3095.03 MB\nllama_model_quantize_internal: quant size  =   643.53 MB\n\nmain: quantize time = 100331.91 ms\nmain:    total time = 100331.91 ms\n","output_type":"stream"}]},{"cell_type":"code","source":"#!./llama-quantize","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"vdAKSgnNS0-e","executionInfo":{"status":"ok","timestamp":1725256398962,"user_tz":-60,"elapsed":275,"user":{"displayName":"Lyte","userId":"00368277356076556155"}},"outputId":"f35500d3-68fd-45e3-fdc1-19335d6c3a6b","scrolled":true},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":"usage: ./llama-quantize [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights] [--output-tensor-type] [--token-embedding-type] [--override-kv] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n\n\n  --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n\n  --leave-output-tensor: Will leave output.weight un(re)quantized. 
Increases model size but may also increase quality, especially when requantizing\n\n  --pure: Disable k-quant mixtures and quantize all tensors to the same type\n\n  --imatrix file_name: use data in file_name as importance matrix for quant optimizations\n\n  --include-weights tensor_name: use importance matrix for this/these tensor(s)\n\n  --exclude-weights tensor_name: use importance matrix for this/these tensor(s)\n\n  --output-tensor-type ggml_type: use this ggml_type for the output.weight tensor\n\n  --token-embedding-type ggml_type: use this ggml_type for the token embeddings tensor\n\n  --keep-split: will generate quantized model in the same shards as input\n\n  --override-kv KEY=TYPE:VALUE\n\n      Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n\nNote: --include-weights and --exclude-weights cannot be used together\n\n\n\nAllowed quantization types:\n\n   2  or  Q4_0    :  4.34G, +0.4685 ppl @ Llama-3-8B\n\n   3  or  Q4_1    :  4.78G, +0.4511 ppl @ Llama-3-8B\n\n   8  or  Q5_0    :  5.21G, +0.1316 ppl @ Llama-3-8B\n\n   9  or  Q5_1    :  5.65G, +0.1062 ppl @ Llama-3-8B\n\n  19  or  IQ2_XXS :  2.06 bpw quantization\n\n  20  or  IQ2_XS  :  2.31 bpw quantization\n\n  28  or  IQ2_S   :  2.5  bpw quantization\n\n  29  or  IQ2_M   :  2.7  bpw quantization\n\n  24  or  IQ1_S   :  1.56 bpw quantization\n\n  31  or  IQ1_M   :  1.75 bpw quantization\n\n  10  or  Q2_K    :  2.96G, +3.5199 ppl @ Llama-3-8B\n\n  21  or  Q2_K_S  :  2.96G, +3.1836 ppl @ Llama-3-8B\n\n  23  or  IQ3_XXS :  3.06 bpw quantization\n\n  26  or  IQ3_S   :  3.44 bpw quantization\n\n  27  or  IQ3_M   :  3.66 bpw quantization mix\n\n  12  or  Q3_K    : alias for Q3_K_M\n\n  22  or  IQ3_XS  :  3.3 bpw quantization\n\n  11  or  Q3_K_S  :  3.41G, +1.6321 ppl @ Llama-3-8B\n\n  12  or  Q3_K_M  :  3.74G, +0.6569 ppl @ Llama-3-8B\n\n  13  or  Q3_K_L  :  4.03G, +0.5562 ppl @ Llama-3-8B\n\n  25  or  IQ4_NL  :  4.50 bpw non-linear quantization\n\n  30  or  IQ4_XS  :  4.25 bpw non-linear quantization\n\n  15  or  Q4_K    : alias for Q4_K_M\n\n  14  or  Q4_K_S  :  4.37G, +0.2689 ppl @ Llama-3-8B\n\n  15  or  Q4_K_M  :  4.58G, +0.1754 ppl @ Llama-3-8B\n\n  17  or  Q5_K    : alias for Q5_K_M\n\n  16  or  Q5_K_S  :  5.21G, +0.1049 ppl @ Llama-3-8B\n\n  17  or  Q5_K_M  :  5.33G, +0.0569 ppl @ Llama-3-8B\n\n  18  or  Q6_K    :  6.14G, +0.0217 ppl @ Llama-3-8B\n\n   7  or  Q8_0    :  7.96G, +0.0026 ppl @ Llama-3-8B\n\n  33  or  Q4_0_4_4 :  4.34G, +0.4685 ppl @ Llama-3-8B\n\n  34  or  Q4_0_4_8 :  4.34G, +0.4685 ppl @ Llama-3-8B\n\n  35  or  Q4_0_8_8 :  4.34G, +0.4685 ppl @ Llama-3-8B\n\n   1  or  F16     : 14.00G, +0.0020 ppl @ Mistral-7B\n\n  32  or  BF16    : 14.00G, -0.0050 ppl @ Mistral-7B\n\n   0  or  F32     : 26.00G              @ 7B\n\n          COPY    : only copy tensors, no quantizing\n"}]},{"cell_type":"code","source":"#create_repo(\"Lyte/RWKV-6-World-3B-v2.1-GGUF\", token=HF_TOKEN)\n\nupload_file(\n    path_or_fileobj=\"./model/RWKV-6-World-1.6B-GGUF-Q2_K.gguf\",\n    path_in_repo=\"RWKV-6-World-1.6B-GGUF-Q2_K.gguf\",\n    repo_id=\"Lyte/RWKV-6-World-1.6B-GGUF\",\n    repo_type=\"model\",\n    
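# HF_TOKEN needs write access to the target repo (descriptive comment, assumed splice point inside the upload_file call)\n    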
token=HF_TOKEN,\n)","metadata":{"id":"gEq9apcVEgAJ","execution":{"iopub.status.busy":"2024-09-16T16:26:43.503822Z","iopub.execute_input":"2024-09-16T16:26:43.504408Z","iopub.status.idle":"2024-09-16T16:27:05.656307Z","shell.execute_reply.started":"2024-09-16T16:26:43.504350Z","shell.execute_reply":"2024-09-16T16:27:05.654986Z"},"trusted":true},"execution_count":38,"outputs":[{"output_type":"display_data","data":{"text/plain":"RWKV-6-World-1.6B-GGUF-Q2_K.gguf:   0%|          | 0.00/676M [00:00<?, ?B/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"b0226af36fb74cfd8f2a54b3696104d1"}},"metadata":{}},{"execution_count":38,"output_type":"execute_result","data":{"text/plain":"CommitInfo(commit_url='https://huggingface.co/Lyte/RWKV-6-World-1.6B-GGUF/commit/94b7972c8cc6369aa75c801c19bf54d034c8dba7', commit_message='Upload RWKV-6-World-1.6B-GGUF-Q2_K.gguf with huggingface_hub', commit_description='', oid='94b7972c8cc6369aa75c801c19bf54d034c8dba7', pr_url=None, pr_revision=None, pr_num=None)"},"metadata":{}}]},{"cell_type":"code","source":"#create_repo(\"Lyte/RWKV-6-World-3B-v2.1-GGUF\", token=HF_TOKEN)\n\nupload_file(\n    path_or_fileobj=\"./model/Model-1.6B-F16.gguf\",\n    path_in_repo=\"RWKV-6-World-1.6B-GGUF-F16.gguf\",\n    repo_id=\"Lyte/RWKV-6-World-1.6B-GGUF\",\n    repo_type=\"model\",\n    token=HF_TOKEN,\n)","metadata":{"id":"rsnPjHgyexNa","execution":{"iopub.status.busy":"2024-09-16T16:16:59.849320Z","iopub.execute_input":"2024-09-16T16:16:59.849777Z","iopub.status.idle":"2024-09-16T16:18:32.644440Z","shell.execute_reply.started":"2024-09-16T16:16:59.849732Z","shell.execute_reply":"2024-09-16T16:18:32.643127Z"},"trusted":true},"execution_count":22,"outputs":[{"output_type":"display_data","data":{"text/plain":"Model-1.6B-F16.gguf:   0%|          | 0.00/3.25G [00:00<?, ?B/s]","application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"0c359bba1c2e4b8aa4263d3737516402"}},"metadata":{}},{"execution_count":22,"output_type":"execute_result","data":{"text/plain":"CommitInfo(commit_url='https://huggingface.co/Lyte/RWKV-6-World-1.6B-GGUF/commit/e92d3beb2be80c8601f8abad393def776cc20186', commit_message='Upload RWKV-6-World-1.6B-GGUF-F16.gguf with huggingface_hub', commit_description='', oid='e92d3beb2be80c8601f8abad393def776cc20186', pr_url=None, pr_revision=None, pr_num=None)"},"metadata":{}}]},{"cell_type":"code","source":"!rm /kaggle/working/llama.cpp/model/RWKV-6-World-1.6B-GGUF-Q2_K.gguf","metadata":{"execution":{"iopub.status.busy":"2024-09-16T16:25:00.589117Z","iopub.execute_input":"2024-09-16T16:25:00.589751Z","iopub.status.idle":"2024-09-16T16:25:01.959618Z","shell.execute_reply.started":"2024-09-16T16:25:00.589684Z","shell.execute_reply":"2024-09-16T16:25:01.957685Z"},"trusted":true},"execution_count":36,"outputs":[]},{"cell_type":"code","source":"!./llama-cli -m ./model/Model-1.6B-F16.gguf --in-suffix \"Assistant:\" -c 1024 --temp 0.7 --top-k 50 --top-p 0.95 -n 128 -p \"Assistant: Hello, what can i help you with today?\\nUser:\" -r \"User:\"","metadata":{"id":"AwpO_N8-SGDV","execution":{"iopub.status.busy":"2024-09-16T16:27:27.681654Z","iopub.execute_input":"2024-09-16T16:27:27.682357Z","iopub.status.idle":"2024-09-16T16:27:29.999437Z","shell.execute_reply.started":"2024-09-16T16:27:27.682296Z","shell.execute_reply":"2024-09-16T16:27:29.997768Z"},"trusted":true},"execution_count":40,"outputs":[{"name":"stdout","text":"build: 3772 (23e0d70b) with cc (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0 for 
x86_64-linux-gnu\nmain: llama backend init\nmain: load the model and apply lora adapter, if any\nllama_model_loader: loaded meta data with 21 key-value pairs and 678 tensors from ./model/Model-1.6B-F16.gguf (version GGUF V3 (latest))\nllama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\nllama_model_loader: - kv   0:                       general.architecture str              = rwkv6\nllama_model_loader: - kv   1:                               general.type str              = model\nllama_model_loader: - kv   2:                               general.name str              = Model\nllama_model_loader: - kv   3:                         general.size_label str              = 1.6B\nllama_model_loader: - kv   4:                       rwkv6.context_length u32              = 1048576\nllama_model_loader: - kv   5:                     rwkv6.embedding_length u32              = 2048\nllama_model_loader: - kv   6:                          rwkv6.block_count u32              = 24\nllama_model_loader: - kv   7:         rwkv6.attention.layer_norm_epsilon f32              = 0.000010\nllama_model_loader: - kv   8:               rwkv6.rescale_every_n_layers u32              = 6\nllama_model_loader: - kv   9:                        rwkv6.wkv.head_size u32              = 64\nllama_model_loader: - kv  10:                   rwkv6.time_mix_extra_dim u32              = 32\nllama_model_loader: - kv  11:                 rwkv6.time_decay_extra_dim u32              = 64\nllama_model_loader: - kv  12:                  rwkv6.feed_forward_length u32              = 7168\nllama_model_loader: - kv  13:                          general.file_type u32              = 1\nllama_model_loader: - kv  14:                 rwkv6.attention.head_count u32              = 0\nllama_model_loader: - kv  15:                       tokenizer.ggml.model str              = rwkv\nllama_model_loader: - kv  16:                      tokenizer.ggml.tokens arr[str,65536]   = [\"<s>\", \"\\\\x00\", \"\\\\x01\", \"\\\\x02\", \"\\...\nllama_model_loader: - kv  17:                  tokenizer.ggml.token_type arr[i32,65536]   = [3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...\nllama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 0\nllama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 0\nllama_model_loader: - kv  20:               general.quantization_version u32              = 2\nllama_model_loader: - type  f32:  484 tensors\nllama_model_loader: - type  f16:  194 tensors\nllm_load_vocab: special tokens cache size = 1\nllm_load_vocab: token to piece cache size = 0.3561 MB\nllm_load_print_meta: format           = GGUF V3 (latest)\nllm_load_print_meta: arch             = rwkv6\nllm_load_print_meta: vocab type       = RWKV\nllm_load_print_meta: n_vocab          = 65536\nllm_load_print_meta: n_merges         = 0\nllm_load_print_meta: vocab_only       = 0\nllm_load_print_meta: n_ctx_train      = 1048576\nllm_load_print_meta: n_embd           = 2048\nllm_load_print_meta: n_layer          = 24\nllm_load_print_meta: n_head           = 0\nllm_load_print_meta: n_head_kv        = 0\nllm_load_print_meta: n_rot            = 0\nllm_load_print_meta: n_swa            = 0\nllm_load_print_meta: n_embd_head_k    = 0\nllm_load_print_meta: n_embd_head_v    = 0\nllm_load_print_meta: n_gqa            = 0\nllm_load_print_meta: n_embd_k_gqa     = 0\nllm_load_print_meta: n_embd_v_gqa     = 0\nllm_load_print_meta: f_norm_eps       = 1.0e-05\nllm_load_print_meta: f_norm_rms_eps 
  = 0.0e+00\nllm_load_print_meta: f_clamp_kqv      = 0.0e+00\nllm_load_print_meta: f_max_alibi_bias = 0.0e+00\nllm_load_print_meta: f_logit_scale    = 0.0e+00\nllm_load_print_meta: n_ff             = 7168\nllm_load_print_meta: n_expert         = 0\nllm_load_print_meta: n_expert_used    = 0\nllm_load_print_meta: causal attn      = 1\nllm_load_print_meta: pooling type     = 0\nllm_load_print_meta: rope type        = -1\nllm_load_print_meta: rope scaling     = linear\nllm_load_print_meta: freq_base_train  = 10000.0\nllm_load_print_meta: freq_scale_train = 1\nllm_load_print_meta: n_ctx_orig_yarn  = 1048576\nllm_load_print_meta: rope_finetuned   = unknown\nllm_load_print_meta: ssm_d_conv       = 0\nllm_load_print_meta: ssm_d_inner      = 0\nllm_load_print_meta: ssm_d_state      = 0\nllm_load_print_meta: ssm_dt_rank      = 0\nllm_load_print_meta: ssm_dt_b_c_rms   = 0\nllm_load_print_meta: model type       = 1.6B\nllm_load_print_meta: model ftype      = F16\nllm_load_print_meta: model params     = 1.60 B\nllm_load_print_meta: model size       = 3.02 GiB (16.23 BPW) \nllm_load_print_meta: general.name     = Model\nllm_load_print_meta: BOS token        = 0 '<s>'\nllm_load_print_meta: EOS token        = 0 '<s>'\nllm_load_print_meta: LF token         = 11 '\\n'\nllm_load_print_meta: max token length = 192\nllm_load_tensors: ggml ctx size =    0.26 MiB\nllm_load_tensors:        CPU buffer size =  3095.03 MiB\n......................................................................................\nllama_new_context_with_model: n_ctx      = 1024\nllama_new_context_with_model: n_batch    = 1024\nllama_new_context_with_model: n_ubatch   = 512\nllama_new_context_with_model: flash_attn = 0\nllama_new_context_with_model: freq_base  = 10000.0\nllama_new_context_with_model: freq_scale = 1\nllama_kv_cache_init:        CPU KV buffer size =    12.38 MiB\nllama_new_context_with_model: KV self size  =   12.38 MiB, K (f32):    0.38 MiB, V (f32):   12.00 MiB\nllama_new_context_with_model:        CPU  output buffer size =     0.25 MiB\nllama_new_context_with_model:        CPU compute buffer size =   136.00 MiB\nllama_new_context_with_model: graph nodes  = 2726\nllama_new_context_with_model: graph splits = 1\nllama_init_from_gpt_params: warming up the model with an empty run - please wait ... 
(--no-warmup to disable)\nmain: llama threadpool init, n_threads = 4\n\nsystem_info: n_threads = 4 (n_threads_batch = 4) / 4 | AVX = 1 | AVX_VNNI = 0 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 1 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | RISCV_VECT = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | \n\nsampler seed: 3259106188\nsampler params: \n\trepeat_last_n = 64, repeat_penalty = 1.000, frequency_penalty = 0.000, presence_penalty = 0.000\n\ttop_k = 50, tfs_z = 1.000, top_p = 0.950, min_p = 0.050, typical_p = 1.000, temp = 0.800\n\tmirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000\nsampler chain: logits -> logit-bias -> penalties -> top-k -> tail-free -> typical -> top-p -> min-p -> temp-ext -> softmax -> dist \ngenerate: n_ctx = 1024, n_batch = 2048, n_predict = 128, n_keep = 0\n\nAssistant: Hello, what can i help you with today?\nUser:\n\nllama_perf_sampler_print:    sampling time =       0.00 ms /    16 runs   (    0.00 ms per token, 4000000.00 tokens per second)\nllama_perf_context_print:        load time =     861.50 ms\nllama_perf_context_print: prompt eval time =       0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)\nllama_perf_context_print:        eval time =       0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)\nllama_perf_context_print:       total time =     127.06 ms /     2 tokens\n","output_type":"stream"}]},{"cell_type":"code","source":"","metadata":{},"execution_count":null,"outputs":[]}]}