diff --git "a/ndarray-cache-b16.json" "b/ndarray-cache-b16.json" new file mode 100644--- /dev/null +++ "b/ndarray-cache-b16.json" @@ -0,0 +1,2937 @@ +{ + "metadata": { + "ParamSize": 267, + "ParamBytes": 309011968.0, + "BitsPerParam": 5.003910477452378 + }, + "records": [ + { + "dataPath": "params_shard_0.bin", + "format": "raw-shard", + "nbytes": 68067328, + "records": [ + { + "name": "model.embed_tokens.q_weight", + "shape": [ + 151936, + 112 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 68067328, + "byteOffset": 0 + } + ], + "md5sum": "ea318a246037c876e3efef1f8c917fb1" + }, + { + "dataPath": "params_shard_1.bin", + "format": "raw-shard", + "nbytes": 33234176, + "records": [ + { + "name": "model.embed_tokens.q_scale", + "shape": [ + 151936, + 28 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 8508416, + "byteOffset": 0 + }, + { + "name": "model.layers.0.input_layernorm.weight", + "shape": [ + 896 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 1792, + "byteOffset": 8508416 + }, + { + "name": "model.layers.0.mlp.down_proj.q_weight", + "shape": [ + 896, + 608 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 2179072, + "byteOffset": 8510208 + }, + { + "name": "model.layers.0.mlp.down_proj.q_scale", + "shape": [ + 896, + 152 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 272384, + "byteOffset": 10689280 + }, + { + "name": "model.layers.0.mlp.gate_up_proj.q_weight", + "shape": [ + 9728, + 112 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 4358144, + "byteOffset": 10961664 + }, + { + "name": "model.layers.0.mlp.gate_up_proj.q_scale", + "shape": [ + 9728, + 28 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 544768, + "byteOffset": 15319808 + }, + { + "name": "model.layers.0.post_attention_layernorm.weight", + "shape": [ + 896 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 1792, + "byteOffset": 15864576 + }, + { + "name": "model.layers.0.self_attn.c_attn.bias", + "shape": [ + 1152 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 2304, + "byteOffset": 15866368 + }, + { + "name": "model.layers.0.self_attn.c_attn.q_weight", + "shape": [ + 1152, + 112 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 516096, + "byteOffset": 15868672 + }, + { + "name": "model.layers.0.self_attn.c_attn.q_scale", + "shape": [ + 1152, + 28 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 64512, + "byteOffset": 16384768 + }, + { + "name": "model.layers.0.self_attn.o_proj.q_weight", + "shape": [ + 896, + 112 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 401408, + "byteOffset": 16449280 + }, + { + "name": "model.layers.0.self_attn.o_proj.q_scale", + "shape": [ + 896, + 28 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 50176, + "byteOffset": 16850688 + }, + { + "name": "model.layers.1.input_layernorm.weight", + "shape": [ + 896 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 1792, + "byteOffset": 16900864 + }, + { + "name": "model.layers.1.mlp.down_proj.q_weight", + "shape": [ + 896, + 608 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 2179072, + "byteOffset": 16902656 + }, + { + "name": "model.layers.1.mlp.down_proj.q_scale", + "shape": [ + 896, + 152 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 272384, + "byteOffset": 19081728 + }, + { + "name": "model.layers.1.mlp.gate_up_proj.q_weight", + "shape": [ + 9728, + 112 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 4358144, + "byteOffset": 19354112 + }, + { + "name": "model.layers.1.mlp.gate_up_proj.q_scale", + "shape": [ + 9728, + 28 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 544768, + "byteOffset": 23712256 + }, + { + "name": "model.layers.1.post_attention_layernorm.weight", + "shape": [ + 896 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 1792, + "byteOffset": 24257024 + }, + { + "name": "model.layers.1.self_attn.c_attn.bias", + "shape": [ + 1152 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 2304, + "byteOffset": 24258816 + }, + { + "name": "model.layers.1.self_attn.c_attn.q_weight", + "shape": [ + 1152, + 112 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 516096, + "byteOffset": 24261120 + }, + { + "name": "model.layers.1.self_attn.c_attn.q_scale", + "shape": [ + 1152, + 28 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 64512, + "byteOffset": 24777216 + }, + { + "name": "model.layers.1.self_attn.o_proj.q_weight", + "shape": [ + 896, + 112 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 401408, + "byteOffset": 24841728 + }, + { + "name": "model.layers.1.self_attn.o_proj.q_scale", + "shape": [ + 896, + 28 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 50176, + "byteOffset": 25243136 + }, + { + "name": "model.layers.10.input_layernorm.weight", + "shape": [ + 896 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 1792, + "byteOffset": 25293312 + }, + { + "name": "model.layers.10.mlp.down_proj.q_weight", + "shape": [ + 896, + 608 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 2179072, + "byteOffset": 25295104 + }, + { + "name": "model.layers.10.mlp.down_proj.q_scale", + "shape": [ + 896, + 152 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 272384, + "byteOffset": 27474176 + }, + { + "name": "model.layers.10.mlp.gate_up_proj.q_weight", + "shape": [ + 9728, + 112 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 4358144, + "byteOffset": 27746560 + }, + { + "name": "model.layers.10.mlp.gate_up_proj.q_scale", + "shape": [ + 9728, + 28 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 544768, + "byteOffset": 32104704 + }, + { + "name": "model.layers.10.post_attention_layernorm.weight", + "shape": [ + 896 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 1792, + "byteOffset": 32649472 + }, + { + "name": "model.layers.10.self_attn.c_attn.bias", + "shape": [ + 1152 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 2304, + "byteOffset": 32651264 + }, + { + "name": "model.layers.10.self_attn.c_attn.q_weight", + "shape": [ + 1152, + 112 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 516096, + "byteOffset": 32653568 + }, + { + "name": "model.layers.10.self_attn.c_attn.q_scale", + "shape": [ + 1152, + 28 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 64512, + "byteOffset": 33169664 + } + ], + "md5sum": "a8b16ec20037fdbe1cda73b84bba5516" + }, + { + "dataPath": "params_shard_2.bin", + "format": "raw-shard", + "nbytes": 33505280, + "records": [ + { + "name": "model.layers.10.self_attn.o_proj.q_weight", + "shape": [ + 896, + 112 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 401408, + "byteOffset": 0 + }, + { + "name": "model.layers.10.self_attn.o_proj.q_scale", + "shape": [ + 896, + 28 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 50176, + "byteOffset": 401408 + }, + { + "name": "model.layers.11.input_layernorm.weight", + "shape": [ + 896 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 1792, + "byteOffset": 451584 + }, + { + "name": "model.layers.11.mlp.down_proj.q_weight", + "shape": [ + 896, + 608 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 2179072, + "byteOffset": 453376 + }, + { + "name": "model.layers.11.mlp.down_proj.q_scale", + "shape": [ + 896, + 152 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 272384, + "byteOffset": 2632448 + }, + { + "name": "model.layers.11.mlp.gate_up_proj.q_weight", + "shape": [ + 9728, + 112 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 4358144, + "byteOffset": 2904832 + }, + { + "name": "model.layers.11.mlp.gate_up_proj.q_scale", + "shape": [ + 9728, + 28 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 544768, + "byteOffset": 7262976 + }, + { + "name": "model.layers.11.post_attention_layernorm.weight", + "shape": [ + 896 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 1792, + "byteOffset": 7807744 + }, + { + "name": "model.layers.11.self_attn.c_attn.bias", + "shape": [ + 1152 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 2304, + "byteOffset": 7809536 + }, + { + "name": "model.layers.11.self_attn.c_attn.q_weight", + "shape": [ + 1152, + 112 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 516096, + "byteOffset": 7811840 + }, + { + "name": "model.layers.11.self_attn.c_attn.q_scale", + "shape": [ + 1152, + 28 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 64512, + "byteOffset": 8327936 + }, + { + "name": "model.layers.11.self_attn.o_proj.q_weight", + "shape": [ + 896, + 112 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 401408, + "byteOffset": 8392448 + }, + { + "name": "model.layers.11.self_attn.o_proj.q_scale", + "shape": [ + 896, + 28 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 50176, + "byteOffset": 8793856 + }, + { + "name": "model.layers.12.input_layernorm.weight", + "shape": [ + 896 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 1792, + "byteOffset": 8844032 + }, + { + "name": "model.layers.12.mlp.down_proj.q_weight", + "shape": [ + 896, + 608 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 2179072, + "byteOffset": 8845824 + }, + { + "name": "model.layers.12.mlp.down_proj.q_scale", + "shape": [ + 896, + 152 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 272384, + "byteOffset": 11024896 + }, + { + "name": "model.layers.12.mlp.gate_up_proj.q_weight", + "shape": [ + 9728, + 112 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 4358144, + "byteOffset": 11297280 + }, + { + "name": "model.layers.12.mlp.gate_up_proj.q_scale", + "shape": [ + 9728, + 28 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 544768, + "byteOffset": 15655424 + }, + { + "name": "model.layers.12.post_attention_layernorm.weight", + "shape": [ + 896 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 1792, + "byteOffset": 16200192 + }, + { + "name": "model.layers.12.self_attn.c_attn.bias", + "shape": [ + 1152 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 2304, + "byteOffset": 16201984 + }, + { + "name": "model.layers.12.self_attn.c_attn.q_weight", + "shape": [ + 1152, + 112 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 516096, + "byteOffset": 16204288 + }, + { + "name": "model.layers.12.self_attn.c_attn.q_scale", + "shape": [ + 1152, + 28 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 64512, + "byteOffset": 16720384 + }, + { + "name": "model.layers.12.self_attn.o_proj.q_weight", + "shape": [ + 896, + 112 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 401408, + "byteOffset": 16784896 + }, + { + "name": "model.layers.12.self_attn.o_proj.q_scale", + "shape": [ + 896, + 28 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 50176, + "byteOffset": 17186304 + }, + { + "name": "model.layers.13.input_layernorm.weight", + "shape": [ + 896 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 1792, + "byteOffset": 17236480 + }, + { + "name": "model.layers.13.mlp.down_proj.q_weight", + "shape": [ + 896, + 608 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 2179072, + "byteOffset": 17238272 + }, + { + "name": "model.layers.13.mlp.down_proj.q_scale", + "shape": [ + 896, + 152 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 272384, + "byteOffset": 19417344 + }, + { + "name": "model.layers.13.mlp.gate_up_proj.q_weight", + "shape": [ + 9728, + 112 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 4358144, + "byteOffset": 19689728 + }, + { + "name": "model.layers.13.mlp.gate_up_proj.q_scale", + "shape": [ + 9728, + 28 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 544768, + "byteOffset": 24047872 + }, + { + "name": "model.layers.13.post_attention_layernorm.weight", + "shape": [ + 896 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 1792, + "byteOffset": 24592640 + }, + { + "name": "model.layers.13.self_attn.c_attn.bias", + "shape": [ + 1152 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 2304, + "byteOffset": 24594432 + }, + { + "name": "model.layers.13.self_attn.c_attn.q_weight", + "shape": [ + 1152, + 112 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 516096, + "byteOffset": 24596736 + }, + { + "name": "model.layers.13.self_attn.c_attn.q_scale", + "shape": [ + 1152, + 28 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 64512, + "byteOffset": 25112832 + }, + { + "name": "model.layers.13.self_attn.o_proj.q_weight", + "shape": [ + 896, + 112 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 401408, + "byteOffset": 25177344 + }, + { + "name": "model.layers.13.self_attn.o_proj.q_scale", + "shape": [ + 896, + 28 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 50176, + "byteOffset": 25578752 + }, + { + "name": "model.layers.14.input_layernorm.weight", + "shape": [ + 896 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 1792, + "byteOffset": 25628928 + }, + { + "name": "model.layers.14.mlp.down_proj.q_weight", + "shape": [ + 896, + 608 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 2179072, + "byteOffset": 25630720 + }, + { + "name": "model.layers.14.mlp.down_proj.q_scale", + "shape": [ + 896, + 152 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 272384, + "byteOffset": 27809792 + }, + { + "name": "model.layers.14.mlp.gate_up_proj.q_weight", + "shape": [ + 9728, + 112 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 4358144, + "byteOffset": 28082176 + }, + { + "name": "model.layers.14.mlp.gate_up_proj.q_scale", + "shape": [ + 9728, + 28 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 544768, + "byteOffset": 32440320 + }, + { + "name": "model.layers.14.post_attention_layernorm.weight", + "shape": [ + 896 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 1792, + "byteOffset": 32985088 + }, + { + "name": "model.layers.14.self_attn.c_attn.bias", + "shape": [ + 1152 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 2304, + "byteOffset": 32986880 + }, + { + "name": "model.layers.14.self_attn.c_attn.q_weight", + "shape": [ + 1152, + 112 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 516096, + "byteOffset": 32989184 + } + ], + "md5sum": "2a667cc1cf2251835bbdd93cc3c2fc0b" + }, + { + "dataPath": "params_shard_3.bin", + "format": "raw-shard", + "nbytes": 33053696, + "records": [ + { + "name": "model.layers.14.self_attn.c_attn.q_scale", + "shape": [ + 1152, + 28 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 64512, + "byteOffset": 0 + }, + { + "name": "model.layers.14.self_attn.o_proj.q_weight", + "shape": [ + 896, + 112 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 401408, + "byteOffset": 64512 + }, + { + "name": "model.layers.14.self_attn.o_proj.q_scale", + "shape": [ + 896, + 28 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 50176, + "byteOffset": 465920 + }, + { + "name": "model.layers.15.input_layernorm.weight", + "shape": [ + 896 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 1792, + "byteOffset": 516096 + }, + { + "name": "model.layers.15.mlp.down_proj.q_weight", + "shape": [ + 896, + 608 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 2179072, + "byteOffset": 517888 + }, + { + "name": "model.layers.15.mlp.down_proj.q_scale", + "shape": [ + 896, + 152 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 272384, + "byteOffset": 2696960 + }, + { + "name": "model.layers.15.mlp.gate_up_proj.q_weight", + "shape": [ + 9728, + 112 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 4358144, + "byteOffset": 2969344 + }, + { + "name": "model.layers.15.mlp.gate_up_proj.q_scale", + "shape": [ + 9728, + 28 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 544768, + "byteOffset": 7327488 + }, + { + "name": "model.layers.15.post_attention_layernorm.weight", + "shape": [ + 896 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 1792, + "byteOffset": 7872256 + }, + { + "name": "model.layers.15.self_attn.c_attn.bias", + "shape": [ + 1152 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 2304, + "byteOffset": 7874048 + }, + { + "name": "model.layers.15.self_attn.c_attn.q_weight", + "shape": [ + 1152, + 112 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 516096, + "byteOffset": 7876352 + }, + { + "name": "model.layers.15.self_attn.c_attn.q_scale", + "shape": [ + 1152, + 28 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 64512, + "byteOffset": 8392448 + }, + { + "name": "model.layers.15.self_attn.o_proj.q_weight", + "shape": [ + 896, + 112 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 401408, + "byteOffset": 8456960 + }, + { + "name": "model.layers.15.self_attn.o_proj.q_scale", + "shape": [ + 896, + 28 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 50176, + "byteOffset": 8858368 + }, + { + "name": "model.layers.16.input_layernorm.weight", + "shape": [ + 896 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 1792, + "byteOffset": 8908544 + }, + { + "name": "model.layers.16.mlp.down_proj.q_weight", + "shape": [ + 896, + 608 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 2179072, + "byteOffset": 8910336 + }, + { + "name": "model.layers.16.mlp.down_proj.q_scale", + "shape": [ + 896, + 152 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 272384, + "byteOffset": 11089408 + }, + { + "name": "model.layers.16.mlp.gate_up_proj.q_weight", + "shape": [ + 9728, + 112 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 4358144, + "byteOffset": 11361792 + }, + { + "name": "model.layers.16.mlp.gate_up_proj.q_scale", + "shape": [ + 9728, + 28 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 544768, + "byteOffset": 15719936 + }, + { + "name": "model.layers.16.post_attention_layernorm.weight", + "shape": [ + 896 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 1792, + "byteOffset": 16264704 + }, + { + "name": "model.layers.16.self_attn.c_attn.bias", + "shape": [ + 1152 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 2304, + "byteOffset": 16266496 + }, + { + "name": "model.layers.16.self_attn.c_attn.q_weight", + "shape": [ + 1152, + 112 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 516096, + "byteOffset": 16268800 + }, + { + "name": "model.layers.16.self_attn.c_attn.q_scale", + "shape": [ + 1152, + 28 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 64512, + "byteOffset": 16784896 + }, + { + "name": "model.layers.16.self_attn.o_proj.q_weight", + "shape": [ + 896, + 112 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 401408, + "byteOffset": 16849408 + }, + { + "name": "model.layers.16.self_attn.o_proj.q_scale", + "shape": [ + 896, + 28 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 50176, + "byteOffset": 17250816 + }, + { + "name": "model.layers.17.input_layernorm.weight", + "shape": [ + 896 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 1792, + "byteOffset": 17300992 + }, + { + "name": "model.layers.17.mlp.down_proj.q_weight", + "shape": [ + 896, + 608 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 2179072, + "byteOffset": 17302784 + }, + { + "name": "model.layers.17.mlp.down_proj.q_scale", + "shape": [ + 896, + 152 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 272384, + "byteOffset": 19481856 + }, + { + "name": "model.layers.17.mlp.gate_up_proj.q_weight", + "shape": [ + 9728, + 112 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 4358144, + "byteOffset": 19754240 + }, + { + "name": "model.layers.17.mlp.gate_up_proj.q_scale", + "shape": [ + 9728, + 28 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 544768, + "byteOffset": 24112384 + }, + { + "name": "model.layers.17.post_attention_layernorm.weight", + "shape": [ + 896 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 1792, + "byteOffset": 24657152 + }, + { + "name": "model.layers.17.self_attn.c_attn.bias", + "shape": [ + 1152 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 2304, + "byteOffset": 24658944 + }, + { + "name": "model.layers.17.self_attn.c_attn.q_weight", + "shape": [ + 1152, + 112 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 516096, + "byteOffset": 24661248 + }, + { + "name": "model.layers.17.self_attn.c_attn.q_scale", + "shape": [ + 1152, + 28 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 64512, + "byteOffset": 25177344 + }, + { + "name": "model.layers.17.self_attn.o_proj.q_weight", + "shape": [ + 896, + 112 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 401408, + "byteOffset": 25241856 + }, + { + "name": "model.layers.17.self_attn.o_proj.q_scale", + "shape": [ + 896, + 28 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 50176, + "byteOffset": 25643264 + }, + { + "name": "model.layers.18.input_layernorm.weight", + "shape": [ + 896 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 1792, + "byteOffset": 25693440 + }, + { + "name": "model.layers.18.mlp.down_proj.q_weight", + "shape": [ + 896, + 608 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 2179072, + "byteOffset": 25695232 + }, + { + "name": "model.layers.18.mlp.down_proj.q_scale", + "shape": [ + 896, + 152 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 272384, + "byteOffset": 27874304 + }, + { + "name": "model.layers.18.mlp.gate_up_proj.q_weight", + "shape": [ + 9728, + 112 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 4358144, + "byteOffset": 28146688 + }, + { + "name": "model.layers.18.mlp.gate_up_proj.q_scale", + "shape": [ + 9728, + 28 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 544768, + "byteOffset": 32504832 + }, + { + "name": "model.layers.18.post_attention_layernorm.weight", + "shape": [ + 896 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 1792, + "byteOffset": 33049600 + }, + { + "name": "model.layers.18.self_attn.c_attn.bias", + "shape": [ + 1152 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 2304, + "byteOffset": 33051392 + } + ], + "md5sum": "4d1aefac0b336db6a31751e59f421ab0" + }, + { + "dataPath": "params_shard_4.bin", + "format": "raw-shard", + "nbytes": 33020928, + "records": [ + { + "name": "model.layers.18.self_attn.c_attn.q_weight", + "shape": [ + 1152, + 112 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 516096, + "byteOffset": 0 + }, + { + "name": "model.layers.18.self_attn.c_attn.q_scale", + "shape": [ + 1152, + 28 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 64512, + "byteOffset": 516096 + }, + { + "name": "model.layers.18.self_attn.o_proj.q_weight", + "shape": [ + 896, + 112 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 401408, + "byteOffset": 580608 + }, + { + "name": "model.layers.18.self_attn.o_proj.q_scale", + "shape": [ + 896, + 28 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 50176, + "byteOffset": 982016 + }, + { + "name": "model.layers.19.input_layernorm.weight", + "shape": [ + 896 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 1792, + "byteOffset": 1032192 + }, + { + "name": "model.layers.19.mlp.down_proj.q_weight", + "shape": [ + 896, + 608 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 2179072, + "byteOffset": 1033984 + }, + { + "name": "model.layers.19.mlp.down_proj.q_scale", + "shape": [ + 896, + 152 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 272384, + "byteOffset": 3213056 + }, + { + "name": "model.layers.19.mlp.gate_up_proj.q_weight", + "shape": [ + 9728, + 112 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 4358144, + "byteOffset": 3485440 + }, + { + "name": "model.layers.19.mlp.gate_up_proj.q_scale", + "shape": [ + 9728, + 28 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 544768, + "byteOffset": 7843584 + }, + { + "name": "model.layers.19.post_attention_layernorm.weight", + "shape": [ + 896 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 1792, + "byteOffset": 8388352 + }, + { + "name": "model.layers.19.self_attn.c_attn.bias", + "shape": [ + 1152 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 2304, + "byteOffset": 8390144 + }, + { + "name": "model.layers.19.self_attn.c_attn.q_weight", + "shape": [ + 1152, + 112 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 516096, + "byteOffset": 8392448 + }, + { + "name": "model.layers.19.self_attn.c_attn.q_scale", + "shape": [ + 1152, + 28 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 64512, + "byteOffset": 8908544 + }, + { + "name": "model.layers.19.self_attn.o_proj.q_weight", + "shape": [ + 896, + 112 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 401408, + "byteOffset": 8973056 + }, + { + "name": "model.layers.19.self_attn.o_proj.q_scale", + "shape": [ + 896, + 28 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 50176, + "byteOffset": 9374464 + }, + { + "name": "model.layers.2.input_layernorm.weight", + "shape": [ + 896 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 1792, + "byteOffset": 9424640 + }, + { + "name": "model.layers.2.mlp.down_proj.q_weight", + "shape": [ + 896, + 608 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 2179072, + "byteOffset": 9426432 + }, + { + "name": "model.layers.2.mlp.down_proj.q_scale", + "shape": [ + 896, + 152 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 272384, + "byteOffset": 11605504 + }, + { + "name": "model.layers.2.mlp.gate_up_proj.q_weight", + "shape": [ + 9728, + 112 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 4358144, + "byteOffset": 11877888 + }, + { + "name": "model.layers.2.mlp.gate_up_proj.q_scale", + "shape": [ + 9728, + 28 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 544768, + "byteOffset": 16236032 + }, + { + "name": "model.layers.2.post_attention_layernorm.weight", + "shape": [ + 896 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 1792, + "byteOffset": 16780800 + }, + { + "name": "model.layers.2.self_attn.c_attn.bias", + "shape": [ + 1152 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 2304, + "byteOffset": 16782592 + }, + { + "name": "model.layers.2.self_attn.c_attn.q_weight", + "shape": [ + 1152, + 112 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 516096, + "byteOffset": 16784896 + }, + { + "name": "model.layers.2.self_attn.c_attn.q_scale", + "shape": [ + 1152, + 28 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 64512, + "byteOffset": 17300992 + }, + { + "name": "model.layers.2.self_attn.o_proj.q_weight", + "shape": [ + 896, + 112 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 401408, + "byteOffset": 17365504 + }, + { + "name": "model.layers.2.self_attn.o_proj.q_scale", + "shape": [ + 896, + 28 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 50176, + "byteOffset": 17766912 + }, + { + "name": "model.layers.20.input_layernorm.weight", + "shape": [ + 896 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 1792, + "byteOffset": 17817088 + }, + { + "name": "model.layers.20.mlp.down_proj.q_weight", + "shape": [ + 896, + 608 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 2179072, + "byteOffset": 17818880 + }, + { + "name": "model.layers.20.mlp.down_proj.q_scale", + "shape": [ + 896, + 152 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 272384, + "byteOffset": 19997952 + }, + { + "name": "model.layers.20.mlp.gate_up_proj.q_weight", + "shape": [ + 9728, + 112 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 4358144, + "byteOffset": 20270336 + }, + { + "name": "model.layers.20.mlp.gate_up_proj.q_scale", + "shape": [ + 9728, + 28 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 544768, + "byteOffset": 24628480 + }, + { + "name": "model.layers.20.post_attention_layernorm.weight", + "shape": [ + 896 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 1792, + "byteOffset": 25173248 + }, + { + "name": "model.layers.20.self_attn.c_attn.bias", + "shape": [ + 1152 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 2304, + "byteOffset": 25175040 + }, + { + "name": "model.layers.20.self_attn.c_attn.q_weight", + "shape": [ + 1152, + 112 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 516096, + "byteOffset": 25177344 + }, + { + "name": "model.layers.20.self_attn.c_attn.q_scale", + "shape": [ + 1152, + 28 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 64512, + "byteOffset": 25693440 + }, + { + "name": "model.layers.20.self_attn.o_proj.q_weight", + "shape": [ + 896, + 112 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 401408, + "byteOffset": 25757952 + }, + { + "name": "model.layers.20.self_attn.o_proj.q_scale", + "shape": [ + 896, + 28 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 50176, + "byteOffset": 26159360 + }, + { + "name": "model.layers.21.input_layernorm.weight", + "shape": [ + 896 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 1792, + "byteOffset": 26209536 + }, + { + "name": "model.layers.21.mlp.down_proj.q_weight", + "shape": [ + 896, + 608 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 2179072, + "byteOffset": 26211328 + }, + { + "name": "model.layers.21.mlp.down_proj.q_scale", + "shape": [ + 896, + 152 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 272384, + "byteOffset": 28390400 + }, + { + "name": "model.layers.21.mlp.gate_up_proj.q_weight", + "shape": [ + 9728, + 112 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 4358144, + "byteOffset": 28662784 + } + ], + "md5sum": "e6a80ce05fca732a020105bfe2ffa5b5" + }, + { + "dataPath": "params_shard_5.bin", + "format": "raw-shard", + "nbytes": 29211648, + "records": [ + { + "name": "model.layers.21.mlp.gate_up_proj.q_scale", + "shape": [ + 9728, + 28 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 544768, + "byteOffset": 0 + }, + { + "name": "model.layers.21.post_attention_layernorm.weight", + "shape": [ + 896 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 1792, + "byteOffset": 544768 + }, + { + "name": "model.layers.21.self_attn.c_attn.bias", + "shape": [ + 1152 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 2304, + "byteOffset": 546560 + }, + { + "name": "model.layers.21.self_attn.c_attn.q_weight", + "shape": [ + 1152, + 112 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 516096, + "byteOffset": 548864 + }, + { + "name": "model.layers.21.self_attn.c_attn.q_scale", + "shape": [ + 1152, + 28 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 64512, + "byteOffset": 1064960 + }, + { + "name": "model.layers.21.self_attn.o_proj.q_weight", + "shape": [ + 896, + 112 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 401408, + "byteOffset": 1129472 + }, + { + "name": "model.layers.21.self_attn.o_proj.q_scale", + "shape": [ + 896, + 28 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 50176, + "byteOffset": 1530880 + }, + { + "name": "model.layers.22.input_layernorm.weight", + "shape": [ + 896 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 1792, + "byteOffset": 1581056 + }, + { + "name": "model.layers.22.mlp.down_proj.q_weight", + "shape": [ + 896, + 608 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 2179072, + "byteOffset": 1582848 + }, + { + "name": "model.layers.22.mlp.down_proj.q_scale", + "shape": [ + 896, + 152 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 272384, + "byteOffset": 3761920 + }, + { + "name": "model.layers.22.mlp.gate_up_proj.q_weight", + "shape": [ + 9728, + 112 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 4358144, + "byteOffset": 4034304 + }, + { + "name": "model.layers.22.mlp.gate_up_proj.q_scale", + "shape": [ + 9728, + 28 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 544768, + "byteOffset": 8392448 + }, + { + "name": "model.layers.22.post_attention_layernorm.weight", + "shape": [ + 896 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 1792, + "byteOffset": 8937216 + }, + { + "name": "model.layers.22.self_attn.c_attn.bias", + "shape": [ + 1152 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 2304, + "byteOffset": 8939008 + }, + { + "name": "model.layers.22.self_attn.c_attn.q_weight", + "shape": [ + 1152, + 112 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 516096, + "byteOffset": 8941312 + }, + { + "name": "model.layers.22.self_attn.c_attn.q_scale", + "shape": [ + 1152, + 28 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 64512, + "byteOffset": 9457408 + }, + { + "name": "model.layers.22.self_attn.o_proj.q_weight", + "shape": [ + 896, + 112 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 401408, + "byteOffset": 9521920 + }, + { + "name": "model.layers.22.self_attn.o_proj.q_scale", + "shape": [ + 896, + 28 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 50176, + "byteOffset": 9923328 + }, + { + "name": "model.layers.23.input_layernorm.weight", + "shape": [ + 896 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 1792, + "byteOffset": 9973504 + }, + { + "name": "model.layers.23.mlp.down_proj.q_weight", + "shape": [ + 896, + 608 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 2179072, + "byteOffset": 9975296 + }, + { + "name": "model.layers.23.mlp.down_proj.q_scale", + "shape": [ + 896, + 152 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 272384, + "byteOffset": 12154368 + }, + { + "name": "model.layers.23.mlp.gate_up_proj.q_weight", + "shape": [ + 9728, + 112 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 4358144, + "byteOffset": 12426752 + }, + { + "name": "model.layers.23.mlp.gate_up_proj.q_scale", + "shape": [ + 9728, + 28 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 544768, + "byteOffset": 16784896 + }, + { + "name": "model.layers.23.post_attention_layernorm.weight", + "shape": [ + 896 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 1792, + "byteOffset": 17329664 + }, + { + "name": "model.layers.23.self_attn.c_attn.bias", + "shape": [ + 1152 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 2304, + "byteOffset": 17331456 + }, + { + "name": "model.layers.23.self_attn.c_attn.q_weight", + "shape": [ + 1152, + 112 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 516096, + "byteOffset": 17333760 + }, + { + "name": "model.layers.23.self_attn.c_attn.q_scale", + "shape": [ + 1152, + 28 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 64512, + "byteOffset": 17849856 + }, + { + "name": "model.layers.23.self_attn.o_proj.q_weight", + "shape": [ + 896, + 112 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 401408, + "byteOffset": 17914368 + }, + { + "name": "model.layers.23.self_attn.o_proj.q_scale", + "shape": [ + 896, + 28 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 50176, + "byteOffset": 18315776 + }, + { + "name": "model.layers.3.input_layernorm.weight", + "shape": [ + 896 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 1792, + "byteOffset": 18365952 + }, + { + "name": "model.layers.3.mlp.down_proj.q_weight", + "shape": [ + 896, + 608 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 2179072, + "byteOffset": 18367744 + }, + { + "name": "model.layers.3.mlp.down_proj.q_scale", + "shape": [ + 896, + 152 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 272384, + "byteOffset": 20546816 + }, + { + "name": "model.layers.3.mlp.gate_up_proj.q_weight", + "shape": [ + 9728, + 112 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 4358144, + "byteOffset": 20819200 + }, + { + "name": "model.layers.3.mlp.gate_up_proj.q_scale", + "shape": [ + 9728, + 28 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 544768, + "byteOffset": 25177344 + }, + { + "name": "model.layers.3.post_attention_layernorm.weight", + "shape": [ + 896 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 1792, + "byteOffset": 25722112 + }, + { + "name": "model.layers.3.self_attn.c_attn.bias", + "shape": [ + 1152 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 2304, + "byteOffset": 25723904 + }, + { + "name": "model.layers.3.self_attn.c_attn.q_weight", + "shape": [ + 1152, + 112 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 516096, + "byteOffset": 25726208 + }, + { + "name": "model.layers.3.self_attn.c_attn.q_scale", + "shape": [ + 1152, + 28 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 64512, + "byteOffset": 26242304 + }, + { + "name": "model.layers.3.self_attn.o_proj.q_weight", + "shape": [ + 896, + 112 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 401408, + "byteOffset": 26306816 + }, + { + "name": "model.layers.3.self_attn.o_proj.q_scale", + "shape": [ + 896, + 28 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 50176, + "byteOffset": 26708224 + }, + { + "name": "model.layers.4.input_layernorm.weight", + "shape": [ + 896 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 1792, + "byteOffset": 26758400 + }, + { + "name": "model.layers.4.mlp.down_proj.q_weight", + "shape": [ + 896, + 608 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 2179072, + "byteOffset": 26760192 + }, + { + "name": "model.layers.4.mlp.down_proj.q_scale", + "shape": [ + 896, + 152 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 272384, + "byteOffset": 28939264 + } + ], + "md5sum": "4dfeaf67a16ccd036886e650379f7247" + }, + { + "dataPath": "params_shard_6.bin", + "format": "raw-shard", + "nbytes": 33297408, + "records": [ + { + "name": "model.layers.4.mlp.gate_up_proj.q_weight", + "shape": [ + 9728, + 112 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 4358144, + "byteOffset": 0 + }, + { + "name": "model.layers.4.mlp.gate_up_proj.q_scale", + "shape": [ + 9728, + 28 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 544768, + "byteOffset": 4358144 + }, + { + "name": "model.layers.4.post_attention_layernorm.weight", + "shape": [ + 896 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 1792, + "byteOffset": 4902912 + }, + { + "name": "model.layers.4.self_attn.c_attn.bias", + "shape": [ + 1152 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 2304, + "byteOffset": 4904704 + }, + { + "name": "model.layers.4.self_attn.c_attn.q_weight", + "shape": [ + 1152, + 112 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 516096, + "byteOffset": 4907008 + }, + { + "name": "model.layers.4.self_attn.c_attn.q_scale", + "shape": [ + 1152, + 28 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 64512, + "byteOffset": 5423104 + }, + { + "name": "model.layers.4.self_attn.o_proj.q_weight", + "shape": [ + 896, + 112 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 401408, + "byteOffset": 5487616 + }, + { + "name": "model.layers.4.self_attn.o_proj.q_scale", + "shape": [ + 896, + 28 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 50176, + "byteOffset": 5889024 + }, + { + "name": "model.layers.5.input_layernorm.weight", + "shape": [ + 896 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 1792, + "byteOffset": 5939200 + }, + { + "name": "model.layers.5.mlp.down_proj.q_weight", + "shape": [ + 896, + 608 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 2179072, + "byteOffset": 5940992 + }, + { + "name": "model.layers.5.mlp.down_proj.q_scale", + "shape": [ + 896, + 152 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 272384, + "byteOffset": 8120064 + }, + { + "name": "model.layers.5.mlp.gate_up_proj.q_weight", + "shape": [ + 9728, + 112 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 4358144, + "byteOffset": 8392448 + }, + { + "name": "model.layers.5.mlp.gate_up_proj.q_scale", + "shape": [ + 9728, + 28 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 544768, + "byteOffset": 12750592 + }, + { + "name": "model.layers.5.post_attention_layernorm.weight", + "shape": [ + 896 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 1792, + "byteOffset": 13295360 + }, + { + "name": "model.layers.5.self_attn.c_attn.bias", + "shape": [ + 1152 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 2304, + "byteOffset": 13297152 + }, + { + "name": "model.layers.5.self_attn.c_attn.q_weight", + "shape": [ + 1152, + 112 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 516096, + "byteOffset": 13299456 + }, + { + "name": "model.layers.5.self_attn.c_attn.q_scale", + "shape": [ + 1152, + 28 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 64512, + "byteOffset": 13815552 + }, + { + "name": "model.layers.5.self_attn.o_proj.q_weight", + "shape": [ + 896, + 112 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 401408, + "byteOffset": 13880064 + }, + { + "name": "model.layers.5.self_attn.o_proj.q_scale", + "shape": [ + 896, + 28 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 50176, + "byteOffset": 14281472 + }, + { + "name": "model.layers.6.input_layernorm.weight", + "shape": [ + 896 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 1792, + "byteOffset": 14331648 + }, + { + "name": "model.layers.6.mlp.down_proj.q_weight", + "shape": [ + 896, + 608 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 2179072, + "byteOffset": 14333440 + }, + { + "name": "model.layers.6.mlp.down_proj.q_scale", + "shape": [ + 896, + 152 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 272384, + "byteOffset": 16512512 + }, + { + "name": "model.layers.6.mlp.gate_up_proj.q_weight", + "shape": [ + 9728, + 112 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 4358144, + "byteOffset": 16784896 + }, + { + "name": "model.layers.6.mlp.gate_up_proj.q_scale", + "shape": [ + 9728, + 28 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 544768, + "byteOffset": 21143040 + }, + { + "name": "model.layers.6.post_attention_layernorm.weight", + "shape": [ + 896 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 1792, + "byteOffset": 21687808 + }, + { + "name": "model.layers.6.self_attn.c_attn.bias", + "shape": [ + 1152 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 2304, + "byteOffset": 21689600 + }, + { + "name": "model.layers.6.self_attn.c_attn.q_weight", + "shape": [ + 1152, + 112 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 516096, + "byteOffset": 21691904 + }, + { + "name": "model.layers.6.self_attn.c_attn.q_scale", + "shape": [ + 1152, + 28 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 64512, + "byteOffset": 22208000 + }, + { + "name": "model.layers.6.self_attn.o_proj.q_weight", + "shape": [ + 896, + 112 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 401408, + "byteOffset": 22272512 + }, + { + "name": "model.layers.6.self_attn.o_proj.q_scale", + "shape": [ + 896, + 28 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 50176, + "byteOffset": 22673920 + }, + { + "name": "model.layers.7.input_layernorm.weight", + "shape": [ + 896 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 1792, + "byteOffset": 22724096 + }, + { + "name": "model.layers.7.mlp.down_proj.q_weight", + "shape": [ + 896, + 608 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 2179072, + "byteOffset": 22725888 + }, + { + "name": "model.layers.7.mlp.down_proj.q_scale", + "shape": [ + 896, + 152 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 272384, + "byteOffset": 24904960 + }, + { + "name": "model.layers.7.mlp.gate_up_proj.q_weight", + "shape": [ + 9728, + 112 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 4358144, + "byteOffset": 25177344 + }, + { + "name": "model.layers.7.mlp.gate_up_proj.q_scale", + "shape": [ + 9728, + 28 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 544768, + "byteOffset": 29535488 + }, + { + "name": "model.layers.7.post_attention_layernorm.weight", + "shape": [ + 896 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 1792, + "byteOffset": 30080256 + }, + { + "name": "model.layers.7.self_attn.c_attn.bias", + "shape": [ + 1152 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 2304, + "byteOffset": 30082048 + }, + { + "name": "model.layers.7.self_attn.c_attn.q_weight", + "shape": [ + 1152, + 112 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 516096, + "byteOffset": 30084352 + }, + { + "name": "model.layers.7.self_attn.c_attn.q_scale", + "shape": [ + 1152, + 28 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 64512, + "byteOffset": 30600448 + }, + { + "name": "model.layers.7.self_attn.o_proj.q_weight", + "shape": [ + 896, + 112 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 401408, + "byteOffset": 30664960 + }, + { + "name": "model.layers.7.self_attn.o_proj.q_scale", + "shape": [ + 896, + 28 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 50176, + "byteOffset": 31066368 + }, + { + "name": "model.layers.8.input_layernorm.weight", + "shape": [ + 896 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 1792, + "byteOffset": 31116544 + }, + { + "name": "model.layers.8.mlp.down_proj.q_weight", + "shape": [ + 896, + 608 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 2179072, + "byteOffset": 31118336 + } + ], + "md5sum": "77c0cb74ff962309394c1816ddd1f815" + }, + { + "dataPath": "params_shard_7.bin", + "format": "raw-shard", + "nbytes": 14605824, + "records": [ + { + "name": "model.layers.8.mlp.down_proj.q_scale", + "shape": [ + 896, + 152 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 272384, + "byteOffset": 0 + }, + { + "name": "model.layers.8.mlp.gate_up_proj.q_weight", + "shape": [ + 9728, + 112 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 4358144, + "byteOffset": 272384 + }, + { + "name": "model.layers.8.mlp.gate_up_proj.q_scale", + "shape": [ + 9728, + 28 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 544768, + "byteOffset": 4630528 + }, + { + "name": "model.layers.8.post_attention_layernorm.weight", + "shape": [ + 896 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 1792, + "byteOffset": 5175296 + }, + { + "name": "model.layers.8.self_attn.c_attn.bias", + "shape": [ + 1152 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 2304, + "byteOffset": 5177088 + }, + { + "name": "model.layers.8.self_attn.c_attn.q_weight", + "shape": [ + 1152, + 112 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 516096, + "byteOffset": 5179392 + }, + { + "name": "model.layers.8.self_attn.c_attn.q_scale", + "shape": [ + 1152, + 28 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 64512, + "byteOffset": 5695488 + }, + { + "name": "model.layers.8.self_attn.o_proj.q_weight", + "shape": [ + 896, + 112 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 401408, + "byteOffset": 5760000 + }, + { + "name": "model.layers.8.self_attn.o_proj.q_scale", + "shape": [ + 896, + 28 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 50176, + "byteOffset": 6161408 + }, + { + "name": "model.layers.9.input_layernorm.weight", + "shape": [ + 896 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 1792, + "byteOffset": 6211584 + }, + { + "name": "model.layers.9.mlp.down_proj.q_weight", + "shape": [ + 896, + 608 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 2179072, + "byteOffset": 6213376 + }, + { + "name": "model.layers.9.mlp.down_proj.q_scale", + "shape": [ + 896, + 152 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 272384, + "byteOffset": 8392448 + }, + { + "name": "model.layers.9.mlp.gate_up_proj.q_weight", + "shape": [ + 9728, + 112 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 4358144, + "byteOffset": 8664832 + }, + { + "name": "model.layers.9.mlp.gate_up_proj.q_scale", + "shape": [ + 9728, + 28 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 544768, + "byteOffset": 13022976 + }, + { + "name": "model.layers.9.post_attention_layernorm.weight", + "shape": [ + 896 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 1792, + "byteOffset": 13567744 + }, + { + "name": "model.layers.9.self_attn.c_attn.bias", + "shape": [ + 1152 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 2304, + "byteOffset": 13569536 + }, + { + "name": "model.layers.9.self_attn.c_attn.q_weight", + "shape": [ + 1152, + 112 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 516096, + "byteOffset": 13571840 + }, + { + "name": "model.layers.9.self_attn.c_attn.q_scale", + "shape": [ + 1152, + 28 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 64512, + "byteOffset": 14087936 + }, + { + "name": "model.layers.9.self_attn.o_proj.q_weight", + "shape": [ + 896, + 112 + ], + "dtype": "uint32", + "format": "f32-to-bf16", + "nbytes": 401408, + "byteOffset": 14152448 + }, + { + "name": "model.layers.9.self_attn.o_proj.q_scale", + "shape": [ + 896, + 28 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 50176, + "byteOffset": 14553856 + }, + { + "name": "model.norm.weight", + "shape": [ + 896 + ], + "dtype": "bfloat16", + "format": "raw", + "nbytes": 1792, + "byteOffset": 14604032 + } + ], + "md5sum": "694ee88e054bd03bb0be3be07b9b3abd" + } + ] +} \ No newline at end of file