justus27 committed on
Commit
fd63f4c
·
verified ·
1 Parent(s): 3b7add6

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +1 -0
  2. added_tokens.json +24 -0
  3. config.json +29 -0
  4. generation_config.json +9 -0
  5. merges.txt +0 -0
  6. model-00001-of-00031.safetensors +3 -0
  7. model-00002-of-00031.safetensors +3 -0
  8. model-00003-of-00031.safetensors +3 -0
  9. model-00004-of-00031.safetensors +3 -0
  10. model-00005-of-00031.safetensors +3 -0
  11. model-00006-of-00031.safetensors +3 -0
  12. model-00007-of-00031.safetensors +3 -0
  13. model-00008-of-00031.safetensors +3 -0
  14. model-00009-of-00031.safetensors +3 -0
  15. model-00010-of-00031.safetensors +3 -0
  16. model-00011-of-00031.safetensors +3 -0
  17. model-00012-of-00031.safetensors +3 -0
  18. model-00013-of-00031.safetensors +3 -0
  19. model-00014-of-00031.safetensors +3 -0
  20. model-00015-of-00031.safetensors +3 -0
  21. model-00016-of-00031.safetensors +3 -0
  22. model-00017-of-00031.safetensors +3 -0
  23. model-00018-of-00031.safetensors +3 -0
  24. model-00019-of-00031.safetensors +3 -0
  25. model-00020-of-00031.safetensors +3 -0
  26. model-00021-of-00031.safetensors +3 -0
  27. model-00022-of-00031.safetensors +3 -0
  28. model-00023-of-00031.safetensors +3 -0
  29. model-00024-of-00031.safetensors +3 -0
  30. model-00025-of-00031.safetensors +3 -0
  31. model-00026-of-00031.safetensors +3 -0
  32. model-00027-of-00031.safetensors +3 -0
  33. model-00028-of-00031.safetensors +3 -0
  34. model-00029-of-00031.safetensors +3 -0
  35. model-00030-of-00031.safetensors +3 -0
  36. model-00031-of-00031.safetensors +3 -0
  37. model.safetensors.index.json +970 -0
  38. rng_state_0.pth +3 -0
  39. rng_state_1.pth +3 -0
  40. rng_state_2.pth +3 -0
  41. rng_state_3.pth +3 -0
  42. rng_state_4.pth +3 -0
  43. rng_state_5.pth +3 -0
  44. rng_state_6.pth +3 -0
  45. rng_state_7.pth +3 -0
  46. scheduler.pt +3 -0
  47. special_tokens_map.json +25 -0
  48. tokenizer.json +3 -0
  49. tokenizer_config.json +208 -0
  50. trainer_state.json +1633 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
added_tokens.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</tool_call>": 151658,
3
+ "<tool_call>": 151657,
4
+ "<|box_end|>": 151649,
5
+ "<|box_start|>": 151648,
6
+ "<|endoftext|>": 151643,
7
+ "<|file_sep|>": 151664,
8
+ "<|fim_middle|>": 151660,
9
+ "<|fim_pad|>": 151662,
10
+ "<|fim_prefix|>": 151659,
11
+ "<|fim_suffix|>": 151661,
12
+ "<|im_end|>": 151645,
13
+ "<|im_start|>": 151644,
14
+ "<|image_pad|>": 151655,
15
+ "<|object_ref_end|>": 151647,
16
+ "<|object_ref_start|>": 151646,
17
+ "<|quad_end|>": 151651,
18
+ "<|quad_start|>": 151650,
19
+ "<|repo_name|>": 151663,
20
+ "<|video_pad|>": 151656,
21
+ "<|vision_end|>": 151653,
22
+ "<|vision_pad|>": 151654,
23
+ "<|vision_start|>": 151652
24
+ }
config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "justus27/qwen-math-long",
3
+ "architectures": [
4
+ "Qwen2ForCausalLM"
5
+ ],
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 151643,
8
+ "eos_token_id": 151645,
9
+ "hidden_act": "silu",
10
+ "hidden_size": 8192,
11
+ "initializer_range": 0.02,
12
+ "intermediate_size": 29568,
13
+ "max_position_embeddings": 32768,
14
+ "max_window_layers": 70,
15
+ "model_type": "qwen2",
16
+ "num_attention_heads": 64,
17
+ "num_hidden_layers": 80,
18
+ "num_key_value_heads": 8,
19
+ "rms_norm_eps": 1e-06,
20
+ "rope_scaling": null,
21
+ "rope_theta": 300000.0,
22
+ "sliding_window": null,
23
+ "tie_word_embeddings": false,
24
+ "torch_dtype": "bfloat16",
25
+ "transformers_version": "4.49.0",
26
+ "use_cache": false,
27
+ "use_sliding_window": false,
28
+ "vocab_size": 152064
29
+ }
generation_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 151643,
3
+ "eos_token_id": [
4
+ 151645,
5
+ 151643
6
+ ],
7
+ "pad_token_id": 151643,
8
+ "transformers_version": "4.49.0"
9
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model-00001-of-00031.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5d217ed54ae4ac3f787a00b467dfe023664e892b0987e83e07258cb5b883fdba
3
+ size 4548798728
model-00002-of-00031.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ed07a2af8708a9f98fb20eb365fa52a5472b81ef5a29cc7f9fc44ff329af2b7d
3
+ size 4964101384
model-00003-of-00031.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:99ee7baf3a3fdd4c229793d340fba401f52ba53f3c9f97d6a0d06fac339298ce
3
+ size 4781637328
model-00004-of-00031.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:16588e452d913af2cdc65e6ef74388fb1cbea515bf4dd23219cd21b7c48256f3
3
+ size 4781670320
model-00005-of-00031.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fd414f97c6f28ac6ad1e76b8afb28bdaf749b3ad19ad8fe385673fb3e304cc20
3
+ size 4781670360
model-00006-of-00031.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:425cae07cac23b67a63641b9505749d6700f6f6678b453280e29e3c74cf787a5
3
+ size 4964101416
model-00007-of-00031.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:174fe3f9e1f22dc6c26f785ea3652f5ec26bcf17b0c72881e0122a040919fc8b
3
+ size 4781637360
model-00008-of-00031.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e9b2979ef1a9c60163c03acdacf611cdc6402eec4dd3dc86d4fe9af74e102265
3
+ size 4781670360
model-00009-of-00031.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e9d18518a6ae752019a33fbed891245f264692aa2a24d2038cb2d73f0bbbd326
3
+ size 4781670360
model-00010-of-00031.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0e87cb624af17403a8d17e434925e453854d68c2ab003d9f50190f1de9fcc9eb
3
+ size 4964101416
model-00011-of-00031.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:35889430045d8954e8423fd0da65cbf2ffc2d939e0e30b8e4e9d739d6e8ae51b
3
+ size 4781637360
model-00012-of-00031.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b41510717458f90125eba91dc0a1893f09fe386400d53a53e3c498b1acd3e02d
3
+ size 4781670360
model-00013-of-00031.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:152bbd2cf73001522b868756c6125ac5e2bd8bfc8d9b7c2975d2ff9853121bf5
3
+ size 4781670360
model-00014-of-00031.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0559f0b378ff0442afbf806425e3f5cb4c887c10094430a1110da41eb192c2fe
3
+ size 4964101416
model-00015-of-00031.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ae2ad65fe0663f522f329f1056431b87b7b2347502e355ac281272f4ea50021e
3
+ size 4781637360
model-00016-of-00031.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8487050b072d9359b5be7238f1fc12e718534a307cfa3c7b1962bbb744d785b7
3
+ size 4781670360
model-00017-of-00031.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6c6b94b7a539f06c4e34426d0cced427791fbecba826da3e57d66fbc54be9c84
3
+ size 4781670360
model-00018-of-00031.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:071501422758d2da6ba93a693cb3f5974d206af66c9d79fa3dfc8f4ea83787ff
3
+ size 4964101416
model-00019-of-00031.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:79b607b99752afad68a95901d5cf328f35f5a05af43591771f4257821d59796a
3
+ size 4781637360
model-00020-of-00031.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:23ac32b8c447749089241af5c7a3fa99a1c2dece1e7b7fb51bdb639544c01a2f
3
+ size 4781670360
model-00021-of-00031.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:097dbbc7e5797e5c98f52ef04f0f229313fe479dc823efa9bf26017b8f9c210f
3
+ size 4781670360
model-00022-of-00031.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d729f5bdcd034572f6556398e98a0d6d4f6f62584720e699adeae5547548bc2c
3
+ size 4964101416
model-00023-of-00031.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d21874ecaccb89499d61e701b255c607486415bade0398b7d2371612ccb7914f
3
+ size 4781637360
model-00024-of-00031.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:607f97d7a74fcc6a908a5898e903b8e5d9993585d58d215598e9797abd451187
3
+ size 4781670360
model-00025-of-00031.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:66ac429d9da05a6fd2a8ddabf377320e9b388d2b8643776a8029966041087581
3
+ size 4781670360
model-00026-of-00031.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:96a5bd3ad8f50b541ae0c571bf0c67f6a636f0bea1b2768f2226c1b77db5a928
3
+ size 4964101416
model-00027-of-00031.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:195cd1d3a7adadeecefde081f2a7e7ba4482df41a47da19fe8ff7dcd4743a458
3
+ size 4781637360
model-00028-of-00031.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b342df5edf8850cd4b9f3004eb88af02e233fa90e050a3e63649c82fb0135621
3
+ size 4781670360
model-00029-of-00031.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d7b4aa47e25ba78fe40d97568b9921a177dd7cc0d684b8a44c63d556533c4158
3
+ size 4781670360
model-00030-of-00031.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3437b6f9882b1e453a48c37a7ccb6be22e5fe34aecc03828daeda06a8062274e
3
+ size 3208747032
model-00031-of-00031.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:58c4d3c5fb5fe2aaac8427125742248a1066e8961cb3023b98769c3cdfbcd6b4
3
+ size 2491416704
model.safetensors.index.json ADDED
@@ -0,0 +1,970 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "total_size": 145412407296
4
+ },
5
+ "weight_map": {
6
+ "lm_head.weight": "model-00031-of-00031.safetensors",
7
+ "model.embed_tokens.weight": "model-00001-of-00031.safetensors",
8
+ "model.layers.0.input_layernorm.weight": "model-00001-of-00031.safetensors",
9
+ "model.layers.0.mlp.down_proj.weight": "model-00001-of-00031.safetensors",
10
+ "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00031.safetensors",
11
+ "model.layers.0.mlp.up_proj.weight": "model-00001-of-00031.safetensors",
12
+ "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00031.safetensors",
13
+ "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00031.safetensors",
14
+ "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00031.safetensors",
15
+ "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00031.safetensors",
16
+ "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00031.safetensors",
17
+ "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00031.safetensors",
18
+ "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00031.safetensors",
19
+ "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00031.safetensors",
20
+ "model.layers.1.input_layernorm.weight": "model-00002-of-00031.safetensors",
21
+ "model.layers.1.mlp.down_proj.weight": "model-00002-of-00031.safetensors",
22
+ "model.layers.1.mlp.gate_proj.weight": "model-00002-of-00031.safetensors",
23
+ "model.layers.1.mlp.up_proj.weight": "model-00002-of-00031.safetensors",
24
+ "model.layers.1.post_attention_layernorm.weight": "model-00002-of-00031.safetensors",
25
+ "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00031.safetensors",
26
+ "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00031.safetensors",
27
+ "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00031.safetensors",
28
+ "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00031.safetensors",
29
+ "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00031.safetensors",
30
+ "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00031.safetensors",
31
+ "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00031.safetensors",
32
+ "model.layers.10.input_layernorm.weight": "model-00005-of-00031.safetensors",
33
+ "model.layers.10.mlp.down_proj.weight": "model-00005-of-00031.safetensors",
34
+ "model.layers.10.mlp.gate_proj.weight": "model-00005-of-00031.safetensors",
35
+ "model.layers.10.mlp.up_proj.weight": "model-00005-of-00031.safetensors",
36
+ "model.layers.10.post_attention_layernorm.weight": "model-00005-of-00031.safetensors",
37
+ "model.layers.10.self_attn.k_proj.bias": "model-00005-of-00031.safetensors",
38
+ "model.layers.10.self_attn.k_proj.weight": "model-00005-of-00031.safetensors",
39
+ "model.layers.10.self_attn.o_proj.weight": "model-00005-of-00031.safetensors",
40
+ "model.layers.10.self_attn.q_proj.bias": "model-00005-of-00031.safetensors",
41
+ "model.layers.10.self_attn.q_proj.weight": "model-00005-of-00031.safetensors",
42
+ "model.layers.10.self_attn.v_proj.bias": "model-00005-of-00031.safetensors",
43
+ "model.layers.10.self_attn.v_proj.weight": "model-00005-of-00031.safetensors",
44
+ "model.layers.11.input_layernorm.weight": "model-00005-of-00031.safetensors",
45
+ "model.layers.11.mlp.down_proj.weight": "model-00005-of-00031.safetensors",
46
+ "model.layers.11.mlp.gate_proj.weight": "model-00005-of-00031.safetensors",
47
+ "model.layers.11.mlp.up_proj.weight": "model-00005-of-00031.safetensors",
48
+ "model.layers.11.post_attention_layernorm.weight": "model-00005-of-00031.safetensors",
49
+ "model.layers.11.self_attn.k_proj.bias": "model-00005-of-00031.safetensors",
50
+ "model.layers.11.self_attn.k_proj.weight": "model-00005-of-00031.safetensors",
51
+ "model.layers.11.self_attn.o_proj.weight": "model-00005-of-00031.safetensors",
52
+ "model.layers.11.self_attn.q_proj.bias": "model-00005-of-00031.safetensors",
53
+ "model.layers.11.self_attn.q_proj.weight": "model-00005-of-00031.safetensors",
54
+ "model.layers.11.self_attn.v_proj.bias": "model-00005-of-00031.safetensors",
55
+ "model.layers.11.self_attn.v_proj.weight": "model-00005-of-00031.safetensors",
56
+ "model.layers.12.input_layernorm.weight": "model-00006-of-00031.safetensors",
57
+ "model.layers.12.mlp.down_proj.weight": "model-00006-of-00031.safetensors",
58
+ "model.layers.12.mlp.gate_proj.weight": "model-00006-of-00031.safetensors",
59
+ "model.layers.12.mlp.up_proj.weight": "model-00006-of-00031.safetensors",
60
+ "model.layers.12.post_attention_layernorm.weight": "model-00006-of-00031.safetensors",
61
+ "model.layers.12.self_attn.k_proj.bias": "model-00005-of-00031.safetensors",
62
+ "model.layers.12.self_attn.k_proj.weight": "model-00005-of-00031.safetensors",
63
+ "model.layers.12.self_attn.o_proj.weight": "model-00005-of-00031.safetensors",
64
+ "model.layers.12.self_attn.q_proj.bias": "model-00005-of-00031.safetensors",
65
+ "model.layers.12.self_attn.q_proj.weight": "model-00005-of-00031.safetensors",
66
+ "model.layers.12.self_attn.v_proj.bias": "model-00005-of-00031.safetensors",
67
+ "model.layers.12.self_attn.v_proj.weight": "model-00005-of-00031.safetensors",
68
+ "model.layers.13.input_layernorm.weight": "model-00006-of-00031.safetensors",
69
+ "model.layers.13.mlp.down_proj.weight": "model-00006-of-00031.safetensors",
70
+ "model.layers.13.mlp.gate_proj.weight": "model-00006-of-00031.safetensors",
71
+ "model.layers.13.mlp.up_proj.weight": "model-00006-of-00031.safetensors",
72
+ "model.layers.13.post_attention_layernorm.weight": "model-00006-of-00031.safetensors",
73
+ "model.layers.13.self_attn.k_proj.bias": "model-00006-of-00031.safetensors",
74
+ "model.layers.13.self_attn.k_proj.weight": "model-00006-of-00031.safetensors",
75
+ "model.layers.13.self_attn.o_proj.weight": "model-00006-of-00031.safetensors",
76
+ "model.layers.13.self_attn.q_proj.bias": "model-00006-of-00031.safetensors",
77
+ "model.layers.13.self_attn.q_proj.weight": "model-00006-of-00031.safetensors",
78
+ "model.layers.13.self_attn.v_proj.bias": "model-00006-of-00031.safetensors",
79
+ "model.layers.13.self_attn.v_proj.weight": "model-00006-of-00031.safetensors",
80
+ "model.layers.14.input_layernorm.weight": "model-00006-of-00031.safetensors",
81
+ "model.layers.14.mlp.down_proj.weight": "model-00006-of-00031.safetensors",
82
+ "model.layers.14.mlp.gate_proj.weight": "model-00006-of-00031.safetensors",
83
+ "model.layers.14.mlp.up_proj.weight": "model-00006-of-00031.safetensors",
84
+ "model.layers.14.post_attention_layernorm.weight": "model-00006-of-00031.safetensors",
85
+ "model.layers.14.self_attn.k_proj.bias": "model-00006-of-00031.safetensors",
86
+ "model.layers.14.self_attn.k_proj.weight": "model-00006-of-00031.safetensors",
87
+ "model.layers.14.self_attn.o_proj.weight": "model-00006-of-00031.safetensors",
88
+ "model.layers.14.self_attn.q_proj.bias": "model-00006-of-00031.safetensors",
89
+ "model.layers.14.self_attn.q_proj.weight": "model-00006-of-00031.safetensors",
90
+ "model.layers.14.self_attn.v_proj.bias": "model-00006-of-00031.safetensors",
91
+ "model.layers.14.self_attn.v_proj.weight": "model-00006-of-00031.safetensors",
92
+ "model.layers.15.input_layernorm.weight": "model-00007-of-00031.safetensors",
93
+ "model.layers.15.mlp.down_proj.weight": "model-00007-of-00031.safetensors",
94
+ "model.layers.15.mlp.gate_proj.weight": "model-00007-of-00031.safetensors",
95
+ "model.layers.15.mlp.up_proj.weight": "model-00007-of-00031.safetensors",
96
+ "model.layers.15.post_attention_layernorm.weight": "model-00007-of-00031.safetensors",
97
+ "model.layers.15.self_attn.k_proj.bias": "model-00007-of-00031.safetensors",
98
+ "model.layers.15.self_attn.k_proj.weight": "model-00007-of-00031.safetensors",
99
+ "model.layers.15.self_attn.o_proj.weight": "model-00007-of-00031.safetensors",
100
+ "model.layers.15.self_attn.q_proj.bias": "model-00007-of-00031.safetensors",
101
+ "model.layers.15.self_attn.q_proj.weight": "model-00007-of-00031.safetensors",
102
+ "model.layers.15.self_attn.v_proj.bias": "model-00007-of-00031.safetensors",
103
+ "model.layers.15.self_attn.v_proj.weight": "model-00007-of-00031.safetensors",
104
+ "model.layers.16.input_layernorm.weight": "model-00007-of-00031.safetensors",
105
+ "model.layers.16.mlp.down_proj.weight": "model-00007-of-00031.safetensors",
106
+ "model.layers.16.mlp.gate_proj.weight": "model-00007-of-00031.safetensors",
107
+ "model.layers.16.mlp.up_proj.weight": "model-00007-of-00031.safetensors",
108
+ "model.layers.16.post_attention_layernorm.weight": "model-00007-of-00031.safetensors",
109
+ "model.layers.16.self_attn.k_proj.bias": "model-00007-of-00031.safetensors",
110
+ "model.layers.16.self_attn.k_proj.weight": "model-00007-of-00031.safetensors",
111
+ "model.layers.16.self_attn.o_proj.weight": "model-00007-of-00031.safetensors",
112
+ "model.layers.16.self_attn.q_proj.bias": "model-00007-of-00031.safetensors",
113
+ "model.layers.16.self_attn.q_proj.weight": "model-00007-of-00031.safetensors",
114
+ "model.layers.16.self_attn.v_proj.bias": "model-00007-of-00031.safetensors",
115
+ "model.layers.16.self_attn.v_proj.weight": "model-00007-of-00031.safetensors",
116
+ "model.layers.17.input_layernorm.weight": "model-00008-of-00031.safetensors",
117
+ "model.layers.17.mlp.down_proj.weight": "model-00008-of-00031.safetensors",
118
+ "model.layers.17.mlp.gate_proj.weight": "model-00007-of-00031.safetensors",
119
+ "model.layers.17.mlp.up_proj.weight": "model-00007-of-00031.safetensors",
120
+ "model.layers.17.post_attention_layernorm.weight": "model-00008-of-00031.safetensors",
121
+ "model.layers.17.self_attn.k_proj.bias": "model-00007-of-00031.safetensors",
122
+ "model.layers.17.self_attn.k_proj.weight": "model-00007-of-00031.safetensors",
123
+ "model.layers.17.self_attn.o_proj.weight": "model-00007-of-00031.safetensors",
124
+ "model.layers.17.self_attn.q_proj.bias": "model-00007-of-00031.safetensors",
125
+ "model.layers.17.self_attn.q_proj.weight": "model-00007-of-00031.safetensors",
126
+ "model.layers.17.self_attn.v_proj.bias": "model-00007-of-00031.safetensors",
127
+ "model.layers.17.self_attn.v_proj.weight": "model-00007-of-00031.safetensors",
128
+ "model.layers.18.input_layernorm.weight": "model-00008-of-00031.safetensors",
129
+ "model.layers.18.mlp.down_proj.weight": "model-00008-of-00031.safetensors",
130
+ "model.layers.18.mlp.gate_proj.weight": "model-00008-of-00031.safetensors",
131
+ "model.layers.18.mlp.up_proj.weight": "model-00008-of-00031.safetensors",
132
+ "model.layers.18.post_attention_layernorm.weight": "model-00008-of-00031.safetensors",
133
+ "model.layers.18.self_attn.k_proj.bias": "model-00008-of-00031.safetensors",
134
+ "model.layers.18.self_attn.k_proj.weight": "model-00008-of-00031.safetensors",
135
+ "model.layers.18.self_attn.o_proj.weight": "model-00008-of-00031.safetensors",
136
+ "model.layers.18.self_attn.q_proj.bias": "model-00008-of-00031.safetensors",
137
+ "model.layers.18.self_attn.q_proj.weight": "model-00008-of-00031.safetensors",
138
+ "model.layers.18.self_attn.v_proj.bias": "model-00008-of-00031.safetensors",
139
+ "model.layers.18.self_attn.v_proj.weight": "model-00008-of-00031.safetensors",
140
+ "model.layers.19.input_layernorm.weight": "model-00008-of-00031.safetensors",
141
+ "model.layers.19.mlp.down_proj.weight": "model-00008-of-00031.safetensors",
142
+ "model.layers.19.mlp.gate_proj.weight": "model-00008-of-00031.safetensors",
143
+ "model.layers.19.mlp.up_proj.weight": "model-00008-of-00031.safetensors",
144
+ "model.layers.19.post_attention_layernorm.weight": "model-00008-of-00031.safetensors",
145
+ "model.layers.19.self_attn.k_proj.bias": "model-00008-of-00031.safetensors",
146
+ "model.layers.19.self_attn.k_proj.weight": "model-00008-of-00031.safetensors",
147
+ "model.layers.19.self_attn.o_proj.weight": "model-00008-of-00031.safetensors",
148
+ "model.layers.19.self_attn.q_proj.bias": "model-00008-of-00031.safetensors",
149
+ "model.layers.19.self_attn.q_proj.weight": "model-00008-of-00031.safetensors",
150
+ "model.layers.19.self_attn.v_proj.bias": "model-00008-of-00031.safetensors",
151
+ "model.layers.19.self_attn.v_proj.weight": "model-00008-of-00031.safetensors",
152
+ "model.layers.2.input_layernorm.weight": "model-00002-of-00031.safetensors",
153
+ "model.layers.2.mlp.down_proj.weight": "model-00002-of-00031.safetensors",
154
+ "model.layers.2.mlp.gate_proj.weight": "model-00002-of-00031.safetensors",
155
+ "model.layers.2.mlp.up_proj.weight": "model-00002-of-00031.safetensors",
156
+ "model.layers.2.post_attention_layernorm.weight": "model-00002-of-00031.safetensors",
157
+ "model.layers.2.self_attn.k_proj.bias": "model-00002-of-00031.safetensors",
158
+ "model.layers.2.self_attn.k_proj.weight": "model-00002-of-00031.safetensors",
159
+ "model.layers.2.self_attn.o_proj.weight": "model-00002-of-00031.safetensors",
160
+ "model.layers.2.self_attn.q_proj.bias": "model-00002-of-00031.safetensors",
161
+ "model.layers.2.self_attn.q_proj.weight": "model-00002-of-00031.safetensors",
162
+ "model.layers.2.self_attn.v_proj.bias": "model-00002-of-00031.safetensors",
163
+ "model.layers.2.self_attn.v_proj.weight": "model-00002-of-00031.safetensors",
164
+ "model.layers.20.input_layernorm.weight": "model-00009-of-00031.safetensors",
165
+ "model.layers.20.mlp.down_proj.weight": "model-00009-of-00031.safetensors",
166
+ "model.layers.20.mlp.gate_proj.weight": "model-00008-of-00031.safetensors",
167
+ "model.layers.20.mlp.up_proj.weight": "model-00009-of-00031.safetensors",
168
+ "model.layers.20.post_attention_layernorm.weight": "model-00009-of-00031.safetensors",
169
+ "model.layers.20.self_attn.k_proj.bias": "model-00008-of-00031.safetensors",
170
+ "model.layers.20.self_attn.k_proj.weight": "model-00008-of-00031.safetensors",
171
+ "model.layers.20.self_attn.o_proj.weight": "model-00008-of-00031.safetensors",
172
+ "model.layers.20.self_attn.q_proj.bias": "model-00008-of-00031.safetensors",
173
+ "model.layers.20.self_attn.q_proj.weight": "model-00008-of-00031.safetensors",
174
+ "model.layers.20.self_attn.v_proj.bias": "model-00008-of-00031.safetensors",
175
+ "model.layers.20.self_attn.v_proj.weight": "model-00008-of-00031.safetensors",
176
+ "model.layers.21.input_layernorm.weight": "model-00009-of-00031.safetensors",
177
+ "model.layers.21.mlp.down_proj.weight": "model-00009-of-00031.safetensors",
178
+ "model.layers.21.mlp.gate_proj.weight": "model-00009-of-00031.safetensors",
179
+ "model.layers.21.mlp.up_proj.weight": "model-00009-of-00031.safetensors",
180
+ "model.layers.21.post_attention_layernorm.weight": "model-00009-of-00031.safetensors",
181
+ "model.layers.21.self_attn.k_proj.bias": "model-00009-of-00031.safetensors",
182
+ "model.layers.21.self_attn.k_proj.weight": "model-00009-of-00031.safetensors",
183
+ "model.layers.21.self_attn.o_proj.weight": "model-00009-of-00031.safetensors",
184
+ "model.layers.21.self_attn.q_proj.bias": "model-00009-of-00031.safetensors",
185
+ "model.layers.21.self_attn.q_proj.weight": "model-00009-of-00031.safetensors",
186
+ "model.layers.21.self_attn.v_proj.bias": "model-00009-of-00031.safetensors",
187
+ "model.layers.21.self_attn.v_proj.weight": "model-00009-of-00031.safetensors",
188
+ "model.layers.22.input_layernorm.weight": "model-00009-of-00031.safetensors",
189
+ "model.layers.22.mlp.down_proj.weight": "model-00009-of-00031.safetensors",
190
+ "model.layers.22.mlp.gate_proj.weight": "model-00009-of-00031.safetensors",
191
+ "model.layers.22.mlp.up_proj.weight": "model-00009-of-00031.safetensors",
192
+ "model.layers.22.post_attention_layernorm.weight": "model-00009-of-00031.safetensors",
193
+ "model.layers.22.self_attn.k_proj.bias": "model-00009-of-00031.safetensors",
194
+ "model.layers.22.self_attn.k_proj.weight": "model-00009-of-00031.safetensors",
195
+ "model.layers.22.self_attn.o_proj.weight": "model-00009-of-00031.safetensors",
196
+ "model.layers.22.self_attn.q_proj.bias": "model-00009-of-00031.safetensors",
197
+ "model.layers.22.self_attn.q_proj.weight": "model-00009-of-00031.safetensors",
198
+ "model.layers.22.self_attn.v_proj.bias": "model-00009-of-00031.safetensors",
199
+ "model.layers.22.self_attn.v_proj.weight": "model-00009-of-00031.safetensors",
200
+ "model.layers.23.input_layernorm.weight": "model-00010-of-00031.safetensors",
201
+ "model.layers.23.mlp.down_proj.weight": "model-00010-of-00031.safetensors",
202
+ "model.layers.23.mlp.gate_proj.weight": "model-00010-of-00031.safetensors",
203
+ "model.layers.23.mlp.up_proj.weight": "model-00010-of-00031.safetensors",
204
+ "model.layers.23.post_attention_layernorm.weight": "model-00010-of-00031.safetensors",
205
+ "model.layers.23.self_attn.k_proj.bias": "model-00009-of-00031.safetensors",
206
+ "model.layers.23.self_attn.k_proj.weight": "model-00009-of-00031.safetensors",
207
+ "model.layers.23.self_attn.o_proj.weight": "model-00009-of-00031.safetensors",
208
+ "model.layers.23.self_attn.q_proj.bias": "model-00009-of-00031.safetensors",
209
+ "model.layers.23.self_attn.q_proj.weight": "model-00009-of-00031.safetensors",
210
+ "model.layers.23.self_attn.v_proj.bias": "model-00009-of-00031.safetensors",
211
+ "model.layers.23.self_attn.v_proj.weight": "model-00009-of-00031.safetensors",
212
+ "model.layers.24.input_layernorm.weight": "model-00010-of-00031.safetensors",
213
+ "model.layers.24.mlp.down_proj.weight": "model-00010-of-00031.safetensors",
214
+ "model.layers.24.mlp.gate_proj.weight": "model-00010-of-00031.safetensors",
215
+ "model.layers.24.mlp.up_proj.weight": "model-00010-of-00031.safetensors",
216
+ "model.layers.24.post_attention_layernorm.weight": "model-00010-of-00031.safetensors",
217
+ "model.layers.24.self_attn.k_proj.bias": "model-00010-of-00031.safetensors",
218
+ "model.layers.24.self_attn.k_proj.weight": "model-00010-of-00031.safetensors",
219
+ "model.layers.24.self_attn.o_proj.weight": "model-00010-of-00031.safetensors",
220
+ "model.layers.24.self_attn.q_proj.bias": "model-00010-of-00031.safetensors",
221
+ "model.layers.24.self_attn.q_proj.weight": "model-00010-of-00031.safetensors",
222
+ "model.layers.24.self_attn.v_proj.bias": "model-00010-of-00031.safetensors",
223
+ "model.layers.24.self_attn.v_proj.weight": "model-00010-of-00031.safetensors",
224
+ "model.layers.25.input_layernorm.weight": "model-00010-of-00031.safetensors",
225
+ "model.layers.25.mlp.down_proj.weight": "model-00010-of-00031.safetensors",
226
+ "model.layers.25.mlp.gate_proj.weight": "model-00010-of-00031.safetensors",
227
+ "model.layers.25.mlp.up_proj.weight": "model-00010-of-00031.safetensors",
228
+ "model.layers.25.post_attention_layernorm.weight": "model-00010-of-00031.safetensors",
229
+ "model.layers.25.self_attn.k_proj.bias": "model-00010-of-00031.safetensors",
230
+ "model.layers.25.self_attn.k_proj.weight": "model-00010-of-00031.safetensors",
231
+ "model.layers.25.self_attn.o_proj.weight": "model-00010-of-00031.safetensors",
232
+ "model.layers.25.self_attn.q_proj.bias": "model-00010-of-00031.safetensors",
233
+ "model.layers.25.self_attn.q_proj.weight": "model-00010-of-00031.safetensors",
234
+ "model.layers.25.self_attn.v_proj.bias": "model-00010-of-00031.safetensors",
235
+ "model.layers.25.self_attn.v_proj.weight": "model-00010-of-00031.safetensors",
236
+ "model.layers.26.input_layernorm.weight": "model-00011-of-00031.safetensors",
237
+ "model.layers.26.mlp.down_proj.weight": "model-00011-of-00031.safetensors",
238
+ "model.layers.26.mlp.gate_proj.weight": "model-00011-of-00031.safetensors",
239
+ "model.layers.26.mlp.up_proj.weight": "model-00011-of-00031.safetensors",
240
+ "model.layers.26.post_attention_layernorm.weight": "model-00011-of-00031.safetensors",
241
+ "model.layers.26.self_attn.k_proj.bias": "model-00011-of-00031.safetensors",
242
+ "model.layers.26.self_attn.k_proj.weight": "model-00011-of-00031.safetensors",
243
+ "model.layers.26.self_attn.o_proj.weight": "model-00011-of-00031.safetensors",
244
+ "model.layers.26.self_attn.q_proj.bias": "model-00011-of-00031.safetensors",
245
+ "model.layers.26.self_attn.q_proj.weight": "model-00011-of-00031.safetensors",
246
+ "model.layers.26.self_attn.v_proj.bias": "model-00011-of-00031.safetensors",
247
+ "model.layers.26.self_attn.v_proj.weight": "model-00011-of-00031.safetensors",
248
+ "model.layers.27.input_layernorm.weight": "model-00011-of-00031.safetensors",
249
+ "model.layers.27.mlp.down_proj.weight": "model-00011-of-00031.safetensors",
250
+ "model.layers.27.mlp.gate_proj.weight": "model-00011-of-00031.safetensors",
251
+ "model.layers.27.mlp.up_proj.weight": "model-00011-of-00031.safetensors",
252
+ "model.layers.27.post_attention_layernorm.weight": "model-00011-of-00031.safetensors",
253
+ "model.layers.27.self_attn.k_proj.bias": "model-00011-of-00031.safetensors",
254
+ "model.layers.27.self_attn.k_proj.weight": "model-00011-of-00031.safetensors",
255
+ "model.layers.27.self_attn.o_proj.weight": "model-00011-of-00031.safetensors",
256
+ "model.layers.27.self_attn.q_proj.bias": "model-00011-of-00031.safetensors",
257
+ "model.layers.27.self_attn.q_proj.weight": "model-00011-of-00031.safetensors",
258
+ "model.layers.27.self_attn.v_proj.bias": "model-00011-of-00031.safetensors",
259
+ "model.layers.27.self_attn.v_proj.weight": "model-00011-of-00031.safetensors",
260
+ "model.layers.28.input_layernorm.weight": "model-00012-of-00031.safetensors",
261
+ "model.layers.28.mlp.down_proj.weight": "model-00012-of-00031.safetensors",
262
+ "model.layers.28.mlp.gate_proj.weight": "model-00011-of-00031.safetensors",
263
+ "model.layers.28.mlp.up_proj.weight": "model-00011-of-00031.safetensors",
264
+ "model.layers.28.post_attention_layernorm.weight": "model-00012-of-00031.safetensors",
265
+ "model.layers.28.self_attn.k_proj.bias": "model-00011-of-00031.safetensors",
266
+ "model.layers.28.self_attn.k_proj.weight": "model-00011-of-00031.safetensors",
267
+ "model.layers.28.self_attn.o_proj.weight": "model-00011-of-00031.safetensors",
268
+ "model.layers.28.self_attn.q_proj.bias": "model-00011-of-00031.safetensors",
269
+ "model.layers.28.self_attn.q_proj.weight": "model-00011-of-00031.safetensors",
270
+ "model.layers.28.self_attn.v_proj.bias": "model-00011-of-00031.safetensors",
271
+ "model.layers.28.self_attn.v_proj.weight": "model-00011-of-00031.safetensors",
272
+ "model.layers.29.input_layernorm.weight": "model-00012-of-00031.safetensors",
273
+ "model.layers.29.mlp.down_proj.weight": "model-00012-of-00031.safetensors",
274
+ "model.layers.29.mlp.gate_proj.weight": "model-00012-of-00031.safetensors",
275
+ "model.layers.29.mlp.up_proj.weight": "model-00012-of-00031.safetensors",
276
+ "model.layers.29.post_attention_layernorm.weight": "model-00012-of-00031.safetensors",
277
+ "model.layers.29.self_attn.k_proj.bias": "model-00012-of-00031.safetensors",
278
+ "model.layers.29.self_attn.k_proj.weight": "model-00012-of-00031.safetensors",
279
+ "model.layers.29.self_attn.o_proj.weight": "model-00012-of-00031.safetensors",
280
+ "model.layers.29.self_attn.q_proj.bias": "model-00012-of-00031.safetensors",
281
+ "model.layers.29.self_attn.q_proj.weight": "model-00012-of-00031.safetensors",
282
+ "model.layers.29.self_attn.v_proj.bias": "model-00012-of-00031.safetensors",
283
+ "model.layers.29.self_attn.v_proj.weight": "model-00012-of-00031.safetensors",
284
+ "model.layers.3.input_layernorm.weight": "model-00002-of-00031.safetensors",
285
+ "model.layers.3.mlp.down_proj.weight": "model-00002-of-00031.safetensors",
286
+ "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00031.safetensors",
287
+ "model.layers.3.mlp.up_proj.weight": "model-00002-of-00031.safetensors",
288
+ "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00031.safetensors",
289
+ "model.layers.3.self_attn.k_proj.bias": "model-00002-of-00031.safetensors",
290
+ "model.layers.3.self_attn.k_proj.weight": "model-00002-of-00031.safetensors",
291
+ "model.layers.3.self_attn.o_proj.weight": "model-00002-of-00031.safetensors",
292
+ "model.layers.3.self_attn.q_proj.bias": "model-00002-of-00031.safetensors",
293
+ "model.layers.3.self_attn.q_proj.weight": "model-00002-of-00031.safetensors",
294
+ "model.layers.3.self_attn.v_proj.bias": "model-00002-of-00031.safetensors",
295
+ "model.layers.3.self_attn.v_proj.weight": "model-00002-of-00031.safetensors",
296
+ "model.layers.30.input_layernorm.weight": "model-00012-of-00031.safetensors",
297
+ "model.layers.30.mlp.down_proj.weight": "model-00012-of-00031.safetensors",
298
+ "model.layers.30.mlp.gate_proj.weight": "model-00012-of-00031.safetensors",
299
+ "model.layers.30.mlp.up_proj.weight": "model-00012-of-00031.safetensors",
300
+ "model.layers.30.post_attention_layernorm.weight": "model-00012-of-00031.safetensors",
301
+ "model.layers.30.self_attn.k_proj.bias": "model-00012-of-00031.safetensors",
302
+ "model.layers.30.self_attn.k_proj.weight": "model-00012-of-00031.safetensors",
303
+ "model.layers.30.self_attn.o_proj.weight": "model-00012-of-00031.safetensors",
304
+ "model.layers.30.self_attn.q_proj.bias": "model-00012-of-00031.safetensors",
305
+ "model.layers.30.self_attn.q_proj.weight": "model-00012-of-00031.safetensors",
306
+ "model.layers.30.self_attn.v_proj.bias": "model-00012-of-00031.safetensors",
307
+ "model.layers.30.self_attn.v_proj.weight": "model-00012-of-00031.safetensors",
308
+ "model.layers.31.input_layernorm.weight": "model-00013-of-00031.safetensors",
309
+ "model.layers.31.mlp.down_proj.weight": "model-00013-of-00031.safetensors",
310
+ "model.layers.31.mlp.gate_proj.weight": "model-00012-of-00031.safetensors",
311
+ "model.layers.31.mlp.up_proj.weight": "model-00013-of-00031.safetensors",
312
+ "model.layers.31.post_attention_layernorm.weight": "model-00013-of-00031.safetensors",
313
+ "model.layers.31.self_attn.k_proj.bias": "model-00012-of-00031.safetensors",
314
+ "model.layers.31.self_attn.k_proj.weight": "model-00012-of-00031.safetensors",
315
+ "model.layers.31.self_attn.o_proj.weight": "model-00012-of-00031.safetensors",
316
+ "model.layers.31.self_attn.q_proj.bias": "model-00012-of-00031.safetensors",
317
+ "model.layers.31.self_attn.q_proj.weight": "model-00012-of-00031.safetensors",
318
+ "model.layers.31.self_attn.v_proj.bias": "model-00012-of-00031.safetensors",
319
+ "model.layers.31.self_attn.v_proj.weight": "model-00012-of-00031.safetensors",
320
+ "model.layers.32.input_layernorm.weight": "model-00013-of-00031.safetensors",
321
+ "model.layers.32.mlp.down_proj.weight": "model-00013-of-00031.safetensors",
322
+ "model.layers.32.mlp.gate_proj.weight": "model-00013-of-00031.safetensors",
323
+ "model.layers.32.mlp.up_proj.weight": "model-00013-of-00031.safetensors",
324
+ "model.layers.32.post_attention_layernorm.weight": "model-00013-of-00031.safetensors",
325
+ "model.layers.32.self_attn.k_proj.bias": "model-00013-of-00031.safetensors",
326
+ "model.layers.32.self_attn.k_proj.weight": "model-00013-of-00031.safetensors",
327
+ "model.layers.32.self_attn.o_proj.weight": "model-00013-of-00031.safetensors",
328
+ "model.layers.32.self_attn.q_proj.bias": "model-00013-of-00031.safetensors",
329
+ "model.layers.32.self_attn.q_proj.weight": "model-00013-of-00031.safetensors",
330
+ "model.layers.32.self_attn.v_proj.bias": "model-00013-of-00031.safetensors",
331
+ "model.layers.32.self_attn.v_proj.weight": "model-00013-of-00031.safetensors",
332
+ "model.layers.33.input_layernorm.weight": "model-00013-of-00031.safetensors",
333
+ "model.layers.33.mlp.down_proj.weight": "model-00013-of-00031.safetensors",
334
+ "model.layers.33.mlp.gate_proj.weight": "model-00013-of-00031.safetensors",
335
+ "model.layers.33.mlp.up_proj.weight": "model-00013-of-00031.safetensors",
336
+ "model.layers.33.post_attention_layernorm.weight": "model-00013-of-00031.safetensors",
337
+ "model.layers.33.self_attn.k_proj.bias": "model-00013-of-00031.safetensors",
338
+ "model.layers.33.self_attn.k_proj.weight": "model-00013-of-00031.safetensors",
339
+ "model.layers.33.self_attn.o_proj.weight": "model-00013-of-00031.safetensors",
340
+ "model.layers.33.self_attn.q_proj.bias": "model-00013-of-00031.safetensors",
341
+ "model.layers.33.self_attn.q_proj.weight": "model-00013-of-00031.safetensors",
342
+ "model.layers.33.self_attn.v_proj.bias": "model-00013-of-00031.safetensors",
343
+ "model.layers.33.self_attn.v_proj.weight": "model-00013-of-00031.safetensors",
344
+ "model.layers.34.input_layernorm.weight": "model-00014-of-00031.safetensors",
345
+ "model.layers.34.mlp.down_proj.weight": "model-00014-of-00031.safetensors",
346
+ "model.layers.34.mlp.gate_proj.weight": "model-00014-of-00031.safetensors",
347
+ "model.layers.34.mlp.up_proj.weight": "model-00014-of-00031.safetensors",
348
+ "model.layers.34.post_attention_layernorm.weight": "model-00014-of-00031.safetensors",
349
+ "model.layers.34.self_attn.k_proj.bias": "model-00013-of-00031.safetensors",
350
+ "model.layers.34.self_attn.k_proj.weight": "model-00013-of-00031.safetensors",
351
+ "model.layers.34.self_attn.o_proj.weight": "model-00013-of-00031.safetensors",
352
+ "model.layers.34.self_attn.q_proj.bias": "model-00013-of-00031.safetensors",
353
+ "model.layers.34.self_attn.q_proj.weight": "model-00013-of-00031.safetensors",
354
+ "model.layers.34.self_attn.v_proj.bias": "model-00013-of-00031.safetensors",
355
+ "model.layers.34.self_attn.v_proj.weight": "model-00013-of-00031.safetensors",
356
+ "model.layers.35.input_layernorm.weight": "model-00014-of-00031.safetensors",
357
+ "model.layers.35.mlp.down_proj.weight": "model-00014-of-00031.safetensors",
358
+ "model.layers.35.mlp.gate_proj.weight": "model-00014-of-00031.safetensors",
359
+ "model.layers.35.mlp.up_proj.weight": "model-00014-of-00031.safetensors",
360
+ "model.layers.35.post_attention_layernorm.weight": "model-00014-of-00031.safetensors",
361
+ "model.layers.35.self_attn.k_proj.bias": "model-00014-of-00031.safetensors",
362
+ "model.layers.35.self_attn.k_proj.weight": "model-00014-of-00031.safetensors",
363
+ "model.layers.35.self_attn.o_proj.weight": "model-00014-of-00031.safetensors",
364
+ "model.layers.35.self_attn.q_proj.bias": "model-00014-of-00031.safetensors",
365
+ "model.layers.35.self_attn.q_proj.weight": "model-00014-of-00031.safetensors",
366
+ "model.layers.35.self_attn.v_proj.bias": "model-00014-of-00031.safetensors",
367
+ "model.layers.35.self_attn.v_proj.weight": "model-00014-of-00031.safetensors",
368
+ "model.layers.36.input_layernorm.weight": "model-00014-of-00031.safetensors",
369
+ "model.layers.36.mlp.down_proj.weight": "model-00014-of-00031.safetensors",
370
+ "model.layers.36.mlp.gate_proj.weight": "model-00014-of-00031.safetensors",
371
+ "model.layers.36.mlp.up_proj.weight": "model-00014-of-00031.safetensors",
372
+ "model.layers.36.post_attention_layernorm.weight": "model-00014-of-00031.safetensors",
373
+ "model.layers.36.self_attn.k_proj.bias": "model-00014-of-00031.safetensors",
374
+ "model.layers.36.self_attn.k_proj.weight": "model-00014-of-00031.safetensors",
375
+ "model.layers.36.self_attn.o_proj.weight": "model-00014-of-00031.safetensors",
376
+ "model.layers.36.self_attn.q_proj.bias": "model-00014-of-00031.safetensors",
377
+ "model.layers.36.self_attn.q_proj.weight": "model-00014-of-00031.safetensors",
378
+ "model.layers.36.self_attn.v_proj.bias": "model-00014-of-00031.safetensors",
379
+ "model.layers.36.self_attn.v_proj.weight": "model-00014-of-00031.safetensors",
380
+ "model.layers.37.input_layernorm.weight": "model-00015-of-00031.safetensors",
381
+ "model.layers.37.mlp.down_proj.weight": "model-00015-of-00031.safetensors",
382
+ "model.layers.37.mlp.gate_proj.weight": "model-00015-of-00031.safetensors",
383
+ "model.layers.37.mlp.up_proj.weight": "model-00015-of-00031.safetensors",
384
+ "model.layers.37.post_attention_layernorm.weight": "model-00015-of-00031.safetensors",
385
+ "model.layers.37.self_attn.k_proj.bias": "model-00015-of-00031.safetensors",
386
+ "model.layers.37.self_attn.k_proj.weight": "model-00015-of-00031.safetensors",
387
+ "model.layers.37.self_attn.o_proj.weight": "model-00015-of-00031.safetensors",
388
+ "model.layers.37.self_attn.q_proj.bias": "model-00015-of-00031.safetensors",
389
+ "model.layers.37.self_attn.q_proj.weight": "model-00015-of-00031.safetensors",
390
+ "model.layers.37.self_attn.v_proj.bias": "model-00015-of-00031.safetensors",
391
+ "model.layers.37.self_attn.v_proj.weight": "model-00015-of-00031.safetensors",
392
+ "model.layers.38.input_layernorm.weight": "model-00015-of-00031.safetensors",
393
+ "model.layers.38.mlp.down_proj.weight": "model-00015-of-00031.safetensors",
394
+ "model.layers.38.mlp.gate_proj.weight": "model-00015-of-00031.safetensors",
395
+ "model.layers.38.mlp.up_proj.weight": "model-00015-of-00031.safetensors",
396
+ "model.layers.38.post_attention_layernorm.weight": "model-00015-of-00031.safetensors",
397
+ "model.layers.38.self_attn.k_proj.bias": "model-00015-of-00031.safetensors",
398
+ "model.layers.38.self_attn.k_proj.weight": "model-00015-of-00031.safetensors",
399
+ "model.layers.38.self_attn.o_proj.weight": "model-00015-of-00031.safetensors",
400
+ "model.layers.38.self_attn.q_proj.bias": "model-00015-of-00031.safetensors",
401
+ "model.layers.38.self_attn.q_proj.weight": "model-00015-of-00031.safetensors",
402
+ "model.layers.38.self_attn.v_proj.bias": "model-00015-of-00031.safetensors",
403
+ "model.layers.38.self_attn.v_proj.weight": "model-00015-of-00031.safetensors",
404
+ "model.layers.39.input_layernorm.weight": "model-00016-of-00031.safetensors",
405
+ "model.layers.39.mlp.down_proj.weight": "model-00016-of-00031.safetensors",
406
+ "model.layers.39.mlp.gate_proj.weight": "model-00015-of-00031.safetensors",
407
+ "model.layers.39.mlp.up_proj.weight": "model-00015-of-00031.safetensors",
408
+ "model.layers.39.post_attention_layernorm.weight": "model-00016-of-00031.safetensors",
409
+ "model.layers.39.self_attn.k_proj.bias": "model-00015-of-00031.safetensors",
410
+ "model.layers.39.self_attn.k_proj.weight": "model-00015-of-00031.safetensors",
411
+ "model.layers.39.self_attn.o_proj.weight": "model-00015-of-00031.safetensors",
412
+ "model.layers.39.self_attn.q_proj.bias": "model-00015-of-00031.safetensors",
413
+ "model.layers.39.self_attn.q_proj.weight": "model-00015-of-00031.safetensors",
414
+ "model.layers.39.self_attn.v_proj.bias": "model-00015-of-00031.safetensors",
415
+ "model.layers.39.self_attn.v_proj.weight": "model-00015-of-00031.safetensors",
416
+ "model.layers.4.input_layernorm.weight": "model-00003-of-00031.safetensors",
417
+ "model.layers.4.mlp.down_proj.weight": "model-00003-of-00031.safetensors",
418
+ "model.layers.4.mlp.gate_proj.weight": "model-00003-of-00031.safetensors",
419
+ "model.layers.4.mlp.up_proj.weight": "model-00003-of-00031.safetensors",
420
+ "model.layers.4.post_attention_layernorm.weight": "model-00003-of-00031.safetensors",
421
+ "model.layers.4.self_attn.k_proj.bias": "model-00003-of-00031.safetensors",
422
+ "model.layers.4.self_attn.k_proj.weight": "model-00003-of-00031.safetensors",
423
+ "model.layers.4.self_attn.o_proj.weight": "model-00003-of-00031.safetensors",
424
+ "model.layers.4.self_attn.q_proj.bias": "model-00003-of-00031.safetensors",
425
+ "model.layers.4.self_attn.q_proj.weight": "model-00003-of-00031.safetensors",
426
+ "model.layers.4.self_attn.v_proj.bias": "model-00003-of-00031.safetensors",
427
+ "model.layers.4.self_attn.v_proj.weight": "model-00003-of-00031.safetensors",
428
+ "model.layers.40.input_layernorm.weight": "model-00016-of-00031.safetensors",
429
+ "model.layers.40.mlp.down_proj.weight": "model-00016-of-00031.safetensors",
430
+ "model.layers.40.mlp.gate_proj.weight": "model-00016-of-00031.safetensors",
431
+ "model.layers.40.mlp.up_proj.weight": "model-00016-of-00031.safetensors",
432
+ "model.layers.40.post_attention_layernorm.weight": "model-00016-of-00031.safetensors",
433
+ "model.layers.40.self_attn.k_proj.bias": "model-00016-of-00031.safetensors",
434
+ "model.layers.40.self_attn.k_proj.weight": "model-00016-of-00031.safetensors",
435
+ "model.layers.40.self_attn.o_proj.weight": "model-00016-of-00031.safetensors",
436
+ "model.layers.40.self_attn.q_proj.bias": "model-00016-of-00031.safetensors",
437
+ "model.layers.40.self_attn.q_proj.weight": "model-00016-of-00031.safetensors",
438
+ "model.layers.40.self_attn.v_proj.bias": "model-00016-of-00031.safetensors",
439
+ "model.layers.40.self_attn.v_proj.weight": "model-00016-of-00031.safetensors",
440
+ "model.layers.41.input_layernorm.weight": "model-00016-of-00031.safetensors",
441
+ "model.layers.41.mlp.down_proj.weight": "model-00016-of-00031.safetensors",
442
+ "model.layers.41.mlp.gate_proj.weight": "model-00016-of-00031.safetensors",
443
+ "model.layers.41.mlp.up_proj.weight": "model-00016-of-00031.safetensors",
444
+ "model.layers.41.post_attention_layernorm.weight": "model-00016-of-00031.safetensors",
445
+ "model.layers.41.self_attn.k_proj.bias": "model-00016-of-00031.safetensors",
446
+ "model.layers.41.self_attn.k_proj.weight": "model-00016-of-00031.safetensors",
447
+ "model.layers.41.self_attn.o_proj.weight": "model-00016-of-00031.safetensors",
448
+ "model.layers.41.self_attn.q_proj.bias": "model-00016-of-00031.safetensors",
449
+ "model.layers.41.self_attn.q_proj.weight": "model-00016-of-00031.safetensors",
450
+ "model.layers.41.self_attn.v_proj.bias": "model-00016-of-00031.safetensors",
451
+ "model.layers.41.self_attn.v_proj.weight": "model-00016-of-00031.safetensors",
452
+ "model.layers.42.input_layernorm.weight": "model-00017-of-00031.safetensors",
453
+ "model.layers.42.mlp.down_proj.weight": "model-00017-of-00031.safetensors",
454
+ "model.layers.42.mlp.gate_proj.weight": "model-00016-of-00031.safetensors",
455
+ "model.layers.42.mlp.up_proj.weight": "model-00017-of-00031.safetensors",
456
+ "model.layers.42.post_attention_layernorm.weight": "model-00017-of-00031.safetensors",
457
+ "model.layers.42.self_attn.k_proj.bias": "model-00016-of-00031.safetensors",
458
+ "model.layers.42.self_attn.k_proj.weight": "model-00016-of-00031.safetensors",
459
+ "model.layers.42.self_attn.o_proj.weight": "model-00016-of-00031.safetensors",
460
+ "model.layers.42.self_attn.q_proj.bias": "model-00016-of-00031.safetensors",
461
+ "model.layers.42.self_attn.q_proj.weight": "model-00016-of-00031.safetensors",
462
+ "model.layers.42.self_attn.v_proj.bias": "model-00016-of-00031.safetensors",
463
+ "model.layers.42.self_attn.v_proj.weight": "model-00016-of-00031.safetensors",
464
+ "model.layers.43.input_layernorm.weight": "model-00017-of-00031.safetensors",
465
+ "model.layers.43.mlp.down_proj.weight": "model-00017-of-00031.safetensors",
466
+ "model.layers.43.mlp.gate_proj.weight": "model-00017-of-00031.safetensors",
467
+ "model.layers.43.mlp.up_proj.weight": "model-00017-of-00031.safetensors",
468
+ "model.layers.43.post_attention_layernorm.weight": "model-00017-of-00031.safetensors",
469
+ "model.layers.43.self_attn.k_proj.bias": "model-00017-of-00031.safetensors",
470
+ "model.layers.43.self_attn.k_proj.weight": "model-00017-of-00031.safetensors",
471
+ "model.layers.43.self_attn.o_proj.weight": "model-00017-of-00031.safetensors",
472
+ "model.layers.43.self_attn.q_proj.bias": "model-00017-of-00031.safetensors",
473
+ "model.layers.43.self_attn.q_proj.weight": "model-00017-of-00031.safetensors",
474
+ "model.layers.43.self_attn.v_proj.bias": "model-00017-of-00031.safetensors",
475
+ "model.layers.43.self_attn.v_proj.weight": "model-00017-of-00031.safetensors",
476
+ "model.layers.44.input_layernorm.weight": "model-00017-of-00031.safetensors",
477
+ "model.layers.44.mlp.down_proj.weight": "model-00017-of-00031.safetensors",
478
+ "model.layers.44.mlp.gate_proj.weight": "model-00017-of-00031.safetensors",
479
+ "model.layers.44.mlp.up_proj.weight": "model-00017-of-00031.safetensors",
480
+ "model.layers.44.post_attention_layernorm.weight": "model-00017-of-00031.safetensors",
481
+ "model.layers.44.self_attn.k_proj.bias": "model-00017-of-00031.safetensors",
482
+ "model.layers.44.self_attn.k_proj.weight": "model-00017-of-00031.safetensors",
483
+ "model.layers.44.self_attn.o_proj.weight": "model-00017-of-00031.safetensors",
484
+ "model.layers.44.self_attn.q_proj.bias": "model-00017-of-00031.safetensors",
485
+ "model.layers.44.self_attn.q_proj.weight": "model-00017-of-00031.safetensors",
486
+ "model.layers.44.self_attn.v_proj.bias": "model-00017-of-00031.safetensors",
487
+ "model.layers.44.self_attn.v_proj.weight": "model-00017-of-00031.safetensors",
488
+ "model.layers.45.input_layernorm.weight": "model-00018-of-00031.safetensors",
489
+ "model.layers.45.mlp.down_proj.weight": "model-00018-of-00031.safetensors",
490
+ "model.layers.45.mlp.gate_proj.weight": "model-00018-of-00031.safetensors",
491
+ "model.layers.45.mlp.up_proj.weight": "model-00018-of-00031.safetensors",
492
+ "model.layers.45.post_attention_layernorm.weight": "model-00018-of-00031.safetensors",
493
+ "model.layers.45.self_attn.k_proj.bias": "model-00017-of-00031.safetensors",
494
+ "model.layers.45.self_attn.k_proj.weight": "model-00017-of-00031.safetensors",
495
+ "model.layers.45.self_attn.o_proj.weight": "model-00017-of-00031.safetensors",
496
+ "model.layers.45.self_attn.q_proj.bias": "model-00017-of-00031.safetensors",
497
+ "model.layers.45.self_attn.q_proj.weight": "model-00017-of-00031.safetensors",
498
+ "model.layers.45.self_attn.v_proj.bias": "model-00017-of-00031.safetensors",
499
+ "model.layers.45.self_attn.v_proj.weight": "model-00017-of-00031.safetensors",
500
+ "model.layers.46.input_layernorm.weight": "model-00018-of-00031.safetensors",
501
+ "model.layers.46.mlp.down_proj.weight": "model-00018-of-00031.safetensors",
502
+ "model.layers.46.mlp.gate_proj.weight": "model-00018-of-00031.safetensors",
503
+ "model.layers.46.mlp.up_proj.weight": "model-00018-of-00031.safetensors",
504
+ "model.layers.46.post_attention_layernorm.weight": "model-00018-of-00031.safetensors",
505
+ "model.layers.46.self_attn.k_proj.bias": "model-00018-of-00031.safetensors",
506
+ "model.layers.46.self_attn.k_proj.weight": "model-00018-of-00031.safetensors",
507
+ "model.layers.46.self_attn.o_proj.weight": "model-00018-of-00031.safetensors",
508
+ "model.layers.46.self_attn.q_proj.bias": "model-00018-of-00031.safetensors",
509
+ "model.layers.46.self_attn.q_proj.weight": "model-00018-of-00031.safetensors",
510
+ "model.layers.46.self_attn.v_proj.bias": "model-00018-of-00031.safetensors",
511
+ "model.layers.46.self_attn.v_proj.weight": "model-00018-of-00031.safetensors",
512
+ "model.layers.47.input_layernorm.weight": "model-00018-of-00031.safetensors",
513
+ "model.layers.47.mlp.down_proj.weight": "model-00018-of-00031.safetensors",
514
+ "model.layers.47.mlp.gate_proj.weight": "model-00018-of-00031.safetensors",
515
+ "model.layers.47.mlp.up_proj.weight": "model-00018-of-00031.safetensors",
516
+ "model.layers.47.post_attention_layernorm.weight": "model-00018-of-00031.safetensors",
517
+ "model.layers.47.self_attn.k_proj.bias": "model-00018-of-00031.safetensors",
518
+ "model.layers.47.self_attn.k_proj.weight": "model-00018-of-00031.safetensors",
519
+ "model.layers.47.self_attn.o_proj.weight": "model-00018-of-00031.safetensors",
520
+ "model.layers.47.self_attn.q_proj.bias": "model-00018-of-00031.safetensors",
521
+ "model.layers.47.self_attn.q_proj.weight": "model-00018-of-00031.safetensors",
522
+ "model.layers.47.self_attn.v_proj.bias": "model-00018-of-00031.safetensors",
523
+ "model.layers.47.self_attn.v_proj.weight": "model-00018-of-00031.safetensors",
524
+ "model.layers.48.input_layernorm.weight": "model-00019-of-00031.safetensors",
525
+ "model.layers.48.mlp.down_proj.weight": "model-00019-of-00031.safetensors",
526
+ "model.layers.48.mlp.gate_proj.weight": "model-00019-of-00031.safetensors",
527
+ "model.layers.48.mlp.up_proj.weight": "model-00019-of-00031.safetensors",
528
+ "model.layers.48.post_attention_layernorm.weight": "model-00019-of-00031.safetensors",
529
+ "model.layers.48.self_attn.k_proj.bias": "model-00019-of-00031.safetensors",
530
+ "model.layers.48.self_attn.k_proj.weight": "model-00019-of-00031.safetensors",
531
+ "model.layers.48.self_attn.o_proj.weight": "model-00019-of-00031.safetensors",
532
+ "model.layers.48.self_attn.q_proj.bias": "model-00019-of-00031.safetensors",
533
+ "model.layers.48.self_attn.q_proj.weight": "model-00019-of-00031.safetensors",
534
+ "model.layers.48.self_attn.v_proj.bias": "model-00019-of-00031.safetensors",
535
+ "model.layers.48.self_attn.v_proj.weight": "model-00019-of-00031.safetensors",
536
+ "model.layers.49.input_layernorm.weight": "model-00019-of-00031.safetensors",
537
+ "model.layers.49.mlp.down_proj.weight": "model-00019-of-00031.safetensors",
538
+ "model.layers.49.mlp.gate_proj.weight": "model-00019-of-00031.safetensors",
539
+ "model.layers.49.mlp.up_proj.weight": "model-00019-of-00031.safetensors",
540
+ "model.layers.49.post_attention_layernorm.weight": "model-00019-of-00031.safetensors",
541
+ "model.layers.49.self_attn.k_proj.bias": "model-00019-of-00031.safetensors",
542
+ "model.layers.49.self_attn.k_proj.weight": "model-00019-of-00031.safetensors",
543
+ "model.layers.49.self_attn.o_proj.weight": "model-00019-of-00031.safetensors",
544
+ "model.layers.49.self_attn.q_proj.bias": "model-00019-of-00031.safetensors",
545
+ "model.layers.49.self_attn.q_proj.weight": "model-00019-of-00031.safetensors",
546
+ "model.layers.49.self_attn.v_proj.bias": "model-00019-of-00031.safetensors",
547
+ "model.layers.49.self_attn.v_proj.weight": "model-00019-of-00031.safetensors",
548
+ "model.layers.5.input_layernorm.weight": "model-00003-of-00031.safetensors",
549
+ "model.layers.5.mlp.down_proj.weight": "model-00003-of-00031.safetensors",
550
+ "model.layers.5.mlp.gate_proj.weight": "model-00003-of-00031.safetensors",
551
+ "model.layers.5.mlp.up_proj.weight": "model-00003-of-00031.safetensors",
552
+ "model.layers.5.post_attention_layernorm.weight": "model-00003-of-00031.safetensors",
553
+ "model.layers.5.self_attn.k_proj.bias": "model-00003-of-00031.safetensors",
554
+ "model.layers.5.self_attn.k_proj.weight": "model-00003-of-00031.safetensors",
555
+ "model.layers.5.self_attn.o_proj.weight": "model-00003-of-00031.safetensors",
556
+ "model.layers.5.self_attn.q_proj.bias": "model-00003-of-00031.safetensors",
557
+ "model.layers.5.self_attn.q_proj.weight": "model-00003-of-00031.safetensors",
558
+ "model.layers.5.self_attn.v_proj.bias": "model-00003-of-00031.safetensors",
559
+ "model.layers.5.self_attn.v_proj.weight": "model-00003-of-00031.safetensors",
560
+ "model.layers.50.input_layernorm.weight": "model-00020-of-00031.safetensors",
561
+ "model.layers.50.mlp.down_proj.weight": "model-00020-of-00031.safetensors",
562
+ "model.layers.50.mlp.gate_proj.weight": "model-00019-of-00031.safetensors",
563
+ "model.layers.50.mlp.up_proj.weight": "model-00019-of-00031.safetensors",
564
+ "model.layers.50.post_attention_layernorm.weight": "model-00020-of-00031.safetensors",
565
+ "model.layers.50.self_attn.k_proj.bias": "model-00019-of-00031.safetensors",
566
+ "model.layers.50.self_attn.k_proj.weight": "model-00019-of-00031.safetensors",
567
+ "model.layers.50.self_attn.o_proj.weight": "model-00019-of-00031.safetensors",
568
+ "model.layers.50.self_attn.q_proj.bias": "model-00019-of-00031.safetensors",
569
+ "model.layers.50.self_attn.q_proj.weight": "model-00019-of-00031.safetensors",
570
+ "model.layers.50.self_attn.v_proj.bias": "model-00019-of-00031.safetensors",
571
+ "model.layers.50.self_attn.v_proj.weight": "model-00019-of-00031.safetensors",
572
+ "model.layers.51.input_layernorm.weight": "model-00020-of-00031.safetensors",
573
+ "model.layers.51.mlp.down_proj.weight": "model-00020-of-00031.safetensors",
574
+ "model.layers.51.mlp.gate_proj.weight": "model-00020-of-00031.safetensors",
575
+ "model.layers.51.mlp.up_proj.weight": "model-00020-of-00031.safetensors",
576
+ "model.layers.51.post_attention_layernorm.weight": "model-00020-of-00031.safetensors",
577
+ "model.layers.51.self_attn.k_proj.bias": "model-00020-of-00031.safetensors",
578
+ "model.layers.51.self_attn.k_proj.weight": "model-00020-of-00031.safetensors",
579
+ "model.layers.51.self_attn.o_proj.weight": "model-00020-of-00031.safetensors",
580
+ "model.layers.51.self_attn.q_proj.bias": "model-00020-of-00031.safetensors",
581
+ "model.layers.51.self_attn.q_proj.weight": "model-00020-of-00031.safetensors",
582
+ "model.layers.51.self_attn.v_proj.bias": "model-00020-of-00031.safetensors",
583
+ "model.layers.51.self_attn.v_proj.weight": "model-00020-of-00031.safetensors",
584
+ "model.layers.52.input_layernorm.weight": "model-00020-of-00031.safetensors",
585
+ "model.layers.52.mlp.down_proj.weight": "model-00020-of-00031.safetensors",
586
+ "model.layers.52.mlp.gate_proj.weight": "model-00020-of-00031.safetensors",
587
+ "model.layers.52.mlp.up_proj.weight": "model-00020-of-00031.safetensors",
588
+ "model.layers.52.post_attention_layernorm.weight": "model-00020-of-00031.safetensors",
589
+ "model.layers.52.self_attn.k_proj.bias": "model-00020-of-00031.safetensors",
590
+ "model.layers.52.self_attn.k_proj.weight": "model-00020-of-00031.safetensors",
591
+ "model.layers.52.self_attn.o_proj.weight": "model-00020-of-00031.safetensors",
592
+ "model.layers.52.self_attn.q_proj.bias": "model-00020-of-00031.safetensors",
593
+ "model.layers.52.self_attn.q_proj.weight": "model-00020-of-00031.safetensors",
594
+ "model.layers.52.self_attn.v_proj.bias": "model-00020-of-00031.safetensors",
595
+ "model.layers.52.self_attn.v_proj.weight": "model-00020-of-00031.safetensors",
596
+ "model.layers.53.input_layernorm.weight": "model-00021-of-00031.safetensors",
597
+ "model.layers.53.mlp.down_proj.weight": "model-00021-of-00031.safetensors",
598
+ "model.layers.53.mlp.gate_proj.weight": "model-00020-of-00031.safetensors",
599
+ "model.layers.53.mlp.up_proj.weight": "model-00021-of-00031.safetensors",
600
+ "model.layers.53.post_attention_layernorm.weight": "model-00021-of-00031.safetensors",
601
+ "model.layers.53.self_attn.k_proj.bias": "model-00020-of-00031.safetensors",
602
+ "model.layers.53.self_attn.k_proj.weight": "model-00020-of-00031.safetensors",
603
+ "model.layers.53.self_attn.o_proj.weight": "model-00020-of-00031.safetensors",
604
+ "model.layers.53.self_attn.q_proj.bias": "model-00020-of-00031.safetensors",
605
+ "model.layers.53.self_attn.q_proj.weight": "model-00020-of-00031.safetensors",
606
+ "model.layers.53.self_attn.v_proj.bias": "model-00020-of-00031.safetensors",
607
+ "model.layers.53.self_attn.v_proj.weight": "model-00020-of-00031.safetensors",
608
+ "model.layers.54.input_layernorm.weight": "model-00021-of-00031.safetensors",
609
+ "model.layers.54.mlp.down_proj.weight": "model-00021-of-00031.safetensors",
610
+ "model.layers.54.mlp.gate_proj.weight": "model-00021-of-00031.safetensors",
611
+ "model.layers.54.mlp.up_proj.weight": "model-00021-of-00031.safetensors",
612
+ "model.layers.54.post_attention_layernorm.weight": "model-00021-of-00031.safetensors",
613
+ "model.layers.54.self_attn.k_proj.bias": "model-00021-of-00031.safetensors",
614
+ "model.layers.54.self_attn.k_proj.weight": "model-00021-of-00031.safetensors",
615
+ "model.layers.54.self_attn.o_proj.weight": "model-00021-of-00031.safetensors",
616
+ "model.layers.54.self_attn.q_proj.bias": "model-00021-of-00031.safetensors",
617
+ "model.layers.54.self_attn.q_proj.weight": "model-00021-of-00031.safetensors",
618
+ "model.layers.54.self_attn.v_proj.bias": "model-00021-of-00031.safetensors",
619
+ "model.layers.54.self_attn.v_proj.weight": "model-00021-of-00031.safetensors",
620
+ "model.layers.55.input_layernorm.weight": "model-00021-of-00031.safetensors",
621
+ "model.layers.55.mlp.down_proj.weight": "model-00021-of-00031.safetensors",
622
+ "model.layers.55.mlp.gate_proj.weight": "model-00021-of-00031.safetensors",
623
+ "model.layers.55.mlp.up_proj.weight": "model-00021-of-00031.safetensors",
624
+ "model.layers.55.post_attention_layernorm.weight": "model-00021-of-00031.safetensors",
625
+ "model.layers.55.self_attn.k_proj.bias": "model-00021-of-00031.safetensors",
626
+ "model.layers.55.self_attn.k_proj.weight": "model-00021-of-00031.safetensors",
627
+ "model.layers.55.self_attn.o_proj.weight": "model-00021-of-00031.safetensors",
628
+ "model.layers.55.self_attn.q_proj.bias": "model-00021-of-00031.safetensors",
629
+ "model.layers.55.self_attn.q_proj.weight": "model-00021-of-00031.safetensors",
630
+ "model.layers.55.self_attn.v_proj.bias": "model-00021-of-00031.safetensors",
631
+ "model.layers.55.self_attn.v_proj.weight": "model-00021-of-00031.safetensors",
632
+ "model.layers.56.input_layernorm.weight": "model-00022-of-00031.safetensors",
633
+ "model.layers.56.mlp.down_proj.weight": "model-00022-of-00031.safetensors",
634
+ "model.layers.56.mlp.gate_proj.weight": "model-00022-of-00031.safetensors",
635
+ "model.layers.56.mlp.up_proj.weight": "model-00022-of-00031.safetensors",
636
+ "model.layers.56.post_attention_layernorm.weight": "model-00022-of-00031.safetensors",
637
+ "model.layers.56.self_attn.k_proj.bias": "model-00021-of-00031.safetensors",
638
+ "model.layers.56.self_attn.k_proj.weight": "model-00021-of-00031.safetensors",
639
+ "model.layers.56.self_attn.o_proj.weight": "model-00021-of-00031.safetensors",
640
+ "model.layers.56.self_attn.q_proj.bias": "model-00021-of-00031.safetensors",
641
+ "model.layers.56.self_attn.q_proj.weight": "model-00021-of-00031.safetensors",
642
+ "model.layers.56.self_attn.v_proj.bias": "model-00021-of-00031.safetensors",
643
+ "model.layers.56.self_attn.v_proj.weight": "model-00021-of-00031.safetensors",
644
+ "model.layers.57.input_layernorm.weight": "model-00022-of-00031.safetensors",
645
+ "model.layers.57.mlp.down_proj.weight": "model-00022-of-00031.safetensors",
646
+ "model.layers.57.mlp.gate_proj.weight": "model-00022-of-00031.safetensors",
647
+ "model.layers.57.mlp.up_proj.weight": "model-00022-of-00031.safetensors",
648
+ "model.layers.57.post_attention_layernorm.weight": "model-00022-of-00031.safetensors",
649
+ "model.layers.57.self_attn.k_proj.bias": "model-00022-of-00031.safetensors",
650
+ "model.layers.57.self_attn.k_proj.weight": "model-00022-of-00031.safetensors",
651
+ "model.layers.57.self_attn.o_proj.weight": "model-00022-of-00031.safetensors",
652
+ "model.layers.57.self_attn.q_proj.bias": "model-00022-of-00031.safetensors",
653
+ "model.layers.57.self_attn.q_proj.weight": "model-00022-of-00031.safetensors",
654
+ "model.layers.57.self_attn.v_proj.bias": "model-00022-of-00031.safetensors",
655
+ "model.layers.57.self_attn.v_proj.weight": "model-00022-of-00031.safetensors",
656
+ "model.layers.58.input_layernorm.weight": "model-00022-of-00031.safetensors",
657
+ "model.layers.58.mlp.down_proj.weight": "model-00022-of-00031.safetensors",
658
+ "model.layers.58.mlp.gate_proj.weight": "model-00022-of-00031.safetensors",
659
+ "model.layers.58.mlp.up_proj.weight": "model-00022-of-00031.safetensors",
660
+ "model.layers.58.post_attention_layernorm.weight": "model-00022-of-00031.safetensors",
661
+ "model.layers.58.self_attn.k_proj.bias": "model-00022-of-00031.safetensors",
662
+ "model.layers.58.self_attn.k_proj.weight": "model-00022-of-00031.safetensors",
663
+ "model.layers.58.self_attn.o_proj.weight": "model-00022-of-00031.safetensors",
664
+ "model.layers.58.self_attn.q_proj.bias": "model-00022-of-00031.safetensors",
665
+ "model.layers.58.self_attn.q_proj.weight": "model-00022-of-00031.safetensors",
666
+ "model.layers.58.self_attn.v_proj.bias": "model-00022-of-00031.safetensors",
667
+ "model.layers.58.self_attn.v_proj.weight": "model-00022-of-00031.safetensors",
668
+ "model.layers.59.input_layernorm.weight": "model-00023-of-00031.safetensors",
669
+ "model.layers.59.mlp.down_proj.weight": "model-00023-of-00031.safetensors",
670
+ "model.layers.59.mlp.gate_proj.weight": "model-00023-of-00031.safetensors",
671
+ "model.layers.59.mlp.up_proj.weight": "model-00023-of-00031.safetensors",
672
+ "model.layers.59.post_attention_layernorm.weight": "model-00023-of-00031.safetensors",
673
+ "model.layers.59.self_attn.k_proj.bias": "model-00023-of-00031.safetensors",
674
+ "model.layers.59.self_attn.k_proj.weight": "model-00023-of-00031.safetensors",
675
+ "model.layers.59.self_attn.o_proj.weight": "model-00023-of-00031.safetensors",
676
+ "model.layers.59.self_attn.q_proj.bias": "model-00023-of-00031.safetensors",
677
+ "model.layers.59.self_attn.q_proj.weight": "model-00023-of-00031.safetensors",
678
+ "model.layers.59.self_attn.v_proj.bias": "model-00023-of-00031.safetensors",
679
+ "model.layers.59.self_attn.v_proj.weight": "model-00023-of-00031.safetensors",
680
+ "model.layers.6.input_layernorm.weight": "model-00004-of-00031.safetensors",
681
+ "model.layers.6.mlp.down_proj.weight": "model-00004-of-00031.safetensors",
682
+ "model.layers.6.mlp.gate_proj.weight": "model-00003-of-00031.safetensors",
683
+ "model.layers.6.mlp.up_proj.weight": "model-00003-of-00031.safetensors",
684
+ "model.layers.6.post_attention_layernorm.weight": "model-00004-of-00031.safetensors",
685
+ "model.layers.6.self_attn.k_proj.bias": "model-00003-of-00031.safetensors",
686
+ "model.layers.6.self_attn.k_proj.weight": "model-00003-of-00031.safetensors",
687
+ "model.layers.6.self_attn.o_proj.weight": "model-00003-of-00031.safetensors",
688
+ "model.layers.6.self_attn.q_proj.bias": "model-00003-of-00031.safetensors",
689
+ "model.layers.6.self_attn.q_proj.weight": "model-00003-of-00031.safetensors",
690
+ "model.layers.6.self_attn.v_proj.bias": "model-00003-of-00031.safetensors",
691
+ "model.layers.6.self_attn.v_proj.weight": "model-00003-of-00031.safetensors",
692
+ "model.layers.60.input_layernorm.weight": "model-00023-of-00031.safetensors",
693
+ "model.layers.60.mlp.down_proj.weight": "model-00023-of-00031.safetensors",
694
+ "model.layers.60.mlp.gate_proj.weight": "model-00023-of-00031.safetensors",
695
+ "model.layers.60.mlp.up_proj.weight": "model-00023-of-00031.safetensors",
696
+ "model.layers.60.post_attention_layernorm.weight": "model-00023-of-00031.safetensors",
697
+ "model.layers.60.self_attn.k_proj.bias": "model-00023-of-00031.safetensors",
698
+ "model.layers.60.self_attn.k_proj.weight": "model-00023-of-00031.safetensors",
699
+ "model.layers.60.self_attn.o_proj.weight": "model-00023-of-00031.safetensors",
700
+ "model.layers.60.self_attn.q_proj.bias": "model-00023-of-00031.safetensors",
701
+ "model.layers.60.self_attn.q_proj.weight": "model-00023-of-00031.safetensors",
702
+ "model.layers.60.self_attn.v_proj.bias": "model-00023-of-00031.safetensors",
703
+ "model.layers.60.self_attn.v_proj.weight": "model-00023-of-00031.safetensors",
704
+ "model.layers.61.input_layernorm.weight": "model-00024-of-00031.safetensors",
705
+ "model.layers.61.mlp.down_proj.weight": "model-00024-of-00031.safetensors",
706
+ "model.layers.61.mlp.gate_proj.weight": "model-00023-of-00031.safetensors",
707
+ "model.layers.61.mlp.up_proj.weight": "model-00023-of-00031.safetensors",
708
+ "model.layers.61.post_attention_layernorm.weight": "model-00024-of-00031.safetensors",
709
+ "model.layers.61.self_attn.k_proj.bias": "model-00023-of-00031.safetensors",
710
+ "model.layers.61.self_attn.k_proj.weight": "model-00023-of-00031.safetensors",
711
+ "model.layers.61.self_attn.o_proj.weight": "model-00023-of-00031.safetensors",
712
+ "model.layers.61.self_attn.q_proj.bias": "model-00023-of-00031.safetensors",
713
+ "model.layers.61.self_attn.q_proj.weight": "model-00023-of-00031.safetensors",
714
+ "model.layers.61.self_attn.v_proj.bias": "model-00023-of-00031.safetensors",
715
+ "model.layers.61.self_attn.v_proj.weight": "model-00023-of-00031.safetensors",
716
+ "model.layers.62.input_layernorm.weight": "model-00024-of-00031.safetensors",
717
+ "model.layers.62.mlp.down_proj.weight": "model-00024-of-00031.safetensors",
718
+ "model.layers.62.mlp.gate_proj.weight": "model-00024-of-00031.safetensors",
719
+ "model.layers.62.mlp.up_proj.weight": "model-00024-of-00031.safetensors",
720
+ "model.layers.62.post_attention_layernorm.weight": "model-00024-of-00031.safetensors",
721
+ "model.layers.62.self_attn.k_proj.bias": "model-00024-of-00031.safetensors",
722
+ "model.layers.62.self_attn.k_proj.weight": "model-00024-of-00031.safetensors",
723
+ "model.layers.62.self_attn.o_proj.weight": "model-00024-of-00031.safetensors",
724
+ "model.layers.62.self_attn.q_proj.bias": "model-00024-of-00031.safetensors",
725
+ "model.layers.62.self_attn.q_proj.weight": "model-00024-of-00031.safetensors",
726
+ "model.layers.62.self_attn.v_proj.bias": "model-00024-of-00031.safetensors",
727
+ "model.layers.62.self_attn.v_proj.weight": "model-00024-of-00031.safetensors",
728
+ "model.layers.63.input_layernorm.weight": "model-00024-of-00031.safetensors",
729
+ "model.layers.63.mlp.down_proj.weight": "model-00024-of-00031.safetensors",
730
+ "model.layers.63.mlp.gate_proj.weight": "model-00024-of-00031.safetensors",
731
+ "model.layers.63.mlp.up_proj.weight": "model-00024-of-00031.safetensors",
732
+ "model.layers.63.post_attention_layernorm.weight": "model-00024-of-00031.safetensors",
733
+ "model.layers.63.self_attn.k_proj.bias": "model-00024-of-00031.safetensors",
734
+ "model.layers.63.self_attn.k_proj.weight": "model-00024-of-00031.safetensors",
735
+ "model.layers.63.self_attn.o_proj.weight": "model-00024-of-00031.safetensors",
736
+ "model.layers.63.self_attn.q_proj.bias": "model-00024-of-00031.safetensors",
737
+ "model.layers.63.self_attn.q_proj.weight": "model-00024-of-00031.safetensors",
738
+ "model.layers.63.self_attn.v_proj.bias": "model-00024-of-00031.safetensors",
739
+ "model.layers.63.self_attn.v_proj.weight": "model-00024-of-00031.safetensors",
740
+ "model.layers.64.input_layernorm.weight": "model-00025-of-00031.safetensors",
741
+ "model.layers.64.mlp.down_proj.weight": "model-00025-of-00031.safetensors",
742
+ "model.layers.64.mlp.gate_proj.weight": "model-00024-of-00031.safetensors",
743
+ "model.layers.64.mlp.up_proj.weight": "model-00025-of-00031.safetensors",
744
+ "model.layers.64.post_attention_layernorm.weight": "model-00025-of-00031.safetensors",
745
+ "model.layers.64.self_attn.k_proj.bias": "model-00024-of-00031.safetensors",
746
+ "model.layers.64.self_attn.k_proj.weight": "model-00024-of-00031.safetensors",
747
+ "model.layers.64.self_attn.o_proj.weight": "model-00024-of-00031.safetensors",
748
+ "model.layers.64.self_attn.q_proj.bias": "model-00024-of-00031.safetensors",
749
+ "model.layers.64.self_attn.q_proj.weight": "model-00024-of-00031.safetensors",
750
+ "model.layers.64.self_attn.v_proj.bias": "model-00024-of-00031.safetensors",
751
+ "model.layers.64.self_attn.v_proj.weight": "model-00024-of-00031.safetensors",
752
+ "model.layers.65.input_layernorm.weight": "model-00025-of-00031.safetensors",
753
+ "model.layers.65.mlp.down_proj.weight": "model-00025-of-00031.safetensors",
754
+ "model.layers.65.mlp.gate_proj.weight": "model-00025-of-00031.safetensors",
755
+ "model.layers.65.mlp.up_proj.weight": "model-00025-of-00031.safetensors",
756
+ "model.layers.65.post_attention_layernorm.weight": "model-00025-of-00031.safetensors",
757
+ "model.layers.65.self_attn.k_proj.bias": "model-00025-of-00031.safetensors",
758
+ "model.layers.65.self_attn.k_proj.weight": "model-00025-of-00031.safetensors",
759
+ "model.layers.65.self_attn.o_proj.weight": "model-00025-of-00031.safetensors",
760
+ "model.layers.65.self_attn.q_proj.bias": "model-00025-of-00031.safetensors",
761
+ "model.layers.65.self_attn.q_proj.weight": "model-00025-of-00031.safetensors",
762
+ "model.layers.65.self_attn.v_proj.bias": "model-00025-of-00031.safetensors",
763
+ "model.layers.65.self_attn.v_proj.weight": "model-00025-of-00031.safetensors",
764
+ "model.layers.66.input_layernorm.weight": "model-00025-of-00031.safetensors",
765
+ "model.layers.66.mlp.down_proj.weight": "model-00025-of-00031.safetensors",
766
+ "model.layers.66.mlp.gate_proj.weight": "model-00025-of-00031.safetensors",
767
+ "model.layers.66.mlp.up_proj.weight": "model-00025-of-00031.safetensors",
768
+ "model.layers.66.post_attention_layernorm.weight": "model-00025-of-00031.safetensors",
769
+ "model.layers.66.self_attn.k_proj.bias": "model-00025-of-00031.safetensors",
770
+ "model.layers.66.self_attn.k_proj.weight": "model-00025-of-00031.safetensors",
771
+ "model.layers.66.self_attn.o_proj.weight": "model-00025-of-00031.safetensors",
772
+ "model.layers.66.self_attn.q_proj.bias": "model-00025-of-00031.safetensors",
773
+ "model.layers.66.self_attn.q_proj.weight": "model-00025-of-00031.safetensors",
774
+ "model.layers.66.self_attn.v_proj.bias": "model-00025-of-00031.safetensors",
775
+ "model.layers.66.self_attn.v_proj.weight": "model-00025-of-00031.safetensors",
776
+ "model.layers.67.input_layernorm.weight": "model-00026-of-00031.safetensors",
777
+ "model.layers.67.mlp.down_proj.weight": "model-00026-of-00031.safetensors",
778
+ "model.layers.67.mlp.gate_proj.weight": "model-00026-of-00031.safetensors",
779
+ "model.layers.67.mlp.up_proj.weight": "model-00026-of-00031.safetensors",
780
+ "model.layers.67.post_attention_layernorm.weight": "model-00026-of-00031.safetensors",
781
+ "model.layers.67.self_attn.k_proj.bias": "model-00025-of-00031.safetensors",
782
+ "model.layers.67.self_attn.k_proj.weight": "model-00025-of-00031.safetensors",
783
+ "model.layers.67.self_attn.o_proj.weight": "model-00025-of-00031.safetensors",
784
+ "model.layers.67.self_attn.q_proj.bias": "model-00025-of-00031.safetensors",
785
+ "model.layers.67.self_attn.q_proj.weight": "model-00025-of-00031.safetensors",
786
+ "model.layers.67.self_attn.v_proj.bias": "model-00025-of-00031.safetensors",
787
+ "model.layers.67.self_attn.v_proj.weight": "model-00025-of-00031.safetensors",
788
+ "model.layers.68.input_layernorm.weight": "model-00026-of-00031.safetensors",
789
+ "model.layers.68.mlp.down_proj.weight": "model-00026-of-00031.safetensors",
790
+ "model.layers.68.mlp.gate_proj.weight": "model-00026-of-00031.safetensors",
791
+ "model.layers.68.mlp.up_proj.weight": "model-00026-of-00031.safetensors",
792
+ "model.layers.68.post_attention_layernorm.weight": "model-00026-of-00031.safetensors",
793
+ "model.layers.68.self_attn.k_proj.bias": "model-00026-of-00031.safetensors",
794
+ "model.layers.68.self_attn.k_proj.weight": "model-00026-of-00031.safetensors",
795
+ "model.layers.68.self_attn.o_proj.weight": "model-00026-of-00031.safetensors",
796
+ "model.layers.68.self_attn.q_proj.bias": "model-00026-of-00031.safetensors",
797
+ "model.layers.68.self_attn.q_proj.weight": "model-00026-of-00031.safetensors",
798
+ "model.layers.68.self_attn.v_proj.bias": "model-00026-of-00031.safetensors",
799
+ "model.layers.68.self_attn.v_proj.weight": "model-00026-of-00031.safetensors",
800
+ "model.layers.69.input_layernorm.weight": "model-00026-of-00031.safetensors",
801
+ "model.layers.69.mlp.down_proj.weight": "model-00026-of-00031.safetensors",
802
+ "model.layers.69.mlp.gate_proj.weight": "model-00026-of-00031.safetensors",
803
+ "model.layers.69.mlp.up_proj.weight": "model-00026-of-00031.safetensors",
804
+ "model.layers.69.post_attention_layernorm.weight": "model-00026-of-00031.safetensors",
805
+ "model.layers.69.self_attn.k_proj.bias": "model-00026-of-00031.safetensors",
806
+ "model.layers.69.self_attn.k_proj.weight": "model-00026-of-00031.safetensors",
807
+ "model.layers.69.self_attn.o_proj.weight": "model-00026-of-00031.safetensors",
808
+ "model.layers.69.self_attn.q_proj.bias": "model-00026-of-00031.safetensors",
809
+ "model.layers.69.self_attn.q_proj.weight": "model-00026-of-00031.safetensors",
810
+ "model.layers.69.self_attn.v_proj.bias": "model-00026-of-00031.safetensors",
811
+ "model.layers.69.self_attn.v_proj.weight": "model-00026-of-00031.safetensors",
812
+ "model.layers.7.input_layernorm.weight": "model-00004-of-00031.safetensors",
813
+ "model.layers.7.mlp.down_proj.weight": "model-00004-of-00031.safetensors",
814
+ "model.layers.7.mlp.gate_proj.weight": "model-00004-of-00031.safetensors",
815
+ "model.layers.7.mlp.up_proj.weight": "model-00004-of-00031.safetensors",
816
+ "model.layers.7.post_attention_layernorm.weight": "model-00004-of-00031.safetensors",
817
+ "model.layers.7.self_attn.k_proj.bias": "model-00004-of-00031.safetensors",
818
+ "model.layers.7.self_attn.k_proj.weight": "model-00004-of-00031.safetensors",
819
+ "model.layers.7.self_attn.o_proj.weight": "model-00004-of-00031.safetensors",
820
+ "model.layers.7.self_attn.q_proj.bias": "model-00004-of-00031.safetensors",
821
+ "model.layers.7.self_attn.q_proj.weight": "model-00004-of-00031.safetensors",
822
+ "model.layers.7.self_attn.v_proj.bias": "model-00004-of-00031.safetensors",
823
+ "model.layers.7.self_attn.v_proj.weight": "model-00004-of-00031.safetensors",
824
+ "model.layers.70.input_layernorm.weight": "model-00027-of-00031.safetensors",
825
+ "model.layers.70.mlp.down_proj.weight": "model-00027-of-00031.safetensors",
826
+ "model.layers.70.mlp.gate_proj.weight": "model-00027-of-00031.safetensors",
827
+ "model.layers.70.mlp.up_proj.weight": "model-00027-of-00031.safetensors",
828
+ "model.layers.70.post_attention_layernorm.weight": "model-00027-of-00031.safetensors",
829
+ "model.layers.70.self_attn.k_proj.bias": "model-00027-of-00031.safetensors",
830
+ "model.layers.70.self_attn.k_proj.weight": "model-00027-of-00031.safetensors",
831
+ "model.layers.70.self_attn.o_proj.weight": "model-00027-of-00031.safetensors",
832
+ "model.layers.70.self_attn.q_proj.bias": "model-00027-of-00031.safetensors",
833
+ "model.layers.70.self_attn.q_proj.weight": "model-00027-of-00031.safetensors",
834
+ "model.layers.70.self_attn.v_proj.bias": "model-00027-of-00031.safetensors",
835
+ "model.layers.70.self_attn.v_proj.weight": "model-00027-of-00031.safetensors",
836
+ "model.layers.71.input_layernorm.weight": "model-00027-of-00031.safetensors",
837
+ "model.layers.71.mlp.down_proj.weight": "model-00027-of-00031.safetensors",
838
+ "model.layers.71.mlp.gate_proj.weight": "model-00027-of-00031.safetensors",
839
+ "model.layers.71.mlp.up_proj.weight": "model-00027-of-00031.safetensors",
840
+ "model.layers.71.post_attention_layernorm.weight": "model-00027-of-00031.safetensors",
841
+ "model.layers.71.self_attn.k_proj.bias": "model-00027-of-00031.safetensors",
842
+ "model.layers.71.self_attn.k_proj.weight": "model-00027-of-00031.safetensors",
843
+ "model.layers.71.self_attn.o_proj.weight": "model-00027-of-00031.safetensors",
844
+ "model.layers.71.self_attn.q_proj.bias": "model-00027-of-00031.safetensors",
845
+ "model.layers.71.self_attn.q_proj.weight": "model-00027-of-00031.safetensors",
846
+ "model.layers.71.self_attn.v_proj.bias": "model-00027-of-00031.safetensors",
847
+ "model.layers.71.self_attn.v_proj.weight": "model-00027-of-00031.safetensors",
848
+ "model.layers.72.input_layernorm.weight": "model-00028-of-00031.safetensors",
849
+ "model.layers.72.mlp.down_proj.weight": "model-00028-of-00031.safetensors",
850
+ "model.layers.72.mlp.gate_proj.weight": "model-00027-of-00031.safetensors",
851
+ "model.layers.72.mlp.up_proj.weight": "model-00027-of-00031.safetensors",
852
+ "model.layers.72.post_attention_layernorm.weight": "model-00028-of-00031.safetensors",
853
+ "model.layers.72.self_attn.k_proj.bias": "model-00027-of-00031.safetensors",
854
+ "model.layers.72.self_attn.k_proj.weight": "model-00027-of-00031.safetensors",
855
+ "model.layers.72.self_attn.o_proj.weight": "model-00027-of-00031.safetensors",
856
+ "model.layers.72.self_attn.q_proj.bias": "model-00027-of-00031.safetensors",
857
+ "model.layers.72.self_attn.q_proj.weight": "model-00027-of-00031.safetensors",
858
+ "model.layers.72.self_attn.v_proj.bias": "model-00027-of-00031.safetensors",
859
+ "model.layers.72.self_attn.v_proj.weight": "model-00027-of-00031.safetensors",
860
+ "model.layers.73.input_layernorm.weight": "model-00028-of-00031.safetensors",
861
+ "model.layers.73.mlp.down_proj.weight": "model-00028-of-00031.safetensors",
862
+ "model.layers.73.mlp.gate_proj.weight": "model-00028-of-00031.safetensors",
863
+ "model.layers.73.mlp.up_proj.weight": "model-00028-of-00031.safetensors",
864
+ "model.layers.73.post_attention_layernorm.weight": "model-00028-of-00031.safetensors",
865
+ "model.layers.73.self_attn.k_proj.bias": "model-00028-of-00031.safetensors",
866
+ "model.layers.73.self_attn.k_proj.weight": "model-00028-of-00031.safetensors",
867
+ "model.layers.73.self_attn.o_proj.weight": "model-00028-of-00031.safetensors",
868
+ "model.layers.73.self_attn.q_proj.bias": "model-00028-of-00031.safetensors",
869
+ "model.layers.73.self_attn.q_proj.weight": "model-00028-of-00031.safetensors",
870
+ "model.layers.73.self_attn.v_proj.bias": "model-00028-of-00031.safetensors",
871
+ "model.layers.73.self_attn.v_proj.weight": "model-00028-of-00031.safetensors",
872
+ "model.layers.74.input_layernorm.weight": "model-00028-of-00031.safetensors",
873
+ "model.layers.74.mlp.down_proj.weight": "model-00028-of-00031.safetensors",
874
+ "model.layers.74.mlp.gate_proj.weight": "model-00028-of-00031.safetensors",
875
+ "model.layers.74.mlp.up_proj.weight": "model-00028-of-00031.safetensors",
876
+ "model.layers.74.post_attention_layernorm.weight": "model-00028-of-00031.safetensors",
877
+ "model.layers.74.self_attn.k_proj.bias": "model-00028-of-00031.safetensors",
878
+ "model.layers.74.self_attn.k_proj.weight": "model-00028-of-00031.safetensors",
879
+ "model.layers.74.self_attn.o_proj.weight": "model-00028-of-00031.safetensors",
880
+ "model.layers.74.self_attn.q_proj.bias": "model-00028-of-00031.safetensors",
881
+ "model.layers.74.self_attn.q_proj.weight": "model-00028-of-00031.safetensors",
882
+ "model.layers.74.self_attn.v_proj.bias": "model-00028-of-00031.safetensors",
883
+ "model.layers.74.self_attn.v_proj.weight": "model-00028-of-00031.safetensors",
884
+ "model.layers.75.input_layernorm.weight": "model-00029-of-00031.safetensors",
885
+ "model.layers.75.mlp.down_proj.weight": "model-00029-of-00031.safetensors",
886
+ "model.layers.75.mlp.gate_proj.weight": "model-00028-of-00031.safetensors",
887
+ "model.layers.75.mlp.up_proj.weight": "model-00029-of-00031.safetensors",
888
+ "model.layers.75.post_attention_layernorm.weight": "model-00029-of-00031.safetensors",
889
+ "model.layers.75.self_attn.k_proj.bias": "model-00028-of-00031.safetensors",
890
+ "model.layers.75.self_attn.k_proj.weight": "model-00028-of-00031.safetensors",
891
+ "model.layers.75.self_attn.o_proj.weight": "model-00028-of-00031.safetensors",
892
+ "model.layers.75.self_attn.q_proj.bias": "model-00028-of-00031.safetensors",
893
+ "model.layers.75.self_attn.q_proj.weight": "model-00028-of-00031.safetensors",
894
+ "model.layers.75.self_attn.v_proj.bias": "model-00028-of-00031.safetensors",
895
+ "model.layers.75.self_attn.v_proj.weight": "model-00028-of-00031.safetensors",
896
+ "model.layers.76.input_layernorm.weight": "model-00029-of-00031.safetensors",
897
+ "model.layers.76.mlp.down_proj.weight": "model-00029-of-00031.safetensors",
898
+ "model.layers.76.mlp.gate_proj.weight": "model-00029-of-00031.safetensors",
899
+ "model.layers.76.mlp.up_proj.weight": "model-00029-of-00031.safetensors",
900
+ "model.layers.76.post_attention_layernorm.weight": "model-00029-of-00031.safetensors",
901
+ "model.layers.76.self_attn.k_proj.bias": "model-00029-of-00031.safetensors",
902
+ "model.layers.76.self_attn.k_proj.weight": "model-00029-of-00031.safetensors",
903
+ "model.layers.76.self_attn.o_proj.weight": "model-00029-of-00031.safetensors",
904
+ "model.layers.76.self_attn.q_proj.bias": "model-00029-of-00031.safetensors",
905
+ "model.layers.76.self_attn.q_proj.weight": "model-00029-of-00031.safetensors",
906
+ "model.layers.76.self_attn.v_proj.bias": "model-00029-of-00031.safetensors",
907
+ "model.layers.76.self_attn.v_proj.weight": "model-00029-of-00031.safetensors",
908
+ "model.layers.77.input_layernorm.weight": "model-00029-of-00031.safetensors",
909
+ "model.layers.77.mlp.down_proj.weight": "model-00029-of-00031.safetensors",
910
+ "model.layers.77.mlp.gate_proj.weight": "model-00029-of-00031.safetensors",
911
+ "model.layers.77.mlp.up_proj.weight": "model-00029-of-00031.safetensors",
912
+ "model.layers.77.post_attention_layernorm.weight": "model-00029-of-00031.safetensors",
913
+ "model.layers.77.self_attn.k_proj.bias": "model-00029-of-00031.safetensors",
914
+ "model.layers.77.self_attn.k_proj.weight": "model-00029-of-00031.safetensors",
915
+ "model.layers.77.self_attn.o_proj.weight": "model-00029-of-00031.safetensors",
916
+ "model.layers.77.self_attn.q_proj.bias": "model-00029-of-00031.safetensors",
917
+ "model.layers.77.self_attn.q_proj.weight": "model-00029-of-00031.safetensors",
918
+ "model.layers.77.self_attn.v_proj.bias": "model-00029-of-00031.safetensors",
919
+ "model.layers.77.self_attn.v_proj.weight": "model-00029-of-00031.safetensors",
920
+ "model.layers.78.input_layernorm.weight": "model-00030-of-00031.safetensors",
921
+ "model.layers.78.mlp.down_proj.weight": "model-00030-of-00031.safetensors",
922
+ "model.layers.78.mlp.gate_proj.weight": "model-00030-of-00031.safetensors",
923
+ "model.layers.78.mlp.up_proj.weight": "model-00030-of-00031.safetensors",
924
+ "model.layers.78.post_attention_layernorm.weight": "model-00030-of-00031.safetensors",
925
+ "model.layers.78.self_attn.k_proj.bias": "model-00029-of-00031.safetensors",
926
+ "model.layers.78.self_attn.k_proj.weight": "model-00029-of-00031.safetensors",
927
+ "model.layers.78.self_attn.o_proj.weight": "model-00029-of-00031.safetensors",
928
+ "model.layers.78.self_attn.q_proj.bias": "model-00029-of-00031.safetensors",
929
+ "model.layers.78.self_attn.q_proj.weight": "model-00029-of-00031.safetensors",
930
+ "model.layers.78.self_attn.v_proj.bias": "model-00029-of-00031.safetensors",
931
+ "model.layers.78.self_attn.v_proj.weight": "model-00029-of-00031.safetensors",
932
+ "model.layers.79.input_layernorm.weight": "model-00030-of-00031.safetensors",
933
+ "model.layers.79.mlp.down_proj.weight": "model-00030-of-00031.safetensors",
934
+ "model.layers.79.mlp.gate_proj.weight": "model-00030-of-00031.safetensors",
935
+ "model.layers.79.mlp.up_proj.weight": "model-00030-of-00031.safetensors",
936
+ "model.layers.79.post_attention_layernorm.weight": "model-00030-of-00031.safetensors",
937
+ "model.layers.79.self_attn.k_proj.bias": "model-00030-of-00031.safetensors",
938
+ "model.layers.79.self_attn.k_proj.weight": "model-00030-of-00031.safetensors",
939
+ "model.layers.79.self_attn.o_proj.weight": "model-00030-of-00031.safetensors",
940
+ "model.layers.79.self_attn.q_proj.bias": "model-00030-of-00031.safetensors",
941
+ "model.layers.79.self_attn.q_proj.weight": "model-00030-of-00031.safetensors",
942
+ "model.layers.79.self_attn.v_proj.bias": "model-00030-of-00031.safetensors",
943
+ "model.layers.79.self_attn.v_proj.weight": "model-00030-of-00031.safetensors",
944
+ "model.layers.8.input_layernorm.weight": "model-00004-of-00031.safetensors",
945
+ "model.layers.8.mlp.down_proj.weight": "model-00004-of-00031.safetensors",
946
+ "model.layers.8.mlp.gate_proj.weight": "model-00004-of-00031.safetensors",
947
+ "model.layers.8.mlp.up_proj.weight": "model-00004-of-00031.safetensors",
948
+ "model.layers.8.post_attention_layernorm.weight": "model-00004-of-00031.safetensors",
949
+ "model.layers.8.self_attn.k_proj.bias": "model-00004-of-00031.safetensors",
950
+ "model.layers.8.self_attn.k_proj.weight": "model-00004-of-00031.safetensors",
951
+ "model.layers.8.self_attn.o_proj.weight": "model-00004-of-00031.safetensors",
952
+ "model.layers.8.self_attn.q_proj.bias": "model-00004-of-00031.safetensors",
953
+ "model.layers.8.self_attn.q_proj.weight": "model-00004-of-00031.safetensors",
954
+ "model.layers.8.self_attn.v_proj.bias": "model-00004-of-00031.safetensors",
955
+ "model.layers.8.self_attn.v_proj.weight": "model-00004-of-00031.safetensors",
956
+ "model.layers.9.input_layernorm.weight": "model-00005-of-00031.safetensors",
957
+ "model.layers.9.mlp.down_proj.weight": "model-00005-of-00031.safetensors",
958
+ "model.layers.9.mlp.gate_proj.weight": "model-00004-of-00031.safetensors",
959
+ "model.layers.9.mlp.up_proj.weight": "model-00005-of-00031.safetensors",
960
+ "model.layers.9.post_attention_layernorm.weight": "model-00005-of-00031.safetensors",
961
+ "model.layers.9.self_attn.k_proj.bias": "model-00004-of-00031.safetensors",
962
+ "model.layers.9.self_attn.k_proj.weight": "model-00004-of-00031.safetensors",
963
+ "model.layers.9.self_attn.o_proj.weight": "model-00004-of-00031.safetensors",
964
+ "model.layers.9.self_attn.q_proj.bias": "model-00004-of-00031.safetensors",
965
+ "model.layers.9.self_attn.q_proj.weight": "model-00004-of-00031.safetensors",
966
+ "model.layers.9.self_attn.v_proj.bias": "model-00004-of-00031.safetensors",
967
+ "model.layers.9.self_attn.v_proj.weight": "model-00004-of-00031.safetensors",
968
+ "model.norm.weight": "model-00030-of-00031.safetensors"
969
+ }
970
+ }
rng_state_0.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ad8a35afd8967cbb748405387e44426e43ad127028e826eddc9b67d2ca873c85
3
+ size 15984
rng_state_1.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f338ce80d7c441076bfc8c53b84067a0181f5a14e80c13d5acb8150b659f4d73
3
+ size 15984
rng_state_2.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c9fbc9fa428939be10b46779f0eb5cd833e0da426b1cbdee77b3a55b6952235b
3
+ size 15984
rng_state_3.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ac55dba0b79d5fa4699d239da2f966d52040d576d31234ac8d4632e6956481bc
3
+ size 15984
rng_state_4.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:af2d0c015100768ffa23faf3b6c2d54ea89eb045603e30e55cd211e06ff34972
3
+ size 15984
rng_state_5.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c60a1b40608e34bc801c8231f97b81c53b5290dfaed1b9cd0ccbeca29574a991
3
+ size 15984
rng_state_6.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3ad6a142a403eb9aafc4a3a9a856bca648fe31fd22d796867baca31fb13656aa
3
+ size 15984
rng_state_7.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:38bc23a138cc800b22881742c0f3f9a71731a9a7111c6058a0077e6274d21773
3
+ size 15984
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a49cd8b86c1fc5494087e8b515f0355320aa65f04cf14865b6b9434292334eec
3
+ size 1064
special_tokens_map.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": "<|im_end|>"
25
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
3
+ size 11421896
tokenizer_config.json ADDED
@@ -0,0 +1,208 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ }
181
+ },
182
+ "additional_special_tokens": [
183
+ "<|im_start|>",
184
+ "<|im_end|>",
185
+ "<|object_ref_start|>",
186
+ "<|object_ref_end|>",
187
+ "<|box_start|>",
188
+ "<|box_end|>",
189
+ "<|quad_start|>",
190
+ "<|quad_end|>",
191
+ "<|vision_start|>",
192
+ "<|vision_end|>",
193
+ "<|vision_pad|>",
194
+ "<|image_pad|>",
195
+ "<|video_pad|>"
196
+ ],
197
+ "bos_token": null,
198
+ "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'Please reason step by step, and put your final answer within \\\\boxed{}.' }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nPlease reason step by step, and put your final answer within \\\\boxed{}.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- 
'<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- message.content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
199
+ "clean_up_tokenization_spaces": false,
200
+ "eos_token": "<|im_end|>",
201
+ "errors": "replace",
202
+ "extra_special_tokens": {},
203
+ "model_max_length": 131072,
204
+ "pad_token": "<|im_end|>",
205
+ "split_special_tokens": false,
206
+ "tokenizer_class": "Qwen2Tokenizer",
207
+ "unk_token": null
208
+ }
trainer_state.json ADDED
@@ -0,0 +1,1633 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 1.3869625520110958,
5
+ "eval_steps": 500,
6
+ "global_step": 1000,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.006934812760055479,
13
+ "grad_norm": 61.60813903808594,
14
+ "learning_rate": 8.650519031141869e-07,
15
+ "loss": 2.7928,
16
+ "mean_token_accuracy": 0.6783367753028869,
17
+ "step": 5
18
+ },
19
+ {
20
+ "epoch": 0.013869625520110958,
21
+ "grad_norm": 35.546016693115234,
22
+ "learning_rate": 1.7301038062283738e-06,
23
+ "loss": 2.3942,
24
+ "mean_token_accuracy": 0.6943186521530151,
25
+ "step": 10
26
+ },
27
+ {
28
+ "epoch": 0.020804438280166437,
29
+ "grad_norm": 2.246945858001709,
30
+ "learning_rate": 2.5951557093425604e-06,
31
+ "loss": 1.202,
32
+ "mean_token_accuracy": 0.7397322177886962,
33
+ "step": 15
34
+ },
35
+ {
36
+ "epoch": 0.027739251040221916,
37
+ "grad_norm": 1.1429805755615234,
38
+ "learning_rate": 3.4602076124567477e-06,
39
+ "loss": 0.918,
40
+ "mean_token_accuracy": 0.7564186692237854,
41
+ "step": 20
42
+ },
43
+ {
44
+ "epoch": 0.03467406380027739,
45
+ "grad_norm": 0.9538511633872986,
46
+ "learning_rate": 4.325259515570934e-06,
47
+ "loss": 0.8104,
48
+ "mean_token_accuracy": 0.7724308490753173,
49
+ "step": 25
50
+ },
51
+ {
52
+ "epoch": 0.04160887656033287,
53
+ "grad_norm": 0.7069241404533386,
54
+ "learning_rate": 5.190311418685121e-06,
55
+ "loss": 0.7364,
56
+ "mean_token_accuracy": 0.7827559828758239,
57
+ "step": 30
58
+ },
59
+ {
60
+ "epoch": 0.04854368932038835,
61
+ "grad_norm": 0.4030636250972748,
62
+ "learning_rate": 6.055363321799308e-06,
63
+ "loss": 0.6835,
64
+ "mean_token_accuracy": 0.7935511350631714,
65
+ "step": 35
66
+ },
67
+ {
68
+ "epoch": 0.05547850208044383,
69
+ "grad_norm": 0.8142576217651367,
70
+ "learning_rate": 6.920415224913495e-06,
71
+ "loss": 0.6478,
72
+ "mean_token_accuracy": 0.8010085463523865,
73
+ "step": 40
74
+ },
75
+ {
76
+ "epoch": 0.06241331484049931,
77
+ "grad_norm": 0.2626665532588959,
78
+ "learning_rate": 7.785467128027681e-06,
79
+ "loss": 0.6267,
80
+ "mean_token_accuracy": 0.8053073883056641,
81
+ "step": 45
82
+ },
83
+ {
84
+ "epoch": 0.06934812760055478,
85
+ "grad_norm": 0.23942551016807556,
86
+ "learning_rate": 8.650519031141868e-06,
87
+ "loss": 0.6013,
88
+ "mean_token_accuracy": 0.8112802267074585,
89
+ "step": 50
90
+ },
91
+ {
92
+ "epoch": 0.07628294036061026,
93
+ "grad_norm": 0.20308136940002441,
94
+ "learning_rate": 9.515570934256055e-06,
95
+ "loss": 0.5769,
96
+ "mean_token_accuracy": 0.8168688178062439,
97
+ "step": 55
98
+ },
99
+ {
100
+ "epoch": 0.08321775312066575,
101
+ "grad_norm": 0.1854431927204132,
102
+ "learning_rate": 1.0380622837370241e-05,
103
+ "loss": 0.5805,
104
+ "mean_token_accuracy": 0.815436840057373,
105
+ "step": 60
106
+ },
107
+ {
108
+ "epoch": 0.09015256588072122,
109
+ "grad_norm": 0.1700541228055954,
110
+ "learning_rate": 1.124567474048443e-05,
111
+ "loss": 0.5652,
112
+ "mean_token_accuracy": 0.8188095331192017,
113
+ "step": 65
114
+ },
115
+ {
116
+ "epoch": 0.0970873786407767,
117
+ "grad_norm": 0.18573108315467834,
118
+ "learning_rate": 1.2110726643598615e-05,
119
+ "loss": 0.5524,
120
+ "mean_token_accuracy": 0.8222507953643798,
121
+ "step": 70
122
+ },
123
+ {
124
+ "epoch": 0.10402219140083217,
125
+ "grad_norm": 0.18843185901641846,
126
+ "learning_rate": 1.2975778546712803e-05,
127
+ "loss": 0.542,
128
+ "mean_token_accuracy": 0.8249342203140259,
129
+ "step": 75
130
+ },
131
+ {
132
+ "epoch": 0.11095700416088766,
133
+ "grad_norm": 0.21635942161083221,
134
+ "learning_rate": 1.384083044982699e-05,
135
+ "loss": 0.5401,
136
+ "mean_token_accuracy": 0.8251730322837829,
137
+ "step": 80
138
+ },
139
+ {
140
+ "epoch": 0.11789181692094314,
141
+ "grad_norm": 0.21325534582138062,
142
+ "learning_rate": 1.4705882352941177e-05,
143
+ "loss": 0.5404,
144
+ "mean_token_accuracy": 0.8243620276451111,
145
+ "step": 85
146
+ },
147
+ {
148
+ "epoch": 0.12482662968099861,
149
+ "grad_norm": 0.22691610455513,
150
+ "learning_rate": 1.5570934256055363e-05,
151
+ "loss": 0.5278,
152
+ "mean_token_accuracy": 0.8283108592033386,
153
+ "step": 90
154
+ },
155
+ {
156
+ "epoch": 0.1317614424410541,
157
+ "grad_norm": 0.2375083714723587,
158
+ "learning_rate": 1.643598615916955e-05,
159
+ "loss": 0.5198,
160
+ "mean_token_accuracy": 0.8296842217445374,
161
+ "step": 95
162
+ },
163
+ {
164
+ "epoch": 0.13869625520110956,
165
+ "grad_norm": 0.27802157402038574,
166
+ "learning_rate": 1.7301038062283735e-05,
167
+ "loss": 0.5346,
168
+ "mean_token_accuracy": 0.825625765323639,
169
+ "step": 100
170
+ },
171
+ {
172
+ "epoch": 0.14563106796116504,
173
+ "grad_norm": 0.3691716194152832,
174
+ "learning_rate": 1.8166089965397926e-05,
175
+ "loss": 0.5249,
176
+ "mean_token_accuracy": 0.828523588180542,
177
+ "step": 105
178
+ },
179
+ {
180
+ "epoch": 0.15256588072122051,
181
+ "grad_norm": 0.30235254764556885,
182
+ "learning_rate": 1.903114186851211e-05,
183
+ "loss": 0.514,
184
+ "mean_token_accuracy": 0.8320501446723938,
185
+ "step": 110
186
+ },
187
+ {
188
+ "epoch": 0.15950069348127602,
189
+ "grad_norm": 0.3447076380252838,
190
+ "learning_rate": 1.98961937716263e-05,
191
+ "loss": 0.5203,
192
+ "mean_token_accuracy": 0.8298335313796997,
193
+ "step": 115
194
+ },
195
+ {
196
+ "epoch": 0.1664355062413315,
197
+ "grad_norm": 0.28185489773750305,
198
+ "learning_rate": 2.0761245674740483e-05,
199
+ "loss": 0.5305,
200
+ "mean_token_accuracy": 0.8264262914657593,
201
+ "step": 120
202
+ },
203
+ {
204
+ "epoch": 0.17337031900138697,
205
+ "grad_norm": 0.2629449963569641,
206
+ "learning_rate": 2.1626297577854674e-05,
207
+ "loss": 0.5167,
208
+ "mean_token_accuracy": 0.8303680658340454,
209
+ "step": 125
210
+ },
211
+ {
212
+ "epoch": 0.18030513176144244,
213
+ "grad_norm": 0.2788124680519104,
214
+ "learning_rate": 2.249134948096886e-05,
215
+ "loss": 0.5275,
216
+ "mean_token_accuracy": 0.8274267673492431,
217
+ "step": 130
218
+ },
219
+ {
220
+ "epoch": 0.18723994452149792,
221
+ "grad_norm": 0.24550440907478333,
222
+ "learning_rate": 2.3356401384083046e-05,
223
+ "loss": 0.5073,
224
+ "mean_token_accuracy": 0.8328269720077515,
225
+ "step": 135
226
+ },
227
+ {
228
+ "epoch": 0.1941747572815534,
229
+ "grad_norm": 0.7636565566062927,
230
+ "learning_rate": 2.422145328719723e-05,
231
+ "loss": 0.5138,
232
+ "mean_token_accuracy": 0.8308726906776428,
233
+ "step": 140
234
+ },
235
+ {
236
+ "epoch": 0.20110957004160887,
237
+ "grad_norm": 0.6163385510444641,
238
+ "learning_rate": 2.508650519031142e-05,
239
+ "loss": 0.5123,
240
+ "mean_token_accuracy": 0.8311893105506897,
241
+ "step": 145
242
+ },
243
+ {
244
+ "epoch": 0.20804438280166435,
245
+ "grad_norm": 0.3808706998825073,
246
+ "learning_rate": 2.5951557093425606e-05,
247
+ "loss": 0.5018,
248
+ "mean_token_accuracy": 0.8343647360801697,
249
+ "step": 150
250
+ },
251
+ {
252
+ "epoch": 0.21497919556171982,
253
+ "grad_norm": 0.2565021216869354,
254
+ "learning_rate": 2.6816608996539794e-05,
255
+ "loss": 0.4943,
256
+ "mean_token_accuracy": 0.8362817883491516,
257
+ "step": 155
258
+ },
259
+ {
260
+ "epoch": 0.22191400832177532,
261
+ "grad_norm": 0.3511087894439697,
262
+ "learning_rate": 2.768166089965398e-05,
263
+ "loss": 0.4933,
264
+ "mean_token_accuracy": 0.8366880297660828,
265
+ "step": 160
266
+ },
267
+ {
268
+ "epoch": 0.2288488210818308,
269
+ "grad_norm": 0.4006827175617218,
270
+ "learning_rate": 2.8546712802768166e-05,
271
+ "loss": 0.4906,
272
+ "mean_token_accuracy": 0.837351131439209,
273
+ "step": 165
274
+ },
275
+ {
276
+ "epoch": 0.23578363384188628,
277
+ "grad_norm": 0.4149070978164673,
278
+ "learning_rate": 2.9411764705882354e-05,
279
+ "loss": 0.4988,
280
+ "mean_token_accuracy": 0.8351827621459961,
281
+ "step": 170
282
+ },
283
+ {
284
+ "epoch": 0.24271844660194175,
285
+ "grad_norm": 0.32881560921669006,
286
+ "learning_rate": 3.0276816608996538e-05,
287
+ "loss": 0.5085,
288
+ "mean_token_accuracy": 0.831884253025055,
289
+ "step": 175
290
+ },
291
+ {
292
+ "epoch": 0.24965325936199723,
293
+ "grad_norm": 0.46366971731185913,
294
+ "learning_rate": 3.1141868512110726e-05,
295
+ "loss": 0.4964,
296
+ "mean_token_accuracy": 0.8355090618133545,
297
+ "step": 180
298
+ },
299
+ {
300
+ "epoch": 0.2565880721220527,
301
+ "grad_norm": 0.838777482509613,
302
+ "learning_rate": 3.200692041522492e-05,
303
+ "loss": 0.5078,
304
+ "mean_token_accuracy": 0.8320568442344666,
305
+ "step": 185
306
+ },
307
+ {
308
+ "epoch": 0.2635228848821082,
309
+ "grad_norm": 30.57767677307129,
310
+ "learning_rate": 3.28719723183391e-05,
311
+ "loss": 0.4978,
312
+ "mean_token_accuracy": 0.8360116839408874,
313
+ "step": 190
314
+ },
315
+ {
316
+ "epoch": 0.27045769764216365,
317
+ "grad_norm": 0.5885879993438721,
318
+ "learning_rate": 3.373702422145329e-05,
319
+ "loss": 0.497,
320
+ "mean_token_accuracy": 0.8356186389923096,
321
+ "step": 195
322
+ },
323
+ {
324
+ "epoch": 0.27739251040221913,
325
+ "grad_norm": 0.3610420525074005,
326
+ "learning_rate": 3.460207612456747e-05,
327
+ "loss": 0.4989,
328
+ "mean_token_accuracy": 0.8350513100624084,
329
+ "step": 200
330
+ },
331
+ {
332
+ "epoch": 0.2843273231622746,
333
+ "grad_norm": 0.3954995572566986,
334
+ "learning_rate": 3.546712802768166e-05,
335
+ "loss": 0.5011,
336
+ "mean_token_accuracy": 0.83415766954422,
337
+ "step": 205
338
+ },
339
+ {
340
+ "epoch": 0.2912621359223301,
341
+ "grad_norm": 0.3071337342262268,
342
+ "learning_rate": 3.633217993079585e-05,
343
+ "loss": 0.5255,
344
+ "mean_token_accuracy": 0.8285403490066529,
345
+ "step": 210
346
+ },
347
+ {
348
+ "epoch": 0.29819694868238555,
349
+ "grad_norm": 0.31758391857147217,
350
+ "learning_rate": 3.719723183391004e-05,
351
+ "loss": 0.4954,
352
+ "mean_token_accuracy": 0.835390031337738,
353
+ "step": 215
354
+ },
355
+ {
356
+ "epoch": 0.30513176144244103,
357
+ "grad_norm": 0.3296087086200714,
358
+ "learning_rate": 3.806228373702422e-05,
359
+ "loss": 0.4923,
360
+ "mean_token_accuracy": 0.8361375451087951,
361
+ "step": 220
362
+ },
363
+ {
364
+ "epoch": 0.3120665742024965,
365
+ "grad_norm": 0.2980894446372986,
366
+ "learning_rate": 3.892733564013841e-05,
367
+ "loss": 0.4996,
368
+ "mean_token_accuracy": 0.8341476917266846,
369
+ "step": 225
370
+ },
371
+ {
372
+ "epoch": 0.31900138696255204,
373
+ "grad_norm": 0.2892495095729828,
374
+ "learning_rate": 3.97923875432526e-05,
375
+ "loss": 0.4855,
376
+ "mean_token_accuracy": 0.8382086515426636,
377
+ "step": 230
378
+ },
379
+ {
380
+ "epoch": 0.3259361997226075,
381
+ "grad_norm": 0.29287102818489075,
382
+ "learning_rate": 4.065743944636679e-05,
383
+ "loss": 0.4944,
384
+ "mean_token_accuracy": 0.8353524923324585,
385
+ "step": 235
386
+ },
387
+ {
388
+ "epoch": 0.332871012482663,
389
+ "grad_norm": 0.28245487809181213,
390
+ "learning_rate": 4.1522491349480966e-05,
391
+ "loss": 0.4887,
392
+ "mean_token_accuracy": 0.8373544692993165,
393
+ "step": 240
394
+ },
395
+ {
396
+ "epoch": 0.33980582524271846,
397
+ "grad_norm": 0.23551802337169647,
398
+ "learning_rate": 4.238754325259516e-05,
399
+ "loss": 0.4925,
400
+ "mean_token_accuracy": 0.8361364006996155,
401
+ "step": 245
402
+ },
403
+ {
404
+ "epoch": 0.34674063800277394,
405
+ "grad_norm": 0.24266427755355835,
406
+ "learning_rate": 4.325259515570935e-05,
407
+ "loss": 0.4759,
408
+ "mean_token_accuracy": 0.8410738468170166,
409
+ "step": 250
410
+ },
411
+ {
412
+ "epoch": 0.3536754507628294,
413
+ "grad_norm": 0.33316895365715027,
414
+ "learning_rate": 4.411764705882353e-05,
415
+ "loss": 0.4902,
416
+ "mean_token_accuracy": 0.8370885252952576,
417
+ "step": 255
418
+ },
419
+ {
420
+ "epoch": 0.3606102635228849,
421
+ "grad_norm": 0.5113539099693298,
422
+ "learning_rate": 4.498269896193772e-05,
423
+ "loss": 0.4918,
424
+ "mean_token_accuracy": 0.8364068984985351,
425
+ "step": 260
426
+ },
427
+ {
428
+ "epoch": 0.36754507628294036,
429
+ "grad_norm": 0.3733905851840973,
430
+ "learning_rate": 4.58477508650519e-05,
431
+ "loss": 0.49,
432
+ "mean_token_accuracy": 0.8370036244392395,
433
+ "step": 265
434
+ },
435
+ {
436
+ "epoch": 0.37447988904299584,
437
+ "grad_norm": 0.4112997353076935,
438
+ "learning_rate": 4.671280276816609e-05,
439
+ "loss": 0.4932,
440
+ "mean_token_accuracy": 0.8356328129768371,
441
+ "step": 270
442
+ },
443
+ {
444
+ "epoch": 0.3814147018030513,
445
+ "grad_norm": 0.5121487379074097,
446
+ "learning_rate": 4.7577854671280283e-05,
447
+ "loss": 0.479,
448
+ "mean_token_accuracy": 0.839626955986023,
449
+ "step": 275
450
+ },
451
+ {
452
+ "epoch": 0.3883495145631068,
453
+ "grad_norm": 0.36294957995414734,
454
+ "learning_rate": 4.844290657439446e-05,
455
+ "loss": 0.4829,
456
+ "mean_token_accuracy": 0.8391167283058166,
457
+ "step": 280
458
+ },
459
+ {
460
+ "epoch": 0.39528432732316227,
461
+ "grad_norm": 0.3162820339202881,
462
+ "learning_rate": 4.930795847750865e-05,
463
+ "loss": 0.4899,
464
+ "mean_token_accuracy": 0.8368083834648132,
465
+ "step": 285
466
+ },
467
+ {
468
+ "epoch": 0.40221914008321774,
469
+ "grad_norm": 0.3973437547683716,
470
+ "learning_rate": 4.9980732177263974e-05,
471
+ "loss": 0.4864,
472
+ "mean_token_accuracy": 0.8374906539916992,
473
+ "step": 290
474
+ },
475
+ {
476
+ "epoch": 0.4091539528432732,
477
+ "grad_norm": 0.5423433184623718,
478
+ "learning_rate": 4.9884393063583816e-05,
479
+ "loss": 0.4907,
480
+ "mean_token_accuracy": 0.8373413920402527,
481
+ "step": 295
482
+ },
483
+ {
484
+ "epoch": 0.4160887656033287,
485
+ "grad_norm": 0.39722123742103577,
486
+ "learning_rate": 4.9788053949903666e-05,
487
+ "loss": 0.4961,
488
+ "mean_token_accuracy": 0.8351489901542664,
489
+ "step": 300
490
+ },
491
+ {
492
+ "epoch": 0.42302357836338417,
493
+ "grad_norm": 0.34169071912765503,
494
+ "learning_rate": 4.969171483622351e-05,
495
+ "loss": 0.4891,
496
+ "mean_token_accuracy": 0.8370493412017822,
497
+ "step": 305
498
+ },
499
+ {
500
+ "epoch": 0.42995839112343964,
501
+ "grad_norm": 0.3429335951805115,
502
+ "learning_rate": 4.959537572254335e-05,
503
+ "loss": 0.4794,
504
+ "mean_token_accuracy": 0.8396916627883911,
505
+ "step": 310
506
+ },
507
+ {
508
+ "epoch": 0.4368932038834951,
509
+ "grad_norm": 0.3266272246837616,
510
+ "learning_rate": 4.94990366088632e-05,
511
+ "loss": 0.4757,
512
+ "mean_token_accuracy": 0.8405494570732117,
513
+ "step": 315
514
+ },
515
+ {
516
+ "epoch": 0.44382801664355065,
517
+ "grad_norm": 0.2874930202960968,
518
+ "learning_rate": 4.940269749518305e-05,
519
+ "loss": 0.4978,
520
+ "mean_token_accuracy": 0.8344841122627258,
521
+ "step": 320
522
+ },
523
+ {
524
+ "epoch": 0.4507628294036061,
525
+ "grad_norm": 0.2812349498271942,
526
+ "learning_rate": 4.930635838150289e-05,
527
+ "loss": 0.4839,
528
+ "mean_token_accuracy": 0.8383953332901001,
529
+ "step": 325
530
+ },
531
+ {
532
+ "epoch": 0.4576976421636616,
533
+ "grad_norm": 0.25296345353126526,
534
+ "learning_rate": 4.921001926782274e-05,
535
+ "loss": 0.4738,
536
+ "mean_token_accuracy": 0.8412886261940002,
537
+ "step": 330
538
+ },
539
+ {
540
+ "epoch": 0.4646324549237171,
541
+ "grad_norm": 0.22165291011333466,
542
+ "learning_rate": 4.9113680154142584e-05,
543
+ "loss": 0.4867,
544
+ "mean_token_accuracy": 0.8379201173782349,
545
+ "step": 335
546
+ },
547
+ {
548
+ "epoch": 0.47156726768377255,
549
+ "grad_norm": 0.2551758289337158,
550
+ "learning_rate": 4.9017341040462426e-05,
551
+ "loss": 0.4786,
552
+ "mean_token_accuracy": 0.8399594306945801,
553
+ "step": 340
554
+ },
555
+ {
556
+ "epoch": 0.478502080443828,
557
+ "grad_norm": 0.25708919763565063,
558
+ "learning_rate": 4.8921001926782276e-05,
559
+ "loss": 0.48,
560
+ "mean_token_accuracy": 0.8395551085472107,
561
+ "step": 345
562
+ },
563
+ {
564
+ "epoch": 0.4854368932038835,
565
+ "grad_norm": 0.1992408186197281,
566
+ "learning_rate": 4.8824662813102125e-05,
567
+ "loss": 0.4714,
568
+ "mean_token_accuracy": 0.8418668508529663,
569
+ "step": 350
570
+ },
571
+ {
572
+ "epoch": 0.492371705963939,
573
+ "grad_norm": 0.23445720970630646,
574
+ "learning_rate": 4.872832369942197e-05,
575
+ "loss": 0.471,
576
+ "mean_token_accuracy": 0.8421580553054809,
577
+ "step": 355
578
+ },
579
+ {
580
+ "epoch": 0.49930651872399445,
581
+ "grad_norm": 0.31462928652763367,
582
+ "learning_rate": 4.863198458574181e-05,
583
+ "loss": 0.4711,
584
+ "mean_token_accuracy": 0.842027747631073,
585
+ "step": 360
586
+ },
587
+ {
588
+ "epoch": 0.5062413314840499,
589
+ "grad_norm": 0.24767646193504333,
590
+ "learning_rate": 4.853564547206166e-05,
591
+ "loss": 0.4717,
592
+ "mean_token_accuracy": 0.8417503118515015,
593
+ "step": 365
594
+ },
595
+ {
596
+ "epoch": 0.5131761442441054,
597
+ "grad_norm": 0.2389938235282898,
598
+ "learning_rate": 4.84393063583815e-05,
599
+ "loss": 0.4677,
600
+ "mean_token_accuracy": 0.8431912064552307,
601
+ "step": 370
602
+ },
603
+ {
604
+ "epoch": 0.5201109570041609,
605
+ "grad_norm": 0.29998722672462463,
606
+ "learning_rate": 4.834296724470135e-05,
607
+ "loss": 0.4877,
608
+ "mean_token_accuracy": 0.8374402284622192,
609
+ "step": 375
610
+ },
611
+ {
612
+ "epoch": 0.5270457697642164,
613
+ "grad_norm": 0.2877121865749359,
614
+ "learning_rate": 4.82466281310212e-05,
615
+ "loss": 0.4863,
616
+ "mean_token_accuracy": 0.8380719065666199,
617
+ "step": 380
618
+ },
619
+ {
620
+ "epoch": 0.5339805825242718,
621
+ "grad_norm": 0.24628062546253204,
622
+ "learning_rate": 4.815028901734104e-05,
623
+ "loss": 0.4665,
624
+ "mean_token_accuracy": 0.8434135437011718,
625
+ "step": 385
626
+ },
627
+ {
628
+ "epoch": 0.5409153952843273,
629
+ "grad_norm": 0.24347947537899017,
630
+ "learning_rate": 4.8053949903660886e-05,
631
+ "loss": 0.476,
632
+ "mean_token_accuracy": 0.8404138565063477,
633
+ "step": 390
634
+ },
635
+ {
636
+ "epoch": 0.5478502080443828,
637
+ "grad_norm": 0.20724909007549286,
638
+ "learning_rate": 4.7957610789980735e-05,
639
+ "loss": 0.4881,
640
+ "mean_token_accuracy": 0.8372583389282227,
641
+ "step": 395
642
+ },
643
+ {
644
+ "epoch": 0.5547850208044383,
645
+ "grad_norm": 0.2162594497203827,
646
+ "learning_rate": 4.786127167630058e-05,
647
+ "loss": 0.4726,
648
+ "mean_token_accuracy": 0.842011570930481,
649
+ "step": 400
650
+ },
651
+ {
652
+ "epoch": 0.5617198335644937,
653
+ "grad_norm": 0.34494099020957947,
654
+ "learning_rate": 4.776493256262042e-05,
655
+ "loss": 0.4783,
656
+ "mean_token_accuracy": 0.8399308085441589,
657
+ "step": 405
658
+ },
659
+ {
660
+ "epoch": 0.5686546463245492,
661
+ "grad_norm": 0.24402566254138947,
662
+ "learning_rate": 4.7668593448940276e-05,
663
+ "loss": 0.4953,
664
+ "mean_token_accuracy": 0.8352864623069763,
665
+ "step": 410
666
+ },
667
+ {
668
+ "epoch": 0.5755894590846047,
669
+ "grad_norm": 0.2124612033367157,
670
+ "learning_rate": 4.757225433526012e-05,
671
+ "loss": 0.4849,
672
+ "mean_token_accuracy": 0.8380748987197876,
673
+ "step": 415
674
+ },
675
+ {
676
+ "epoch": 0.5825242718446602,
677
+ "grad_norm": 0.20577934384346008,
678
+ "learning_rate": 4.747591522157996e-05,
679
+ "loss": 0.4591,
680
+ "mean_token_accuracy": 0.845665693283081,
681
+ "step": 420
682
+ },
683
+ {
684
+ "epoch": 0.5894590846047156,
685
+ "grad_norm": 0.2838655710220337,
686
+ "learning_rate": 4.737957610789981e-05,
687
+ "loss": 0.4709,
688
+ "mean_token_accuracy": 0.8418583750724793,
689
+ "step": 425
690
+ },
691
+ {
692
+ "epoch": 0.5963938973647711,
693
+ "grad_norm": 0.2222902923822403,
694
+ "learning_rate": 4.7283236994219653e-05,
695
+ "loss": 0.4817,
696
+ "mean_token_accuracy": 0.8388337612152099,
697
+ "step": 430
698
+ },
699
+ {
700
+ "epoch": 0.6033287101248266,
701
+ "grad_norm": 0.25565460324287415,
702
+ "learning_rate": 4.7186897880539496e-05,
703
+ "loss": 0.4724,
704
+ "mean_token_accuracy": 0.8415215969085693,
705
+ "step": 435
706
+ },
707
+ {
708
+ "epoch": 0.6102635228848821,
709
+ "grad_norm": 0.680081844329834,
710
+ "learning_rate": 4.709055876685935e-05,
711
+ "loss": 0.4777,
712
+ "mean_token_accuracy": 0.8402902245521545,
713
+ "step": 440
714
+ },
715
+ {
716
+ "epoch": 0.6171983356449375,
717
+ "grad_norm": 0.3035682141780853,
718
+ "learning_rate": 4.6994219653179195e-05,
719
+ "loss": 0.4749,
720
+ "mean_token_accuracy": 0.8405117988586426,
721
+ "step": 445
722
+ },
723
+ {
724
+ "epoch": 0.624133148404993,
725
+ "grad_norm": 0.22393807768821716,
726
+ "learning_rate": 4.689788053949904e-05,
727
+ "loss": 0.4735,
728
+ "mean_token_accuracy": 0.8410566568374633,
729
+ "step": 450
730
+ },
731
+ {
732
+ "epoch": 0.6310679611650486,
733
+ "grad_norm": 0.23452860116958618,
734
+ "learning_rate": 4.6801541425818887e-05,
735
+ "loss": 0.4798,
736
+ "mean_token_accuracy": 0.8394344925880433,
737
+ "step": 455
738
+ },
739
+ {
740
+ "epoch": 0.6380027739251041,
741
+ "grad_norm": 0.21135355532169342,
742
+ "learning_rate": 4.670520231213873e-05,
743
+ "loss": 0.4783,
744
+ "mean_token_accuracy": 0.8398800015449523,
745
+ "step": 460
746
+ },
747
+ {
748
+ "epoch": 0.6449375866851595,
749
+ "grad_norm": 0.2495516985654831,
750
+ "learning_rate": 4.660886319845857e-05,
751
+ "loss": 0.4769,
752
+ "mean_token_accuracy": 0.8407980084419251,
753
+ "step": 465
754
+ },
755
+ {
756
+ "epoch": 0.651872399445215,
757
+ "grad_norm": 0.25724372267723083,
758
+ "learning_rate": 4.651252408477843e-05,
759
+ "loss": 0.4764,
760
+ "mean_token_accuracy": 0.8402070879936219,
761
+ "step": 470
762
+ },
763
+ {
764
+ "epoch": 0.6588072122052705,
765
+ "grad_norm": 0.28974995017051697,
766
+ "learning_rate": 4.641618497109827e-05,
767
+ "loss": 0.468,
768
+ "mean_token_accuracy": 0.8425545215606689,
769
+ "step": 475
770
+ },
771
+ {
772
+ "epoch": 0.665742024965326,
773
+ "grad_norm": 0.26298555731773376,
774
+ "learning_rate": 4.631984585741811e-05,
775
+ "loss": 0.4752,
776
+ "mean_token_accuracy": 0.8405273199081421,
777
+ "step": 480
778
+ },
779
+ {
780
+ "epoch": 0.6726768377253814,
781
+ "grad_norm": 0.3188522756099701,
782
+ "learning_rate": 4.622350674373796e-05,
783
+ "loss": 0.4683,
784
+ "mean_token_accuracy": 0.8426392555236817,
785
+ "step": 485
786
+ },
787
+ {
788
+ "epoch": 0.6796116504854369,
789
+ "grad_norm": 0.2528276741504669,
790
+ "learning_rate": 4.6127167630057805e-05,
791
+ "loss": 0.4753,
792
+ "mean_token_accuracy": 0.840662169456482,
793
+ "step": 490
794
+ },
795
+ {
796
+ "epoch": 0.6865464632454924,
797
+ "grad_norm": 0.3695737421512604,
798
+ "learning_rate": 4.603082851637765e-05,
799
+ "loss": 0.501,
800
+ "mean_token_accuracy": 0.8371694445610046,
801
+ "step": 495
802
+ },
803
+ {
804
+ "epoch": 0.6934812760055479,
805
+ "grad_norm": 0.31206727027893066,
806
+ "learning_rate": 4.59344894026975e-05,
807
+ "loss": 0.478,
808
+ "mean_token_accuracy": 0.8401562452316285,
809
+ "step": 500
810
+ },
811
+ {
812
+ "epoch": 0.7004160887656034,
813
+ "grad_norm": 3.478522539138794,
814
+ "learning_rate": 4.5838150289017346e-05,
815
+ "loss": 0.49,
816
+ "mean_token_accuracy": 0.8365014433860779,
817
+ "step": 505
818
+ },
819
+ {
820
+ "epoch": 0.7073509015256588,
821
+ "grad_norm": 0.4430016875267029,
822
+ "learning_rate": 4.574181117533719e-05,
823
+ "loss": 0.47,
824
+ "mean_token_accuracy": 0.8422938823699951,
825
+ "step": 510
826
+ },
827
+ {
828
+ "epoch": 0.7142857142857143,
829
+ "grad_norm": 0.28713470697402954,
830
+ "learning_rate": 4.564547206165704e-05,
831
+ "loss": 0.4786,
832
+ "mean_token_accuracy": 0.8401166200637817,
833
+ "step": 515
834
+ },
835
+ {
836
+ "epoch": 0.7212205270457698,
837
+ "grad_norm": 0.2158370316028595,
838
+ "learning_rate": 4.554913294797688e-05,
839
+ "loss": 0.4703,
840
+ "mean_token_accuracy": 0.8421276092529297,
841
+ "step": 520
842
+ },
843
+ {
844
+ "epoch": 0.7281553398058253,
845
+ "grad_norm": 0.2426484376192093,
846
+ "learning_rate": 4.545279383429672e-05,
847
+ "loss": 0.469,
848
+ "mean_token_accuracy": 0.8426563143730164,
849
+ "step": 525
850
+ },
851
+ {
852
+ "epoch": 0.7350901525658807,
853
+ "grad_norm": 0.27153995633125305,
854
+ "learning_rate": 4.535645472061657e-05,
855
+ "loss": 0.4754,
856
+ "mean_token_accuracy": 0.8406094431877136,
857
+ "step": 530
858
+ },
859
+ {
860
+ "epoch": 0.7420249653259362,
861
+ "grad_norm": 0.1991535872220993,
862
+ "learning_rate": 4.526011560693642e-05,
863
+ "loss": 0.4782,
864
+ "mean_token_accuracy": 0.8397158980369568,
865
+ "step": 535
866
+ },
867
+ {
868
+ "epoch": 0.7489597780859917,
869
+ "grad_norm": 0.15923242270946503,
870
+ "learning_rate": 4.5163776493256264e-05,
871
+ "loss": 0.4563,
872
+ "mean_token_accuracy": 0.8461790800094604,
873
+ "step": 540
874
+ },
875
+ {
876
+ "epoch": 0.7558945908460472,
877
+ "grad_norm": 0.18306083977222443,
878
+ "learning_rate": 4.5067437379576114e-05,
879
+ "loss": 0.4791,
880
+ "mean_token_accuracy": 0.8393635034561158,
881
+ "step": 545
882
+ },
883
+ {
884
+ "epoch": 0.7628294036061026,
885
+ "grad_norm": 0.24309256672859192,
886
+ "learning_rate": 4.4971098265895956e-05,
887
+ "loss": 0.4777,
888
+ "mean_token_accuracy": 0.8401144862174987,
889
+ "step": 550
890
+ },
891
+ {
892
+ "epoch": 0.7697642163661581,
893
+ "grad_norm": 0.20910784602165222,
894
+ "learning_rate": 4.48747591522158e-05,
895
+ "loss": 0.4728,
896
+ "mean_token_accuracy": 0.8417426466941833,
897
+ "step": 555
898
+ },
899
+ {
900
+ "epoch": 0.7766990291262136,
901
+ "grad_norm": 0.1896984726190567,
902
+ "learning_rate": 4.477842003853565e-05,
903
+ "loss": 0.4557,
904
+ "mean_token_accuracy": 0.8461586833000183,
905
+ "step": 560
906
+ },
907
+ {
908
+ "epoch": 0.7836338418862691,
909
+ "grad_norm": 0.18798613548278809,
910
+ "learning_rate": 4.46820809248555e-05,
911
+ "loss": 0.457,
912
+ "mean_token_accuracy": 0.8459754705429077,
913
+ "step": 565
914
+ },
915
+ {
916
+ "epoch": 0.7905686546463245,
917
+ "grad_norm": 0.18959036469459534,
918
+ "learning_rate": 4.458574181117534e-05,
919
+ "loss": 0.4633,
920
+ "mean_token_accuracy": 0.8437102913856507,
921
+ "step": 570
922
+ },
923
+ {
924
+ "epoch": 0.79750346740638,
925
+ "grad_norm": 0.16292130947113037,
926
+ "learning_rate": 4.448940269749519e-05,
927
+ "loss": 0.4749,
928
+ "mean_token_accuracy": 0.8404599308967591,
929
+ "step": 575
930
+ },
931
+ {
932
+ "epoch": 0.8044382801664355,
933
+ "grad_norm": 0.17686040699481964,
934
+ "learning_rate": 4.439306358381503e-05,
935
+ "loss": 0.4601,
936
+ "mean_token_accuracy": 0.844899308681488,
937
+ "step": 580
938
+ },
939
+ {
940
+ "epoch": 0.811373092926491,
941
+ "grad_norm": 0.1865614652633667,
942
+ "learning_rate": 4.4296724470134875e-05,
943
+ "loss": 0.4533,
944
+ "mean_token_accuracy": 0.846677553653717,
945
+ "step": 585
946
+ },
947
+ {
948
+ "epoch": 0.8183079056865464,
949
+ "grad_norm": 0.2037810981273651,
950
+ "learning_rate": 4.4200385356454724e-05,
951
+ "loss": 0.4575,
952
+ "mean_token_accuracy": 0.8457266449928283,
953
+ "step": 590
954
+ },
955
+ {
956
+ "epoch": 0.8252427184466019,
957
+ "grad_norm": 0.16701985895633698,
958
+ "learning_rate": 4.4104046242774566e-05,
959
+ "loss": 0.466,
960
+ "mean_token_accuracy": 0.8428797006607056,
961
+ "step": 595
962
+ },
963
+ {
964
+ "epoch": 0.8321775312066574,
965
+ "grad_norm": 0.19714096188545227,
966
+ "learning_rate": 4.4007707129094416e-05,
967
+ "loss": 0.4696,
968
+ "mean_token_accuracy": 0.8422728657722474,
969
+ "step": 600
970
+ },
971
+ {
972
+ "epoch": 0.8391123439667129,
973
+ "grad_norm": 0.20772860944271088,
974
+ "learning_rate": 4.391136801541426e-05,
975
+ "loss": 0.4635,
976
+ "mean_token_accuracy": 0.8438523054122925,
977
+ "step": 605
978
+ },
979
+ {
980
+ "epoch": 0.8460471567267683,
981
+ "grad_norm": 0.35546374320983887,
982
+ "learning_rate": 4.381502890173411e-05,
983
+ "loss": 0.4665,
984
+ "mean_token_accuracy": 0.8430918097496033,
985
+ "step": 610
986
+ },
987
+ {
988
+ "epoch": 0.8529819694868238,
989
+ "grad_norm": 0.19986563920974731,
990
+ "learning_rate": 4.371868978805395e-05,
991
+ "loss": 0.4742,
992
+ "mean_token_accuracy": 0.8409379243850708,
993
+ "step": 615
994
+ },
995
+ {
996
+ "epoch": 0.8599167822468793,
997
+ "grad_norm": 0.4013294279575348,
998
+ "learning_rate": 4.36223506743738e-05,
999
+ "loss": 0.4673,
1000
+ "mean_token_accuracy": 0.8426662087440491,
1001
+ "step": 620
1002
+ },
1003
+ {
1004
+ "epoch": 0.8668515950069348,
1005
+ "grad_norm": 0.29566317796707153,
1006
+ "learning_rate": 4.352601156069364e-05,
1007
+ "loss": 0.4837,
1008
+ "mean_token_accuracy": 0.8380556702613831,
1009
+ "step": 625
1010
+ },
1011
+ {
1012
+ "epoch": 0.8737864077669902,
1013
+ "grad_norm": 0.24461045861244202,
1014
+ "learning_rate": 4.342967244701349e-05,
1015
+ "loss": 0.4648,
1016
+ "mean_token_accuracy": 0.8434231281280518,
1017
+ "step": 630
1018
+ },
1019
+ {
1020
+ "epoch": 0.8807212205270458,
1021
+ "grad_norm": 0.2197730541229248,
1022
+ "learning_rate": 4.3333333333333334e-05,
1023
+ "loss": 0.4585,
1024
+ "mean_token_accuracy": 0.8448979973793029,
1025
+ "step": 635
1026
+ },
1027
+ {
1028
+ "epoch": 0.8876560332871013,
1029
+ "grad_norm": 0.22158759832382202,
1030
+ "learning_rate": 4.3236994219653183e-05,
1031
+ "loss": 0.4678,
1032
+ "mean_token_accuracy": 0.8427410125732422,
1033
+ "step": 640
1034
+ },
1035
+ {
1036
+ "epoch": 0.8945908460471568,
1037
+ "grad_norm": 0.17014814913272858,
1038
+ "learning_rate": 4.3140655105973026e-05,
1039
+ "loss": 0.4706,
1040
+ "mean_token_accuracy": 0.8416074395179749,
1041
+ "step": 645
1042
+ },
1043
+ {
1044
+ "epoch": 0.9015256588072122,
1045
+ "grad_norm": 0.22929687798023224,
1046
+ "learning_rate": 4.304431599229287e-05,
1047
+ "loss": 0.4753,
1048
+ "mean_token_accuracy": 0.8403880834579468,
1049
+ "step": 650
1050
+ },
1051
+ {
1052
+ "epoch": 0.9084604715672677,
1053
+ "grad_norm": 0.20894835889339447,
1054
+ "learning_rate": 4.294797687861272e-05,
1055
+ "loss": 0.4734,
1056
+ "mean_token_accuracy": 0.8410162568092346,
1057
+ "step": 655
1058
+ },
1059
+ {
1060
+ "epoch": 0.9153952843273232,
1061
+ "grad_norm": 0.18031327426433563,
1062
+ "learning_rate": 4.285163776493257e-05,
1063
+ "loss": 0.4536,
1064
+ "mean_token_accuracy": 0.8469532251358032,
1065
+ "step": 660
1066
+ },
1067
+ {
1068
+ "epoch": 0.9223300970873787,
1069
+ "grad_norm": 0.17288991808891296,
1070
+ "learning_rate": 4.275529865125241e-05,
1071
+ "loss": 0.4611,
1072
+ "mean_token_accuracy": 0.8443895936012268,
1073
+ "step": 665
1074
+ },
1075
+ {
1076
+ "epoch": 0.9292649098474342,
1077
+ "grad_norm": 0.1980760544538498,
1078
+ "learning_rate": 4.265895953757226e-05,
1079
+ "loss": 0.484,
1080
+ "mean_token_accuracy": 0.8379009962081909,
1081
+ "step": 670
1082
+ },
1083
+ {
1084
+ "epoch": 0.9361997226074896,
1085
+ "grad_norm": 0.20848602056503296,
1086
+ "learning_rate": 4.25626204238921e-05,
1087
+ "loss": 0.4771,
1088
+ "mean_token_accuracy": 0.8398370265960693,
1089
+ "step": 675
1090
+ },
1091
+ {
1092
+ "epoch": 0.9431345353675451,
1093
+ "grad_norm": 0.1636408418416977,
1094
+ "learning_rate": 4.2466281310211944e-05,
1095
+ "loss": 0.4578,
1096
+ "mean_token_accuracy": 0.845670223236084,
1097
+ "step": 680
1098
+ },
1099
+ {
1100
+ "epoch": 0.9500693481276006,
1101
+ "grad_norm": 0.22376923263072968,
1102
+ "learning_rate": 4.2369942196531794e-05,
1103
+ "loss": 0.4652,
1104
+ "mean_token_accuracy": 0.8431706428527832,
1105
+ "step": 685
1106
+ },
1107
+ {
1108
+ "epoch": 0.957004160887656,
1109
+ "grad_norm": 0.21399416029453278,
1110
+ "learning_rate": 4.2273603082851636e-05,
1111
+ "loss": 0.4537,
1112
+ "mean_token_accuracy": 0.8464810252189636,
1113
+ "step": 690
1114
+ },
1115
+ {
1116
+ "epoch": 0.9639389736477115,
1117
+ "grad_norm": 2.5790159702301025,
1118
+ "learning_rate": 4.2177263969171485e-05,
1119
+ "loss": 0.4754,
1120
+ "mean_token_accuracy": 0.8421392440795898,
1121
+ "step": 695
1122
+ },
1123
+ {
1124
+ "epoch": 0.970873786407767,
1125
+ "grad_norm": 0.2648729085922241,
1126
+ "learning_rate": 4.2080924855491335e-05,
1127
+ "loss": 0.469,
1128
+ "mean_token_accuracy": 0.8423485517501831,
1129
+ "step": 700
1130
+ },
1131
+ {
1132
+ "epoch": 0.9778085991678225,
1133
+ "grad_norm": 0.20691435039043427,
1134
+ "learning_rate": 4.198458574181118e-05,
1135
+ "loss": 0.4534,
1136
+ "mean_token_accuracy": 0.8466127276420593,
1137
+ "step": 705
1138
+ },
1139
+ {
1140
+ "epoch": 0.984743411927878,
1141
+ "grad_norm": 0.2122969925403595,
1142
+ "learning_rate": 4.188824662813102e-05,
1143
+ "loss": 0.4744,
1144
+ "mean_token_accuracy": 0.843373692035675,
1145
+ "step": 710
1146
+ },
1147
+ {
1148
+ "epoch": 0.9916782246879334,
1149
+ "grad_norm": 0.18356889486312866,
1150
+ "learning_rate": 4.179190751445087e-05,
1151
+ "loss": 0.4735,
1152
+ "mean_token_accuracy": 0.840711236000061,
1153
+ "step": 715
1154
+ },
1155
+ {
1156
+ "epoch": 0.9986130374479889,
1157
+ "grad_norm": 0.2710322141647339,
1158
+ "learning_rate": 4.169556840077071e-05,
1159
+ "loss": 0.4893,
1160
+ "mean_token_accuracy": 0.8400939464569092,
1161
+ "step": 720
1162
+ },
1163
+ {
1164
+ "epoch": 1.0055478502080444,
1165
+ "grad_norm": 0.28685542941093445,
1166
+ "learning_rate": 4.159922928709056e-05,
1167
+ "loss": 0.4413,
1168
+ "mean_token_accuracy": 0.8504527807235718,
1169
+ "step": 725
1170
+ },
1171
+ {
1172
+ "epoch": 1.0124826629680999,
1173
+ "grad_norm": 0.24674533307552338,
1174
+ "learning_rate": 4.150289017341041e-05,
1175
+ "loss": 0.4098,
1176
+ "mean_token_accuracy": 0.8587909460067749,
1177
+ "step": 730
1178
+ },
1179
+ {
1180
+ "epoch": 1.0194174757281553,
1181
+ "grad_norm": 0.21753250062465668,
1182
+ "learning_rate": 4.140655105973025e-05,
1183
+ "loss": 0.3935,
1184
+ "mean_token_accuracy": 0.8634741544723511,
1185
+ "step": 735
1186
+ },
1187
+ {
1188
+ "epoch": 1.0263522884882108,
1189
+ "grad_norm": 0.20492789149284363,
1190
+ "learning_rate": 4.1310211946050096e-05,
1191
+ "loss": 0.4068,
1192
+ "mean_token_accuracy": 0.8603113770484925,
1193
+ "step": 740
1194
+ },
1195
+ {
1196
+ "epoch": 1.0332871012482663,
1197
+ "grad_norm": 0.16923396289348602,
1198
+ "learning_rate": 4.1213872832369945e-05,
1199
+ "loss": 0.4185,
1200
+ "mean_token_accuracy": 0.8562816023826599,
1201
+ "step": 745
1202
+ },
1203
+ {
1204
+ "epoch": 1.0402219140083218,
1205
+ "grad_norm": 0.18504321575164795,
1206
+ "learning_rate": 4.111753371868979e-05,
1207
+ "loss": 0.4083,
1208
+ "mean_token_accuracy": 0.8588919401168823,
1209
+ "step": 750
1210
+ },
1211
+ {
1212
+ "epoch": 1.0471567267683772,
1213
+ "grad_norm": 0.15754340589046478,
1214
+ "learning_rate": 4.102119460500964e-05,
1215
+ "loss": 0.4001,
1216
+ "mean_token_accuracy": 0.8617303729057312,
1217
+ "step": 755
1218
+ },
1219
+ {
1220
+ "epoch": 1.0540915395284327,
1221
+ "grad_norm": 0.16705656051635742,
1222
+ "learning_rate": 4.0924855491329486e-05,
1223
+ "loss": 0.4124,
1224
+ "mean_token_accuracy": 0.8577338337898255,
1225
+ "step": 760
1226
+ },
1227
+ {
1228
+ "epoch": 1.0610263522884882,
1229
+ "grad_norm": 0.1913621723651886,
1230
+ "learning_rate": 4.082851637764933e-05,
1231
+ "loss": 0.4027,
1232
+ "mean_token_accuracy": 0.860604989528656,
1233
+ "step": 765
1234
+ },
1235
+ {
1236
+ "epoch": 1.0679611650485437,
1237
+ "grad_norm": 0.1807246059179306,
1238
+ "learning_rate": 4.073217726396917e-05,
1239
+ "loss": 0.4154,
1240
+ "mean_token_accuracy": 0.8569133520126343,
1241
+ "step": 770
1242
+ },
1243
+ {
1244
+ "epoch": 1.0748959778085991,
1245
+ "grad_norm": 0.16904115676879883,
1246
+ "learning_rate": 4.063583815028902e-05,
1247
+ "loss": 0.4043,
1248
+ "mean_token_accuracy": 0.8603330969810485,
1249
+ "step": 775
1250
+ },
1251
+ {
1252
+ "epoch": 1.0818307905686546,
1253
+ "grad_norm": 0.13820037245750427,
1254
+ "learning_rate": 4.053949903660886e-05,
1255
+ "loss": 0.4032,
1256
+ "mean_token_accuracy": 0.8607820630073547,
1257
+ "step": 780
1258
+ },
1259
+ {
1260
+ "epoch": 1.08876560332871,
1261
+ "grad_norm": 0.15458045899868011,
1262
+ "learning_rate": 4.0443159922928706e-05,
1263
+ "loss": 0.4126,
1264
+ "mean_token_accuracy": 0.8577965140342713,
1265
+ "step": 785
1266
+ },
1267
+ {
1268
+ "epoch": 1.0957004160887656,
1269
+ "grad_norm": 0.14621621370315552,
1270
+ "learning_rate": 4.034682080924856e-05,
1271
+ "loss": 0.4164,
1272
+ "mean_token_accuracy": 0.8568703651428222,
1273
+ "step": 790
1274
+ },
1275
+ {
1276
+ "epoch": 1.102635228848821,
1277
+ "grad_norm": 0.22418245673179626,
1278
+ "learning_rate": 4.0250481695568404e-05,
1279
+ "loss": 0.4151,
1280
+ "mean_token_accuracy": 0.8572113513946533,
1281
+ "step": 795
1282
+ },
1283
+ {
1284
+ "epoch": 1.1095700416088765,
1285
+ "grad_norm": 0.18166805803775787,
1286
+ "learning_rate": 4.015414258188825e-05,
1287
+ "loss": 0.4236,
1288
+ "mean_token_accuracy": 0.8545473575592041,
1289
+ "step": 800
1290
+ },
1291
+ {
1292
+ "epoch": 1.116504854368932,
1293
+ "grad_norm": 0.19410911202430725,
1294
+ "learning_rate": 4.0057803468208096e-05,
1295
+ "loss": 0.4081,
1296
+ "mean_token_accuracy": 0.8594057202339173,
1297
+ "step": 805
1298
+ },
1299
+ {
1300
+ "epoch": 1.1234396671289875,
1301
+ "grad_norm": 0.15663549304008484,
1302
+ "learning_rate": 3.996146435452794e-05,
1303
+ "loss": 0.41,
1304
+ "mean_token_accuracy": 0.8585174441337585,
1305
+ "step": 810
1306
+ },
1307
+ {
1308
+ "epoch": 1.130374479889043,
1309
+ "grad_norm": 0.2926901578903198,
1310
+ "learning_rate": 3.986512524084778e-05,
1311
+ "loss": 0.4088,
1312
+ "mean_token_accuracy": 0.8590117692947388,
1313
+ "step": 815
1314
+ },
1315
+ {
1316
+ "epoch": 1.1373092926490984,
1317
+ "grad_norm": 0.14440152049064636,
1318
+ "learning_rate": 3.976878612716764e-05,
1319
+ "loss": 0.4029,
1320
+ "mean_token_accuracy": 0.8609296917915344,
1321
+ "step": 820
1322
+ },
1323
+ {
1324
+ "epoch": 1.1442441054091539,
1325
+ "grad_norm": 0.18435537815093994,
1326
+ "learning_rate": 3.967244701348748e-05,
1327
+ "loss": 0.406,
1328
+ "mean_token_accuracy": 0.8599510669708252,
1329
+ "step": 825
1330
+ },
1331
+ {
1332
+ "epoch": 1.1511789181692094,
1333
+ "grad_norm": 0.16614344716072083,
1334
+ "learning_rate": 3.957610789980732e-05,
1335
+ "loss": 0.4165,
1336
+ "mean_token_accuracy": 0.8565038800239563,
1337
+ "step": 830
1338
+ },
1339
+ {
1340
+ "epoch": 1.1581137309292648,
1341
+ "grad_norm": 0.180514857172966,
1342
+ "learning_rate": 3.947976878612717e-05,
1343
+ "loss": 0.4233,
1344
+ "mean_token_accuracy": 0.854881489276886,
1345
+ "step": 835
1346
+ },
1347
+ {
1348
+ "epoch": 1.1650485436893203,
1349
+ "grad_norm": 0.17873796820640564,
1350
+ "learning_rate": 3.9383429672447015e-05,
1351
+ "loss": 0.4142,
1352
+ "mean_token_accuracy": 0.8575613379478455,
1353
+ "step": 840
1354
+ },
1355
+ {
1356
+ "epoch": 1.1719833564493758,
1357
+ "grad_norm": 0.17171607911586761,
1358
+ "learning_rate": 3.928709055876686e-05,
1359
+ "loss": 0.4199,
1360
+ "mean_token_accuracy": 0.8555831432342529,
1361
+ "step": 845
1362
+ },
1363
+ {
1364
+ "epoch": 1.1789181692094313,
1365
+ "grad_norm": 0.2052180916070938,
1366
+ "learning_rate": 3.9190751445086707e-05,
1367
+ "loss": 0.4084,
1368
+ "mean_token_accuracy": 0.8590785980224609,
1369
+ "step": 850
1370
+ },
1371
+ {
1372
+ "epoch": 1.1858529819694867,
1373
+ "grad_norm": 0.16104774177074432,
1374
+ "learning_rate": 3.9094412331406556e-05,
1375
+ "loss": 0.4064,
1376
+ "mean_token_accuracy": 0.8598842978477478,
1377
+ "step": 855
1378
+ },
1379
+ {
1380
+ "epoch": 1.1927877947295422,
1381
+ "grad_norm": 0.16743043065071106,
1382
+ "learning_rate": 3.89980732177264e-05,
1383
+ "loss": 0.4136,
1384
+ "mean_token_accuracy": 0.857661247253418,
1385
+ "step": 860
1386
+ },
1387
+ {
1388
+ "epoch": 1.1997226074895977,
1389
+ "grad_norm": 0.15085460245609283,
1390
+ "learning_rate": 3.890173410404625e-05,
1391
+ "loss": 0.4143,
1392
+ "mean_token_accuracy": 0.8573906064033509,
1393
+ "step": 865
1394
+ },
1395
+ {
1396
+ "epoch": 1.2066574202496532,
1397
+ "grad_norm": 0.14390355348587036,
1398
+ "learning_rate": 3.880539499036609e-05,
1399
+ "loss": 0.4221,
1400
+ "mean_token_accuracy": 0.8549758553504944,
1401
+ "step": 870
1402
+ },
1403
+ {
1404
+ "epoch": 1.2135922330097086,
1405
+ "grad_norm": 0.170955091714859,
1406
+ "learning_rate": 3.870905587668593e-05,
1407
+ "loss": 0.417,
1408
+ "mean_token_accuracy": 0.8564952373504638,
1409
+ "step": 875
1410
+ },
1411
+ {
1412
+ "epoch": 1.2205270457697641,
1413
+ "grad_norm": 0.17432747781276703,
1414
+ "learning_rate": 3.861271676300578e-05,
1415
+ "loss": 0.4065,
1416
+ "mean_token_accuracy": 0.8594078302383423,
1417
+ "step": 880
1418
+ },
1419
+ {
1420
+ "epoch": 1.2274618585298196,
1421
+ "grad_norm": 0.15886807441711426,
1422
+ "learning_rate": 3.851637764932563e-05,
1423
+ "loss": 0.4191,
1424
+ "mean_token_accuracy": 0.8560209155082703,
1425
+ "step": 885
1426
+ },
1427
+ {
1428
+ "epoch": 1.234396671289875,
1429
+ "grad_norm": 0.22739961743354797,
1430
+ "learning_rate": 3.8420038535645474e-05,
1431
+ "loss": 0.4136,
1432
+ "mean_token_accuracy": 0.8580429792404175,
1433
+ "step": 890
1434
+ },
1435
+ {
1436
+ "epoch": 1.2413314840499305,
1437
+ "grad_norm": 0.1761876940727234,
1438
+ "learning_rate": 3.832369942196532e-05,
1439
+ "loss": 0.4058,
1440
+ "mean_token_accuracy": 0.8598258137702942,
1441
+ "step": 895
1442
+ },
1443
+ {
1444
+ "epoch": 1.248266296809986,
1445
+ "grad_norm": 0.8043875098228455,
1446
+ "learning_rate": 3.8227360308285166e-05,
1447
+ "loss": 0.4146,
1448
+ "mean_token_accuracy": 0.8571544885635376,
1449
+ "step": 900
1450
+ },
1451
+ {
1452
+ "epoch": 1.2552011095700415,
1453
+ "grad_norm": 0.18043817579746246,
1454
+ "learning_rate": 3.813102119460501e-05,
1455
+ "loss": 0.4032,
1456
+ "mean_token_accuracy": 0.8605299115180969,
1457
+ "step": 905
1458
+ },
1459
+ {
1460
+ "epoch": 1.262135922330097,
1461
+ "grad_norm": 0.16484476625919342,
1462
+ "learning_rate": 3.803468208092486e-05,
1463
+ "loss": 0.3987,
1464
+ "mean_token_accuracy": 0.8620869636535644,
1465
+ "step": 910
1466
+ },
1467
+ {
1468
+ "epoch": 1.2690707350901524,
1469
+ "grad_norm": 0.15530748665332794,
1470
+ "learning_rate": 3.793834296724471e-05,
1471
+ "loss": 0.4019,
1472
+ "mean_token_accuracy": 0.8608452916145325,
1473
+ "step": 915
1474
+ },
1475
+ {
1476
+ "epoch": 1.276005547850208,
1477
+ "grad_norm": 0.16284696757793427,
1478
+ "learning_rate": 3.784200385356455e-05,
1479
+ "loss": 0.4056,
1480
+ "mean_token_accuracy": 0.8598546504974365,
1481
+ "step": 920
1482
+ },
1483
+ {
1484
+ "epoch": 1.2829403606102634,
1485
+ "grad_norm": 0.15156075358390808,
1486
+ "learning_rate": 3.774566473988439e-05,
1487
+ "loss": 0.4148,
1488
+ "mean_token_accuracy": 0.8573850750923157,
1489
+ "step": 925
1490
+ },
1491
+ {
1492
+ "epoch": 1.2898751733703189,
1493
+ "grad_norm": 0.18044961988925934,
1494
+ "learning_rate": 3.764932562620424e-05,
1495
+ "loss": 0.4165,
1496
+ "mean_token_accuracy": 0.8561815977096557,
1497
+ "step": 930
1498
+ },
1499
+ {
1500
+ "epoch": 1.2968099861303743,
1501
+ "grad_norm": 0.1658436506986618,
1502
+ "learning_rate": 3.7552986512524084e-05,
1503
+ "loss": 0.4056,
1504
+ "mean_token_accuracy": 0.8600709080696106,
1505
+ "step": 935
1506
+ },
1507
+ {
1508
+ "epoch": 1.3037447988904298,
1509
+ "grad_norm": 0.16520382463932037,
1510
+ "learning_rate": 3.7456647398843934e-05,
1511
+ "loss": 0.4148,
1512
+ "mean_token_accuracy": 0.8568984508514405,
1513
+ "step": 940
1514
+ },
1515
+ {
1516
+ "epoch": 1.3106796116504853,
1517
+ "grad_norm": 0.1799880713224411,
1518
+ "learning_rate": 3.736030828516378e-05,
1519
+ "loss": 0.4188,
1520
+ "mean_token_accuracy": 0.856162166595459,
1521
+ "step": 945
1522
+ },
1523
+ {
1524
+ "epoch": 1.317614424410541,
1525
+ "grad_norm": 0.16812920570373535,
1526
+ "learning_rate": 3.7263969171483626e-05,
1527
+ "loss": 0.4111,
1528
+ "mean_token_accuracy": 0.8584610104560852,
1529
+ "step": 950
1530
+ },
1531
+ {
1532
+ "epoch": 1.3245492371705965,
1533
+ "grad_norm": 0.15165302157402039,
1534
+ "learning_rate": 3.716763005780347e-05,
1535
+ "loss": 0.404,
1536
+ "mean_token_accuracy": 0.860293960571289,
1537
+ "step": 955
1538
+ },
1539
+ {
1540
+ "epoch": 1.331484049930652,
1541
+ "grad_norm": 0.13624367117881775,
1542
+ "learning_rate": 3.707129094412332e-05,
1543
+ "loss": 0.4136,
1544
+ "mean_token_accuracy": 0.8572997689247132,
1545
+ "step": 960
1546
+ },
1547
+ {
1548
+ "epoch": 1.3384188626907074,
1549
+ "grad_norm": 0.6036350131034851,
1550
+ "learning_rate": 3.697495183044316e-05,
1551
+ "loss": 0.4139,
1552
+ "mean_token_accuracy": 0.8579724669456482,
1553
+ "step": 965
1554
+ },
1555
+ {
1556
+ "epoch": 1.345353675450763,
1557
+ "grad_norm": 0.16172119975090027,
1558
+ "learning_rate": 3.6878612716763e-05,
1559
+ "loss": 0.4194,
1560
+ "mean_token_accuracy": 0.8556413412094116,
1561
+ "step": 970
1562
+ },
1563
+ {
1564
+ "epoch": 1.3522884882108184,
1565
+ "grad_norm": 0.13519282639026642,
1566
+ "learning_rate": 3.678227360308285e-05,
1567
+ "loss": 0.4053,
1568
+ "mean_token_accuracy": 0.8597145199775695,
1569
+ "step": 975
1570
+ },
1571
+ {
1572
+ "epoch": 1.3592233009708738,
1573
+ "grad_norm": 0.14305779337882996,
1574
+ "learning_rate": 3.66859344894027e-05,
1575
+ "loss": 0.4066,
1576
+ "mean_token_accuracy": 0.8596665501594544,
1577
+ "step": 980
1578
+ },
1579
+ {
1580
+ "epoch": 1.3661581137309293,
1581
+ "grad_norm": 0.18043436110019684,
1582
+ "learning_rate": 3.6589595375722544e-05,
1583
+ "loss": 0.4061,
1584
+ "mean_token_accuracy": 0.8598026871681214,
1585
+ "step": 985
1586
+ },
1587
+ {
1588
+ "epoch": 1.3730929264909848,
1589
+ "grad_norm": 0.12696458399295807,
1590
+ "learning_rate": 3.649325626204239e-05,
1591
+ "loss": 0.4035,
1592
+ "mean_token_accuracy": 0.860295832157135,
1593
+ "step": 990
1594
+ },
1595
+ {
1596
+ "epoch": 1.3800277392510403,
1597
+ "grad_norm": 0.15299014747142792,
1598
+ "learning_rate": 3.6396917148362236e-05,
1599
+ "loss": 0.4045,
1600
+ "mean_token_accuracy": 0.8601055145263672,
1601
+ "step": 995
1602
+ },
1603
+ {
1604
+ "epoch": 1.3869625520110958,
1605
+ "grad_norm": 0.14797343313694,
1606
+ "learning_rate": 3.630057803468208e-05,
1607
+ "loss": 0.4064,
1608
+ "mean_token_accuracy": 0.8599012613296508,
1609
+ "step": 1000
1610
+ }
1611
+ ],
1612
+ "logging_steps": 5,
1613
+ "max_steps": 2884,
1614
+ "num_input_tokens_seen": 0,
1615
+ "num_train_epochs": 4,
1616
+ "save_steps": 100,
1617
+ "stateful_callbacks": {
1618
+ "TrainerControl": {
1619
+ "args": {
1620
+ "should_epoch_stop": false,
1621
+ "should_evaluate": false,
1622
+ "should_log": false,
1623
+ "should_save": true,
1624
+ "should_training_stop": false
1625
+ },
1626
+ "attributes": {}
1627
+ }
1628
+ },
1629
+ "total_flos": 2.2953357891076096e+16,
1630
+ "train_batch_size": 1,
1631
+ "trial_name": null,
1632
+ "trial_params": null
1633
+ }