PEFT
Safetensors
English
German
vidore
multimodal_embedding
tattrongvu committed (verified)
Commit 7e6afe8 · 1 Parent(s): 6e77e2d

Upload 57 files

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. adapter_config.json +26 -0
  2. adapter_model.safetensors +3 -0
  3. added_tokens.json +16 -0
  4. chat_template.json +3 -0
  5. checkpoint-1137/adapter_config.json +26 -0
  6. checkpoint-1137/adapter_model.safetensors +3 -0
  7. checkpoint-1137/generation_config.json +14 -0
  8. checkpoint-1137/optimizer.pt +3 -0
  9. checkpoint-1137/rng_state_0.pth +3 -0
  10. checkpoint-1137/rng_state_1.pth +3 -0
  11. checkpoint-1137/rng_state_2.pth +3 -0
  12. checkpoint-1137/rng_state_3.pth +3 -0
  13. checkpoint-1137/rng_state_4.pth +3 -0
  14. checkpoint-1137/rng_state_5.pth +3 -0
  15. checkpoint-1137/rng_state_6.pth +3 -0
  16. checkpoint-1137/rng_state_7.pth +3 -0
  17. checkpoint-1137/scheduler.pt +3 -0
  18. checkpoint-1137/trainer_state.json +609 -0
  19. checkpoint-1137/training_args.bin +3 -0
  20. checkpoint-1516/adapter_config.json +26 -0
  21. checkpoint-1516/adapter_model.safetensors +3 -0
  22. checkpoint-1516/generation_config.json +14 -0
  23. checkpoint-1516/optimizer.pt +3 -0
  24. checkpoint-1516/rng_state_0.pth +3 -0
  25. checkpoint-1516/rng_state_1.pth +3 -0
  26. checkpoint-1516/rng_state_2.pth +3 -0
  27. checkpoint-1516/rng_state_3.pth +3 -0
  28. checkpoint-1516/rng_state_4.pth +3 -0
  29. checkpoint-1516/rng_state_5.pth +3 -0
  30. checkpoint-1516/rng_state_6.pth +3 -0
  31. checkpoint-1516/rng_state_7.pth +3 -0
  32. checkpoint-1516/scheduler.pt +3 -0
  33. checkpoint-1516/trainer_state.json +806 -0
  34. checkpoint-1516/training_args.bin +3 -0
  35. checkpoint-1895/adapter_config.json +26 -0
  36. checkpoint-1895/adapter_model.safetensors +3 -0
  37. checkpoint-1895/generation_config.json +14 -0
  38. checkpoint-1895/optimizer.pt +3 -0
  39. checkpoint-1895/rng_state_0.pth +3 -0
  40. checkpoint-1895/rng_state_1.pth +3 -0
  41. checkpoint-1895/rng_state_2.pth +3 -0
  42. checkpoint-1895/rng_state_3.pth +3 -0
  43. checkpoint-1895/rng_state_4.pth +3 -0
  44. checkpoint-1895/rng_state_5.pth +3 -0
  45. checkpoint-1895/rng_state_6.pth +3 -0
  46. checkpoint-1895/rng_state_7.pth +3 -0
  47. checkpoint-1895/scheduler.pt +3 -0
  48. checkpoint-1895/trainer_state.json +995 -0
  49. checkpoint-1895/training_args.bin +3 -0
  50. generation_config.json +14 -0
adapter_config.json ADDED
@@ -0,0 +1,26 @@
+ {
+ "alpha_pattern": {},
+ "auto_mapping": null,
+ "base_model_name_or_path": "vidore/colqwen2-base",
+ "bias": "none",
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": "gaussian",
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 128,
+ "lora_dropout": 0.1,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "r": 128,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": "(.*(model).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$|.*(custom_text_proj).*$)",
+ "task_type": "FEATURE_EXTRACTION",
+ "use_dora": false,
+ "use_rslora": false
+ }
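
The adapter_config.json above is the LoRA configuration written by PEFT (rank 128, alpha 128, dropout 0.1, targeting the attention and MLP projections plus custom_text_proj on top of vidore/colqwen2-base). A minimal sketch of reading it back with the `peft` library, assuming `peft` is installed and the file has been downloaded to a local directory (the published repo id is not part of this diff; the path below is illustrative):

```python
# Sketch only: load and inspect the LoRA adapter config shown in the diff above.
# Assumption: adapter_config.json has been downloaded into ./adapter (illustrative path).
from peft import PeftConfig

cfg = PeftConfig.from_pretrained("./adapter")   # parses adapter_config.json
print(cfg.peft_type)                  # LORA
print(cfg.r, cfg.lora_alpha)          # 128 128
print(cfg.base_model_name_or_path)    # vidore/colqwen2-base
print(cfg.target_modules)             # regex over the *_proj layers and custom_text_proj
```
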
adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c42115acdfbac77d6ad3cd0992cac6b57ceb4fc0caab3ac9aa65716d5c31e771
+ size 295915936
added_tokens.json ADDED
@@ -0,0 +1,16 @@
+ {
+ "<|box_end|>": 151649,
+ "<|box_start|>": 151648,
+ "<|endoftext|>": 151643,
+ "<|im_end|>": 151645,
+ "<|im_start|>": 151644,
+ "<|image_pad|>": 151655,
+ "<|object_ref_end|>": 151647,
+ "<|object_ref_start|>": 151646,
+ "<|quad_end|>": 151651,
+ "<|quad_start|>": 151650,
+ "<|video_pad|>": 151656,
+ "<|vision_end|>": 151653,
+ "<|vision_pad|>": 151654,
+ "<|vision_start|>": 151652
+ }
chat_template.json ADDED
@@ -0,0 +1,3 @@
+ {
+ "chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}"
+ }
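
chat_template.json carries the Qwen2-VL-style chat template: a default system preamble, `<|im_start|>`/`<|im_end|>` turn markers, and `<|vision_start|><|image_pad|><|vision_end|>` placeholders for images. A hedged sketch of rendering it through a `transformers` processor, assuming the full set of processor/tokenizer files from this upload is available in a local directory (only part of the commit is shown in this truncated view, and the path below is illustrative):

```python
# Sketch only: render the chat template from this commit for one image + text turn.
# Assumption: all processor/tokenizer files are present in ./model (illustrative path).
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("./model")  # picks up chat_template.json
messages = [
    {"role": "user", "content": [
        {"type": "image"},
        {"type": "text", "text": "Describe this document page."},
    ]},
]
prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
print(prompt)  # <|vision_start|><|image_pad|><|vision_end|> is inserted where the image goes
```
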
checkpoint-1137/adapter_config.json ADDED
@@ -0,0 +1,26 @@
+ {
+ "alpha_pattern": {},
+ "auto_mapping": null,
+ "base_model_name_or_path": "/home/pv_rwm_models/models/colqwen2-base",
+ "bias": "none",
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": "gaussian",
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 128,
+ "lora_dropout": 0.1,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "r": 128,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": "(.*(model).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$|.*(custom_text_proj).*$)",
+ "task_type": "FEATURE_EXTRACTION",
+ "use_dora": false,
+ "use_rslora": false
+ }
checkpoint-1137/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:641ec4216b7332961fd4c782686bea95a84d362f4673b0dd978b1717f1e835bf
+ size 295915936
checkpoint-1137/generation_config.json ADDED
@@ -0,0 +1,14 @@
+ {
+ "attn_implementation": "flash_attention_2",
+ "bos_token_id": 151643,
+ "do_sample": true,
+ "eos_token_id": [
+ 151645,
+ 151643
+ ],
+ "pad_token_id": 151643,
+ "temperature": 0.01,
+ "top_k": 1,
+ "top_p": 0.001,
+ "transformers_version": "4.46.3"
+ }
checkpoint-1137/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7203dd9efbb829e0d9fce93b4da2106a7cc7ca88b434a6d22532d2395a166141
+ size 592056816
checkpoint-1137/rng_state_0.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8dfe83380afd02e39e99a452ffa4d0e2ed6cf9dc67102f3b1f8d92f6e58b7b61
+ size 15920
checkpoint-1137/rng_state_1.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:45e151cc59a71fe4be7f2c4af3813dc815440d9b69f665243ffdde6af209f47f
+ size 15920
checkpoint-1137/rng_state_2.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9d9806843fadafe99ac79ed0f989905231a01f31b46537122656af6dca41a56f
+ size 15920
checkpoint-1137/rng_state_3.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a9388c726ec6adbd0a1e8f7c95c4c2ca6810f34dd6b1f4a5741d93c8875609ad
+ size 15920
checkpoint-1137/rng_state_4.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:110bec9a33476e4807b85778f0b15ca30064f71eebac2d0c66eb7770a14fd256
+ size 15920
checkpoint-1137/rng_state_5.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6bf4847286fca82e1e6a60e71b0e53660e16d33c4e3023fed8635447b49195b4
+ size 15920
checkpoint-1137/rng_state_6.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4682b50c72ea5dd8bae1b4e43373e007756c1519b805597f29b23bfc84030729
+ size 15920
checkpoint-1137/rng_state_7.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ecb6005cc8ddfe00d00c9e2c9f41bda27cc503e73bc78577188fbcda6beb384d
+ size 15920
checkpoint-1137/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a4091fdd205669b2e0406ab4e0ed89397fae90307c5f2a4d59f636f4a9012111
+ size 1064
checkpoint-1137/trainer_state.json ADDED
@@ -0,0 +1,609 @@
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 3.0,
5
+ "eval_steps": 50,
6
+ "global_step": 1137,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.002638522427440633,
13
+ "eval_loss": 0.3697243332862854,
14
+ "eval_runtime": 31.4109,
15
+ "eval_samples_per_second": 63.672,
16
+ "eval_steps_per_second": 0.255,
17
+ "step": 1
18
+ },
19
+ {
20
+ "epoch": 0.052770448548812667,
21
+ "grad_norm": 0.26953125,
22
+ "learning_rate": 0.00010526315789473685,
23
+ "loss": 0.3823,
24
+ "step": 20
25
+ },
26
+ {
27
+ "epoch": 0.10554089709762533,
28
+ "grad_norm": 0.201171875,
29
+ "learning_rate": 0.0001997845988152935,
30
+ "loss": 0.2239,
31
+ "step": 40
32
+ },
33
+ {
34
+ "epoch": 0.13192612137203166,
35
+ "eval_loss": 0.11808302253484726,
36
+ "eval_runtime": 29.4538,
37
+ "eval_samples_per_second": 67.903,
38
+ "eval_steps_per_second": 0.272,
39
+ "step": 50
40
+ },
41
+ {
42
+ "epoch": 0.158311345646438,
43
+ "grad_norm": 0.1962890625,
44
+ "learning_rate": 0.00019763058696822833,
45
+ "loss": 0.1799,
46
+ "step": 60
47
+ },
48
+ {
49
+ "epoch": 0.21108179419525067,
50
+ "grad_norm": 0.1943359375,
51
+ "learning_rate": 0.0001954765751211632,
52
+ "loss": 0.1651,
53
+ "step": 80
54
+ },
55
+ {
56
+ "epoch": 0.2638522427440633,
57
+ "grad_norm": 0.2255859375,
58
+ "learning_rate": 0.00019332256327409802,
59
+ "loss": 0.1571,
60
+ "step": 100
61
+ },
62
+ {
63
+ "epoch": 0.2638522427440633,
64
+ "eval_loss": 0.09250890463590622,
65
+ "eval_runtime": 28.2273,
66
+ "eval_samples_per_second": 70.853,
67
+ "eval_steps_per_second": 0.283,
68
+ "step": 100
69
+ },
70
+ {
71
+ "epoch": 0.316622691292876,
72
+ "grad_norm": 0.2333984375,
73
+ "learning_rate": 0.00019116855142703286,
74
+ "loss": 0.1535,
75
+ "step": 120
76
+ },
77
+ {
78
+ "epoch": 0.36939313984168864,
79
+ "grad_norm": 0.1611328125,
80
+ "learning_rate": 0.00018901453957996772,
81
+ "loss": 0.1456,
82
+ "step": 140
83
+ },
84
+ {
85
+ "epoch": 0.39577836411609496,
86
+ "eval_loss": 0.08707328885793686,
87
+ "eval_runtime": 27.6259,
88
+ "eval_samples_per_second": 72.396,
89
+ "eval_steps_per_second": 0.29,
90
+ "step": 150
91
+ },
92
+ {
93
+ "epoch": 0.42216358839050133,
94
+ "grad_norm": 0.1884765625,
95
+ "learning_rate": 0.00018686052773290255,
96
+ "loss": 0.1402,
97
+ "step": 160
98
+ },
99
+ {
100
+ "epoch": 0.47493403693931396,
101
+ "grad_norm": 0.2109375,
102
+ "learning_rate": 0.0001847065158858374,
103
+ "loss": 0.142,
104
+ "step": 180
105
+ },
106
+ {
107
+ "epoch": 0.5277044854881267,
108
+ "grad_norm": 0.1533203125,
109
+ "learning_rate": 0.00018255250403877222,
110
+ "loss": 0.1318,
111
+ "step": 200
112
+ },
113
+ {
114
+ "epoch": 0.5277044854881267,
115
+ "eval_loss": 0.080934077501297,
116
+ "eval_runtime": 27.3743,
117
+ "eval_samples_per_second": 73.061,
118
+ "eval_steps_per_second": 0.292,
119
+ "step": 200
120
+ },
121
+ {
122
+ "epoch": 0.5804749340369393,
123
+ "grad_norm": 0.216796875,
124
+ "learning_rate": 0.00018039849219170706,
125
+ "loss": 0.1301,
126
+ "step": 220
127
+ },
128
+ {
129
+ "epoch": 0.633245382585752,
130
+ "grad_norm": 0.162109375,
131
+ "learning_rate": 0.0001782444803446419,
132
+ "loss": 0.1317,
133
+ "step": 240
134
+ },
135
+ {
136
+ "epoch": 0.6596306068601583,
137
+ "eval_loss": 0.0750429555773735,
138
+ "eval_runtime": 27.7505,
139
+ "eval_samples_per_second": 72.071,
140
+ "eval_steps_per_second": 0.288,
141
+ "step": 250
142
+ },
143
+ {
144
+ "epoch": 0.6860158311345647,
145
+ "grad_norm": 0.185546875,
146
+ "learning_rate": 0.00017609046849757676,
147
+ "loss": 0.1269,
148
+ "step": 260
149
+ },
150
+ {
151
+ "epoch": 0.7387862796833773,
152
+ "grad_norm": 0.203125,
153
+ "learning_rate": 0.0001739364566505116,
154
+ "loss": 0.1267,
155
+ "step": 280
156
+ },
157
+ {
158
+ "epoch": 0.7915567282321899,
159
+ "grad_norm": 0.1455078125,
160
+ "learning_rate": 0.00017178244480344642,
161
+ "loss": 0.1226,
162
+ "step": 300
163
+ },
164
+ {
165
+ "epoch": 0.7915567282321899,
166
+ "eval_loss": 0.07792137563228607,
167
+ "eval_runtime": 27.3248,
168
+ "eval_samples_per_second": 73.194,
169
+ "eval_steps_per_second": 0.293,
170
+ "step": 300
171
+ },
172
+ {
173
+ "epoch": 0.8443271767810027,
174
+ "grad_norm": 0.1630859375,
175
+ "learning_rate": 0.00016962843295638126,
176
+ "loss": 0.1222,
177
+ "step": 320
178
+ },
179
+ {
180
+ "epoch": 0.8970976253298153,
181
+ "grad_norm": 0.173828125,
182
+ "learning_rate": 0.0001674744211093161,
183
+ "loss": 0.1254,
184
+ "step": 340
185
+ },
186
+ {
187
+ "epoch": 0.9234828496042217,
188
+ "eval_loss": 0.07484881579875946,
189
+ "eval_runtime": 27.8135,
190
+ "eval_samples_per_second": 71.907,
191
+ "eval_steps_per_second": 0.288,
192
+ "step": 350
193
+ },
194
+ {
195
+ "epoch": 0.9498680738786279,
196
+ "grad_norm": 0.1728515625,
197
+ "learning_rate": 0.00016532040926225093,
198
+ "loss": 0.1177,
199
+ "step": 360
200
+ },
201
+ {
202
+ "epoch": 1.0026385224274406,
203
+ "grad_norm": 0.1220703125,
204
+ "learning_rate": 0.0001631663974151858,
205
+ "loss": 0.1207,
206
+ "step": 380
207
+ },
208
+ {
209
+ "epoch": 1.0554089709762533,
210
+ "grad_norm": 0.1591796875,
211
+ "learning_rate": 0.00016101238556812063,
212
+ "loss": 0.1046,
213
+ "step": 400
214
+ },
215
+ {
216
+ "epoch": 1.0554089709762533,
217
+ "eval_loss": 0.0715707540512085,
218
+ "eval_runtime": 27.7758,
219
+ "eval_samples_per_second": 72.005,
220
+ "eval_steps_per_second": 0.288,
221
+ "step": 400
222
+ },
223
+ {
224
+ "epoch": 1.108179419525066,
225
+ "grad_norm": 0.1142578125,
226
+ "learning_rate": 0.0001588583737210555,
227
+ "loss": 0.1041,
228
+ "step": 420
229
+ },
230
+ {
231
+ "epoch": 1.1609498680738786,
232
+ "grad_norm": 0.177734375,
233
+ "learning_rate": 0.00015670436187399032,
234
+ "loss": 0.1034,
235
+ "step": 440
236
+ },
237
+ {
238
+ "epoch": 1.187335092348285,
239
+ "eval_loss": 0.0693235993385315,
240
+ "eval_runtime": 27.7658,
241
+ "eval_samples_per_second": 72.031,
242
+ "eval_steps_per_second": 0.288,
243
+ "step": 450
244
+ },
245
+ {
246
+ "epoch": 1.2137203166226913,
247
+ "grad_norm": 0.1630859375,
248
+ "learning_rate": 0.00015455035002692516,
249
+ "loss": 0.1042,
250
+ "step": 460
251
+ },
252
+ {
253
+ "epoch": 1.266490765171504,
254
+ "grad_norm": 0.1611328125,
255
+ "learning_rate": 0.00015239633817986,
256
+ "loss": 0.1032,
257
+ "step": 480
258
+ },
259
+ {
260
+ "epoch": 1.3192612137203166,
261
+ "grad_norm": 0.169921875,
262
+ "learning_rate": 0.00015024232633279485,
263
+ "loss": 0.1021,
264
+ "step": 500
265
+ },
266
+ {
267
+ "epoch": 1.3192612137203166,
268
+ "eval_loss": 0.06579812616109848,
269
+ "eval_runtime": 27.42,
270
+ "eval_samples_per_second": 72.939,
271
+ "eval_steps_per_second": 0.292,
272
+ "step": 500
273
+ },
274
+ {
275
+ "epoch": 1.3720316622691293,
276
+ "grad_norm": 0.1611328125,
277
+ "learning_rate": 0.0001480883144857297,
278
+ "loss": 0.1041,
279
+ "step": 520
280
+ },
281
+ {
282
+ "epoch": 1.424802110817942,
283
+ "grad_norm": 0.11474609375,
284
+ "learning_rate": 0.00014593430263866452,
285
+ "loss": 0.1006,
286
+ "step": 540
287
+ },
288
+ {
289
+ "epoch": 1.4511873350923483,
290
+ "eval_loss": 0.06417644023895264,
291
+ "eval_runtime": 27.5371,
292
+ "eval_samples_per_second": 72.629,
293
+ "eval_steps_per_second": 0.291,
294
+ "step": 550
295
+ },
296
+ {
297
+ "epoch": 1.4775725593667546,
298
+ "grad_norm": 0.1259765625,
299
+ "learning_rate": 0.00014378029079159936,
300
+ "loss": 0.1001,
301
+ "step": 560
302
+ },
303
+ {
304
+ "epoch": 1.5303430079155673,
305
+ "grad_norm": 0.146484375,
306
+ "learning_rate": 0.0001416262789445342,
307
+ "loss": 0.1013,
308
+ "step": 580
309
+ },
310
+ {
311
+ "epoch": 1.58311345646438,
312
+ "grad_norm": 0.1591796875,
313
+ "learning_rate": 0.00013947226709746903,
314
+ "loss": 0.1,
315
+ "step": 600
316
+ },
317
+ {
318
+ "epoch": 1.58311345646438,
319
+ "eval_loss": 0.06583409756422043,
320
+ "eval_runtime": 28.0223,
321
+ "eval_samples_per_second": 71.372,
322
+ "eval_steps_per_second": 0.285,
323
+ "step": 600
324
+ },
325
+ {
326
+ "epoch": 1.6358839050131926,
327
+ "grad_norm": 0.1611328125,
328
+ "learning_rate": 0.0001373182552504039,
329
+ "loss": 0.1021,
330
+ "step": 620
331
+ },
332
+ {
333
+ "epoch": 1.6886543535620053,
334
+ "grad_norm": 0.14453125,
335
+ "learning_rate": 0.00013516424340333873,
336
+ "loss": 0.1002,
337
+ "step": 640
338
+ },
339
+ {
340
+ "epoch": 1.7150395778364116,
341
+ "eval_loss": 0.06498919427394867,
342
+ "eval_runtime": 28.3581,
343
+ "eval_samples_per_second": 70.527,
344
+ "eval_steps_per_second": 0.282,
345
+ "step": 650
346
+ },
347
+ {
348
+ "epoch": 1.741424802110818,
349
+ "grad_norm": 0.111328125,
350
+ "learning_rate": 0.00013301023155627356,
351
+ "loss": 0.0967,
352
+ "step": 660
353
+ },
354
+ {
355
+ "epoch": 1.7941952506596306,
356
+ "grad_norm": 0.1884765625,
357
+ "learning_rate": 0.0001308562197092084,
358
+ "loss": 0.1004,
359
+ "step": 680
360
+ },
361
+ {
362
+ "epoch": 1.8469656992084431,
363
+ "grad_norm": 0.13671875,
364
+ "learning_rate": 0.00012870220786214323,
365
+ "loss": 0.0992,
366
+ "step": 700
367
+ },
368
+ {
369
+ "epoch": 1.8469656992084431,
370
+ "eval_loss": 0.06491042673587799,
371
+ "eval_runtime": 27.748,
372
+ "eval_samples_per_second": 72.077,
373
+ "eval_steps_per_second": 0.288,
374
+ "step": 700
375
+ },
376
+ {
377
+ "epoch": 1.899736147757256,
378
+ "grad_norm": 0.15234375,
379
+ "learning_rate": 0.0001265481960150781,
380
+ "loss": 0.0967,
381
+ "step": 720
382
+ },
383
+ {
384
+ "epoch": 1.9525065963060686,
385
+ "grad_norm": 0.12451171875,
386
+ "learning_rate": 0.00012439418416801293,
387
+ "loss": 0.0956,
388
+ "step": 740
389
+ },
390
+ {
391
+ "epoch": 1.978891820580475,
392
+ "eval_loss": 0.06425958126783371,
393
+ "eval_runtime": 27.654,
394
+ "eval_samples_per_second": 72.322,
395
+ "eval_steps_per_second": 0.289,
396
+ "step": 750
397
+ },
398
+ {
399
+ "epoch": 2.005277044854881,
400
+ "grad_norm": 0.12060546875,
401
+ "learning_rate": 0.0001222401723209478,
402
+ "loss": 0.0934,
403
+ "step": 760
404
+ },
405
+ {
406
+ "epoch": 2.058047493403694,
407
+ "grad_norm": 0.171875,
408
+ "learning_rate": 0.00012008616047388261,
409
+ "loss": 0.0907,
410
+ "step": 780
411
+ },
412
+ {
413
+ "epoch": 2.1108179419525066,
414
+ "grad_norm": 0.16796875,
415
+ "learning_rate": 0.00011793214862681745,
416
+ "loss": 0.0861,
417
+ "step": 800
418
+ },
419
+ {
420
+ "epoch": 2.1108179419525066,
421
+ "eval_loss": 0.06223862245678902,
422
+ "eval_runtime": 27.4046,
423
+ "eval_samples_per_second": 72.981,
424
+ "eval_steps_per_second": 0.292,
425
+ "step": 800
426
+ },
427
+ {
428
+ "epoch": 2.163588390501319,
429
+ "grad_norm": 0.134765625,
430
+ "learning_rate": 0.0001157781367797523,
431
+ "loss": 0.0864,
432
+ "step": 820
433
+ },
434
+ {
435
+ "epoch": 2.216358839050132,
436
+ "grad_norm": 0.123046875,
437
+ "learning_rate": 0.00011362412493268713,
438
+ "loss": 0.0842,
439
+ "step": 840
440
+ },
441
+ {
442
+ "epoch": 2.242744063324538,
443
+ "eval_loss": 0.060463495552539825,
444
+ "eval_runtime": 27.4597,
445
+ "eval_samples_per_second": 72.834,
446
+ "eval_steps_per_second": 0.291,
447
+ "step": 850
448
+ },
449
+ {
450
+ "epoch": 2.2691292875989446,
451
+ "grad_norm": 0.1435546875,
452
+ "learning_rate": 0.00011147011308562199,
453
+ "loss": 0.0863,
454
+ "step": 860
455
+ },
456
+ {
457
+ "epoch": 2.321899736147757,
458
+ "grad_norm": 0.1494140625,
459
+ "learning_rate": 0.00010931610123855683,
460
+ "loss": 0.0858,
461
+ "step": 880
462
+ },
463
+ {
464
+ "epoch": 2.37467018469657,
465
+ "grad_norm": 0.1259765625,
466
+ "learning_rate": 0.00010716208939149166,
467
+ "loss": 0.0866,
468
+ "step": 900
469
+ },
470
+ {
471
+ "epoch": 2.37467018469657,
472
+ "eval_loss": 0.06099672615528107,
473
+ "eval_runtime": 27.7635,
474
+ "eval_samples_per_second": 72.037,
475
+ "eval_steps_per_second": 0.288,
476
+ "step": 900
477
+ },
478
+ {
479
+ "epoch": 2.4274406332453826,
480
+ "grad_norm": 0.1376953125,
481
+ "learning_rate": 0.0001050080775444265,
482
+ "loss": 0.0873,
483
+ "step": 920
484
+ },
485
+ {
486
+ "epoch": 2.480211081794195,
487
+ "grad_norm": 0.158203125,
488
+ "learning_rate": 0.00010285406569736133,
489
+ "loss": 0.0853,
490
+ "step": 940
491
+ },
492
+ {
493
+ "epoch": 2.5065963060686016,
494
+ "eval_loss": 0.06115744262933731,
495
+ "eval_runtime": 27.8521,
496
+ "eval_samples_per_second": 71.808,
497
+ "eval_steps_per_second": 0.287,
498
+ "step": 950
499
+ },
500
+ {
501
+ "epoch": 2.532981530343008,
502
+ "grad_norm": 0.1259765625,
503
+ "learning_rate": 0.00010070005385029618,
504
+ "loss": 0.0849,
505
+ "step": 960
506
+ },
507
+ {
508
+ "epoch": 2.5857519788918206,
509
+ "grad_norm": 0.1318359375,
510
+ "learning_rate": 9.854604200323103e-05,
511
+ "loss": 0.0814,
512
+ "step": 980
513
+ },
514
+ {
515
+ "epoch": 2.638522427440633,
516
+ "grad_norm": 0.1376953125,
517
+ "learning_rate": 9.639203015616588e-05,
518
+ "loss": 0.0864,
519
+ "step": 1000
520
+ },
521
+ {
522
+ "epoch": 2.638522427440633,
523
+ "eval_loss": 0.05968466028571129,
524
+ "eval_runtime": 27.6897,
525
+ "eval_samples_per_second": 72.229,
526
+ "eval_steps_per_second": 0.289,
527
+ "step": 1000
528
+ },
529
+ {
530
+ "epoch": 2.691292875989446,
531
+ "grad_norm": 0.16015625,
532
+ "learning_rate": 9.423801830910071e-05,
533
+ "loss": 0.0869,
534
+ "step": 1020
535
+ },
536
+ {
537
+ "epoch": 2.7440633245382586,
538
+ "grad_norm": 0.12890625,
539
+ "learning_rate": 9.208400646203555e-05,
540
+ "loss": 0.0821,
541
+ "step": 1040
542
+ },
543
+ {
544
+ "epoch": 2.7704485488126647,
545
+ "eval_loss": 0.059157080948352814,
546
+ "eval_runtime": 27.7435,
547
+ "eval_samples_per_second": 72.089,
548
+ "eval_steps_per_second": 0.288,
549
+ "step": 1050
550
+ },
551
+ {
552
+ "epoch": 2.796833773087071,
553
+ "grad_norm": 0.1337890625,
554
+ "learning_rate": 8.99299946149704e-05,
555
+ "loss": 0.0842,
556
+ "step": 1060
557
+ },
558
+ {
559
+ "epoch": 2.849604221635884,
560
+ "grad_norm": 0.1513671875,
561
+ "learning_rate": 8.777598276790523e-05,
562
+ "loss": 0.0846,
563
+ "step": 1080
564
+ },
565
+ {
566
+ "epoch": 2.9023746701846966,
567
+ "grad_norm": 0.1328125,
568
+ "learning_rate": 8.562197092084006e-05,
569
+ "loss": 0.0841,
570
+ "step": 1100
571
+ },
572
+ {
573
+ "epoch": 2.9023746701846966,
574
+ "eval_loss": 0.05879725515842438,
575
+ "eval_runtime": 27.612,
576
+ "eval_samples_per_second": 72.432,
577
+ "eval_steps_per_second": 0.29,
578
+ "step": 1100
579
+ },
580
+ {
581
+ "epoch": 2.955145118733509,
582
+ "grad_norm": 0.1455078125,
583
+ "learning_rate": 8.346795907377491e-05,
584
+ "loss": 0.0809,
585
+ "step": 1120
586
+ }
587
+ ],
588
+ "logging_steps": 20,
589
+ "max_steps": 1895,
590
+ "num_input_tokens_seen": 0,
591
+ "num_train_epochs": 5,
592
+ "save_steps": 500,
593
+ "stateful_callbacks": {
594
+ "TrainerControl": {
595
+ "args": {
596
+ "should_epoch_stop": false,
597
+ "should_evaluate": false,
598
+ "should_log": false,
599
+ "should_save": true,
600
+ "should_training_stop": false
601
+ },
602
+ "attributes": {}
603
+ }
604
+ },
605
+ "total_flos": 1.527787484038457e+19,
606
+ "train_batch_size": 128,
607
+ "trial_name": null,
608
+ "trial_params": null
609
+ }
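
The trainer_state.json above logs a training entry every 20 steps and an eval pass every 50 steps (train loss falls from about 0.38 to 0.08, and eval_loss reaches about 0.059 by step 1100 of 1895 planned steps). A small sketch for extracting the eval-loss curve from that file with only the standard library, assuming it has been downloaded locally:

```python
# Sketch only: print the eval-loss curve recorded in checkpoint-1137/trainer_state.json.
import json

with open("checkpoint-1137/trainer_state.json") as f:
    state = json.load(f)

for entry in state["log_history"]:
    if "eval_loss" in entry:  # eval entries carry eval_* keys, train entries carry loss/grad_norm
        print(f"step {entry['step']:>5}  eval_loss {entry['eval_loss']:.4f}")
```
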
checkpoint-1137/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:22bcb641e1380aac3e0a537740052e2d8d16eda981f3eee0acf549dc506b6fc5
+ size 5496
checkpoint-1516/adapter_config.json ADDED
@@ -0,0 +1,26 @@
+ {
+ "alpha_pattern": {},
+ "auto_mapping": null,
+ "base_model_name_or_path": "/home/pv_rwm_models/models/colqwen2-base",
+ "bias": "none",
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": "gaussian",
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 128,
+ "lora_dropout": 0.1,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "r": 128,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": "(.*(model).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$|.*(custom_text_proj).*$)",
+ "task_type": "FEATURE_EXTRACTION",
+ "use_dora": false,
+ "use_rslora": false
+ }
checkpoint-1516/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1c580aa851cabe54fc22f26cdb740deeee6dee2943016300ccfdad72b26da592
+ size 295915936
checkpoint-1516/generation_config.json ADDED
@@ -0,0 +1,14 @@
+ {
+ "attn_implementation": "flash_attention_2",
+ "bos_token_id": 151643,
+ "do_sample": true,
+ "eos_token_id": [
+ 151645,
+ 151643
+ ],
+ "pad_token_id": 151643,
+ "temperature": 0.01,
+ "top_k": 1,
+ "top_p": 0.001,
+ "transformers_version": "4.46.3"
+ }
checkpoint-1516/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6583dba1a58705127a72e49b37d7e1542fdd6fb62d256acb1705b96f7e82f303
+ size 592056816
checkpoint-1516/rng_state_0.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5b0ba13064fbf321a4eb0e1ed3fee40f0fd95acee64afbc00f47b596d37f1fe1
+ size 15920
checkpoint-1516/rng_state_1.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d57aa5e0bde1a888e7fff30fa063c46effd84d31bf93a0d16ad9cd6805c87946
+ size 15920
checkpoint-1516/rng_state_2.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:366a123cb93b9f156c870f60dd41696c26a829c68e4de7be456529ee7f0a56e4
+ size 15920
checkpoint-1516/rng_state_3.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5c85a46817fed8889369af9ec74d0548eab259ec50c8ba9d0edfcbe41fd510b9
+ size 15920
checkpoint-1516/rng_state_4.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4913a2578db3305d59ec94ed4b15383c18fba2f81b83503c62aa273015e74e39
+ size 15920
checkpoint-1516/rng_state_5.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:04cdf56e79b92c0efb8ba087f90b4de8850ec89166f10cef5840f7cbd41b8f75
+ size 15920
checkpoint-1516/rng_state_6.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3a0d9b6a689b60bbfc2cc7e2cfc0d9c50ae0087eeb6a40d9bc9af95a91502eb1
+ size 15920
checkpoint-1516/rng_state_7.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:15f91b072e1ad8a4e2cf1e6f3b9bed4795f17c286df4d6f0ef7a0afdd42c4b01
+ size 15920
checkpoint-1516/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:20ba51cc4dc1b58a3f92ee2894c92ca4536c4332a2995e91b4af3a3831fa0d08
+ size 1064
checkpoint-1516/trainer_state.json ADDED
@@ -0,0 +1,806 @@
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 4.0,
5
+ "eval_steps": 50,
6
+ "global_step": 1516,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.002638522427440633,
13
+ "eval_loss": 0.3697243332862854,
14
+ "eval_runtime": 31.4109,
15
+ "eval_samples_per_second": 63.672,
16
+ "eval_steps_per_second": 0.255,
17
+ "step": 1
18
+ },
19
+ {
20
+ "epoch": 0.052770448548812667,
21
+ "grad_norm": 0.26953125,
22
+ "learning_rate": 0.00010526315789473685,
23
+ "loss": 0.3823,
24
+ "step": 20
25
+ },
26
+ {
27
+ "epoch": 0.10554089709762533,
28
+ "grad_norm": 0.201171875,
29
+ "learning_rate": 0.0001997845988152935,
30
+ "loss": 0.2239,
31
+ "step": 40
32
+ },
33
+ {
34
+ "epoch": 0.13192612137203166,
35
+ "eval_loss": 0.11808302253484726,
36
+ "eval_runtime": 29.4538,
37
+ "eval_samples_per_second": 67.903,
38
+ "eval_steps_per_second": 0.272,
39
+ "step": 50
40
+ },
41
+ {
42
+ "epoch": 0.158311345646438,
43
+ "grad_norm": 0.1962890625,
44
+ "learning_rate": 0.00019763058696822833,
45
+ "loss": 0.1799,
46
+ "step": 60
47
+ },
48
+ {
49
+ "epoch": 0.21108179419525067,
50
+ "grad_norm": 0.1943359375,
51
+ "learning_rate": 0.0001954765751211632,
52
+ "loss": 0.1651,
53
+ "step": 80
54
+ },
55
+ {
56
+ "epoch": 0.2638522427440633,
57
+ "grad_norm": 0.2255859375,
58
+ "learning_rate": 0.00019332256327409802,
59
+ "loss": 0.1571,
60
+ "step": 100
61
+ },
62
+ {
63
+ "epoch": 0.2638522427440633,
64
+ "eval_loss": 0.09250890463590622,
65
+ "eval_runtime": 28.2273,
66
+ "eval_samples_per_second": 70.853,
67
+ "eval_steps_per_second": 0.283,
68
+ "step": 100
69
+ },
70
+ {
71
+ "epoch": 0.316622691292876,
72
+ "grad_norm": 0.2333984375,
73
+ "learning_rate": 0.00019116855142703286,
74
+ "loss": 0.1535,
75
+ "step": 120
76
+ },
77
+ {
78
+ "epoch": 0.36939313984168864,
79
+ "grad_norm": 0.1611328125,
80
+ "learning_rate": 0.00018901453957996772,
81
+ "loss": 0.1456,
82
+ "step": 140
83
+ },
84
+ {
85
+ "epoch": 0.39577836411609496,
86
+ "eval_loss": 0.08707328885793686,
87
+ "eval_runtime": 27.6259,
88
+ "eval_samples_per_second": 72.396,
89
+ "eval_steps_per_second": 0.29,
90
+ "step": 150
91
+ },
92
+ {
93
+ "epoch": 0.42216358839050133,
94
+ "grad_norm": 0.1884765625,
95
+ "learning_rate": 0.00018686052773290255,
96
+ "loss": 0.1402,
97
+ "step": 160
98
+ },
99
+ {
100
+ "epoch": 0.47493403693931396,
101
+ "grad_norm": 0.2109375,
102
+ "learning_rate": 0.0001847065158858374,
103
+ "loss": 0.142,
104
+ "step": 180
105
+ },
106
+ {
107
+ "epoch": 0.5277044854881267,
108
+ "grad_norm": 0.1533203125,
109
+ "learning_rate": 0.00018255250403877222,
110
+ "loss": 0.1318,
111
+ "step": 200
112
+ },
113
+ {
114
+ "epoch": 0.5277044854881267,
115
+ "eval_loss": 0.080934077501297,
116
+ "eval_runtime": 27.3743,
117
+ "eval_samples_per_second": 73.061,
118
+ "eval_steps_per_second": 0.292,
119
+ "step": 200
120
+ },
121
+ {
122
+ "epoch": 0.5804749340369393,
123
+ "grad_norm": 0.216796875,
124
+ "learning_rate": 0.00018039849219170706,
125
+ "loss": 0.1301,
126
+ "step": 220
127
+ },
128
+ {
129
+ "epoch": 0.633245382585752,
130
+ "grad_norm": 0.162109375,
131
+ "learning_rate": 0.0001782444803446419,
132
+ "loss": 0.1317,
133
+ "step": 240
134
+ },
135
+ {
136
+ "epoch": 0.6596306068601583,
137
+ "eval_loss": 0.0750429555773735,
138
+ "eval_runtime": 27.7505,
139
+ "eval_samples_per_second": 72.071,
140
+ "eval_steps_per_second": 0.288,
141
+ "step": 250
142
+ },
143
+ {
144
+ "epoch": 0.6860158311345647,
145
+ "grad_norm": 0.185546875,
146
+ "learning_rate": 0.00017609046849757676,
147
+ "loss": 0.1269,
148
+ "step": 260
149
+ },
150
+ {
151
+ "epoch": 0.7387862796833773,
152
+ "grad_norm": 0.203125,
153
+ "learning_rate": 0.0001739364566505116,
154
+ "loss": 0.1267,
155
+ "step": 280
156
+ },
157
+ {
158
+ "epoch": 0.7915567282321899,
159
+ "grad_norm": 0.1455078125,
160
+ "learning_rate": 0.00017178244480344642,
161
+ "loss": 0.1226,
162
+ "step": 300
163
+ },
164
+ {
165
+ "epoch": 0.7915567282321899,
166
+ "eval_loss": 0.07792137563228607,
167
+ "eval_runtime": 27.3248,
168
+ "eval_samples_per_second": 73.194,
169
+ "eval_steps_per_second": 0.293,
170
+ "step": 300
171
+ },
172
+ {
173
+ "epoch": 0.8443271767810027,
174
+ "grad_norm": 0.1630859375,
175
+ "learning_rate": 0.00016962843295638126,
176
+ "loss": 0.1222,
177
+ "step": 320
178
+ },
179
+ {
180
+ "epoch": 0.8970976253298153,
181
+ "grad_norm": 0.173828125,
182
+ "learning_rate": 0.0001674744211093161,
183
+ "loss": 0.1254,
184
+ "step": 340
185
+ },
186
+ {
187
+ "epoch": 0.9234828496042217,
188
+ "eval_loss": 0.07484881579875946,
189
+ "eval_runtime": 27.8135,
190
+ "eval_samples_per_second": 71.907,
191
+ "eval_steps_per_second": 0.288,
192
+ "step": 350
193
+ },
194
+ {
195
+ "epoch": 0.9498680738786279,
196
+ "grad_norm": 0.1728515625,
197
+ "learning_rate": 0.00016532040926225093,
198
+ "loss": 0.1177,
199
+ "step": 360
200
+ },
201
+ {
202
+ "epoch": 1.0026385224274406,
203
+ "grad_norm": 0.1220703125,
204
+ "learning_rate": 0.0001631663974151858,
205
+ "loss": 0.1207,
206
+ "step": 380
207
+ },
208
+ {
209
+ "epoch": 1.0554089709762533,
210
+ "grad_norm": 0.1591796875,
211
+ "learning_rate": 0.00016101238556812063,
212
+ "loss": 0.1046,
213
+ "step": 400
214
+ },
215
+ {
216
+ "epoch": 1.0554089709762533,
217
+ "eval_loss": 0.0715707540512085,
218
+ "eval_runtime": 27.7758,
219
+ "eval_samples_per_second": 72.005,
220
+ "eval_steps_per_second": 0.288,
221
+ "step": 400
222
+ },
223
+ {
224
+ "epoch": 1.108179419525066,
225
+ "grad_norm": 0.1142578125,
226
+ "learning_rate": 0.0001588583737210555,
227
+ "loss": 0.1041,
228
+ "step": 420
229
+ },
230
+ {
231
+ "epoch": 1.1609498680738786,
232
+ "grad_norm": 0.177734375,
233
+ "learning_rate": 0.00015670436187399032,
234
+ "loss": 0.1034,
235
+ "step": 440
236
+ },
237
+ {
238
+ "epoch": 1.187335092348285,
239
+ "eval_loss": 0.0693235993385315,
240
+ "eval_runtime": 27.7658,
241
+ "eval_samples_per_second": 72.031,
242
+ "eval_steps_per_second": 0.288,
243
+ "step": 450
244
+ },
245
+ {
246
+ "epoch": 1.2137203166226913,
247
+ "grad_norm": 0.1630859375,
248
+ "learning_rate": 0.00015455035002692516,
249
+ "loss": 0.1042,
250
+ "step": 460
251
+ },
252
+ {
253
+ "epoch": 1.266490765171504,
254
+ "grad_norm": 0.1611328125,
255
+ "learning_rate": 0.00015239633817986,
256
+ "loss": 0.1032,
257
+ "step": 480
258
+ },
259
+ {
260
+ "epoch": 1.3192612137203166,
261
+ "grad_norm": 0.169921875,
262
+ "learning_rate": 0.00015024232633279485,
263
+ "loss": 0.1021,
264
+ "step": 500
265
+ },
266
+ {
267
+ "epoch": 1.3192612137203166,
268
+ "eval_loss": 0.06579812616109848,
269
+ "eval_runtime": 27.42,
270
+ "eval_samples_per_second": 72.939,
271
+ "eval_steps_per_second": 0.292,
272
+ "step": 500
273
+ },
274
+ {
275
+ "epoch": 1.3720316622691293,
276
+ "grad_norm": 0.1611328125,
277
+ "learning_rate": 0.0001480883144857297,
278
+ "loss": 0.1041,
279
+ "step": 520
280
+ },
281
+ {
282
+ "epoch": 1.424802110817942,
283
+ "grad_norm": 0.11474609375,
284
+ "learning_rate": 0.00014593430263866452,
285
+ "loss": 0.1006,
286
+ "step": 540
287
+ },
288
+ {
289
+ "epoch": 1.4511873350923483,
290
+ "eval_loss": 0.06417644023895264,
291
+ "eval_runtime": 27.5371,
292
+ "eval_samples_per_second": 72.629,
293
+ "eval_steps_per_second": 0.291,
294
+ "step": 550
295
+ },
296
+ {
297
+ "epoch": 1.4775725593667546,
298
+ "grad_norm": 0.1259765625,
299
+ "learning_rate": 0.00014378029079159936,
300
+ "loss": 0.1001,
301
+ "step": 560
302
+ },
303
+ {
304
+ "epoch": 1.5303430079155673,
305
+ "grad_norm": 0.146484375,
306
+ "learning_rate": 0.0001416262789445342,
307
+ "loss": 0.1013,
308
+ "step": 580
309
+ },
310
+ {
311
+ "epoch": 1.58311345646438,
312
+ "grad_norm": 0.1591796875,
313
+ "learning_rate": 0.00013947226709746903,
314
+ "loss": 0.1,
315
+ "step": 600
316
+ },
317
+ {
318
+ "epoch": 1.58311345646438,
319
+ "eval_loss": 0.06583409756422043,
320
+ "eval_runtime": 28.0223,
321
+ "eval_samples_per_second": 71.372,
322
+ "eval_steps_per_second": 0.285,
323
+ "step": 600
324
+ },
325
+ {
326
+ "epoch": 1.6358839050131926,
327
+ "grad_norm": 0.1611328125,
328
+ "learning_rate": 0.0001373182552504039,
329
+ "loss": 0.1021,
330
+ "step": 620
331
+ },
332
+ {
333
+ "epoch": 1.6886543535620053,
334
+ "grad_norm": 0.14453125,
335
+ "learning_rate": 0.00013516424340333873,
336
+ "loss": 0.1002,
337
+ "step": 640
338
+ },
339
+ {
340
+ "epoch": 1.7150395778364116,
341
+ "eval_loss": 0.06498919427394867,
342
+ "eval_runtime": 28.3581,
343
+ "eval_samples_per_second": 70.527,
344
+ "eval_steps_per_second": 0.282,
345
+ "step": 650
346
+ },
347
+ {
348
+ "epoch": 1.741424802110818,
349
+ "grad_norm": 0.111328125,
350
+ "learning_rate": 0.00013301023155627356,
351
+ "loss": 0.0967,
352
+ "step": 660
353
+ },
354
+ {
355
+ "epoch": 1.7941952506596306,
356
+ "grad_norm": 0.1884765625,
357
+ "learning_rate": 0.0001308562197092084,
358
+ "loss": 0.1004,
359
+ "step": 680
360
+ },
361
+ {
362
+ "epoch": 1.8469656992084431,
363
+ "grad_norm": 0.13671875,
364
+ "learning_rate": 0.00012870220786214323,
365
+ "loss": 0.0992,
366
+ "step": 700
367
+ },
368
+ {
369
+ "epoch": 1.8469656992084431,
370
+ "eval_loss": 0.06491042673587799,
371
+ "eval_runtime": 27.748,
372
+ "eval_samples_per_second": 72.077,
373
+ "eval_steps_per_second": 0.288,
374
+ "step": 700
375
+ },
376
+ {
377
+ "epoch": 1.899736147757256,
378
+ "grad_norm": 0.15234375,
379
+ "learning_rate": 0.0001265481960150781,
380
+ "loss": 0.0967,
381
+ "step": 720
382
+ },
383
+ {
384
+ "epoch": 1.9525065963060686,
385
+ "grad_norm": 0.12451171875,
386
+ "learning_rate": 0.00012439418416801293,
387
+ "loss": 0.0956,
388
+ "step": 740
389
+ },
390
+ {
391
+ "epoch": 1.978891820580475,
392
+ "eval_loss": 0.06425958126783371,
393
+ "eval_runtime": 27.654,
394
+ "eval_samples_per_second": 72.322,
395
+ "eval_steps_per_second": 0.289,
396
+ "step": 750
397
+ },
398
+ {
399
+ "epoch": 2.005277044854881,
400
+ "grad_norm": 0.12060546875,
401
+ "learning_rate": 0.0001222401723209478,
402
+ "loss": 0.0934,
403
+ "step": 760
404
+ },
405
+ {
406
+ "epoch": 2.058047493403694,
407
+ "grad_norm": 0.171875,
408
+ "learning_rate": 0.00012008616047388261,
409
+ "loss": 0.0907,
410
+ "step": 780
411
+ },
412
+ {
413
+ "epoch": 2.1108179419525066,
414
+ "grad_norm": 0.16796875,
415
+ "learning_rate": 0.00011793214862681745,
416
+ "loss": 0.0861,
417
+ "step": 800
418
+ },
419
+ {
420
+ "epoch": 2.1108179419525066,
421
+ "eval_loss": 0.06223862245678902,
422
+ "eval_runtime": 27.4046,
423
+ "eval_samples_per_second": 72.981,
424
+ "eval_steps_per_second": 0.292,
425
+ "step": 800
426
+ },
427
+ {
428
+ "epoch": 2.163588390501319,
429
+ "grad_norm": 0.134765625,
430
+ "learning_rate": 0.0001157781367797523,
431
+ "loss": 0.0864,
432
+ "step": 820
433
+ },
434
+ {
435
+ "epoch": 2.216358839050132,
436
+ "grad_norm": 0.123046875,
437
+ "learning_rate": 0.00011362412493268713,
438
+ "loss": 0.0842,
439
+ "step": 840
440
+ },
441
+ {
442
+ "epoch": 2.242744063324538,
443
+ "eval_loss": 0.060463495552539825,
444
+ "eval_runtime": 27.4597,
445
+ "eval_samples_per_second": 72.834,
446
+ "eval_steps_per_second": 0.291,
447
+ "step": 850
448
+ },
449
+ {
450
+ "epoch": 2.2691292875989446,
451
+ "grad_norm": 0.1435546875,
452
+ "learning_rate": 0.00011147011308562199,
453
+ "loss": 0.0863,
454
+ "step": 860
455
+ },
456
+ {
457
+ "epoch": 2.321899736147757,
458
+ "grad_norm": 0.1494140625,
459
+ "learning_rate": 0.00010931610123855683,
460
+ "loss": 0.0858,
461
+ "step": 880
462
+ },
463
+ {
464
+ "epoch": 2.37467018469657,
465
+ "grad_norm": 0.1259765625,
466
+ "learning_rate": 0.00010716208939149166,
467
+ "loss": 0.0866,
468
+ "step": 900
469
+ },
470
+ {
471
+ "epoch": 2.37467018469657,
472
+ "eval_loss": 0.06099672615528107,
473
+ "eval_runtime": 27.7635,
474
+ "eval_samples_per_second": 72.037,
475
+ "eval_steps_per_second": 0.288,
476
+ "step": 900
477
+ },
478
+ {
479
+ "epoch": 2.4274406332453826,
480
+ "grad_norm": 0.1376953125,
481
+ "learning_rate": 0.0001050080775444265,
482
+ "loss": 0.0873,
483
+ "step": 920
484
+ },
485
+ {
486
+ "epoch": 2.480211081794195,
487
+ "grad_norm": 0.158203125,
488
+ "learning_rate": 0.00010285406569736133,
489
+ "loss": 0.0853,
490
+ "step": 940
491
+ },
492
+ {
493
+ "epoch": 2.5065963060686016,
494
+ "eval_loss": 0.06115744262933731,
495
+ "eval_runtime": 27.8521,
496
+ "eval_samples_per_second": 71.808,
497
+ "eval_steps_per_second": 0.287,
498
+ "step": 950
499
+ },
500
+ {
501
+ "epoch": 2.532981530343008,
502
+ "grad_norm": 0.1259765625,
503
+ "learning_rate": 0.00010070005385029618,
504
+ "loss": 0.0849,
505
+ "step": 960
506
+ },
507
+ {
508
+ "epoch": 2.5857519788918206,
509
+ "grad_norm": 0.1318359375,
510
+ "learning_rate": 9.854604200323103e-05,
511
+ "loss": 0.0814,
512
+ "step": 980
513
+ },
514
+ {
515
+ "epoch": 2.638522427440633,
516
+ "grad_norm": 0.1376953125,
517
+ "learning_rate": 9.639203015616588e-05,
518
+ "loss": 0.0864,
519
+ "step": 1000
520
+ },
521
+ {
522
+ "epoch": 2.638522427440633,
523
+ "eval_loss": 0.05968466028571129,
524
+ "eval_runtime": 27.6897,
525
+ "eval_samples_per_second": 72.229,
526
+ "eval_steps_per_second": 0.289,
527
+ "step": 1000
528
+ },
529
+ {
530
+ "epoch": 2.691292875989446,
531
+ "grad_norm": 0.16015625,
532
+ "learning_rate": 9.423801830910071e-05,
533
+ "loss": 0.0869,
534
+ "step": 1020
535
+ },
536
+ {
537
+ "epoch": 2.7440633245382586,
538
+ "grad_norm": 0.12890625,
539
+ "learning_rate": 9.208400646203555e-05,
540
+ "loss": 0.0821,
541
+ "step": 1040
542
+ },
543
+ {
544
+ "epoch": 2.7704485488126647,
545
+ "eval_loss": 0.059157080948352814,
546
+ "eval_runtime": 27.7435,
547
+ "eval_samples_per_second": 72.089,
548
+ "eval_steps_per_second": 0.288,
549
+ "step": 1050
550
+ },
551
+ {
552
+ "epoch": 2.796833773087071,
553
+ "grad_norm": 0.1337890625,
554
+ "learning_rate": 8.99299946149704e-05,
555
+ "loss": 0.0842,
556
+ "step": 1060
557
+ },
558
+ {
559
+ "epoch": 2.849604221635884,
560
+ "grad_norm": 0.1513671875,
561
+ "learning_rate": 8.777598276790523e-05,
562
+ "loss": 0.0846,
563
+ "step": 1080
564
+ },
565
+ {
566
+ "epoch": 2.9023746701846966,
567
+ "grad_norm": 0.1328125,
568
+ "learning_rate": 8.562197092084006e-05,
569
+ "loss": 0.0841,
570
+ "step": 1100
571
+ },
572
+ {
573
+ "epoch": 2.9023746701846966,
574
+ "eval_loss": 0.05879725515842438,
575
+ "eval_runtime": 27.612,
576
+ "eval_samples_per_second": 72.432,
577
+ "eval_steps_per_second": 0.29,
578
+ "step": 1100
579
+ },
580
+ {
581
+ "epoch": 2.955145118733509,
582
+ "grad_norm": 0.1455078125,
583
+ "learning_rate": 8.346795907377491e-05,
584
+ "loss": 0.0809,
585
+ "step": 1120
586
+ },
587
+ {
588
+ "epoch": 3.007915567282322,
589
+ "grad_norm": 0.1259765625,
590
+ "learning_rate": 8.131394722670975e-05,
591
+ "loss": 0.0815,
592
+ "step": 1140
593
+ },
594
+ {
595
+ "epoch": 3.034300791556728,
596
+ "eval_loss": 0.05831225588917732,
597
+ "eval_runtime": 27.6258,
598
+ "eval_samples_per_second": 72.396,
599
+ "eval_steps_per_second": 0.29,
600
+ "step": 1150
601
+ },
602
+ {
603
+ "epoch": 3.0606860158311346,
604
+ "grad_norm": 0.130859375,
605
+ "learning_rate": 7.91599353796446e-05,
606
+ "loss": 0.0793,
607
+ "step": 1160
608
+ },
609
+ {
610
+ "epoch": 3.113456464379947,
611
+ "grad_norm": 0.1435546875,
612
+ "learning_rate": 7.700592353257944e-05,
613
+ "loss": 0.0775,
614
+ "step": 1180
615
+ },
616
+ {
617
+ "epoch": 3.16622691292876,
618
+ "grad_norm": 0.1357421875,
619
+ "learning_rate": 7.485191168551428e-05,
620
+ "loss": 0.0795,
621
+ "step": 1200
622
+ },
623
+ {
624
+ "epoch": 3.16622691292876,
625
+ "eval_loss": 0.0580158606171608,
626
+ "eval_runtime": 27.9777,
627
+ "eval_samples_per_second": 71.485,
628
+ "eval_steps_per_second": 0.286,
629
+ "step": 1200
630
+ },
631
+ {
632
+ "epoch": 3.2189973614775726,
633
+ "grad_norm": 0.1220703125,
634
+ "learning_rate": 7.269789983844911e-05,
635
+ "loss": 0.0766,
636
+ "step": 1220
637
+ },
638
+ {
639
+ "epoch": 3.271767810026385,
640
+ "grad_norm": 0.1318359375,
641
+ "learning_rate": 7.054388799138396e-05,
642
+ "loss": 0.0732,
643
+ "step": 1240
644
+ },
645
+ {
646
+ "epoch": 3.2981530343007917,
647
+ "eval_loss": 0.057783834636211395,
648
+ "eval_runtime": 28.6683,
649
+ "eval_samples_per_second": 69.763,
650
+ "eval_steps_per_second": 0.279,
651
+ "step": 1250
652
+ },
653
+ {
654
+ "epoch": 3.324538258575198,
655
+ "grad_norm": 0.130859375,
656
+ "learning_rate": 6.83898761443188e-05,
657
+ "loss": 0.0754,
658
+ "step": 1260
659
+ },
660
+ {
661
+ "epoch": 3.3773087071240107,
662
+ "grad_norm": 0.1611328125,
663
+ "learning_rate": 6.623586429725363e-05,
664
+ "loss": 0.0793,
665
+ "step": 1280
666
+ },
667
+ {
668
+ "epoch": 3.430079155672823,
669
+ "grad_norm": 0.1181640625,
670
+ "learning_rate": 6.408185245018848e-05,
671
+ "loss": 0.076,
672
+ "step": 1300
673
+ },
674
+ {
675
+ "epoch": 3.430079155672823,
676
+ "eval_loss": 0.05801219865679741,
677
+ "eval_runtime": 28.2125,
678
+ "eval_samples_per_second": 70.891,
679
+ "eval_steps_per_second": 0.284,
680
+ "step": 1300
681
+ },
682
+ {
683
+ "epoch": 3.4828496042216357,
684
+ "grad_norm": 0.1611328125,
685
+ "learning_rate": 6.192784060312333e-05,
686
+ "loss": 0.0745,
687
+ "step": 1320
688
+ },
689
+ {
690
+ "epoch": 3.5356200527704487,
691
+ "grad_norm": 0.1142578125,
692
+ "learning_rate": 5.9773828756058156e-05,
693
+ "loss": 0.0766,
694
+ "step": 1340
695
+ },
696
+ {
697
+ "epoch": 3.5620052770448547,
698
+ "eval_loss": 0.05800151824951172,
699
+ "eval_runtime": 27.919,
700
+ "eval_samples_per_second": 71.636,
701
+ "eval_steps_per_second": 0.287,
702
+ "step": 1350
703
+ },
704
+ {
705
+ "epoch": 3.588390501319261,
706
+ "grad_norm": 0.140625,
707
+ "learning_rate": 5.7619816908993005e-05,
708
+ "loss": 0.0753,
709
+ "step": 1360
710
+ },
711
+ {
712
+ "epoch": 3.641160949868074,
713
+ "grad_norm": 0.1328125,
714
+ "learning_rate": 5.5465805061927846e-05,
715
+ "loss": 0.0772,
716
+ "step": 1380
717
+ },
718
+ {
719
+ "epoch": 3.6939313984168867,
720
+ "grad_norm": 0.1328125,
721
+ "learning_rate": 5.331179321486268e-05,
722
+ "loss": 0.0716,
723
+ "step": 1400
724
+ },
725
+ {
726
+ "epoch": 3.6939313984168867,
727
+ "eval_loss": 0.057653266936540604,
728
+ "eval_runtime": 28.2955,
729
+ "eval_samples_per_second": 70.683,
730
+ "eval_steps_per_second": 0.283,
731
+ "step": 1400
732
+ },
733
+ {
734
+ "epoch": 3.746701846965699,
735
+ "grad_norm": 0.1513671875,
736
+ "learning_rate": 5.115778136779753e-05,
737
+ "loss": 0.0744,
738
+ "step": 1420
739
+ },
740
+ {
741
+ "epoch": 3.7994722955145117,
742
+ "grad_norm": 0.1259765625,
743
+ "learning_rate": 4.9003769520732365e-05,
744
+ "loss": 0.0777,
745
+ "step": 1440
746
+ },
747
+ {
748
+ "epoch": 3.825857519788918,
749
+ "eval_loss": 0.05697743222117424,
750
+ "eval_runtime": 28.2563,
751
+ "eval_samples_per_second": 70.781,
752
+ "eval_steps_per_second": 0.283,
753
+ "step": 1450
754
+ },
755
+ {
756
+ "epoch": 3.8522427440633247,
757
+ "grad_norm": 0.1640625,
758
+ "learning_rate": 4.6849757673667206e-05,
759
+ "loss": 0.0736,
760
+ "step": 1460
761
+ },
762
+ {
763
+ "epoch": 3.905013192612137,
764
+ "grad_norm": 0.1318359375,
765
+ "learning_rate": 4.469574582660205e-05,
766
+ "loss": 0.0753,
767
+ "step": 1480
768
+ },
769
+ {
770
+ "epoch": 3.9577836411609497,
771
+ "grad_norm": 0.12255859375,
772
+ "learning_rate": 4.254173397953689e-05,
773
+ "loss": 0.0745,
774
+ "step": 1500
775
+ },
776
+ {
777
+ "epoch": 3.9577836411609497,
778
+ "eval_loss": 0.05676369369029999,
779
+ "eval_runtime": 27.6767,
780
+ "eval_samples_per_second": 72.263,
781
+ "eval_steps_per_second": 0.289,
782
+ "step": 1500
783
+ }
784
+ ],
785
+ "logging_steps": 20,
786
+ "max_steps": 1895,
787
+ "num_input_tokens_seen": 0,
788
+ "num_train_epochs": 5,
789
+ "save_steps": 500,
790
+ "stateful_callbacks": {
791
+ "TrainerControl": {
792
+ "args": {
793
+ "should_epoch_stop": false,
794
+ "should_evaluate": false,
795
+ "should_log": false,
796
+ "should_save": true,
797
+ "should_training_stop": false
798
+ },
799
+ "attributes": {}
800
+ }
801
+ },
802
+ "total_flos": 2.03702605821971e+19,
803
+ "train_batch_size": 128,
804
+ "trial_name": null,
805
+ "trial_params": null
806
+ }
checkpoint-1516/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:22bcb641e1380aac3e0a537740052e2d8d16eda981f3eee0acf549dc506b6fc5
+ size 5496
checkpoint-1895/adapter_config.json ADDED
@@ -0,0 +1,26 @@
+ {
+ "alpha_pattern": {},
+ "auto_mapping": null,
+ "base_model_name_or_path": "/home/pv_rwm_models/models/colqwen2-base",
+ "bias": "none",
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": "gaussian",
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 128,
+ "lora_dropout": 0.1,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "r": 128,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": "(.*(model).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$|.*(custom_text_proj).*$)",
+ "task_type": "FEATURE_EXTRACTION",
+ "use_dora": false,
+ "use_rslora": false
+ }
checkpoint-1895/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c42115acdfbac77d6ad3cd0992cac6b57ceb4fc0caab3ac9aa65716d5c31e771
+ size 295915936
checkpoint-1895/generation_config.json ADDED
@@ -0,0 +1,14 @@
+ {
+ "attn_implementation": "flash_attention_2",
+ "bos_token_id": 151643,
+ "do_sample": true,
+ "eos_token_id": [
+ 151645,
+ 151643
+ ],
+ "pad_token_id": 151643,
+ "temperature": 0.01,
+ "top_k": 1,
+ "top_p": 0.001,
+ "transformers_version": "4.46.3"
+ }
checkpoint-1895/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:983778182711adce3793bc795c03e28a3eb7b8d45483b565126796e99edd23eb
+ size 592056816
checkpoint-1895/rng_state_0.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e99e86e0a0d7a88873391e9424a84a188388f63d1a548d088fde3b4084b52f27
+ size 15920
checkpoint-1895/rng_state_1.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c8f66742b995c6c203ea5e3466df0c12dd79bc545cf32724da32e51fd7017594
+ size 15920
checkpoint-1895/rng_state_2.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d74adeb07389fe4a761c0f1acd55b81fd5d632da6a71dbea2b83be10059f25bb
+ size 15920
checkpoint-1895/rng_state_3.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8e4d2fc43d54392f39410b3801732690887609174c1340233fdc1b8a66b9895a
+ size 15920
checkpoint-1895/rng_state_4.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a733d4ac23672067544bddddc8cefba69cc71eb28ead0e731f1dfc8f81ce5b7e
+ size 15920
checkpoint-1895/rng_state_5.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:42635529ea678ebe0e9e77b10e3247c8e2e56cbbab7fe1fd3eb25c0077bf57e4
+ size 15920
checkpoint-1895/rng_state_6.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:10021518b517df6aab8ff7711512fdf39a1b8b36eb3fe6ba6aa4255e5b8d9cfb
+ size 15920
checkpoint-1895/rng_state_7.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5f50e0a8b40bdf58ea057fffbb0cd5bfe8f50970c3e5bb0485103af498a65850
+ size 15920
checkpoint-1895/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4dd0f4da151ff0a05e3ccf9e4fcb080a087bfbe15e9973146872df4755d8610f
+ size 1064
checkpoint-1895/trainer_state.json ADDED
@@ -0,0 +1,995 @@
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 5.0,
5
+ "eval_steps": 50,
6
+ "global_step": 1895,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.002638522427440633,
13
+ "eval_loss": 0.3697243332862854,
14
+ "eval_runtime": 31.4109,
15
+ "eval_samples_per_second": 63.672,
16
+ "eval_steps_per_second": 0.255,
17
+ "step": 1
18
+ },
19
+ {
20
+ "epoch": 0.052770448548812667,
21
+ "grad_norm": 0.26953125,
22
+ "learning_rate": 0.00010526315789473685,
23
+ "loss": 0.3823,
24
+ "step": 20
25
+ },
26
+ {
27
+ "epoch": 0.10554089709762533,
28
+ "grad_norm": 0.201171875,
29
+ "learning_rate": 0.0001997845988152935,
30
+ "loss": 0.2239,
31
+ "step": 40
32
+ },
33
+ {
34
+ "epoch": 0.13192612137203166,
35
+ "eval_loss": 0.11808302253484726,
36
+ "eval_runtime": 29.4538,
37
+ "eval_samples_per_second": 67.903,
38
+ "eval_steps_per_second": 0.272,
39
+ "step": 50
40
+ },
41
+ {
42
+ "epoch": 0.158311345646438,
43
+ "grad_norm": 0.1962890625,
44
+ "learning_rate": 0.00019763058696822833,
45
+ "loss": 0.1799,
46
+ "step": 60
47
+ },
48
+ {
49
+ "epoch": 0.21108179419525067,
50
+ "grad_norm": 0.1943359375,
51
+ "learning_rate": 0.0001954765751211632,
52
+ "loss": 0.1651,
53
+ "step": 80
54
+ },
55
+ {
56
+ "epoch": 0.2638522427440633,
57
+ "grad_norm": 0.2255859375,
58
+ "learning_rate": 0.00019332256327409802,
59
+ "loss": 0.1571,
60
+ "step": 100
61
+ },
62
+ {
63
+ "epoch": 0.2638522427440633,
64
+ "eval_loss": 0.09250890463590622,
65
+ "eval_runtime": 28.2273,
66
+ "eval_samples_per_second": 70.853,
67
+ "eval_steps_per_second": 0.283,
68
+ "step": 100
69
+ },
70
+ {
71
+ "epoch": 0.316622691292876,
72
+ "grad_norm": 0.2333984375,
73
+ "learning_rate": 0.00019116855142703286,
74
+ "loss": 0.1535,
75
+ "step": 120
76
+ },
77
+ {
78
+ "epoch": 0.36939313984168864,
79
+ "grad_norm": 0.1611328125,
80
+ "learning_rate": 0.00018901453957996772,
81
+ "loss": 0.1456,
82
+ "step": 140
83
+ },
84
+ {
85
+ "epoch": 0.39577836411609496,
86
+ "eval_loss": 0.08707328885793686,
87
+ "eval_runtime": 27.6259,
88
+ "eval_samples_per_second": 72.396,
89
+ "eval_steps_per_second": 0.29,
90
+ "step": 150
91
+ },
92
+ {
93
+ "epoch": 0.42216358839050133,
94
+ "grad_norm": 0.1884765625,
95
+ "learning_rate": 0.00018686052773290255,
96
+ "loss": 0.1402,
97
+ "step": 160
98
+ },
99
+ {
100
+ "epoch": 0.47493403693931396,
101
+ "grad_norm": 0.2109375,
102
+ "learning_rate": 0.0001847065158858374,
103
+ "loss": 0.142,
104
+ "step": 180
105
+ },
106
+ {
107
+ "epoch": 0.5277044854881267,
108
+ "grad_norm": 0.1533203125,
109
+ "learning_rate": 0.00018255250403877222,
110
+ "loss": 0.1318,
111
+ "step": 200
112
+ },
113
+ {
114
+ "epoch": 0.5277044854881267,
115
+ "eval_loss": 0.080934077501297,
116
+ "eval_runtime": 27.3743,
117
+ "eval_samples_per_second": 73.061,
118
+ "eval_steps_per_second": 0.292,
119
+ "step": 200
120
+ },
121
+ {
122
+ "epoch": 0.5804749340369393,
123
+ "grad_norm": 0.216796875,
124
+ "learning_rate": 0.00018039849219170706,
125
+ "loss": 0.1301,
126
+ "step": 220
127
+ },
128
+ {
129
+ "epoch": 0.633245382585752,
130
+ "grad_norm": 0.162109375,
131
+ "learning_rate": 0.0001782444803446419,
132
+ "loss": 0.1317,
133
+ "step": 240
134
+ },
135
+ {
136
+ "epoch": 0.6596306068601583,
137
+ "eval_loss": 0.0750429555773735,
138
+ "eval_runtime": 27.7505,
139
+ "eval_samples_per_second": 72.071,
140
+ "eval_steps_per_second": 0.288,
141
+ "step": 250
142
+ },
143
+ {
144
+ "epoch": 0.6860158311345647,
145
+ "grad_norm": 0.185546875,
146
+ "learning_rate": 0.00017609046849757676,
147
+ "loss": 0.1269,
148
+ "step": 260
149
+ },
150
+ {
151
+ "epoch": 0.7387862796833773,
152
+ "grad_norm": 0.203125,
153
+ "learning_rate": 0.0001739364566505116,
154
+ "loss": 0.1267,
155
+ "step": 280
156
+ },
157
+ {
158
+ "epoch": 0.7915567282321899,
159
+ "grad_norm": 0.1455078125,
160
+ "learning_rate": 0.00017178244480344642,
161
+ "loss": 0.1226,
162
+ "step": 300
163
+ },
164
+ {
165
+ "epoch": 0.7915567282321899,
166
+ "eval_loss": 0.07792137563228607,
167
+ "eval_runtime": 27.3248,
168
+ "eval_samples_per_second": 73.194,
169
+ "eval_steps_per_second": 0.293,
170
+ "step": 300
171
+ },
172
+ {
173
+ "epoch": 0.8443271767810027,
174
+ "grad_norm": 0.1630859375,
175
+ "learning_rate": 0.00016962843295638126,
176
+ "loss": 0.1222,
177
+ "step": 320
178
+ },
179
+ {
180
+ "epoch": 0.8970976253298153,
181
+ "grad_norm": 0.173828125,
182
+ "learning_rate": 0.0001674744211093161,
183
+ "loss": 0.1254,
184
+ "step": 340
185
+ },
186
+ {
187
+ "epoch": 0.9234828496042217,
188
+ "eval_loss": 0.07484881579875946,
189
+ "eval_runtime": 27.8135,
190
+ "eval_samples_per_second": 71.907,
191
+ "eval_steps_per_second": 0.288,
192
+ "step": 350
193
+ },
194
+ {
195
+ "epoch": 0.9498680738786279,
196
+ "grad_norm": 0.1728515625,
197
+ "learning_rate": 0.00016532040926225093,
198
+ "loss": 0.1177,
199
+ "step": 360
200
+ },
201
+ {
202
+ "epoch": 1.0026385224274406,
203
+ "grad_norm": 0.1220703125,
204
+ "learning_rate": 0.0001631663974151858,
205
+ "loss": 0.1207,
206
+ "step": 380
207
+ },
208
+ {
209
+ "epoch": 1.0554089709762533,
210
+ "grad_norm": 0.1591796875,
211
+ "learning_rate": 0.00016101238556812063,
212
+ "loss": 0.1046,
213
+ "step": 400
214
+ },
215
+ {
216
+ "epoch": 1.0554089709762533,
217
+ "eval_loss": 0.0715707540512085,
218
+ "eval_runtime": 27.7758,
219
+ "eval_samples_per_second": 72.005,
220
+ "eval_steps_per_second": 0.288,
221
+ "step": 400
222
+ },
223
+ {
224
+ "epoch": 1.108179419525066,
225
+ "grad_norm": 0.1142578125,
226
+ "learning_rate": 0.0001588583737210555,
227
+ "loss": 0.1041,
228
+ "step": 420
229
+ },
230
+ {
231
+ "epoch": 1.1609498680738786,
232
+ "grad_norm": 0.177734375,
233
+ "learning_rate": 0.00015670436187399032,
234
+ "loss": 0.1034,
235
+ "step": 440
236
+ },
237
+ {
238
+ "epoch": 1.187335092348285,
239
+ "eval_loss": 0.0693235993385315,
240
+ "eval_runtime": 27.7658,
241
+ "eval_samples_per_second": 72.031,
242
+ "eval_steps_per_second": 0.288,
243
+ "step": 450
244
+ },
245
+ {
246
+ "epoch": 1.2137203166226913,
247
+ "grad_norm": 0.1630859375,
248
+ "learning_rate": 0.00015455035002692516,
249
+ "loss": 0.1042,
250
+ "step": 460
251
+ },
252
+ {
253
+ "epoch": 1.266490765171504,
254
+ "grad_norm": 0.1611328125,
255
+ "learning_rate": 0.00015239633817986,
256
+ "loss": 0.1032,
257
+ "step": 480
258
+ },
259
+ {
260
+ "epoch": 1.3192612137203166,
261
+ "grad_norm": 0.169921875,
262
+ "learning_rate": 0.00015024232633279485,
263
+ "loss": 0.1021,
264
+ "step": 500
265
+ },
266
+ {
267
+ "epoch": 1.3192612137203166,
268
+ "eval_loss": 0.06579812616109848,
269
+ "eval_runtime": 27.42,
270
+ "eval_samples_per_second": 72.939,
271
+ "eval_steps_per_second": 0.292,
272
+ "step": 500
273
+ },
274
+ {
275
+ "epoch": 1.3720316622691293,
276
+ "grad_norm": 0.1611328125,
277
+ "learning_rate": 0.0001480883144857297,
278
+ "loss": 0.1041,
279
+ "step": 520
280
+ },
281
+ {
282
+ "epoch": 1.424802110817942,
283
+ "grad_norm": 0.11474609375,
284
+ "learning_rate": 0.00014593430263866452,
285
+ "loss": 0.1006,
286
+ "step": 540
287
+ },
288
+ {
289
+ "epoch": 1.4511873350923483,
290
+ "eval_loss": 0.06417644023895264,
291
+ "eval_runtime": 27.5371,
292
+ "eval_samples_per_second": 72.629,
293
+ "eval_steps_per_second": 0.291,
294
+ "step": 550
295
+ },
296
+ {
297
+ "epoch": 1.4775725593667546,
298
+ "grad_norm": 0.1259765625,
299
+ "learning_rate": 0.00014378029079159936,
300
+ "loss": 0.1001,
301
+ "step": 560
302
+ },
303
+ {
304
+ "epoch": 1.5303430079155673,
305
+ "grad_norm": 0.146484375,
306
+ "learning_rate": 0.0001416262789445342,
307
+ "loss": 0.1013,
308
+ "step": 580
309
+ },
310
+ {
311
+ "epoch": 1.58311345646438,
312
+ "grad_norm": 0.1591796875,
313
+ "learning_rate": 0.00013947226709746903,
314
+ "loss": 0.1,
315
+ "step": 600
316
+ },
317
+ {
318
+ "epoch": 1.58311345646438,
319
+ "eval_loss": 0.06583409756422043,
320
+ "eval_runtime": 28.0223,
321
+ "eval_samples_per_second": 71.372,
322
+ "eval_steps_per_second": 0.285,
323
+ "step": 600
324
+ },
325
+ {
326
+ "epoch": 1.6358839050131926,
327
+ "grad_norm": 0.1611328125,
328
+ "learning_rate": 0.0001373182552504039,
329
+ "loss": 0.1021,
330
+ "step": 620
331
+ },
332
+ {
333
+ "epoch": 1.6886543535620053,
334
+ "grad_norm": 0.14453125,
335
+ "learning_rate": 0.00013516424340333873,
336
+ "loss": 0.1002,
337
+ "step": 640
338
+ },
339
+ {
340
+ "epoch": 1.7150395778364116,
341
+ "eval_loss": 0.06498919427394867,
342
+ "eval_runtime": 28.3581,
343
+ "eval_samples_per_second": 70.527,
344
+ "eval_steps_per_second": 0.282,
345
+ "step": 650
346
+ },
347
+ {
348
+ "epoch": 1.741424802110818,
349
+ "grad_norm": 0.111328125,
350
+ "learning_rate": 0.00013301023155627356,
351
+ "loss": 0.0967,
352
+ "step": 660
353
+ },
354
+ {
355
+ "epoch": 1.7941952506596306,
356
+ "grad_norm": 0.1884765625,
357
+ "learning_rate": 0.0001308562197092084,
358
+ "loss": 0.1004,
359
+ "step": 680
360
+ },
361
+ {
362
+ "epoch": 1.8469656992084431,
363
+ "grad_norm": 0.13671875,
364
+ "learning_rate": 0.00012870220786214323,
365
+ "loss": 0.0992,
366
+ "step": 700
367
+ },
368
+ {
369
+ "epoch": 1.8469656992084431,
370
+ "eval_loss": 0.06491042673587799,
371
+ "eval_runtime": 27.748,
372
+ "eval_samples_per_second": 72.077,
373
+ "eval_steps_per_second": 0.288,
374
+ "step": 700
375
+ },
376
+ {
377
+ "epoch": 1.899736147757256,
378
+ "grad_norm": 0.15234375,
379
+ "learning_rate": 0.0001265481960150781,
380
+ "loss": 0.0967,
381
+ "step": 720
382
+ },
383
+ {
384
+ "epoch": 1.9525065963060686,
385
+ "grad_norm": 0.12451171875,
386
+ "learning_rate": 0.00012439418416801293,
387
+ "loss": 0.0956,
388
+ "step": 740
389
+ },
390
+ {
391
+ "epoch": 1.978891820580475,
392
+ "eval_loss": 0.06425958126783371,
393
+ "eval_runtime": 27.654,
394
+ "eval_samples_per_second": 72.322,
395
+ "eval_steps_per_second": 0.289,
396
+ "step": 750
397
+ },
398
+ {
399
+ "epoch": 2.005277044854881,
400
+ "grad_norm": 0.12060546875,
401
+ "learning_rate": 0.0001222401723209478,
402
+ "loss": 0.0934,
403
+ "step": 760
404
+ },
405
+ {
406
+ "epoch": 2.058047493403694,
407
+ "grad_norm": 0.171875,
408
+ "learning_rate": 0.00012008616047388261,
409
+ "loss": 0.0907,
410
+ "step": 780
411
+ },
412
+ {
413
+ "epoch": 2.1108179419525066,
414
+ "grad_norm": 0.16796875,
415
+ "learning_rate": 0.00011793214862681745,
416
+ "loss": 0.0861,
417
+ "step": 800
418
+ },
419
+ {
420
+ "epoch": 2.1108179419525066,
421
+ "eval_loss": 0.06223862245678902,
422
+ "eval_runtime": 27.4046,
423
+ "eval_samples_per_second": 72.981,
424
+ "eval_steps_per_second": 0.292,
425
+ "step": 800
426
+ },
427
+ {
428
+ "epoch": 2.163588390501319,
429
+ "grad_norm": 0.134765625,
430
+ "learning_rate": 0.0001157781367797523,
431
+ "loss": 0.0864,
432
+ "step": 820
433
+ },
434
+ {
435
+ "epoch": 2.216358839050132,
436
+ "grad_norm": 0.123046875,
437
+ "learning_rate": 0.00011362412493268713,
438
+ "loss": 0.0842,
439
+ "step": 840
440
+ },
441
+ {
442
+ "epoch": 2.242744063324538,
443
+ "eval_loss": 0.060463495552539825,
444
+ "eval_runtime": 27.4597,
445
+ "eval_samples_per_second": 72.834,
446
+ "eval_steps_per_second": 0.291,
447
+ "step": 850
448
+ },
449
+ {
450
+ "epoch": 2.2691292875989446,
451
+ "grad_norm": 0.1435546875,
452
+ "learning_rate": 0.00011147011308562199,
453
+ "loss": 0.0863,
454
+ "step": 860
455
+ },
456
+ {
457
+ "epoch": 2.321899736147757,
458
+ "grad_norm": 0.1494140625,
459
+ "learning_rate": 0.00010931610123855683,
460
+ "loss": 0.0858,
461
+ "step": 880
462
+ },
463
+ {
464
+ "epoch": 2.37467018469657,
465
+ "grad_norm": 0.1259765625,
466
+ "learning_rate": 0.00010716208939149166,
467
+ "loss": 0.0866,
468
+ "step": 900
469
+ },
470
+ {
471
+ "epoch": 2.37467018469657,
472
+ "eval_loss": 0.06099672615528107,
473
+ "eval_runtime": 27.7635,
474
+ "eval_samples_per_second": 72.037,
475
+ "eval_steps_per_second": 0.288,
476
+ "step": 900
477
+ },
478
+ {
479
+ "epoch": 2.4274406332453826,
480
+ "grad_norm": 0.1376953125,
481
+ "learning_rate": 0.0001050080775444265,
482
+ "loss": 0.0873,
483
+ "step": 920
484
+ },
485
+ {
486
+ "epoch": 2.480211081794195,
487
+ "grad_norm": 0.158203125,
488
+ "learning_rate": 0.00010285406569736133,
489
+ "loss": 0.0853,
490
+ "step": 940
491
+ },
492
+ {
493
+ "epoch": 2.5065963060686016,
494
+ "eval_loss": 0.06115744262933731,
495
+ "eval_runtime": 27.8521,
496
+ "eval_samples_per_second": 71.808,
497
+ "eval_steps_per_second": 0.287,
498
+ "step": 950
499
+ },
500
+ {
501
+ "epoch": 2.532981530343008,
502
+ "grad_norm": 0.1259765625,
503
+ "learning_rate": 0.00010070005385029618,
504
+ "loss": 0.0849,
505
+ "step": 960
506
+ },
507
+ {
508
+ "epoch": 2.5857519788918206,
509
+ "grad_norm": 0.1318359375,
510
+ "learning_rate": 9.854604200323103e-05,
511
+ "loss": 0.0814,
512
+ "step": 980
513
+ },
514
+ {
515
+ "epoch": 2.638522427440633,
516
+ "grad_norm": 0.1376953125,
517
+ "learning_rate": 9.639203015616588e-05,
518
+ "loss": 0.0864,
519
+ "step": 1000
520
+ },
521
+ {
522
+ "epoch": 2.638522427440633,
523
+ "eval_loss": 0.05968466028571129,
524
+ "eval_runtime": 27.6897,
525
+ "eval_samples_per_second": 72.229,
526
+ "eval_steps_per_second": 0.289,
527
+ "step": 1000
528
+ },
529
+ {
530
+ "epoch": 2.691292875989446,
531
+ "grad_norm": 0.16015625,
532
+ "learning_rate": 9.423801830910071e-05,
533
+ "loss": 0.0869,
534
+ "step": 1020
535
+ },
536
+ {
537
+ "epoch": 2.7440633245382586,
538
+ "grad_norm": 0.12890625,
539
+ "learning_rate": 9.208400646203555e-05,
540
+ "loss": 0.0821,
541
+ "step": 1040
542
+ },
543
+ {
544
+ "epoch": 2.7704485488126647,
545
+ "eval_loss": 0.059157080948352814,
546
+ "eval_runtime": 27.7435,
547
+ "eval_samples_per_second": 72.089,
548
+ "eval_steps_per_second": 0.288,
549
+ "step": 1050
550
+ },
551
+ {
552
+ "epoch": 2.796833773087071,
553
+ "grad_norm": 0.1337890625,
554
+ "learning_rate": 8.99299946149704e-05,
555
+ "loss": 0.0842,
556
+ "step": 1060
557
+ },
558
+ {
559
+ "epoch": 2.849604221635884,
560
+ "grad_norm": 0.1513671875,
561
+ "learning_rate": 8.777598276790523e-05,
562
+ "loss": 0.0846,
563
+ "step": 1080
564
+ },
565
+ {
566
+ "epoch": 2.9023746701846966,
567
+ "grad_norm": 0.1328125,
568
+ "learning_rate": 8.562197092084006e-05,
569
+ "loss": 0.0841,
570
+ "step": 1100
571
+ },
572
+ {
573
+ "epoch": 2.9023746701846966,
574
+ "eval_loss": 0.05879725515842438,
575
+ "eval_runtime": 27.612,
576
+ "eval_samples_per_second": 72.432,
577
+ "eval_steps_per_second": 0.29,
578
+ "step": 1100
579
+ },
580
+ {
581
+ "epoch": 2.955145118733509,
582
+ "grad_norm": 0.1455078125,
583
+ "learning_rate": 8.346795907377491e-05,
584
+ "loss": 0.0809,
585
+ "step": 1120
586
+ },
587
+ {
588
+ "epoch": 3.007915567282322,
589
+ "grad_norm": 0.1259765625,
590
+ "learning_rate": 8.131394722670975e-05,
591
+ "loss": 0.0815,
592
+ "step": 1140
593
+ },
594
+ {
595
+ "epoch": 3.034300791556728,
596
+ "eval_loss": 0.05831225588917732,
597
+ "eval_runtime": 27.6258,
598
+ "eval_samples_per_second": 72.396,
599
+ "eval_steps_per_second": 0.29,
600
+ "step": 1150
601
+ },
602
+ {
603
+ "epoch": 3.0606860158311346,
604
+ "grad_norm": 0.130859375,
605
+ "learning_rate": 7.91599353796446e-05,
606
+ "loss": 0.0793,
607
+ "step": 1160
608
+ },
609
+ {
610
+ "epoch": 3.113456464379947,
611
+ "grad_norm": 0.1435546875,
612
+ "learning_rate": 7.700592353257944e-05,
613
+ "loss": 0.0775,
614
+ "step": 1180
615
+ },
616
+ {
617
+ "epoch": 3.16622691292876,
618
+ "grad_norm": 0.1357421875,
619
+ "learning_rate": 7.485191168551428e-05,
620
+ "loss": 0.0795,
621
+ "step": 1200
622
+ },
623
+ {
624
+ "epoch": 3.16622691292876,
625
+ "eval_loss": 0.0580158606171608,
626
+ "eval_runtime": 27.9777,
627
+ "eval_samples_per_second": 71.485,
628
+ "eval_steps_per_second": 0.286,
629
+ "step": 1200
630
+ },
631
+ {
632
+ "epoch": 3.2189973614775726,
633
+ "grad_norm": 0.1220703125,
634
+ "learning_rate": 7.269789983844911e-05,
635
+ "loss": 0.0766,
636
+ "step": 1220
637
+ },
638
+ {
639
+ "epoch": 3.271767810026385,
640
+ "grad_norm": 0.1318359375,
641
+ "learning_rate": 7.054388799138396e-05,
642
+ "loss": 0.0732,
643
+ "step": 1240
644
+ },
645
+ {
646
+ "epoch": 3.2981530343007917,
647
+ "eval_loss": 0.057783834636211395,
648
+ "eval_runtime": 28.6683,
649
+ "eval_samples_per_second": 69.763,
650
+ "eval_steps_per_second": 0.279,
651
+ "step": 1250
652
+ },
653
+ {
654
+ "epoch": 3.324538258575198,
655
+ "grad_norm": 0.130859375,
656
+ "learning_rate": 6.83898761443188e-05,
657
+ "loss": 0.0754,
658
+ "step": 1260
659
+ },
660
+ {
661
+ "epoch": 3.3773087071240107,
662
+ "grad_norm": 0.1611328125,
663
+ "learning_rate": 6.623586429725363e-05,
664
+ "loss": 0.0793,
665
+ "step": 1280
666
+ },
667
+ {
668
+ "epoch": 3.430079155672823,
669
+ "grad_norm": 0.1181640625,
670
+ "learning_rate": 6.408185245018848e-05,
671
+ "loss": 0.076,
672
+ "step": 1300
673
+ },
674
+ {
675
+ "epoch": 3.430079155672823,
676
+ "eval_loss": 0.05801219865679741,
677
+ "eval_runtime": 28.2125,
678
+ "eval_samples_per_second": 70.891,
679
+ "eval_steps_per_second": 0.284,
680
+ "step": 1300
681
+ },
682
+ {
683
+ "epoch": 3.4828496042216357,
684
+ "grad_norm": 0.1611328125,
685
+ "learning_rate": 6.192784060312333e-05,
686
+ "loss": 0.0745,
687
+ "step": 1320
688
+ },
689
+ {
690
+ "epoch": 3.5356200527704487,
691
+ "grad_norm": 0.1142578125,
692
+ "learning_rate": 5.9773828756058156e-05,
693
+ "loss": 0.0766,
694
+ "step": 1340
695
+ },
696
+ {
697
+ "epoch": 3.5620052770448547,
698
+ "eval_loss": 0.05800151824951172,
699
+ "eval_runtime": 27.919,
700
+ "eval_samples_per_second": 71.636,
701
+ "eval_steps_per_second": 0.287,
702
+ "step": 1350
703
+ },
704
+ {
705
+ "epoch": 3.588390501319261,
706
+ "grad_norm": 0.140625,
707
+ "learning_rate": 5.7619816908993005e-05,
708
+ "loss": 0.0753,
709
+ "step": 1360
710
+ },
711
+ {
712
+ "epoch": 3.641160949868074,
713
+ "grad_norm": 0.1328125,
714
+ "learning_rate": 5.5465805061927846e-05,
715
+ "loss": 0.0772,
716
+ "step": 1380
717
+ },
718
+ {
719
+ "epoch": 3.6939313984168867,
720
+ "grad_norm": 0.1328125,
721
+ "learning_rate": 5.331179321486268e-05,
722
+ "loss": 0.0716,
723
+ "step": 1400
724
+ },
725
+ {
726
+ "epoch": 3.6939313984168867,
727
+ "eval_loss": 0.057653266936540604,
728
+ "eval_runtime": 28.2955,
729
+ "eval_samples_per_second": 70.683,
730
+ "eval_steps_per_second": 0.283,
731
+ "step": 1400
732
+ },
733
+ {
734
+ "epoch": 3.746701846965699,
735
+ "grad_norm": 0.1513671875,
736
+ "learning_rate": 5.115778136779753e-05,
737
+ "loss": 0.0744,
738
+ "step": 1420
739
+ },
740
+ {
741
+ "epoch": 3.7994722955145117,
742
+ "grad_norm": 0.1259765625,
743
+ "learning_rate": 4.9003769520732365e-05,
744
+ "loss": 0.0777,
745
+ "step": 1440
746
+ },
747
+ {
748
+ "epoch": 3.825857519788918,
749
+ "eval_loss": 0.05697743222117424,
750
+ "eval_runtime": 28.2563,
751
+ "eval_samples_per_second": 70.781,
752
+ "eval_steps_per_second": 0.283,
753
+ "step": 1450
754
+ },
755
+ {
756
+ "epoch": 3.8522427440633247,
757
+ "grad_norm": 0.1640625,
758
+ "learning_rate": 4.6849757673667206e-05,
759
+ "loss": 0.0736,
760
+ "step": 1460
761
+ },
762
+ {
763
+ "epoch": 3.905013192612137,
764
+ "grad_norm": 0.1318359375,
765
+ "learning_rate": 4.469574582660205e-05,
766
+ "loss": 0.0753,
767
+ "step": 1480
768
+ },
769
+ {
770
+ "epoch": 3.9577836411609497,
771
+ "grad_norm": 0.12255859375,
772
+ "learning_rate": 4.254173397953689e-05,
773
+ "loss": 0.0745,
774
+ "step": 1500
775
+ },
776
+ {
777
+ "epoch": 3.9577836411609497,
778
+ "eval_loss": 0.05676369369029999,
779
+ "eval_runtime": 27.6767,
780
+ "eval_samples_per_second": 72.263,
781
+ "eval_steps_per_second": 0.289,
782
+ "step": 1500
783
+ },
784
+ {
785
+ "epoch": 4.010554089709762,
786
+ "grad_norm": 0.1376953125,
787
+ "learning_rate": 4.038772213247173e-05,
788
+ "loss": 0.0753,
789
+ "step": 1520
790
+ },
791
+ {
792
+ "epoch": 4.063324538258575,
793
+ "grad_norm": 0.1005859375,
794
+ "learning_rate": 3.823371028540657e-05,
795
+ "loss": 0.0742,
796
+ "step": 1540
797
+ },
798
+ {
799
+ "epoch": 4.089709762532982,
800
+ "eval_loss": 0.056621015071868896,
801
+ "eval_runtime": 27.6095,
802
+ "eval_samples_per_second": 72.439,
803
+ "eval_steps_per_second": 0.29,
804
+ "step": 1550
805
+ },
806
+ {
807
+ "epoch": 4.116094986807388,
808
+ "grad_norm": 0.1298828125,
809
+ "learning_rate": 3.6079698438341414e-05,
810
+ "loss": 0.0704,
811
+ "step": 1560
812
+ },
813
+ {
814
+ "epoch": 4.1688654353562,
815
+ "grad_norm": 0.1083984375,
816
+ "learning_rate": 3.392568659127625e-05,
817
+ "loss": 0.0719,
818
+ "step": 1580
819
+ },
820
+ {
821
+ "epoch": 4.221635883905013,
822
+ "grad_norm": 0.1337890625,
823
+ "learning_rate": 3.177167474421109e-05,
824
+ "loss": 0.075,
825
+ "step": 1600
826
+ },
827
+ {
828
+ "epoch": 4.221635883905013,
829
+ "eval_loss": 0.05666119232773781,
830
+ "eval_runtime": 28.0948,
831
+ "eval_samples_per_second": 71.188,
832
+ "eval_steps_per_second": 0.285,
833
+ "step": 1600
834
+ },
835
+ {
836
+ "epoch": 4.274406332453826,
837
+ "grad_norm": 0.1142578125,
838
+ "learning_rate": 2.9617662897145936e-05,
839
+ "loss": 0.0709,
840
+ "step": 1620
841
+ },
842
+ {
843
+ "epoch": 4.327176781002638,
844
+ "grad_norm": 0.134765625,
845
+ "learning_rate": 2.7463651050080774e-05,
846
+ "loss": 0.0752,
847
+ "step": 1640
848
+ },
849
+ {
850
+ "epoch": 4.353562005277045,
851
+ "eval_loss": 0.05666811391711235,
852
+ "eval_runtime": 28.2473,
853
+ "eval_samples_per_second": 70.803,
854
+ "eval_steps_per_second": 0.283,
855
+ "step": 1650
856
+ },
857
+ {
858
+ "epoch": 4.379947229551451,
859
+ "grad_norm": 0.1591796875,
860
+ "learning_rate": 2.5309639203015616e-05,
861
+ "loss": 0.0719,
862
+ "step": 1660
863
+ },
864
+ {
865
+ "epoch": 4.432717678100264,
866
+ "grad_norm": 0.138671875,
867
+ "learning_rate": 2.315562735595046e-05,
868
+ "loss": 0.0723,
869
+ "step": 1680
870
+ },
871
+ {
872
+ "epoch": 4.485488126649076,
873
+ "grad_norm": 0.095703125,
874
+ "learning_rate": 2.10016155088853e-05,
875
+ "loss": 0.0712,
876
+ "step": 1700
877
+ },
878
+ {
879
+ "epoch": 4.485488126649076,
880
+ "eval_loss": 0.05663124471902847,
881
+ "eval_runtime": 27.7726,
882
+ "eval_samples_per_second": 72.013,
883
+ "eval_steps_per_second": 0.288,
884
+ "step": 1700
885
+ },
886
+ {
887
+ "epoch": 4.538258575197889,
888
+ "grad_norm": 0.10205078125,
889
+ "learning_rate": 1.884760366182014e-05,
890
+ "loss": 0.072,
891
+ "step": 1720
892
+ },
893
+ {
894
+ "epoch": 4.591029023746702,
895
+ "grad_norm": 0.140625,
896
+ "learning_rate": 1.6693591814754982e-05,
897
+ "loss": 0.0723,
898
+ "step": 1740
899
+ },
900
+ {
901
+ "epoch": 4.617414248021108,
902
+ "eval_loss": 0.056529395282268524,
903
+ "eval_runtime": 27.9804,
904
+ "eval_samples_per_second": 71.479,
905
+ "eval_steps_per_second": 0.286,
906
+ "step": 1750
907
+ },
908
+ {
909
+ "epoch": 4.643799472295514,
910
+ "grad_norm": 0.1484375,
911
+ "learning_rate": 1.4539579967689822e-05,
912
+ "loss": 0.072,
913
+ "step": 1760
914
+ },
915
+ {
916
+ "epoch": 4.696569920844327,
917
+ "grad_norm": 0.1650390625,
918
+ "learning_rate": 1.2385568120624664e-05,
919
+ "loss": 0.074,
920
+ "step": 1780
921
+ },
922
+ {
923
+ "epoch": 4.74934036939314,
924
+ "grad_norm": 0.0986328125,
925
+ "learning_rate": 1.0231556273559504e-05,
926
+ "loss": 0.0722,
927
+ "step": 1800
928
+ },
929
+ {
930
+ "epoch": 4.74934036939314,
931
+ "eval_loss": 0.056551240384578705,
932
+ "eval_runtime": 28.2737,
933
+ "eval_samples_per_second": 70.737,
934
+ "eval_steps_per_second": 0.283,
935
+ "step": 1800
936
+ },
937
+ {
938
+ "epoch": 4.802110817941952,
939
+ "grad_norm": 0.10693359375,
940
+ "learning_rate": 8.077544426494346e-06,
941
+ "loss": 0.0693,
942
+ "step": 1820
943
+ },
944
+ {
945
+ "epoch": 4.854881266490765,
946
+ "grad_norm": 0.1416015625,
947
+ "learning_rate": 5.923532579429187e-06,
948
+ "loss": 0.0711,
949
+ "step": 1840
950
+ },
951
+ {
952
+ "epoch": 4.881266490765172,
953
+ "eval_loss": 0.05647709220647812,
954
+ "eval_runtime": 28.1268,
955
+ "eval_samples_per_second": 71.106,
956
+ "eval_steps_per_second": 0.284,
957
+ "step": 1850
958
+ },
959
+ {
960
+ "epoch": 4.907651715039578,
961
+ "grad_norm": 0.0849609375,
962
+ "learning_rate": 3.7695207323640284e-06,
963
+ "loss": 0.0714,
964
+ "step": 1860
965
+ },
966
+ {
967
+ "epoch": 4.96042216358839,
968
+ "grad_norm": 0.1396484375,
969
+ "learning_rate": 1.6155088852988692e-06,
970
+ "loss": 0.0727,
971
+ "step": 1880
972
+ }
973
+ ],
974
+ "logging_steps": 20,
975
+ "max_steps": 1895,
976
+ "num_input_tokens_seen": 0,
977
+ "num_train_epochs": 5,
978
+ "save_steps": 500,
979
+ "stateful_callbacks": {
980
+ "TrainerControl": {
981
+ "args": {
982
+ "should_epoch_stop": false,
983
+ "should_evaluate": false,
984
+ "should_log": false,
985
+ "should_save": true,
986
+ "should_training_stop": true
987
+ },
988
+ "attributes": {}
989
+ }
990
+ },
991
+ "total_flos": 2.546294970331955e+19,
992
+ "train_batch_size": 128,
993
+ "trial_name": null,
994
+ "trial_params": null
995
+ }
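The trainer_state.json added above is the Hugging Face Trainer's log: training loss, gradient norm and learning rate every 20 steps, and eval_loss every 50 steps, over 1895 steps (5 epochs). A minimal sketch for pulling the eval-loss curve back out of it follows; only the standard-library json module is needed, and the 2e-4 peak / 38-step warmup figures in the comment are inferred from the logged values, not stated in the file.

# Sketch only: parse the state file shown above and extract the eval-loss curve.
import json

with open("checkpoint-1895/trainer_state.json") as f:
    state = json.load(f)

evals = [(e["step"], e["eval_loss"]) for e in state["log_history"] if "eval_loss" in e]
print(evals[0], evals[-1])  # (1, 0.3697...) ... (1850, 0.05647...)

# The logged learning rates are consistent with a linear schedule: 2e-4 peak,
# ~38 warmup steps, decay to 0 at max_steps=1895 (inferred, not read from the file).
peak, warmup, max_steps = 2e-4, 38, state["max_steps"]

def implied_lr(step):
    if step < warmup:
        return peak * step / warmup
    return peak * (max_steps - step) / (max_steps - warmup)

print(implied_lr(40))  # ~1.9978e-4, matching the value logged at step 40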
checkpoint-1895/training_args.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:22bcb641e1380aac3e0a537740052e2d8d16eda981f3eee0acf549dc506b6fc5
3
+ size 5496
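training_args.bin is the serialized TrainingArguments object that the Trainer saves next to every checkpoint (hence the small 5496-byte size). A hedged sketch of restoring it, assuming PyTorch is installed:

# Sketch only: training_args.bin is a pickled Python object, so recent PyTorch
# versions need weights_only=False to load it.
import torch

args = torch.load("checkpoint-1895/training_args.bin", weights_only=False)
print(args.learning_rate, args.num_train_epochs, args.per_device_train_batch_size)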
generation_config.json ADDED
@@ -0,0 +1,14 @@
1
+ {
2
+ "attn_implementation": "flash_attention_2",
3
+ "bos_token_id": 151643,
4
+ "do_sample": true,
5
+ "eos_token_id": [
6
+ 151645,
7
+ 151643
8
+ ],
9
+ "pad_token_id": 151643,
10
+ "temperature": 0.01,
11
+ "top_k": 1,
12
+ "top_p": 0.001,
13
+ "transformers_version": "4.46.3"
14
+ }
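The generation_config.json above pins decoding to a near-greedy regime: sampling is enabled, but temperature 0.01, top_k 1 and top_p 0.001 leave essentially one candidate token, and bos/eos/pad are the base model's special-token ids. A minimal sketch that builds the equivalent config programmatically (transformers is assumed; attn_implementation is omitted because it is normally a model-loading option rather than a standard GenerationConfig field):

# Sketch only: the same decoding settings as the JSON above, built in code.
from transformers import GenerationConfig

gen_config = GenerationConfig(
    do_sample=True,
    temperature=0.01,
    top_k=1,
    top_p=0.001,
    bos_token_id=151643,
    eos_token_id=[151645, 151643],
    pad_token_id=151643,
)
print(gen_config)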