robertou2 commited on
Commit
d1e7d04
·
verified ·
1 Parent(s): 12af06e

Upload folder using huggingface_hub

Browse files
adapter_config.json CHANGED
@@ -20,10 +20,10 @@
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
23
- "qkv_proj",
24
- "down_proj",
25
  "o_proj",
26
- "gate_up_proj"
 
 
27
  ],
28
  "task_type": "CAUSAL_LM",
29
  "use_dora": false,
 
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
 
 
23
  "o_proj",
24
+ "gate_up_proj",
25
+ "down_proj",
26
+ "qkv_proj"
27
  ],
28
  "task_type": "CAUSAL_LM",
29
  "use_dora": false,
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7d39d4b71c2d9c958752b7019b0481033ab8d7caa096419fe04a39f1e2c03e5f
3
  size 402688040
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:17df5b37264eec43ee3b04c003030e9ef85cf5b6f14b7f60e70e79bb01519c0e
3
  size 402688040
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b2214c2d7be4e7002e6b458c215e56dc3cc1231d71e76dcf574cfefeb1df1f14
3
  size 805522170
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:414621f021c20ee99d8749762fb235a920491413d965ecd572e7b6fd2c8676c4
3
  size 805522170
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:96a39edec8fd0ca2c66adccb7ddca2a246727221a5cedfcaa945c37683bd0907
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0954ce5829feaa075bd00d46c62f4c6b3adca283c9efdc99db72335353dd29ce
3
  size 14244
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:db20a34ad6b350b7c1ce1bf536f3e5516e15fa5f9d629c0ece20011d12bce789
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:37332900e696ff0a98c0047cec7566becbfb436912055ab70393a526c6bb795d
3
  size 1064
trainer_state.json CHANGED
@@ -1,235 +1,871 @@
1
  {
2
- "best_metric": 0.5998682379722595,
3
- "best_model_checkpoint": "//outputs/task7_microsoft/Phi-3.5-mini-instruct/checkpoint-238",
4
- "epoch": 7.0,
5
  "eval_steps": 500,
6
- "global_step": 238,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.2962962962962963,
13
- "grad_norm": 0.869780957698822,
14
- "learning_rate": 7.692307692307694e-06,
15
- "loss": 0.9461,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.5925925925925926,
20
- "grad_norm": 0.5883250832557678,
21
- "learning_rate": 9.978490638616671e-06,
22
- "loss": 0.6991,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.8888888888888888,
27
- "grad_norm": 0.922535240650177,
28
- "learning_rate": 9.873583924954152e-06,
29
- "loss": 0.7785,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 1.0,
34
- "eval_loss": 0.6846582889556885,
35
- "eval_runtime": 3.3866,
36
  "eval_samples_per_second": 4.429,
37
  "eval_steps_per_second": 0.591,
38
  "step": 34
39
  },
40
  {
41
  "epoch": 1.1777777777777778,
42
- "grad_norm": 0.3914264738559723,
43
- "learning_rate": 9.68316749134364e-06,
44
- "loss": 0.6765,
45
  "step": 40
46
  },
47
  {
48
  "epoch": 1.474074074074074,
49
- "grad_norm": 0.5533085465431213,
50
- "learning_rate": 9.410582299213574e-06,
51
- "loss": 0.799,
52
  "step": 50
53
  },
54
  {
55
  "epoch": 1.7703703703703704,
56
- "grad_norm": 0.3141545355319977,
57
- "learning_rate": 9.060611006213833e-06,
58
- "loss": 0.5998,
59
  "step": 60
60
  },
61
  {
62
  "epoch": 2.0,
63
- "eval_loss": 0.6442674994468689,
64
- "eval_runtime": 3.3729,
65
- "eval_samples_per_second": 4.447,
66
- "eval_steps_per_second": 0.593,
67
  "step": 68
68
  },
69
  {
70
  "epoch": 2.0592592592592593,
71
- "grad_norm": 0.6881595849990845,
72
- "learning_rate": 8.639394051847472e-06,
73
- "loss": 0.6565,
74
  "step": 70
75
  },
76
  {
77
  "epoch": 2.3555555555555556,
78
- "grad_norm": 0.817984402179718,
79
- "learning_rate": 8.154321920070415e-06,
80
- "loss": 0.6779,
81
  "step": 80
82
  },
83
  {
84
  "epoch": 2.651851851851852,
85
- "grad_norm": 0.585773229598999,
86
- "learning_rate": 7.613905469171247e-06,
87
- "loss": 0.5443,
88
  "step": 90
89
  },
90
  {
91
  "epoch": 2.948148148148148,
92
- "grad_norm": 0.5395255088806152,
93
- "learning_rate": 7.02762660406497e-06,
94
- "loss": 0.6243,
95
  "step": 100
96
  },
97
  {
98
  "epoch": 3.0,
99
- "eval_loss": 0.6209592223167419,
100
- "eval_runtime": 3.3724,
101
- "eval_samples_per_second": 4.448,
102
- "eval_steps_per_second": 0.593,
103
  "step": 102
104
  },
105
  {
106
  "epoch": 3.237037037037037,
107
- "grad_norm": 0.7252342104911804,
108
- "learning_rate": 6.405771911037698e-06,
109
- "loss": 0.6189,
110
  "step": 110
111
  },
112
  {
113
  "epoch": 3.533333333333333,
114
- "grad_norm": 0.5255349278450012,
115
- "learning_rate": 5.759252173912573e-06,
116
- "loss": 0.5914,
117
  "step": 120
118
  },
119
  {
120
  "epoch": 3.8296296296296295,
121
- "grad_norm": 0.5693609118461609,
122
- "learning_rate": 5.099410938325351e-06,
123
- "loss": 0.5872,
124
  "step": 130
125
  },
126
  {
127
  "epoch": 4.0,
128
- "eval_loss": 0.6094754934310913,
129
- "eval_runtime": 3.3715,
130
- "eval_samples_per_second": 4.449,
131
- "eval_steps_per_second": 0.593,
132
  "step": 136
133
  },
134
  {
135
  "epoch": 4.118518518518519,
136
- "grad_norm": 0.38578182458877563,
137
- "learning_rate": 4.43782548295514e-06,
138
- "loss": 0.574,
139
  "step": 140
140
  },
141
  {
142
  "epoch": 4.4148148148148145,
143
- "grad_norm": 0.5012251138687134,
144
- "learning_rate": 3.786103689779861e-06,
145
- "loss": 0.5227,
146
  "step": 150
147
  },
148
  {
149
  "epoch": 4.711111111111111,
150
- "grad_norm": 0.5396020412445068,
151
- "learning_rate": 3.1556803773799616e-06,
152
- "loss": 0.5366,
153
  "step": 160
154
  },
155
  {
156
  "epoch": 5.0,
157
- "grad_norm": 0.5771762728691101,
158
- "learning_rate": 2.5576166707349387e-06,
159
- "loss": 0.6322,
160
  "step": 170
161
  },
162
  {
163
  "epoch": 5.0,
164
- "eval_loss": 0.6032379269599915,
165
- "eval_runtime": 3.372,
166
- "eval_samples_per_second": 4.448,
167
- "eval_steps_per_second": 0.593,
168
  "step": 170
169
  },
170
  {
171
  "epoch": 5.296296296296296,
172
- "grad_norm": 0.5676046013832092,
173
- "learning_rate": 2.0024059276803742e-06,
174
- "loss": 0.5883,
175
  "step": 180
176
  },
177
  {
178
  "epoch": 5.592592592592593,
179
- "grad_norm": 0.4530661702156067,
180
- "learning_rate": 1.499789627152874e-06,
181
- "loss": 0.5619,
182
  "step": 190
183
  },
184
  {
185
  "epoch": 5.888888888888889,
186
- "grad_norm": 0.5783275365829468,
187
- "learning_rate": 1.0585864495652899e-06,
188
- "loss": 0.4661,
189
  "step": 200
190
  },
191
  {
192
  "epoch": 6.0,
193
- "eval_loss": 0.6005836129188538,
194
- "eval_runtime": 3.3722,
195
- "eval_samples_per_second": 4.448,
196
- "eval_steps_per_second": 0.593,
197
  "step": 204
198
  },
199
  {
200
  "epoch": 6.177777777777778,
201
- "grad_norm": 0.5576639175415039,
202
- "learning_rate": 6.865375481914017e-07,
203
- "loss": 0.5346,
204
  "step": 210
205
  },
206
  {
207
  "epoch": 6.474074074074074,
208
- "grad_norm": 0.5694870948791504,
209
- "learning_rate": 3.9017072635896716e-07,
210
- "loss": 0.5697,
211
  "step": 220
212
  },
213
  {
214
  "epoch": 6.770370370370371,
215
- "grad_norm": 0.35573288798332214,
216
- "learning_rate": 1.7468590353731495e-07,
217
- "loss": 0.5585,
218
  "step": 230
219
  },
220
  {
221
  "epoch": 7.0,
222
- "eval_loss": 0.5998682379722595,
223
- "eval_runtime": 3.3736,
224
- "eval_samples_per_second": 4.446,
225
- "eval_steps_per_second": 0.593,
226
  "step": 238
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
227
  }
228
  ],
229
  "logging_steps": 10,
230
- "max_steps": 250,
231
  "num_input_tokens_seen": 0,
232
- "num_train_epochs": 8,
233
  "save_steps": 500,
234
  "stateful_callbacks": {
235
  "TrainerControl": {
@@ -243,7 +879,7 @@
243
  "attributes": {}
244
  }
245
  },
246
- "total_flos": 2.291052678921216e+16,
247
  "train_batch_size": 1,
248
  "trial_name": null,
249
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.6473982334136963,
3
+ "best_model_checkpoint": "//outputs/task7_microsoft/Phi-3.5-mini-instruct/checkpoint-918",
4
+ "epoch": 27.0,
5
  "eval_steps": 500,
6
+ "global_step": 918,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.2962962962962963,
13
+ "grad_norm": 0.9734601378440857,
14
+ "learning_rate": 2e-07,
15
+ "loss": 0.9542,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.5925925925925926,
20
+ "grad_norm": 1.0604201555252075,
21
+ "learning_rate": 4e-07,
22
+ "loss": 0.754,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.8888888888888888,
27
+ "grad_norm": 1.4953643083572388,
28
+ "learning_rate": 6e-07,
29
+ "loss": 0.8703,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 1.0,
34
+ "eval_loss": 0.7866833209991455,
35
+ "eval_runtime": 3.3869,
36
  "eval_samples_per_second": 4.429,
37
  "eval_steps_per_second": 0.591,
38
  "step": 34
39
  },
40
  {
41
  "epoch": 1.1777777777777778,
42
+ "grad_norm": 0.9286134243011475,
43
+ "learning_rate": 8e-07,
44
+ "loss": 0.7908,
45
  "step": 40
46
  },
47
  {
48
  "epoch": 1.474074074074074,
49
+ "grad_norm": 1.2561135292053223,
50
+ "learning_rate": 1e-06,
51
+ "loss": 0.9372,
52
  "step": 50
53
  },
54
  {
55
  "epoch": 1.7703703703703704,
56
+ "grad_norm": 0.749400794506073,
57
+ "learning_rate": 9.99726628670463e-07,
58
+ "loss": 0.7326,
59
  "step": 60
60
  },
61
  {
62
  "epoch": 2.0,
63
+ "eval_loss": 0.7615469098091125,
64
+ "eval_runtime": 3.3795,
65
+ "eval_samples_per_second": 4.439,
66
+ "eval_steps_per_second": 0.592,
67
  "step": 68
68
  },
69
  {
70
  "epoch": 2.0592592592592593,
71
+ "grad_norm": 1.545391321182251,
72
+ "learning_rate": 9.989068136093872e-07,
73
+ "loss": 0.8027,
74
  "step": 70
75
  },
76
  {
77
  "epoch": 2.3555555555555556,
78
+ "grad_norm": 1.2902804613113403,
79
+ "learning_rate": 9.975414512725056e-07,
80
+ "loss": 0.8375,
81
  "step": 80
82
  },
83
  {
84
  "epoch": 2.651851851851852,
85
+ "grad_norm": 0.8745124936103821,
86
+ "learning_rate": 9.956320346634875e-07,
87
+ "loss": 0.692,
88
  "step": 90
89
  },
90
  {
91
  "epoch": 2.948148148148148,
92
+ "grad_norm": 0.71682208776474,
93
+ "learning_rate": 9.931806517013612e-07,
94
+ "loss": 0.7837,
95
  "step": 100
96
  },
97
  {
98
  "epoch": 3.0,
99
+ "eval_loss": 0.7384690642356873,
100
+ "eval_runtime": 3.38,
101
+ "eval_samples_per_second": 4.438,
102
+ "eval_steps_per_second": 0.592,
103
  "step": 102
104
  },
105
  {
106
  "epoch": 3.237037037037037,
107
+ "grad_norm": 1.1133283376693726,
108
+ "learning_rate": 9.901899829374047e-07,
109
+ "loss": 0.8015,
110
  "step": 110
111
  },
112
  {
113
  "epoch": 3.533333333333333,
114
+ "grad_norm": 0.6814171075820923,
115
+ "learning_rate": 9.866632986240029e-07,
116
+ "loss": 0.7579,
117
  "step": 120
118
  },
119
  {
120
  "epoch": 3.8296296296296295,
121
+ "grad_norm": 0.6647155284881592,
122
+ "learning_rate": 9.826044551386742e-07,
123
+ "loss": 0.7589,
124
  "step": 130
125
  },
126
  {
127
  "epoch": 4.0,
128
+ "eval_loss": 0.7234861850738525,
129
+ "eval_runtime": 3.3808,
130
+ "eval_samples_per_second": 4.437,
131
+ "eval_steps_per_second": 0.592,
132
  "step": 136
133
  },
134
  {
135
  "epoch": 4.118518518518519,
136
+ "grad_norm": 0.41486606001853943,
137
+ "learning_rate": 9.780178907671788e-07,
138
+ "loss": 0.7408,
139
  "step": 140
140
  },
141
  {
142
  "epoch": 4.4148148148148145,
143
+ "grad_norm": 0.6310161352157593,
144
+ "learning_rate": 9.729086208503173e-07,
145
+ "loss": 0.7029,
146
  "step": 150
147
  },
148
  {
149
  "epoch": 4.711111111111111,
150
+ "grad_norm": 0.6634405851364136,
151
+ "learning_rate": 9.672822322997304e-07,
152
+ "loss": 0.7053,
153
  "step": 160
154
  },
155
  {
156
  "epoch": 5.0,
157
+ "grad_norm": 0.5732757449150085,
158
+ "learning_rate": 9.611448774886923e-07,
159
+ "loss": 0.8443,
160
  "step": 170
161
  },
162
  {
163
  "epoch": 5.0,
164
+ "eval_loss": 0.7118446230888367,
165
+ "eval_runtime": 3.3807,
166
+ "eval_samples_per_second": 4.437,
167
+ "eval_steps_per_second": 0.592,
168
  "step": 170
169
  },
170
  {
171
  "epoch": 5.296296296296296,
172
+ "grad_norm": 0.5060367584228516,
173
+ "learning_rate": 9.545032675245813e-07,
174
+ "loss": 0.7704,
175
  "step": 180
176
  },
177
  {
178
  "epoch": 5.592592592592593,
179
+ "grad_norm": 0.47104501724243164,
180
+ "learning_rate": 9.473646649103817e-07,
181
+ "loss": 0.758,
182
  "step": 190
183
  },
184
  {
185
  "epoch": 5.888888888888889,
186
+ "grad_norm": 0.6369937062263489,
187
+ "learning_rate": 9.397368756032444e-07,
188
+ "loss": 0.6357,
189
  "step": 200
190
  },
191
  {
192
  "epoch": 6.0,
193
+ "eval_loss": 0.7020046710968018,
194
+ "eval_runtime": 3.3979,
195
+ "eval_samples_per_second": 4.414,
196
+ "eval_steps_per_second": 0.589,
197
  "step": 204
198
  },
199
  {
200
  "epoch": 6.177777777777778,
201
+ "grad_norm": 0.5188702940940857,
202
+ "learning_rate": 9.316282404787869e-07,
203
+ "loss": 0.7171,
204
  "step": 210
205
  },
206
  {
207
  "epoch": 6.474074074074074,
208
+ "grad_norm": 0.5187780857086182,
209
+ "learning_rate": 9.230476262104676e-07,
210
+ "loss": 0.7533,
211
  "step": 220
212
  },
213
  {
214
  "epoch": 6.770370370370371,
215
+ "grad_norm": 0.3468983471393585,
216
+ "learning_rate": 9.1400441557401e-07,
217
+ "loss": 0.7398,
218
  "step": 230
219
  },
220
  {
221
  "epoch": 7.0,
222
+ "eval_loss": 0.694381058216095,
223
+ "eval_runtime": 3.3812,
224
+ "eval_samples_per_second": 4.436,
225
+ "eval_steps_per_second": 0.591,
226
  "step": 238
227
+ },
228
+ {
229
+ "epoch": 7.059259259259259,
230
+ "grad_norm": 0.48491501808166504,
231
+ "learning_rate": 9.045084971874737e-07,
232
+ "loss": 0.6413,
233
+ "step": 240
234
+ },
235
+ {
236
+ "epoch": 7.355555555555555,
237
+ "grad_norm": 0.5511935949325562,
238
+ "learning_rate": 8.945702546981968e-07,
239
+ "loss": 0.7137,
240
+ "step": 250
241
+ },
242
+ {
243
+ "epoch": 7.651851851851852,
244
+ "grad_norm": 0.4468119144439697,
245
+ "learning_rate": 8.842005554284295e-07,
246
+ "loss": 0.7029,
247
+ "step": 260
248
+ },
249
+ {
250
+ "epoch": 7.948148148148148,
251
+ "grad_norm": 1.057121992111206,
252
+ "learning_rate": 8.734107384920769e-07,
253
+ "loss": 0.7468,
254
+ "step": 270
255
+ },
256
+ {
257
+ "epoch": 8.0,
258
+ "eval_loss": 0.6881054043769836,
259
+ "eval_runtime": 3.3795,
260
+ "eval_samples_per_second": 4.439,
261
+ "eval_steps_per_second": 0.592,
262
+ "step": 272
263
+ },
264
+ {
265
+ "epoch": 8.237037037037037,
266
+ "grad_norm": 0.6974695324897766,
267
+ "learning_rate": 8.622126023955445e-07,
268
+ "loss": 0.6952,
269
+ "step": 280
270
+ },
271
+ {
272
+ "epoch": 8.533333333333333,
273
+ "grad_norm": 0.5931522250175476,
274
+ "learning_rate": 8.506183921362442e-07,
275
+ "loss": 0.6434,
276
+ "step": 290
277
+ },
278
+ {
279
+ "epoch": 8.829629629629629,
280
+ "grad_norm": 0.5849316716194153,
281
+ "learning_rate": 8.386407858128706e-07,
282
+ "loss": 0.711,
283
+ "step": 300
284
+ },
285
+ {
286
+ "epoch": 9.0,
287
+ "eval_loss": 0.6817243695259094,
288
+ "eval_runtime": 3.3796,
289
+ "eval_samples_per_second": 4.438,
290
+ "eval_steps_per_second": 0.592,
291
+ "step": 306
292
+ },
293
+ {
294
+ "epoch": 9.118518518518519,
295
+ "grad_norm": 0.4516354501247406,
296
+ "learning_rate": 8.262928807620843e-07,
297
+ "loss": 0.7294,
298
+ "step": 310
299
+ },
300
+ {
301
+ "epoch": 9.414814814814815,
302
+ "grad_norm": 0.3970250189304352,
303
+ "learning_rate": 8.135881792367685e-07,
304
+ "loss": 0.6861,
305
+ "step": 320
306
+ },
307
+ {
308
+ "epoch": 9.71111111111111,
309
+ "grad_norm": 0.4954105317592621,
310
+ "learning_rate": 8.005405736415125e-07,
311
+ "loss": 0.6443,
312
+ "step": 330
313
+ },
314
+ {
315
+ "epoch": 10.0,
316
+ "grad_norm": 0.9217901825904846,
317
+ "learning_rate": 7.871643313414718e-07,
318
+ "loss": 0.693,
319
+ "step": 340
320
+ },
321
+ {
322
+ "epoch": 10.0,
323
+ "eval_loss": 0.6770989298820496,
324
+ "eval_runtime": 3.3796,
325
+ "eval_samples_per_second": 4.438,
326
+ "eval_steps_per_second": 0.592,
327
+ "step": 340
328
+ },
329
+ {
330
+ "epoch": 10.296296296296296,
331
+ "grad_norm": 0.6658220887184143,
332
+ "learning_rate": 7.734740790612136e-07,
333
+ "loss": 0.6673,
334
+ "step": 350
335
+ },
336
+ {
337
+ "epoch": 10.592592592592592,
338
+ "grad_norm": 1.5193367004394531,
339
+ "learning_rate": 7.594847868906076e-07,
340
+ "loss": 0.8039,
341
+ "step": 360
342
+ },
343
+ {
344
+ "epoch": 10.88888888888889,
345
+ "grad_norm": 0.3789392411708832,
346
+ "learning_rate": 7.452117519152541e-07,
347
+ "loss": 0.6556,
348
+ "step": 370
349
+ },
350
+ {
351
+ "epoch": 11.0,
352
+ "eval_loss": 0.6725327968597412,
353
+ "eval_runtime": 3.3798,
354
+ "eval_samples_per_second": 4.438,
355
+ "eval_steps_per_second": 0.592,
356
+ "step": 374
357
+ },
358
+ {
359
+ "epoch": 11.177777777777777,
360
+ "grad_norm": 0.5015827417373657,
361
+ "learning_rate": 7.306705814893439e-07,
362
+ "loss": 0.6501,
363
+ "step": 380
364
+ },
365
+ {
366
+ "epoch": 11.474074074074075,
367
+ "grad_norm": 0.3010861575603485,
368
+ "learning_rate": 7.158771761692464e-07,
369
+ "loss": 0.6209,
370
+ "step": 390
371
+ },
372
+ {
373
+ "epoch": 11.77037037037037,
374
+ "grad_norm": 0.8139746785163879,
375
+ "learning_rate": 7.008477123264847e-07,
376
+ "loss": 0.7087,
377
+ "step": 400
378
+ },
379
+ {
380
+ "epoch": 12.0,
381
+ "eval_loss": 0.6691569685935974,
382
+ "eval_runtime": 3.3801,
383
+ "eval_samples_per_second": 4.438,
384
+ "eval_steps_per_second": 0.592,
385
+ "step": 408
386
+ },
387
+ {
388
+ "epoch": 12.059259259259258,
389
+ "grad_norm": 0.5303260087966919,
390
+ "learning_rate": 6.855986244591103e-07,
391
+ "loss": 0.6422,
392
+ "step": 410
393
+ },
394
+ {
395
+ "epoch": 12.355555555555556,
396
+ "grad_norm": 0.6748088002204895,
397
+ "learning_rate": 6.701465872208216e-07,
398
+ "loss": 0.6324,
399
+ "step": 420
400
+ },
401
+ {
402
+ "epoch": 12.651851851851852,
403
+ "grad_norm": 0.30257901549339294,
404
+ "learning_rate": 6.545084971874736e-07,
405
+ "loss": 0.6435,
406
+ "step": 430
407
+ },
408
+ {
409
+ "epoch": 12.948148148148148,
410
+ "grad_norm": 0.502646803855896,
411
+ "learning_rate": 6.387014543809223e-07,
412
+ "loss": 0.7937,
413
+ "step": 440
414
+ },
415
+ {
416
+ "epoch": 13.0,
417
+ "eval_loss": 0.6658554077148438,
418
+ "eval_runtime": 3.3771,
419
+ "eval_samples_per_second": 4.442,
420
+ "eval_steps_per_second": 0.592,
421
+ "step": 442
422
+ },
423
+ {
424
+ "epoch": 13.237037037037037,
425
+ "grad_norm": 0.5977727174758911,
426
+ "learning_rate": 6.227427435703995e-07,
427
+ "loss": 0.7328,
428
+ "step": 450
429
+ },
430
+ {
431
+ "epoch": 13.533333333333333,
432
+ "grad_norm": 0.5516536235809326,
433
+ "learning_rate": 6.066498153718734e-07,
434
+ "loss": 0.6106,
435
+ "step": 460
436
+ },
437
+ {
438
+ "epoch": 13.829629629629629,
439
+ "grad_norm": 0.6513301730155945,
440
+ "learning_rate": 5.90440267166055e-07,
441
+ "loss": 0.6848,
442
+ "step": 470
443
+ },
444
+ {
445
+ "epoch": 14.0,
446
+ "eval_loss": 0.6627060174942017,
447
+ "eval_runtime": 3.3775,
448
+ "eval_samples_per_second": 4.441,
449
+ "eval_steps_per_second": 0.592,
450
+ "step": 476
451
+ },
452
+ {
453
+ "epoch": 14.118518518518519,
454
+ "grad_norm": 0.548687756061554,
455
+ "learning_rate": 5.741318238559209e-07,
456
+ "loss": 0.6687,
457
+ "step": 480
458
+ },
459
+ {
460
+ "epoch": 14.414814814814815,
461
+ "grad_norm": 0.5744287967681885,
462
+ "learning_rate": 5.577423184847931e-07,
463
+ "loss": 0.6226,
464
+ "step": 490
465
+ },
466
+ {
467
+ "epoch": 14.71111111111111,
468
+ "grad_norm": 0.6195695996284485,
469
+ "learning_rate": 5.412896727361662e-07,
470
+ "loss": 0.6681,
471
+ "step": 500
472
+ },
473
+ {
474
+ "epoch": 15.0,
475
+ "grad_norm": 0.9363336563110352,
476
+ "learning_rate": 5.247918773366111e-07,
477
+ "loss": 0.6825,
478
+ "step": 510
479
+ },
480
+ {
481
+ "epoch": 15.0,
482
+ "eval_loss": 0.6597088575363159,
483
+ "eval_runtime": 3.3779,
484
+ "eval_samples_per_second": 4.441,
485
+ "eval_steps_per_second": 0.592,
486
+ "step": 510
487
+ },
488
+ {
489
+ "epoch": 15.296296296296296,
490
+ "grad_norm": 0.4059470295906067,
491
+ "learning_rate": 5.082669723831793e-07,
492
+ "loss": 0.5709,
493
+ "step": 520
494
+ },
495
+ {
496
+ "epoch": 15.592592592592592,
497
+ "grad_norm": 0.5535906553268433,
498
+ "learning_rate": 4.917330276168208e-07,
499
+ "loss": 0.712,
500
+ "step": 530
501
+ },
502
+ {
503
+ "epoch": 15.88888888888889,
504
+ "grad_norm": 0.5456708669662476,
505
+ "learning_rate": 4.752081226633888e-07,
506
+ "loss": 0.6777,
507
+ "step": 540
508
+ },
509
+ {
510
+ "epoch": 16.0,
511
+ "eval_loss": 0.6580029726028442,
512
+ "eval_runtime": 3.3796,
513
+ "eval_samples_per_second": 4.438,
514
+ "eval_steps_per_second": 0.592,
515
+ "step": 544
516
+ },
517
+ {
518
+ "epoch": 16.177777777777777,
519
+ "grad_norm": 0.2641673684120178,
520
+ "learning_rate": 4.5871032726383385e-07,
521
+ "loss": 0.7006,
522
+ "step": 550
523
+ },
524
+ {
525
+ "epoch": 16.474074074074075,
526
+ "grad_norm": 0.49657076597213745,
527
+ "learning_rate": 4.4225768151520694e-07,
528
+ "loss": 0.7039,
529
+ "step": 560
530
+ },
531
+ {
532
+ "epoch": 16.77037037037037,
533
+ "grad_norm": 0.5019007325172424,
534
+ "learning_rate": 4.258681761440789e-07,
535
+ "loss": 0.7018,
536
+ "step": 570
537
+ },
538
+ {
539
+ "epoch": 17.0,
540
+ "eval_loss": 0.655741810798645,
541
+ "eval_runtime": 3.3817,
542
+ "eval_samples_per_second": 4.436,
543
+ "eval_steps_per_second": 0.591,
544
+ "step": 578
545
+ },
546
+ {
547
+ "epoch": 17.05925925925926,
548
+ "grad_norm": 0.5443792343139648,
549
+ "learning_rate": 4.095597328339452e-07,
550
+ "loss": 0.5107,
551
+ "step": 580
552
+ },
553
+ {
554
+ "epoch": 17.355555555555554,
555
+ "grad_norm": 0.6433914303779602,
556
+ "learning_rate": 3.9335018462812664e-07,
557
+ "loss": 0.724,
558
+ "step": 590
559
+ },
560
+ {
561
+ "epoch": 17.651851851851852,
562
+ "grad_norm": 0.8970219492912292,
563
+ "learning_rate": 3.772572564296004e-07,
564
+ "loss": 0.6995,
565
+ "step": 600
566
+ },
567
+ {
568
+ "epoch": 17.94814814814815,
569
+ "grad_norm": 0.43578681349754333,
570
+ "learning_rate": 3.612985456190778e-07,
571
+ "loss": 0.6126,
572
+ "step": 610
573
+ },
574
+ {
575
+ "epoch": 18.0,
576
+ "eval_loss": 0.6544412970542908,
577
+ "eval_runtime": 3.3793,
578
+ "eval_samples_per_second": 4.439,
579
+ "eval_steps_per_second": 0.592,
580
+ "step": 612
581
+ },
582
+ {
583
+ "epoch": 18.237037037037037,
584
+ "grad_norm": 0.6419870257377625,
585
+ "learning_rate": 3.454915028125263e-07,
586
+ "loss": 0.5766,
587
+ "step": 620
588
+ },
589
+ {
590
+ "epoch": 18.533333333333335,
591
+ "grad_norm": 0.3237641751766205,
592
+ "learning_rate": 3.2985341277917846e-07,
593
+ "loss": 0.6299,
594
+ "step": 630
595
+ },
596
+ {
597
+ "epoch": 18.82962962962963,
598
+ "grad_norm": 0.409332275390625,
599
+ "learning_rate": 3.1440137554088953e-07,
600
+ "loss": 0.5755,
601
+ "step": 640
602
+ },
603
+ {
604
+ "epoch": 19.0,
605
+ "eval_loss": 0.6529051661491394,
606
+ "eval_runtime": 3.3802,
607
+ "eval_samples_per_second": 4.438,
608
+ "eval_steps_per_second": 0.592,
609
+ "step": 646
610
+ },
611
+ {
612
+ "epoch": 19.118518518518517,
613
+ "grad_norm": 0.4559795558452606,
614
+ "learning_rate": 2.9915228767351535e-07,
615
+ "loss": 0.7702,
616
+ "step": 650
617
+ },
618
+ {
619
+ "epoch": 19.414814814814815,
620
+ "grad_norm": 0.44174766540527344,
621
+ "learning_rate": 2.841228238307536e-07,
622
+ "loss": 0.5212,
623
+ "step": 660
624
+ },
625
+ {
626
+ "epoch": 19.711111111111112,
627
+ "grad_norm": 0.5898323655128479,
628
+ "learning_rate": 2.6932941851065615e-07,
629
+ "loss": 0.6807,
630
+ "step": 670
631
+ },
632
+ {
633
+ "epoch": 20.0,
634
+ "grad_norm": 0.6098238825798035,
635
+ "learning_rate": 2.547882480847461e-07,
636
+ "loss": 0.6458,
637
+ "step": 680
638
+ },
639
+ {
640
+ "epoch": 20.0,
641
+ "eval_loss": 0.6510729193687439,
642
+ "eval_runtime": 3.3804,
643
+ "eval_samples_per_second": 4.437,
644
+ "eval_steps_per_second": 0.592,
645
+ "step": 680
646
+ },
647
+ {
648
+ "epoch": 20.296296296296298,
649
+ "grad_norm": 0.5178992748260498,
650
+ "learning_rate": 2.4051521310939254e-07,
651
+ "loss": 0.6025,
652
+ "step": 690
653
+ },
654
+ {
655
+ "epoch": 20.59259259259259,
656
+ "grad_norm": 0.7392374873161316,
657
+ "learning_rate": 2.2652592093878665e-07,
658
+ "loss": 0.7731,
659
+ "step": 700
660
+ },
661
+ {
662
+ "epoch": 20.88888888888889,
663
+ "grad_norm": 0.34529829025268555,
664
+ "learning_rate": 2.128356686585282e-07,
665
+ "loss": 0.6715,
666
+ "step": 710
667
+ },
668
+ {
669
+ "epoch": 21.0,
670
+ "eval_loss": 0.6509791016578674,
671
+ "eval_runtime": 3.38,
672
+ "eval_samples_per_second": 4.438,
673
+ "eval_steps_per_second": 0.592,
674
+ "step": 714
675
+ },
676
+ {
677
+ "epoch": 21.177777777777777,
678
+ "grad_norm": 0.5632665753364563,
679
+ "learning_rate": 1.9945942635848745e-07,
680
+ "loss": 0.65,
681
+ "step": 720
682
+ },
683
+ {
684
+ "epoch": 21.474074074074075,
685
+ "grad_norm": 0.41342058777809143,
686
+ "learning_rate": 1.8641182076323148e-07,
687
+ "loss": 0.6272,
688
+ "step": 730
689
+ },
690
+ {
691
+ "epoch": 21.77037037037037,
692
+ "grad_norm": 0.3611527383327484,
693
+ "learning_rate": 1.7370711923791564e-07,
694
+ "loss": 0.6173,
695
+ "step": 740
696
+ },
697
+ {
698
+ "epoch": 22.0,
699
+ "eval_loss": 0.649942934513092,
700
+ "eval_runtime": 3.38,
701
+ "eval_samples_per_second": 4.438,
702
+ "eval_steps_per_second": 0.592,
703
+ "step": 748
704
+ },
705
+ {
706
+ "epoch": 22.05925925925926,
707
+ "grad_norm": 0.9855077266693115,
708
+ "learning_rate": 1.6135921418712955e-07,
709
+ "loss": 0.6918,
710
+ "step": 750
711
+ },
712
+ {
713
+ "epoch": 22.355555555555554,
714
+ "grad_norm": 0.35098254680633545,
715
+ "learning_rate": 1.493816078637557e-07,
716
+ "loss": 0.5533,
717
+ "step": 760
718
+ },
719
+ {
720
+ "epoch": 22.651851851851852,
721
+ "grad_norm": 0.5630102157592773,
722
+ "learning_rate": 1.3778739760445552e-07,
723
+ "loss": 0.5668,
724
+ "step": 770
725
+ },
726
+ {
727
+ "epoch": 22.94814814814815,
728
+ "grad_norm": 0.42931997776031494,
729
+ "learning_rate": 1.2658926150792322e-07,
730
+ "loss": 0.7773,
731
+ "step": 780
732
+ },
733
+ {
734
+ "epoch": 23.0,
735
+ "eval_loss": 0.6492455005645752,
736
+ "eval_runtime": 3.3795,
737
+ "eval_samples_per_second": 4.439,
738
+ "eval_steps_per_second": 0.592,
739
+ "step": 782
740
+ },
741
+ {
742
+ "epoch": 23.237037037037037,
743
+ "grad_norm": 1.0845577716827393,
744
+ "learning_rate": 1.1579944457157059e-07,
745
+ "loss": 0.5997,
746
+ "step": 790
747
+ },
748
+ {
749
+ "epoch": 23.533333333333335,
750
+ "grad_norm": 0.8447713255882263,
751
+ "learning_rate": 1.0542974530180327e-07,
752
+ "loss": 0.6652,
753
+ "step": 800
754
+ },
755
+ {
756
+ "epoch": 23.82962962962963,
757
+ "grad_norm": 0.7863607406616211,
758
+ "learning_rate": 9.549150281252632e-08,
759
+ "loss": 0.5996,
760
+ "step": 810
761
+ },
762
+ {
763
+ "epoch": 24.0,
764
+ "eval_loss": 0.6488202810287476,
765
+ "eval_runtime": 3.3801,
766
+ "eval_samples_per_second": 4.438,
767
+ "eval_steps_per_second": 0.592,
768
+ "step": 816
769
+ },
770
+ {
771
+ "epoch": 24.118518518518517,
772
+ "grad_norm": 0.9356284141540527,
773
+ "learning_rate": 8.599558442598998e-08,
774
+ "loss": 0.7435,
775
+ "step": 820
776
+ },
777
+ {
778
+ "epoch": 24.414814814814815,
779
+ "grad_norm": 0.4750384986400604,
780
+ "learning_rate": 7.695237378953224e-08,
781
+ "loss": 0.6673,
782
+ "step": 830
783
+ },
784
+ {
785
+ "epoch": 24.711111111111112,
786
+ "grad_norm": 0.5514878630638123,
787
+ "learning_rate": 6.837175952121304e-08,
788
+ "loss": 0.6226,
789
+ "step": 840
790
+ },
791
+ {
792
+ "epoch": 25.0,
793
+ "grad_norm": 0.5611656904220581,
794
+ "learning_rate": 6.026312439675551e-08,
795
+ "loss": 0.5958,
796
+ "step": 850
797
+ },
798
+ {
799
+ "epoch": 25.0,
800
+ "eval_loss": 0.6484441161155701,
801
+ "eval_runtime": 3.3794,
802
+ "eval_samples_per_second": 4.439,
803
+ "eval_steps_per_second": 0.592,
804
+ "step": 850
805
+ },
806
+ {
807
+ "epoch": 25.296296296296298,
808
+ "grad_norm": 0.3849276900291443,
809
+ "learning_rate": 5.263533508961826e-08,
810
+ "loss": 0.7222,
811
+ "step": 860
812
+ },
813
+ {
814
+ "epoch": 25.59259259259259,
815
+ "grad_norm": 0.5683622360229492,
816
+ "learning_rate": 4.549673247541874e-08,
817
+ "loss": 0.7069,
818
+ "step": 870
819
+ },
820
+ {
821
+ "epoch": 25.88888888888889,
822
+ "grad_norm": 0.7111372351646423,
823
+ "learning_rate": 3.8855122511307626e-08,
824
+ "loss": 0.5773,
825
+ "step": 880
826
+ },
827
+ {
828
+ "epoch": 26.0,
829
+ "eval_loss": 0.6479985117912292,
830
+ "eval_runtime": 3.3801,
831
+ "eval_samples_per_second": 4.438,
832
+ "eval_steps_per_second": 0.592,
833
+ "step": 884
834
+ },
835
+ {
836
+ "epoch": 26.177777777777777,
837
+ "grad_norm": 0.27255579829216003,
838
+ "learning_rate": 3.271776770026963e-08,
839
+ "loss": 0.6349,
840
+ "step": 890
841
+ },
842
+ {
843
+ "epoch": 26.474074074074075,
844
+ "grad_norm": 0.54938143491745,
845
+ "learning_rate": 2.7091379149682682e-08,
846
+ "loss": 0.6322,
847
+ "step": 900
848
+ },
849
+ {
850
+ "epoch": 26.77037037037037,
851
+ "grad_norm": 0.6063421964645386,
852
+ "learning_rate": 2.1982109232821176e-08,
853
+ "loss": 0.6462,
854
+ "step": 910
855
+ },
856
+ {
857
+ "epoch": 27.0,
858
+ "eval_loss": 0.6473982334136963,
859
+ "eval_runtime": 3.3791,
860
+ "eval_samples_per_second": 4.439,
861
+ "eval_steps_per_second": 0.592,
862
+ "step": 918
863
  }
864
  ],
865
  "logging_steps": 10,
866
+ "max_steps": 1000,
867
  "num_input_tokens_seen": 0,
868
+ "num_train_epochs": 31,
869
  "save_steps": 500,
870
  "stateful_callbacks": {
871
  "TrainerControl": {
 
879
  "attributes": {}
880
  }
881
  },
882
+ "total_flos": 8.836917475838976e+16,
883
  "train_batch_size": 1,
884
  "trial_name": null,
885
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:057e998df0396c8c0743c2e8486036bb54b886294b8c5da9a7b7083bcb4e9d62
3
  size 5624
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7a70798789a008d4e58669a5b402559307877fa0be790a9160ed0e412b4cc179
3
  size 5624