Namronaldo2004 committed on
Commit
db7c33c
1 Parent(s): cc14b49

Update fine-tuned model

Browse files
adapter_config.json CHANGED
@@ -23,13 +23,13 @@
23
  "rank_pattern": {},
24
  "revision": null,
25
  "target_modules": [
26
- "k_proj",
27
- "gate_proj",
28
  "q_proj",
29
- "down_proj",
 
30
  "v_proj",
31
  "up_proj",
32
- "o_proj"
 
33
  ],
34
  "task_type": "CAUSAL_LM",
35
  "use_dora": false,
 
23
  "rank_pattern": {},
24
  "revision": null,
25
  "target_modules": [
 
 
26
  "q_proj",
27
+ "o_proj",
28
+ "k_proj",
29
  "v_proj",
30
  "up_proj",
31
+ "down_proj",
32
+ "gate_proj"
33
  ],
34
  "task_type": "CAUSAL_LM",
35
  "use_dora": false,
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b4da9de7e6a0c855646daa4e8692b61bf593a9e5db24ad71f56b011fd878c8a3
3
  size 159967880
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ae7d6124db2999b66ba36817f8c2d2311a0f4f6fb36106ec14c0bc5b31769573
3
  size 159967880
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7a93b54fda0e2952d60fd2423d8d212c30612413fbad1598c7e439365d4fa8fc
3
  size 852876198
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d60e76bf5ab8ae5b9e9260b0b8905e5be77afb63ceeab61da8f715e051ae9ec5
3
  size 852876198
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:042eeaadaa77e55313f6c5e71c307c518f1290d990d00304e40386bd32b1d3e0
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:841c3d2e9b5e46e8a77c6c9e705dba80a96ae5b9084634adc158035e3d78011a
3
  size 14244
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2d01151db1fc4f9c05131abecdc90435e3aab7eb2c3021fc926311286e779587
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:babd43118aa66a4f5266d730539cf7f09611158b169d9e63dbcb83f6bbaa8626
3
  size 1064
trainer_state.json CHANGED
@@ -1,298 +1,543 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 8.016032064128256,
5
  "eval_steps": 500,
6
- "global_step": 40,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.20040080160320642,
13
- "grad_norm": 3.027674674987793,
14
- "learning_rate": 0.0001,
15
- "loss": 1.9225,
16
  "step": 1
17
  },
18
  {
19
- "epoch": 0.40080160320641284,
20
- "grad_norm": 3.0814154148101807,
21
- "learning_rate": 0.0002,
22
- "loss": 1.937,
23
  "step": 2
24
  },
25
  {
26
- "epoch": 0.6012024048096193,
27
- "grad_norm": 2.178030252456665,
28
- "learning_rate": 0.000199658449300667,
29
- "loss": 1.585,
30
  "step": 3
31
  },
32
  {
33
- "epoch": 0.8016032064128257,
34
- "grad_norm": 1.4929898977279663,
35
- "learning_rate": 0.00019863613034027224,
36
- "loss": 1.2414,
37
  "step": 4
38
  },
39
  {
40
- "epoch": 1.002004008016032,
41
- "grad_norm": 1.2824300527572632,
42
- "learning_rate": 0.00019694002659393305,
43
- "loss": 1.109,
44
  "step": 5
45
  },
46
  {
47
- "epoch": 1.2024048096192386,
48
- "grad_norm": 0.8394728899002075,
49
- "learning_rate": 0.00019458172417006347,
50
- "loss": 0.7618,
51
  "step": 6
52
  },
53
  {
54
- "epoch": 1.402805611222445,
55
- "grad_norm": 0.7309438586235046,
56
- "learning_rate": 0.00019157733266550575,
57
- "loss": 0.6732,
58
  "step": 7
59
  },
60
  {
61
- "epoch": 1.6032064128256514,
62
- "grad_norm": 0.69007807970047,
63
- "learning_rate": 0.0001879473751206489,
64
- "loss": 0.646,
65
  "step": 8
66
  },
67
  {
68
- "epoch": 1.8036072144288577,
69
- "grad_norm": 0.6368725299835205,
70
- "learning_rate": 0.00018371664782625287,
71
- "loss": 0.6147,
72
  "step": 9
73
  },
74
  {
75
- "epoch": 2.004008016032064,
76
- "grad_norm": 0.5702280402183533,
77
- "learning_rate": 0.00017891405093963938,
78
- "loss": 0.5384,
79
  "step": 10
80
  },
81
  {
82
- "epoch": 2.2044088176352705,
83
- "grad_norm": 0.5861708521842957,
84
- "learning_rate": 0.00017357239106731317,
85
- "loss": 0.4272,
86
  "step": 11
87
  },
88
  {
89
- "epoch": 2.404809619238477,
90
- "grad_norm": 0.537497341632843,
91
- "learning_rate": 0.00016772815716257412,
92
- "loss": 0.419,
93
  "step": 12
94
  },
95
  {
96
- "epoch": 2.6052104208416833,
97
- "grad_norm": 0.4901179373264313,
98
- "learning_rate": 0.0001614212712689668,
99
- "loss": 0.3951,
100
  "step": 13
101
  },
102
  {
103
- "epoch": 2.80561122244489,
104
- "grad_norm": 0.44876372814178467,
105
- "learning_rate": 0.00015469481581224272,
106
- "loss": 0.3362,
107
  "step": 14
108
  },
109
  {
110
- "epoch": 3.006012024048096,
111
- "grad_norm": 0.4591052234172821,
112
- "learning_rate": 0.00014759473930370736,
113
- "loss": 0.321,
114
  "step": 15
115
  },
116
  {
117
- "epoch": 3.2064128256513027,
118
- "grad_norm": 0.3763630986213684,
119
- "learning_rate": 0.00014016954246529696,
120
- "loss": 0.2673,
121
  "step": 16
122
  },
123
  {
124
- "epoch": 3.406813627254509,
125
- "grad_norm": 0.37145256996154785,
126
- "learning_rate": 0.00013246994692046836,
127
- "loss": 0.2554,
128
  "step": 17
129
  },
130
  {
131
- "epoch": 3.6072144288577155,
132
- "grad_norm": 0.3408704102039337,
133
- "learning_rate": 0.00012454854871407994,
134
- "loss": 0.229,
135
  "step": 18
136
  },
137
  {
138
- "epoch": 3.8076152304609217,
139
- "grad_norm": 0.36588045954704285,
140
- "learning_rate": 0.00011645945902807341,
141
- "loss": 0.2371,
142
  "step": 19
143
  },
144
  {
145
- "epoch": 4.008016032064128,
146
- "grad_norm": 0.349997341632843,
147
- "learning_rate": 0.00010825793454723325,
148
- "loss": 0.2127,
149
  "step": 20
150
  },
151
  {
152
- "epoch": 4.208416833667335,
153
- "grad_norm": 0.30794546008110046,
154
- "learning_rate": 0.0001,
155
- "loss": 0.1856,
156
  "step": 21
157
  },
158
  {
159
- "epoch": 4.408817635270541,
160
- "grad_norm": 0.2869230806827545,
161
- "learning_rate": 9.174206545276677e-05,
162
- "loss": 0.152,
163
  "step": 22
164
  },
165
  {
166
- "epoch": 4.609218436873747,
167
- "grad_norm": 0.29317694902420044,
168
- "learning_rate": 8.35405409719266e-05,
169
- "loss": 0.1581,
170
  "step": 23
171
  },
172
  {
173
- "epoch": 4.809619238476954,
174
- "grad_norm": 0.290088951587677,
175
- "learning_rate": 7.54514512859201e-05,
176
- "loss": 0.1564,
177
  "step": 24
178
  },
179
  {
180
- "epoch": 5.01002004008016,
181
- "grad_norm": 0.30606502294540405,
182
- "learning_rate": 6.753005307953167e-05,
183
- "loss": 0.1594,
184
  "step": 25
185
  },
186
  {
187
- "epoch": 5.210420841683367,
188
- "grad_norm": 0.2588537335395813,
189
- "learning_rate": 5.983045753470308e-05,
190
- "loss": 0.1253,
191
  "step": 26
192
  },
193
  {
194
- "epoch": 5.410821643286573,
195
- "grad_norm": 0.2805459201335907,
196
- "learning_rate": 5.240526069629265e-05,
197
- "loss": 0.1488,
198
  "step": 27
199
  },
200
  {
201
- "epoch": 5.61122244488978,
202
- "grad_norm": 0.24708090722560883,
203
- "learning_rate": 4.530518418775733e-05,
204
- "loss": 0.1084,
205
  "step": 28
206
  },
207
  {
208
- "epoch": 5.811623246492986,
209
- "grad_norm": 0.2635113298892975,
210
- "learning_rate": 3.857872873103322e-05,
211
- "loss": 0.1255,
212
  "step": 29
213
  },
214
  {
215
- "epoch": 6.012024048096192,
216
- "grad_norm": 0.24471008777618408,
217
- "learning_rate": 3.227184283742591e-05,
218
- "loss": 0.1017,
219
  "step": 30
220
  },
221
  {
222
- "epoch": 6.212424849699399,
223
- "grad_norm": 0.23846429586410522,
224
- "learning_rate": 2.6427608932686843e-05,
225
- "loss": 0.1089,
226
  "step": 31
227
  },
228
  {
229
- "epoch": 6.412825651302605,
230
- "grad_norm": 0.25631099939346313,
231
- "learning_rate": 2.1085949060360654e-05,
232
- "loss": 0.1166,
233
  "step": 32
234
  },
235
  {
236
- "epoch": 6.613226452905812,
237
- "grad_norm": 0.24143779277801514,
238
- "learning_rate": 1.6283352173747145e-05,
239
- "loss": 0.1001,
240
  "step": 33
241
  },
242
  {
243
- "epoch": 6.813627254509018,
244
- "grad_norm": 0.22203697264194489,
245
- "learning_rate": 1.2052624879351104e-05,
246
- "loss": 0.0966,
247
  "step": 34
248
  },
249
  {
250
- "epoch": 7.014028056112225,
251
- "grad_norm": 0.228188157081604,
252
- "learning_rate": 8.422667334494249e-06,
253
- "loss": 0.0937,
254
  "step": 35
255
  },
256
  {
257
- "epoch": 7.214428857715431,
258
- "grad_norm": 0.22016100585460663,
259
- "learning_rate": 5.418275829936537e-06,
260
- "loss": 0.0982,
261
  "step": 36
262
  },
263
  {
264
- "epoch": 7.414829659318637,
265
- "grad_norm": 0.21842055022716522,
266
- "learning_rate": 3.059973406066963e-06,
267
- "loss": 0.0914,
268
  "step": 37
269
  },
270
  {
271
- "epoch": 7.615230460921843,
272
- "grad_norm": 0.22781485319137573,
273
- "learning_rate": 1.3638696597277679e-06,
274
- "loss": 0.0948,
275
  "step": 38
276
  },
277
  {
278
- "epoch": 7.81563126252505,
279
- "grad_norm": 0.23596827685832977,
280
- "learning_rate": 3.415506993330153e-07,
281
- "loss": 0.0959,
282
  "step": 39
283
  },
284
  {
285
- "epoch": 8.016032064128256,
286
- "grad_norm": 0.24492216110229492,
287
- "learning_rate": 0.0,
288
- "loss": 0.1,
289
  "step": 40
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
290
  }
291
  ],
292
  "logging_steps": 1,
293
- "max_steps": 40,
294
  "num_input_tokens_seen": 0,
295
- "num_train_epochs": 10,
296
  "save_steps": 500,
297
  "stateful_callbacks": {
298
  "TrainerControl": {
@@ -306,7 +551,7 @@
306
  "attributes": {}
307
  }
308
  },
309
- "total_flos": 3.4679807860064256e+16,
310
  "train_batch_size": 1,
311
  "trial_name": null,
312
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 3.0,
5
  "eval_steps": 500,
6
+ "global_step": 75,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.04,
13
+ "grad_norm": 1.728904366493225,
14
+ "learning_rate": 5e-05,
15
+ "loss": 0.7312,
16
  "step": 1
17
  },
18
  {
19
+ "epoch": 0.08,
20
+ "grad_norm": 1.7350375652313232,
21
+ "learning_rate": 0.0001,
22
+ "loss": 0.7843,
23
  "step": 2
24
  },
25
  {
26
+ "epoch": 0.12,
27
+ "grad_norm": 1.4805001020431519,
28
+ "learning_rate": 0.00015000000000000001,
29
+ "loss": 0.6772,
30
  "step": 3
31
  },
32
  {
33
+ "epoch": 0.16,
34
+ "grad_norm": 1.0534298419952393,
35
+ "learning_rate": 0.0002,
36
+ "loss": 0.6122,
37
  "step": 4
38
  },
39
  {
40
+ "epoch": 0.2,
41
+ "grad_norm": 0.8123345375061035,
42
+ "learning_rate": 0.00019990212265199738,
43
+ "loss": 0.5329,
44
  "step": 5
45
  },
46
  {
47
+ "epoch": 0.24,
48
+ "grad_norm": 0.6635419130325317,
49
+ "learning_rate": 0.00019960868220749448,
50
+ "loss": 0.4878,
51
  "step": 6
52
  },
53
  {
54
+ "epoch": 0.28,
55
+ "grad_norm": 0.6072973608970642,
56
+ "learning_rate": 0.00019912025308994148,
57
+ "loss": 0.4836,
58
  "step": 7
59
  },
60
  {
61
+ "epoch": 0.32,
62
+ "grad_norm": 0.5697150826454163,
63
+ "learning_rate": 0.00019843779142227256,
64
+ "loss": 0.5162,
65
  "step": 8
66
  },
67
  {
68
+ "epoch": 0.36,
69
+ "grad_norm": 0.5166112780570984,
70
+ "learning_rate": 0.0001975626331552507,
71
+ "loss": 0.4825,
72
  "step": 9
73
  },
74
  {
75
+ "epoch": 0.4,
76
+ "grad_norm": 0.5054742097854614,
77
+ "learning_rate": 0.00019649649145228102,
78
+ "loss": 0.4564,
79
  "step": 10
80
  },
81
  {
82
+ "epoch": 0.44,
83
+ "grad_norm": 0.4537505805492401,
84
+ "learning_rate": 0.00019524145333581317,
85
+ "loss": 0.4383,
86
  "step": 11
87
  },
88
  {
89
+ "epoch": 0.48,
90
+ "grad_norm": 0.4296068251132965,
91
+ "learning_rate": 0.00019379997560189675,
92
+ "loss": 0.4529,
93
  "step": 12
94
  },
95
  {
96
+ "epoch": 0.52,
97
+ "grad_norm": 0.44367527961730957,
98
+ "learning_rate": 0.00019217488001088784,
99
+ "loss": 0.4545,
100
  "step": 13
101
  },
102
  {
103
+ "epoch": 0.56,
104
+ "grad_norm": 0.4168623685836792,
105
+ "learning_rate": 0.0001903693477637204,
106
+ "loss": 0.418,
107
  "step": 14
108
  },
109
  {
110
+ "epoch": 0.6,
111
+ "grad_norm": 0.4231944680213928,
112
+ "learning_rate": 0.0001883869132745561,
113
+ "loss": 0.435,
114
  "step": 15
115
  },
116
  {
117
+ "epoch": 0.64,
118
+ "grad_norm": 0.4075939953327179,
119
+ "learning_rate": 0.00018623145725200278,
120
+ "loss": 0.4274,
121
  "step": 16
122
  },
123
  {
124
+ "epoch": 0.68,
125
+ "grad_norm": 0.36284372210502625,
126
+ "learning_rate": 0.00018390719910244487,
127
+ "loss": 0.3972,
128
  "step": 17
129
  },
130
  {
131
+ "epoch": 0.72,
132
+ "grad_norm": 0.3902932405471802,
133
+ "learning_rate": 0.00018141868867035745,
134
+ "loss": 0.3953,
135
  "step": 18
136
  },
137
  {
138
+ "epoch": 0.76,
139
+ "grad_norm": 0.38837161660194397,
140
+ "learning_rate": 0.00017877079733177184,
141
+ "loss": 0.4294,
142
  "step": 19
143
  },
144
  {
145
+ "epoch": 0.8,
146
+ "grad_norm": 0.38330700993537903,
147
+ "learning_rate": 0.0001759687084583285,
148
+ "loss": 0.4155,
149
  "step": 20
150
  },
151
  {
152
+ "epoch": 0.84,
153
+ "grad_norm": 0.3897230625152588,
154
+ "learning_rate": 0.00017301790727058345,
155
+ "loss": 0.3791,
156
  "step": 21
157
  },
158
  {
159
+ "epoch": 0.88,
160
+ "grad_norm": 0.40314897894859314,
161
+ "learning_rate": 0.00016992417010043142,
162
+ "loss": 0.4018,
163
  "step": 22
164
  },
165
  {
166
+ "epoch": 0.92,
167
+ "grad_norm": 0.4068446755409241,
168
+ "learning_rate": 0.0001666935530836651,
169
+ "loss": 0.3801,
170
  "step": 23
171
  },
172
  {
173
+ "epoch": 0.96,
174
+ "grad_norm": 0.40863823890686035,
175
+ "learning_rate": 0.0001633323803048047,
176
+ "loss": 0.3879,
177
  "step": 24
178
  },
179
  {
180
+ "epoch": 1.0,
181
+ "grad_norm": 0.3958357274532318,
182
+ "learning_rate": 0.00015984723141740576,
183
+ "loss": 0.3929,
184
  "step": 25
185
  },
186
  {
187
+ "epoch": 1.04,
188
+ "grad_norm": 0.3210630416870117,
189
+ "learning_rate": 0.0001562449287640781,
190
+ "loss": 0.2722,
191
  "step": 26
192
  },
193
  {
194
+ "epoch": 1.08,
195
+ "grad_norm": 0.34723371267318726,
196
+ "learning_rate": 0.00015253252402142988,
197
+ "loss": 0.2701,
198
  "step": 27
199
  },
200
  {
201
+ "epoch": 1.12,
202
+ "grad_norm": 0.3267146050930023,
203
+ "learning_rate": 0.00014871728439607966,
204
+ "loss": 0.2654,
205
  "step": 28
206
  },
207
  {
208
+ "epoch": 1.16,
209
+ "grad_norm": 0.3217560350894928,
210
+ "learning_rate": 0.00014480667839875786,
211
+ "loss": 0.2653,
212
  "step": 29
213
  },
214
  {
215
+ "epoch": 1.2,
216
+ "grad_norm": 0.3129405975341797,
217
+ "learning_rate": 0.0001408083612243465,
218
+ "loss": 0.2495,
219
  "step": 30
220
  },
221
  {
222
+ "epoch": 1.24,
223
+ "grad_norm": 0.3169604241847992,
224
+ "learning_rate": 0.00013673015976647568,
225
+ "loss": 0.2783,
226
  "step": 31
227
  },
228
  {
229
+ "epoch": 1.28,
230
+ "grad_norm": 0.3302832543849945,
231
+ "learning_rate": 0.00013258005729601177,
232
+ "loss": 0.2589,
233
  "step": 32
234
  },
235
  {
236
+ "epoch": 1.32,
237
+ "grad_norm": 0.3463418781757355,
238
+ "learning_rate": 0.0001283661778334297,
239
+ "loss": 0.2453,
240
  "step": 33
241
  },
242
  {
243
+ "epoch": 1.3599999999999999,
244
+ "grad_norm": 0.3463260531425476,
245
+ "learning_rate": 0.00012409677024566144,
246
+ "loss": 0.242,
247
  "step": 34
248
  },
249
  {
250
+ "epoch": 1.4,
251
+ "grad_norm": 0.3702252209186554,
252
+ "learning_rate": 0.00011978019209855174,
253
+ "loss": 0.264,
254
  "step": 35
255
  },
256
  {
257
+ "epoch": 1.44,
258
+ "grad_norm": 0.3509206771850586,
259
+ "learning_rate": 0.00011542489329653024,
260
+ "loss": 0.2593,
261
  "step": 36
262
  },
263
  {
264
+ "epoch": 1.48,
265
+ "grad_norm": 0.3612159490585327,
266
+ "learning_rate": 0.000111039399541527,
267
+ "loss": 0.2411,
268
  "step": 37
269
  },
270
  {
271
+ "epoch": 1.52,
272
+ "grad_norm": 0.3651520609855652,
273
+ "learning_rate": 0.00010663229564351041,
274
+ "loss": 0.2378,
275
  "step": 38
276
  },
277
  {
278
+ "epoch": 1.56,
279
+ "grad_norm": 0.3665476441383362,
280
+ "learning_rate": 0.00010221220871531869,
281
+ "loss": 0.2334,
282
  "step": 39
283
  },
284
  {
285
+ "epoch": 1.6,
286
+ "grad_norm": 0.34961438179016113,
287
+ "learning_rate": 9.778779128468132e-05,
288
+ "loss": 0.2307,
289
  "step": 40
290
+ },
291
+ {
292
+ "epoch": 1.6400000000000001,
293
+ "grad_norm": 0.379111111164093,
294
+ "learning_rate": 9.336770435648964e-05,
295
+ "loss": 0.2212,
296
+ "step": 41
297
+ },
298
+ {
299
+ "epoch": 1.6800000000000002,
300
+ "grad_norm": 0.38593631982803345,
301
+ "learning_rate": 8.896060045847304e-05,
302
+ "loss": 0.2335,
303
+ "step": 42
304
+ },
305
+ {
306
+ "epoch": 1.72,
307
+ "grad_norm": 0.37961545586586,
308
+ "learning_rate": 8.457510670346976e-05,
309
+ "loss": 0.2306,
310
+ "step": 43
311
+ },
312
+ {
313
+ "epoch": 1.76,
314
+ "grad_norm": 0.3735259771347046,
315
+ "learning_rate": 8.021980790144827e-05,
316
+ "loss": 0.2499,
317
+ "step": 44
318
+ },
319
+ {
320
+ "epoch": 1.8,
321
+ "grad_norm": 0.40170496702194214,
322
+ "learning_rate": 7.590322975433857e-05,
323
+ "loss": 0.2275,
324
+ "step": 45
325
+ },
326
+ {
327
+ "epoch": 1.8399999999999999,
328
+ "grad_norm": 0.3875046372413635,
329
+ "learning_rate": 7.163382216657034e-05,
330
+ "loss": 0.218,
331
+ "step": 46
332
+ },
333
+ {
334
+ "epoch": 1.88,
335
+ "grad_norm": 0.37028905749320984,
336
+ "learning_rate": 6.741994270398826e-05,
337
+ "loss": 0.2422,
338
+ "step": 47
339
+ },
340
+ {
341
+ "epoch": 1.92,
342
+ "grad_norm": 0.3737669289112091,
343
+ "learning_rate": 6.326984023352435e-05,
344
+ "loss": 0.2195,
345
+ "step": 48
346
+ },
347
+ {
348
+ "epoch": 1.96,
349
+ "grad_norm": 0.3924426734447479,
350
+ "learning_rate": 5.91916387756535e-05,
351
+ "loss": 0.2235,
352
+ "step": 49
353
+ },
354
+ {
355
+ "epoch": 2.0,
356
+ "grad_norm": 0.36921918392181396,
357
+ "learning_rate": 5.5193321601242156e-05,
358
+ "loss": 0.2141,
359
+ "step": 50
360
+ },
361
+ {
362
+ "epoch": 2.04,
363
+ "grad_norm": 0.28800830245018005,
364
+ "learning_rate": 5.1282715603920374e-05,
365
+ "loss": 0.1736,
366
+ "step": 51
367
+ },
368
+ {
369
+ "epoch": 2.08,
370
+ "grad_norm": 0.31100502610206604,
371
+ "learning_rate": 4.746747597857014e-05,
372
+ "loss": 0.1578,
373
+ "step": 52
374
+ },
375
+ {
376
+ "epoch": 2.12,
377
+ "grad_norm": 0.3049222528934479,
378
+ "learning_rate": 4.375507123592194e-05,
379
+ "loss": 0.1771,
380
+ "step": 53
381
+ },
382
+ {
383
+ "epoch": 2.16,
384
+ "grad_norm": 0.28219300508499146,
385
+ "learning_rate": 4.015276858259427e-05,
386
+ "loss": 0.1476,
387
+ "step": 54
388
+ },
389
+ {
390
+ "epoch": 2.2,
391
+ "grad_norm": 0.3022613525390625,
392
+ "learning_rate": 3.6667619695195285e-05,
393
+ "loss": 0.1779,
394
+ "step": 55
395
+ },
396
+ {
397
+ "epoch": 2.24,
398
+ "grad_norm": 0.2823966443538666,
399
+ "learning_rate": 3.330644691633492e-05,
400
+ "loss": 0.1501,
401
+ "step": 56
402
+ },
403
+ {
404
+ "epoch": 2.2800000000000002,
405
+ "grad_norm": 0.28174689412117004,
406
+ "learning_rate": 3.0075829899568597e-05,
407
+ "loss": 0.1511,
408
+ "step": 57
409
+ },
410
+ {
411
+ "epoch": 2.32,
412
+ "grad_norm": 0.2776714861392975,
413
+ "learning_rate": 2.6982092729416587e-05,
414
+ "loss": 0.1568,
415
+ "step": 58
416
+ },
417
+ {
418
+ "epoch": 2.36,
419
+ "grad_norm": 0.2745690643787384,
420
+ "learning_rate": 2.403129154167153e-05,
421
+ "loss": 0.1393,
422
+ "step": 59
423
+ },
424
+ {
425
+ "epoch": 2.4,
426
+ "grad_norm": 0.2862659692764282,
427
+ "learning_rate": 2.1229202668228197e-05,
428
+ "loss": 0.1568,
429
+ "step": 60
430
+ },
431
+ {
432
+ "epoch": 2.44,
433
+ "grad_norm": 0.30168795585632324,
434
+ "learning_rate": 1.858131132964259e-05,
435
+ "loss": 0.164,
436
+ "step": 61
437
+ },
438
+ {
439
+ "epoch": 2.48,
440
+ "grad_norm": 0.30739548802375793,
441
+ "learning_rate": 1.609280089755515e-05,
442
+ "loss": 0.1595,
443
+ "step": 62
444
+ },
445
+ {
446
+ "epoch": 2.52,
447
+ "grad_norm": 0.2983320355415344,
448
+ "learning_rate": 1.3768542747997215e-05,
449
+ "loss": 0.174,
450
+ "step": 63
451
+ },
452
+ {
453
+ "epoch": 2.56,
454
+ "grad_norm": 0.30526575446128845,
455
+ "learning_rate": 1.161308672544389e-05,
456
+ "loss": 0.168,
457
+ "step": 64
458
+ },
459
+ {
460
+ "epoch": 2.6,
461
+ "grad_norm": 0.28905755281448364,
462
+ "learning_rate": 9.630652236279625e-06,
463
+ "loss": 0.1557,
464
+ "step": 65
465
+ },
466
+ {
467
+ "epoch": 2.64,
468
+ "grad_norm": 0.29685401916503906,
469
+ "learning_rate": 7.825119989112173e-06,
470
+ "loss": 0.1531,
471
+ "step": 66
472
+ },
473
+ {
474
+ "epoch": 2.68,
475
+ "grad_norm": 0.309733510017395,
476
+ "learning_rate": 6.200024398103255e-06,
477
+ "loss": 0.1538,
478
+ "step": 67
479
+ },
480
+ {
481
+ "epoch": 2.7199999999999998,
482
+ "grad_norm": 0.31325626373291016,
483
+ "learning_rate": 4.758546664186869e-06,
484
+ "loss": 0.16,
485
+ "step": 68
486
+ },
487
+ {
488
+ "epoch": 2.76,
489
+ "grad_norm": 0.29155057668685913,
490
+ "learning_rate": 3.5035085477190143e-06,
491
+ "loss": 0.1612,
492
+ "step": 69
493
+ },
494
+ {
495
+ "epoch": 2.8,
496
+ "grad_norm": 0.31043997406959534,
497
+ "learning_rate": 2.4373668447493224e-06,
498
+ "loss": 0.1655,
499
+ "step": 70
500
+ },
501
+ {
502
+ "epoch": 2.84,
503
+ "grad_norm": 0.30450770258903503,
504
+ "learning_rate": 1.562208577727442e-06,
505
+ "loss": 0.1581,
506
+ "step": 71
507
+ },
508
+ {
509
+ "epoch": 2.88,
510
+ "grad_norm": 0.2816106677055359,
511
+ "learning_rate": 8.797469100585431e-07,
512
+ "loss": 0.1369,
513
+ "step": 72
514
+ },
515
+ {
516
+ "epoch": 2.92,
517
+ "grad_norm": 0.3166625201702118,
518
+ "learning_rate": 3.913177925055189e-07,
519
+ "loss": 0.1724,
520
+ "step": 73
521
+ },
522
+ {
523
+ "epoch": 2.96,
524
+ "grad_norm": 0.30547070503234863,
525
+ "learning_rate": 9.78773480026396e-08,
526
+ "loss": 0.1555,
527
+ "step": 74
528
+ },
529
+ {
530
+ "epoch": 3.0,
531
+ "grad_norm": 0.29624301195144653,
532
+ "learning_rate": 0.0,
533
+ "loss": 0.1598,
534
+ "step": 75
535
  }
536
  ],
537
  "logging_steps": 1,
538
+ "max_steps": 75,
539
  "num_input_tokens_seen": 0,
540
+ "num_train_epochs": 3,
541
  "save_steps": 500,
542
  "stateful_callbacks": {
543
  "TrainerControl": {
 
551
  "attributes": {}
552
  }
553
  },
554
+ "total_flos": 9.106714169779814e+16,
555
  "train_batch_size": 1,
556
  "trial_name": null,
557
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d8923db3bbc9d64f25728011e5b67d183e43b0af93caeb0253ce117a78cbdba2
3
  size 5240
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c82f1076477830c5279e886b1bb2af3d8f3ef16a462c5c5df5187f2aded327b7
3
  size 5240