robertou2 commited on
Commit
f54b544
·
verified ·
1 Parent(s): cb34667

Upload folder using huggingface_hub

Browse files
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8ba743cbcbe7a17a13ffee64e044e449254882634e848aa631f63e6778810b27
3
  size 201361312
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:34e8c063268c82d446f081987dac5fd9c69282ecfad89abd0570dc93517cdbc9
3
  size 201361312
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:aa8c590b229a780debdb448bc28cb8f79b28f2ec2c6ea5636a4abf950ae5a038
3
  size 402868986
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0157da31bb4062434f031ff2dd7c51f693e094db4fe85815de38edaefd40b9fa
3
  size 402868986
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:78412adf2dda42daa646069b544a18df9b06cb455b0068bb5473d031abd28e97
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ed70ecedcd9a62bbb04bf9838304aced41ca983de90cea5987c3cff1d4f80fe3
3
  size 14244
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:45e12526c8172a948234d8cb869935e517c484d36da5eb6ac9a7382e7d268eff
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7f94793ec3497737749203684f2a64875f06eeb7a4781950315fb5cb4ec740a8
3
  size 1064
trainer_state.json CHANGED
@@ -1,802 +1,408 @@
1
  {
2
- "best_metric": 0.5424160957336426,
3
- "best_model_checkpoint": "//outputs/task7_microsoft/Phi-3.5-mini-instruct/checkpoint-400",
4
- "epoch": 8.0,
5
  "eval_steps": 500,
6
- "global_step": 400,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.5882352941176471,
13
- "grad_norm": 0.564612090587616,
14
- "learning_rate": 5e-06,
15
- "loss": 0.8053,
16
  "step": 5
17
  },
18
  {
19
- "epoch": 1.0,
20
- "eval_loss": 0.7743130326271057,
21
- "eval_runtime": 3.5233,
22
- "eval_samples_per_second": 4.257,
23
- "eval_steps_per_second": 0.568,
24
- "step": 9
25
- },
26
- {
27
- "epoch": 1.1176470588235294,
28
- "grad_norm": 0.5536892414093018,
29
- "learning_rate": 1e-05,
30
- "loss": 0.7727,
31
  "step": 10
32
  },
33
  {
34
- "epoch": 1.7058823529411766,
35
- "grad_norm": 0.3953665494918823,
36
- "learning_rate": 9.98292246503335e-06,
37
- "loss": 0.7726,
38
  "step": 15
39
  },
40
  {
41
- "epoch": 2.0,
42
- "eval_loss": 0.7348855137825012,
43
- "eval_runtime": 3.3629,
44
- "eval_samples_per_second": 4.46,
45
- "eval_steps_per_second": 0.595,
46
- "step": 18
47
- },
48
- {
49
- "epoch": 2.235294117647059,
50
- "grad_norm": 0.32548508048057556,
51
- "learning_rate": 9.931806517013612e-06,
52
- "loss": 0.7178,
53
  "step": 20
54
  },
55
  {
56
- "epoch": 2.8235294117647056,
57
- "grad_norm": 0.27749133110046387,
58
- "learning_rate": 9.847001329696653e-06,
59
- "loss": 0.6607,
60
  "step": 25
61
  },
62
  {
63
- "epoch": 3.0,
64
- "eval_loss": 0.7112905383110046,
65
- "eval_runtime": 3.3607,
66
- "eval_samples_per_second": 4.463,
67
- "eval_steps_per_second": 0.595,
68
- "step": 27
69
- },
70
- {
71
- "epoch": 3.3529411764705883,
72
- "grad_norm": 0.28755590319633484,
73
- "learning_rate": 9.729086208503174e-06,
74
- "loss": 0.7081,
75
  "step": 30
76
  },
77
  {
78
- "epoch": 3.9411764705882355,
79
- "grad_norm": 0.25980502367019653,
80
- "learning_rate": 9.578866633275289e-06,
81
- "loss": 0.6063,
82
  "step": 35
83
  },
84
  {
85
- "epoch": 4.0,
86
- "eval_loss": 0.6943528056144714,
87
- "eval_runtime": 3.3629,
88
- "eval_samples_per_second": 4.46,
89
- "eval_steps_per_second": 0.595,
90
- "step": 36
91
- },
92
- {
93
- "epoch": 4.470588235294118,
94
- "grad_norm": 0.23154301941394806,
95
- "learning_rate": 9.397368756032445e-06,
96
- "loss": 0.6561,
97
  "step": 40
98
  },
99
  {
100
- "epoch": 5.0,
101
- "grad_norm": 0.30559542775154114,
102
- "learning_rate": 9.185832391312644e-06,
103
- "loss": 0.6935,
104
  "step": 45
105
  },
106
  {
107
- "epoch": 5.0,
108
- "eval_loss": 0.6810200214385986,
109
- "eval_runtime": 3.3611,
110
- "eval_samples_per_second": 4.463,
111
- "eval_steps_per_second": 0.595,
112
- "step": 45
113
- },
114
- {
115
- "epoch": 5.588235294117647,
116
- "grad_norm": 0.21162718534469604,
117
- "learning_rate": 8.94570254698197e-06,
118
- "loss": 0.6829,
119
  "step": 50
120
  },
121
  {
122
- "epoch": 6.0,
123
- "eval_loss": 0.6704084277153015,
124
- "eval_runtime": 3.3625,
125
- "eval_samples_per_second": 4.461,
126
- "eval_steps_per_second": 0.595,
127
- "step": 54
128
  },
129
  {
130
- "epoch": 6.117647058823529,
131
- "grad_norm": 0.26222917437553406,
132
- "learning_rate": 8.67861955336566e-06,
133
- "loss": 0.6021,
134
  "step": 55
135
  },
136
  {
137
- "epoch": 6.705882352941177,
138
- "grad_norm": 0.23411308228969574,
139
- "learning_rate": 8.386407858128707e-06,
140
- "loss": 0.6483,
141
  "step": 60
142
  },
143
  {
144
- "epoch": 7.0,
145
- "eval_loss": 0.6606718897819519,
146
- "eval_runtime": 3.3601,
147
- "eval_samples_per_second": 4.464,
148
- "eval_steps_per_second": 0.595,
149
- "step": 63
150
- },
151
- {
152
- "epoch": 7.235294117647059,
153
- "grad_norm": 0.18744103610515594,
154
- "learning_rate": 8.071063563448341e-06,
155
- "loss": 0.5817,
156
  "step": 65
157
  },
158
  {
159
- "epoch": 7.823529411764706,
160
- "grad_norm": 0.18960484862327576,
161
- "learning_rate": 7.734740790612137e-06,
162
- "loss": 0.6352,
163
  "step": 70
164
  },
165
  {
166
- "epoch": 8.0,
167
- "eval_loss": 0.6521106958389282,
168
- "eval_runtime": 3.3613,
169
- "eval_samples_per_second": 4.463,
170
- "eval_steps_per_second": 0.595,
171
- "step": 72
172
- },
173
- {
174
- "epoch": 8.352941176470589,
175
- "grad_norm": 0.15531951189041138,
176
- "learning_rate": 7.379736965185369e-06,
177
- "loss": 0.5719,
178
  "step": 75
179
  },
180
  {
181
- "epoch": 8.941176470588236,
182
- "grad_norm": 0.34726396203041077,
183
- "learning_rate": 7.008477123264849e-06,
184
- "loss": 0.6186,
185
  "step": 80
186
  },
187
  {
188
- "epoch": 9.0,
189
- "eval_loss": 0.6448661088943481,
190
- "eval_runtime": 3.3624,
191
- "eval_samples_per_second": 4.461,
192
- "eval_steps_per_second": 0.595,
193
- "step": 81
194
- },
195
- {
196
- "epoch": 9.470588235294118,
197
- "grad_norm": 0.1773035228252411,
198
- "learning_rate": 6.6234973460234184e-06,
199
- "loss": 0.6052,
200
  "step": 85
201
  },
202
  {
203
- "epoch": 10.0,
204
- "grad_norm": 0.2170713096857071,
205
- "learning_rate": 6.227427435703997e-06,
206
- "loss": 0.5415,
207
  "step": 90
208
  },
209
  {
210
- "epoch": 10.0,
211
- "eval_loss": 0.6390407681465149,
212
- "eval_runtime": 3.3658,
213
- "eval_samples_per_second": 4.457,
214
- "eval_steps_per_second": 0.594,
215
- "step": 90
216
- },
217
- {
218
- "epoch": 10.588235294117647,
219
- "grad_norm": 0.2540779709815979,
220
- "learning_rate": 5.82297295140367e-06,
221
- "loss": 0.6305,
222
  "step": 95
223
  },
224
  {
225
- "epoch": 11.0,
226
- "eval_loss": 0.6332173943519592,
227
- "eval_runtime": 3.3622,
228
- "eval_samples_per_second": 4.461,
229
- "eval_steps_per_second": 0.595,
230
- "step": 99
231
  },
232
  {
233
- "epoch": 11.117647058823529,
234
- "grad_norm": 0.2432163953781128,
235
- "learning_rate": 5.412896727361663e-06,
236
- "loss": 0.5547,
 
237
  "step": 100
238
  },
239
  {
240
- "epoch": 11.705882352941176,
241
- "grad_norm": 0.2414003312587738,
242
- "learning_rate": 5e-06,
243
- "loss": 0.5385,
244
  "step": 105
245
  },
246
  {
247
- "epoch": 12.0,
248
- "eval_loss": 0.6285383701324463,
249
- "eval_runtime": 3.3638,
250
- "eval_samples_per_second": 4.459,
251
- "eval_steps_per_second": 0.595,
252
- "step": 108
253
- },
254
- {
255
- "epoch": 12.235294117647058,
256
- "grad_norm": 0.2067604809999466,
257
- "learning_rate": 4.587103272638339e-06,
258
- "loss": 0.536,
259
  "step": 110
260
  },
261
  {
262
- "epoch": 12.823529411764707,
263
- "grad_norm": 0.29979485273361206,
264
- "learning_rate": 4.17702704859633e-06,
265
- "loss": 0.5896,
266
  "step": 115
267
  },
268
  {
269
- "epoch": 13.0,
270
- "eval_loss": 0.6254769563674927,
271
- "eval_runtime": 3.3694,
272
- "eval_samples_per_second": 4.452,
273
- "eval_steps_per_second": 0.594,
274
- "step": 117
275
- },
276
- {
277
- "epoch": 13.352941176470589,
278
- "grad_norm": 0.1513441950082779,
279
- "learning_rate": 3.7725725642960047e-06,
280
- "loss": 0.5415,
281
  "step": 120
282
  },
283
  {
284
- "epoch": 13.941176470588236,
285
- "grad_norm": 0.2250215709209442,
286
- "learning_rate": 3.3765026539765832e-06,
287
- "loss": 0.5612,
288
  "step": 125
289
  },
290
  {
291
- "epoch": 14.0,
292
- "eval_loss": 0.6232194900512695,
293
- "eval_runtime": 3.3613,
294
- "eval_samples_per_second": 4.463,
295
- "eval_steps_per_second": 0.595,
296
- "step": 126
297
- },
298
- {
299
- "epoch": 14.470588235294118,
300
- "grad_norm": 0.21195632219314575,
301
- "learning_rate": 2.991522876735154e-06,
302
- "loss": 0.5624,
303
  "step": 130
304
  },
305
  {
306
- "epoch": 15.0,
307
- "grad_norm": 0.4384087026119232,
308
- "learning_rate": 2.6202630348146323e-06,
309
- "loss": 0.5871,
310
  "step": 135
311
  },
312
  {
313
- "epoch": 15.0,
314
- "eval_loss": 0.6213398575782776,
315
- "eval_runtime": 3.3593,
316
- "eval_samples_per_second": 4.465,
317
- "eval_steps_per_second": 0.595,
318
- "step": 135
319
- },
320
- {
321
- "epoch": 15.588235294117647,
322
- "grad_norm": 0.23890897631645203,
323
- "learning_rate": 2.265259209387867e-06,
324
- "loss": 0.5352,
325
  "step": 140
326
  },
327
  {
328
- "epoch": 16.0,
329
- "eval_loss": 0.6193457841873169,
330
- "eval_runtime": 3.3601,
331
- "eval_samples_per_second": 4.464,
332
- "eval_steps_per_second": 0.595,
333
- "step": 144
334
- },
335
- {
336
- "epoch": 16.11764705882353,
337
- "grad_norm": 0.24785251915454865,
338
- "learning_rate": 1.928936436551661e-06,
339
- "loss": 0.5998,
340
  "step": 145
341
  },
342
  {
343
- "epoch": 16.705882352941178,
344
- "grad_norm": 0.21428382396697998,
345
- "learning_rate": 1.6135921418712959e-06,
346
- "loss": 0.5564,
347
  "step": 150
348
  },
349
  {
350
- "epoch": 17.0,
351
- "eval_loss": 0.618452787399292,
352
- "eval_runtime": 3.3625,
353
- "eval_samples_per_second": 4.461,
354
- "eval_steps_per_second": 0.595,
355
- "step": 153
356
  },
357
  {
358
- "epoch": 17.235294117647058,
359
- "grad_norm": 0.19924059510231018,
360
- "learning_rate": 1.321380446634342e-06,
361
- "loss": 0.4868,
362
  "step": 155
363
  },
364
  {
365
- "epoch": 17.823529411764707,
366
- "grad_norm": 0.16416364908218384,
367
- "learning_rate": 1.0542974530180327e-06,
368
- "loss": 0.6029,
369
  "step": 160
370
  },
371
  {
372
- "epoch": 18.0,
373
- "eval_loss": 0.6172903776168823,
374
- "eval_runtime": 3.3616,
375
- "eval_samples_per_second": 4.462,
376
- "eval_steps_per_second": 0.595,
377
- "step": 162
378
- },
379
- {
380
- "epoch": 18.352941176470587,
381
- "grad_norm": 0.21794988214969635,
382
- "learning_rate": 8.141676086873574e-07,
383
- "loss": 0.4832,
384
  "step": 165
385
  },
386
  {
387
- "epoch": 18.941176470588236,
388
- "grad_norm": 0.27910733222961426,
389
- "learning_rate": 6.026312439675553e-07,
390
- "loss": 0.5107,
391
  "step": 170
392
  },
393
  {
394
- "epoch": 19.0,
395
- "eval_loss": 0.617369532585144,
396
- "eval_runtime": 3.3609,
397
- "eval_samples_per_second": 4.463,
398
- "eval_steps_per_second": 0.595,
399
- "step": 171
400
- },
401
- {
402
- "epoch": 19.470588235294116,
403
- "grad_norm": 0.21645767986774445,
404
- "learning_rate": 4.211333667247125e-07,
405
- "loss": 0.5692,
406
  "step": 175
407
  },
408
  {
409
- "epoch": 20.0,
410
- "grad_norm": 0.39115971326828003,
411
- "learning_rate": 2.7091379149682683e-07,
412
- "loss": 0.5808,
413
  "step": 180
414
  },
415
  {
416
- "epoch": 20.0,
417
- "eval_loss": 0.6167533993721008,
418
- "eval_runtime": 3.3616,
419
- "eval_samples_per_second": 4.462,
420
- "eval_steps_per_second": 0.595,
421
- "step": 180
422
- },
423
- {
424
- "epoch": 20.58823529411765,
425
- "grad_norm": 0.26653149724006653,
426
- "learning_rate": 1.5299867030334815e-07,
427
- "loss": 0.5835,
428
  "step": 185
429
  },
430
  {
431
- "epoch": 21.0,
432
- "eval_loss": 0.6167729496955872,
433
- "eval_runtime": 3.3615,
434
- "eval_samples_per_second": 4.462,
435
- "eval_steps_per_second": 0.595,
436
- "step": 189
437
- },
438
- {
439
- "epoch": 21.11764705882353,
440
- "grad_norm": 0.27125898003578186,
441
- "learning_rate": 6.819348298638839e-08,
442
- "loss": 0.5515,
443
  "step": 190
444
  },
445
  {
446
- "epoch": 21.705882352941178,
447
- "grad_norm": 0.20525327324867249,
448
- "learning_rate": 1.7077534966650767e-08,
449
- "loss": 0.5211,
450
  "step": 195
451
  },
452
  {
453
- "epoch": 22.0,
454
- "eval_loss": 0.6173871159553528,
455
- "eval_runtime": 3.3629,
456
- "eval_samples_per_second": 4.46,
457
- "eval_steps_per_second": 0.595,
458
- "step": 198
459
- },
460
- {
461
- "epoch": 22.235294117647058,
462
- "grad_norm": 0.19269497692584991,
463
- "learning_rate": 0.0,
464
- "loss": 0.5147,
465
  "step": 200
466
  },
467
  {
468
- "epoch": 22.235294117647058,
469
- "eval_loss": 0.6163371205329895,
470
- "eval_runtime": 3.3629,
471
- "eval_samples_per_second": 4.46,
472
- "eval_steps_per_second": 0.595,
473
  "step": 200
474
  },
475
- {
476
- "epoch": 3.586666666666667,
477
- "eval_loss": 0.5931960940361023,
478
- "eval_runtime": 28.5506,
479
- "eval_samples_per_second": 3.503,
480
- "eval_steps_per_second": 0.455,
481
- "step": 201
482
- },
483
  {
484
  "epoch": 4.1,
485
- "grad_norm": 0.8143700957298279,
486
- "learning_rate": 5.206624871244066e-06,
487
- "loss": 0.9672,
488
  "step": 205
489
  },
490
  {
491
  "epoch": 4.2,
492
- "grad_norm": 0.670274019241333,
493
- "learning_rate": 5e-06,
494
- "loss": 0.9171,
495
  "step": 210
496
  },
497
  {
498
  "epoch": 4.3,
499
- "grad_norm": 0.5900228023529053,
500
- "learning_rate": 4.793375128755934e-06,
501
- "loss": 0.8865,
502
  "step": 215
503
  },
504
  {
505
  "epoch": 4.4,
506
- "grad_norm": 0.5981155633926392,
507
- "learning_rate": 4.587103272638339e-06,
508
- "loss": 1.1775,
509
  "step": 220
510
  },
511
  {
512
  "epoch": 4.5,
513
- "grad_norm": 0.5991724729537964,
514
- "learning_rate": 4.381536843653262e-06,
515
- "loss": 0.7489,
516
  "step": 225
517
  },
518
  {
519
  "epoch": 4.6,
520
- "grad_norm": 0.5450884103775024,
521
- "learning_rate": 4.17702704859633e-06,
522
- "loss": 0.8612,
523
  "step": 230
524
  },
525
  {
526
  "epoch": 4.7,
527
- "grad_norm": 0.444416344165802,
528
- "learning_rate": 3.973923289021829e-06,
529
- "loss": 0.7293,
530
  "step": 235
531
  },
532
  {
533
  "epoch": 4.8,
534
- "grad_norm": 0.3834201395511627,
535
- "learning_rate": 3.7725725642960047e-06,
536
- "loss": 0.7699,
537
  "step": 240
538
  },
539
  {
540
  "epoch": 4.9,
541
- "grad_norm": 0.3441762924194336,
542
- "learning_rate": 3.573318878754475e-06,
543
- "loss": 0.8972,
544
  "step": 245
545
  },
546
  {
547
  "epoch": 5.0,
548
- "grad_norm": 0.5351847410202026,
549
- "learning_rate": 3.3765026539765832e-06,
550
- "loss": 0.6602,
551
  "step": 250
552
  },
553
  {
554
  "epoch": 5.0,
555
- "eval_loss": 0.5578957200050354,
556
- "eval_runtime": 52.5326,
557
- "eval_samples_per_second": 3.807,
558
- "eval_steps_per_second": 0.476,
559
- "step": 250
560
- },
561
- {
562
- "epoch": 5.1,
563
- "grad_norm": 0.37455469369888306,
564
- "learning_rate": 3.1824601471808504e-06,
565
- "loss": 0.884,
566
- "step": 255
567
- },
568
- {
569
- "epoch": 5.2,
570
- "grad_norm": 0.6285215020179749,
571
- "learning_rate": 2.991522876735154e-06,
572
- "loss": 0.8042,
573
- "step": 260
574
- },
575
- {
576
- "epoch": 5.3,
577
- "grad_norm": 0.37903887033462524,
578
- "learning_rate": 2.804017055763149e-06,
579
- "loss": 0.6865,
580
- "step": 265
581
- },
582
- {
583
- "epoch": 5.4,
584
- "grad_norm": 0.4468790292739868,
585
- "learning_rate": 2.6202630348146323e-06,
586
- "loss": 0.9571,
587
- "step": 270
588
- },
589
- {
590
- "epoch": 5.5,
591
- "grad_norm": 2.321368932723999,
592
- "learning_rate": 2.4405747545519966e-06,
593
- "loss": 0.7722,
594
- "step": 275
595
- },
596
- {
597
- "epoch": 5.6,
598
- "grad_norm": 0.3462996482849121,
599
- "learning_rate": 2.265259209387867e-06,
600
- "loss": 0.6575,
601
- "step": 280
602
- },
603
- {
604
- "epoch": 5.7,
605
- "grad_norm": 0.7634517550468445,
606
- "learning_rate": 2.094615922990309e-06,
607
- "loss": 0.7036,
608
- "step": 285
609
- },
610
- {
611
- "epoch": 5.8,
612
- "grad_norm": 0.33972227573394775,
613
- "learning_rate": 1.928936436551661e-06,
614
- "loss": 0.6193,
615
- "step": 290
616
- },
617
- {
618
- "epoch": 5.9,
619
- "grad_norm": 0.863368570804596,
620
- "learning_rate": 1.7685038106952952e-06,
621
- "loss": 0.7429,
622
- "step": 295
623
- },
624
- {
625
- "epoch": 6.0,
626
- "grad_norm": 0.8421957492828369,
627
- "learning_rate": 1.6135921418712959e-06,
628
- "loss": 0.6177,
629
- "step": 300
630
- },
631
- {
632
- "epoch": 6.0,
633
- "eval_loss": 0.5471388697624207,
634
- "eval_runtime": 52.1971,
635
- "eval_samples_per_second": 3.832,
636
- "eval_steps_per_second": 0.479,
637
- "step": 300
638
- },
639
- {
640
- "epoch": 6.1,
641
- "grad_norm": 0.42387768626213074,
642
- "learning_rate": 1.4644660940672628e-06,
643
- "loss": 0.7107,
644
- "step": 305
645
- },
646
- {
647
- "epoch": 6.2,
648
- "grad_norm": 0.40212640166282654,
649
- "learning_rate": 1.321380446634342e-06,
650
- "loss": 0.6465,
651
- "step": 310
652
- },
653
- {
654
- "epoch": 6.3,
655
- "grad_norm": 0.38275906443595886,
656
- "learning_rate": 1.1845796590009684e-06,
657
- "loss": 0.7838,
658
- "step": 315
659
- },
660
- {
661
- "epoch": 6.4,
662
- "grad_norm": 0.517331063747406,
663
- "learning_rate": 1.0542974530180327e-06,
664
- "loss": 0.6743,
665
- "step": 320
666
- },
667
- {
668
- "epoch": 6.5,
669
- "grad_norm": 0.4819343388080597,
670
- "learning_rate": 9.307564136490255e-07,
671
- "loss": 0.6544,
672
- "step": 325
673
- },
674
- {
675
- "epoch": 6.6,
676
- "grad_norm": 0.5918112397193909,
677
- "learning_rate": 8.141676086873574e-07,
678
- "loss": 0.6178,
679
- "step": 330
680
- },
681
- {
682
- "epoch": 6.7,
683
- "grad_norm": 0.3847924768924713,
684
- "learning_rate": 7.047302281505735e-07,
685
- "loss": 0.5631,
686
- "step": 335
687
- },
688
- {
689
- "epoch": 6.8,
690
- "grad_norm": 0.43630239367485046,
691
- "learning_rate": 6.026312439675553e-07,
692
- "loss": 0.5709,
693
- "step": 340
694
- },
695
- {
696
- "epoch": 6.9,
697
- "grad_norm": 0.6350282430648804,
698
- "learning_rate": 5.080450905401057e-07,
699
- "loss": 0.7065,
700
- "step": 345
701
- },
702
- {
703
- "epoch": 7.0,
704
- "grad_norm": 0.5881220102310181,
705
- "learning_rate": 4.211333667247125e-07,
706
- "loss": 0.6102,
707
- "step": 350
708
- },
709
- {
710
- "epoch": 7.0,
711
- "eval_loss": 0.5426855683326721,
712
- "eval_runtime": 52.2072,
713
  "eval_samples_per_second": 3.831,
714
  "eval_steps_per_second": 0.479,
715
- "step": 350
716
- },
717
- {
718
- "epoch": 7.1,
719
- "grad_norm": 0.5317939519882202,
720
- "learning_rate": 3.420445597436056e-07,
721
- "loss": 0.6632,
722
- "step": 355
723
- },
724
- {
725
- "epoch": 7.2,
726
- "grad_norm": 0.5702535510063171,
727
- "learning_rate": 2.7091379149682683e-07,
728
- "loss": 0.5992,
729
- "step": 360
730
- },
731
- {
732
- "epoch": 7.3,
733
- "grad_norm": 0.6872391104698181,
734
- "learning_rate": 2.0786258770873647e-07,
735
- "loss": 0.6422,
736
- "step": 365
737
- },
738
- {
739
- "epoch": 7.4,
740
- "grad_norm": 0.32829490303993225,
741
- "learning_rate": 1.5299867030334815e-07,
742
- "loss": 0.6811,
743
- "step": 370
744
- },
745
- {
746
- "epoch": 7.5,
747
- "grad_norm": 0.5375828742980957,
748
- "learning_rate": 1.0641577336322761e-07,
749
- "loss": 0.8423,
750
- "step": 375
751
- },
752
- {
753
- "epoch": 7.6,
754
- "grad_norm": 0.6306584477424622,
755
- "learning_rate": 6.819348298638839e-08,
756
- "loss": 0.5899,
757
- "step": 380
758
- },
759
- {
760
- "epoch": 7.7,
761
- "grad_norm": 0.44418570399284363,
762
- "learning_rate": 3.839710131477492e-08,
763
- "loss": 0.6571,
764
- "step": 385
765
- },
766
- {
767
- "epoch": 7.8,
768
- "grad_norm": 0.49700650572776794,
769
- "learning_rate": 1.7077534966650767e-08,
770
- "loss": 0.6561,
771
- "step": 390
772
- },
773
- {
774
- "epoch": 7.9,
775
- "grad_norm": 0.3311610519886017,
776
- "learning_rate": 4.2712080634949024e-09,
777
- "loss": 0.6226,
778
- "step": 395
779
- },
780
- {
781
- "epoch": 8.0,
782
- "grad_norm": 1.5899903774261475,
783
- "learning_rate": 0.0,
784
- "loss": 0.6762,
785
- "step": 400
786
- },
787
- {
788
- "epoch": 8.0,
789
- "eval_loss": 0.5424160957336426,
790
- "eval_runtime": 52.193,
791
- "eval_samples_per_second": 3.832,
792
- "eval_steps_per_second": 0.479,
793
- "step": 400
794
  }
795
  ],
796
  "logging_steps": 5,
797
- "max_steps": 400,
798
  "num_input_tokens_seen": 0,
799
- "num_train_epochs": 8,
800
  "save_steps": 500,
801
  "stateful_callbacks": {
802
  "TrainerControl": {
@@ -805,12 +411,12 @@
805
  "should_evaluate": false,
806
  "should_log": false,
807
  "should_save": true,
808
- "should_training_stop": true
809
  },
810
  "attributes": {}
811
  }
812
  },
813
- "total_flos": 1.4231605134807245e+17,
814
  "train_batch_size": 2,
815
  "trial_name": null,
816
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.4687739610671997,
3
+ "best_model_checkpoint": "//outputs/task7_microsoft/Phi-3.5-mini-instruct/checkpoint-250",
4
+ "epoch": 5.0,
5
  "eval_steps": 500,
6
+ "global_step": 250,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.1,
13
+ "grad_norm": 1.4559898376464844,
14
+ "learning_rate": 3.75e-06,
15
+ "loss": 1.8397,
16
  "step": 5
17
  },
18
  {
19
+ "epoch": 0.2,
20
+ "grad_norm": 1.011980414390564,
21
+ "learning_rate": 7.5e-06,
22
+ "loss": 1.4061,
 
 
 
 
 
 
 
 
23
  "step": 10
24
  },
25
  {
26
+ "epoch": 0.3,
27
+ "grad_norm": 0.8619025945663452,
28
+ "learning_rate": 1.125e-05,
29
+ "loss": 1.037,
30
  "step": 15
31
  },
32
  {
33
+ "epoch": 0.4,
34
+ "grad_norm": 1.4890649318695068,
35
+ "learning_rate": 1.5e-05,
36
+ "loss": 1.2559,
 
 
 
 
 
 
 
 
37
  "step": 20
38
  },
39
  {
40
+ "epoch": 0.5,
41
+ "grad_norm": 0.3758047819137573,
42
+ "learning_rate": 1.8750000000000002e-05,
43
+ "loss": 1.0541,
44
  "step": 25
45
  },
46
  {
47
+ "epoch": 0.6,
48
+ "grad_norm": 1.0797535181045532,
49
+ "learning_rate": 2.25e-05,
50
+ "loss": 1.2359,
 
 
 
 
 
 
 
 
51
  "step": 30
52
  },
53
  {
54
+ "epoch": 0.7,
55
+ "grad_norm": 0.32953447103500366,
56
+ "learning_rate": 2.625e-05,
57
+ "loss": 0.8877,
58
  "step": 35
59
  },
60
  {
61
+ "epoch": 0.8,
62
+ "grad_norm": 0.319231241941452,
63
+ "learning_rate": 3e-05,
64
+ "loss": 1.0191,
 
 
 
 
 
 
 
 
65
  "step": 40
66
  },
67
  {
68
+ "epoch": 0.9,
69
+ "grad_norm": 0.319320410490036,
70
+ "learning_rate": 2.9996796251818968e-05,
71
+ "loss": 0.8399,
72
  "step": 45
73
  },
74
  {
75
+ "epoch": 1.0,
76
+ "grad_norm": 1.2043859958648682,
77
+ "learning_rate": 2.9987186375809513e-05,
78
+ "loss": 0.9834,
 
 
 
 
 
 
 
 
79
  "step": 50
80
  },
81
  {
82
+ "epoch": 1.0,
83
+ "eval_loss": 0.6599301099777222,
84
+ "eval_runtime": 52.5278,
85
+ "eval_samples_per_second": 3.808,
86
+ "eval_steps_per_second": 0.476,
87
+ "step": 50
88
  },
89
  {
90
+ "epoch": 1.1,
91
+ "grad_norm": 0.39200976490974426,
92
+ "learning_rate": 2.997117447698802e-05,
93
+ "loss": 0.8063,
94
  "step": 55
95
  },
96
  {
97
+ "epoch": 1.2,
98
+ "grad_norm": 0.42485809326171875,
99
+ "learning_rate": 2.994876739510005e-05,
100
+ "loss": 0.5906,
101
  "step": 60
102
  },
103
  {
104
+ "epoch": 1.3,
105
+ "grad_norm": 0.5581662654876709,
106
+ "learning_rate": 2.9919974701698638e-05,
107
+ "loss": 0.7749,
 
 
 
 
 
 
 
 
108
  "step": 65
109
  },
110
  {
111
+ "epoch": 1.4,
112
+ "grad_norm": 0.8188683390617371,
113
+ "learning_rate": 2.9884808696055675e-05,
114
+ "loss": 0.7623,
115
  "step": 70
116
  },
117
  {
118
+ "epoch": 1.5,
119
+ "grad_norm": 0.4976309537887573,
120
+ "learning_rate": 2.984328439990804e-05,
121
+ "loss": 0.7587,
 
 
 
 
 
 
 
 
122
  "step": 75
123
  },
124
  {
125
+ "epoch": 1.6,
126
+ "grad_norm": 0.515602171421051,
127
+ "learning_rate": 2.9795419551040836e-05,
128
+ "loss": 0.6395,
129
  "step": 80
130
  },
131
  {
132
+ "epoch": 1.7,
133
+ "grad_norm": 0.8577103018760681,
134
+ "learning_rate": 2.9741234595710393e-05,
135
+ "loss": 0.5315,
 
 
 
 
 
 
 
 
136
  "step": 85
137
  },
138
  {
139
+ "epoch": 1.8,
140
+ "grad_norm": 0.6678707599639893,
141
+ "learning_rate": 2.968075267991032e-05,
142
+ "loss": 0.6739,
143
  "step": 90
144
  },
145
  {
146
+ "epoch": 1.9,
147
+ "grad_norm": 0.3638306260108948,
148
+ "learning_rate": 2.9613999639484314e-05,
149
+ "loss": 0.6927,
 
 
 
 
 
 
 
 
150
  "step": 95
151
  },
152
  {
153
+ "epoch": 2.0,
154
+ "grad_norm": 0.8823966383934021,
155
+ "learning_rate": 2.9541003989089956e-05,
156
+ "loss": 0.6094,
157
+ "step": 100
 
158
  },
159
  {
160
+ "epoch": 2.0,
161
+ "eval_loss": 0.5690982341766357,
162
+ "eval_runtime": 52.1876,
163
+ "eval_samples_per_second": 3.832,
164
+ "eval_steps_per_second": 0.479,
165
  "step": 100
166
  },
167
  {
168
+ "epoch": 2.1,
169
+ "grad_norm": 0.5922141671180725,
170
+ "learning_rate": 2.9461796910018204e-05,
171
+ "loss": 0.6031,
172
  "step": 105
173
  },
174
  {
175
+ "epoch": 2.2,
176
+ "grad_norm": 0.5325513482093811,
177
+ "learning_rate": 2.9376412236873792e-05,
178
+ "loss": 0.493,
 
 
 
 
 
 
 
 
179
  "step": 110
180
  },
181
  {
182
+ "epoch": 2.3,
183
+ "grad_norm": 1.020575761795044,
184
+ "learning_rate": 2.928488644312222e-05,
185
+ "loss": 0.4483,
186
  "step": 115
187
  },
188
  {
189
+ "epoch": 2.4,
190
+ "grad_norm": 0.9036449790000916,
191
+ "learning_rate": 2.9187258625509518e-05,
192
+ "loss": 0.5766,
 
 
 
 
 
 
 
 
193
  "step": 120
194
  },
195
  {
196
+ "epoch": 2.5,
197
+ "grad_norm": 1.0615090131759644,
198
+ "learning_rate": 2.9083570487361445e-05,
199
+ "loss": 0.4717,
200
  "step": 125
201
  },
202
  {
203
+ "epoch": 2.6,
204
+ "grad_norm": 0.638048529624939,
205
+ "learning_rate": 2.8973866320769186e-05,
206
+ "loss": 0.3577,
 
 
 
 
 
 
 
 
207
  "step": 130
208
  },
209
  {
210
+ "epoch": 2.7,
211
+ "grad_norm": 1.1508071422576904,
212
+ "learning_rate": 2.8858192987669303e-05,
213
+ "loss": 0.5615,
214
  "step": 135
215
  },
216
  {
217
+ "epoch": 2.8,
218
+ "grad_norm": 0.6334187984466553,
219
+ "learning_rate": 2.873659989982586e-05,
220
+ "loss": 0.3704,
 
 
 
 
 
 
 
 
221
  "step": 140
222
  },
223
  {
224
+ "epoch": 2.9,
225
+ "grad_norm": 0.53675377368927,
226
+ "learning_rate": 2.86091389977234e-05,
227
+ "loss": 0.3623,
 
 
 
 
 
 
 
 
228
  "step": 145
229
  },
230
  {
231
+ "epoch": 3.0,
232
+ "grad_norm": 0.5917493104934692,
233
+ "learning_rate": 2.8475864728379682e-05,
234
+ "loss": 0.3345,
235
  "step": 150
236
  },
237
  {
238
+ "epoch": 3.0,
239
+ "eval_loss": 0.5363968014717102,
240
+ "eval_runtime": 52.2028,
241
+ "eval_samples_per_second": 3.831,
242
+ "eval_steps_per_second": 0.479,
243
+ "step": 150
244
  },
245
  {
246
+ "epoch": 3.1,
247
+ "grad_norm": 1.654146671295166,
248
+ "learning_rate": 2.8336834022087776e-05,
249
+ "loss": 0.3779,
250
  "step": 155
251
  },
252
  {
253
+ "epoch": 3.2,
254
+ "grad_norm": 0.9066053032875061,
255
+ "learning_rate": 2.8192106268097336e-05,
256
+ "loss": 0.2994,
257
  "step": 160
258
  },
259
  {
260
+ "epoch": 3.3,
261
+ "grad_norm": 0.5281007289886475,
262
+ "learning_rate": 2.8041743289245503e-05,
263
+ "loss": 0.4545,
 
 
 
 
 
 
 
 
264
  "step": 165
265
  },
266
  {
267
+ "epoch": 3.4,
268
+ "grad_norm": 0.8571799397468567,
269
+ "learning_rate": 2.788580931554828e-05,
270
+ "loss": 0.3399,
271
  "step": 170
272
  },
273
  {
274
+ "epoch": 3.5,
275
+ "grad_norm": 0.43631649017333984,
276
+ "learning_rate": 2.7724370956763605e-05,
277
+ "loss": 0.2589,
 
 
 
 
 
 
 
 
278
  "step": 175
279
  },
280
  {
281
+ "epoch": 3.6,
282
+ "grad_norm": 0.7908278107643127,
283
+ "learning_rate": 2.7557497173937928e-05,
284
+ "loss": 0.3241,
285
  "step": 180
286
  },
287
  {
288
+ "epoch": 3.7,
289
+ "grad_norm": 1.0415078401565552,
290
+ "learning_rate": 2.7385259249948338e-05,
291
+ "loss": 0.3205,
 
 
 
 
 
 
 
 
292
  "step": 185
293
  },
294
  {
295
+ "epoch": 3.8,
296
+ "grad_norm": 0.5231990218162537,
297
+ "learning_rate": 2.7207730759052925e-05,
298
+ "loss": 0.1806,
 
 
 
 
 
 
 
 
299
  "step": 190
300
  },
301
  {
302
+ "epoch": 3.9,
303
+ "grad_norm": 0.48716872930526733,
304
+ "learning_rate": 2.7024987535462327e-05,
305
+ "loss": 0.172,
306
  "step": 195
307
  },
308
  {
309
+ "epoch": 4.0,
310
+ "grad_norm": 0.6646760702133179,
311
+ "learning_rate": 2.6837107640945904e-05,
312
+ "loss": 0.2291,
 
 
 
 
 
 
 
 
313
  "step": 200
314
  },
315
  {
316
+ "epoch": 4.0,
317
+ "eval_loss": 0.5222796201705933,
318
+ "eval_runtime": 52.1967,
319
+ "eval_samples_per_second": 3.832,
320
+ "eval_steps_per_second": 0.479,
321
  "step": 200
322
  },
 
 
 
 
 
 
 
 
323
  {
324
  "epoch": 4.1,
325
+ "grad_norm": 1.3394831418991089,
326
+ "learning_rate": 2.6644171331486363e-05,
327
+ "loss": 0.2097,
328
  "step": 205
329
  },
330
  {
331
  "epoch": 4.2,
332
+ "grad_norm": 0.6753952503204346,
333
+ "learning_rate": 2.6446261022997098e-05,
334
+ "loss": 0.2552,
335
  "step": 210
336
  },
337
  {
338
  "epoch": 4.3,
339
+ "grad_norm": 0.5856276750564575,
340
+ "learning_rate": 2.6243461256116892e-05,
341
+ "loss": 0.1606,
342
  "step": 215
343
  },
344
  {
345
  "epoch": 4.4,
346
+ "grad_norm": 0.695767879486084,
347
+ "learning_rate": 2.6035858660096975e-05,
348
+ "loss": 0.2958,
349
  "step": 220
350
  },
351
  {
352
  "epoch": 4.5,
353
+ "grad_norm": 0.6565276980400085,
354
+ "learning_rate": 2.5823541915795932e-05,
355
+ "loss": 0.1491,
356
  "step": 225
357
  },
358
  {
359
  "epoch": 4.6,
360
+ "grad_norm": 0.497454971075058,
361
+ "learning_rate": 2.5606601717798212e-05,
362
+ "loss": 0.1945,
363
  "step": 230
364
  },
365
  {
366
  "epoch": 4.7,
367
+ "grad_norm": 0.7928630709648132,
368
+ "learning_rate": 2.5385130735672442e-05,
369
+ "loss": 0.1197,
370
  "step": 235
371
  },
372
  {
373
  "epoch": 4.8,
374
+ "grad_norm": 0.9403858780860901,
375
+ "learning_rate": 2.5159223574386117e-05,
376
+ "loss": 0.2448,
377
  "step": 240
378
  },
379
  {
380
  "epoch": 4.9,
381
+ "grad_norm": 0.41166239976882935,
382
+ "learning_rate": 2.49289767338935e-05,
383
+ "loss": 0.2321,
384
  "step": 245
385
  },
386
  {
387
  "epoch": 5.0,
388
+ "grad_norm": 0.4782765805721283,
389
+ "learning_rate": 2.469448856791411e-05,
390
+ "loss": 0.1126,
391
  "step": 250
392
  },
393
  {
394
  "epoch": 5.0,
395
+ "eval_loss": 0.4687739610671997,
396
+ "eval_runtime": 52.1992,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
397
  "eval_samples_per_second": 3.831,
398
  "eval_steps_per_second": 0.479,
399
+ "step": 250
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
400
  }
401
  ],
402
  "logging_steps": 5,
403
+ "max_steps": 800,
404
  "num_input_tokens_seen": 0,
405
+ "num_train_epochs": 16,
406
  "save_steps": 500,
407
  "stateful_callbacks": {
408
  "TrainerControl": {
 
411
  "should_evaluate": false,
412
  "should_log": false,
413
  "should_save": true,
414
+ "should_training_stop": false
415
  },
416
  "attributes": {}
417
  }
418
  },
419
+ "total_flos": 4.646855528216986e+16,
420
  "train_batch_size": 2,
421
  "trial_name": null,
422
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4d7ef1ca84158a115fb2ab949b3f781c814c5ef428f591fc8d6d01108daabb83
3
  size 5624
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:44a91374d47e061d44848107bfc25ebd1ed4e3cf32bfc6349d577cac835076d2
3
  size 5624