|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.99695843190267, |
|
"eval_steps": 500, |
|
"global_step": 1107, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.027036160865157147, |
|
"grad_norm": 3.523985384534832, |
|
"learning_rate": 5e-06, |
|
"loss": 0.8602, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.054072321730314295, |
|
"grad_norm": 2.330230157876199, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6873, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.08110848259547145, |
|
"grad_norm": 2.692866742411895, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6489, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.10814464346062859, |
|
"grad_norm": 1.742013451785388, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6363, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.13518080432578575, |
|
"grad_norm": 1.9825861616522564, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6212, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.1622169651909429, |
|
"grad_norm": 1.963028689906959, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6158, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.18925312605610004, |
|
"grad_norm": 1.5400134410581494, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6126, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.21628928692125718, |
|
"grad_norm": 1.7957880103549582, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6046, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.24332544778641432, |
|
"grad_norm": 2.0825972162611626, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6015, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.2703616086515715, |
|
"grad_norm": 1.5684743614874819, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6044, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.29739776951672864, |
|
"grad_norm": 1.5536576895608207, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6009, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.3244339303818858, |
|
"grad_norm": 1.9211273900718062, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5959, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.3514700912470429, |
|
"grad_norm": 1.857918426411918, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5981, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.37850625211220007, |
|
"grad_norm": 2.480111887309551, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5972, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.4055424129773572, |
|
"grad_norm": 1.9435865546624187, |
|
"learning_rate": 5e-06, |
|
"loss": 0.589, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.43257857384251436, |
|
"grad_norm": 2.256806266648931, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5905, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.4596147347076715, |
|
"grad_norm": 1.9395141957859183, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5907, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.48665089557282865, |
|
"grad_norm": 1.70470436825217, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5891, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.5136870564379858, |
|
"grad_norm": 1.2803454947922437, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5864, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.540723217303143, |
|
"grad_norm": 1.6583430834053514, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5845, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.5677593781683001, |
|
"grad_norm": 1.6236160787848246, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5873, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.5947955390334573, |
|
"grad_norm": 1.4448675180275712, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5829, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.6218316998986144, |
|
"grad_norm": 1.5221126449990794, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5844, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.6488678607637716, |
|
"grad_norm": 1.3401517373172736, |
|
"learning_rate": 5e-06, |
|
"loss": 0.581, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.6759040216289287, |
|
"grad_norm": 3.0974221570456697, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5751, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.7029401824940859, |
|
"grad_norm": 1.7078786998431879, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5786, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.729976343359243, |
|
"grad_norm": 1.3452565169314334, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5802, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.7570125042244001, |
|
"grad_norm": 1.2754169661691006, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5753, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.7840486650895573, |
|
"grad_norm": 1.561813747345879, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5756, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.8110848259547144, |
|
"grad_norm": 1.4562454979417123, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5754, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.8381209868198716, |
|
"grad_norm": 1.3655414659589415, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5731, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.8651571476850287, |
|
"grad_norm": 1.57353294016275, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5721, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.8921933085501859, |
|
"grad_norm": 1.9418137004471465, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5736, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.919229469415343, |
|
"grad_norm": 1.6960343893725316, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5806, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.9462656302805001, |
|
"grad_norm": 2.406507142621058, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5714, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.9733017911456573, |
|
"grad_norm": 1.9230224466359063, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5704, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.9976343359242987, |
|
"eval_loss": 0.07099956274032593, |
|
"eval_runtime": 383.3884, |
|
"eval_samples_per_second": 25.992, |
|
"eval_steps_per_second": 0.407, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 1.0023656640757013, |
|
"grad_norm": 3.4559216993428326, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5655, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 1.0294018249408583, |
|
"grad_norm": 2.6945339527227783, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4803, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 1.0564379858060156, |
|
"grad_norm": 2.274860352040799, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4751, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 1.0834741466711728, |
|
"grad_norm": 1.9418080331159586, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4775, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.1105103075363298, |
|
"grad_norm": 1.3221563752390588, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4751, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 1.1375464684014869, |
|
"grad_norm": 1.6270959849174909, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4773, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 1.1645826292666441, |
|
"grad_norm": 1.723481598695817, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4861, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 1.1916187901318014, |
|
"grad_norm": 1.6689122667194243, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4771, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 1.2186549509969584, |
|
"grad_norm": 1.8852129756960698, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4817, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 1.2456911118621157, |
|
"grad_norm": 1.5781803973122046, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4832, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 1.2727272727272727, |
|
"grad_norm": 1.8932565449503365, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4855, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 1.29976343359243, |
|
"grad_norm": 1.5040934591134398, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4816, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 1.326799594457587, |
|
"grad_norm": 1.415624345433887, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4817, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 1.3538357553227442, |
|
"grad_norm": 1.4726186128545236, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4859, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.3808719161879013, |
|
"grad_norm": 1.371837855586058, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4862, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 1.4079080770530585, |
|
"grad_norm": 1.625255953470612, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4899, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 1.4349442379182156, |
|
"grad_norm": 1.4470657655644708, |
|
"learning_rate": 5e-06, |
|
"loss": 0.484, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 1.4619803987833728, |
|
"grad_norm": 1.4168708504506906, |
|
"learning_rate": 5e-06, |
|
"loss": 0.489, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 1.4890165596485299, |
|
"grad_norm": 1.2953354115079219, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4876, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 1.5160527205136871, |
|
"grad_norm": 1.2905587316106748, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4898, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 1.5430888813788441, |
|
"grad_norm": 1.874461693755812, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4852, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 1.5701250422440014, |
|
"grad_norm": 1.537393515627057, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4874, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 1.5971612031091587, |
|
"grad_norm": 1.7234212392856714, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4911, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 1.6241973639743157, |
|
"grad_norm": 1.4569000028167551, |
|
"learning_rate": 5e-06, |
|
"loss": 0.487, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.6512335248394727, |
|
"grad_norm": 1.4876997193606485, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4854, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 1.67826968570463, |
|
"grad_norm": 1.4853131850089583, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4902, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 1.7053058465697872, |
|
"grad_norm": 1.4854501350323384, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4952, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 1.7323420074349443, |
|
"grad_norm": 1.613201730070182, |
|
"learning_rate": 5e-06, |
|
"loss": 0.493, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 1.7593781683001013, |
|
"grad_norm": 1.3411867074544503, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4904, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 1.7864143291652586, |
|
"grad_norm": 1.3453881021060534, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4879, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 1.8134504900304158, |
|
"grad_norm": 1.4275860747925428, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4904, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 1.8404866508955728, |
|
"grad_norm": 1.3712075307477265, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4935, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 1.8675228117607299, |
|
"grad_norm": 1.2986482215538881, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4917, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 1.8945589726258871, |
|
"grad_norm": 1.2770662158812232, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4943, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.9215951334910444, |
|
"grad_norm": 1.2971456378708284, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4994, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 1.9486312943562014, |
|
"grad_norm": 1.2953045522832038, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4927, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 1.9756674552213584, |
|
"grad_norm": 1.2187798944947157, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4886, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 1.9972963839134843, |
|
"eval_loss": 0.07174264639616013, |
|
"eval_runtime": 383.5066, |
|
"eval_samples_per_second": 25.984, |
|
"eval_steps_per_second": 0.407, |
|
"step": 738 |
|
}, |
|
{ |
|
"epoch": 2.0047313281514025, |
|
"grad_norm": 3.2606644157882583, |
|
"learning_rate": 5e-06, |
|
"loss": 0.477, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 2.0317674890165596, |
|
"grad_norm": 1.997812835463083, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3854, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 2.0588036498817166, |
|
"grad_norm": 1.9996540417166961, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3864, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 2.085839810746874, |
|
"grad_norm": 1.8618681796717789, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3812, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 2.112875971612031, |
|
"grad_norm": 1.4915524529602713, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3811, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 2.139912132477188, |
|
"grad_norm": 1.5723106961760522, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3808, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 2.1669482933423456, |
|
"grad_norm": 1.714466030885258, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3876, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 2.1939844542075027, |
|
"grad_norm": 1.7511244660613634, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3862, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 2.2210206150726597, |
|
"grad_norm": 1.5027273200313567, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3873, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 2.2480567759378167, |
|
"grad_norm": 1.6128980496405356, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3896, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 2.2750929368029738, |
|
"grad_norm": 1.561276115932866, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3909, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 2.3021290976681312, |
|
"grad_norm": 1.7787495204510098, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3866, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 2.3291652585332883, |
|
"grad_norm": 1.5802735443144562, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3896, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 2.3562014193984453, |
|
"grad_norm": 1.5469387511116455, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3948, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 2.3832375802636028, |
|
"grad_norm": 1.6780934080456225, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3937, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 2.41027374112876, |
|
"grad_norm": 1.5538724349535749, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3933, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 2.437309901993917, |
|
"grad_norm": 1.6919213854745063, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3927, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 2.464346062859074, |
|
"grad_norm": 1.6467399942324181, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3916, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 2.4913822237242313, |
|
"grad_norm": 1.5494538660407549, |
|
"learning_rate": 5e-06, |
|
"loss": 0.393, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 2.5184183845893884, |
|
"grad_norm": 1.6061582119048823, |
|
"learning_rate": 5e-06, |
|
"loss": 0.392, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 2.5454545454545454, |
|
"grad_norm": 1.5497867717979459, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3996, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 2.5724907063197024, |
|
"grad_norm": 1.8696325727031804, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3949, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 2.59952686718486, |
|
"grad_norm": 1.6545052949984496, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4002, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 2.626563028050017, |
|
"grad_norm": 1.5124642558655546, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3988, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 2.653599188915174, |
|
"grad_norm": 1.559510310440385, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4024, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 2.6806353497803315, |
|
"grad_norm": 1.676052966396514, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4007, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 2.7076715106454885, |
|
"grad_norm": 2.1046446839691, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3952, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 2.7347076715106455, |
|
"grad_norm": 2.0451177849286464, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4021, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 2.7617438323758026, |
|
"grad_norm": 1.6691707308120913, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4046, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 2.7887799932409596, |
|
"grad_norm": 1.4465197945931527, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4022, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 2.815816154106117, |
|
"grad_norm": 1.4351190888202614, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4014, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 2.842852314971274, |
|
"grad_norm": 1.485706611946695, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4034, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 2.869888475836431, |
|
"grad_norm": 1.4860832182563435, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3996, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 2.8969246367015886, |
|
"grad_norm": 1.5630718102725172, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4077, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 2.9239607975667457, |
|
"grad_norm": 1.4737818975824717, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4027, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 2.9509969584319027, |
|
"grad_norm": 1.5487795543993597, |
|
"learning_rate": 5e-06, |
|
"loss": 0.406, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 2.9780331192970597, |
|
"grad_norm": 1.3628828414709748, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4047, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 2.99695843190267, |
|
"eval_loss": 0.07649385929107666, |
|
"eval_runtime": 385.8472, |
|
"eval_samples_per_second": 25.826, |
|
"eval_steps_per_second": 0.404, |
|
"step": 1107 |
|
}, |
|
{ |
|
"epoch": 2.99695843190267, |
|
"step": 1107, |
|
"total_flos": 1854056851046400.0, |
|
"train_loss": 0.49424083467636865, |
|
"train_runtime": 64065.8676, |
|
"train_samples_per_second": 8.866, |
|
"train_steps_per_second": 0.017 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 1107, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1854056851046400.0, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|