{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.99695843190267, "eval_steps": 500, "global_step": 1107, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.027036160865157147, "grad_norm": 11.797046464039566, "learning_rate": 2.9940119760479047e-07, "loss": 0.9096, "step": 10 }, { "epoch": 0.054072321730314295, "grad_norm": 6.022652618383022, "learning_rate": 5.988023952095809e-07, "loss": 0.815, "step": 20 }, { "epoch": 0.08110848259547145, "grad_norm": 2.383442971376186, "learning_rate": 8.982035928143713e-07, "loss": 0.7311, "step": 30 }, { "epoch": 0.10814464346062859, "grad_norm": 1.6738069912398006, "learning_rate": 1.1976047904191619e-06, "loss": 0.6913, "step": 40 }, { "epoch": 0.13518080432578575, "grad_norm": 2.1314550554105356, "learning_rate": 1.4970059880239521e-06, "loss": 0.6597, "step": 50 }, { "epoch": 0.1622169651909429, "grad_norm": 1.736158500887477, "learning_rate": 1.7964071856287426e-06, "loss": 0.6447, "step": 60 }, { "epoch": 0.18925312605610004, "grad_norm": 2.2815429009907864, "learning_rate": 2.095808383233533e-06, "loss": 0.6348, "step": 70 }, { "epoch": 0.21628928692125718, "grad_norm": 1.706495414515813, "learning_rate": 2.3952095808383237e-06, "loss": 0.6232, "step": 80 }, { "epoch": 0.24332544778641432, "grad_norm": 2.248832183747629, "learning_rate": 2.694610778443114e-06, "loss": 0.6175, "step": 90 }, { "epoch": 0.2703616086515715, "grad_norm": 2.3308669289239763, "learning_rate": 2.9940119760479042e-06, "loss": 0.6183, "step": 100 }, { "epoch": 0.29739776951672864, "grad_norm": 2.7217593935560473, "learning_rate": 3.2934131736526947e-06, "loss": 0.614, "step": 110 }, { "epoch": 0.3244339303818858, "grad_norm": 1.681531266986669, "learning_rate": 3.592814371257485e-06, "loss": 0.6088, "step": 120 }, { "epoch": 0.3514700912470429, "grad_norm": 2.313372317991298, "learning_rate": 3.892215568862276e-06, "loss": 0.6103, "step": 130 }, { "epoch": 0.37850625211220007, "grad_norm": 1.773649948117426, "learning_rate": 4.191616766467066e-06, "loss": 0.6091, "step": 140 }, { "epoch": 0.4055424129773572, "grad_norm": 2.7549411259723233, "learning_rate": 4.4910179640718566e-06, "loss": 0.6, "step": 150 }, { "epoch": 0.43257857384251436, "grad_norm": 2.3158360051535043, "learning_rate": 4.7904191616766475e-06, "loss": 0.6009, "step": 160 }, { "epoch": 0.4596147347076715, "grad_norm": 2.7122550721067333, "learning_rate": 4.999874341195841e-06, "loss": 0.6016, "step": 170 }, { "epoch": 0.48665089557282865, "grad_norm": 2.114842072958422, "learning_rate": 4.997640758293996e-06, "loss": 0.6007, "step": 180 }, { "epoch": 0.5136870564379858, "grad_norm": 2.6143309965853376, "learning_rate": 4.9926176290157e-06, "loss": 0.5978, "step": 190 }, { "epoch": 0.540723217303143, "grad_norm": 2.7700517825407145, "learning_rate": 4.9848105635561055e-06, "loss": 0.5941, "step": 200 }, { "epoch": 0.5677593781683001, "grad_norm": 2.3412428988651204, "learning_rate": 4.974228281412239e-06, "loss": 0.5947, "step": 210 }, { "epoch": 0.5947955390334573, "grad_norm": 1.7184040494161494, "learning_rate": 4.960882601644431e-06, "loss": 0.5887, "step": 220 }, { "epoch": 0.6218316998986144, "grad_norm": 2.1489528751562204, "learning_rate": 4.9447884296759364e-06, "loss": 0.5912, "step": 230 }, { "epoch": 0.6488678607637716, "grad_norm": 1.9828433047198417, "learning_rate": 4.925963740645471e-06, "loss": 0.5878, "step": 240 }, { "epoch": 0.6759040216289287, "grad_norm": 1.8322010356109428, "learning_rate": 
4.904429559331279e-06, "loss": 0.5809, "step": 250 }, { "epoch": 0.7029401824940859, "grad_norm": 1.8050576051972331, "learning_rate": 4.8802099366691305e-06, "loss": 0.5835, "step": 260 }, { "epoch": 0.729976343359243, "grad_norm": 2.095519869838614, "learning_rate": 4.853331922890492e-06, "loss": 0.5867, "step": 270 }, { "epoch": 0.7570125042244001, "grad_norm": 1.6036862776888092, "learning_rate": 4.8238255373108705e-06, "loss": 0.5819, "step": 280 }, { "epoch": 0.7840486650895573, "grad_norm": 1.6319068896943107, "learning_rate": 4.79172373480206e-06, "loss": 0.5818, "step": 290 }, { "epoch": 0.8110848259547144, "grad_norm": 1.7290935070537745, "learning_rate": 4.757062368985748e-06, "loss": 0.5817, "step": 300 }, { "epoch": 0.8381209868198716, "grad_norm": 1.7470779413165216, "learning_rate": 4.7198801521895985e-06, "loss": 0.5792, "step": 310 }, { "epoch": 0.8651571476850287, "grad_norm": 1.587777671242289, "learning_rate": 4.680218612210509e-06, "loss": 0.5776, "step": 320 }, { "epoch": 0.8921933085501859, "grad_norm": 1.3317497751452183, "learning_rate": 4.638122045933353e-06, "loss": 0.5781, "step": 330 }, { "epoch": 0.919229469415343, "grad_norm": 1.4447723007472768, "learning_rate": 4.593637469857015e-06, "loss": 0.5852, "step": 340 }, { "epoch": 0.9462656302805001, "grad_norm": 2.8250562829782124, "learning_rate": 4.546814567582945e-06, "loss": 0.5775, "step": 350 }, { "epoch": 0.9733017911456573, "grad_norm": 2.456765823638226, "learning_rate": 4.497705634324912e-06, "loss": 0.5777, "step": 360 }, { "epoch": 0.9976343359242987, "eval_loss": 0.07170024514198303, "eval_runtime": 250.9721, "eval_samples_per_second": 39.706, "eval_steps_per_second": 0.622, "step": 369 }, { "epoch": 1.0023656640757013, "grad_norm": 3.626809923279979, "learning_rate": 4.446365518501915e-06, "loss": 0.5727, "step": 370 }, { "epoch": 1.0294018249408583, "grad_norm": 3.160769194185762, "learning_rate": 4.392851560479492e-06, "loss": 0.5049, "step": 380 }, { "epoch": 1.0564379858060156, "grad_norm": 2.94467202352765, "learning_rate": 4.337223528527837e-06, "loss": 0.5011, "step": 390 }, { "epoch": 1.0834741466711728, "grad_norm": 2.517935853855786, "learning_rate": 4.279543552068262e-06, "loss": 0.5033, "step": 400 }, { "epoch": 1.1105103075363298, "grad_norm": 2.0143324349410903, "learning_rate": 4.219876052282555e-06, "loss": 0.5004, "step": 410 }, { "epoch": 1.1375464684014869, "grad_norm": 2.0739387510483147, "learning_rate": 4.158287670162725e-06, "loss": 0.5029, "step": 420 }, { "epoch": 1.1645826292666441, "grad_norm": 1.7467667472974562, "learning_rate": 4.094847192081516e-06, "loss": 0.5105, "step": 430 }, { "epoch": 1.1916187901318014, "grad_norm": 1.6077647814117841, "learning_rate": 4.029625472966785e-06, "loss": 0.5024, "step": 440 }, { "epoch": 1.2186549509969584, "grad_norm": 2.0479820532317223, "learning_rate": 3.962695357165593e-06, "loss": 0.5071, "step": 450 }, { "epoch": 1.2456911118621157, "grad_norm": 1.626219735076453, "learning_rate": 3.894131597086342e-06, "loss": 0.5072, "step": 460 }, { "epoch": 1.2727272727272727, "grad_norm": 1.7379360119053935, "learning_rate": 3.824010769709868e-06, "loss": 0.5095, "step": 470 }, { "epoch": 1.29976343359243, "grad_norm": 1.6374256521813568, "learning_rate": 3.752411191062716e-06, "loss": 0.5043, "step": 480 }, { "epoch": 1.326799594457587, "grad_norm": 1.533412284293863, "learning_rate": 3.679412828748114e-06, "loss": 0.5048, "step": 490 }, { "epoch": 1.3538357553227442, "grad_norm": 1.467704598854822, "learning_rate": 
3.6050972126323614e-06, "loss": 0.5082, "step": 500 }, { "epoch": 1.3808719161879013, "grad_norm": 1.3920376131876018, "learning_rate": 3.5295473437863504e-06, "loss": 0.5078, "step": 510 }, { "epoch": 1.4079080770530585, "grad_norm": 1.5367632044528639, "learning_rate": 3.4528476017839595e-06, "loss": 0.5113, "step": 520 }, { "epoch": 1.4349442379182156, "grad_norm": 1.3875458210104965, "learning_rate": 3.3750836504608176e-06, "loss": 0.5038, "step": 530 }, { "epoch": 1.4619803987833728, "grad_norm": 1.4009582075169091, "learning_rate": 3.2963423422387174e-06, "loss": 0.5079, "step": 540 }, { "epoch": 1.4890165596485299, "grad_norm": 1.4166923996239738, "learning_rate": 3.216711621122537e-06, "loss": 0.505, "step": 550 }, { "epoch": 1.5160527205136871, "grad_norm": 1.4867606179248776, "learning_rate": 3.136280424477991e-06, "loss": 0.5064, "step": 560 }, { "epoch": 1.5430888813788441, "grad_norm": 1.4585388387447702, "learning_rate": 3.0551385836999305e-06, "loss": 0.5024, "step": 570 }, { "epoch": 1.5701250422440014, "grad_norm": 1.2202732607501734, "learning_rate": 2.9733767238821227e-06, "loss": 0.5036, "step": 580 }, { "epoch": 1.5971612031091587, "grad_norm": 1.3872482629989873, "learning_rate": 2.8910861626005774e-06, "loss": 0.5065, "step": 590 }, { "epoch": 1.6241973639743157, "grad_norm": 1.2929710321452088, "learning_rate": 2.8083588079234487e-06, "loss": 0.5025, "step": 600 }, { "epoch": 1.6512335248394727, "grad_norm": 1.4519067305450637, "learning_rate": 2.7252870557614404e-06, "loss": 0.5006, "step": 610 }, { "epoch": 1.67826968570463, "grad_norm": 1.3296895928559778, "learning_rate": 2.641963686673349e-06, "loss": 0.5054, "step": 620 }, { "epoch": 1.7053058465697872, "grad_norm": 1.309914011850238, "learning_rate": 2.5584817622420078e-06, "loss": 0.51, "step": 630 }, { "epoch": 1.7323420074349443, "grad_norm": 1.1972154530947354, "learning_rate": 2.4749345211363575e-06, "loss": 0.5063, "step": 640 }, { "epoch": 1.7593781683001013, "grad_norm": 1.4233699776417705, "learning_rate": 2.391415274975738e-06, "loss": 0.5042, "step": 650 }, { "epoch": 1.7864143291652586, "grad_norm": 1.396590849006417, "learning_rate": 2.3080173041127073e-06, "loss": 0.5004, "step": 660 }, { "epoch": 1.8134504900304158, "grad_norm": 1.4737059270471085, "learning_rate": 2.2248337534507712e-06, "loss": 0.5017, "step": 670 }, { "epoch": 1.8404866508955728, "grad_norm": 1.269208728908367, "learning_rate": 2.141957528413399e-06, "loss": 0.5045, "step": 680 }, { "epoch": 1.8675228117607299, "grad_norm": 1.2494583107579733, "learning_rate": 2.0594811911805146e-06, "loss": 0.5013, "step": 690 }, { "epoch": 1.8945589726258871, "grad_norm": 1.2922042556822486, "learning_rate": 1.9774968573083294e-06, "loss": 0.5037, "step": 700 }, { "epoch": 1.9215951334910444, "grad_norm": 1.3706382242795134, "learning_rate": 1.8960960928480138e-06, "loss": 0.508, "step": 710 }, { "epoch": 1.9486312943562014, "grad_norm": 1.288027503871267, "learning_rate": 1.8153698120780882e-06, "loss": 0.4999, "step": 720 }, { "epoch": 1.9756674552213584, "grad_norm": 1.260241344819317, "learning_rate": 1.735408175964759e-06, "loss": 0.4947, "step": 730 }, { "epoch": 1.9972963839134843, "eval_loss": 0.07017708569765091, "eval_runtime": 252.9383, "eval_samples_per_second": 39.397, "eval_steps_per_second": 0.617, "step": 738 }, { "epoch": 2.0047313281514025, "grad_norm": 3.1137959699646944, "learning_rate": 1.6563004914636132e-06, "loss": 0.4869, "step": 740 }, { "epoch": 2.0317674890165596, "grad_norm": 1.6589859537412457, 
"learning_rate": 1.5781351117751336e-06, "loss": 0.4131, "step": 750 }, { "epoch": 2.0588036498817166, "grad_norm": 1.5527387593241242, "learning_rate": 1.500999337665433e-06, "loss": 0.4179, "step": 760 }, { "epoch": 2.085839810746874, "grad_norm": 1.3379590285436587, "learning_rate": 1.4249793199624284e-06, "loss": 0.4133, "step": 770 }, { "epoch": 2.112875971612031, "grad_norm": 1.5145255566280442, "learning_rate": 1.350159963336341e-06, "loss": 0.413, "step": 780 }, { "epoch": 2.139912132477188, "grad_norm": 1.3384049087153889, "learning_rate": 1.2766248314720103e-06, "loss": 0.4119, "step": 790 }, { "epoch": 2.1669482933423456, "grad_norm": 1.3428619349256303, "learning_rate": 1.2044560537389043e-06, "loss": 0.4172, "step": 800 }, { "epoch": 2.1939844542075027, "grad_norm": 1.3749157598517765, "learning_rate": 1.133734233463081e-06, "loss": 0.4147, "step": 810 }, { "epoch": 2.2210206150726597, "grad_norm": 1.2460130219709455, "learning_rate": 1.064538357903548e-06, "loss": 0.4161, "step": 820 }, { "epoch": 2.2480567759378167, "grad_norm": 1.318406395179503, "learning_rate": 9.969457100335657e-07, "loss": 0.4169, "step": 830 }, { "epoch": 2.2750929368029738, "grad_norm": 1.2488975357296948, "learning_rate": 9.310317822254111e-07, "loss": 0.4169, "step": 840 }, { "epoch": 2.3021290976681312, "grad_norm": 1.3146898828732563, "learning_rate": 8.668701919350217e-07, "loss": 0.4116, "step": 850 }, { "epoch": 2.3291652585332883, "grad_norm": 1.2616739186242787, "learning_rate": 8.045325994806838e-07, "loss": 0.4143, "step": 860 }, { "epoch": 2.3562014193984453, "grad_norm": 1.2382882830499569, "learning_rate": 7.440886280075887e-07, "loss": 0.4187, "step": 870 }, { "epoch": 2.3832375802636028, "grad_norm": 1.2670618808963707, "learning_rate": 6.856057857276507e-07, "loss": 0.4175, "step": 880 }, { "epoch": 2.41027374112876, "grad_norm": 1.331501462877456, "learning_rate": 6.291493905214455e-07, "loss": 0.416, "step": 890 }, { "epoch": 2.437309901993917, "grad_norm": 1.245214644999882, "learning_rate": 5.747824969864554e-07, "loss": 0.4144, "step": 900 }, { "epoch": 2.464346062859074, "grad_norm": 1.293251956768236, "learning_rate": 5.225658260131289e-07, "loss": 0.4124, "step": 910 }, { "epoch": 2.4913822237242313, "grad_norm": 1.2170360666129485, "learning_rate": 4.7255769696737896e-07, "loss": 0.4122, "step": 920 }, { "epoch": 2.5184183845893884, "grad_norm": 1.3332188286676712, "learning_rate": 4.248139625552877e-07, "loss": 0.4105, "step": 930 }, { "epoch": 2.5454545454545454, "grad_norm": 1.2374380268615297, "learning_rate": 3.7938794644275357e-07, "loss": 0.4171, "step": 940 }, { "epoch": 2.5724907063197024, "grad_norm": 1.2320455143383866, "learning_rate": 3.3633038369975e-07, "loss": 0.4102, "step": 950 }, { "epoch": 2.59952686718486, "grad_norm": 1.2370736360581813, "learning_rate": 2.9568936413572363e-07, "loss": 0.4172, "step": 960 }, { "epoch": 2.626563028050017, "grad_norm": 1.2220768518142908, "learning_rate": 2.575102785894007e-07, "loss": 0.4145, "step": 970 }, { "epoch": 2.653599188915174, "grad_norm": 1.23419878256637, "learning_rate": 2.218357682330119e-07, "loss": 0.4172, "step": 980 }, { "epoch": 2.6806353497803315, "grad_norm": 1.207639360228848, "learning_rate": 1.88705676947534e-07, "loss": 0.4142, "step": 990 }, { "epoch": 2.7076715106454885, "grad_norm": 1.2366362763552143, "learning_rate": 1.581570068221544e-07, "loss": 0.4084, "step": 1000 }, { "epoch": 2.7347076715106455, "grad_norm": 1.2600299559006616, "learning_rate": 1.3022387682765396e-07, "loss": 0.4158, 
"step": 1010 }, { "epoch": 2.7617438323758026, "grad_norm": 1.1728597476384803, "learning_rate": 1.0493748470986553e-07, "loss": 0.4176, "step": 1020 }, { "epoch": 2.7887799932409596, "grad_norm": 1.217687682870758, "learning_rate": 8.23260721457686e-08, "loss": 0.4151, "step": 1030 }, { "epoch": 2.815816154106117, "grad_norm": 1.1768131269651871, "learning_rate": 6.241489320113453e-08, "loss": 0.4129, "step": 1040 }, { "epoch": 2.842852314971274, "grad_norm": 1.1724105689001012, "learning_rate": 4.5226186124956164e-08, "loss": 0.4136, "step": 1050 }, { "epoch": 2.869888475836431, "grad_norm": 1.2771491635206371, "learning_rate": 3.077914851215585e-08, "loss": 0.4083, "step": 1060 }, { "epoch": 2.8969246367015886, "grad_norm": 1.1829354540448112, "learning_rate": 1.9089915862320062e-08, "loss": 0.4169, "step": 1070 }, { "epoch": 2.9239607975667457, "grad_norm": 1.194436281414163, "learning_rate": 1.0171543558403774e-08, "loss": 0.4115, "step": 1080 }, { "epoch": 2.9509969584319027, "grad_norm": 1.189312231071839, "learning_rate": 4.0339922855298086e-09, "loss": 0.4145, "step": 1090 }, { "epoch": 2.9780331192970597, "grad_norm": 1.1522333741980948, "learning_rate": 6.841169061744257e-10, "loss": 0.4132, "step": 1100 }, { "epoch": 2.99695843190267, "eval_loss": 0.072703517973423, "eval_runtime": 249.3626, "eval_samples_per_second": 39.962, "eval_steps_per_second": 0.626, "step": 1107 }, { "epoch": 2.99695843190267, "step": 1107, "total_flos": 1854056851046400.0, "train_loss": 0.5126692639241481, "train_runtime": 35381.6642, "train_samples_per_second": 16.053, "train_steps_per_second": 0.031 } ], "logging_steps": 10, "max_steps": 1107, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1854056851046400.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }