{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.113015284854099, "eval_steps": 200, "global_step": 600, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.018527095877721167, "grad_norm": 5.348576545715332, "learning_rate": 1.234567901234568e-06, "loss": 1.7335, "step": 10 }, { "epoch": 0.037054191755442334, "grad_norm": 5.819892883300781, "learning_rate": 2.469135802469136e-06, "loss": 1.7063, "step": 20 }, { "epoch": 0.0555812876331635, "grad_norm": 2.0946009159088135, "learning_rate": 3.7037037037037037e-06, "loss": 1.5389, "step": 30 }, { "epoch": 0.07410838351088467, "grad_norm": 6.124255657196045, "learning_rate": 4.938271604938272e-06, "loss": 1.9017, "step": 40 }, { "epoch": 0.09263547938860583, "grad_norm": 4.986006736755371, "learning_rate": 6.17283950617284e-06, "loss": 1.4697, "step": 50 }, { "epoch": 0.111162575266327, "grad_norm": 3.69557785987854, "learning_rate": 7.4074074074074075e-06, "loss": 1.2454, "step": 60 }, { "epoch": 0.12968967114404817, "grad_norm": 4.338206768035889, "learning_rate": 8.641975308641975e-06, "loss": 1.2242, "step": 70 }, { "epoch": 0.14821676702176934, "grad_norm": 2.3303167819976807, "learning_rate": 9.876543209876543e-06, "loss": 0.7272, "step": 80 }, { "epoch": 0.1667438628994905, "grad_norm": 2.708115339279175, "learning_rate": 1.1111111111111113e-05, "loss": 0.5907, "step": 90 }, { "epoch": 0.18527095877721167, "grad_norm": 1.4415699243545532, "learning_rate": 1.234567901234568e-05, "loss": 0.6154, "step": 100 }, { "epoch": 0.20379805465493284, "grad_norm": 1.7580137252807617, "learning_rate": 1.3580246913580248e-05, "loss": 0.4568, "step": 110 }, { "epoch": 0.222325150532654, "grad_norm": 2.5971596240997314, "learning_rate": 1.4814814814814815e-05, "loss": 0.4862, "step": 120 }, { "epoch": 0.24085224641037517, "grad_norm": 1.3559226989746094, "learning_rate": 1.6049382716049385e-05, "loss": 0.4366, "step": 130 }, { "epoch": 0.25937934228809634, "grad_norm": 0.6591945290565491, "learning_rate": 1.728395061728395e-05, "loss": 0.3757, "step": 140 }, { "epoch": 0.2779064381658175, "grad_norm": 2.574704170227051, "learning_rate": 1.851851851851852e-05, "loss": 0.4693, "step": 150 }, { "epoch": 0.29643353404353867, "grad_norm": 3.002263307571411, "learning_rate": 1.9753086419753087e-05, "loss": 0.4896, "step": 160 }, { "epoch": 0.31496062992125984, "grad_norm": 3.431332588195801, "learning_rate": 1.999850819197622e-05, "loss": 0.4864, "step": 170 }, { "epoch": 0.333487725798981, "grad_norm": 1.1350328922271729, "learning_rate": 1.99924484847108e-05, "loss": 0.3713, "step": 180 }, { "epoch": 0.35201482167670217, "grad_norm": 1.6894770860671997, "learning_rate": 1.9981730462964303e-05, "loss": 0.4814, "step": 190 }, { "epoch": 0.37054191755442334, "grad_norm": 1.3769453763961792, "learning_rate": 1.9966359123301492e-05, "loss": 0.4288, "step": 200 }, { "epoch": 0.3890690134321445, "grad_norm": 1.1856595277786255, "learning_rate": 1.9946341631587086e-05, "loss": 0.4447, "step": 210 }, { "epoch": 0.4075961093098657, "grad_norm": 1.7599550485610962, "learning_rate": 1.9921687319645183e-05, "loss": 0.349, "step": 220 }, { "epoch": 0.42612320518758684, "grad_norm": 1.5848398208618164, "learning_rate": 1.9892407680908904e-05, "loss": 0.396, "step": 230 }, { "epoch": 0.444650301065308, "grad_norm": 1.9259053468704224, "learning_rate": 1.9858516365062334e-05, "loss": 0.3352, "step": 240 }, { "epoch": 0.4631773969430292, "grad_norm": 1.4261807203292847, "learning_rate": 1.9820029171677288e-05, "loss": 0.3511, "step": 250 }, { "epoch": 0.48170449282075034, "grad_norm": 1.699010968208313, "learning_rate": 1.977696404284779e-05, "loss": 0.4073, "step": 260 }, { "epoch": 0.5002315886984715, "grad_norm": 1.3403549194335938, "learning_rate": 1.9729341054825783e-05, "loss": 0.4454, "step": 270 }, { "epoch": 0.5187586845761927, "grad_norm": 1.2229658365249634, "learning_rate": 1.9677182408661894e-05, "loss": 0.4352, "step": 280 }, { "epoch": 0.5372857804539138, "grad_norm": 2.2487080097198486, "learning_rate": 1.9620512419855684e-05, "loss": 0.392, "step": 290 }, { "epoch": 0.555812876331635, "grad_norm": 2.6429977416992188, "learning_rate": 1.9559357507020163e-05, "loss": 0.4013, "step": 300 }, { "epoch": 0.5743399722093562, "grad_norm": 2.240354061126709, "learning_rate": 1.9493746179565854e-05, "loss": 0.4111, "step": 310 }, { "epoch": 0.5928670680870773, "grad_norm": 1.2388675212860107, "learning_rate": 1.94237090244102e-05, "loss": 0.3653, "step": 320 }, { "epoch": 0.6113941639647985, "grad_norm": 2.2535054683685303, "learning_rate": 1.9349278691718426e-05, "loss": 0.3956, "step": 330 }, { "epoch": 0.6299212598425197, "grad_norm": 2.5032520294189453, "learning_rate": 1.9270489879682592e-05, "loss": 0.3697, "step": 340 }, { "epoch": 0.6484483557202408, "grad_norm": 2.4367105960845947, "learning_rate": 1.9187379318345845e-05, "loss": 0.4188, "step": 350 }, { "epoch": 0.666975451597962, "grad_norm": 2.6159491539001465, "learning_rate": 1.9099985752479505e-05, "loss": 0.4415, "step": 360 }, { "epoch": 0.6855025474756832, "grad_norm": 1.8182092905044556, "learning_rate": 1.900834992352087e-05, "loss": 0.3273, "step": 370 }, { "epoch": 0.7040296433534043, "grad_norm": 3.406963348388672, "learning_rate": 1.8912514550580242e-05, "loss": 0.4069, "step": 380 }, { "epoch": 0.7225567392311255, "grad_norm": 1.886953353881836, "learning_rate": 1.881252431052599e-05, "loss": 0.3452, "step": 390 }, { "epoch": 0.7410838351088467, "grad_norm": 2.346081018447876, "learning_rate": 1.870842581715691e-05, "loss": 0.2954, "step": 400 }, { "epoch": 0.7596109309865678, "grad_norm": 1.7905707359313965, "learning_rate": 1.8600267599471663e-05, "loss": 0.344, "step": 410 }, { "epoch": 0.778138026864289, "grad_norm": 1.5074595212936401, "learning_rate": 1.8488100079045345e-05, "loss": 0.4834, "step": 420 }, { "epoch": 0.7966651227420102, "grad_norm": 2.0368354320526123, "learning_rate": 1.8371975546523795e-05, "loss": 0.4263, "step": 430 }, { "epoch": 0.8151922186197313, "grad_norm": 2.4880967140197754, "learning_rate": 1.825194813724654e-05, "loss": 0.2868, "step": 440 }, { "epoch": 0.8337193144974525, "grad_norm": 1.4323982000350952, "learning_rate": 1.81280738060098e-05, "loss": 0.3404, "step": 450 }, { "epoch": 0.8522464103751737, "grad_norm": 1.955913782119751, "learning_rate": 1.8000410300981305e-05, "loss": 0.329, "step": 460 }, { "epoch": 0.8707735062528948, "grad_norm": 2.0698235034942627, "learning_rate": 1.786901713677902e-05, "loss": 0.3959, "step": 470 }, { "epoch": 0.889300602130616, "grad_norm": 2.9616572856903076, "learning_rate": 1.7733955566726438e-05, "loss": 0.3973, "step": 480 }, { "epoch": 0.9078276980083372, "grad_norm": 3.0657591819763184, "learning_rate": 1.7595288554297295e-05, "loss": 0.4035, "step": 490 }, { "epoch": 0.9263547938860583, "grad_norm": 1.5825896263122559, "learning_rate": 1.7453080743763e-05, "loss": 0.3797, "step": 500 }, { "epoch": 0.9448818897637795, "grad_norm": 1.8893063068389893, "learning_rate": 1.7307398430056595e-05, "loss": 0.2627, "step": 510 }, { "epoch": 0.9634089856415007, "grad_norm": 1.5115277767181396, "learning_rate": 1.7158309527867117e-05, "loss": 0.281, "step": 520 }, { "epoch": 0.9819360815192218, "grad_norm": 4.204952239990234, "learning_rate": 1.700588353997891e-05, "loss": 0.4839, "step": 530 }, { "epoch": 1.001852709587772, "grad_norm": 3.4891834259033203, "learning_rate": 1.6850191524870548e-05, "loss": 0.4232, "step": 540 }, { "epoch": 1.0203798054654933, "grad_norm": 2.1796462535858154, "learning_rate": 1.6691306063588583e-05, "loss": 0.3583, "step": 550 }, { "epoch": 1.0389069013432144, "grad_norm": 1.7773243188858032, "learning_rate": 1.6529301225911433e-05, "loss": 0.3135, "step": 560 }, { "epoch": 1.0574339972209357, "grad_norm": 1.6911367177963257, "learning_rate": 1.6364252535819284e-05, "loss": 0.2577, "step": 570 }, { "epoch": 1.0759610930986567, "grad_norm": 1.934979796409607, "learning_rate": 1.619623693628605e-05, "loss": 0.2957, "step": 580 }, { "epoch": 1.094488188976378, "grad_norm": 2.352208137512207, "learning_rate": 1.602533275340984e-05, "loss": 0.3576, "step": 590 }, { "epoch": 1.113015284854099, "grad_norm": 1.4006640911102295, "learning_rate": 1.5851619659898623e-05, "loss": 0.3574, "step": 600 } ], "logging_steps": 10, "max_steps": 1617, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.3080296690801705e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }