{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.32986970146792016, "eval_steps": 10, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.006597394029358403, "grad_norm": 1.2868798971176147, "learning_rate": 3.2894736842105265e-06, "loss": 4.6934, "step": 10 }, { "epoch": 0.006597394029358403, "eval_loss": 4.687362194061279, "eval_runtime": 282.9655, "eval_samples_per_second": 18.045, "eval_steps_per_second": 0.565, "step": 10 }, { "epoch": 0.013194788058716806, "grad_norm": 1.302221417427063, "learning_rate": 6.578947368421053e-06, "loss": 4.6651, "step": 20 }, { "epoch": 0.013194788058716806, "eval_loss": 4.648743152618408, "eval_runtime": 282.9326, "eval_samples_per_second": 18.047, "eval_steps_per_second": 0.566, "step": 20 }, { "epoch": 0.01979218208807521, "grad_norm": 1.4535945653915405, "learning_rate": 9.868421052631579e-06, "loss": 4.6063, "step": 30 }, { "epoch": 0.01979218208807521, "eval_loss": 4.54951286315918, "eval_runtime": 282.3584, "eval_samples_per_second": 18.083, "eval_steps_per_second": 0.567, "step": 30 }, { "epoch": 0.026389576117433613, "grad_norm": 1.714498519897461, "learning_rate": 1.3157894736842106e-05, "loss": 4.4527, "step": 40 }, { "epoch": 0.026389576117433613, "eval_loss": 4.304062366485596, "eval_runtime": 282.5935, "eval_samples_per_second": 18.068, "eval_steps_per_second": 0.566, "step": 40 }, { "epoch": 0.03298697014679202, "grad_norm": 1.832472324371338, "learning_rate": 1.6447368421052635e-05, "loss": 4.1216, "step": 50 }, { "epoch": 0.03298697014679202, "eval_loss": 3.8426761627197266, "eval_runtime": 281.3902, "eval_samples_per_second": 18.146, "eval_steps_per_second": 0.569, "step": 50 }, { "epoch": 0.03958436417615042, "grad_norm": 1.1160222291946411, "learning_rate": 1.9736842105263158e-05, "loss": 3.6561, "step": 60 }, { "epoch": 0.03958436417615042, "eval_loss": 3.4654476642608643, "eval_runtime": 282.2746, "eval_samples_per_second": 18.089, "eval_steps_per_second": 0.567, "step": 60 }, { "epoch": 0.04618175820550882, "grad_norm": 1.1160507202148438, "learning_rate": 2.3026315789473685e-05, "loss": 3.2993, "step": 70 }, { "epoch": 0.04618175820550882, "eval_loss": 3.0560474395751953, "eval_runtime": 282.4864, "eval_samples_per_second": 18.075, "eval_steps_per_second": 0.566, "step": 70 }, { "epoch": 0.052779152234867226, "grad_norm": 0.7468234896659851, "learning_rate": 2.6315789473684212e-05, "loss": 2.8777, "step": 80 }, { "epoch": 0.052779152234867226, "eval_loss": 2.7082414627075195, "eval_runtime": 282.279, "eval_samples_per_second": 18.088, "eval_steps_per_second": 0.567, "step": 80 }, { "epoch": 0.05937654626422563, "grad_norm": 0.7456527948379517, "learning_rate": 2.9605263157894735e-05, "loss": 2.5844, "step": 90 }, { "epoch": 0.05937654626422563, "eval_loss": 2.400399923324585, "eval_runtime": 282.4234, "eval_samples_per_second": 18.079, "eval_steps_per_second": 0.567, "step": 90 }, { "epoch": 0.06597394029358404, "grad_norm": 0.7843755483627319, "learning_rate": 3.289473684210527e-05, "loss": 2.2405, "step": 100 }, { "epoch": 0.06597394029358404, "eval_loss": 2.043464422225952, "eval_runtime": 282.4971, "eval_samples_per_second": 18.075, "eval_steps_per_second": 0.566, "step": 100 }, { "epoch": 0.07257133432294244, "grad_norm": 0.7848460078239441, "learning_rate": 3.618421052631579e-05, "loss": 1.8797, "step": 110 }, { "epoch": 0.07257133432294244, "eval_loss": 1.6749211549758911, "eval_runtime": 282.4596, "eval_samples_per_second": 18.077, "eval_steps_per_second": 0.566, "step": 110 }, { "epoch": 0.07916872835230084, "grad_norm": 0.8517773151397705, "learning_rate": 3.9473684210526316e-05, "loss": 1.5041, "step": 120 }, { "epoch": 0.07916872835230084, "eval_loss": 1.322484016418457, "eval_runtime": 282.425, "eval_samples_per_second": 18.079, "eval_steps_per_second": 0.567, "step": 120 }, { "epoch": 0.08576612238165925, "grad_norm": 0.924010694026947, "learning_rate": 4.2763157894736847e-05, "loss": 1.1712, "step": 130 }, { "epoch": 0.08576612238165925, "eval_loss": 1.0092010498046875, "eval_runtime": 282.4135, "eval_samples_per_second": 18.08, "eval_steps_per_second": 0.567, "step": 130 }, { "epoch": 0.09236351641101764, "grad_norm": 0.599291205406189, "learning_rate": 4.605263157894737e-05, "loss": 0.9068, "step": 140 }, { "epoch": 0.09236351641101764, "eval_loss": 0.8262147307395935, "eval_runtime": 282.3089, "eval_samples_per_second": 18.087, "eval_steps_per_second": 0.567, "step": 140 }, { "epoch": 0.09896091044037605, "grad_norm": 0.4690685570240021, "learning_rate": 4.9342105263157894e-05, "loss": 0.7821, "step": 150 }, { "epoch": 0.09896091044037605, "eval_loss": 0.7165087461471558, "eval_runtime": 282.1805, "eval_samples_per_second": 18.095, "eval_steps_per_second": 0.567, "step": 150 }, { "epoch": 0.10555830446973445, "grad_norm": 0.5129504203796387, "learning_rate": 4.9706529713866475e-05, "loss": 0.6867, "step": 160 }, { "epoch": 0.10555830446973445, "eval_loss": 0.6483914256095886, "eval_runtime": 282.207, "eval_samples_per_second": 18.093, "eval_steps_per_second": 0.567, "step": 160 }, { "epoch": 0.11215569849909286, "grad_norm": 0.4116060435771942, "learning_rate": 4.933969185619956e-05, "loss": 0.6315, "step": 170 }, { "epoch": 0.11215569849909286, "eval_loss": 0.6029236912727356, "eval_runtime": 282.3594, "eval_samples_per_second": 18.083, "eval_steps_per_second": 0.567, "step": 170 }, { "epoch": 0.11875309252845126, "grad_norm": 0.4301941990852356, "learning_rate": 4.8972853998532655e-05, "loss": 0.5833, "step": 180 }, { "epoch": 0.11875309252845126, "eval_loss": 0.5720704197883606, "eval_runtime": 282.0439, "eval_samples_per_second": 18.104, "eval_steps_per_second": 0.567, "step": 180 }, { "epoch": 0.12535048655780967, "grad_norm": 0.5514675378799438, "learning_rate": 4.860601614086574e-05, "loss": 0.5667, "step": 190 }, { "epoch": 0.12535048655780967, "eval_loss": 0.5490744709968567, "eval_runtime": 282.3239, "eval_samples_per_second": 18.086, "eval_steps_per_second": 0.567, "step": 190 }, { "epoch": 0.13194788058716808, "grad_norm": 0.4441012144088745, "learning_rate": 4.823917828319883e-05, "loss": 0.5303, "step": 200 }, { "epoch": 0.13194788058716808, "eval_loss": 0.5291892886161804, "eval_runtime": 282.4233, "eval_samples_per_second": 18.079, "eval_steps_per_second": 0.567, "step": 200 }, { "epoch": 0.13854527461652646, "grad_norm": 0.4900895059108734, "learning_rate": 4.787234042553192e-05, "loss": 0.5109, "step": 210 }, { "epoch": 0.13854527461652646, "eval_loss": 0.5132142901420593, "eval_runtime": 282.4262, "eval_samples_per_second": 18.079, "eval_steps_per_second": 0.567, "step": 210 }, { "epoch": 0.14514266864588488, "grad_norm": 0.4287125766277313, "learning_rate": 4.750550256786501e-05, "loss": 0.5109, "step": 220 }, { "epoch": 0.14514266864588488, "eval_loss": 0.4994109869003296, "eval_runtime": 282.5477, "eval_samples_per_second": 18.071, "eval_steps_per_second": 0.566, "step": 220 }, { "epoch": 0.1517400626752433, "grad_norm": 0.4390946626663208, "learning_rate": 4.713866471019809e-05, "loss": 0.4952, "step": 230 }, { "epoch": 0.1517400626752433, "eval_loss": 0.4881007969379425, "eval_runtime": 282.5158, "eval_samples_per_second": 18.073, "eval_steps_per_second": 0.566, "step": 230 }, { "epoch": 0.15833745670460167, "grad_norm": 0.5645927786827087, "learning_rate": 4.677182685253118e-05, "loss": 0.488, "step": 240 }, { "epoch": 0.15833745670460167, "eval_loss": 0.4788074791431427, "eval_runtime": 282.6337, "eval_samples_per_second": 18.066, "eval_steps_per_second": 0.566, "step": 240 }, { "epoch": 0.16493485073396008, "grad_norm": 0.5930696129798889, "learning_rate": 4.6404988994864274e-05, "loss": 0.4833, "step": 250 }, { "epoch": 0.16493485073396008, "eval_loss": 0.4700024425983429, "eval_runtime": 282.5674, "eval_samples_per_second": 18.07, "eval_steps_per_second": 0.566, "step": 250 }, { "epoch": 0.1715322447633185, "grad_norm": 0.408792644739151, "learning_rate": 4.6038151137197364e-05, "loss": 0.4696, "step": 260 }, { "epoch": 0.1715322447633185, "eval_loss": 0.46224626898765564, "eval_runtime": 282.2108, "eval_samples_per_second": 18.093, "eval_steps_per_second": 0.567, "step": 260 }, { "epoch": 0.1781296387926769, "grad_norm": 0.5276111960411072, "learning_rate": 4.567131327953045e-05, "loss": 0.4579, "step": 270 }, { "epoch": 0.1781296387926769, "eval_loss": 0.455151230096817, "eval_runtime": 282.6419, "eval_samples_per_second": 18.065, "eval_steps_per_second": 0.566, "step": 270 }, { "epoch": 0.1847270328220353, "grad_norm": 0.5726278424263, "learning_rate": 4.530447542186354e-05, "loss": 0.4547, "step": 280 }, { "epoch": 0.1847270328220353, "eval_loss": 0.44886669516563416, "eval_runtime": 282.6094, "eval_samples_per_second": 18.067, "eval_steps_per_second": 0.566, "step": 280 }, { "epoch": 0.1913244268513937, "grad_norm": 0.4161057770252228, "learning_rate": 4.493763756419663e-05, "loss": 0.4513, "step": 290 }, { "epoch": 0.1913244268513937, "eval_loss": 0.44479382038116455, "eval_runtime": 282.5015, "eval_samples_per_second": 18.074, "eval_steps_per_second": 0.566, "step": 290 }, { "epoch": 0.1979218208807521, "grad_norm": 0.4173205494880676, "learning_rate": 4.457079970652971e-05, "loss": 0.4364, "step": 300 }, { "epoch": 0.1979218208807521, "eval_loss": 0.4391051232814789, "eval_runtime": 282.6206, "eval_samples_per_second": 18.067, "eval_steps_per_second": 0.566, "step": 300 }, { "epoch": 0.20451921491011052, "grad_norm": 0.4726370871067047, "learning_rate": 4.420396184886281e-05, "loss": 0.4413, "step": 310 }, { "epoch": 0.20451921491011052, "eval_loss": 0.43503668904304504, "eval_runtime": 282.4681, "eval_samples_per_second": 18.076, "eval_steps_per_second": 0.566, "step": 310 }, { "epoch": 0.2111166089394689, "grad_norm": 0.528791069984436, "learning_rate": 4.383712399119589e-05, "loss": 0.4347, "step": 320 }, { "epoch": 0.2111166089394689, "eval_loss": 0.4299105107784271, "eval_runtime": 282.3837, "eval_samples_per_second": 18.082, "eval_steps_per_second": 0.567, "step": 320 }, { "epoch": 0.2177140029688273, "grad_norm": 0.4637807607650757, "learning_rate": 4.347028613352898e-05, "loss": 0.4263, "step": 330 }, { "epoch": 0.2177140029688273, "eval_loss": 0.42648786306381226, "eval_runtime": 282.4045, "eval_samples_per_second": 18.08, "eval_steps_per_second": 0.567, "step": 330 }, { "epoch": 0.22431139699818572, "grad_norm": 0.5652016401290894, "learning_rate": 4.3103448275862066e-05, "loss": 0.424, "step": 340 }, { "epoch": 0.22431139699818572, "eval_loss": 0.42295747995376587, "eval_runtime": 282.132, "eval_samples_per_second": 18.098, "eval_steps_per_second": 0.567, "step": 340 }, { "epoch": 0.2309087910275441, "grad_norm": 0.5480278134346008, "learning_rate": 4.273661041819516e-05, "loss": 0.4246, "step": 350 }, { "epoch": 0.2309087910275441, "eval_loss": 0.4191484749317169, "eval_runtime": 282.8434, "eval_samples_per_second": 18.052, "eval_steps_per_second": 0.566, "step": 350 }, { "epoch": 0.23750618505690252, "grad_norm": 0.5954585671424866, "learning_rate": 4.2369772560528246e-05, "loss": 0.4194, "step": 360 }, { "epoch": 0.23750618505690252, "eval_loss": 0.4160284399986267, "eval_runtime": 282.0609, "eval_samples_per_second": 18.102, "eval_steps_per_second": 0.567, "step": 360 }, { "epoch": 0.24410357908626093, "grad_norm": 0.6444059610366821, "learning_rate": 4.2002934702861336e-05, "loss": 0.4151, "step": 370 }, { "epoch": 0.24410357908626093, "eval_loss": 0.4145909547805786, "eval_runtime": 282.2951, "eval_samples_per_second": 18.087, "eval_steps_per_second": 0.567, "step": 370 }, { "epoch": 0.25070097311561934, "grad_norm": 0.5506740212440491, "learning_rate": 4.163609684519443e-05, "loss": 0.4143, "step": 380 }, { "epoch": 0.25070097311561934, "eval_loss": 0.41122376918792725, "eval_runtime": 282.3323, "eval_samples_per_second": 18.085, "eval_steps_per_second": 0.567, "step": 380 }, { "epoch": 0.2572983671449777, "grad_norm": 0.5273014307022095, "learning_rate": 4.126925898752752e-05, "loss": 0.4059, "step": 390 }, { "epoch": 0.2572983671449777, "eval_loss": 0.41092923283576965, "eval_runtime": 282.2453, "eval_samples_per_second": 18.091, "eval_steps_per_second": 0.567, "step": 390 }, { "epoch": 0.26389576117433616, "grad_norm": 0.5662177205085754, "learning_rate": 4.09024211298606e-05, "loss": 0.411, "step": 400 }, { "epoch": 0.26389576117433616, "eval_loss": 0.40584176778793335, "eval_runtime": 282.2166, "eval_samples_per_second": 18.092, "eval_steps_per_second": 0.567, "step": 400 }, { "epoch": 0.27049315520369455, "grad_norm": 0.6541048884391785, "learning_rate": 4.05355832721937e-05, "loss": 0.4017, "step": 410 }, { "epoch": 0.27049315520369455, "eval_loss": 0.4048697054386139, "eval_runtime": 282.3287, "eval_samples_per_second": 18.085, "eval_steps_per_second": 0.567, "step": 410 }, { "epoch": 0.27709054923305293, "grad_norm": 0.490032821893692, "learning_rate": 4.016874541452678e-05, "loss": 0.3986, "step": 420 }, { "epoch": 0.27709054923305293, "eval_loss": 0.4038483202457428, "eval_runtime": 282.2547, "eval_samples_per_second": 18.09, "eval_steps_per_second": 0.567, "step": 420 }, { "epoch": 0.28368794326241137, "grad_norm": 0.4884147644042969, "learning_rate": 3.980190755685987e-05, "loss": 0.4003, "step": 430 }, { "epoch": 0.28368794326241137, "eval_loss": 0.40054452419281006, "eval_runtime": 282.2046, "eval_samples_per_second": 18.093, "eval_steps_per_second": 0.567, "step": 430 }, { "epoch": 0.29028533729176975, "grad_norm": 0.5234985947608948, "learning_rate": 3.943506969919296e-05, "loss": 0.4005, "step": 440 }, { "epoch": 0.29028533729176975, "eval_loss": 0.39862698316574097, "eval_runtime": 282.1876, "eval_samples_per_second": 18.094, "eval_steps_per_second": 0.567, "step": 440 }, { "epoch": 0.29688273132112813, "grad_norm": 0.512843668460846, "learning_rate": 3.9068231841526045e-05, "loss": 0.3952, "step": 450 }, { "epoch": 0.29688273132112813, "eval_loss": 0.3970402181148529, "eval_runtime": 282.2615, "eval_samples_per_second": 18.09, "eval_steps_per_second": 0.567, "step": 450 }, { "epoch": 0.3034801253504866, "grad_norm": 0.46752044558525085, "learning_rate": 3.8701393983859135e-05, "loss": 0.3967, "step": 460 }, { "epoch": 0.3034801253504866, "eval_loss": 0.39410126209259033, "eval_runtime": 282.2669, "eval_samples_per_second": 18.089, "eval_steps_per_second": 0.567, "step": 460 }, { "epoch": 0.31007751937984496, "grad_norm": 0.49608954787254333, "learning_rate": 3.8334556126192226e-05, "loss": 0.3893, "step": 470 }, { "epoch": 0.31007751937984496, "eval_loss": 0.3944699168205261, "eval_runtime": 282.3461, "eval_samples_per_second": 18.084, "eval_steps_per_second": 0.567, "step": 470 }, { "epoch": 0.31667491340920334, "grad_norm": 0.4948144853115082, "learning_rate": 3.7967718268525316e-05, "loss": 0.3883, "step": 480 }, { "epoch": 0.31667491340920334, "eval_loss": 0.3926471471786499, "eval_runtime": 282.4921, "eval_samples_per_second": 18.075, "eval_steps_per_second": 0.566, "step": 480 }, { "epoch": 0.3232723074385618, "grad_norm": 0.5696155428886414, "learning_rate": 3.76008804108584e-05, "loss": 0.3921, "step": 490 }, { "epoch": 0.3232723074385618, "eval_loss": 0.39166340231895447, "eval_runtime": 282.2989, "eval_samples_per_second": 18.087, "eval_steps_per_second": 0.567, "step": 490 }, { "epoch": 0.32986970146792016, "grad_norm": 0.5400599241256714, "learning_rate": 3.723404255319149e-05, "loss": 0.3927, "step": 500 }, { "epoch": 0.32986970146792016, "eval_loss": 0.3899206817150116, "eval_runtime": 282.6164, "eval_samples_per_second": 18.067, "eval_steps_per_second": 0.566, "step": 500 } ], "logging_steps": 10, "max_steps": 1515, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.1763083231772672e+17, "train_batch_size": 16, "trial_name": null, "trial_params": null }