{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.6597394029358403, "eval_steps": 10, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.006597394029358403, "grad_norm": 1.2868798971176147, "learning_rate": 3.2894736842105265e-06, "loss": 4.6934, "step": 10 }, { "epoch": 0.006597394029358403, "eval_loss": 4.687362194061279, "eval_runtime": 282.9655, "eval_samples_per_second": 18.045, "eval_steps_per_second": 0.565, "step": 10 }, { "epoch": 0.013194788058716806, "grad_norm": 1.302221417427063, "learning_rate": 6.578947368421053e-06, "loss": 4.6651, "step": 20 }, { "epoch": 0.013194788058716806, "eval_loss": 4.648743152618408, "eval_runtime": 282.9326, "eval_samples_per_second": 18.047, "eval_steps_per_second": 0.566, "step": 20 }, { "epoch": 0.01979218208807521, "grad_norm": 1.4535945653915405, "learning_rate": 9.868421052631579e-06, "loss": 4.6063, "step": 30 }, { "epoch": 0.01979218208807521, "eval_loss": 4.54951286315918, "eval_runtime": 282.3584, "eval_samples_per_second": 18.083, "eval_steps_per_second": 0.567, "step": 30 }, { "epoch": 0.026389576117433613, "grad_norm": 1.714498519897461, "learning_rate": 1.3157894736842106e-05, "loss": 4.4527, "step": 40 }, { "epoch": 0.026389576117433613, "eval_loss": 4.304062366485596, "eval_runtime": 282.5935, "eval_samples_per_second": 18.068, "eval_steps_per_second": 0.566, "step": 40 }, { "epoch": 0.03298697014679202, "grad_norm": 1.832472324371338, "learning_rate": 1.6447368421052635e-05, "loss": 4.1216, "step": 50 }, { "epoch": 0.03298697014679202, "eval_loss": 3.8426761627197266, "eval_runtime": 281.3902, "eval_samples_per_second": 18.146, "eval_steps_per_second": 0.569, "step": 50 }, { "epoch": 0.03958436417615042, "grad_norm": 1.1160222291946411, "learning_rate": 1.9736842105263158e-05, "loss": 3.6561, "step": 60 }, { "epoch": 0.03958436417615042, "eval_loss": 3.4654476642608643, "eval_runtime": 282.2746, "eval_samples_per_second": 18.089, "eval_steps_per_second": 0.567, "step": 60 }, { "epoch": 0.04618175820550882, "grad_norm": 1.1160507202148438, "learning_rate": 2.3026315789473685e-05, "loss": 3.2993, "step": 70 }, { "epoch": 0.04618175820550882, "eval_loss": 3.0560474395751953, "eval_runtime": 282.4864, "eval_samples_per_second": 18.075, "eval_steps_per_second": 0.566, "step": 70 }, { "epoch": 0.052779152234867226, "grad_norm": 0.7468234896659851, "learning_rate": 2.6315789473684212e-05, "loss": 2.8777, "step": 80 }, { "epoch": 0.052779152234867226, "eval_loss": 2.7082414627075195, "eval_runtime": 282.279, "eval_samples_per_second": 18.088, "eval_steps_per_second": 0.567, "step": 80 }, { "epoch": 0.05937654626422563, "grad_norm": 0.7456527948379517, "learning_rate": 2.9605263157894735e-05, "loss": 2.5844, "step": 90 }, { "epoch": 0.05937654626422563, "eval_loss": 2.400399923324585, "eval_runtime": 282.4234, "eval_samples_per_second": 18.079, "eval_steps_per_second": 0.567, "step": 90 }, { "epoch": 0.06597394029358404, "grad_norm": 0.7843755483627319, "learning_rate": 3.289473684210527e-05, "loss": 2.2405, "step": 100 }, { "epoch": 0.06597394029358404, "eval_loss": 2.043464422225952, "eval_runtime": 282.4971, "eval_samples_per_second": 18.075, "eval_steps_per_second": 0.566, "step": 100 }, { "epoch": 0.07257133432294244, "grad_norm": 0.7848460078239441, "learning_rate": 3.618421052631579e-05, "loss": 1.8797, "step": 110 }, { "epoch": 0.07257133432294244, "eval_loss": 1.6749211549758911, "eval_runtime": 282.4596, "eval_samples_per_second": 18.077, "eval_steps_per_second": 0.566, "step": 110 }, { "epoch": 0.07916872835230084, "grad_norm": 0.8517773151397705, "learning_rate": 3.9473684210526316e-05, "loss": 1.5041, "step": 120 }, { "epoch": 0.07916872835230084, "eval_loss": 1.322484016418457, "eval_runtime": 282.425, "eval_samples_per_second": 18.079, "eval_steps_per_second": 0.567, "step": 120 }, { "epoch": 0.08576612238165925, "grad_norm": 0.924010694026947, "learning_rate": 4.2763157894736847e-05, "loss": 1.1712, "step": 130 }, { "epoch": 0.08576612238165925, "eval_loss": 1.0092010498046875, "eval_runtime": 282.4135, "eval_samples_per_second": 18.08, "eval_steps_per_second": 0.567, "step": 130 }, { "epoch": 0.09236351641101764, "grad_norm": 0.599291205406189, "learning_rate": 4.605263157894737e-05, "loss": 0.9068, "step": 140 }, { "epoch": 0.09236351641101764, "eval_loss": 0.8262147307395935, "eval_runtime": 282.3089, "eval_samples_per_second": 18.087, "eval_steps_per_second": 0.567, "step": 140 }, { "epoch": 0.09896091044037605, "grad_norm": 0.4690685570240021, "learning_rate": 4.9342105263157894e-05, "loss": 0.7821, "step": 150 }, { "epoch": 0.09896091044037605, "eval_loss": 0.7165087461471558, "eval_runtime": 282.1805, "eval_samples_per_second": 18.095, "eval_steps_per_second": 0.567, "step": 150 }, { "epoch": 0.10555830446973445, "grad_norm": 0.5129504203796387, "learning_rate": 4.9706529713866475e-05, "loss": 0.6867, "step": 160 }, { "epoch": 0.10555830446973445, "eval_loss": 0.6483914256095886, "eval_runtime": 282.207, "eval_samples_per_second": 18.093, "eval_steps_per_second": 0.567, "step": 160 }, { "epoch": 0.11215569849909286, "grad_norm": 0.4116060435771942, "learning_rate": 4.933969185619956e-05, "loss": 0.6315, "step": 170 }, { "epoch": 0.11215569849909286, "eval_loss": 0.6029236912727356, "eval_runtime": 282.3594, "eval_samples_per_second": 18.083, "eval_steps_per_second": 0.567, "step": 170 }, { "epoch": 0.11875309252845126, "grad_norm": 0.4301941990852356, "learning_rate": 4.8972853998532655e-05, "loss": 0.5833, "step": 180 }, { "epoch": 0.11875309252845126, "eval_loss": 0.5720704197883606, "eval_runtime": 282.0439, "eval_samples_per_second": 18.104, "eval_steps_per_second": 0.567, "step": 180 }, { "epoch": 0.12535048655780967, "grad_norm": 0.5514675378799438, "learning_rate": 4.860601614086574e-05, "loss": 0.5667, "step": 190 }, { "epoch": 0.12535048655780967, "eval_loss": 0.5490744709968567, "eval_runtime": 282.3239, "eval_samples_per_second": 18.086, "eval_steps_per_second": 0.567, "step": 190 }, { "epoch": 0.13194788058716808, "grad_norm": 0.4441012144088745, "learning_rate": 4.823917828319883e-05, "loss": 0.5303, "step": 200 }, { "epoch": 0.13194788058716808, "eval_loss": 0.5291892886161804, "eval_runtime": 282.4233, "eval_samples_per_second": 18.079, "eval_steps_per_second": 0.567, "step": 200 }, { "epoch": 0.13854527461652646, "grad_norm": 0.4900895059108734, "learning_rate": 4.787234042553192e-05, "loss": 0.5109, "step": 210 }, { "epoch": 0.13854527461652646, "eval_loss": 0.5132142901420593, "eval_runtime": 282.4262, "eval_samples_per_second": 18.079, "eval_steps_per_second": 0.567, "step": 210 }, { "epoch": 0.14514266864588488, "grad_norm": 0.4287125766277313, "learning_rate": 4.750550256786501e-05, "loss": 0.5109, "step": 220 }, { "epoch": 0.14514266864588488, "eval_loss": 0.4994109869003296, "eval_runtime": 282.5477, "eval_samples_per_second": 18.071, "eval_steps_per_second": 0.566, "step": 220 }, { "epoch": 0.1517400626752433, "grad_norm": 0.4390946626663208, "learning_rate": 4.713866471019809e-05, "loss": 0.4952, "step": 230 }, { "epoch": 0.1517400626752433, "eval_loss": 0.4881007969379425, "eval_runtime": 282.5158, "eval_samples_per_second": 18.073, "eval_steps_per_second": 0.566, "step": 230 }, { "epoch": 0.15833745670460167, "grad_norm": 0.5645927786827087, "learning_rate": 4.677182685253118e-05, "loss": 0.488, "step": 240 }, { "epoch": 0.15833745670460167, "eval_loss": 0.4788074791431427, "eval_runtime": 282.6337, "eval_samples_per_second": 18.066, "eval_steps_per_second": 0.566, "step": 240 }, { "epoch": 0.16493485073396008, "grad_norm": 0.5930696129798889, "learning_rate": 4.6404988994864274e-05, "loss": 0.4833, "step": 250 }, { "epoch": 0.16493485073396008, "eval_loss": 0.4700024425983429, "eval_runtime": 282.5674, "eval_samples_per_second": 18.07, "eval_steps_per_second": 0.566, "step": 250 }, { "epoch": 0.1715322447633185, "grad_norm": 0.408792644739151, "learning_rate": 4.6038151137197364e-05, "loss": 0.4696, "step": 260 }, { "epoch": 0.1715322447633185, "eval_loss": 0.46224626898765564, "eval_runtime": 282.2108, "eval_samples_per_second": 18.093, "eval_steps_per_second": 0.567, "step": 260 }, { "epoch": 0.1781296387926769, "grad_norm": 0.5276111960411072, "learning_rate": 4.567131327953045e-05, "loss": 0.4579, "step": 270 }, { "epoch": 0.1781296387926769, "eval_loss": 0.455151230096817, "eval_runtime": 282.6419, "eval_samples_per_second": 18.065, "eval_steps_per_second": 0.566, "step": 270 }, { "epoch": 0.1847270328220353, "grad_norm": 0.5726278424263, "learning_rate": 4.530447542186354e-05, "loss": 0.4547, "step": 280 }, { "epoch": 0.1847270328220353, "eval_loss": 0.44886669516563416, "eval_runtime": 282.6094, "eval_samples_per_second": 18.067, "eval_steps_per_second": 0.566, "step": 280 }, { "epoch": 0.1913244268513937, "grad_norm": 0.4161057770252228, "learning_rate": 4.493763756419663e-05, "loss": 0.4513, "step": 290 }, { "epoch": 0.1913244268513937, "eval_loss": 0.44479382038116455, "eval_runtime": 282.5015, "eval_samples_per_second": 18.074, "eval_steps_per_second": 0.566, "step": 290 }, { "epoch": 0.1979218208807521, "grad_norm": 0.4173205494880676, "learning_rate": 4.457079970652971e-05, "loss": 0.4364, "step": 300 }, { "epoch": 0.1979218208807521, "eval_loss": 0.4391051232814789, "eval_runtime": 282.6206, "eval_samples_per_second": 18.067, "eval_steps_per_second": 0.566, "step": 300 }, { "epoch": 0.20451921491011052, "grad_norm": 0.4726370871067047, "learning_rate": 4.420396184886281e-05, "loss": 0.4413, "step": 310 }, { "epoch": 0.20451921491011052, "eval_loss": 0.43503668904304504, "eval_runtime": 282.4681, "eval_samples_per_second": 18.076, "eval_steps_per_second": 0.566, "step": 310 }, { "epoch": 0.2111166089394689, "grad_norm": 0.528791069984436, "learning_rate": 4.383712399119589e-05, "loss": 0.4347, "step": 320 }, { "epoch": 0.2111166089394689, "eval_loss": 0.4299105107784271, "eval_runtime": 282.3837, "eval_samples_per_second": 18.082, "eval_steps_per_second": 0.567, "step": 320 }, { "epoch": 0.2177140029688273, "grad_norm": 0.4637807607650757, "learning_rate": 4.347028613352898e-05, "loss": 0.4263, "step": 330 }, { "epoch": 0.2177140029688273, "eval_loss": 0.42648786306381226, "eval_runtime": 282.4045, "eval_samples_per_second": 18.08, "eval_steps_per_second": 0.567, "step": 330 }, { "epoch": 0.22431139699818572, "grad_norm": 0.5652016401290894, "learning_rate": 4.3103448275862066e-05, "loss": 0.424, "step": 340 }, { "epoch": 0.22431139699818572, "eval_loss": 0.42295747995376587, "eval_runtime": 282.132, "eval_samples_per_second": 18.098, "eval_steps_per_second": 0.567, "step": 340 }, { "epoch": 0.2309087910275441, "grad_norm": 0.5480278134346008, "learning_rate": 4.273661041819516e-05, "loss": 0.4246, "step": 350 }, { "epoch": 0.2309087910275441, "eval_loss": 0.4191484749317169, "eval_runtime": 282.8434, "eval_samples_per_second": 18.052, "eval_steps_per_second": 0.566, "step": 350 }, { "epoch": 0.23750618505690252, "grad_norm": 0.5954585671424866, "learning_rate": 4.2369772560528246e-05, "loss": 0.4194, "step": 360 }, { "epoch": 0.23750618505690252, "eval_loss": 0.4160284399986267, "eval_runtime": 282.0609, "eval_samples_per_second": 18.102, "eval_steps_per_second": 0.567, "step": 360 }, { "epoch": 0.24410357908626093, "grad_norm": 0.6444059610366821, "learning_rate": 4.2002934702861336e-05, "loss": 0.4151, "step": 370 }, { "epoch": 0.24410357908626093, "eval_loss": 0.4145909547805786, "eval_runtime": 282.2951, "eval_samples_per_second": 18.087, "eval_steps_per_second": 0.567, "step": 370 }, { "epoch": 0.25070097311561934, "grad_norm": 0.5506740212440491, "learning_rate": 4.163609684519443e-05, "loss": 0.4143, "step": 380 }, { "epoch": 0.25070097311561934, "eval_loss": 0.41122376918792725, "eval_runtime": 282.3323, "eval_samples_per_second": 18.085, "eval_steps_per_second": 0.567, "step": 380 }, { "epoch": 0.2572983671449777, "grad_norm": 0.5273014307022095, "learning_rate": 4.126925898752752e-05, "loss": 0.4059, "step": 390 }, { "epoch": 0.2572983671449777, "eval_loss": 0.41092923283576965, "eval_runtime": 282.2453, "eval_samples_per_second": 18.091, "eval_steps_per_second": 0.567, "step": 390 }, { "epoch": 0.26389576117433616, "grad_norm": 0.5662177205085754, "learning_rate": 4.09024211298606e-05, "loss": 0.411, "step": 400 }, { "epoch": 0.26389576117433616, "eval_loss": 0.40584176778793335, "eval_runtime": 282.2166, "eval_samples_per_second": 18.092, "eval_steps_per_second": 0.567, "step": 400 }, { "epoch": 0.27049315520369455, "grad_norm": 0.6541048884391785, "learning_rate": 4.05355832721937e-05, "loss": 0.4017, "step": 410 }, { "epoch": 0.27049315520369455, "eval_loss": 0.4048697054386139, "eval_runtime": 282.3287, "eval_samples_per_second": 18.085, "eval_steps_per_second": 0.567, "step": 410 }, { "epoch": 0.27709054923305293, "grad_norm": 0.490032821893692, "learning_rate": 4.016874541452678e-05, "loss": 0.3986, "step": 420 }, { "epoch": 0.27709054923305293, "eval_loss": 0.4038483202457428, "eval_runtime": 282.2547, "eval_samples_per_second": 18.09, "eval_steps_per_second": 0.567, "step": 420 }, { "epoch": 0.28368794326241137, "grad_norm": 0.4884147644042969, "learning_rate": 3.980190755685987e-05, "loss": 0.4003, "step": 430 }, { "epoch": 0.28368794326241137, "eval_loss": 0.40054452419281006, "eval_runtime": 282.2046, "eval_samples_per_second": 18.093, "eval_steps_per_second": 0.567, "step": 430 }, { "epoch": 0.29028533729176975, "grad_norm": 0.5234985947608948, "learning_rate": 3.943506969919296e-05, "loss": 0.4005, "step": 440 }, { "epoch": 0.29028533729176975, "eval_loss": 0.39862698316574097, "eval_runtime": 282.1876, "eval_samples_per_second": 18.094, "eval_steps_per_second": 0.567, "step": 440 }, { "epoch": 0.29688273132112813, "grad_norm": 0.512843668460846, "learning_rate": 3.9068231841526045e-05, "loss": 0.3952, "step": 450 }, { "epoch": 0.29688273132112813, "eval_loss": 0.3970402181148529, "eval_runtime": 282.2615, "eval_samples_per_second": 18.09, "eval_steps_per_second": 0.567, "step": 450 }, { "epoch": 0.3034801253504866, "grad_norm": 0.46752044558525085, "learning_rate": 3.8701393983859135e-05, "loss": 0.3967, "step": 460 }, { "epoch": 0.3034801253504866, "eval_loss": 0.39410126209259033, "eval_runtime": 282.2669, "eval_samples_per_second": 18.089, "eval_steps_per_second": 0.567, "step": 460 }, { "epoch": 0.31007751937984496, "grad_norm": 0.49608954787254333, "learning_rate": 3.8334556126192226e-05, "loss": 0.3893, "step": 470 }, { "epoch": 0.31007751937984496, "eval_loss": 0.3944699168205261, "eval_runtime": 282.3461, "eval_samples_per_second": 18.084, "eval_steps_per_second": 0.567, "step": 470 }, { "epoch": 0.31667491340920334, "grad_norm": 0.4948144853115082, "learning_rate": 3.7967718268525316e-05, "loss": 0.3883, "step": 480 }, { "epoch": 0.31667491340920334, "eval_loss": 0.3926471471786499, "eval_runtime": 282.4921, "eval_samples_per_second": 18.075, "eval_steps_per_second": 0.566, "step": 480 }, { "epoch": 0.3232723074385618, "grad_norm": 0.5696155428886414, "learning_rate": 3.76008804108584e-05, "loss": 0.3921, "step": 490 }, { "epoch": 0.3232723074385618, "eval_loss": 0.39166340231895447, "eval_runtime": 282.2989, "eval_samples_per_second": 18.087, "eval_steps_per_second": 0.567, "step": 490 }, { "epoch": 0.32986970146792016, "grad_norm": 0.5400599241256714, "learning_rate": 3.723404255319149e-05, "loss": 0.3927, "step": 500 }, { "epoch": 0.32986970146792016, "eval_loss": 0.3899206817150116, "eval_runtime": 282.6164, "eval_samples_per_second": 18.067, "eval_steps_per_second": 0.566, "step": 500 }, { "epoch": 0.3364670954972786, "grad_norm": 0.5328640937805176, "learning_rate": 3.686720469552458e-05, "loss": 0.3893, "step": 510 }, { "epoch": 0.3364670954972786, "eval_loss": 0.388072669506073, "eval_runtime": 282.6361, "eval_samples_per_second": 18.066, "eval_steps_per_second": 0.566, "step": 510 }, { "epoch": 0.343064489526637, "grad_norm": 0.6197268962860107, "learning_rate": 3.650036683785767e-05, "loss": 0.3896, "step": 520 }, { "epoch": 0.343064489526637, "eval_loss": 0.38577401638031006, "eval_runtime": 282.5926, "eval_samples_per_second": 18.068, "eval_steps_per_second": 0.566, "step": 520 }, { "epoch": 0.34966188355599537, "grad_norm": 0.5348854064941406, "learning_rate": 3.6133528980190754e-05, "loss": 0.3824, "step": 530 }, { "epoch": 0.34966188355599537, "eval_loss": 0.38608622550964355, "eval_runtime": 282.758, "eval_samples_per_second": 18.058, "eval_steps_per_second": 0.566, "step": 530 }, { "epoch": 0.3562592775853538, "grad_norm": 0.46607691049575806, "learning_rate": 3.576669112252385e-05, "loss": 0.39, "step": 540 }, { "epoch": 0.3562592775853538, "eval_loss": 0.3843710720539093, "eval_runtime": 282.6255, "eval_samples_per_second": 18.066, "eval_steps_per_second": 0.566, "step": 540 }, { "epoch": 0.3628566716147122, "grad_norm": 0.5211663246154785, "learning_rate": 3.5399853264856934e-05, "loss": 0.3831, "step": 550 }, { "epoch": 0.3628566716147122, "eval_loss": 0.38406872749328613, "eval_runtime": 282.6083, "eval_samples_per_second": 18.067, "eval_steps_per_second": 0.566, "step": 550 }, { "epoch": 0.3694540656440706, "grad_norm": 0.4613727927207947, "learning_rate": 3.5033015407190025e-05, "loss": 0.3848, "step": 560 }, { "epoch": 0.3694540656440706, "eval_loss": 0.38266727328300476, "eval_runtime": 282.5681, "eval_samples_per_second": 18.07, "eval_steps_per_second": 0.566, "step": 560 }, { "epoch": 0.376051459673429, "grad_norm": 0.4561725854873657, "learning_rate": 3.466617754952311e-05, "loss": 0.3771, "step": 570 }, { "epoch": 0.376051459673429, "eval_loss": 0.38093405961990356, "eval_runtime": 282.579, "eval_samples_per_second": 18.069, "eval_steps_per_second": 0.566, "step": 570 }, { "epoch": 0.3826488537027874, "grad_norm": 0.5360392928123474, "learning_rate": 3.4299339691856205e-05, "loss": 0.3819, "step": 580 }, { "epoch": 0.3826488537027874, "eval_loss": 0.380355566740036, "eval_runtime": 282.5981, "eval_samples_per_second": 18.068, "eval_steps_per_second": 0.566, "step": 580 }, { "epoch": 0.3892462477321458, "grad_norm": 0.6373459696769714, "learning_rate": 3.393250183418929e-05, "loss": 0.3773, "step": 590 }, { "epoch": 0.3892462477321458, "eval_loss": 0.37904950976371765, "eval_runtime": 282.4657, "eval_samples_per_second": 18.077, "eval_steps_per_second": 0.566, "step": 590 }, { "epoch": 0.3958436417615042, "grad_norm": 0.4231775403022766, "learning_rate": 3.356566397652238e-05, "loss": 0.3736, "step": 600 }, { "epoch": 0.3958436417615042, "eval_loss": 0.37867945432662964, "eval_runtime": 282.479, "eval_samples_per_second": 18.076, "eval_steps_per_second": 0.566, "step": 600 }, { "epoch": 0.4024410357908626, "grad_norm": 0.5430477261543274, "learning_rate": 3.319882611885547e-05, "loss": 0.3778, "step": 610 }, { "epoch": 0.4024410357908626, "eval_loss": 0.3770335912704468, "eval_runtime": 282.4938, "eval_samples_per_second": 18.075, "eval_steps_per_second": 0.566, "step": 610 }, { "epoch": 0.40903842982022104, "grad_norm": 0.5898250341415405, "learning_rate": 3.283198826118856e-05, "loss": 0.3783, "step": 620 }, { "epoch": 0.40903842982022104, "eval_loss": 0.37556448578834534, "eval_runtime": 282.4757, "eval_samples_per_second": 18.076, "eval_steps_per_second": 0.566, "step": 620 }, { "epoch": 0.4156358238495794, "grad_norm": 0.5367993116378784, "learning_rate": 3.246515040352164e-05, "loss": 0.3726, "step": 630 }, { "epoch": 0.4156358238495794, "eval_loss": 0.37541213631629944, "eval_runtime": 282.4621, "eval_samples_per_second": 18.077, "eval_steps_per_second": 0.566, "step": 630 }, { "epoch": 0.4222332178789378, "grad_norm": 0.5479997992515564, "learning_rate": 3.209831254585473e-05, "loss": 0.3747, "step": 640 }, { "epoch": 0.4222332178789378, "eval_loss": 0.37338846921920776, "eval_runtime": 281.8183, "eval_samples_per_second": 18.118, "eval_steps_per_second": 0.568, "step": 640 }, { "epoch": 0.42883061190829624, "grad_norm": 0.5168615579605103, "learning_rate": 3.1731474688187823e-05, "loss": 0.3748, "step": 650 }, { "epoch": 0.42883061190829624, "eval_loss": 0.373104453086853, "eval_runtime": 282.3532, "eval_samples_per_second": 18.084, "eval_steps_per_second": 0.567, "step": 650 }, { "epoch": 0.4354280059376546, "grad_norm": 0.4789239764213562, "learning_rate": 3.1364636830520914e-05, "loss": 0.3725, "step": 660 }, { "epoch": 0.4354280059376546, "eval_loss": 0.37169981002807617, "eval_runtime": 282.5035, "eval_samples_per_second": 18.074, "eval_steps_per_second": 0.566, "step": 660 }, { "epoch": 0.442025399967013, "grad_norm": 0.5288018584251404, "learning_rate": 3.0997798972854e-05, "loss": 0.3715, "step": 670 }, { "epoch": 0.442025399967013, "eval_loss": 0.3723226487636566, "eval_runtime": 282.4315, "eval_samples_per_second": 18.079, "eval_steps_per_second": 0.567, "step": 670 }, { "epoch": 0.44862279399637145, "grad_norm": 0.5399067997932434, "learning_rate": 3.063096111518709e-05, "loss": 0.3672, "step": 680 }, { "epoch": 0.44862279399637145, "eval_loss": 0.3709748387336731, "eval_runtime": 282.3214, "eval_samples_per_second": 18.086, "eval_steps_per_second": 0.567, "step": 680 }, { "epoch": 0.45522018802572983, "grad_norm": 0.5560658574104309, "learning_rate": 3.0264123257520178e-05, "loss": 0.3703, "step": 690 }, { "epoch": 0.45522018802572983, "eval_loss": 0.3694921135902405, "eval_runtime": 282.225, "eval_samples_per_second": 18.092, "eval_steps_per_second": 0.567, "step": 690 }, { "epoch": 0.4618175820550882, "grad_norm": 0.46738743782043457, "learning_rate": 2.9897285399853265e-05, "loss": 0.3691, "step": 700 }, { "epoch": 0.4618175820550882, "eval_loss": 0.36861899495124817, "eval_runtime": 281.4833, "eval_samples_per_second": 18.14, "eval_steps_per_second": 0.568, "step": 700 }, { "epoch": 0.46841497608444665, "grad_norm": 0.5636041760444641, "learning_rate": 2.953044754218636e-05, "loss": 0.3674, "step": 710 }, { "epoch": 0.46841497608444665, "eval_loss": 0.3687904477119446, "eval_runtime": 281.7696, "eval_samples_per_second": 18.121, "eval_steps_per_second": 0.568, "step": 710 }, { "epoch": 0.47501237011380504, "grad_norm": 0.6417700052261353, "learning_rate": 2.9163609684519445e-05, "loss": 0.3676, "step": 720 }, { "epoch": 0.47501237011380504, "eval_loss": 0.36721163988113403, "eval_runtime": 281.7105, "eval_samples_per_second": 18.125, "eval_steps_per_second": 0.568, "step": 720 }, { "epoch": 0.4816097641431635, "grad_norm": 0.5903835892677307, "learning_rate": 2.8796771826852532e-05, "loss": 0.3687, "step": 730 }, { "epoch": 0.4816097641431635, "eval_loss": 0.36735713481903076, "eval_runtime": 280.4935, "eval_samples_per_second": 18.204, "eval_steps_per_second": 0.57, "step": 730 }, { "epoch": 0.48820715817252186, "grad_norm": 0.6300948858261108, "learning_rate": 2.8429933969185622e-05, "loss": 0.3649, "step": 740 }, { "epoch": 0.48820715817252186, "eval_loss": 0.3654569089412689, "eval_runtime": 282.5933, "eval_samples_per_second": 18.068, "eval_steps_per_second": 0.566, "step": 740 }, { "epoch": 0.49480455220188024, "grad_norm": 0.5772879123687744, "learning_rate": 2.806309611151871e-05, "loss": 0.3636, "step": 750 }, { "epoch": 0.49480455220188024, "eval_loss": 0.3652118444442749, "eval_runtime": 281.9717, "eval_samples_per_second": 18.108, "eval_steps_per_second": 0.567, "step": 750 }, { "epoch": 0.5014019462312387, "grad_norm": 0.46134456992149353, "learning_rate": 2.7696258253851796e-05, "loss": 0.3626, "step": 760 }, { "epoch": 0.5014019462312387, "eval_loss": 0.364380419254303, "eval_runtime": 281.1459, "eval_samples_per_second": 18.161, "eval_steps_per_second": 0.569, "step": 760 }, { "epoch": 0.507999340260597, "grad_norm": 0.5804652571678162, "learning_rate": 2.7329420396184886e-05, "loss": 0.3611, "step": 770 }, { "epoch": 0.507999340260597, "eval_loss": 0.3632882237434387, "eval_runtime": 281.5379, "eval_samples_per_second": 18.136, "eval_steps_per_second": 0.568, "step": 770 }, { "epoch": 0.5145967342899554, "grad_norm": 0.45915406942367554, "learning_rate": 2.6962582538517977e-05, "loss": 0.3599, "step": 780 }, { "epoch": 0.5145967342899554, "eval_loss": 0.3634037375450134, "eval_runtime": 281.4618, "eval_samples_per_second": 18.141, "eval_steps_per_second": 0.568, "step": 780 }, { "epoch": 0.5211941283193139, "grad_norm": 0.4998124837875366, "learning_rate": 2.6595744680851064e-05, "loss": 0.3619, "step": 790 }, { "epoch": 0.5211941283193139, "eval_loss": 0.3615591526031494, "eval_runtime": 282.3789, "eval_samples_per_second": 18.082, "eval_steps_per_second": 0.567, "step": 790 }, { "epoch": 0.5277915223486723, "grad_norm": 0.5479339361190796, "learning_rate": 2.622890682318415e-05, "loss": 0.3577, "step": 800 }, { "epoch": 0.5277915223486723, "eval_loss": 0.3610810339450836, "eval_runtime": 282.2307, "eval_samples_per_second": 18.092, "eval_steps_per_second": 0.567, "step": 800 }, { "epoch": 0.5343889163780307, "grad_norm": 0.5121034383773804, "learning_rate": 2.5862068965517244e-05, "loss": 0.3621, "step": 810 }, { "epoch": 0.5343889163780307, "eval_loss": 0.3605840802192688, "eval_runtime": 281.2796, "eval_samples_per_second": 18.153, "eval_steps_per_second": 0.569, "step": 810 }, { "epoch": 0.5409863104073891, "grad_norm": 0.5326396822929382, "learning_rate": 2.549523110785033e-05, "loss": 0.3575, "step": 820 }, { "epoch": 0.5409863104073891, "eval_loss": 0.359646201133728, "eval_runtime": 281.2148, "eval_samples_per_second": 18.157, "eval_steps_per_second": 0.569, "step": 820 }, { "epoch": 0.5475837044367475, "grad_norm": 0.5050413608551025, "learning_rate": 2.5128393250183418e-05, "loss": 0.3545, "step": 830 }, { "epoch": 0.5475837044367475, "eval_loss": 0.3594898581504822, "eval_runtime": 281.2594, "eval_samples_per_second": 18.154, "eval_steps_per_second": 0.569, "step": 830 }, { "epoch": 0.5541810984661059, "grad_norm": 0.45093920826911926, "learning_rate": 2.4761555392516508e-05, "loss": 0.3585, "step": 840 }, { "epoch": 0.5541810984661059, "eval_loss": 0.35947272181510925, "eval_runtime": 280.5764, "eval_samples_per_second": 18.198, "eval_steps_per_second": 0.57, "step": 840 }, { "epoch": 0.5607784924954643, "grad_norm": 0.44670212268829346, "learning_rate": 2.43947175348496e-05, "loss": 0.3569, "step": 850 }, { "epoch": 0.5607784924954643, "eval_loss": 0.3579043745994568, "eval_runtime": 281.4427, "eval_samples_per_second": 18.142, "eval_steps_per_second": 0.568, "step": 850 }, { "epoch": 0.5673758865248227, "grad_norm": 0.5422260761260986, "learning_rate": 2.4027879677182685e-05, "loss": 0.3525, "step": 860 }, { "epoch": 0.5673758865248227, "eval_loss": 0.35847437381744385, "eval_runtime": 280.6628, "eval_samples_per_second": 18.193, "eval_steps_per_second": 0.57, "step": 860 }, { "epoch": 0.5739732805541811, "grad_norm": 0.49470987915992737, "learning_rate": 2.3661041819515776e-05, "loss": 0.3528, "step": 870 }, { "epoch": 0.5739732805541811, "eval_loss": 0.35762321949005127, "eval_runtime": 281.4774, "eval_samples_per_second": 18.14, "eval_steps_per_second": 0.568, "step": 870 }, { "epoch": 0.5805706745835395, "grad_norm": 0.5388288497924805, "learning_rate": 2.3294203961848866e-05, "loss": 0.3533, "step": 880 }, { "epoch": 0.5805706745835395, "eval_loss": 0.3578256666660309, "eval_runtime": 280.4896, "eval_samples_per_second": 18.204, "eval_steps_per_second": 0.57, "step": 880 }, { "epoch": 0.5871680686128979, "grad_norm": 0.4280325174331665, "learning_rate": 2.2927366104181953e-05, "loss": 0.3574, "step": 890 }, { "epoch": 0.5871680686128979, "eval_loss": 0.35690200328826904, "eval_runtime": 280.7823, "eval_samples_per_second": 18.185, "eval_steps_per_second": 0.57, "step": 890 }, { "epoch": 0.5937654626422563, "grad_norm": 0.6243644952774048, "learning_rate": 2.2560528246515043e-05, "loss": 0.356, "step": 900 }, { "epoch": 0.5937654626422563, "eval_loss": 0.3565099537372589, "eval_runtime": 280.4402, "eval_samples_per_second": 18.207, "eval_steps_per_second": 0.571, "step": 900 }, { "epoch": 0.6003628566716147, "grad_norm": 0.5401405096054077, "learning_rate": 2.219369038884813e-05, "loss": 0.3594, "step": 910 }, { "epoch": 0.6003628566716147, "eval_loss": 0.3559376895427704, "eval_runtime": 281.1004, "eval_samples_per_second": 18.164, "eval_steps_per_second": 0.569, "step": 910 }, { "epoch": 0.6069602507009731, "grad_norm": 0.578926682472229, "learning_rate": 2.182685253118122e-05, "loss": 0.3542, "step": 920 }, { "epoch": 0.6069602507009731, "eval_loss": 0.35559171438217163, "eval_runtime": 281.4745, "eval_samples_per_second": 18.14, "eval_steps_per_second": 0.568, "step": 920 }, { "epoch": 0.6135576447303315, "grad_norm": 0.460857629776001, "learning_rate": 2.1460014673514307e-05, "loss": 0.3587, "step": 930 }, { "epoch": 0.6135576447303315, "eval_loss": 0.35534095764160156, "eval_runtime": 280.8406, "eval_samples_per_second": 18.181, "eval_steps_per_second": 0.57, "step": 930 }, { "epoch": 0.6201550387596899, "grad_norm": 0.5364406704902649, "learning_rate": 2.1093176815847397e-05, "loss": 0.353, "step": 940 }, { "epoch": 0.6201550387596899, "eval_loss": 0.3539869785308838, "eval_runtime": 280.7515, "eval_samples_per_second": 18.187, "eval_steps_per_second": 0.57, "step": 940 }, { "epoch": 0.6267524327890484, "grad_norm": 0.50699782371521, "learning_rate": 2.0726338958180484e-05, "loss": 0.3525, "step": 950 }, { "epoch": 0.6267524327890484, "eval_loss": 0.35391008853912354, "eval_runtime": 281.3705, "eval_samples_per_second": 18.147, "eval_steps_per_second": 0.569, "step": 950 }, { "epoch": 0.6333498268184067, "grad_norm": 0.6315603852272034, "learning_rate": 2.035950110051357e-05, "loss": 0.3546, "step": 960 }, { "epoch": 0.6333498268184067, "eval_loss": 0.35362812876701355, "eval_runtime": 281.4224, "eval_samples_per_second": 18.144, "eval_steps_per_second": 0.569, "step": 960 }, { "epoch": 0.6399472208477651, "grad_norm": 0.5252578258514404, "learning_rate": 1.999266324284666e-05, "loss": 0.3521, "step": 970 }, { "epoch": 0.6399472208477651, "eval_loss": 0.3530423045158386, "eval_runtime": 280.5129, "eval_samples_per_second": 18.202, "eval_steps_per_second": 0.57, "step": 970 }, { "epoch": 0.6465446148771236, "grad_norm": 0.5415360331535339, "learning_rate": 1.962582538517975e-05, "loss": 0.3543, "step": 980 }, { "epoch": 0.6465446148771236, "eval_loss": 0.3528820276260376, "eval_runtime": 280.8145, "eval_samples_per_second": 18.183, "eval_steps_per_second": 0.57, "step": 980 }, { "epoch": 0.6531420089064819, "grad_norm": 0.5389861464500427, "learning_rate": 1.925898752751284e-05, "loss": 0.3528, "step": 990 }, { "epoch": 0.6531420089064819, "eval_loss": 0.3525787889957428, "eval_runtime": 280.6494, "eval_samples_per_second": 18.194, "eval_steps_per_second": 0.57, "step": 990 }, { "epoch": 0.6597394029358403, "grad_norm": 0.47096070647239685, "learning_rate": 1.889214966984593e-05, "loss": 0.3514, "step": 1000 }, { "epoch": 0.6597394029358403, "eval_loss": 0.352372944355011, "eval_runtime": 280.6707, "eval_samples_per_second": 18.192, "eval_steps_per_second": 0.57, "step": 1000 } ], "logging_steps": 10, "max_steps": 1515, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.3527241133195264e+17, "train_batch_size": 16, "trial_name": null, "trial_params": null }