|
{ |
|
"best_metric": 0.7777777777777778, |
|
"best_model_checkpoint": "CTMAE2_CS_V7_2/checkpoint-2600", |
|
"epoch": 49.016216216216215, |
|
"eval_steps": 500, |
|
"global_step": 12950, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0007722007722007722, |
|
"grad_norm": 2.6724629402160645, |
|
"learning_rate": 7.722007722007723e-08, |
|
"loss": 0.7116, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.0015444015444015444, |
|
"grad_norm": 6.020544052124023, |
|
"learning_rate": 1.5444015444015445e-07, |
|
"loss": 0.6837, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.0023166023166023165, |
|
"grad_norm": 6.438848972320557, |
|
"learning_rate": 2.3166023166023168e-07, |
|
"loss": 0.6963, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.003088803088803089, |
|
"grad_norm": 3.0015923976898193, |
|
"learning_rate": 3.088803088803089e-07, |
|
"loss": 0.6929, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.003861003861003861, |
|
"grad_norm": 3.25985050201416, |
|
"learning_rate": 3.8610038610038613e-07, |
|
"loss": 0.6872, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.004633204633204633, |
|
"grad_norm": 3.701803207397461, |
|
"learning_rate": 4.6332046332046336e-07, |
|
"loss": 0.6878, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.005405405405405406, |
|
"grad_norm": 3.4109742641448975, |
|
"learning_rate": 5.405405405405406e-07, |
|
"loss": 0.6827, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.006177606177606178, |
|
"grad_norm": 3.098447799682617, |
|
"learning_rate": 6.177606177606178e-07, |
|
"loss": 0.6798, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.0069498069498069494, |
|
"grad_norm": 5.643575668334961, |
|
"learning_rate": 6.94980694980695e-07, |
|
"loss": 0.6694, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.007722007722007722, |
|
"grad_norm": 5.473111152648926, |
|
"learning_rate": 7.722007722007723e-07, |
|
"loss": 0.658, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.008494208494208495, |
|
"grad_norm": 7.615536212921143, |
|
"learning_rate": 8.494208494208495e-07, |
|
"loss": 0.6749, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.009266409266409266, |
|
"grad_norm": 3.840909957885742, |
|
"learning_rate": 9.266409266409267e-07, |
|
"loss": 0.6897, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.010038610038610039, |
|
"grad_norm": 4.946235179901123, |
|
"learning_rate": 1.0038610038610038e-06, |
|
"loss": 0.6457, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.010810810810810811, |
|
"grad_norm": 4.3364386558532715, |
|
"learning_rate": 1.0810810810810812e-06, |
|
"loss": 0.5965, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.011583011583011582, |
|
"grad_norm": 5.371955394744873, |
|
"learning_rate": 1.1583011583011585e-06, |
|
"loss": 0.6401, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.012355212355212355, |
|
"grad_norm": 6.771674633026123, |
|
"learning_rate": 1.2355212355212356e-06, |
|
"loss": 0.6538, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.013127413127413128, |
|
"grad_norm": 8.420292854309082, |
|
"learning_rate": 1.3127413127413127e-06, |
|
"loss": 0.6751, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.013899613899613899, |
|
"grad_norm": 11.156353950500488, |
|
"learning_rate": 1.38996138996139e-06, |
|
"loss": 0.645, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.014671814671814672, |
|
"grad_norm": 6.941310405731201, |
|
"learning_rate": 1.4671814671814674e-06, |
|
"loss": 0.5783, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.015444015444015444, |
|
"grad_norm": 29.779354095458984, |
|
"learning_rate": 1.5444015444015445e-06, |
|
"loss": 0.5873, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.016216216216216217, |
|
"grad_norm": 36.21226501464844, |
|
"learning_rate": 1.6216216216216219e-06, |
|
"loss": 0.8262, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.01698841698841699, |
|
"grad_norm": 13.295347213745117, |
|
"learning_rate": 1.698841698841699e-06, |
|
"loss": 0.5922, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.01776061776061776, |
|
"grad_norm": 12.150893211364746, |
|
"learning_rate": 1.7760617760617763e-06, |
|
"loss": 0.5256, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.018532818532818532, |
|
"grad_norm": 18.2507381439209, |
|
"learning_rate": 1.8532818532818534e-06, |
|
"loss": 0.7013, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.019305019305019305, |
|
"grad_norm": 44.01077651977539, |
|
"learning_rate": 1.9305019305019305e-06, |
|
"loss": 0.6761, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.020077220077220077, |
|
"grad_norm": 8.128087997436523, |
|
"learning_rate": 2.0077220077220077e-06, |
|
"loss": 0.5325, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.020077220077220077, |
|
"eval_accuracy": 0.4666666666666667, |
|
"eval_loss": 0.8741418719291687, |
|
"eval_runtime": 14.8248, |
|
"eval_samples_per_second": 3.035, |
|
"eval_steps_per_second": 1.012, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 1.0007722007722009, |
|
"grad_norm": 8.959938049316406, |
|
"learning_rate": 2.084942084942085e-06, |
|
"loss": 0.6265, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 1.0015444015444015, |
|
"grad_norm": 14.598134994506836, |
|
"learning_rate": 2.1621621621621623e-06, |
|
"loss": 0.5794, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 1.0023166023166024, |
|
"grad_norm": 12.391850471496582, |
|
"learning_rate": 2.2393822393822394e-06, |
|
"loss": 0.9588, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 1.003088803088803, |
|
"grad_norm": 19.856307983398438, |
|
"learning_rate": 2.316602316602317e-06, |
|
"loss": 0.6347, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.0038610038610039, |
|
"grad_norm": 9.293577194213867, |
|
"learning_rate": 2.393822393822394e-06, |
|
"loss": 0.621, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 1.0046332046332047, |
|
"grad_norm": 18.37480354309082, |
|
"learning_rate": 2.4710424710424712e-06, |
|
"loss": 0.6988, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 1.0054054054054054, |
|
"grad_norm": 12.758193016052246, |
|
"learning_rate": 2.5482625482625484e-06, |
|
"loss": 0.6771, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 1.0061776061776062, |
|
"grad_norm": 11.605049133300781, |
|
"learning_rate": 2.6254826254826255e-06, |
|
"loss": 0.6083, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 1.0069498069498068, |
|
"grad_norm": 12.44294261932373, |
|
"learning_rate": 2.702702702702703e-06, |
|
"loss": 0.6117, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 1.0077220077220077, |
|
"grad_norm": 6.176090240478516, |
|
"learning_rate": 2.77992277992278e-06, |
|
"loss": 0.6549, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 1.0084942084942086, |
|
"grad_norm": 7.454275608062744, |
|
"learning_rate": 2.8571428571428573e-06, |
|
"loss": 0.6426, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 1.0092664092664092, |
|
"grad_norm": 8.811921119689941, |
|
"learning_rate": 2.934362934362935e-06, |
|
"loss": 0.5786, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 1.01003861003861, |
|
"grad_norm": 10.099629402160645, |
|
"learning_rate": 3.011583011583012e-06, |
|
"loss": 0.6792, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 1.0108108108108107, |
|
"grad_norm": 7.9834136962890625, |
|
"learning_rate": 3.088803088803089e-06, |
|
"loss": 0.6326, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.0115830115830116, |
|
"grad_norm": 22.69242286682129, |
|
"learning_rate": 3.166023166023166e-06, |
|
"loss": 0.6278, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 1.0123552123552124, |
|
"grad_norm": 26.15718650817871, |
|
"learning_rate": 3.2432432432432437e-06, |
|
"loss": 0.509, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 1.013127413127413, |
|
"grad_norm": 9.68459701538086, |
|
"learning_rate": 3.320463320463321e-06, |
|
"loss": 0.5917, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 1.013899613899614, |
|
"grad_norm": 11.585394859313965, |
|
"learning_rate": 3.397683397683398e-06, |
|
"loss": 0.5219, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 1.0146718146718148, |
|
"grad_norm": 25.82895851135254, |
|
"learning_rate": 3.4749034749034755e-06, |
|
"loss": 0.4272, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 1.0154440154440154, |
|
"grad_norm": 36.21267318725586, |
|
"learning_rate": 3.5521235521235526e-06, |
|
"loss": 0.7694, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 1.0162162162162163, |
|
"grad_norm": 9.973220825195312, |
|
"learning_rate": 3.6293436293436297e-06, |
|
"loss": 0.5405, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 1.016988416988417, |
|
"grad_norm": 16.958759307861328, |
|
"learning_rate": 3.706563706563707e-06, |
|
"loss": 0.5025, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 1.0177606177606178, |
|
"grad_norm": 8.369057655334473, |
|
"learning_rate": 3.7837837837837844e-06, |
|
"loss": 0.5491, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 1.0185328185328186, |
|
"grad_norm": 13.40450382232666, |
|
"learning_rate": 3.861003861003861e-06, |
|
"loss": 0.5724, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.0193050193050193, |
|
"grad_norm": 14.084481239318848, |
|
"learning_rate": 3.938223938223939e-06, |
|
"loss": 0.5336, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 1.0200772200772201, |
|
"grad_norm": 7.159802436828613, |
|
"learning_rate": 4.015444015444015e-06, |
|
"loss": 0.5015, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 1.0200772200772201, |
|
"eval_accuracy": 0.4444444444444444, |
|
"eval_loss": 0.9817712306976318, |
|
"eval_runtime": 14.2868, |
|
"eval_samples_per_second": 3.15, |
|
"eval_steps_per_second": 1.05, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 2.0007722007722006, |
|
"grad_norm": 11.340039253234863, |
|
"learning_rate": 4.092664092664093e-06, |
|
"loss": 0.6822, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 2.0015444015444017, |
|
"grad_norm": 22.847156524658203, |
|
"learning_rate": 4.16988416988417e-06, |
|
"loss": 0.5966, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 2.0023166023166024, |
|
"grad_norm": 16.122802734375, |
|
"learning_rate": 4.247104247104247e-06, |
|
"loss": 0.6605, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 2.003088803088803, |
|
"grad_norm": 15.669265747070312, |
|
"learning_rate": 4.324324324324325e-06, |
|
"loss": 0.6131, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 2.003861003861004, |
|
"grad_norm": 1.0841310024261475, |
|
"learning_rate": 4.401544401544402e-06, |
|
"loss": 0.3651, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 2.0046332046332047, |
|
"grad_norm": 13.917180061340332, |
|
"learning_rate": 4.478764478764479e-06, |
|
"loss": 1.0038, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 2.0054054054054054, |
|
"grad_norm": 8.559019088745117, |
|
"learning_rate": 4.5559845559845564e-06, |
|
"loss": 0.4637, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 2.006177606177606, |
|
"grad_norm": 14.26176643371582, |
|
"learning_rate": 4.633204633204634e-06, |
|
"loss": 0.4126, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 2.006949806949807, |
|
"grad_norm": 30.660659790039062, |
|
"learning_rate": 4.710424710424711e-06, |
|
"loss": 0.6691, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 2.0077220077220077, |
|
"grad_norm": 33.097259521484375, |
|
"learning_rate": 4.787644787644788e-06, |
|
"loss": 0.4737, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 2.0084942084942083, |
|
"grad_norm": 32.17298126220703, |
|
"learning_rate": 4.864864864864866e-06, |
|
"loss": 0.5081, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 2.0092664092664094, |
|
"grad_norm": 6.045112133026123, |
|
"learning_rate": 4.9420849420849425e-06, |
|
"loss": 0.7164, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 2.01003861003861, |
|
"grad_norm": 19.53457260131836, |
|
"learning_rate": 5.01930501930502e-06, |
|
"loss": 0.5659, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 2.0108108108108107, |
|
"grad_norm": 51.153682708740234, |
|
"learning_rate": 5.096525096525097e-06, |
|
"loss": 0.7365, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 2.011583011583012, |
|
"grad_norm": 9.247794151306152, |
|
"learning_rate": 5.173745173745173e-06, |
|
"loss": 0.5064, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 2.0123552123552124, |
|
"grad_norm": 15.04385757446289, |
|
"learning_rate": 5.250965250965251e-06, |
|
"loss": 0.5947, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 2.013127413127413, |
|
"grad_norm": 6.074428558349609, |
|
"learning_rate": 5.328185328185329e-06, |
|
"loss": 0.4695, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 2.0138996138996137, |
|
"grad_norm": 6.8704328536987305, |
|
"learning_rate": 5.405405405405406e-06, |
|
"loss": 0.8112, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 2.0146718146718148, |
|
"grad_norm": 24.314729690551758, |
|
"learning_rate": 5.4826254826254836e-06, |
|
"loss": 0.4951, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 2.0154440154440154, |
|
"grad_norm": 7.067862033843994, |
|
"learning_rate": 5.55984555984556e-06, |
|
"loss": 0.5792, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 2.016216216216216, |
|
"grad_norm": 32.30609893798828, |
|
"learning_rate": 5.637065637065637e-06, |
|
"loss": 0.969, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 2.016988416988417, |
|
"grad_norm": 10.73764705657959, |
|
"learning_rate": 5.7142857142857145e-06, |
|
"loss": 0.5096, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 2.0177606177606178, |
|
"grad_norm": 10.906487464904785, |
|
"learning_rate": 5.791505791505791e-06, |
|
"loss": 0.7254, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 2.0185328185328184, |
|
"grad_norm": 25.743921279907227, |
|
"learning_rate": 5.86872586872587e-06, |
|
"loss": 0.5756, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 2.0193050193050195, |
|
"grad_norm": 12.43097972869873, |
|
"learning_rate": 5.945945945945947e-06, |
|
"loss": 0.5903, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 2.02007722007722, |
|
"grad_norm": 45.06875228881836, |
|
"learning_rate": 6.023166023166024e-06, |
|
"loss": 0.5515, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 2.02007722007722, |
|
"eval_accuracy": 0.5111111111111111, |
|
"eval_loss": 0.7133760452270508, |
|
"eval_runtime": 12.995, |
|
"eval_samples_per_second": 3.463, |
|
"eval_steps_per_second": 1.154, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 3.0007722007722006, |
|
"grad_norm": 10.925649642944336, |
|
"learning_rate": 6.1003861003861005e-06, |
|
"loss": 0.7103, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 3.0015444015444017, |
|
"grad_norm": 9.422943115234375, |
|
"learning_rate": 6.177606177606178e-06, |
|
"loss": 0.3968, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 3.0023166023166024, |
|
"grad_norm": 24.057771682739258, |
|
"learning_rate": 6.254826254826255e-06, |
|
"loss": 0.4071, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 3.003088803088803, |
|
"grad_norm": 1.2915865182876587, |
|
"learning_rate": 6.332046332046332e-06, |
|
"loss": 0.7694, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 3.003861003861004, |
|
"grad_norm": 13.25047779083252, |
|
"learning_rate": 6.409266409266411e-06, |
|
"loss": 0.6309, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 3.0046332046332047, |
|
"grad_norm": 21.487367630004883, |
|
"learning_rate": 6.486486486486487e-06, |
|
"loss": 0.7567, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 3.0054054054054054, |
|
"grad_norm": 18.282329559326172, |
|
"learning_rate": 6.563706563706564e-06, |
|
"loss": 0.3432, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 3.006177606177606, |
|
"grad_norm": 19.872180938720703, |
|
"learning_rate": 6.640926640926642e-06, |
|
"loss": 0.6271, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 3.006949806949807, |
|
"grad_norm": 6.643412113189697, |
|
"learning_rate": 6.718146718146718e-06, |
|
"loss": 0.6144, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 3.0077220077220077, |
|
"grad_norm": 29.06749725341797, |
|
"learning_rate": 6.795366795366796e-06, |
|
"loss": 0.7109, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 3.0084942084942083, |
|
"grad_norm": 9.899932861328125, |
|
"learning_rate": 6.872586872586873e-06, |
|
"loss": 0.619, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 3.0092664092664094, |
|
"grad_norm": 7.574603080749512, |
|
"learning_rate": 6.949806949806951e-06, |
|
"loss": 0.5907, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 3.01003861003861, |
|
"grad_norm": 14.081249237060547, |
|
"learning_rate": 7.027027027027028e-06, |
|
"loss": 0.6675, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 3.0108108108108107, |
|
"grad_norm": 16.958438873291016, |
|
"learning_rate": 7.104247104247105e-06, |
|
"loss": 0.5028, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 3.011583011583012, |
|
"grad_norm": 57.52739715576172, |
|
"learning_rate": 7.181467181467182e-06, |
|
"loss": 0.4541, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 3.0123552123552124, |
|
"grad_norm": 9.852923393249512, |
|
"learning_rate": 7.2586872586872595e-06, |
|
"loss": 0.43, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 3.013127413127413, |
|
"grad_norm": 12.256267547607422, |
|
"learning_rate": 7.335907335907336e-06, |
|
"loss": 0.581, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 3.0138996138996137, |
|
"grad_norm": 7.483799457550049, |
|
"learning_rate": 7.413127413127414e-06, |
|
"loss": 0.667, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 3.0146718146718148, |
|
"grad_norm": 13.821316719055176, |
|
"learning_rate": 7.49034749034749e-06, |
|
"loss": 0.6437, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 3.0154440154440154, |
|
"grad_norm": 39.73973083496094, |
|
"learning_rate": 7.567567567567569e-06, |
|
"loss": 0.4757, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 3.016216216216216, |
|
"grad_norm": 7.891088008880615, |
|
"learning_rate": 7.644787644787645e-06, |
|
"loss": 0.5968, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 3.016988416988417, |
|
"grad_norm": 10.223976135253906, |
|
"learning_rate": 7.722007722007722e-06, |
|
"loss": 0.5044, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 3.0177606177606178, |
|
"grad_norm": 44.67567825317383, |
|
"learning_rate": 7.7992277992278e-06, |
|
"loss": 0.3877, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 3.0185328185328184, |
|
"grad_norm": 11.546664237976074, |
|
"learning_rate": 7.876447876447877e-06, |
|
"loss": 0.5934, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 3.0193050193050195, |
|
"grad_norm": 3.135157823562622, |
|
"learning_rate": 7.953667953667954e-06, |
|
"loss": 1.0837, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 3.02007722007722, |
|
"grad_norm": 53.9516716003418, |
|
"learning_rate": 8.03088803088803e-06, |
|
"loss": 0.3752, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 3.02007722007722, |
|
"eval_accuracy": 0.6222222222222222, |
|
"eval_loss": 0.7412399053573608, |
|
"eval_runtime": 13.1721, |
|
"eval_samples_per_second": 3.416, |
|
"eval_steps_per_second": 1.139, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 4.000772200772201, |
|
"grad_norm": 9.257226943969727, |
|
"learning_rate": 8.108108108108109e-06, |
|
"loss": 0.3859, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 4.001544401544401, |
|
"grad_norm": 20.57451057434082, |
|
"learning_rate": 8.185328185328186e-06, |
|
"loss": 0.6062, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 4.002316602316602, |
|
"grad_norm": 11.12028694152832, |
|
"learning_rate": 8.262548262548264e-06, |
|
"loss": 0.4765, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 4.003088803088803, |
|
"grad_norm": 73.99553680419922, |
|
"learning_rate": 8.33976833976834e-06, |
|
"loss": 0.681, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 4.003861003861004, |
|
"grad_norm": 2.5823240280151367, |
|
"learning_rate": 8.416988416988418e-06, |
|
"loss": 0.7646, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 4.004633204633205, |
|
"grad_norm": 25.63375473022461, |
|
"learning_rate": 8.494208494208494e-06, |
|
"loss": 0.2601, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 4.005405405405406, |
|
"grad_norm": 2.795551300048828, |
|
"learning_rate": 8.571428571428571e-06, |
|
"loss": 0.6665, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 4.006177606177606, |
|
"grad_norm": 5.9316205978393555, |
|
"learning_rate": 8.64864864864865e-06, |
|
"loss": 0.5213, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 4.006949806949807, |
|
"grad_norm": 18.198753356933594, |
|
"learning_rate": 8.725868725868728e-06, |
|
"loss": 0.798, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 4.007722007722008, |
|
"grad_norm": 62.04732131958008, |
|
"learning_rate": 8.803088803088804e-06, |
|
"loss": 0.4566, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 4.008494208494208, |
|
"grad_norm": 25.7177791595459, |
|
"learning_rate": 8.880308880308881e-06, |
|
"loss": 0.5294, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 4.009266409266409, |
|
"grad_norm": 13.888814926147461, |
|
"learning_rate": 8.957528957528958e-06, |
|
"loss": 0.4431, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 4.01003861003861, |
|
"grad_norm": 8.091224670410156, |
|
"learning_rate": 9.034749034749034e-06, |
|
"loss": 0.3993, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 4.010810810810811, |
|
"grad_norm": 1.0357552766799927, |
|
"learning_rate": 9.111969111969113e-06, |
|
"loss": 0.7643, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 4.011583011583012, |
|
"grad_norm": 59.91904830932617, |
|
"learning_rate": 9.189189189189191e-06, |
|
"loss": 0.8771, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 4.012355212355212, |
|
"grad_norm": 7.978598117828369, |
|
"learning_rate": 9.266409266409268e-06, |
|
"loss": 0.4757, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 4.013127413127413, |
|
"grad_norm": 35.72863006591797, |
|
"learning_rate": 9.343629343629345e-06, |
|
"loss": 0.5221, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 4.013899613899614, |
|
"grad_norm": 77.4381103515625, |
|
"learning_rate": 9.420849420849421e-06, |
|
"loss": 0.7294, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 4.014671814671814, |
|
"grad_norm": 43.887699127197266, |
|
"learning_rate": 9.498069498069498e-06, |
|
"loss": 0.7571, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 4.015444015444015, |
|
"grad_norm": 22.43756103515625, |
|
"learning_rate": 9.575289575289576e-06, |
|
"loss": 0.4187, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 4.0162162162162165, |
|
"grad_norm": 8.597156524658203, |
|
"learning_rate": 9.652509652509653e-06, |
|
"loss": 0.8185, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 4.016988416988417, |
|
"grad_norm": 28.265419006347656, |
|
"learning_rate": 9.729729729729732e-06, |
|
"loss": 0.3588, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 4.017760617760618, |
|
"grad_norm": 102.77274322509766, |
|
"learning_rate": 9.806949806949808e-06, |
|
"loss": 0.515, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 4.018532818532819, |
|
"grad_norm": 17.284040451049805, |
|
"learning_rate": 9.884169884169885e-06, |
|
"loss": 1.6301, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 4.019305019305019, |
|
"grad_norm": 39.904273986816406, |
|
"learning_rate": 9.961389961389962e-06, |
|
"loss": 0.5565, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 4.02007722007722, |
|
"grad_norm": 6.165562629699707, |
|
"learning_rate": 9.995709995709997e-06, |
|
"loss": 0.3128, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 4.02007722007722, |
|
"eval_accuracy": 0.6, |
|
"eval_loss": 0.8534404039382935, |
|
"eval_runtime": 13.8974, |
|
"eval_samples_per_second": 3.238, |
|
"eval_steps_per_second": 1.079, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 5.000772200772201, |
|
"grad_norm": 17.28194808959961, |
|
"learning_rate": 9.987129987129988e-06, |
|
"loss": 0.489, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 5.001544401544401, |
|
"grad_norm": 68.17495727539062, |
|
"learning_rate": 9.97854997854998e-06, |
|
"loss": 0.5224, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 5.002316602316602, |
|
"grad_norm": 104.81306457519531, |
|
"learning_rate": 9.96996996996997e-06, |
|
"loss": 0.3856, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 5.003088803088803, |
|
"grad_norm": 33.45752716064453, |
|
"learning_rate": 9.961389961389962e-06, |
|
"loss": 0.347, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 5.003861003861004, |
|
"grad_norm": 20.993886947631836, |
|
"learning_rate": 9.952809952809953e-06, |
|
"loss": 0.3714, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 5.004633204633205, |
|
"grad_norm": 15.487417221069336, |
|
"learning_rate": 9.944229944229946e-06, |
|
"loss": 0.9065, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 5.005405405405406, |
|
"grad_norm": 4.61458158493042, |
|
"learning_rate": 9.935649935649937e-06, |
|
"loss": 1.0333, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 5.006177606177606, |
|
"grad_norm": 8.850576400756836, |
|
"learning_rate": 9.927069927069928e-06, |
|
"loss": 0.4281, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 5.006949806949807, |
|
"grad_norm": 23.16834831237793, |
|
"learning_rate": 9.91848991848992e-06, |
|
"loss": 0.8578, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 5.007722007722008, |
|
"grad_norm": 7.291067600250244, |
|
"learning_rate": 9.90990990990991e-06, |
|
"loss": 0.3951, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 5.008494208494208, |
|
"grad_norm": 15.696223258972168, |
|
"learning_rate": 9.901329901329903e-06, |
|
"loss": 0.4542, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 5.009266409266409, |
|
"grad_norm": 43.79963302612305, |
|
"learning_rate": 9.892749892749894e-06, |
|
"loss": 0.8643, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 5.01003861003861, |
|
"grad_norm": 8.4163179397583, |
|
"learning_rate": 9.884169884169885e-06, |
|
"loss": 0.9005, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 5.010810810810811, |
|
"grad_norm": 13.300799369812012, |
|
"learning_rate": 9.875589875589876e-06, |
|
"loss": 0.5591, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 5.011583011583012, |
|
"grad_norm": 27.228687286376953, |
|
"learning_rate": 9.867009867009867e-06, |
|
"loss": 0.4523, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 5.012355212355212, |
|
"grad_norm": 3.9106037616729736, |
|
"learning_rate": 9.858429858429858e-06, |
|
"loss": 0.3319, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 5.013127413127413, |
|
"grad_norm": 86.4173583984375, |
|
"learning_rate": 9.849849849849851e-06, |
|
"loss": 0.3241, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 5.013899613899614, |
|
"grad_norm": 25.517906188964844, |
|
"learning_rate": 9.841269841269842e-06, |
|
"loss": 0.7146, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 5.014671814671814, |
|
"grad_norm": 9.686749458312988, |
|
"learning_rate": 9.832689832689833e-06, |
|
"loss": 0.7438, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 5.015444015444015, |
|
"grad_norm": 0.10342951864004135, |
|
"learning_rate": 9.824109824109826e-06, |
|
"loss": 0.4138, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 5.0162162162162165, |
|
"grad_norm": 0.23315098881721497, |
|
"learning_rate": 9.815529815529815e-06, |
|
"loss": 1.2718, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 5.016988416988417, |
|
"grad_norm": 15.688437461853027, |
|
"learning_rate": 9.806949806949808e-06, |
|
"loss": 0.5534, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 5.017760617760618, |
|
"grad_norm": 14.076128005981445, |
|
"learning_rate": 9.7983697983698e-06, |
|
"loss": 0.4031, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 5.018532818532819, |
|
"grad_norm": 13.882486343383789, |
|
"learning_rate": 9.78978978978979e-06, |
|
"loss": 0.3393, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 5.019305019305019, |
|
"grad_norm": 3.6558635234832764, |
|
"learning_rate": 9.781209781209782e-06, |
|
"loss": 0.5401, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 5.02007722007722, |
|
"grad_norm": 1.8778923749923706, |
|
"learning_rate": 9.772629772629774e-06, |
|
"loss": 0.7261, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 5.02007722007722, |
|
"eval_accuracy": 0.7111111111111111, |
|
"eval_loss": 0.700229287147522, |
|
"eval_runtime": 13.0303, |
|
"eval_samples_per_second": 3.453, |
|
"eval_steps_per_second": 1.151, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 6.000772200772201, |
|
"grad_norm": 0.08510206639766693, |
|
"learning_rate": 9.764049764049764e-06, |
|
"loss": 0.4059, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 6.001544401544401, |
|
"grad_norm": 0.5811074376106262, |
|
"learning_rate": 9.755469755469757e-06, |
|
"loss": 0.4122, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 6.002316602316602, |
|
"grad_norm": 0.5630064010620117, |
|
"learning_rate": 9.746889746889748e-06, |
|
"loss": 0.3503, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 6.003088803088803, |
|
"grad_norm": 47.41246032714844, |
|
"learning_rate": 9.738309738309739e-06, |
|
"loss": 0.8755, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 6.003861003861004, |
|
"grad_norm": 34.18187713623047, |
|
"learning_rate": 9.729729729729732e-06, |
|
"loss": 0.5853, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 6.004633204633205, |
|
"grad_norm": 37.81892776489258, |
|
"learning_rate": 9.721149721149723e-06, |
|
"loss": 0.7055, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 6.005405405405406, |
|
"grad_norm": 4.734610557556152, |
|
"learning_rate": 9.712569712569714e-06, |
|
"loss": 0.4654, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 6.006177606177606, |
|
"grad_norm": 12.163606643676758, |
|
"learning_rate": 9.703989703989705e-06, |
|
"loss": 0.826, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 6.006949806949807, |
|
"grad_norm": 6.117342948913574, |
|
"learning_rate": 9.695409695409696e-06, |
|
"loss": 0.5362, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 6.007722007722008, |
|
"grad_norm": 44.437744140625, |
|
"learning_rate": 9.686829686829687e-06, |
|
"loss": 0.6832, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 6.008494208494208, |
|
"grad_norm": 1.0263047218322754, |
|
"learning_rate": 9.67824967824968e-06, |
|
"loss": 0.6571, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 6.009266409266409, |
|
"grad_norm": 44.683998107910156, |
|
"learning_rate": 9.669669669669671e-06, |
|
"loss": 0.6011, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 6.01003861003861, |
|
"grad_norm": 99.36419677734375, |
|
"learning_rate": 9.661089661089662e-06, |
|
"loss": 0.554, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 6.010810810810811, |
|
"grad_norm": 63.681400299072266, |
|
"learning_rate": 9.652509652509653e-06, |
|
"loss": 0.6625, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 6.011583011583012, |
|
"grad_norm": 6.034546375274658, |
|
"learning_rate": 9.643929643929644e-06, |
|
"loss": 0.7592, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 6.012355212355212, |
|
"grad_norm": 32.6746940612793, |
|
"learning_rate": 9.635349635349635e-06, |
|
"loss": 0.7335, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 6.013127413127413, |
|
"grad_norm": 25.542985916137695, |
|
"learning_rate": 9.626769626769628e-06, |
|
"loss": 0.4995, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 6.013899613899614, |
|
"grad_norm": 35.672698974609375, |
|
"learning_rate": 9.61818961818962e-06, |
|
"loss": 0.623, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 6.014671814671814, |
|
"grad_norm": 0.8925091624259949, |
|
"learning_rate": 9.60960960960961e-06, |
|
"loss": 0.5084, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 6.015444015444015, |
|
"grad_norm": 0.9361721277236938, |
|
"learning_rate": 9.601029601029601e-06, |
|
"loss": 0.375, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 6.0162162162162165, |
|
"grad_norm": 80.69046783447266, |
|
"learning_rate": 9.592449592449593e-06, |
|
"loss": 0.326, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 6.016988416988417, |
|
"grad_norm": 123.89753723144531, |
|
"learning_rate": 9.583869583869585e-06, |
|
"loss": 0.6367, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 6.017760617760618, |
|
"grad_norm": 0.18116500973701477, |
|
"learning_rate": 9.575289575289576e-06, |
|
"loss": 0.565, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 6.018532818532819, |
|
"grad_norm": 33.38082504272461, |
|
"learning_rate": 9.566709566709568e-06, |
|
"loss": 0.9349, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 6.019305019305019, |
|
"grad_norm": 51.91962814331055, |
|
"learning_rate": 9.558129558129559e-06, |
|
"loss": 0.6956, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 6.02007722007722, |
|
"grad_norm": 4.865780830383301, |
|
"learning_rate": 9.54954954954955e-06, |
|
"loss": 0.4644, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 6.02007722007722, |
|
"eval_accuracy": 0.7111111111111111, |
|
"eval_loss": 0.6549674272537231, |
|
"eval_runtime": 13.794, |
|
"eval_samples_per_second": 3.262, |
|
"eval_steps_per_second": 1.087, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 7.000772200772201, |
|
"grad_norm": 20.322017669677734, |
|
"learning_rate": 9.540969540969541e-06, |
|
"loss": 0.3135, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 7.001544401544401, |
|
"grad_norm": 1.3213744163513184, |
|
"learning_rate": 9.532389532389534e-06, |
|
"loss": 0.539, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 7.002316602316602, |
|
"grad_norm": 31.4146728515625, |
|
"learning_rate": 9.523809523809525e-06, |
|
"loss": 0.2655, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 7.003088803088803, |
|
"grad_norm": 0.04433909431099892, |
|
"learning_rate": 9.515229515229516e-06, |
|
"loss": 0.8404, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 7.003861003861004, |
|
"grad_norm": 109.51227569580078, |
|
"learning_rate": 9.506649506649509e-06, |
|
"loss": 0.4611, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 7.004633204633205, |
|
"grad_norm": 0.7903939485549927, |
|
"learning_rate": 9.498069498069498e-06, |
|
"loss": 0.8119, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 7.005405405405406, |
|
"grad_norm": 6.808485507965088, |
|
"learning_rate": 9.489489489489491e-06, |
|
"loss": 0.4345, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 7.006177606177606, |
|
"grad_norm": 11.219454765319824, |
|
"learning_rate": 9.480909480909482e-06, |
|
"loss": 0.5567, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 7.006949806949807, |
|
"grad_norm": 13.435988426208496, |
|
"learning_rate": 9.472329472329473e-06, |
|
"loss": 0.6326, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 7.007722007722008, |
|
"grad_norm": 58.40050506591797, |
|
"learning_rate": 9.463749463749464e-06, |
|
"loss": 0.6078, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 7.008494208494208, |
|
"grad_norm": 9.723285675048828, |
|
"learning_rate": 9.455169455169457e-06, |
|
"loss": 0.3013, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 7.009266409266409, |
|
"grad_norm": 21.477807998657227, |
|
"learning_rate": 9.446589446589446e-06, |
|
"loss": 0.4369, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 7.01003861003861, |
|
"grad_norm": 34.23115158081055, |
|
"learning_rate": 9.43800943800944e-06, |
|
"loss": 0.8515, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 7.010810810810811, |
|
"grad_norm": 16.622177124023438, |
|
"learning_rate": 9.42942942942943e-06, |
|
"loss": 0.5567, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 7.011583011583012, |
|
"grad_norm": 19.550809860229492, |
|
"learning_rate": 9.420849420849421e-06, |
|
"loss": 0.272, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 7.012355212355212, |
|
"grad_norm": 16.057876586914062, |
|
"learning_rate": 9.412269412269412e-06, |
|
"loss": 2.3903, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 7.013127413127413, |
|
"grad_norm": 13.52114200592041, |
|
"learning_rate": 9.403689403689405e-06, |
|
"loss": 0.7999, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 7.013899613899614, |
|
"grad_norm": 9.787277221679688, |
|
"learning_rate": 9.395109395109396e-06, |
|
"loss": 0.4997, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 7.014671814671814, |
|
"grad_norm": 6.769506454467773, |
|
"learning_rate": 9.386529386529387e-06, |
|
"loss": 0.5747, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 7.015444015444015, |
|
"grad_norm": 23.32634925842285, |
|
"learning_rate": 9.377949377949379e-06, |
|
"loss": 0.6525, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 7.0162162162162165, |
|
"grad_norm": 0.12107282876968384, |
|
"learning_rate": 9.36936936936937e-06, |
|
"loss": 0.4083, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 7.016988416988417, |
|
"grad_norm": 32.930877685546875, |
|
"learning_rate": 9.360789360789362e-06, |
|
"loss": 0.6903, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 7.017760617760618, |
|
"grad_norm": 0.47700074315071106, |
|
"learning_rate": 9.352209352209352e-06, |
|
"loss": 0.3165, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 7.018532818532819, |
|
"grad_norm": 17.217500686645508, |
|
"learning_rate": 9.343629343629345e-06, |
|
"loss": 0.866, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 7.019305019305019, |
|
"grad_norm": 3.1114206314086914, |
|
"learning_rate": 9.335049335049336e-06, |
|
"loss": 0.5437, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 7.02007722007722, |
|
"grad_norm": 0.07721736282110214, |
|
"learning_rate": 9.326469326469327e-06, |
|
"loss": 0.7702, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 7.02007722007722, |
|
"eval_accuracy": 0.7333333333333333, |
|
"eval_loss": 0.6853463649749756, |
|
"eval_runtime": 13.0358, |
|
"eval_samples_per_second": 3.452, |
|
"eval_steps_per_second": 1.151, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 8.0007722007722, |
|
"grad_norm": 3.543567657470703, |
|
"learning_rate": 9.317889317889318e-06, |
|
"loss": 0.51, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 8.001544401544402, |
|
"grad_norm": 1.479987621307373, |
|
"learning_rate": 9.30930930930931e-06, |
|
"loss": 0.3237, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 8.002316602316602, |
|
"grad_norm": 23.15900993347168, |
|
"learning_rate": 9.300729300729302e-06, |
|
"loss": 0.8069, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 8.003088803088803, |
|
"grad_norm": 19.172855377197266, |
|
"learning_rate": 9.292149292149293e-06, |
|
"loss": 0.6838, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 8.003861003861005, |
|
"grad_norm": 36.1146240234375, |
|
"learning_rate": 9.283569283569284e-06, |
|
"loss": 0.644, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 8.004633204633205, |
|
"grad_norm": 18.696842193603516, |
|
"learning_rate": 9.274989274989275e-06, |
|
"loss": 0.4751, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 8.005405405405405, |
|
"grad_norm": 12.750860214233398, |
|
"learning_rate": 9.266409266409268e-06, |
|
"loss": 0.6119, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 8.006177606177607, |
|
"grad_norm": 40.49456787109375, |
|
"learning_rate": 9.257829257829259e-06, |
|
"loss": 0.7243, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 8.006949806949807, |
|
"grad_norm": 8.020015716552734, |
|
"learning_rate": 9.24924924924925e-06, |
|
"loss": 0.3259, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 8.007722007722007, |
|
"grad_norm": 30.134857177734375, |
|
"learning_rate": 9.240669240669241e-06, |
|
"loss": 0.3053, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 8.00849420849421, |
|
"grad_norm": 35.37698745727539, |
|
"learning_rate": 9.232089232089232e-06, |
|
"loss": 1.0682, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 8.00926640926641, |
|
"grad_norm": 42.585323333740234, |
|
"learning_rate": 9.223509223509223e-06, |
|
"loss": 0.5787, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 8.01003861003861, |
|
"grad_norm": 26.13615608215332, |
|
"learning_rate": 9.214929214929216e-06, |
|
"loss": 0.6936, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 8.010810810810812, |
|
"grad_norm": 40.68016815185547, |
|
"learning_rate": 9.206349206349207e-06, |
|
"loss": 0.4108, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 8.011583011583012, |
|
"grad_norm": 0.34476640820503235, |
|
"learning_rate": 9.197769197769198e-06, |
|
"loss": 0.5791, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 8.012355212355212, |
|
"grad_norm": 0.125451922416687, |
|
"learning_rate": 9.189189189189191e-06, |
|
"loss": 0.6043, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 8.013127413127414, |
|
"grad_norm": 46.2276611328125, |
|
"learning_rate": 9.18060918060918e-06, |
|
"loss": 0.8779, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 8.013899613899614, |
|
"grad_norm": 2.52577543258667, |
|
"learning_rate": 9.172029172029173e-06, |
|
"loss": 0.4187, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 8.014671814671814, |
|
"grad_norm": 3.3626627922058105, |
|
"learning_rate": 9.163449163449165e-06, |
|
"loss": 0.4761, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 8.015444015444016, |
|
"grad_norm": 27.020002365112305, |
|
"learning_rate": 9.154869154869156e-06, |
|
"loss": 0.3704, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 8.016216216216216, |
|
"grad_norm": 0.6957538723945618, |
|
"learning_rate": 9.146289146289147e-06, |
|
"loss": 0.4882, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 8.016988416988417, |
|
"grad_norm": 3.1972832679748535, |
|
"learning_rate": 9.137709137709138e-06, |
|
"loss": 0.801, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 8.017760617760617, |
|
"grad_norm": 38.638031005859375, |
|
"learning_rate": 9.129129129129129e-06, |
|
"loss": 0.3989, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 8.018532818532819, |
|
"grad_norm": 60.1053352355957, |
|
"learning_rate": 9.120549120549122e-06, |
|
"loss": 0.6521, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 8.019305019305019, |
|
"grad_norm": 9.336408615112305, |
|
"learning_rate": 9.111969111969113e-06, |
|
"loss": 0.3019, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 8.02007722007722, |
|
"grad_norm": 0.277237206697464, |
|
"learning_rate": 9.103389103389104e-06, |
|
"loss": 0.4589, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 8.02007722007722, |
|
"eval_accuracy": 0.7555555555555555, |
|
"eval_loss": 0.7447373867034912, |
|
"eval_runtime": 13.042, |
|
"eval_samples_per_second": 3.45, |
|
"eval_steps_per_second": 1.15, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 9.0007722007722, |
|
"grad_norm": 26.95766258239746, |
|
"learning_rate": 9.094809094809095e-06, |
|
"loss": 0.2924, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 9.001544401544402, |
|
"grad_norm": 0.3911036550998688, |
|
"learning_rate": 9.086229086229086e-06, |
|
"loss": 0.2926, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 9.002316602316602, |
|
"grad_norm": 15.294562339782715, |
|
"learning_rate": 9.077649077649079e-06, |
|
"loss": 0.6503, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 9.003088803088803, |
|
"grad_norm": 0.5500653982162476, |
|
"learning_rate": 9.06906906906907e-06, |
|
"loss": 0.4895, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 9.003861003861005, |
|
"grad_norm": 68.16986846923828, |
|
"learning_rate": 9.060489060489061e-06, |
|
"loss": 0.8309, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 9.004633204633205, |
|
"grad_norm": 30.308658599853516, |
|
"learning_rate": 9.051909051909052e-06, |
|
"loss": 0.4314, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 9.005405405405405, |
|
"grad_norm": 16.645893096923828, |
|
"learning_rate": 9.043329043329045e-06, |
|
"loss": 0.6478, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 9.006177606177607, |
|
"grad_norm": 114.66397857666016, |
|
"learning_rate": 9.034749034749034e-06, |
|
"loss": 0.6211, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 9.006949806949807, |
|
"grad_norm": 24.798070907592773, |
|
"learning_rate": 9.026169026169027e-06, |
|
"loss": 0.3774, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 9.007722007722007, |
|
"grad_norm": 17.036094665527344, |
|
"learning_rate": 9.017589017589018e-06, |
|
"loss": 0.684, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 9.00849420849421, |
|
"grad_norm": 22.78154945373535, |
|
"learning_rate": 9.00900900900901e-06, |
|
"loss": 0.5217, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 9.00926640926641, |
|
"grad_norm": 17.106143951416016, |
|
"learning_rate": 9.000429000429e-06, |
|
"loss": 0.4613, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 9.01003861003861, |
|
"grad_norm": 89.09002685546875, |
|
"learning_rate": 8.991848991848993e-06, |
|
"loss": 0.6451, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 9.010810810810812, |
|
"grad_norm": 51.08162307739258, |
|
"learning_rate": 8.983268983268984e-06, |
|
"loss": 0.4516, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 9.011583011583012, |
|
"grad_norm": 17.878747940063477, |
|
"learning_rate": 8.974688974688976e-06, |
|
"loss": 0.3903, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 9.012355212355212, |
|
"grad_norm": 9.77619743347168, |
|
"learning_rate": 8.966108966108967e-06, |
|
"loss": 0.6817, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 9.013127413127414, |
|
"grad_norm": 9.895012855529785, |
|
"learning_rate": 8.957528957528958e-06, |
|
"loss": 0.5136, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 9.013899613899614, |
|
"grad_norm": 1.2748266458511353, |
|
"learning_rate": 8.94894894894895e-06, |
|
"loss": 0.6643, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 9.014671814671814, |
|
"grad_norm": 40.85063552856445, |
|
"learning_rate": 8.940368940368942e-06, |
|
"loss": 0.7568, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 9.015444015444016, |
|
"grad_norm": 28.490476608276367, |
|
"learning_rate": 8.931788931788933e-06, |
|
"loss": 0.4918, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 9.016216216216216, |
|
"grad_norm": 18.73026466369629, |
|
"learning_rate": 8.923208923208924e-06, |
|
"loss": 0.4231, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 9.016988416988417, |
|
"grad_norm": 6.791349411010742, |
|
"learning_rate": 8.914628914628915e-06, |
|
"loss": 0.4267, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 9.017760617760617, |
|
"grad_norm": 0.5822336673736572, |
|
"learning_rate": 8.906048906048906e-06, |
|
"loss": 0.2161, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 9.018532818532819, |
|
"grad_norm": 8.814290046691895, |
|
"learning_rate": 8.897468897468899e-06, |
|
"loss": 0.3644, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 9.019305019305019, |
|
"grad_norm": 45.50792694091797, |
|
"learning_rate": 8.888888888888888e-06, |
|
"loss": 0.5233, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 9.02007722007722, |
|
"grad_norm": 0.14075778424739838, |
|
"learning_rate": 8.880308880308881e-06, |
|
"loss": 0.5026, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 9.02007722007722, |
|
"eval_accuracy": 0.7777777777777778, |
|
"eval_loss": 0.742344081401825, |
|
"eval_runtime": 11.6183, |
|
"eval_samples_per_second": 3.873, |
|
"eval_steps_per_second": 1.291, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 10.0007722007722, |
|
"grad_norm": 0.050024695694446564, |
|
"learning_rate": 8.871728871728872e-06, |
|
"loss": 0.5309, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 10.001544401544402, |
|
"grad_norm": 36.7778205871582, |
|
"learning_rate": 8.863148863148863e-06, |
|
"loss": 0.5166, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 10.002316602316602, |
|
"grad_norm": 1.0572352409362793, |
|
"learning_rate": 8.854568854568856e-06, |
|
"loss": 0.4621, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 10.003088803088803, |
|
"grad_norm": 34.59428024291992, |
|
"learning_rate": 8.845988845988847e-06, |
|
"loss": 1.0248, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 10.003861003861005, |
|
"grad_norm": 35.09554672241211, |
|
"learning_rate": 8.837408837408838e-06, |
|
"loss": 0.6444, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 10.004633204633205, |
|
"grad_norm": 6.455074310302734, |
|
"learning_rate": 8.82882882882883e-06, |
|
"loss": 0.2533, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 10.005405405405405, |
|
"grad_norm": 0.7161849141120911, |
|
"learning_rate": 8.82024882024882e-06, |
|
"loss": 0.9718, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 10.006177606177607, |
|
"grad_norm": 24.07243537902832, |
|
"learning_rate": 8.811668811668812e-06, |
|
"loss": 0.6386, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 10.006949806949807, |
|
"grad_norm": 1.7033041715621948, |
|
"learning_rate": 8.803088803088804e-06, |
|
"loss": 0.536, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 10.007722007722007, |
|
"grad_norm": 40.123138427734375, |
|
"learning_rate": 8.794508794508795e-06, |
|
"loss": 0.7141, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 10.00849420849421, |
|
"grad_norm": 27.220144271850586, |
|
"learning_rate": 8.785928785928787e-06, |
|
"loss": 0.5531, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 10.00926640926641, |
|
"grad_norm": 12.67369556427002, |
|
"learning_rate": 8.777348777348778e-06, |
|
"loss": 0.3725, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 10.01003861003861, |
|
"grad_norm": 3.601840019226074, |
|
"learning_rate": 8.768768768768769e-06, |
|
"loss": 0.5135, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 10.010810810810812, |
|
"grad_norm": 91.34984588623047, |
|
"learning_rate": 8.760188760188762e-06, |
|
"loss": 0.2448, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 10.011583011583012, |
|
"grad_norm": 89.07131958007812, |
|
"learning_rate": 8.751608751608753e-06, |
|
"loss": 0.2126, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 10.012355212355212, |
|
"grad_norm": 0.4536842405796051, |
|
"learning_rate": 8.743028743028744e-06, |
|
"loss": 0.1193, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 10.013127413127414, |
|
"grad_norm": 150.1531219482422, |
|
"learning_rate": 8.734448734448735e-06, |
|
"loss": 0.857, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 10.013899613899614, |
|
"grad_norm": 167.5815887451172, |
|
"learning_rate": 8.725868725868728e-06, |
|
"loss": 0.6876, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 10.014671814671814, |
|
"grad_norm": 0.09778659045696259, |
|
"learning_rate": 8.717288717288717e-06, |
|
"loss": 0.5219, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 10.015444015444016, |
|
"grad_norm": 55.60439682006836, |
|
"learning_rate": 8.70870870870871e-06, |
|
"loss": 0.825, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 10.016216216216216, |
|
"grad_norm": 115.7952880859375, |
|
"learning_rate": 8.700128700128701e-06, |
|
"loss": 0.6771, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 10.016988416988417, |
|
"grad_norm": 76.89239501953125, |
|
"learning_rate": 8.691548691548692e-06, |
|
"loss": 0.3557, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 10.017760617760617, |
|
"grad_norm": 7.334883689880371, |
|
"learning_rate": 8.682968682968683e-06, |
|
"loss": 0.3834, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 10.018532818532819, |
|
"grad_norm": 26.45685386657715, |
|
"learning_rate": 8.674388674388674e-06, |
|
"loss": 0.3072, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 10.019305019305019, |
|
"grad_norm": 16.094499588012695, |
|
"learning_rate": 8.665808665808665e-06, |
|
"loss": 0.9027, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 10.02007722007722, |
|
"grad_norm": 0.5503826141357422, |
|
"learning_rate": 8.657228657228658e-06, |
|
"loss": 0.5612, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 10.02007722007722, |
|
"eval_accuracy": 0.6888888888888889, |
|
"eval_loss": 0.8798052668571472, |
|
"eval_runtime": 11.5437, |
|
"eval_samples_per_second": 3.898, |
|
"eval_steps_per_second": 1.299, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 11.0007722007722, |
|
"grad_norm": 38.276878356933594, |
|
"learning_rate": 8.64864864864865e-06, |
|
"loss": 0.5818, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 11.001544401544402, |
|
"grad_norm": 0.12790539860725403, |
|
"learning_rate": 8.64006864006864e-06, |
|
"loss": 0.1793, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 11.002316602316602, |
|
"grad_norm": 6.592782020568848, |
|
"learning_rate": 8.631488631488633e-06, |
|
"loss": 0.7483, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 11.003088803088803, |
|
"grad_norm": 77.4202651977539, |
|
"learning_rate": 8.622908622908623e-06, |
|
"loss": 0.6152, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 11.003861003861005, |
|
"grad_norm": 0.5944340825080872, |
|
"learning_rate": 8.614328614328615e-06, |
|
"loss": 0.337, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 11.004633204633205, |
|
"grad_norm": 0.9054240584373474, |
|
"learning_rate": 8.605748605748607e-06, |
|
"loss": 0.1523, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 11.005405405405405, |
|
"grad_norm": 75.37146759033203, |
|
"learning_rate": 8.597168597168598e-06, |
|
"loss": 0.5706, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 11.006177606177607, |
|
"grad_norm": 68.32598876953125, |
|
"learning_rate": 8.588588588588589e-06, |
|
"loss": 0.7453, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 11.006949806949807, |
|
"grad_norm": 86.55941772460938, |
|
"learning_rate": 8.580008580008582e-06, |
|
"loss": 0.6767, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 11.007722007722007, |
|
"grad_norm": 0.5787020921707153, |
|
"learning_rate": 8.571428571428571e-06, |
|
"loss": 0.5304, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 11.00849420849421, |
|
"grad_norm": 1.052687406539917, |
|
"learning_rate": 8.562848562848564e-06, |
|
"loss": 0.4398, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 11.00926640926641, |
|
"grad_norm": 100.41271209716797, |
|
"learning_rate": 8.554268554268555e-06, |
|
"loss": 0.4632, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 11.01003861003861, |
|
"grad_norm": 12.1259183883667, |
|
"learning_rate": 8.545688545688546e-06, |
|
"loss": 0.6563, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 11.010810810810812, |
|
"grad_norm": 0.3354950249195099, |
|
"learning_rate": 8.537108537108539e-06, |
|
"loss": 0.5515, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 11.011583011583012, |
|
"grad_norm": 0.6421205997467041, |
|
"learning_rate": 8.52852852852853e-06, |
|
"loss": 0.0537, |
|
"step": 3010 |
|
}, |
|
{ |
|
"epoch": 11.012355212355212, |
|
"grad_norm": 0.13986188173294067, |
|
"learning_rate": 8.519948519948521e-06, |
|
"loss": 0.8416, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 11.013127413127414, |
|
"grad_norm": 70.42572784423828, |
|
"learning_rate": 8.511368511368512e-06, |
|
"loss": 0.7627, |
|
"step": 3030 |
|
}, |
|
{ |
|
"epoch": 11.013899613899614, |
|
"grad_norm": 3.2118000984191895, |
|
"learning_rate": 8.502788502788503e-06, |
|
"loss": 0.4206, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 11.014671814671814, |
|
"grad_norm": 175.60455322265625, |
|
"learning_rate": 8.494208494208494e-06, |
|
"loss": 0.8047, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 11.015444015444016, |
|
"grad_norm": 0.3263649344444275, |
|
"learning_rate": 8.485628485628487e-06, |
|
"loss": 0.5895, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 11.016216216216216, |
|
"grad_norm": 0.11546669155359268, |
|
"learning_rate": 8.477048477048478e-06, |
|
"loss": 0.7429, |
|
"step": 3070 |
|
}, |
|
{ |
|
"epoch": 11.016988416988417, |
|
"grad_norm": 10.180035591125488, |
|
"learning_rate": 8.46846846846847e-06, |
|
"loss": 1.0899, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 11.017760617760617, |
|
"grad_norm": 0.3308240473270416, |
|
"learning_rate": 8.45988845988846e-06, |
|
"loss": 0.3317, |
|
"step": 3090 |
|
}, |
|
{ |
|
"epoch": 11.018532818532819, |
|
"grad_norm": 11.988396644592285, |
|
"learning_rate": 8.451308451308451e-06, |
|
"loss": 0.9498, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 11.019305019305019, |
|
"grad_norm": 1.0947588682174683, |
|
"learning_rate": 8.442728442728444e-06, |
|
"loss": 0.4188, |
|
"step": 3110 |
|
}, |
|
{ |
|
"epoch": 11.02007722007722, |
|
"grad_norm": 0.16421347856521606, |
|
"learning_rate": 8.434148434148435e-06, |
|
"loss": 0.536, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 11.02007722007722, |
|
"eval_accuracy": 0.4888888888888889, |
|
"eval_loss": 1.5358043909072876, |
|
"eval_runtime": 12.3592, |
|
"eval_samples_per_second": 3.641, |
|
"eval_steps_per_second": 1.214, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 12.0007722007722, |
|
"grad_norm": 46.494842529296875, |
|
"learning_rate": 8.425568425568426e-06, |
|
"loss": 0.306, |
|
"step": 3130 |
|
}, |
|
{ |
|
"epoch": 12.001544401544402, |
|
"grad_norm": 15.413254737854004, |
|
"learning_rate": 8.416988416988418e-06, |
|
"loss": 0.3972, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 12.002316602316602, |
|
"grad_norm": 29.484281539916992, |
|
"learning_rate": 8.408408408408409e-06, |
|
"loss": 0.5977, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 12.003088803088803, |
|
"grad_norm": 45.659481048583984, |
|
"learning_rate": 8.3998283998284e-06, |
|
"loss": 0.5106, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 12.003861003861005, |
|
"grad_norm": 115.90227508544922, |
|
"learning_rate": 8.391248391248393e-06, |
|
"loss": 0.653, |
|
"step": 3170 |
|
}, |
|
{ |
|
"epoch": 12.004633204633205, |
|
"grad_norm": 0.9203261733055115, |
|
"learning_rate": 8.382668382668384e-06, |
|
"loss": 0.479, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 12.005405405405405, |
|
"grad_norm": 34.71575927734375, |
|
"learning_rate": 8.374088374088375e-06, |
|
"loss": 0.8072, |
|
"step": 3190 |
|
}, |
|
{ |
|
"epoch": 12.006177606177607, |
|
"grad_norm": 32.08717346191406, |
|
"learning_rate": 8.365508365508366e-06, |
|
"loss": 0.5189, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 12.006949806949807, |
|
"grad_norm": 0.04885344207286835, |
|
"learning_rate": 8.356928356928357e-06, |
|
"loss": 0.5292, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 12.007722007722007, |
|
"grad_norm": 108.91436004638672, |
|
"learning_rate": 8.348348348348348e-06, |
|
"loss": 0.6934, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 12.00849420849421, |
|
"grad_norm": 7.798171520233154, |
|
"learning_rate": 8.33976833976834e-06, |
|
"loss": 0.6352, |
|
"step": 3230 |
|
}, |
|
{ |
|
"epoch": 12.00926640926641, |
|
"grad_norm": 0.5554088950157166, |
|
"learning_rate": 8.331188331188332e-06, |
|
"loss": 0.3745, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 12.01003861003861, |
|
"grad_norm": 85.23207092285156, |
|
"learning_rate": 8.322608322608323e-06, |
|
"loss": 0.3618, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 12.010810810810812, |
|
"grad_norm": 0.41603294014930725, |
|
"learning_rate": 8.314028314028316e-06, |
|
"loss": 0.4017, |
|
"step": 3260 |
|
}, |
|
{ |
|
"epoch": 12.011583011583012, |
|
"grad_norm": 43.02106857299805, |
|
"learning_rate": 8.305448305448305e-06, |
|
"loss": 0.7779, |
|
"step": 3270 |
|
}, |
|
{ |
|
"epoch": 12.012355212355212, |
|
"grad_norm": 15.187088012695312, |
|
"learning_rate": 8.296868296868298e-06, |
|
"loss": 0.3938, |
|
"step": 3280 |
|
}, |
|
{ |
|
"epoch": 12.013127413127414, |
|
"grad_norm": 0.11593673378229141, |
|
"learning_rate": 8.288288288288289e-06, |
|
"loss": 0.4054, |
|
"step": 3290 |
|
}, |
|
{ |
|
"epoch": 12.013899613899614, |
|
"grad_norm": 44.22844696044922, |
|
"learning_rate": 8.27970827970828e-06, |
|
"loss": 0.9659, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 12.014671814671814, |
|
"grad_norm": 39.142581939697266, |
|
"learning_rate": 8.271128271128271e-06, |
|
"loss": 0.1711, |
|
"step": 3310 |
|
}, |
|
{ |
|
"epoch": 12.015444015444016, |
|
"grad_norm": 29.718122482299805, |
|
"learning_rate": 8.262548262548264e-06, |
|
"loss": 0.3803, |
|
"step": 3320 |
|
}, |
|
{ |
|
"epoch": 12.016216216216216, |
|
"grad_norm": 2.7277615070343018, |
|
"learning_rate": 8.253968253968254e-06, |
|
"loss": 0.6516, |
|
"step": 3330 |
|
}, |
|
{ |
|
"epoch": 12.016988416988417, |
|
"grad_norm": 9.546491622924805, |
|
"learning_rate": 8.245388245388246e-06, |
|
"loss": 1.0321, |
|
"step": 3340 |
|
}, |
|
{ |
|
"epoch": 12.017760617760617, |
|
"grad_norm": 30.349594116210938, |
|
"learning_rate": 8.236808236808237e-06, |
|
"loss": 0.739, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 12.018532818532819, |
|
"grad_norm": 13.018799781799316, |
|
"learning_rate": 8.228228228228229e-06, |
|
"loss": 0.2881, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 12.019305019305019, |
|
"grad_norm": 1.475889801979065, |
|
"learning_rate": 8.219648219648221e-06, |
|
"loss": 0.6082, |
|
"step": 3370 |
|
}, |
|
{ |
|
"epoch": 12.02007722007722, |
|
"grad_norm": 33.09438705444336, |
|
"learning_rate": 8.211068211068212e-06, |
|
"loss": 0.6695, |
|
"step": 3380 |
|
}, |
|
{ |
|
"epoch": 12.02007722007722, |
|
"eval_accuracy": 0.6666666666666666, |
|
"eval_loss": 1.3352181911468506, |
|
"eval_runtime": 11.6323, |
|
"eval_samples_per_second": 3.869, |
|
"eval_steps_per_second": 1.29, |
|
"step": 3380 |
|
}, |
|
{ |
|
"epoch": 13.0007722007722, |
|
"grad_norm": 0.033746566623449326, |
|
"learning_rate": 8.202488202488204e-06, |
|
"loss": 0.421, |
|
"step": 3390 |
|
}, |
|
{ |
|
"epoch": 13.001544401544402, |
|
"grad_norm": 0.36186423897743225, |
|
"learning_rate": 8.193908193908195e-06, |
|
"loss": 0.7725, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 13.002316602316602, |
|
"grad_norm": 0.7419402599334717, |
|
"learning_rate": 8.185328185328186e-06, |
|
"loss": 0.2973, |
|
"step": 3410 |
|
}, |
|
{ |
|
"epoch": 13.003088803088803, |
|
"grad_norm": 16.69759750366211, |
|
"learning_rate": 8.176748176748177e-06, |
|
"loss": 0.6846, |
|
"step": 3420 |
|
}, |
|
{ |
|
"epoch": 13.003861003861005, |
|
"grad_norm": 1.2755367755889893, |
|
"learning_rate": 8.16816816816817e-06, |
|
"loss": 0.4318, |
|
"step": 3430 |
|
}, |
|
{ |
|
"epoch": 13.004633204633205, |
|
"grad_norm": 10.125988006591797, |
|
"learning_rate": 8.159588159588159e-06, |
|
"loss": 0.5978, |
|
"step": 3440 |
|
}, |
|
{ |
|
"epoch": 13.005405405405405, |
|
"grad_norm": 0.27345284819602966, |
|
"learning_rate": 8.151008151008152e-06, |
|
"loss": 0.7275, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 13.006177606177607, |
|
"grad_norm": 2.0094382762908936, |
|
"learning_rate": 8.142428142428143e-06, |
|
"loss": 0.3855, |
|
"step": 3460 |
|
}, |
|
{ |
|
"epoch": 13.006949806949807, |
|
"grad_norm": 30.24265480041504, |
|
"learning_rate": 8.133848133848134e-06, |
|
"loss": 0.3675, |
|
"step": 3470 |
|
}, |
|
{ |
|
"epoch": 13.007722007722007, |
|
"grad_norm": 28.360801696777344, |
|
"learning_rate": 8.125268125268127e-06, |
|
"loss": 0.8046, |
|
"step": 3480 |
|
}, |
|
{ |
|
"epoch": 13.00849420849421, |
|
"grad_norm": 0.12912389636039734, |
|
"learning_rate": 8.116688116688118e-06, |
|
"loss": 0.7357, |
|
"step": 3490 |
|
}, |
|
{ |
|
"epoch": 13.00926640926641, |
|
"grad_norm": 0.8967646956443787, |
|
"learning_rate": 8.108108108108109e-06, |
|
"loss": 0.5318, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 13.01003861003861, |
|
"grad_norm": 84.27301788330078, |
|
"learning_rate": 8.0995280995281e-06, |
|
"loss": 0.764, |
|
"step": 3510 |
|
}, |
|
{ |
|
"epoch": 13.010810810810812, |
|
"grad_norm": 10.367392539978027, |
|
"learning_rate": 8.090948090948091e-06, |
|
"loss": 0.4441, |
|
"step": 3520 |
|
}, |
|
{ |
|
"epoch": 13.011583011583012, |
|
"grad_norm": 77.54997253417969, |
|
"learning_rate": 8.082368082368082e-06, |
|
"loss": 0.3571, |
|
"step": 3530 |
|
}, |
|
{ |
|
"epoch": 13.012355212355212, |
|
"grad_norm": 0.0930776447057724, |
|
"learning_rate": 8.073788073788075e-06, |
|
"loss": 0.7569, |
|
"step": 3540 |
|
}, |
|
{ |
|
"epoch": 13.013127413127414, |
|
"grad_norm": 27.489704132080078, |
|
"learning_rate": 8.065208065208066e-06, |
|
"loss": 0.2051, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 13.013899613899614, |
|
"grad_norm": 7.215950965881348, |
|
"learning_rate": 8.056628056628057e-06, |
|
"loss": 0.4319, |
|
"step": 3560 |
|
}, |
|
{ |
|
"epoch": 13.014671814671814, |
|
"grad_norm": 2.232342481613159, |
|
"learning_rate": 8.048048048048048e-06, |
|
"loss": 0.6153, |
|
"step": 3570 |
|
}, |
|
{ |
|
"epoch": 13.015444015444016, |
|
"grad_norm": 8.222293853759766, |
|
"learning_rate": 8.03946803946804e-06, |
|
"loss": 0.7993, |
|
"step": 3580 |
|
}, |
|
{ |
|
"epoch": 13.016216216216216, |
|
"grad_norm": 0.9838555455207825, |
|
"learning_rate": 8.03088803088803e-06, |
|
"loss": 0.5731, |
|
"step": 3590 |
|
}, |
|
{ |
|
"epoch": 13.016988416988417, |
|
"grad_norm": 21.45343589782715, |
|
"learning_rate": 8.022308022308023e-06, |
|
"loss": 0.7791, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 13.017760617760617, |
|
"grad_norm": 0.30541619658470154, |
|
"learning_rate": 8.013728013728015e-06, |
|
"loss": 0.3412, |
|
"step": 3610 |
|
}, |
|
{ |
|
"epoch": 13.018532818532819, |
|
"grad_norm": 0.06337114423513412, |
|
"learning_rate": 8.005148005148006e-06, |
|
"loss": 0.2777, |
|
"step": 3620 |
|
}, |
|
{ |
|
"epoch": 13.019305019305019, |
|
"grad_norm": 0.14662937819957733, |
|
"learning_rate": 7.996567996567998e-06, |
|
"loss": 0.7978, |
|
"step": 3630 |
|
}, |
|
{ |
|
"epoch": 13.02007722007722, |
|
"grad_norm": 9.782637596130371, |
|
"learning_rate": 7.987987987987988e-06, |
|
"loss": 0.2699, |
|
"step": 3640 |
|
}, |
|
{ |
|
"epoch": 13.02007722007722, |
|
"eval_accuracy": 0.7333333333333333, |
|
"eval_loss": 1.1053425073623657, |
|
"eval_runtime": 13.0668, |
|
"eval_samples_per_second": 3.444, |
|
"eval_steps_per_second": 1.148, |
|
"step": 3640 |
|
}, |
|
{ |
|
"epoch": 14.0007722007722, |
|
"grad_norm": 0.04395415633916855, |
|
"learning_rate": 7.97940797940798e-06, |
|
"loss": 0.3761, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 14.001544401544402, |
|
"grad_norm": 0.18629683554172516, |
|
"learning_rate": 7.970827970827972e-06, |
|
"loss": 0.5233, |
|
"step": 3660 |
|
}, |
|
{ |
|
"epoch": 14.002316602316602, |
|
"grad_norm": 0.24980448186397552, |
|
"learning_rate": 7.962247962247963e-06, |
|
"loss": 0.4489, |
|
"step": 3670 |
|
}, |
|
{ |
|
"epoch": 14.003088803088803, |
|
"grad_norm": 0.09628362208604813, |
|
"learning_rate": 7.953667953667954e-06, |
|
"loss": 0.5638, |
|
"step": 3680 |
|
}, |
|
{ |
|
"epoch": 14.003861003861005, |
|
"grad_norm": 0.016532419249415398, |
|
"learning_rate": 7.945087945087945e-06, |
|
"loss": 0.8614, |
|
"step": 3690 |
|
}, |
|
{ |
|
"epoch": 14.004633204633205, |
|
"grad_norm": 137.2128448486328, |
|
"learning_rate": 7.936507936507936e-06, |
|
"loss": 0.2668, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 14.005405405405405, |
|
"grad_norm": 14.007122039794922, |
|
"learning_rate": 7.927927927927929e-06, |
|
"loss": 0.5763, |
|
"step": 3710 |
|
}, |
|
{ |
|
"epoch": 14.006177606177607, |
|
"grad_norm": 0.029359988868236542, |
|
"learning_rate": 7.91934791934792e-06, |
|
"loss": 0.551, |
|
"step": 3720 |
|
}, |
|
{ |
|
"epoch": 14.006949806949807, |
|
"grad_norm": 1.142870545387268, |
|
"learning_rate": 7.910767910767911e-06, |
|
"loss": 0.3888, |
|
"step": 3730 |
|
}, |
|
{ |
|
"epoch": 14.007722007722007, |
|
"grad_norm": 22.303936004638672, |
|
"learning_rate": 7.902187902187904e-06, |
|
"loss": 0.8488, |
|
"step": 3740 |
|
}, |
|
{ |
|
"epoch": 14.00849420849421, |
|
"grad_norm": 10.081164360046387, |
|
"learning_rate": 7.893607893607893e-06, |
|
"loss": 0.3886, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 14.00926640926641, |
|
"grad_norm": 56.892921447753906, |
|
"learning_rate": 7.885027885027886e-06, |
|
"loss": 0.7013, |
|
"step": 3760 |
|
}, |
|
{ |
|
"epoch": 14.01003861003861, |
|
"grad_norm": 57.699581146240234, |
|
"learning_rate": 7.876447876447877e-06, |
|
"loss": 0.4642, |
|
"step": 3770 |
|
}, |
|
{ |
|
"epoch": 14.010810810810812, |
|
"grad_norm": 78.83262634277344, |
|
"learning_rate": 7.867867867867868e-06, |
|
"loss": 0.5118, |
|
"step": 3780 |
|
}, |
|
{ |
|
"epoch": 14.011583011583012, |
|
"grad_norm": 1.0107831954956055, |
|
"learning_rate": 7.85928785928786e-06, |
|
"loss": 0.6178, |
|
"step": 3790 |
|
}, |
|
{ |
|
"epoch": 14.012355212355212, |
|
"grad_norm": 0.07490326464176178, |
|
"learning_rate": 7.850707850707852e-06, |
|
"loss": 0.7406, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 14.013127413127414, |
|
"grad_norm": 33.424617767333984, |
|
"learning_rate": 7.842127842127842e-06, |
|
"loss": 0.4972, |
|
"step": 3810 |
|
}, |
|
{ |
|
"epoch": 14.013899613899614, |
|
"grad_norm": 0.40565818548202515, |
|
"learning_rate": 7.833547833547834e-06, |
|
"loss": 0.4828, |
|
"step": 3820 |
|
}, |
|
{ |
|
"epoch": 14.014671814671814, |
|
"grad_norm": 0.5566960573196411, |
|
"learning_rate": 7.824967824967826e-06, |
|
"loss": 0.2992, |
|
"step": 3830 |
|
}, |
|
{ |
|
"epoch": 14.015444015444016, |
|
"grad_norm": 149.54197692871094, |
|
"learning_rate": 7.816387816387817e-06, |
|
"loss": 0.8926, |
|
"step": 3840 |
|
}, |
|
{ |
|
"epoch": 14.016216216216216, |
|
"grad_norm": 2.9310715198516846, |
|
"learning_rate": 7.807807807807808e-06, |
|
"loss": 0.8425, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 14.016988416988417, |
|
"grad_norm": 58.41656494140625, |
|
"learning_rate": 7.7992277992278e-06, |
|
"loss": 0.8109, |
|
"step": 3860 |
|
}, |
|
{ |
|
"epoch": 14.017760617760617, |
|
"grad_norm": 7.718285083770752, |
|
"learning_rate": 7.790647790647792e-06, |
|
"loss": 0.4207, |
|
"step": 3870 |
|
}, |
|
{ |
|
"epoch": 14.018532818532819, |
|
"grad_norm": 0.1663883924484253, |
|
"learning_rate": 7.782067782067783e-06, |
|
"loss": 0.2474, |
|
"step": 3880 |
|
}, |
|
{ |
|
"epoch": 14.019305019305019, |
|
"grad_norm": 0.2051921933889389, |
|
"learning_rate": 7.773487773487774e-06, |
|
"loss": 0.3523, |
|
"step": 3890 |
|
}, |
|
{ |
|
"epoch": 14.02007722007722, |
|
"grad_norm": 0.03690739721059799, |
|
"learning_rate": 7.764907764907765e-06, |
|
"loss": 0.5277, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 14.02007722007722, |
|
"eval_accuracy": 0.7111111111111111, |
|
"eval_loss": 0.9907832741737366, |
|
"eval_runtime": 13.0331, |
|
"eval_samples_per_second": 3.453, |
|
"eval_steps_per_second": 1.151, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 15.0007722007722, |
|
"grad_norm": 7.357924938201904, |
|
"learning_rate": 7.756327756327758e-06, |
|
"loss": 0.2465, |
|
"step": 3910 |
|
}, |
|
{ |
|
"epoch": 15.001544401544402, |
|
"grad_norm": 104.24243927001953, |
|
"learning_rate": 7.747747747747749e-06, |
|
"loss": 0.4626, |
|
"step": 3920 |
|
}, |
|
{ |
|
"epoch": 15.002316602316602, |
|
"grad_norm": 47.60615921020508, |
|
"learning_rate": 7.73916773916774e-06, |
|
"loss": 0.4113, |
|
"step": 3930 |
|
}, |
|
{ |
|
"epoch": 15.003088803088803, |
|
"grad_norm": 140.14810180664062, |
|
"learning_rate": 7.730587730587731e-06, |
|
"loss": 0.3187, |
|
"step": 3940 |
|
}, |
|
{ |
|
"epoch": 15.003861003861005, |
|
"grad_norm": 148.16998291015625, |
|
"learning_rate": 7.722007722007722e-06, |
|
"loss": 0.4277, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 15.004633204633205, |
|
"grad_norm": 6.010843753814697, |
|
"learning_rate": 7.713427713427713e-06, |
|
"loss": 0.3613, |
|
"step": 3960 |
|
}, |
|
{ |
|
"epoch": 15.005405405405405, |
|
"grad_norm": 1.0568628311157227, |
|
"learning_rate": 7.704847704847706e-06, |
|
"loss": 0.2409, |
|
"step": 3970 |
|
}, |
|
{ |
|
"epoch": 15.006177606177607, |
|
"grad_norm": 1.002126693725586, |
|
"learning_rate": 7.696267696267697e-06, |
|
"loss": 0.1332, |
|
"step": 3980 |
|
}, |
|
{ |
|
"epoch": 15.006949806949807, |
|
"grad_norm": 0.5772402286529541, |
|
"learning_rate": 7.687687687687688e-06, |
|
"loss": 0.1781, |
|
"step": 3990 |
|
}, |
|
{ |
|
"epoch": 15.007722007722007, |
|
"grad_norm": 6.310009002685547, |
|
"learning_rate": 7.67910767910768e-06, |
|
"loss": 0.3342, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 15.00849420849421, |
|
"grad_norm": 69.40351867675781, |
|
"learning_rate": 7.67052767052767e-06, |
|
"loss": 0.7218, |
|
"step": 4010 |
|
}, |
|
{ |
|
"epoch": 15.00926640926641, |
|
"grad_norm": 0.10429790616035461, |
|
"learning_rate": 7.661947661947663e-06, |
|
"loss": 0.369, |
|
"step": 4020 |
|
}, |
|
{ |
|
"epoch": 15.01003861003861, |
|
"grad_norm": 129.76885986328125, |
|
"learning_rate": 7.653367653367654e-06, |
|
"loss": 1.0676, |
|
"step": 4030 |
|
}, |
|
{ |
|
"epoch": 15.010810810810812, |
|
"grad_norm": 95.47364807128906, |
|
"learning_rate": 7.644787644787645e-06, |
|
"loss": 0.6629, |
|
"step": 4040 |
|
}, |
|
{ |
|
"epoch": 15.011583011583012, |
|
"grad_norm": 0.16719844937324524, |
|
"learning_rate": 7.636207636207637e-06, |
|
"loss": 0.3401, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 15.012355212355212, |
|
"grad_norm": 5.297786712646484, |
|
"learning_rate": 7.6276276276276285e-06, |
|
"loss": 1.0181, |
|
"step": 4060 |
|
}, |
|
{ |
|
"epoch": 15.013127413127414, |
|
"grad_norm": 20.62660789489746, |
|
"learning_rate": 7.61904761904762e-06, |
|
"loss": 0.8109, |
|
"step": 4070 |
|
}, |
|
{ |
|
"epoch": 15.013899613899614, |
|
"grad_norm": 0.046302299946546555, |
|
"learning_rate": 7.610467610467612e-06, |
|
"loss": 0.655, |
|
"step": 4080 |
|
}, |
|
{ |
|
"epoch": 15.014671814671814, |
|
"grad_norm": 9.246990203857422, |
|
"learning_rate": 7.601887601887602e-06, |
|
"loss": 0.3383, |
|
"step": 4090 |
|
}, |
|
{ |
|
"epoch": 15.015444015444016, |
|
"grad_norm": 0.6101518273353577, |
|
"learning_rate": 7.593307593307594e-06, |
|
"loss": 0.7927, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 15.016216216216216, |
|
"grad_norm": 1.2248023748397827, |
|
"learning_rate": 7.584727584727586e-06, |
|
"loss": 0.3503, |
|
"step": 4110 |
|
}, |
|
{ |
|
"epoch": 15.016988416988417, |
|
"grad_norm": 0.7881536483764648, |
|
"learning_rate": 7.576147576147577e-06, |
|
"loss": 0.2756, |
|
"step": 4120 |
|
}, |
|
{ |
|
"epoch": 15.017760617760617, |
|
"grad_norm": 69.70541381835938, |
|
"learning_rate": 7.567567567567569e-06, |
|
"loss": 0.3127, |
|
"step": 4130 |
|
}, |
|
{ |
|
"epoch": 15.018532818532819, |
|
"grad_norm": 0.18171033263206482, |
|
"learning_rate": 7.558987558987559e-06, |
|
"loss": 0.4698, |
|
"step": 4140 |
|
}, |
|
{ |
|
"epoch": 15.019305019305019, |
|
"grad_norm": 134.1101531982422, |
|
"learning_rate": 7.550407550407551e-06, |
|
"loss": 0.3204, |
|
"step": 4150 |
|
}, |
|
{ |
|
"epoch": 15.02007722007722, |
|
"grad_norm": 114.70227813720703, |
|
"learning_rate": 7.541827541827542e-06, |
|
"loss": 0.7975, |
|
"step": 4160 |
|
}, |
|
{ |
|
"epoch": 15.02007722007722, |
|
"eval_accuracy": 0.6666666666666666, |
|
"eval_loss": 1.0846061706542969, |
|
"eval_runtime": 13.9438, |
|
"eval_samples_per_second": 3.227, |
|
"eval_steps_per_second": 1.076, |
|
"step": 4160 |
|
}, |
|
{ |
|
"epoch": 16.0007722007722, |
|
"grad_norm": 11.493532180786133, |
|
"learning_rate": 7.533247533247534e-06, |
|
"loss": 0.8145, |
|
"step": 4170 |
|
}, |
|
{ |
|
"epoch": 16.0015444015444, |
|
"grad_norm": 0.4535379409790039, |
|
"learning_rate": 7.524667524667525e-06, |
|
"loss": 0.3171, |
|
"step": 4180 |
|
}, |
|
{ |
|
"epoch": 16.0023166023166, |
|
"grad_norm": 37.140682220458984, |
|
"learning_rate": 7.516087516087517e-06, |
|
"loss": 0.5947, |
|
"step": 4190 |
|
}, |
|
{ |
|
"epoch": 16.003088803088804, |
|
"grad_norm": 0.18892982602119446, |
|
"learning_rate": 7.507507507507507e-06, |
|
"loss": 0.2023, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 16.003861003861005, |
|
"grad_norm": 0.2875947952270508, |
|
"learning_rate": 7.498927498927499e-06, |
|
"loss": 0.337, |
|
"step": 4210 |
|
}, |
|
{ |
|
"epoch": 16.004633204633205, |
|
"grad_norm": 123.1329574584961, |
|
"learning_rate": 7.49034749034749e-06, |
|
"loss": 0.631, |
|
"step": 4220 |
|
}, |
|
{ |
|
"epoch": 16.005405405405405, |
|
"grad_norm": 66.25767517089844, |
|
"learning_rate": 7.481767481767482e-06, |
|
"loss": 0.3625, |
|
"step": 4230 |
|
}, |
|
{ |
|
"epoch": 16.006177606177605, |
|
"grad_norm": 31.93522071838379, |
|
"learning_rate": 7.473187473187474e-06, |
|
"loss": 0.5928, |
|
"step": 4240 |
|
}, |
|
{ |
|
"epoch": 16.006949806949805, |
|
"grad_norm": 0.35881680250167847, |
|
"learning_rate": 7.464607464607465e-06, |
|
"loss": 0.7209, |
|
"step": 4250 |
|
}, |
|
{ |
|
"epoch": 16.00772200772201, |
|
"grad_norm": 106.6112060546875, |
|
"learning_rate": 7.456027456027457e-06, |
|
"loss": 0.2844, |
|
"step": 4260 |
|
}, |
|
{ |
|
"epoch": 16.00849420849421, |
|
"grad_norm": 0.10399631410837173, |
|
"learning_rate": 7.447447447447448e-06, |
|
"loss": 0.2302, |
|
"step": 4270 |
|
}, |
|
{ |
|
"epoch": 16.00926640926641, |
|
"grad_norm": 10.694889068603516, |
|
"learning_rate": 7.4388674388674395e-06, |
|
"loss": 0.5468, |
|
"step": 4280 |
|
}, |
|
{ |
|
"epoch": 16.01003861003861, |
|
"grad_norm": 15.967513084411621, |
|
"learning_rate": 7.430287430287431e-06, |
|
"loss": 0.8236, |
|
"step": 4290 |
|
}, |
|
{ |
|
"epoch": 16.01081081081081, |
|
"grad_norm": 73.18832397460938, |
|
"learning_rate": 7.421707421707423e-06, |
|
"loss": 0.5327, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 16.01158301158301, |
|
"grad_norm": 17.472103118896484, |
|
"learning_rate": 7.413127413127414e-06, |
|
"loss": 0.4371, |
|
"step": 4310 |
|
}, |
|
{ |
|
"epoch": 16.012355212355214, |
|
"grad_norm": 0.6622916460037231, |
|
"learning_rate": 7.404547404547406e-06, |
|
"loss": 0.1643, |
|
"step": 4320 |
|
}, |
|
{ |
|
"epoch": 16.013127413127414, |
|
"grad_norm": 0.12445707619190216, |
|
"learning_rate": 7.395967395967396e-06, |
|
"loss": 0.037, |
|
"step": 4330 |
|
}, |
|
{ |
|
"epoch": 16.013899613899614, |
|
"grad_norm": 68.56678009033203, |
|
"learning_rate": 7.387387387387388e-06, |
|
"loss": 0.3456, |
|
"step": 4340 |
|
}, |
|
{ |
|
"epoch": 16.014671814671814, |
|
"grad_norm": 0.490728497505188, |
|
"learning_rate": 7.37880737880738e-06, |
|
"loss": 0.4691, |
|
"step": 4350 |
|
}, |
|
{ |
|
"epoch": 16.015444015444015, |
|
"grad_norm": 0.11592353880405426, |
|
"learning_rate": 7.370227370227371e-06, |
|
"loss": 0.3818, |
|
"step": 4360 |
|
}, |
|
{ |
|
"epoch": 16.016216216216215, |
|
"grad_norm": 0.130640909075737, |
|
"learning_rate": 7.361647361647363e-06, |
|
"loss": 0.404, |
|
"step": 4370 |
|
}, |
|
{ |
|
"epoch": 16.01698841698842, |
|
"grad_norm": 1.6968755722045898, |
|
"learning_rate": 7.353067353067354e-06, |
|
"loss": 0.3531, |
|
"step": 4380 |
|
}, |
|
{ |
|
"epoch": 16.01776061776062, |
|
"grad_norm": 0.15732808411121368, |
|
"learning_rate": 7.344487344487345e-06, |
|
"loss": 0.9261, |
|
"step": 4390 |
|
}, |
|
{ |
|
"epoch": 16.01853281853282, |
|
"grad_norm": 0.8805422782897949, |
|
"learning_rate": 7.335907335907336e-06, |
|
"loss": 0.6919, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 16.01930501930502, |
|
"grad_norm": 0.9305623173713684, |
|
"learning_rate": 7.327327327327328e-06, |
|
"loss": 0.1012, |
|
"step": 4410 |
|
}, |
|
{ |
|
"epoch": 16.02007722007722, |
|
"grad_norm": 118.25983428955078, |
|
"learning_rate": 7.318747318747319e-06, |
|
"loss": 0.5766, |
|
"step": 4420 |
|
}, |
|
{ |
|
"epoch": 16.02007722007722, |
|
"eval_accuracy": 0.7333333333333333, |
|
"eval_loss": 0.9611772298812866, |
|
"eval_runtime": 11.7122, |
|
"eval_samples_per_second": 3.842, |
|
"eval_steps_per_second": 1.281, |
|
"step": 4420 |
|
}, |
|
{ |
|
"epoch": 17.0007722007722, |
|
"grad_norm": 0.2061794251203537, |
|
"learning_rate": 7.310167310167311e-06, |
|
"loss": 0.6818, |
|
"step": 4430 |
|
}, |
|
{ |
|
"epoch": 17.0015444015444, |
|
"grad_norm": 89.11824035644531, |
|
"learning_rate": 7.301587301587301e-06, |
|
"loss": 0.2661, |
|
"step": 4440 |
|
}, |
|
{ |
|
"epoch": 17.0023166023166, |
|
"grad_norm": 1.2090812921524048, |
|
"learning_rate": 7.293007293007293e-06, |
|
"loss": 0.5385, |
|
"step": 4450 |
|
}, |
|
{ |
|
"epoch": 17.003088803088804, |
|
"grad_norm": 10.95218276977539, |
|
"learning_rate": 7.2844272844272845e-06, |
|
"loss": 0.3814, |
|
"step": 4460 |
|
}, |
|
{ |
|
"epoch": 17.003861003861005, |
|
"grad_norm": 0.34872862696647644, |
|
"learning_rate": 7.275847275847276e-06, |
|
"loss": 0.319, |
|
"step": 4470 |
|
}, |
|
{ |
|
"epoch": 17.004633204633205, |
|
"grad_norm": 0.4813756048679352, |
|
"learning_rate": 7.267267267267268e-06, |
|
"loss": 0.526, |
|
"step": 4480 |
|
}, |
|
{ |
|
"epoch": 17.005405405405405, |
|
"grad_norm": 0.2315344214439392, |
|
"learning_rate": 7.2586872586872595e-06, |
|
"loss": 0.4553, |
|
"step": 4490 |
|
}, |
|
{ |
|
"epoch": 17.006177606177605, |
|
"grad_norm": 19.774232864379883, |
|
"learning_rate": 7.250107250107251e-06, |
|
"loss": 0.2821, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 17.006949806949805, |
|
"grad_norm": 0.24049396812915802, |
|
"learning_rate": 7.241527241527242e-06, |
|
"loss": 0.4244, |
|
"step": 4510 |
|
}, |
|
{ |
|
"epoch": 17.00772200772201, |
|
"grad_norm": 0.1969604790210724, |
|
"learning_rate": 7.232947232947234e-06, |
|
"loss": 0.1505, |
|
"step": 4520 |
|
}, |
|
{ |
|
"epoch": 17.00849420849421, |
|
"grad_norm": 61.09734344482422, |
|
"learning_rate": 7.224367224367225e-06, |
|
"loss": 0.542, |
|
"step": 4530 |
|
}, |
|
{ |
|
"epoch": 17.00926640926641, |
|
"grad_norm": 1.5801252126693726, |
|
"learning_rate": 7.215787215787217e-06, |
|
"loss": 0.2881, |
|
"step": 4540 |
|
}, |
|
{ |
|
"epoch": 17.01003861003861, |
|
"grad_norm": 0.6441929936408997, |
|
"learning_rate": 7.207207207207208e-06, |
|
"loss": 0.8682, |
|
"step": 4550 |
|
}, |
|
{ |
|
"epoch": 17.01081081081081, |
|
"grad_norm": 0.10384261608123779, |
|
"learning_rate": 7.1986271986272e-06, |
|
"loss": 0.314, |
|
"step": 4560 |
|
}, |
|
{ |
|
"epoch": 17.01158301158301, |
|
"grad_norm": 1.5747077465057373, |
|
"learning_rate": 7.19004719004719e-06, |
|
"loss": 0.1259, |
|
"step": 4570 |
|
}, |
|
{ |
|
"epoch": 17.012355212355214, |
|
"grad_norm": 47.53998947143555, |
|
"learning_rate": 7.181467181467182e-06, |
|
"loss": 0.7676, |
|
"step": 4580 |
|
}, |
|
{ |
|
"epoch": 17.013127413127414, |
|
"grad_norm": 0.08766768872737885, |
|
"learning_rate": 7.172887172887173e-06, |
|
"loss": 0.1305, |
|
"step": 4590 |
|
}, |
|
{ |
|
"epoch": 17.013899613899614, |
|
"grad_norm": 2.643843650817871, |
|
"learning_rate": 7.164307164307165e-06, |
|
"loss": 0.3499, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 17.014671814671814, |
|
"grad_norm": 0.15104711055755615, |
|
"learning_rate": 7.155727155727157e-06, |
|
"loss": 0.1806, |
|
"step": 4610 |
|
}, |
|
{ |
|
"epoch": 17.015444015444015, |
|
"grad_norm": 0.07185631990432739, |
|
"learning_rate": 7.147147147147148e-06, |
|
"loss": 0.178, |
|
"step": 4620 |
|
}, |
|
{ |
|
"epoch": 17.016216216216215, |
|
"grad_norm": 12.274628639221191, |
|
"learning_rate": 7.13856713856714e-06, |
|
"loss": 0.8695, |
|
"step": 4630 |
|
}, |
|
{ |
|
"epoch": 17.01698841698842, |
|
"grad_norm": 234.01995849609375, |
|
"learning_rate": 7.12998712998713e-06, |
|
"loss": 0.7621, |
|
"step": 4640 |
|
}, |
|
{ |
|
"epoch": 17.01776061776062, |
|
"grad_norm": 0.27887094020843506, |
|
"learning_rate": 7.121407121407122e-06, |
|
"loss": 0.7544, |
|
"step": 4650 |
|
}, |
|
{ |
|
"epoch": 17.01853281853282, |
|
"grad_norm": 22.927310943603516, |
|
"learning_rate": 7.112827112827113e-06, |
|
"loss": 0.4503, |
|
"step": 4660 |
|
}, |
|
{ |
|
"epoch": 17.01930501930502, |
|
"grad_norm": 0.3981897234916687, |
|
"learning_rate": 7.104247104247105e-06, |
|
"loss": 0.8743, |
|
"step": 4670 |
|
}, |
|
{ |
|
"epoch": 17.02007722007722, |
|
"grad_norm": 0.04332108423113823, |
|
"learning_rate": 7.0956670956670955e-06, |
|
"loss": 0.3323, |
|
"step": 4680 |
|
}, |
|
{ |
|
"epoch": 17.02007722007722, |
|
"eval_accuracy": 0.6888888888888889, |
|
"eval_loss": 1.1610815525054932, |
|
"eval_runtime": 13.5549, |
|
"eval_samples_per_second": 3.32, |
|
"eval_steps_per_second": 1.107, |
|
"step": 4680 |
|
}, |
|
{ |
|
"epoch": 18.0007722007722, |
|
"grad_norm": 0.15380901098251343, |
|
"learning_rate": 7.087087087087087e-06, |
|
"loss": 0.5581, |
|
"step": 4690 |
|
}, |
|
{ |
|
"epoch": 18.0015444015444, |
|
"grad_norm": 3.311131238937378, |
|
"learning_rate": 7.0785070785070785e-06, |
|
"loss": 0.5238, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 18.0023166023166, |
|
"grad_norm": 0.010178559459745884, |
|
"learning_rate": 7.0699270699270705e-06, |
|
"loss": 0.438, |
|
"step": 4710 |
|
}, |
|
{ |
|
"epoch": 18.003088803088804, |
|
"grad_norm": 0.11447062343358994, |
|
"learning_rate": 7.0613470613470616e-06, |
|
"loss": 0.2677, |
|
"step": 4720 |
|
}, |
|
{ |
|
"epoch": 18.003861003861005, |
|
"grad_norm": 35.86017608642578, |
|
"learning_rate": 7.0527670527670535e-06, |
|
"loss": 0.4821, |
|
"step": 4730 |
|
}, |
|
{ |
|
"epoch": 18.004633204633205, |
|
"grad_norm": 0.34406280517578125, |
|
"learning_rate": 7.0441870441870455e-06, |
|
"loss": 0.0562, |
|
"step": 4740 |
|
}, |
|
{ |
|
"epoch": 18.005405405405405, |
|
"grad_norm": 0.14499299228191376, |
|
"learning_rate": 7.035607035607036e-06, |
|
"loss": 0.5351, |
|
"step": 4750 |
|
}, |
|
{ |
|
"epoch": 18.006177606177605, |
|
"grad_norm": 0.05108552426099777, |
|
"learning_rate": 7.027027027027028e-06, |
|
"loss": 0.3214, |
|
"step": 4760 |
|
}, |
|
{ |
|
"epoch": 18.006949806949805, |
|
"grad_norm": 0.02121145650744438, |
|
"learning_rate": 7.018447018447019e-06, |
|
"loss": 0.5145, |
|
"step": 4770 |
|
}, |
|
{ |
|
"epoch": 18.00772200772201, |
|
"grad_norm": 0.0515630841255188, |
|
"learning_rate": 7.009867009867011e-06, |
|
"loss": 0.281, |
|
"step": 4780 |
|
}, |
|
{ |
|
"epoch": 18.00849420849421, |
|
"grad_norm": 0.12710994482040405, |
|
"learning_rate": 7.001287001287002e-06, |
|
"loss": 0.8898, |
|
"step": 4790 |
|
}, |
|
{ |
|
"epoch": 18.00926640926641, |
|
"grad_norm": 0.12529367208480835, |
|
"learning_rate": 6.992706992706994e-06, |
|
"loss": 0.6286, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 18.01003861003861, |
|
"grad_norm": 42.525726318359375, |
|
"learning_rate": 6.984126984126984e-06, |
|
"loss": 0.2252, |
|
"step": 4810 |
|
}, |
|
{ |
|
"epoch": 18.01081081081081, |
|
"grad_norm": 0.06859328597784042, |
|
"learning_rate": 6.975546975546976e-06, |
|
"loss": 0.2308, |
|
"step": 4820 |
|
}, |
|
{ |
|
"epoch": 18.01158301158301, |
|
"grad_norm": 21.27465057373047, |
|
"learning_rate": 6.966966966966967e-06, |
|
"loss": 0.5888, |
|
"step": 4830 |
|
}, |
|
{ |
|
"epoch": 18.012355212355214, |
|
"grad_norm": 0.022731417790055275, |
|
"learning_rate": 6.958386958386959e-06, |
|
"loss": 0.5071, |
|
"step": 4840 |
|
}, |
|
{ |
|
"epoch": 18.013127413127414, |
|
"grad_norm": 0.6973882913589478, |
|
"learning_rate": 6.949806949806951e-06, |
|
"loss": 0.1836, |
|
"step": 4850 |
|
}, |
|
{ |
|
"epoch": 18.013899613899614, |
|
"grad_norm": 0.375298410654068, |
|
"learning_rate": 6.941226941226942e-06, |
|
"loss": 0.568, |
|
"step": 4860 |
|
}, |
|
{ |
|
"epoch": 18.014671814671814, |
|
"grad_norm": 0.13478554785251617, |
|
"learning_rate": 6.932646932646934e-06, |
|
"loss": 0.2527, |
|
"step": 4870 |
|
}, |
|
{ |
|
"epoch": 18.015444015444015, |
|
"grad_norm": 77.43624877929688, |
|
"learning_rate": 6.924066924066924e-06, |
|
"loss": 0.338, |
|
"step": 4880 |
|
}, |
|
{ |
|
"epoch": 18.016216216216215, |
|
"grad_norm": 33.197998046875, |
|
"learning_rate": 6.915486915486916e-06, |
|
"loss": 0.4993, |
|
"step": 4890 |
|
}, |
|
{ |
|
"epoch": 18.01698841698842, |
|
"grad_norm": 1.216697096824646, |
|
"learning_rate": 6.906906906906907e-06, |
|
"loss": 0.502, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 18.01776061776062, |
|
"grad_norm": 15.660502433776855, |
|
"learning_rate": 6.898326898326899e-06, |
|
"loss": 0.6187, |
|
"step": 4910 |
|
}, |
|
{ |
|
"epoch": 18.01853281853282, |
|
"grad_norm": 0.01838286779820919, |
|
"learning_rate": 6.88974688974689e-06, |
|
"loss": 0.3636, |
|
"step": 4920 |
|
}, |
|
{ |
|
"epoch": 18.01930501930502, |
|
"grad_norm": 157.7162628173828, |
|
"learning_rate": 6.881166881166882e-06, |
|
"loss": 0.1696, |
|
"step": 4930 |
|
}, |
|
{ |
|
"epoch": 18.02007722007722, |
|
"grad_norm": 0.8771587610244751, |
|
"learning_rate": 6.872586872586873e-06, |
|
"loss": 0.7162, |
|
"step": 4940 |
|
}, |
|
{ |
|
"epoch": 18.02007722007722, |
|
"eval_accuracy": 0.7111111111111111, |
|
"eval_loss": 1.30552339553833, |
|
"eval_runtime": 12.9627, |
|
"eval_samples_per_second": 3.472, |
|
"eval_steps_per_second": 1.157, |
|
"step": 4940 |
|
}, |
|
{ |
|
"epoch": 19.0007722007722, |
|
"grad_norm": 0.13848955929279327, |
|
"learning_rate": 6.8640068640068645e-06, |
|
"loss": 0.2741, |
|
"step": 4950 |
|
}, |
|
{ |
|
"epoch": 19.0015444015444, |
|
"grad_norm": 0.023964807391166687, |
|
"learning_rate": 6.855426855426856e-06, |
|
"loss": 0.2638, |
|
"step": 4960 |
|
}, |
|
{ |
|
"epoch": 19.0023166023166, |
|
"grad_norm": 4.055501461029053, |
|
"learning_rate": 6.846846846846848e-06, |
|
"loss": 0.3041, |
|
"step": 4970 |
|
}, |
|
{ |
|
"epoch": 19.003088803088804, |
|
"grad_norm": 0.14882507920265198, |
|
"learning_rate": 6.8382668382668395e-06, |
|
"loss": 0.3066, |
|
"step": 4980 |
|
}, |
|
{ |
|
"epoch": 19.003861003861005, |
|
"grad_norm": 1.0661811828613281, |
|
"learning_rate": 6.82968682968683e-06, |
|
"loss": 0.1638, |
|
"step": 4990 |
|
}, |
|
{ |
|
"epoch": 19.004633204633205, |
|
"grad_norm": 0.04383436217904091, |
|
"learning_rate": 6.821106821106822e-06, |
|
"loss": 0.3109, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 19.005405405405405, |
|
"grad_norm": 43.136775970458984, |
|
"learning_rate": 6.812526812526813e-06, |
|
"loss": 0.7934, |
|
"step": 5010 |
|
}, |
|
{ |
|
"epoch": 19.006177606177605, |
|
"grad_norm": 0.18534782528877258, |
|
"learning_rate": 6.803946803946805e-06, |
|
"loss": 0.4056, |
|
"step": 5020 |
|
}, |
|
{ |
|
"epoch": 19.006949806949805, |
|
"grad_norm": 4.773721218109131, |
|
"learning_rate": 6.795366795366796e-06, |
|
"loss": 0.3571, |
|
"step": 5030 |
|
}, |
|
{ |
|
"epoch": 19.00772200772201, |
|
"grad_norm": 117.23564910888672, |
|
"learning_rate": 6.786786786786788e-06, |
|
"loss": 0.1148, |
|
"step": 5040 |
|
}, |
|
{ |
|
"epoch": 19.00849420849421, |
|
"grad_norm": 0.11472011357545853, |
|
"learning_rate": 6.778206778206778e-06, |
|
"loss": 0.3655, |
|
"step": 5050 |
|
}, |
|
{ |
|
"epoch": 19.00926640926641, |
|
"grad_norm": 0.10049240291118622, |
|
"learning_rate": 6.76962676962677e-06, |
|
"loss": 0.0786, |
|
"step": 5060 |
|
}, |
|
{ |
|
"epoch": 19.01003861003861, |
|
"grad_norm": 1.114931344985962, |
|
"learning_rate": 6.761046761046761e-06, |
|
"loss": 0.5683, |
|
"step": 5070 |
|
}, |
|
{ |
|
"epoch": 19.01081081081081, |
|
"grad_norm": 0.0499207079410553, |
|
"learning_rate": 6.752466752466753e-06, |
|
"loss": 0.1673, |
|
"step": 5080 |
|
}, |
|
{ |
|
"epoch": 19.01158301158301, |
|
"grad_norm": 0.04504622146487236, |
|
"learning_rate": 6.743886743886744e-06, |
|
"loss": 0.0325, |
|
"step": 5090 |
|
}, |
|
{ |
|
"epoch": 19.012355212355214, |
|
"grad_norm": 24.717525482177734, |
|
"learning_rate": 6.735306735306736e-06, |
|
"loss": 0.2299, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 19.013127413127414, |
|
"grad_norm": 0.004078052006661892, |
|
"learning_rate": 6.726726726726728e-06, |
|
"loss": 0.4058, |
|
"step": 5110 |
|
}, |
|
{ |
|
"epoch": 19.013899613899614, |
|
"grad_norm": 28.717302322387695, |
|
"learning_rate": 6.718146718146718e-06, |
|
"loss": 1.0464, |
|
"step": 5120 |
|
}, |
|
{ |
|
"epoch": 19.014671814671814, |
|
"grad_norm": 0.1303737759590149, |
|
"learning_rate": 6.70956670956671e-06, |
|
"loss": 0.053, |
|
"step": 5130 |
|
}, |
|
{ |
|
"epoch": 19.015444015444015, |
|
"grad_norm": 23.893720626831055, |
|
"learning_rate": 6.700986700986701e-06, |
|
"loss": 0.8631, |
|
"step": 5140 |
|
}, |
|
{ |
|
"epoch": 19.016216216216215, |
|
"grad_norm": 0.1701466143131256, |
|
"learning_rate": 6.692406692406693e-06, |
|
"loss": 0.3543, |
|
"step": 5150 |
|
}, |
|
{ |
|
"epoch": 19.01698841698842, |
|
"grad_norm": 118.48575592041016, |
|
"learning_rate": 6.6838266838266844e-06, |
|
"loss": 0.5097, |
|
"step": 5160 |
|
}, |
|
{ |
|
"epoch": 19.01776061776062, |
|
"grad_norm": 58.45772933959961, |
|
"learning_rate": 6.675246675246676e-06, |
|
"loss": 0.183, |
|
"step": 5170 |
|
}, |
|
{ |
|
"epoch": 19.01853281853282, |
|
"grad_norm": 0.15131786465644836, |
|
"learning_rate": 6.666666666666667e-06, |
|
"loss": 0.1063, |
|
"step": 5180 |
|
}, |
|
{ |
|
"epoch": 19.01930501930502, |
|
"grad_norm": 0.42406681180000305, |
|
"learning_rate": 6.658086658086659e-06, |
|
"loss": 0.7002, |
|
"step": 5190 |
|
}, |
|
{ |
|
"epoch": 19.02007722007722, |
|
"grad_norm": 0.004914190154522657, |
|
"learning_rate": 6.64950664950665e-06, |
|
"loss": 0.1248, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 19.02007722007722, |
|
"eval_accuracy": 0.6666666666666666, |
|
"eval_loss": 1.5836811065673828, |
|
"eval_runtime": 12.2459, |
|
"eval_samples_per_second": 3.675, |
|
"eval_steps_per_second": 1.225, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 20.0007722007722, |
|
"grad_norm": 113.19306182861328, |
|
"learning_rate": 6.640926640926642e-06, |
|
"loss": 0.2273, |
|
"step": 5210 |
|
}, |
|
{ |
|
"epoch": 20.0015444015444, |
|
"grad_norm": 0.14228223264217377, |
|
"learning_rate": 6.632346632346633e-06, |
|
"loss": 0.2361, |
|
"step": 5220 |
|
}, |
|
{ |
|
"epoch": 20.0023166023166, |
|
"grad_norm": 0.030990980565547943, |
|
"learning_rate": 6.623766623766624e-06, |
|
"loss": 0.3815, |
|
"step": 5230 |
|
}, |
|
{ |
|
"epoch": 20.003088803088804, |
|
"grad_norm": 67.11844635009766, |
|
"learning_rate": 6.615186615186616e-06, |
|
"loss": 0.1815, |
|
"step": 5240 |
|
}, |
|
{ |
|
"epoch": 20.003861003861005, |
|
"grad_norm": 1.2835884094238281, |
|
"learning_rate": 6.606606606606607e-06, |
|
"loss": 0.1885, |
|
"step": 5250 |
|
}, |
|
{ |
|
"epoch": 20.004633204633205, |
|
"grad_norm": 182.08621215820312, |
|
"learning_rate": 6.598026598026599e-06, |
|
"loss": 0.2653, |
|
"step": 5260 |
|
}, |
|
{ |
|
"epoch": 20.005405405405405, |
|
"grad_norm": 0.08672849088907242, |
|
"learning_rate": 6.58944658944659e-06, |
|
"loss": 0.323, |
|
"step": 5270 |
|
}, |
|
{ |
|
"epoch": 20.006177606177605, |
|
"grad_norm": 0.03671255707740784, |
|
"learning_rate": 6.580866580866582e-06, |
|
"loss": 0.0026, |
|
"step": 5280 |
|
}, |
|
{ |
|
"epoch": 20.006949806949805, |
|
"grad_norm": 12.600577354431152, |
|
"learning_rate": 6.572286572286572e-06, |
|
"loss": 0.3475, |
|
"step": 5290 |
|
}, |
|
{ |
|
"epoch": 20.00772200772201, |
|
"grad_norm": 14.476115226745605, |
|
"learning_rate": 6.563706563706564e-06, |
|
"loss": 0.8815, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 20.00849420849421, |
|
"grad_norm": 0.035546738654375076, |
|
"learning_rate": 6.555126555126555e-06, |
|
"loss": 0.5063, |
|
"step": 5310 |
|
}, |
|
{ |
|
"epoch": 20.00926640926641, |
|
"grad_norm": 0.6188809275627136, |
|
"learning_rate": 6.546546546546547e-06, |
|
"loss": 0.3486, |
|
"step": 5320 |
|
}, |
|
{ |
|
"epoch": 20.01003861003861, |
|
"grad_norm": 0.2754557728767395, |
|
"learning_rate": 6.537966537966538e-06, |
|
"loss": 0.1505, |
|
"step": 5330 |
|
}, |
|
{ |
|
"epoch": 20.01081081081081, |
|
"grad_norm": 25.426105499267578, |
|
"learning_rate": 6.52938652938653e-06, |
|
"loss": 0.5716, |
|
"step": 5340 |
|
}, |
|
{ |
|
"epoch": 20.01158301158301, |
|
"grad_norm": 0.03355805203318596, |
|
"learning_rate": 6.520806520806522e-06, |
|
"loss": 0.3115, |
|
"step": 5350 |
|
}, |
|
{ |
|
"epoch": 20.012355212355214, |
|
"grad_norm": 0.5074798464775085, |
|
"learning_rate": 6.512226512226512e-06, |
|
"loss": 0.3633, |
|
"step": 5360 |
|
}, |
|
{ |
|
"epoch": 20.013127413127414, |
|
"grad_norm": 5.739364147186279, |
|
"learning_rate": 6.503646503646504e-06, |
|
"loss": 0.5785, |
|
"step": 5370 |
|
}, |
|
{ |
|
"epoch": 20.013899613899614, |
|
"grad_norm": 7.886667728424072, |
|
"learning_rate": 6.4950664950664955e-06, |
|
"loss": 0.8187, |
|
"step": 5380 |
|
}, |
|
{ |
|
"epoch": 20.014671814671814, |
|
"grad_norm": 0.18780578672885895, |
|
"learning_rate": 6.486486486486487e-06, |
|
"loss": 0.7403, |
|
"step": 5390 |
|
}, |
|
{ |
|
"epoch": 20.015444015444015, |
|
"grad_norm": 197.27720642089844, |
|
"learning_rate": 6.4779064779064785e-06, |
|
"loss": 0.1305, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 20.016216216216215, |
|
"grad_norm": 107.21308135986328, |
|
"learning_rate": 6.4693264693264705e-06, |
|
"loss": 0.0233, |
|
"step": 5410 |
|
}, |
|
{ |
|
"epoch": 20.01698841698842, |
|
"grad_norm": 15.643366813659668, |
|
"learning_rate": 6.460746460746461e-06, |
|
"loss": 0.4351, |
|
"step": 5420 |
|
}, |
|
{ |
|
"epoch": 20.01776061776062, |
|
"grad_norm": 0.6989626884460449, |
|
"learning_rate": 6.452166452166453e-06, |
|
"loss": 0.1247, |
|
"step": 5430 |
|
}, |
|
{ |
|
"epoch": 20.01853281853282, |
|
"grad_norm": 404.39361572265625, |
|
"learning_rate": 6.443586443586444e-06, |
|
"loss": 0.1101, |
|
"step": 5440 |
|
}, |
|
{ |
|
"epoch": 20.01930501930502, |
|
"grad_norm": 0.03856265917420387, |
|
"learning_rate": 6.435006435006436e-06, |
|
"loss": 0.2845, |
|
"step": 5450 |
|
}, |
|
{ |
|
"epoch": 20.02007722007722, |
|
"grad_norm": 0.12517881393432617, |
|
"learning_rate": 6.426426426426427e-06, |
|
"loss": 0.1746, |
|
"step": 5460 |
|
}, |
|
{ |
|
"epoch": 20.02007722007722, |
|
"eval_accuracy": 0.7333333333333333, |
|
"eval_loss": 1.2694180011749268, |
|
"eval_runtime": 12.6449, |
|
"eval_samples_per_second": 3.559, |
|
"eval_steps_per_second": 1.186, |
|
"step": 5460 |
|
}, |
|
{ |
|
"epoch": 21.0007722007722, |
|
"grad_norm": 0.5627447962760925, |
|
"learning_rate": 6.417846417846419e-06, |
|
"loss": 0.5971, |
|
"step": 5470 |
|
}, |
|
{ |
|
"epoch": 21.0015444015444, |
|
"grad_norm": 209.05210876464844, |
|
"learning_rate": 6.409266409266411e-06, |
|
"loss": 1.1831, |
|
"step": 5480 |
|
}, |
|
{ |
|
"epoch": 21.0023166023166, |
|
"grad_norm": 0.19635902345180511, |
|
"learning_rate": 6.400686400686401e-06, |
|
"loss": 0.5821, |
|
"step": 5490 |
|
}, |
|
{ |
|
"epoch": 21.003088803088804, |
|
"grad_norm": 0.13137683272361755, |
|
"learning_rate": 6.392106392106393e-06, |
|
"loss": 0.557, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 21.003861003861005, |
|
"grad_norm": 0.09237836301326752, |
|
"learning_rate": 6.383526383526384e-06, |
|
"loss": 0.2781, |
|
"step": 5510 |
|
}, |
|
{ |
|
"epoch": 21.004633204633205, |
|
"grad_norm": 0.08199318498373032, |
|
"learning_rate": 6.374946374946376e-06, |
|
"loss": 0.0898, |
|
"step": 5520 |
|
}, |
|
{ |
|
"epoch": 21.005405405405405, |
|
"grad_norm": 54.22182846069336, |
|
"learning_rate": 6.366366366366366e-06, |
|
"loss": 0.2855, |
|
"step": 5530 |
|
}, |
|
{ |
|
"epoch": 21.006177606177605, |
|
"grad_norm": 0.5029569268226624, |
|
"learning_rate": 6.357786357786358e-06, |
|
"loss": 0.0304, |
|
"step": 5540 |
|
}, |
|
{ |
|
"epoch": 21.006949806949805, |
|
"grad_norm": 177.83555603027344, |
|
"learning_rate": 6.349206349206349e-06, |
|
"loss": 0.3167, |
|
"step": 5550 |
|
}, |
|
{ |
|
"epoch": 21.00772200772201, |
|
"grad_norm": 0.42930805683135986, |
|
"learning_rate": 6.340626340626341e-06, |
|
"loss": 0.2099, |
|
"step": 5560 |
|
}, |
|
{ |
|
"epoch": 21.00849420849421, |
|
"grad_norm": 0.050365786999464035, |
|
"learning_rate": 6.332046332046332e-06, |
|
"loss": 0.3829, |
|
"step": 5570 |
|
}, |
|
{ |
|
"epoch": 21.00926640926641, |
|
"grad_norm": 0.05553288757801056, |
|
"learning_rate": 6.323466323466324e-06, |
|
"loss": 0.8343, |
|
"step": 5580 |
|
}, |
|
{ |
|
"epoch": 21.01003861003861, |
|
"grad_norm": 0.05720970034599304, |
|
"learning_rate": 6.3148863148863145e-06, |
|
"loss": 0.1574, |
|
"step": 5590 |
|
}, |
|
{ |
|
"epoch": 21.01081081081081, |
|
"grad_norm": 2.832240104675293, |
|
"learning_rate": 6.3063063063063065e-06, |
|
"loss": 0.409, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 21.01158301158301, |
|
"grad_norm": 158.55467224121094, |
|
"learning_rate": 6.297726297726298e-06, |
|
"loss": 0.5644, |
|
"step": 5610 |
|
}, |
|
{ |
|
"epoch": 21.012355212355214, |
|
"grad_norm": 0.13720889389514923, |
|
"learning_rate": 6.2891462891462895e-06, |
|
"loss": 0.6388, |
|
"step": 5620 |
|
}, |
|
{ |
|
"epoch": 21.013127413127414, |
|
"grad_norm": 64.92716217041016, |
|
"learning_rate": 6.2805662805662815e-06, |
|
"loss": 0.6628, |
|
"step": 5630 |
|
}, |
|
{ |
|
"epoch": 21.013899613899614, |
|
"grad_norm": 17.310565948486328, |
|
"learning_rate": 6.2719862719862726e-06, |
|
"loss": 0.5685, |
|
"step": 5640 |
|
}, |
|
{ |
|
"epoch": 21.014671814671814, |
|
"grad_norm": 0.2614205479621887, |
|
"learning_rate": 6.2634062634062645e-06, |
|
"loss": 0.0937, |
|
"step": 5650 |
|
}, |
|
{ |
|
"epoch": 21.015444015444015, |
|
"grad_norm": 11.924259185791016, |
|
"learning_rate": 6.254826254826255e-06, |
|
"loss": 0.3255, |
|
"step": 5660 |
|
}, |
|
{ |
|
"epoch": 21.016216216216215, |
|
"grad_norm": 0.06924501061439514, |
|
"learning_rate": 6.246246246246247e-06, |
|
"loss": 0.2687, |
|
"step": 5670 |
|
}, |
|
{ |
|
"epoch": 21.01698841698842, |
|
"grad_norm": 0.0833396166563034, |
|
"learning_rate": 6.237666237666238e-06, |
|
"loss": 0.1404, |
|
"step": 5680 |
|
}, |
|
{ |
|
"epoch": 21.01776061776062, |
|
"grad_norm": 0.17966850101947784, |
|
"learning_rate": 6.22908622908623e-06, |
|
"loss": 0.1596, |
|
"step": 5690 |
|
}, |
|
{ |
|
"epoch": 21.01853281853282, |
|
"grad_norm": 0.0691114291548729, |
|
"learning_rate": 6.220506220506221e-06, |
|
"loss": 0.2709, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 21.01930501930502, |
|
"grad_norm": 41.90225601196289, |
|
"learning_rate": 6.211926211926213e-06, |
|
"loss": 0.4854, |
|
"step": 5710 |
|
}, |
|
{ |
|
"epoch": 21.02007722007722, |
|
"grad_norm": 6.455798149108887, |
|
"learning_rate": 6.203346203346203e-06, |
|
"loss": 0.1973, |
|
"step": 5720 |
|
}, |
|
{ |
|
"epoch": 21.02007722007722, |
|
"eval_accuracy": 0.7777777777777778, |
|
"eval_loss": 1.1572198867797852, |
|
"eval_runtime": 13.795, |
|
"eval_samples_per_second": 3.262, |
|
"eval_steps_per_second": 1.087, |
|
"step": 5720 |
|
}, |
|
{ |
|
"epoch": 22.0007722007722, |
|
"grad_norm": 3.293867349624634, |
|
"learning_rate": 6.194766194766195e-06, |
|
"loss": 0.2579, |
|
"step": 5730 |
|
}, |
|
{ |
|
"epoch": 22.0015444015444, |
|
"grad_norm": 0.11658459901809692, |
|
"learning_rate": 6.186186186186187e-06, |
|
"loss": 0.4474, |
|
"step": 5740 |
|
}, |
|
{ |
|
"epoch": 22.0023166023166, |
|
"grad_norm": 0.0438237190246582, |
|
"learning_rate": 6.177606177606178e-06, |
|
"loss": 0.2652, |
|
"step": 5750 |
|
}, |
|
{ |
|
"epoch": 22.003088803088804, |
|
"grad_norm": 182.23719787597656, |
|
"learning_rate": 6.16902616902617e-06, |
|
"loss": 0.6564, |
|
"step": 5760 |
|
}, |
|
{ |
|
"epoch": 22.003861003861005, |
|
"grad_norm": 10.42225170135498, |
|
"learning_rate": 6.160446160446161e-06, |
|
"loss": 0.3219, |
|
"step": 5770 |
|
}, |
|
{ |
|
"epoch": 22.004633204633205, |
|
"grad_norm": 68.25077819824219, |
|
"learning_rate": 6.151866151866153e-06, |
|
"loss": 0.5195, |
|
"step": 5780 |
|
}, |
|
{ |
|
"epoch": 22.005405405405405, |
|
"grad_norm": 0.22111119329929352, |
|
"learning_rate": 6.143286143286143e-06, |
|
"loss": 0.1278, |
|
"step": 5790 |
|
}, |
|
{ |
|
"epoch": 22.006177606177605, |
|
"grad_norm": 119.94930267333984, |
|
"learning_rate": 6.134706134706135e-06, |
|
"loss": 0.3084, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 22.006949806949805, |
|
"grad_norm": 0.29699742794036865, |
|
"learning_rate": 6.126126126126126e-06, |
|
"loss": 0.276, |
|
"step": 5810 |
|
}, |
|
{ |
|
"epoch": 22.00772200772201, |
|
"grad_norm": 0.22920526564121246, |
|
"learning_rate": 6.117546117546118e-06, |
|
"loss": 0.3415, |
|
"step": 5820 |
|
}, |
|
{ |
|
"epoch": 22.00849420849421, |
|
"grad_norm": 93.11829376220703, |
|
"learning_rate": 6.108966108966109e-06, |
|
"loss": 0.4063, |
|
"step": 5830 |
|
}, |
|
{ |
|
"epoch": 22.00926640926641, |
|
"grad_norm": 0.09231970459222794, |
|
"learning_rate": 6.1003861003861005e-06, |
|
"loss": 0.2913, |
|
"step": 5840 |
|
}, |
|
{ |
|
"epoch": 22.01003861003861, |
|
"grad_norm": 0.13953036069869995, |
|
"learning_rate": 6.0918060918060925e-06, |
|
"loss": 0.2231, |
|
"step": 5850 |
|
}, |
|
{ |
|
"epoch": 22.01081081081081, |
|
"grad_norm": 9.176348686218262, |
|
"learning_rate": 6.083226083226084e-06, |
|
"loss": 0.1031, |
|
"step": 5860 |
|
}, |
|
{ |
|
"epoch": 22.01158301158301, |
|
"grad_norm": 0.002297205151990056, |
|
"learning_rate": 6.0746460746460755e-06, |
|
"loss": 0.3377, |
|
"step": 5870 |
|
}, |
|
{ |
|
"epoch": 22.012355212355214, |
|
"grad_norm": 0.011434354819357395, |
|
"learning_rate": 6.066066066066067e-06, |
|
"loss": 0.1273, |
|
"step": 5880 |
|
}, |
|
{ |
|
"epoch": 22.013127413127414, |
|
"grad_norm": 0.0924556627869606, |
|
"learning_rate": 6.057486057486059e-06, |
|
"loss": 0.1457, |
|
"step": 5890 |
|
}, |
|
{ |
|
"epoch": 22.013899613899614, |
|
"grad_norm": 0.3709995448589325, |
|
"learning_rate": 6.048906048906049e-06, |
|
"loss": 0.126, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 22.014671814671814, |
|
"grad_norm": 0.08549296110868454, |
|
"learning_rate": 6.040326040326041e-06, |
|
"loss": 0.1878, |
|
"step": 5910 |
|
}, |
|
{ |
|
"epoch": 22.015444015444015, |
|
"grad_norm": 100.20759582519531, |
|
"learning_rate": 6.031746031746032e-06, |
|
"loss": 0.5293, |
|
"step": 5920 |
|
}, |
|
{ |
|
"epoch": 22.016216216216215, |
|
"grad_norm": 0.04931352287530899, |
|
"learning_rate": 6.023166023166024e-06, |
|
"loss": 0.2636, |
|
"step": 5930 |
|
}, |
|
{ |
|
"epoch": 22.01698841698842, |
|
"grad_norm": 0.4091545343399048, |
|
"learning_rate": 6.014586014586015e-06, |
|
"loss": 0.4039, |
|
"step": 5940 |
|
}, |
|
{ |
|
"epoch": 22.01776061776062, |
|
"grad_norm": 0.03857075050473213, |
|
"learning_rate": 6.006006006006007e-06, |
|
"loss": 0.387, |
|
"step": 5950 |
|
}, |
|
{ |
|
"epoch": 22.01853281853282, |
|
"grad_norm": 91.95096588134766, |
|
"learning_rate": 5.997425997425997e-06, |
|
"loss": 0.3726, |
|
"step": 5960 |
|
}, |
|
{ |
|
"epoch": 22.01930501930502, |
|
"grad_norm": 0.522257387638092, |
|
"learning_rate": 5.988845988845989e-06, |
|
"loss": 0.256, |
|
"step": 5970 |
|
}, |
|
{ |
|
"epoch": 22.02007722007722, |
|
"grad_norm": 0.06971725821495056, |
|
"learning_rate": 5.980265980265981e-06, |
|
"loss": 0.7038, |
|
"step": 5980 |
|
}, |
|
{ |
|
"epoch": 22.02007722007722, |
|
"eval_accuracy": 0.7111111111111111, |
|
"eval_loss": 1.4034613370895386, |
|
"eval_runtime": 13.0086, |
|
"eval_samples_per_second": 3.459, |
|
"eval_steps_per_second": 1.153, |
|
"step": 5980 |
|
}, |
|
{ |
|
"epoch": 23.0007722007722, |
|
"grad_norm": 0.025312229990959167, |
|
"learning_rate": 5.971685971685972e-06, |
|
"loss": 0.2679, |
|
"step": 5990 |
|
}, |
|
{ |
|
"epoch": 23.0015444015444, |
|
"grad_norm": 4.412787437438965, |
|
"learning_rate": 5.963105963105964e-06, |
|
"loss": 1.0691, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 23.0023166023166, |
|
"grad_norm": 90.64250183105469, |
|
"learning_rate": 5.954525954525955e-06, |
|
"loss": 0.5154, |
|
"step": 6010 |
|
}, |
|
{ |
|
"epoch": 23.003088803088804, |
|
"grad_norm": 0.0309459138661623, |
|
"learning_rate": 5.945945945945947e-06, |
|
"loss": 0.1943, |
|
"step": 6020 |
|
}, |
|
{ |
|
"epoch": 23.003861003861005, |
|
"grad_norm": 9.064765930175781, |
|
"learning_rate": 5.937365937365937e-06, |
|
"loss": 0.2176, |
|
"step": 6030 |
|
}, |
|
{ |
|
"epoch": 23.004633204633205, |
|
"grad_norm": 261.7420654296875, |
|
"learning_rate": 5.928785928785929e-06, |
|
"loss": 0.3884, |
|
"step": 6040 |
|
}, |
|
{ |
|
"epoch": 23.005405405405405, |
|
"grad_norm": 0.10809492319822311, |
|
"learning_rate": 5.9202059202059204e-06, |
|
"loss": 0.4808, |
|
"step": 6050 |
|
}, |
|
{ |
|
"epoch": 23.006177606177605, |
|
"grad_norm": 0.49106425046920776, |
|
"learning_rate": 5.911625911625912e-06, |
|
"loss": 0.2767, |
|
"step": 6060 |
|
}, |
|
{ |
|
"epoch": 23.006949806949805, |
|
"grad_norm": 0.0949685275554657, |
|
"learning_rate": 5.9030459030459035e-06, |
|
"loss": 0.2362, |
|
"step": 6070 |
|
}, |
|
{ |
|
"epoch": 23.00772200772201, |
|
"grad_norm": 0.03749189153313637, |
|
"learning_rate": 5.894465894465895e-06, |
|
"loss": 0.3946, |
|
"step": 6080 |
|
}, |
|
{ |
|
"epoch": 23.00849420849421, |
|
"grad_norm": 0.4452950358390808, |
|
"learning_rate": 5.885885885885886e-06, |
|
"loss": 0.282, |
|
"step": 6090 |
|
}, |
|
{ |
|
"epoch": 23.00926640926641, |
|
"grad_norm": 0.19385001063346863, |
|
"learning_rate": 5.877305877305878e-06, |
|
"loss": 0.7796, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 23.01003861003861, |
|
"grad_norm": 23.24810791015625, |
|
"learning_rate": 5.86872586872587e-06, |
|
"loss": 0.2623, |
|
"step": 6110 |
|
}, |
|
{ |
|
"epoch": 23.01081081081081, |
|
"grad_norm": 20.898603439331055, |
|
"learning_rate": 5.860145860145861e-06, |
|
"loss": 0.2286, |
|
"step": 6120 |
|
}, |
|
{ |
|
"epoch": 23.01158301158301, |
|
"grad_norm": 4.817591190338135, |
|
"learning_rate": 5.851565851565853e-06, |
|
"loss": 0.1913, |
|
"step": 6130 |
|
}, |
|
{ |
|
"epoch": 23.012355212355214, |
|
"grad_norm": 57.4672737121582, |
|
"learning_rate": 5.842985842985843e-06, |
|
"loss": 0.5206, |
|
"step": 6140 |
|
}, |
|
{ |
|
"epoch": 23.013127413127414, |
|
"grad_norm": 0.08692280203104019, |
|
"learning_rate": 5.834405834405835e-06, |
|
"loss": 0.168, |
|
"step": 6150 |
|
}, |
|
{ |
|
"epoch": 23.013899613899614, |
|
"grad_norm": 0.025257082656025887, |
|
"learning_rate": 5.825825825825826e-06, |
|
"loss": 0.8649, |
|
"step": 6160 |
|
}, |
|
{ |
|
"epoch": 23.014671814671814, |
|
"grad_norm": 0.3466915190219879, |
|
"learning_rate": 5.817245817245818e-06, |
|
"loss": 0.3208, |
|
"step": 6170 |
|
}, |
|
{ |
|
"epoch": 23.015444015444015, |
|
"grad_norm": 0.011916655115783215, |
|
"learning_rate": 5.808665808665809e-06, |
|
"loss": 0.3571, |
|
"step": 6180 |
|
}, |
|
{ |
|
"epoch": 23.016216216216215, |
|
"grad_norm": 0.740023672580719, |
|
"learning_rate": 5.800085800085801e-06, |
|
"loss": 0.3148, |
|
"step": 6190 |
|
}, |
|
{ |
|
"epoch": 23.01698841698842, |
|
"grad_norm": 0.10393180698156357, |
|
"learning_rate": 5.791505791505791e-06, |
|
"loss": 0.0048, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 23.01776061776062, |
|
"grad_norm": 0.09348829090595245, |
|
"learning_rate": 5.782925782925783e-06, |
|
"loss": 0.1214, |
|
"step": 6210 |
|
}, |
|
{ |
|
"epoch": 23.01853281853282, |
|
"grad_norm": 11.198524475097656, |
|
"learning_rate": 5.774345774345775e-06, |
|
"loss": 0.5352, |
|
"step": 6220 |
|
}, |
|
{ |
|
"epoch": 23.01930501930502, |
|
"grad_norm": 0.02058136835694313, |
|
"learning_rate": 5.765765765765766e-06, |
|
"loss": 0.0152, |
|
"step": 6230 |
|
}, |
|
{ |
|
"epoch": 23.02007722007722, |
|
"grad_norm": 0.2556219696998596, |
|
"learning_rate": 5.757185757185758e-06, |
|
"loss": 0.3939, |
|
"step": 6240 |
|
}, |
|
{ |
|
"epoch": 23.02007722007722, |
|
"eval_accuracy": 0.6666666666666666, |
|
"eval_loss": 1.6774938106536865, |
|
"eval_runtime": 11.8897, |
|
"eval_samples_per_second": 3.785, |
|
"eval_steps_per_second": 1.262, |
|
"step": 6240 |
|
}, |
|
{ |
|
"epoch": 24.0007722007722, |
|
"grad_norm": 66.1806411743164, |
|
"learning_rate": 5.748605748605749e-06, |
|
"loss": 0.3863, |
|
"step": 6250 |
|
}, |
|
{ |
|
"epoch": 24.0015444015444, |
|
"grad_norm": 0.03881999850273132, |
|
"learning_rate": 5.740025740025741e-06, |
|
"loss": 0.3546, |
|
"step": 6260 |
|
}, |
|
{ |
|
"epoch": 24.0023166023166, |
|
"grad_norm": 17.894153594970703, |
|
"learning_rate": 5.7314457314457315e-06, |
|
"loss": 0.366, |
|
"step": 6270 |
|
}, |
|
{ |
|
"epoch": 24.003088803088804, |
|
"grad_norm": 0.07766028493642807, |
|
"learning_rate": 5.722865722865723e-06, |
|
"loss": 0.5358, |
|
"step": 6280 |
|
}, |
|
{ |
|
"epoch": 24.003861003861005, |
|
"grad_norm": 0.7994852662086487, |
|
"learning_rate": 5.7142857142857145e-06, |
|
"loss": 0.3454, |
|
"step": 6290 |
|
}, |
|
{ |
|
"epoch": 24.004633204633205, |
|
"grad_norm": 0.5819251537322998, |
|
"learning_rate": 5.7057057057057065e-06, |
|
"loss": 0.004, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 24.005405405405405, |
|
"grad_norm": 2.2003848552703857, |
|
"learning_rate": 5.6971256971256976e-06, |
|
"loss": 0.1608, |
|
"step": 6310 |
|
}, |
|
{ |
|
"epoch": 24.006177606177605, |
|
"grad_norm": 0.14731518924236298, |
|
"learning_rate": 5.6885456885456895e-06, |
|
"loss": 0.031, |
|
"step": 6320 |
|
}, |
|
{ |
|
"epoch": 24.006949806949805, |
|
"grad_norm": 0.06956463307142258, |
|
"learning_rate": 5.67996567996568e-06, |
|
"loss": 0.2076, |
|
"step": 6330 |
|
}, |
|
{ |
|
"epoch": 24.00772200772201, |
|
"grad_norm": 0.060266412794589996, |
|
"learning_rate": 5.671385671385672e-06, |
|
"loss": 0.5671, |
|
"step": 6340 |
|
}, |
|
{ |
|
"epoch": 24.00849420849421, |
|
"grad_norm": 16.026199340820312, |
|
"learning_rate": 5.662805662805664e-06, |
|
"loss": 0.4175, |
|
"step": 6350 |
|
}, |
|
{ |
|
"epoch": 24.00926640926641, |
|
"grad_norm": 0.04196026548743248, |
|
"learning_rate": 5.654225654225655e-06, |
|
"loss": 0.5757, |
|
"step": 6360 |
|
}, |
|
{ |
|
"epoch": 24.01003861003861, |
|
"grad_norm": 0.09439310431480408, |
|
"learning_rate": 5.645645645645647e-06, |
|
"loss": 0.075, |
|
"step": 6370 |
|
}, |
|
{ |
|
"epoch": 24.01081081081081, |
|
"grad_norm": 0.08069531619548798, |
|
"learning_rate": 5.637065637065637e-06, |
|
"loss": 0.198, |
|
"step": 6380 |
|
}, |
|
{ |
|
"epoch": 24.01158301158301, |
|
"grad_norm": 0.04372420907020569, |
|
"learning_rate": 5.628485628485629e-06, |
|
"loss": 0.1694, |
|
"step": 6390 |
|
}, |
|
{ |
|
"epoch": 24.012355212355214, |
|
"grad_norm": 6.671726703643799, |
|
"learning_rate": 5.61990561990562e-06, |
|
"loss": 0.1794, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 24.013127413127414, |
|
"grad_norm": 3.417841911315918, |
|
"learning_rate": 5.611325611325612e-06, |
|
"loss": 0.0045, |
|
"step": 6410 |
|
}, |
|
{ |
|
"epoch": 24.013899613899614, |
|
"grad_norm": 0.07374245673418045, |
|
"learning_rate": 5.602745602745603e-06, |
|
"loss": 0.3517, |
|
"step": 6420 |
|
}, |
|
{ |
|
"epoch": 24.014671814671814, |
|
"grad_norm": 136.1538848876953, |
|
"learning_rate": 5.594165594165595e-06, |
|
"loss": 0.4944, |
|
"step": 6430 |
|
}, |
|
{ |
|
"epoch": 24.015444015444015, |
|
"grad_norm": 0.022647704929113388, |
|
"learning_rate": 5.585585585585585e-06, |
|
"loss": 0.564, |
|
"step": 6440 |
|
}, |
|
{ |
|
"epoch": 24.016216216216215, |
|
"grad_norm": 41.52387619018555, |
|
"learning_rate": 5.577005577005577e-06, |
|
"loss": 0.7174, |
|
"step": 6450 |
|
}, |
|
{ |
|
"epoch": 24.01698841698842, |
|
"grad_norm": 0.14816536009311676, |
|
"learning_rate": 5.568425568425568e-06, |
|
"loss": 0.152, |
|
"step": 6460 |
|
}, |
|
{ |
|
"epoch": 24.01776061776062, |
|
"grad_norm": 3.488743543624878, |
|
"learning_rate": 5.55984555984556e-06, |
|
"loss": 0.2765, |
|
"step": 6470 |
|
}, |
|
{ |
|
"epoch": 24.01853281853282, |
|
"grad_norm": 0.18953141570091248, |
|
"learning_rate": 5.551265551265552e-06, |
|
"loss": 0.4315, |
|
"step": 6480 |
|
}, |
|
{ |
|
"epoch": 24.01930501930502, |
|
"grad_norm": 0.027093812823295593, |
|
"learning_rate": 5.542685542685543e-06, |
|
"loss": 0.1277, |
|
"step": 6490 |
|
}, |
|
{ |
|
"epoch": 24.02007722007722, |
|
"grad_norm": 0.24826771020889282, |
|
"learning_rate": 5.534105534105535e-06, |
|
"loss": 0.4129, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 24.02007722007722, |
|
"eval_accuracy": 0.6888888888888889, |
|
"eval_loss": 1.6039283275604248, |
|
"eval_runtime": 11.6661, |
|
"eval_samples_per_second": 3.857, |
|
"eval_steps_per_second": 1.286, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 25.0007722007722, |
|
"grad_norm": 0.07161914557218552, |
|
"learning_rate": 5.5255255255255255e-06, |
|
"loss": 0.1366, |
|
"step": 6510 |
|
}, |
|
{ |
|
"epoch": 25.0015444015444, |
|
"grad_norm": 125.4383544921875, |
|
"learning_rate": 5.5169455169455175e-06, |
|
"loss": 0.1942, |
|
"step": 6520 |
|
}, |
|
{ |
|
"epoch": 25.0023166023166, |
|
"grad_norm": 0.08652602881193161, |
|
"learning_rate": 5.5083655083655086e-06, |
|
"loss": 0.1576, |
|
"step": 6530 |
|
}, |
|
{ |
|
"epoch": 25.003088803088804, |
|
"grad_norm": 0.08510206639766693, |
|
"learning_rate": 5.4997854997855005e-06, |
|
"loss": 0.2385, |
|
"step": 6540 |
|
}, |
|
{ |
|
"epoch": 25.003861003861005, |
|
"grad_norm": 0.060757704079151154, |
|
"learning_rate": 5.491205491205492e-06, |
|
"loss": 0.3652, |
|
"step": 6550 |
|
}, |
|
{ |
|
"epoch": 25.004633204633205, |
|
"grad_norm": 53.286048889160156, |
|
"learning_rate": 5.4826254826254836e-06, |
|
"loss": 0.5818, |
|
"step": 6560 |
|
}, |
|
{ |
|
"epoch": 25.005405405405405, |
|
"grad_norm": 0.04687732830643654, |
|
"learning_rate": 5.474045474045474e-06, |
|
"loss": 0.5724, |
|
"step": 6570 |
|
}, |
|
{ |
|
"epoch": 25.006177606177605, |
|
"grad_norm": 0.12820780277252197, |
|
"learning_rate": 5.465465465465466e-06, |
|
"loss": 0.5346, |
|
"step": 6580 |
|
}, |
|
{ |
|
"epoch": 25.006949806949805, |
|
"grad_norm": 128.59718322753906, |
|
"learning_rate": 5.456885456885457e-06, |
|
"loss": 0.5187, |
|
"step": 6590 |
|
}, |
|
{ |
|
"epoch": 25.00772200772201, |
|
"grad_norm": 15.195361137390137, |
|
"learning_rate": 5.448305448305449e-06, |
|
"loss": 0.0123, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 25.00849420849421, |
|
"grad_norm": 0.21481989324092865, |
|
"learning_rate": 5.439725439725441e-06, |
|
"loss": 0.1878, |
|
"step": 6610 |
|
}, |
|
{ |
|
"epoch": 25.00926640926641, |
|
"grad_norm": 1.0520018339157104, |
|
"learning_rate": 5.431145431145432e-06, |
|
"loss": 0.1593, |
|
"step": 6620 |
|
}, |
|
{ |
|
"epoch": 25.01003861003861, |
|
"grad_norm": 0.05230041593313217, |
|
"learning_rate": 5.422565422565423e-06, |
|
"loss": 0.1364, |
|
"step": 6630 |
|
}, |
|
{ |
|
"epoch": 25.01081081081081, |
|
"grad_norm": 0.03155992925167084, |
|
"learning_rate": 5.413985413985414e-06, |
|
"loss": 0.0715, |
|
"step": 6640 |
|
}, |
|
{ |
|
"epoch": 25.01158301158301, |
|
"grad_norm": 25.422481536865234, |
|
"learning_rate": 5.405405405405406e-06, |
|
"loss": 0.3257, |
|
"step": 6650 |
|
}, |
|
{ |
|
"epoch": 25.012355212355214, |
|
"grad_norm": 0.06870962679386139, |
|
"learning_rate": 5.396825396825397e-06, |
|
"loss": 0.1611, |
|
"step": 6660 |
|
}, |
|
{ |
|
"epoch": 25.013127413127414, |
|
"grad_norm": 0.7361809611320496, |
|
"learning_rate": 5.388245388245389e-06, |
|
"loss": 0.0661, |
|
"step": 6670 |
|
}, |
|
{ |
|
"epoch": 25.013899613899614, |
|
"grad_norm": 0.007542428094893694, |
|
"learning_rate": 5.379665379665379e-06, |
|
"loss": 0.303, |
|
"step": 6680 |
|
}, |
|
{ |
|
"epoch": 25.014671814671814, |
|
"grad_norm": 0.03758428990840912, |
|
"learning_rate": 5.371085371085371e-06, |
|
"loss": 0.3519, |
|
"step": 6690 |
|
}, |
|
{ |
|
"epoch": 25.015444015444015, |
|
"grad_norm": 4.279178142547607, |
|
"learning_rate": 5.362505362505362e-06, |
|
"loss": 0.2035, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 25.016216216216215, |
|
"grad_norm": 0.024570725858211517, |
|
"learning_rate": 5.353925353925354e-06, |
|
"loss": 0.544, |
|
"step": 6710 |
|
}, |
|
{ |
|
"epoch": 25.01698841698842, |
|
"grad_norm": 0.11677242070436478, |
|
"learning_rate": 5.345345345345346e-06, |
|
"loss": 0.2069, |
|
"step": 6720 |
|
}, |
|
{ |
|
"epoch": 25.01776061776062, |
|
"grad_norm": 56.570030212402344, |
|
"learning_rate": 5.336765336765337e-06, |
|
"loss": 0.37, |
|
"step": 6730 |
|
}, |
|
{ |
|
"epoch": 25.01853281853282, |
|
"grad_norm": 111.80818176269531, |
|
"learning_rate": 5.328185328185329e-06, |
|
"loss": 0.1691, |
|
"step": 6740 |
|
}, |
|
{ |
|
"epoch": 25.01930501930502, |
|
"grad_norm": 19.810441970825195, |
|
"learning_rate": 5.31960531960532e-06, |
|
"loss": 0.6511, |
|
"step": 6750 |
|
}, |
|
{ |
|
"epoch": 25.02007722007722, |
|
"grad_norm": 0.07480132579803467, |
|
"learning_rate": 5.3110253110253115e-06, |
|
"loss": 0.2852, |
|
"step": 6760 |
|
}, |
|
{ |
|
"epoch": 25.02007722007722, |
|
"eval_accuracy": 0.7111111111111111, |
|
"eval_loss": 1.676919937133789, |
|
"eval_runtime": 11.755, |
|
"eval_samples_per_second": 3.828, |
|
"eval_steps_per_second": 1.276, |
|
"step": 6760 |
|
}, |
|
{ |
|
"epoch": 26.0007722007722, |
|
"grad_norm": 7.0793681144714355, |
|
"learning_rate": 5.302445302445303e-06, |
|
"loss": 0.3598, |
|
"step": 6770 |
|
}, |
|
{ |
|
"epoch": 26.0015444015444, |
|
"grad_norm": 0.04627354443073273, |
|
"learning_rate": 5.293865293865295e-06, |
|
"loss": 0.2332, |
|
"step": 6780 |
|
}, |
|
{ |
|
"epoch": 26.0023166023166, |
|
"grad_norm": 0.015025601722300053, |
|
"learning_rate": 5.285285285285286e-06, |
|
"loss": 0.3468, |
|
"step": 6790 |
|
}, |
|
{ |
|
"epoch": 26.003088803088804, |
|
"grad_norm": 0.9374003410339355, |
|
"learning_rate": 5.276705276705278e-06, |
|
"loss": 0.2451, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 26.003861003861005, |
|
"grad_norm": 0.06715880334377289, |
|
"learning_rate": 5.268125268125268e-06, |
|
"loss": 0.0179, |
|
"step": 6810 |
|
}, |
|
{ |
|
"epoch": 26.004633204633205, |
|
"grad_norm": 0.1493833363056183, |
|
"learning_rate": 5.25954525954526e-06, |
|
"loss": 0.4544, |
|
"step": 6820 |
|
}, |
|
{ |
|
"epoch": 26.005405405405405, |
|
"grad_norm": 0.008493917062878609, |
|
"learning_rate": 5.250965250965251e-06, |
|
"loss": 0.4215, |
|
"step": 6830 |
|
}, |
|
{ |
|
"epoch": 26.006177606177605, |
|
"grad_norm": 0.24284197390079498, |
|
"learning_rate": 5.242385242385243e-06, |
|
"loss": 0.2307, |
|
"step": 6840 |
|
}, |
|
{ |
|
"epoch": 26.006949806949805, |
|
"grad_norm": 1.8147085905075073, |
|
"learning_rate": 5.233805233805235e-06, |
|
"loss": 0.1164, |
|
"step": 6850 |
|
}, |
|
{ |
|
"epoch": 26.00772200772201, |
|
"grad_norm": 0.02225414104759693, |
|
"learning_rate": 5.225225225225226e-06, |
|
"loss": 0.6091, |
|
"step": 6860 |
|
}, |
|
{ |
|
"epoch": 26.00849420849421, |
|
"grad_norm": 0.03931829333305359, |
|
"learning_rate": 5.216645216645218e-06, |
|
"loss": 0.4782, |
|
"step": 6870 |
|
}, |
|
{ |
|
"epoch": 26.00926640926641, |
|
"grad_norm": 0.9832797050476074, |
|
"learning_rate": 5.208065208065208e-06, |
|
"loss": 0.107, |
|
"step": 6880 |
|
}, |
|
{ |
|
"epoch": 26.01003861003861, |
|
"grad_norm": 0.5841915607452393, |
|
"learning_rate": 5.1994851994852e-06, |
|
"loss": 0.0516, |
|
"step": 6890 |
|
}, |
|
{ |
|
"epoch": 26.01081081081081, |
|
"grad_norm": 0.2941983640193939, |
|
"learning_rate": 5.190905190905191e-06, |
|
"loss": 0.4148, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 26.01158301158301, |
|
"grad_norm": 0.057785652577877045, |
|
"learning_rate": 5.182325182325183e-06, |
|
"loss": 0.1971, |
|
"step": 6910 |
|
}, |
|
{ |
|
"epoch": 26.012355212355214, |
|
"grad_norm": 0.07821908593177795, |
|
"learning_rate": 5.173745173745173e-06, |
|
"loss": 0.1446, |
|
"step": 6920 |
|
}, |
|
{ |
|
"epoch": 26.013127413127414, |
|
"grad_norm": 76.94812774658203, |
|
"learning_rate": 5.165165165165165e-06, |
|
"loss": 0.5563, |
|
"step": 6930 |
|
}, |
|
{ |
|
"epoch": 26.013899613899614, |
|
"grad_norm": 0.0389854796230793, |
|
"learning_rate": 5.1565851565851565e-06, |
|
"loss": 0.2554, |
|
"step": 6940 |
|
}, |
|
{ |
|
"epoch": 26.014671814671814, |
|
"grad_norm": 0.08624967187643051, |
|
"learning_rate": 5.148005148005148e-06, |
|
"loss": 0.0047, |
|
"step": 6950 |
|
}, |
|
{ |
|
"epoch": 26.015444015444015, |
|
"grad_norm": 0.022646430879831314, |
|
"learning_rate": 5.1394251394251395e-06, |
|
"loss": 0.3356, |
|
"step": 6960 |
|
}, |
|
{ |
|
"epoch": 26.016216216216215, |
|
"grad_norm": 0.004841935355216265, |
|
"learning_rate": 5.1308451308451314e-06, |
|
"loss": 0.0198, |
|
"step": 6970 |
|
}, |
|
{ |
|
"epoch": 26.01698841698842, |
|
"grad_norm": 118.31085968017578, |
|
"learning_rate": 5.122265122265123e-06, |
|
"loss": 0.3741, |
|
"step": 6980 |
|
}, |
|
{ |
|
"epoch": 26.01776061776062, |
|
"grad_norm": 95.70652770996094, |
|
"learning_rate": 5.113685113685114e-06, |
|
"loss": 0.224, |
|
"step": 6990 |
|
}, |
|
{ |
|
"epoch": 26.01853281853282, |
|
"grad_norm": 114.44501495361328, |
|
"learning_rate": 5.105105105105106e-06, |
|
"loss": 0.2583, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 26.01930501930502, |
|
"grad_norm": 197.85206604003906, |
|
"learning_rate": 5.096525096525097e-06, |
|
"loss": 0.0754, |
|
"step": 7010 |
|
}, |
|
{ |
|
"epoch": 26.02007722007722, |
|
"grad_norm": 0.006557074375450611, |
|
"learning_rate": 5.087945087945089e-06, |
|
"loss": 0.3278, |
|
"step": 7020 |
|
}, |
|
{ |
|
"epoch": 26.02007722007722, |
|
"eval_accuracy": 0.6888888888888889, |
|
"eval_loss": 1.9128968715667725, |
|
"eval_runtime": 11.5823, |
|
"eval_samples_per_second": 3.885, |
|
"eval_steps_per_second": 1.295, |
|
"step": 7020 |
|
}, |
|
{ |
|
"epoch": 27.0007722007722, |
|
"grad_norm": 0.02128458209335804, |
|
"learning_rate": 5.07936507936508e-06, |
|
"loss": 0.896, |
|
"step": 7030 |
|
}, |
|
{ |
|
"epoch": 27.0015444015444, |
|
"grad_norm": 1.866234540939331, |
|
"learning_rate": 5.070785070785072e-06, |
|
"loss": 0.139, |
|
"step": 7040 |
|
}, |
|
{ |
|
"epoch": 27.0023166023166, |
|
"grad_norm": 121.14530944824219, |
|
"learning_rate": 5.062205062205062e-06, |
|
"loss": 0.1566, |
|
"step": 7050 |
|
}, |
|
{ |
|
"epoch": 27.003088803088804, |
|
"grad_norm": 0.4306340515613556, |
|
"learning_rate": 5.053625053625054e-06, |
|
"loss": 0.1125, |
|
"step": 7060 |
|
}, |
|
{ |
|
"epoch": 27.003861003861005, |
|
"grad_norm": 0.0931638553738594, |
|
"learning_rate": 5.045045045045045e-06, |
|
"loss": 0.0045, |
|
"step": 7070 |
|
}, |
|
{ |
|
"epoch": 27.004633204633205, |
|
"grad_norm": 21.92576789855957, |
|
"learning_rate": 5.036465036465037e-06, |
|
"loss": 0.5565, |
|
"step": 7080 |
|
}, |
|
{ |
|
"epoch": 27.005405405405405, |
|
"grad_norm": 0.08851654082536697, |
|
"learning_rate": 5.027885027885028e-06, |
|
"loss": 0.0472, |
|
"step": 7090 |
|
}, |
|
{ |
|
"epoch": 27.006177606177605, |
|
"grad_norm": 0.02390648052096367, |
|
"learning_rate": 5.01930501930502e-06, |
|
"loss": 0.3594, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 27.006949806949805, |
|
"grad_norm": 0.02931853011250496, |
|
"learning_rate": 5.010725010725012e-06, |
|
"loss": 0.285, |
|
"step": 7110 |
|
}, |
|
{ |
|
"epoch": 27.00772200772201, |
|
"grad_norm": 0.044821541756391525, |
|
"learning_rate": 5.002145002145002e-06, |
|
"loss": 0.4182, |
|
"step": 7120 |
|
}, |
|
{ |
|
"epoch": 27.00849420849421, |
|
"grad_norm": 0.0022613268811255693, |
|
"learning_rate": 4.993564993564994e-06, |
|
"loss": 0.2255, |
|
"step": 7130 |
|
}, |
|
{ |
|
"epoch": 27.00926640926641, |
|
"grad_norm": 0.052004750818014145, |
|
"learning_rate": 4.984984984984985e-06, |
|
"loss": 0.0132, |
|
"step": 7140 |
|
}, |
|
{ |
|
"epoch": 27.01003861003861, |
|
"grad_norm": 0.07019229978322983, |
|
"learning_rate": 4.976404976404976e-06, |
|
"loss": 0.1932, |
|
"step": 7150 |
|
}, |
|
{ |
|
"epoch": 27.01081081081081, |
|
"grad_norm": 2.3248353004455566, |
|
"learning_rate": 4.967824967824968e-06, |
|
"loss": 0.3329, |
|
"step": 7160 |
|
}, |
|
{ |
|
"epoch": 27.01158301158301, |
|
"grad_norm": 0.016531068831682205, |
|
"learning_rate": 4.95924495924496e-06, |
|
"loss": 0.2348, |
|
"step": 7170 |
|
}, |
|
{ |
|
"epoch": 27.012355212355214, |
|
"grad_norm": 0.010612938553094864, |
|
"learning_rate": 4.950664950664951e-06, |
|
"loss": 0.0597, |
|
"step": 7180 |
|
}, |
|
{ |
|
"epoch": 27.013127413127414, |
|
"grad_norm": 0.01717378944158554, |
|
"learning_rate": 4.9420849420849425e-06, |
|
"loss": 0.171, |
|
"step": 7190 |
|
}, |
|
{ |
|
"epoch": 27.013899613899614, |
|
"grad_norm": 0.027986228466033936, |
|
"learning_rate": 4.9335049335049336e-06, |
|
"loss": 0.3064, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 27.014671814671814, |
|
"grad_norm": 0.02567068673670292, |
|
"learning_rate": 4.9249249249249255e-06, |
|
"loss": 0.1591, |
|
"step": 7210 |
|
}, |
|
{ |
|
"epoch": 27.015444015444015, |
|
"grad_norm": 0.11277701705694199, |
|
"learning_rate": 4.916344916344917e-06, |
|
"loss": 0.0005, |
|
"step": 7220 |
|
}, |
|
{ |
|
"epoch": 27.016216216216215, |
|
"grad_norm": 0.00369657832197845, |
|
"learning_rate": 4.907764907764908e-06, |
|
"loss": 0.2153, |
|
"step": 7230 |
|
}, |
|
{ |
|
"epoch": 27.01698841698842, |
|
"grad_norm": 0.036675579845905304, |
|
"learning_rate": 4.8991848991849e-06, |
|
"loss": 0.2176, |
|
"step": 7240 |
|
}, |
|
{ |
|
"epoch": 27.01776061776062, |
|
"grad_norm": 0.026838406920433044, |
|
"learning_rate": 4.890604890604891e-06, |
|
"loss": 0.5287, |
|
"step": 7250 |
|
}, |
|
{ |
|
"epoch": 27.01853281853282, |
|
"grad_norm": 0.8378888964653015, |
|
"learning_rate": 4.882024882024882e-06, |
|
"loss": 0.0196, |
|
"step": 7260 |
|
}, |
|
{ |
|
"epoch": 27.01930501930502, |
|
"grad_norm": 0.03345318138599396, |
|
"learning_rate": 4.873444873444874e-06, |
|
"loss": 0.1936, |
|
"step": 7270 |
|
}, |
|
{ |
|
"epoch": 27.02007722007722, |
|
"grad_norm": 0.004973672796040773, |
|
"learning_rate": 4.864864864864866e-06, |
|
"loss": 0.7677, |
|
"step": 7280 |
|
}, |
|
{ |
|
"epoch": 27.02007722007722, |
|
"eval_accuracy": 0.6666666666666666, |
|
"eval_loss": 1.83974027633667, |
|
"eval_runtime": 11.7937, |
|
"eval_samples_per_second": 3.816, |
|
"eval_steps_per_second": 1.272, |
|
"step": 7280 |
|
}, |
|
{ |
|
"epoch": 28.0007722007722, |
|
"grad_norm": 0.023714346811175346, |
|
"learning_rate": 4.856284856284857e-06, |
|
"loss": 0.1018, |
|
"step": 7290 |
|
}, |
|
{ |
|
"epoch": 28.0015444015444, |
|
"grad_norm": 122.97506713867188, |
|
"learning_rate": 4.847704847704848e-06, |
|
"loss": 0.219, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 28.0023166023166, |
|
"grad_norm": 0.14896957576274872, |
|
"learning_rate": 4.83912483912484e-06, |
|
"loss": 0.1888, |
|
"step": 7310 |
|
}, |
|
{ |
|
"epoch": 28.003088803088804, |
|
"grad_norm": 0.05097675696015358, |
|
"learning_rate": 4.830544830544831e-06, |
|
"loss": 0.276, |
|
"step": 7320 |
|
}, |
|
{ |
|
"epoch": 28.003861003861005, |
|
"grad_norm": 0.01336723007261753, |
|
"learning_rate": 4.821964821964822e-06, |
|
"loss": 0.0007, |
|
"step": 7330 |
|
}, |
|
{ |
|
"epoch": 28.004633204633205, |
|
"grad_norm": 2.9897027015686035, |
|
"learning_rate": 4.813384813384814e-06, |
|
"loss": 0.0017, |
|
"step": 7340 |
|
}, |
|
{ |
|
"epoch": 28.005405405405405, |
|
"grad_norm": 310.45367431640625, |
|
"learning_rate": 4.804804804804805e-06, |
|
"loss": 0.2582, |
|
"step": 7350 |
|
}, |
|
{ |
|
"epoch": 28.006177606177605, |
|
"grad_norm": 12.623224258422852, |
|
"learning_rate": 4.796224796224796e-06, |
|
"loss": 0.2349, |
|
"step": 7360 |
|
}, |
|
{ |
|
"epoch": 28.006949806949805, |
|
"grad_norm": 323.9424743652344, |
|
"learning_rate": 4.787644787644788e-06, |
|
"loss": 0.4081, |
|
"step": 7370 |
|
}, |
|
{ |
|
"epoch": 28.00772200772201, |
|
"grad_norm": 0.05448540300130844, |
|
"learning_rate": 4.779064779064779e-06, |
|
"loss": 0.5078, |
|
"step": 7380 |
|
}, |
|
{ |
|
"epoch": 28.00849420849421, |
|
"grad_norm": 105.11054229736328, |
|
"learning_rate": 4.7704847704847704e-06, |
|
"loss": 0.4487, |
|
"step": 7390 |
|
}, |
|
{ |
|
"epoch": 28.00926640926641, |
|
"grad_norm": 0.3405386209487915, |
|
"learning_rate": 4.761904761904762e-06, |
|
"loss": 0.6769, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 28.01003861003861, |
|
"grad_norm": 0.48794102668762207, |
|
"learning_rate": 4.753324753324754e-06, |
|
"loss": 0.1995, |
|
"step": 7410 |
|
}, |
|
{ |
|
"epoch": 28.01081081081081, |
|
"grad_norm": 0.042529307305812836, |
|
"learning_rate": 4.7447447447447454e-06, |
|
"loss": 0.2511, |
|
"step": 7420 |
|
}, |
|
{ |
|
"epoch": 28.01158301158301, |
|
"grad_norm": 0.014477269724011421, |
|
"learning_rate": 4.7361647361647365e-06, |
|
"loss": 0.0833, |
|
"step": 7430 |
|
}, |
|
{ |
|
"epoch": 28.012355212355214, |
|
"grad_norm": 0.003762512933462858, |
|
"learning_rate": 4.7275847275847285e-06, |
|
"loss": 0.2294, |
|
"step": 7440 |
|
}, |
|
{ |
|
"epoch": 28.013127413127414, |
|
"grad_norm": 157.7808380126953, |
|
"learning_rate": 4.71900471900472e-06, |
|
"loss": 0.3185, |
|
"step": 7450 |
|
}, |
|
{ |
|
"epoch": 28.013899613899614, |
|
"grad_norm": 93.19122314453125, |
|
"learning_rate": 4.710424710424711e-06, |
|
"loss": 0.4783, |
|
"step": 7460 |
|
}, |
|
{ |
|
"epoch": 28.014671814671814, |
|
"grad_norm": 0.0037978780455887318, |
|
"learning_rate": 4.701844701844703e-06, |
|
"loss": 0.0011, |
|
"step": 7470 |
|
}, |
|
{ |
|
"epoch": 28.015444015444015, |
|
"grad_norm": 0.003815456759184599, |
|
"learning_rate": 4.693264693264694e-06, |
|
"loss": 0.071, |
|
"step": 7480 |
|
}, |
|
{ |
|
"epoch": 28.016216216216215, |
|
"grad_norm": 0.29880446195602417, |
|
"learning_rate": 4.684684684684685e-06, |
|
"loss": 0.0157, |
|
"step": 7490 |
|
}, |
|
{ |
|
"epoch": 28.01698841698842, |
|
"grad_norm": 151.8013458251953, |
|
"learning_rate": 4.676104676104676e-06, |
|
"loss": 0.3594, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 28.01776061776062, |
|
"grad_norm": 0.013940732926130295, |
|
"learning_rate": 4.667524667524668e-06, |
|
"loss": 0.0044, |
|
"step": 7510 |
|
}, |
|
{ |
|
"epoch": 28.01853281853282, |
|
"grad_norm": 0.23434804379940033, |
|
"learning_rate": 4.658944658944659e-06, |
|
"loss": 0.1779, |
|
"step": 7520 |
|
}, |
|
{ |
|
"epoch": 28.01930501930502, |
|
"grad_norm": 0.02195580303668976, |
|
"learning_rate": 4.650364650364651e-06, |
|
"loss": 0.2855, |
|
"step": 7530 |
|
}, |
|
{ |
|
"epoch": 28.02007722007722, |
|
"grad_norm": 0.017229193821549416, |
|
"learning_rate": 4.641784641784642e-06, |
|
"loss": 0.0122, |
|
"step": 7540 |
|
}, |
|
{ |
|
"epoch": 28.02007722007722, |
|
"eval_accuracy": 0.6888888888888889, |
|
"eval_loss": 2.029585838317871, |
|
"eval_runtime": 12.7497, |
|
"eval_samples_per_second": 3.529, |
|
"eval_steps_per_second": 1.176, |
|
"step": 7540 |
|
}, |
|
{ |
|
"epoch": 29.0007722007722, |
|
"grad_norm": 0.01609969511628151, |
|
"learning_rate": 4.633204633204634e-06, |
|
"loss": 0.1734, |
|
"step": 7550 |
|
}, |
|
{ |
|
"epoch": 29.0015444015444, |
|
"grad_norm": 131.36447143554688, |
|
"learning_rate": 4.624624624624625e-06, |
|
"loss": 0.0932, |
|
"step": 7560 |
|
}, |
|
{ |
|
"epoch": 29.0023166023166, |
|
"grad_norm": 0.007553355302661657, |
|
"learning_rate": 4.616044616044616e-06, |
|
"loss": 0.233, |
|
"step": 7570 |
|
}, |
|
{ |
|
"epoch": 29.003088803088804, |
|
"grad_norm": 29.221181869506836, |
|
"learning_rate": 4.607464607464608e-06, |
|
"loss": 0.0419, |
|
"step": 7580 |
|
}, |
|
{ |
|
"epoch": 29.003861003861005, |
|
"grad_norm": 3.5382421016693115, |
|
"learning_rate": 4.598884598884599e-06, |
|
"loss": 0.2079, |
|
"step": 7590 |
|
}, |
|
{ |
|
"epoch": 29.004633204633205, |
|
"grad_norm": 28.57002830505371, |
|
"learning_rate": 4.59030459030459e-06, |
|
"loss": 0.2519, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 29.005405405405405, |
|
"grad_norm": 0.129736989736557, |
|
"learning_rate": 4.581724581724582e-06, |
|
"loss": 0.0076, |
|
"step": 7610 |
|
}, |
|
{ |
|
"epoch": 29.006177606177605, |
|
"grad_norm": 2.590891122817993, |
|
"learning_rate": 4.573144573144573e-06, |
|
"loss": 0.4462, |
|
"step": 7620 |
|
}, |
|
{ |
|
"epoch": 29.006949806949805, |
|
"grad_norm": 0.006880035623908043, |
|
"learning_rate": 4.5645645645645645e-06, |
|
"loss": 0.0013, |
|
"step": 7630 |
|
}, |
|
{ |
|
"epoch": 29.00772200772201, |
|
"grad_norm": 0.006625477224588394, |
|
"learning_rate": 4.5559845559845564e-06, |
|
"loss": 0.6628, |
|
"step": 7640 |
|
}, |
|
{ |
|
"epoch": 29.00849420849421, |
|
"grad_norm": 0.3407871425151825, |
|
"learning_rate": 4.5474045474045475e-06, |
|
"loss": 0.0684, |
|
"step": 7650 |
|
}, |
|
{ |
|
"epoch": 29.00926640926641, |
|
"grad_norm": 0.2093740552663803, |
|
"learning_rate": 4.5388245388245395e-06, |
|
"loss": 0.1559, |
|
"step": 7660 |
|
}, |
|
{ |
|
"epoch": 29.01003861003861, |
|
"grad_norm": 0.5102840662002563, |
|
"learning_rate": 4.530244530244531e-06, |
|
"loss": 0.3277, |
|
"step": 7670 |
|
}, |
|
{ |
|
"epoch": 29.01081081081081, |
|
"grad_norm": 0.02635369263589382, |
|
"learning_rate": 4.5216645216645225e-06, |
|
"loss": 0.2345, |
|
"step": 7680 |
|
}, |
|
{ |
|
"epoch": 29.01158301158301, |
|
"grad_norm": 47.610321044921875, |
|
"learning_rate": 4.513084513084514e-06, |
|
"loss": 0.5266, |
|
"step": 7690 |
|
}, |
|
{ |
|
"epoch": 29.012355212355214, |
|
"grad_norm": 0.03327294811606407, |
|
"learning_rate": 4.504504504504505e-06, |
|
"loss": 0.1346, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 29.013127413127414, |
|
"grad_norm": 0.08322642743587494, |
|
"learning_rate": 4.495924495924497e-06, |
|
"loss": 0.2012, |
|
"step": 7710 |
|
}, |
|
{ |
|
"epoch": 29.013899613899614, |
|
"grad_norm": 1.1271024942398071, |
|
"learning_rate": 4.487344487344488e-06, |
|
"loss": 0.239, |
|
"step": 7720 |
|
}, |
|
{ |
|
"epoch": 29.014671814671814, |
|
"grad_norm": 0.010516617447137833, |
|
"learning_rate": 4.478764478764479e-06, |
|
"loss": 0.3092, |
|
"step": 7730 |
|
}, |
|
{ |
|
"epoch": 29.015444015444015, |
|
"grad_norm": 39.980411529541016, |
|
"learning_rate": 4.470184470184471e-06, |
|
"loss": 0.2006, |
|
"step": 7740 |
|
}, |
|
{ |
|
"epoch": 29.016216216216215, |
|
"grad_norm": 13.700182914733887, |
|
"learning_rate": 4.461604461604462e-06, |
|
"loss": 0.215, |
|
"step": 7750 |
|
}, |
|
{ |
|
"epoch": 29.01698841698842, |
|
"grad_norm": 7.643761157989502, |
|
"learning_rate": 4.453024453024453e-06, |
|
"loss": 0.2452, |
|
"step": 7760 |
|
}, |
|
{ |
|
"epoch": 29.01776061776062, |
|
"grad_norm": 0.015428297221660614, |
|
"learning_rate": 4.444444444444444e-06, |
|
"loss": 0.004, |
|
"step": 7770 |
|
}, |
|
{ |
|
"epoch": 29.01853281853282, |
|
"grad_norm": 0.02670716680586338, |
|
"learning_rate": 4.435864435864436e-06, |
|
"loss": 0.0358, |
|
"step": 7780 |
|
}, |
|
{ |
|
"epoch": 29.01930501930502, |
|
"grad_norm": 0.052240751683712006, |
|
"learning_rate": 4.427284427284428e-06, |
|
"loss": 0.2065, |
|
"step": 7790 |
|
}, |
|
{ |
|
"epoch": 29.02007722007722, |
|
"grad_norm": 0.0033651816193014383, |
|
"learning_rate": 4.418704418704419e-06, |
|
"loss": 0.3014, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 29.02007722007722, |
|
"eval_accuracy": 0.6, |
|
"eval_loss": 2.4233789443969727, |
|
"eval_runtime": 11.8236, |
|
"eval_samples_per_second": 3.806, |
|
"eval_steps_per_second": 1.269, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 30.0007722007722, |
|
"grad_norm": 259.00091552734375, |
|
"learning_rate": 4.41012441012441e-06, |
|
"loss": 0.5366, |
|
"step": 7810 |
|
}, |
|
{ |
|
"epoch": 30.0015444015444, |
|
"grad_norm": 0.02561926282942295, |
|
"learning_rate": 4.401544401544402e-06, |
|
"loss": 0.257, |
|
"step": 7820 |
|
}, |
|
{ |
|
"epoch": 30.0023166023166, |
|
"grad_norm": 6.1988606452941895, |
|
"learning_rate": 4.392964392964393e-06, |
|
"loss": 0.1662, |
|
"step": 7830 |
|
}, |
|
{ |
|
"epoch": 30.003088803088804, |
|
"grad_norm": 37.069583892822266, |
|
"learning_rate": 4.384384384384384e-06, |
|
"loss": 0.3467, |
|
"step": 7840 |
|
}, |
|
{ |
|
"epoch": 30.003861003861005, |
|
"grad_norm": 1.0843921899795532, |
|
"learning_rate": 4.375804375804376e-06, |
|
"loss": 0.0114, |
|
"step": 7850 |
|
}, |
|
{ |
|
"epoch": 30.004633204633205, |
|
"grad_norm": 0.07318487763404846, |
|
"learning_rate": 4.3672243672243675e-06, |
|
"loss": 0.1679, |
|
"step": 7860 |
|
}, |
|
{ |
|
"epoch": 30.005405405405405, |
|
"grad_norm": 7.664827823638916, |
|
"learning_rate": 4.3586443586443586e-06, |
|
"loss": 0.1834, |
|
"step": 7870 |
|
}, |
|
{ |
|
"epoch": 30.006177606177605, |
|
"grad_norm": 84.84963989257812, |
|
"learning_rate": 4.3500643500643505e-06, |
|
"loss": 0.6496, |
|
"step": 7880 |
|
}, |
|
{ |
|
"epoch": 30.006949806949805, |
|
"grad_norm": 0.12111800163984299, |
|
"learning_rate": 4.341484341484342e-06, |
|
"loss": 0.0015, |
|
"step": 7890 |
|
}, |
|
{ |
|
"epoch": 30.00772200772201, |
|
"grad_norm": 0.15672890841960907, |
|
"learning_rate": 4.332904332904333e-06, |
|
"loss": 1.4159, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 30.00849420849421, |
|
"grad_norm": 144.9275665283203, |
|
"learning_rate": 4.324324324324325e-06, |
|
"loss": 0.5313, |
|
"step": 7910 |
|
}, |
|
{ |
|
"epoch": 30.00926640926641, |
|
"grad_norm": 0.24197010695934296, |
|
"learning_rate": 4.315744315744317e-06, |
|
"loss": 0.6925, |
|
"step": 7920 |
|
}, |
|
{ |
|
"epoch": 30.01003861003861, |
|
"grad_norm": 178.34043884277344, |
|
"learning_rate": 4.307164307164308e-06, |
|
"loss": 0.0522, |
|
"step": 7930 |
|
}, |
|
{ |
|
"epoch": 30.01081081081081, |
|
"grad_norm": 0.059441838413476944, |
|
"learning_rate": 4.298584298584299e-06, |
|
"loss": 0.0813, |
|
"step": 7940 |
|
}, |
|
{ |
|
"epoch": 30.01158301158301, |
|
"grad_norm": 0.12464194744825363, |
|
"learning_rate": 4.290004290004291e-06, |
|
"loss": 0.0013, |
|
"step": 7950 |
|
}, |
|
{ |
|
"epoch": 30.012355212355214, |
|
"grad_norm": 0.02124338410794735, |
|
"learning_rate": 4.281424281424282e-06, |
|
"loss": 0.1996, |
|
"step": 7960 |
|
}, |
|
{ |
|
"epoch": 30.013127413127414, |
|
"grad_norm": 0.03584703058004379, |
|
"learning_rate": 4.272844272844273e-06, |
|
"loss": 0.1586, |
|
"step": 7970 |
|
}, |
|
{ |
|
"epoch": 30.013899613899614, |
|
"grad_norm": 0.07277407497167587, |
|
"learning_rate": 4.264264264264265e-06, |
|
"loss": 0.1546, |
|
"step": 7980 |
|
}, |
|
{ |
|
"epoch": 30.014671814671814, |
|
"grad_norm": 0.16193543374538422, |
|
"learning_rate": 4.255684255684256e-06, |
|
"loss": 0.4482, |
|
"step": 7990 |
|
}, |
|
{ |
|
"epoch": 30.015444015444015, |
|
"grad_norm": 0.02647043950855732, |
|
"learning_rate": 4.247104247104247e-06, |
|
"loss": 0.1865, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 30.016216216216215, |
|
"grad_norm": 0.6456356048583984, |
|
"learning_rate": 4.238524238524239e-06, |
|
"loss": 0.1314, |
|
"step": 8010 |
|
}, |
|
{ |
|
"epoch": 30.01698841698842, |
|
"grad_norm": 133.39508056640625, |
|
"learning_rate": 4.22994422994423e-06, |
|
"loss": 0.3308, |
|
"step": 8020 |
|
}, |
|
{ |
|
"epoch": 30.01776061776062, |
|
"grad_norm": 0.02105424925684929, |
|
"learning_rate": 4.221364221364222e-06, |
|
"loss": 0.4071, |
|
"step": 8030 |
|
}, |
|
{ |
|
"epoch": 30.01853281853282, |
|
"grad_norm": 0.94647616147995, |
|
"learning_rate": 4.212784212784213e-06, |
|
"loss": 0.5776, |
|
"step": 8040 |
|
}, |
|
{ |
|
"epoch": 30.01930501930502, |
|
"grad_norm": 4.219485282897949, |
|
"learning_rate": 4.204204204204204e-06, |
|
"loss": 0.2465, |
|
"step": 8050 |
|
}, |
|
{ |
|
"epoch": 30.02007722007722, |
|
"grad_norm": 0.12528596818447113, |
|
"learning_rate": 4.195624195624196e-06, |
|
"loss": 0.3567, |
|
"step": 8060 |
|
}, |
|
{ |
|
"epoch": 30.02007722007722, |
|
"eval_accuracy": 0.7111111111111111, |
|
"eval_loss": 1.7570387125015259, |
|
"eval_runtime": 11.787, |
|
"eval_samples_per_second": 3.818, |
|
"eval_steps_per_second": 1.273, |
|
"step": 8060 |
|
}, |
|
{ |
|
"epoch": 31.0007722007722, |
|
"grad_norm": 0.003974312916398048, |
|
"learning_rate": 4.187044187044187e-06, |
|
"loss": 0.1881, |
|
"step": 8070 |
|
}, |
|
{ |
|
"epoch": 31.0015444015444, |
|
"grad_norm": 0.21857279539108276, |
|
"learning_rate": 4.1784641784641785e-06, |
|
"loss": 0.0016, |
|
"step": 8080 |
|
}, |
|
{ |
|
"epoch": 31.0023166023166, |
|
"grad_norm": 0.013827860355377197, |
|
"learning_rate": 4.16988416988417e-06, |
|
"loss": 0.1681, |
|
"step": 8090 |
|
}, |
|
{ |
|
"epoch": 31.003088803088804, |
|
"grad_norm": 0.16716662049293518, |
|
"learning_rate": 4.1613041613041615e-06, |
|
"loss": 0.4215, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 31.003861003861005, |
|
"grad_norm": 58.75962448120117, |
|
"learning_rate": 4.152724152724153e-06, |
|
"loss": 0.03, |
|
"step": 8110 |
|
}, |
|
{ |
|
"epoch": 31.004633204633205, |
|
"grad_norm": 70.25969696044922, |
|
"learning_rate": 4.1441441441441446e-06, |
|
"loss": 0.1636, |
|
"step": 8120 |
|
}, |
|
{ |
|
"epoch": 31.005405405405405, |
|
"grad_norm": 0.07677017897367477, |
|
"learning_rate": 4.135564135564136e-06, |
|
"loss": 0.2356, |
|
"step": 8130 |
|
}, |
|
{ |
|
"epoch": 31.006177606177605, |
|
"grad_norm": 0.009610922075808048, |
|
"learning_rate": 4.126984126984127e-06, |
|
"loss": 0.0007, |
|
"step": 8140 |
|
}, |
|
{ |
|
"epoch": 31.006949806949805, |
|
"grad_norm": 0.0572146512567997, |
|
"learning_rate": 4.118404118404119e-06, |
|
"loss": 0.3287, |
|
"step": 8150 |
|
}, |
|
{ |
|
"epoch": 31.00772200772201, |
|
"grad_norm": 0.07425564527511597, |
|
"learning_rate": 4.109824109824111e-06, |
|
"loss": 0.0168, |
|
"step": 8160 |
|
}, |
|
{ |
|
"epoch": 31.00849420849421, |
|
"grad_norm": 0.026679538190364838, |
|
"learning_rate": 4.101244101244102e-06, |
|
"loss": 0.1668, |
|
"step": 8170 |
|
}, |
|
{ |
|
"epoch": 31.00926640926641, |
|
"grad_norm": 0.0018737409263849258, |
|
"learning_rate": 4.092664092664093e-06, |
|
"loss": 0.0066, |
|
"step": 8180 |
|
}, |
|
{ |
|
"epoch": 31.01003861003861, |
|
"grad_norm": 396.3301696777344, |
|
"learning_rate": 4.084084084084085e-06, |
|
"loss": 0.3928, |
|
"step": 8190 |
|
}, |
|
{ |
|
"epoch": 31.01081081081081, |
|
"grad_norm": 0.6448129415512085, |
|
"learning_rate": 4.075504075504076e-06, |
|
"loss": 0.2184, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 31.01158301158301, |
|
"grad_norm": 0.034431375563144684, |
|
"learning_rate": 4.066924066924067e-06, |
|
"loss": 0.0284, |
|
"step": 8210 |
|
}, |
|
{ |
|
"epoch": 31.012355212355214, |
|
"grad_norm": 0.013946445658802986, |
|
"learning_rate": 4.058344058344059e-06, |
|
"loss": 0.001, |
|
"step": 8220 |
|
}, |
|
{ |
|
"epoch": 31.013127413127414, |
|
"grad_norm": 0.015018833801150322, |
|
"learning_rate": 4.04976404976405e-06, |
|
"loss": 0.0798, |
|
"step": 8230 |
|
}, |
|
{ |
|
"epoch": 31.013899613899614, |
|
"grad_norm": 0.08934608846902847, |
|
"learning_rate": 4.041184041184041e-06, |
|
"loss": 0.0119, |
|
"step": 8240 |
|
}, |
|
{ |
|
"epoch": 31.014671814671814, |
|
"grad_norm": 0.25796929001808167, |
|
"learning_rate": 4.032604032604033e-06, |
|
"loss": 0.1963, |
|
"step": 8250 |
|
}, |
|
{ |
|
"epoch": 31.015444015444015, |
|
"grad_norm": 0.0014374108286574483, |
|
"learning_rate": 4.024024024024024e-06, |
|
"loss": 0.2168, |
|
"step": 8260 |
|
}, |
|
{ |
|
"epoch": 31.016216216216215, |
|
"grad_norm": 0.03616923466324806, |
|
"learning_rate": 4.015444015444015e-06, |
|
"loss": 0.0099, |
|
"step": 8270 |
|
}, |
|
{ |
|
"epoch": 31.01698841698842, |
|
"grad_norm": 0.26436764001846313, |
|
"learning_rate": 4.006864006864007e-06, |
|
"loss": 0.2602, |
|
"step": 8280 |
|
}, |
|
{ |
|
"epoch": 31.01776061776062, |
|
"grad_norm": 199.53219604492188, |
|
"learning_rate": 3.998283998283999e-06, |
|
"loss": 0.6799, |
|
"step": 8290 |
|
}, |
|
{ |
|
"epoch": 31.01853281853282, |
|
"grad_norm": 0.08120285719633102, |
|
"learning_rate": 3.98970398970399e-06, |
|
"loss": 0.4377, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 31.01930501930502, |
|
"grad_norm": 0.0741524025797844, |
|
"learning_rate": 3.9811239811239814e-06, |
|
"loss": 0.3345, |
|
"step": 8310 |
|
}, |
|
{ |
|
"epoch": 31.02007722007722, |
|
"grad_norm": 0.04211797937750816, |
|
"learning_rate": 3.9725439725439725e-06, |
|
"loss": 0.0334, |
|
"step": 8320 |
|
}, |
|
{ |
|
"epoch": 31.02007722007722, |
|
"eval_accuracy": 0.7111111111111111, |
|
"eval_loss": 2.034332513809204, |
|
"eval_runtime": 12.4294, |
|
"eval_samples_per_second": 3.62, |
|
"eval_steps_per_second": 1.207, |
|
"step": 8320 |
|
}, |
|
{ |
|
"epoch": 32.000772200772204, |
|
"grad_norm": 0.12727074325084686, |
|
"learning_rate": 3.9639639639639645e-06, |
|
"loss": 0.0861, |
|
"step": 8330 |
|
}, |
|
{ |
|
"epoch": 32.0015444015444, |
|
"grad_norm": 0.040023840963840485, |
|
"learning_rate": 3.955383955383956e-06, |
|
"loss": 0.0052, |
|
"step": 8340 |
|
}, |
|
{ |
|
"epoch": 32.002316602316604, |
|
"grad_norm": 0.040361110121011734, |
|
"learning_rate": 3.946803946803947e-06, |
|
"loss": 0.3076, |
|
"step": 8350 |
|
}, |
|
{ |
|
"epoch": 32.0030888030888, |
|
"grad_norm": 0.04353674501180649, |
|
"learning_rate": 3.938223938223939e-06, |
|
"loss": 0.0807, |
|
"step": 8360 |
|
}, |
|
{ |
|
"epoch": 32.003861003861005, |
|
"grad_norm": 0.010240713134407997, |
|
"learning_rate": 3.92964392964393e-06, |
|
"loss": 0.2488, |
|
"step": 8370 |
|
}, |
|
{ |
|
"epoch": 32.0046332046332, |
|
"grad_norm": 0.0010769086657091975, |
|
"learning_rate": 3.921063921063921e-06, |
|
"loss": 0.1591, |
|
"step": 8380 |
|
}, |
|
{ |
|
"epoch": 32.005405405405405, |
|
"grad_norm": 0.037674229592084885, |
|
"learning_rate": 3.912483912483913e-06, |
|
"loss": 0.0437, |
|
"step": 8390 |
|
}, |
|
{ |
|
"epoch": 32.00617760617761, |
|
"grad_norm": 0.005637090187519789, |
|
"learning_rate": 3.903903903903904e-06, |
|
"loss": 0.0042, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 32.006949806949805, |
|
"grad_norm": 0.07691396027803421, |
|
"learning_rate": 3.895323895323896e-06, |
|
"loss": 0.763, |
|
"step": 8410 |
|
}, |
|
{ |
|
"epoch": 32.00772200772201, |
|
"grad_norm": 0.07949383556842804, |
|
"learning_rate": 3.886743886743887e-06, |
|
"loss": 0.006, |
|
"step": 8420 |
|
}, |
|
{ |
|
"epoch": 32.008494208494206, |
|
"grad_norm": 0.019564570859074593, |
|
"learning_rate": 3.878163878163879e-06, |
|
"loss": 0.0007, |
|
"step": 8430 |
|
}, |
|
{ |
|
"epoch": 32.00926640926641, |
|
"grad_norm": 0.03286294266581535, |
|
"learning_rate": 3.86958386958387e-06, |
|
"loss": 0.2045, |
|
"step": 8440 |
|
}, |
|
{ |
|
"epoch": 32.01003861003861, |
|
"grad_norm": 0.004159833304584026, |
|
"learning_rate": 3.861003861003861e-06, |
|
"loss": 0.2489, |
|
"step": 8450 |
|
}, |
|
{ |
|
"epoch": 32.01081081081081, |
|
"grad_norm": 0.001372206723317504, |
|
"learning_rate": 3.852423852423853e-06, |
|
"loss": 0.0011, |
|
"step": 8460 |
|
}, |
|
{ |
|
"epoch": 32.011583011583014, |
|
"grad_norm": 0.06818258762359619, |
|
"learning_rate": 3.843843843843844e-06, |
|
"loss": 0.0446, |
|
"step": 8470 |
|
}, |
|
{ |
|
"epoch": 32.01235521235521, |
|
"grad_norm": 0.010435489006340504, |
|
"learning_rate": 3.835263835263835e-06, |
|
"loss": 0.0007, |
|
"step": 8480 |
|
}, |
|
{ |
|
"epoch": 32.013127413127414, |
|
"grad_norm": 0.09285730123519897, |
|
"learning_rate": 3.826683826683827e-06, |
|
"loss": 0.0655, |
|
"step": 8490 |
|
}, |
|
{ |
|
"epoch": 32.01389961389961, |
|
"grad_norm": 0.007980815134942532, |
|
"learning_rate": 3.818103818103818e-06, |
|
"loss": 0.2478, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 32.014671814671814, |
|
"grad_norm": 268.385009765625, |
|
"learning_rate": 3.80952380952381e-06, |
|
"loss": 0.1007, |
|
"step": 8510 |
|
}, |
|
{ |
|
"epoch": 32.01544401544402, |
|
"grad_norm": 4.248262882232666, |
|
"learning_rate": 3.800943800943801e-06, |
|
"loss": 0.29, |
|
"step": 8520 |
|
}, |
|
{ |
|
"epoch": 32.016216216216215, |
|
"grad_norm": 0.01907108724117279, |
|
"learning_rate": 3.792363792363793e-06, |
|
"loss": 0.0009, |
|
"step": 8530 |
|
}, |
|
{ |
|
"epoch": 32.01698841698842, |
|
"grad_norm": 5.199569225311279, |
|
"learning_rate": 3.7837837837837844e-06, |
|
"loss": 0.4102, |
|
"step": 8540 |
|
}, |
|
{ |
|
"epoch": 32.017760617760615, |
|
"grad_norm": 0.012655259110033512, |
|
"learning_rate": 3.7752037752037755e-06, |
|
"loss": 0.3693, |
|
"step": 8550 |
|
}, |
|
{ |
|
"epoch": 32.01853281853282, |
|
"grad_norm": 0.011323108337819576, |
|
"learning_rate": 3.766623766623767e-06, |
|
"loss": 0.0059, |
|
"step": 8560 |
|
}, |
|
{ |
|
"epoch": 32.01930501930502, |
|
"grad_norm": 0.11602991819381714, |
|
"learning_rate": 3.7580437580437585e-06, |
|
"loss": 0.267, |
|
"step": 8570 |
|
}, |
|
{ |
|
"epoch": 32.02007722007722, |
|
"grad_norm": 0.0008211358217522502, |
|
"learning_rate": 3.7494637494637496e-06, |
|
"loss": 0.0043, |
|
"step": 8580 |
|
}, |
|
{ |
|
"epoch": 32.02007722007722, |
|
"eval_accuracy": 0.7333333333333333, |
|
"eval_loss": 1.8095158338546753, |
|
"eval_runtime": 11.6106, |
|
"eval_samples_per_second": 3.876, |
|
"eval_steps_per_second": 1.292, |
|
"step": 8580 |
|
}, |
|
{ |
|
"epoch": 33.000772200772204, |
|
"grad_norm": 15.456527709960938, |
|
"learning_rate": 3.740883740883741e-06, |
|
"loss": 0.8774, |
|
"step": 8590 |
|
}, |
|
{ |
|
"epoch": 33.0015444015444, |
|
"grad_norm": 0.003137608291581273, |
|
"learning_rate": 3.7323037323037327e-06, |
|
"loss": 0.0006, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 33.002316602316604, |
|
"grad_norm": 0.005284165497869253, |
|
"learning_rate": 3.723723723723724e-06, |
|
"loss": 0.0122, |
|
"step": 8610 |
|
}, |
|
{ |
|
"epoch": 33.0030888030888, |
|
"grad_norm": 0.0019698645919561386, |
|
"learning_rate": 3.7151437151437153e-06, |
|
"loss": 0.4956, |
|
"step": 8620 |
|
}, |
|
{ |
|
"epoch": 33.003861003861005, |
|
"grad_norm": 0.050644420087337494, |
|
"learning_rate": 3.706563706563707e-06, |
|
"loss": 0.0333, |
|
"step": 8630 |
|
}, |
|
{ |
|
"epoch": 33.0046332046332, |
|
"grad_norm": 0.004142272751778364, |
|
"learning_rate": 3.697983697983698e-06, |
|
"loss": 0.0269, |
|
"step": 8640 |
|
}, |
|
{ |
|
"epoch": 33.005405405405405, |
|
"grad_norm": 0.009355615824460983, |
|
"learning_rate": 3.68940368940369e-06, |
|
"loss": 0.4925, |
|
"step": 8650 |
|
}, |
|
{ |
|
"epoch": 33.00617760617761, |
|
"grad_norm": 0.29982513189315796, |
|
"learning_rate": 3.6808236808236814e-06, |
|
"loss": 0.0305, |
|
"step": 8660 |
|
}, |
|
{ |
|
"epoch": 33.006949806949805, |
|
"grad_norm": 0.11938207596540451, |
|
"learning_rate": 3.6722436722436725e-06, |
|
"loss": 0.1785, |
|
"step": 8670 |
|
}, |
|
{ |
|
"epoch": 33.00772200772201, |
|
"grad_norm": 0.45936429500579834, |
|
"learning_rate": 3.663663663663664e-06, |
|
"loss": 0.1534, |
|
"step": 8680 |
|
}, |
|
{ |
|
"epoch": 33.008494208494206, |
|
"grad_norm": 0.004370039328932762, |
|
"learning_rate": 3.6550836550836556e-06, |
|
"loss": 0.259, |
|
"step": 8690 |
|
}, |
|
{ |
|
"epoch": 33.00926640926641, |
|
"grad_norm": 0.05320889130234718, |
|
"learning_rate": 3.6465036465036467e-06, |
|
"loss": 0.0851, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 33.01003861003861, |
|
"grad_norm": 0.026938198134303093, |
|
"learning_rate": 3.637923637923638e-06, |
|
"loss": 0.4497, |
|
"step": 8710 |
|
}, |
|
{ |
|
"epoch": 33.01081081081081, |
|
"grad_norm": 0.06186371669173241, |
|
"learning_rate": 3.6293436293436297e-06, |
|
"loss": 0.0011, |
|
"step": 8720 |
|
}, |
|
{ |
|
"epoch": 33.011583011583014, |
|
"grad_norm": 0.11341995745897293, |
|
"learning_rate": 3.620763620763621e-06, |
|
"loss": 0.1477, |
|
"step": 8730 |
|
}, |
|
{ |
|
"epoch": 33.01235521235521, |
|
"grad_norm": 0.019785890355706215, |
|
"learning_rate": 3.6121836121836124e-06, |
|
"loss": 0.0007, |
|
"step": 8740 |
|
}, |
|
{ |
|
"epoch": 33.013127413127414, |
|
"grad_norm": 0.03372610732913017, |
|
"learning_rate": 3.603603603603604e-06, |
|
"loss": 0.0823, |
|
"step": 8750 |
|
}, |
|
{ |
|
"epoch": 33.01389961389961, |
|
"grad_norm": 0.011712102219462395, |
|
"learning_rate": 3.595023595023595e-06, |
|
"loss": 0.0005, |
|
"step": 8760 |
|
}, |
|
{ |
|
"epoch": 33.014671814671814, |
|
"grad_norm": 0.004663593135774136, |
|
"learning_rate": 3.5864435864435865e-06, |
|
"loss": 0.0046, |
|
"step": 8770 |
|
}, |
|
{ |
|
"epoch": 33.01544401544402, |
|
"grad_norm": 0.007854433730244637, |
|
"learning_rate": 3.5778635778635785e-06, |
|
"loss": 0.2609, |
|
"step": 8780 |
|
}, |
|
{ |
|
"epoch": 33.016216216216215, |
|
"grad_norm": 350.70220947265625, |
|
"learning_rate": 3.56928356928357e-06, |
|
"loss": 0.3562, |
|
"step": 8790 |
|
}, |
|
{ |
|
"epoch": 33.01698841698842, |
|
"grad_norm": 0.017881860956549644, |
|
"learning_rate": 3.560703560703561e-06, |
|
"loss": 0.4138, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 33.017760617760615, |
|
"grad_norm": 0.02991180308163166, |
|
"learning_rate": 3.5521235521235526e-06, |
|
"loss": 0.0012, |
|
"step": 8810 |
|
}, |
|
{ |
|
"epoch": 33.01853281853282, |
|
"grad_norm": 0.011691785417497158, |
|
"learning_rate": 3.5435435435435437e-06, |
|
"loss": 0.0353, |
|
"step": 8820 |
|
}, |
|
{ |
|
"epoch": 33.01930501930502, |
|
"grad_norm": 0.011075496673583984, |
|
"learning_rate": 3.5349635349635352e-06, |
|
"loss": 0.0004, |
|
"step": 8830 |
|
}, |
|
{ |
|
"epoch": 33.02007722007722, |
|
"grad_norm": 0.0012854086235165596, |
|
"learning_rate": 3.5263835263835268e-06, |
|
"loss": 0.0119, |
|
"step": 8840 |
|
}, |
|
{ |
|
"epoch": 33.02007722007722, |
|
"eval_accuracy": 0.7555555555555555, |
|
"eval_loss": 1.6489903926849365, |
|
"eval_runtime": 11.5598, |
|
"eval_samples_per_second": 3.893, |
|
"eval_steps_per_second": 1.298, |
|
"step": 8840 |
|
}, |
|
{ |
|
"epoch": 34.000772200772204, |
|
"grad_norm": 0.006392328068614006, |
|
"learning_rate": 3.517803517803518e-06, |
|
"loss": 0.0007, |
|
"step": 8850 |
|
}, |
|
{ |
|
"epoch": 34.0015444015444, |
|
"grad_norm": 0.005775567144155502, |
|
"learning_rate": 3.5092235092235094e-06, |
|
"loss": 0.669, |
|
"step": 8860 |
|
}, |
|
{ |
|
"epoch": 34.002316602316604, |
|
"grad_norm": 23.32708168029785, |
|
"learning_rate": 3.500643500643501e-06, |
|
"loss": 0.5035, |
|
"step": 8870 |
|
}, |
|
{ |
|
"epoch": 34.0030888030888, |
|
"grad_norm": 0.010441070422530174, |
|
"learning_rate": 3.492063492063492e-06, |
|
"loss": 0.5568, |
|
"step": 8880 |
|
}, |
|
{ |
|
"epoch": 34.003861003861005, |
|
"grad_norm": 0.021087242290377617, |
|
"learning_rate": 3.4834834834834835e-06, |
|
"loss": 0.0004, |
|
"step": 8890 |
|
}, |
|
{ |
|
"epoch": 34.0046332046332, |
|
"grad_norm": 0.017975032329559326, |
|
"learning_rate": 3.4749034749034755e-06, |
|
"loss": 0.0058, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 34.005405405405405, |
|
"grad_norm": 0.027708925306797028, |
|
"learning_rate": 3.466323466323467e-06, |
|
"loss": 0.115, |
|
"step": 8910 |
|
}, |
|
{ |
|
"epoch": 34.00617760617761, |
|
"grad_norm": 2.0851829051971436, |
|
"learning_rate": 3.457743457743458e-06, |
|
"loss": 0.2488, |
|
"step": 8920 |
|
}, |
|
{ |
|
"epoch": 34.006949806949805, |
|
"grad_norm": 0.029099296778440475, |
|
"learning_rate": 3.4491634491634496e-06, |
|
"loss": 0.2251, |
|
"step": 8930 |
|
}, |
|
{ |
|
"epoch": 34.00772200772201, |
|
"grad_norm": 0.021991893649101257, |
|
"learning_rate": 3.440583440583441e-06, |
|
"loss": 0.1571, |
|
"step": 8940 |
|
}, |
|
{ |
|
"epoch": 34.008494208494206, |
|
"grad_norm": 0.06672017276287079, |
|
"learning_rate": 3.4320034320034323e-06, |
|
"loss": 0.0018, |
|
"step": 8950 |
|
}, |
|
{ |
|
"epoch": 34.00926640926641, |
|
"grad_norm": 0.015447171404957771, |
|
"learning_rate": 3.423423423423424e-06, |
|
"loss": 0.1852, |
|
"step": 8960 |
|
}, |
|
{ |
|
"epoch": 34.01003861003861, |
|
"grad_norm": 0.023431655019521713, |
|
"learning_rate": 3.414843414843415e-06, |
|
"loss": 0.3665, |
|
"step": 8970 |
|
}, |
|
{ |
|
"epoch": 34.01081081081081, |
|
"grad_norm": 0.014014458283782005, |
|
"learning_rate": 3.4062634062634064e-06, |
|
"loss": 0.1518, |
|
"step": 8980 |
|
}, |
|
{ |
|
"epoch": 34.011583011583014, |
|
"grad_norm": 0.015809480100870132, |
|
"learning_rate": 3.397683397683398e-06, |
|
"loss": 0.0686, |
|
"step": 8990 |
|
}, |
|
{ |
|
"epoch": 34.01235521235521, |
|
"grad_norm": 0.018740158528089523, |
|
"learning_rate": 3.389103389103389e-06, |
|
"loss": 0.0008, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 34.013127413127414, |
|
"grad_norm": 71.28443908691406, |
|
"learning_rate": 3.3805233805233806e-06, |
|
"loss": 0.2821, |
|
"step": 9010 |
|
}, |
|
{ |
|
"epoch": 34.01389961389961, |
|
"grad_norm": 0.03172178566455841, |
|
"learning_rate": 3.371943371943372e-06, |
|
"loss": 0.1551, |
|
"step": 9020 |
|
}, |
|
{ |
|
"epoch": 34.014671814671814, |
|
"grad_norm": 0.09424544870853424, |
|
"learning_rate": 3.363363363363364e-06, |
|
"loss": 0.001, |
|
"step": 9030 |
|
}, |
|
{ |
|
"epoch": 34.01544401544402, |
|
"grad_norm": 0.0008962831343524158, |
|
"learning_rate": 3.354783354783355e-06, |
|
"loss": 0.1171, |
|
"step": 9040 |
|
}, |
|
{ |
|
"epoch": 34.016216216216215, |
|
"grad_norm": 172.4552764892578, |
|
"learning_rate": 3.3462033462033467e-06, |
|
"loss": 0.1849, |
|
"step": 9050 |
|
}, |
|
{ |
|
"epoch": 34.01698841698842, |
|
"grad_norm": 0.0054281530901789665, |
|
"learning_rate": 3.337623337623338e-06, |
|
"loss": 0.1935, |
|
"step": 9060 |
|
}, |
|
{ |
|
"epoch": 34.017760617760615, |
|
"grad_norm": 39.82855987548828, |
|
"learning_rate": 3.3290433290433293e-06, |
|
"loss": 0.3482, |
|
"step": 9070 |
|
}, |
|
{ |
|
"epoch": 34.01853281853282, |
|
"grad_norm": 0.02182273380458355, |
|
"learning_rate": 3.320463320463321e-06, |
|
"loss": 0.0011, |
|
"step": 9080 |
|
}, |
|
{ |
|
"epoch": 34.01930501930502, |
|
"grad_norm": 0.0020787569228559732, |
|
"learning_rate": 3.311883311883312e-06, |
|
"loss": 0.4725, |
|
"step": 9090 |
|
}, |
|
{ |
|
"epoch": 34.02007722007722, |
|
"grad_norm": 0.002378948265686631, |
|
"learning_rate": 3.3033033033033035e-06, |
|
"loss": 0.7503, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 34.02007722007722, |
|
"eval_accuracy": 0.6888888888888889, |
|
"eval_loss": 1.9143760204315186, |
|
"eval_runtime": 11.5549, |
|
"eval_samples_per_second": 3.894, |
|
"eval_steps_per_second": 1.298, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 35.000772200772204, |
|
"grad_norm": 0.024031909182667732, |
|
"learning_rate": 3.294723294723295e-06, |
|
"loss": 0.2538, |
|
"step": 9110 |
|
}, |
|
{ |
|
"epoch": 35.0015444015444, |
|
"grad_norm": 0.2591168284416199, |
|
"learning_rate": 3.286143286143286e-06, |
|
"loss": 0.0076, |
|
"step": 9120 |
|
}, |
|
{ |
|
"epoch": 35.002316602316604, |
|
"grad_norm": 0.015992112457752228, |
|
"learning_rate": 3.2775632775632776e-06, |
|
"loss": 0.2147, |
|
"step": 9130 |
|
}, |
|
{ |
|
"epoch": 35.0030888030888, |
|
"grad_norm": 0.01742151565849781, |
|
"learning_rate": 3.268983268983269e-06, |
|
"loss": 0.2486, |
|
"step": 9140 |
|
}, |
|
{ |
|
"epoch": 35.003861003861005, |
|
"grad_norm": 0.025499412789940834, |
|
"learning_rate": 3.260403260403261e-06, |
|
"loss": 0.0021, |
|
"step": 9150 |
|
}, |
|
{ |
|
"epoch": 35.0046332046332, |
|
"grad_norm": 0.019917473196983337, |
|
"learning_rate": 3.251823251823252e-06, |
|
"loss": 0.1993, |
|
"step": 9160 |
|
}, |
|
{ |
|
"epoch": 35.005405405405405, |
|
"grad_norm": 0.07677433639764786, |
|
"learning_rate": 3.2432432432432437e-06, |
|
"loss": 0.0006, |
|
"step": 9170 |
|
}, |
|
{ |
|
"epoch": 35.00617760617761, |
|
"grad_norm": 3.4880127906799316, |
|
"learning_rate": 3.2346632346632352e-06, |
|
"loss": 0.0735, |
|
"step": 9180 |
|
}, |
|
{ |
|
"epoch": 35.006949806949805, |
|
"grad_norm": 0.0009284795960411429, |
|
"learning_rate": 3.2260832260832263e-06, |
|
"loss": 0.2833, |
|
"step": 9190 |
|
}, |
|
{ |
|
"epoch": 35.00772200772201, |
|
"grad_norm": 0.032943326979875565, |
|
"learning_rate": 3.217503217503218e-06, |
|
"loss": 0.2312, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 35.008494208494206, |
|
"grad_norm": 30.327800750732422, |
|
"learning_rate": 3.2089232089232094e-06, |
|
"loss": 0.4013, |
|
"step": 9210 |
|
}, |
|
{ |
|
"epoch": 35.00926640926641, |
|
"grad_norm": 25.810949325561523, |
|
"learning_rate": 3.2003432003432005e-06, |
|
"loss": 0.0023, |
|
"step": 9220 |
|
}, |
|
{ |
|
"epoch": 35.01003861003861, |
|
"grad_norm": 0.04931256175041199, |
|
"learning_rate": 3.191763191763192e-06, |
|
"loss": 0.1492, |
|
"step": 9230 |
|
}, |
|
{ |
|
"epoch": 35.01081081081081, |
|
"grad_norm": 0.13327378034591675, |
|
"learning_rate": 3.183183183183183e-06, |
|
"loss": 0.1499, |
|
"step": 9240 |
|
}, |
|
{ |
|
"epoch": 35.011583011583014, |
|
"grad_norm": 0.0011244782945141196, |
|
"learning_rate": 3.1746031746031746e-06, |
|
"loss": 0.002, |
|
"step": 9250 |
|
}, |
|
{ |
|
"epoch": 35.01235521235521, |
|
"grad_norm": 0.0651550218462944, |
|
"learning_rate": 3.166023166023166e-06, |
|
"loss": 0.1949, |
|
"step": 9260 |
|
}, |
|
{ |
|
"epoch": 35.013127413127414, |
|
"grad_norm": 0.05204087868332863, |
|
"learning_rate": 3.1574431574431573e-06, |
|
"loss": 0.2826, |
|
"step": 9270 |
|
}, |
|
{ |
|
"epoch": 35.01389961389961, |
|
"grad_norm": 0.3644317388534546, |
|
"learning_rate": 3.148863148863149e-06, |
|
"loss": 0.0004, |
|
"step": 9280 |
|
}, |
|
{ |
|
"epoch": 35.014671814671814, |
|
"grad_norm": 0.010621222667396069, |
|
"learning_rate": 3.1402831402831407e-06, |
|
"loss": 0.0004, |
|
"step": 9290 |
|
}, |
|
{ |
|
"epoch": 35.01544401544402, |
|
"grad_norm": 0.006138904020190239, |
|
"learning_rate": 3.1317031317031323e-06, |
|
"loss": 0.2331, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 35.016216216216215, |
|
"grad_norm": 0.00993597973138094, |
|
"learning_rate": 3.1231231231231234e-06, |
|
"loss": 0.0003, |
|
"step": 9310 |
|
}, |
|
{ |
|
"epoch": 35.01698841698842, |
|
"grad_norm": 0.0026269257068634033, |
|
"learning_rate": 3.114543114543115e-06, |
|
"loss": 0.0004, |
|
"step": 9320 |
|
}, |
|
{ |
|
"epoch": 35.017760617760615, |
|
"grad_norm": 0.018364176154136658, |
|
"learning_rate": 3.1059631059631064e-06, |
|
"loss": 0.2907, |
|
"step": 9330 |
|
}, |
|
{ |
|
"epoch": 35.01853281853282, |
|
"grad_norm": 0.15101493895053864, |
|
"learning_rate": 3.0973830973830975e-06, |
|
"loss": 0.3308, |
|
"step": 9340 |
|
}, |
|
{ |
|
"epoch": 35.01930501930502, |
|
"grad_norm": 31.66868782043457, |
|
"learning_rate": 3.088803088803089e-06, |
|
"loss": 0.4634, |
|
"step": 9350 |
|
}, |
|
{ |
|
"epoch": 35.02007722007722, |
|
"grad_norm": 0.12729884684085846, |
|
"learning_rate": 3.0802230802230806e-06, |
|
"loss": 0.2105, |
|
"step": 9360 |
|
}, |
|
{ |
|
"epoch": 35.02007722007722, |
|
"eval_accuracy": 0.7333333333333333, |
|
"eval_loss": 1.8402609825134277, |
|
"eval_runtime": 12.2345, |
|
"eval_samples_per_second": 3.678, |
|
"eval_steps_per_second": 1.226, |
|
"step": 9360 |
|
}, |
|
{ |
|
"epoch": 36.000772200772204, |
|
"grad_norm": 0.0169172715395689, |
|
"learning_rate": 3.0716430716430717e-06, |
|
"loss": 0.0842, |
|
"step": 9370 |
|
}, |
|
{ |
|
"epoch": 36.0015444015444, |
|
"grad_norm": 228.06277465820312, |
|
"learning_rate": 3.063063063063063e-06, |
|
"loss": 0.3631, |
|
"step": 9380 |
|
}, |
|
{ |
|
"epoch": 36.002316602316604, |
|
"grad_norm": 0.5954656004905701, |
|
"learning_rate": 3.0544830544830543e-06, |
|
"loss": 0.0006, |
|
"step": 9390 |
|
}, |
|
{ |
|
"epoch": 36.0030888030888, |
|
"grad_norm": 0.026740454137325287, |
|
"learning_rate": 3.0459030459030462e-06, |
|
"loss": 0.2786, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 36.003861003861005, |
|
"grad_norm": 0.010428531095385551, |
|
"learning_rate": 3.0373230373230378e-06, |
|
"loss": 0.0411, |
|
"step": 9410 |
|
}, |
|
{ |
|
"epoch": 36.0046332046332, |
|
"grad_norm": 0.29002007842063904, |
|
"learning_rate": 3.0287430287430293e-06, |
|
"loss": 0.0214, |
|
"step": 9420 |
|
}, |
|
{ |
|
"epoch": 36.005405405405405, |
|
"grad_norm": 0.8424767255783081, |
|
"learning_rate": 3.0201630201630204e-06, |
|
"loss": 0.0016, |
|
"step": 9430 |
|
}, |
|
{ |
|
"epoch": 36.00617760617761, |
|
"grad_norm": 0.014938040636479855, |
|
"learning_rate": 3.011583011583012e-06, |
|
"loss": 0.0003, |
|
"step": 9440 |
|
}, |
|
{ |
|
"epoch": 36.006949806949805, |
|
"grad_norm": 0.014550106599926949, |
|
"learning_rate": 3.0030030030030034e-06, |
|
"loss": 0.5971, |
|
"step": 9450 |
|
}, |
|
{ |
|
"epoch": 36.00772200772201, |
|
"grad_norm": 66.45316314697266, |
|
"learning_rate": 2.9944229944229945e-06, |
|
"loss": 0.1367, |
|
"step": 9460 |
|
}, |
|
{ |
|
"epoch": 36.008494208494206, |
|
"grad_norm": 0.00797193218022585, |
|
"learning_rate": 2.985842985842986e-06, |
|
"loss": 0.1833, |
|
"step": 9470 |
|
}, |
|
{ |
|
"epoch": 36.00926640926641, |
|
"grad_norm": 0.021717458963394165, |
|
"learning_rate": 2.9772629772629776e-06, |
|
"loss": 0.0004, |
|
"step": 9480 |
|
}, |
|
{ |
|
"epoch": 36.01003861003861, |
|
"grad_norm": 98.05574035644531, |
|
"learning_rate": 2.9686829686829687e-06, |
|
"loss": 0.2001, |
|
"step": 9490 |
|
}, |
|
{ |
|
"epoch": 36.01081081081081, |
|
"grad_norm": 163.5323486328125, |
|
"learning_rate": 2.9601029601029602e-06, |
|
"loss": 0.1822, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 36.011583011583014, |
|
"grad_norm": 0.009607790037989616, |
|
"learning_rate": 2.9515229515229517e-06, |
|
"loss": 0.0003, |
|
"step": 9510 |
|
}, |
|
{ |
|
"epoch": 36.01235521235521, |
|
"grad_norm": 0.006017378065735102, |
|
"learning_rate": 2.942942942942943e-06, |
|
"loss": 0.0002, |
|
"step": 9520 |
|
}, |
|
{ |
|
"epoch": 36.013127413127414, |
|
"grad_norm": 0.007440345361828804, |
|
"learning_rate": 2.934362934362935e-06, |
|
"loss": 0.0864, |
|
"step": 9530 |
|
}, |
|
{ |
|
"epoch": 36.01389961389961, |
|
"grad_norm": 0.013611230067908764, |
|
"learning_rate": 2.9257829257829263e-06, |
|
"loss": 0.4486, |
|
"step": 9540 |
|
}, |
|
{ |
|
"epoch": 36.014671814671814, |
|
"grad_norm": 0.0026765179354697466, |
|
"learning_rate": 2.9172029172029174e-06, |
|
"loss": 0.0003, |
|
"step": 9550 |
|
}, |
|
{ |
|
"epoch": 36.01544401544402, |
|
"grad_norm": 0.029210234060883522, |
|
"learning_rate": 2.908622908622909e-06, |
|
"loss": 0.0003, |
|
"step": 9560 |
|
}, |
|
{ |
|
"epoch": 36.016216216216215, |
|
"grad_norm": 11.608271598815918, |
|
"learning_rate": 2.9000429000429005e-06, |
|
"loss": 0.7841, |
|
"step": 9570 |
|
}, |
|
{ |
|
"epoch": 36.01698841698842, |
|
"grad_norm": 0.0011675491696223617, |
|
"learning_rate": 2.8914628914628916e-06, |
|
"loss": 0.2966, |
|
"step": 9580 |
|
}, |
|
{ |
|
"epoch": 36.017760617760615, |
|
"grad_norm": 46.25785827636719, |
|
"learning_rate": 2.882882882882883e-06, |
|
"loss": 0.5733, |
|
"step": 9590 |
|
}, |
|
{ |
|
"epoch": 36.01853281853282, |
|
"grad_norm": 0.008772353641688824, |
|
"learning_rate": 2.8743028743028746e-06, |
|
"loss": 0.0949, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 36.01930501930502, |
|
"grad_norm": 3.046584129333496, |
|
"learning_rate": 2.8657228657228657e-06, |
|
"loss": 0.0009, |
|
"step": 9610 |
|
}, |
|
{ |
|
"epoch": 36.02007722007722, |
|
"grad_norm": 0.05296040698885918, |
|
"learning_rate": 2.8571428571428573e-06, |
|
"loss": 0.003, |
|
"step": 9620 |
|
}, |
|
{ |
|
"epoch": 36.02007722007722, |
|
"eval_accuracy": 0.7333333333333333, |
|
"eval_loss": 1.8769656419754028, |
|
"eval_runtime": 12.2555, |
|
"eval_samples_per_second": 3.672, |
|
"eval_steps_per_second": 1.224, |
|
"step": 9620 |
|
}, |
|
{ |
|
"epoch": 37.000772200772204, |
|
"grad_norm": 0.017100481316447258, |
|
"learning_rate": 2.8485628485628488e-06, |
|
"loss": 0.0003, |
|
"step": 9630 |
|
}, |
|
{ |
|
"epoch": 37.0015444015444, |
|
"grad_norm": 0.006559515371918678, |
|
"learning_rate": 2.83998283998284e-06, |
|
"loss": 0.0005, |
|
"step": 9640 |
|
}, |
|
{ |
|
"epoch": 37.002316602316604, |
|
"grad_norm": 0.15823794901371002, |
|
"learning_rate": 2.831402831402832e-06, |
|
"loss": 0.2016, |
|
"step": 9650 |
|
}, |
|
{ |
|
"epoch": 37.0030888030888, |
|
"grad_norm": 0.07216683030128479, |
|
"learning_rate": 2.8228228228228234e-06, |
|
"loss": 0.0141, |
|
"step": 9660 |
|
}, |
|
{ |
|
"epoch": 37.003861003861005, |
|
"grad_norm": 6.186749458312988, |
|
"learning_rate": 2.8142428142428145e-06, |
|
"loss": 0.1256, |
|
"step": 9670 |
|
}, |
|
{ |
|
"epoch": 37.0046332046332, |
|
"grad_norm": 0.005314590875059366, |
|
"learning_rate": 2.805662805662806e-06, |
|
"loss": 0.348, |
|
"step": 9680 |
|
}, |
|
{ |
|
"epoch": 37.005405405405405, |
|
"grad_norm": 5.876717567443848, |
|
"learning_rate": 2.7970827970827975e-06, |
|
"loss": 0.2215, |
|
"step": 9690 |
|
}, |
|
{ |
|
"epoch": 37.00617760617761, |
|
"grad_norm": 0.008445030078291893, |
|
"learning_rate": 2.7885027885027886e-06, |
|
"loss": 0.3307, |
|
"step": 9700 |
|
}, |
|
{ |
|
"epoch": 37.006949806949805, |
|
"grad_norm": 0.07367029786109924, |
|
"learning_rate": 2.77992277992278e-06, |
|
"loss": 0.2974, |
|
"step": 9710 |
|
}, |
|
{ |
|
"epoch": 37.00772200772201, |
|
"grad_norm": 0.016041463240981102, |
|
"learning_rate": 2.7713427713427717e-06, |
|
"loss": 0.3192, |
|
"step": 9720 |
|
}, |
|
{ |
|
"epoch": 37.008494208494206, |
|
"grad_norm": 0.007501265965402126, |
|
"learning_rate": 2.7627627627627628e-06, |
|
"loss": 0.0006, |
|
"step": 9730 |
|
}, |
|
{ |
|
"epoch": 37.00926640926641, |
|
"grad_norm": 0.02961740642786026, |
|
"learning_rate": 2.7541827541827543e-06, |
|
"loss": 0.0025, |
|
"step": 9740 |
|
}, |
|
{ |
|
"epoch": 37.01003861003861, |
|
"grad_norm": 0.039396531879901886, |
|
"learning_rate": 2.745602745602746e-06, |
|
"loss": 0.1664, |
|
"step": 9750 |
|
}, |
|
{ |
|
"epoch": 37.01081081081081, |
|
"grad_norm": 0.0014447583816945553, |
|
"learning_rate": 2.737022737022737e-06, |
|
"loss": 0.0036, |
|
"step": 9760 |
|
}, |
|
{ |
|
"epoch": 37.011583011583014, |
|
"grad_norm": 0.005423523019999266, |
|
"learning_rate": 2.7284427284427284e-06, |
|
"loss": 0.0806, |
|
"step": 9770 |
|
}, |
|
{ |
|
"epoch": 37.01235521235521, |
|
"grad_norm": 0.011083639226853848, |
|
"learning_rate": 2.7198627198627204e-06, |
|
"loss": 0.3024, |
|
"step": 9780 |
|
}, |
|
{ |
|
"epoch": 37.013127413127414, |
|
"grad_norm": 1.4681493043899536, |
|
"learning_rate": 2.7112827112827115e-06, |
|
"loss": 0.0007, |
|
"step": 9790 |
|
}, |
|
{ |
|
"epoch": 37.01389961389961, |
|
"grad_norm": 0.012962309643626213, |
|
"learning_rate": 2.702702702702703e-06, |
|
"loss": 0.0016, |
|
"step": 9800 |
|
}, |
|
{ |
|
"epoch": 37.014671814671814, |
|
"grad_norm": 0.044124990701675415, |
|
"learning_rate": 2.6941226941226945e-06, |
|
"loss": 0.3081, |
|
"step": 9810 |
|
}, |
|
{ |
|
"epoch": 37.01544401544402, |
|
"grad_norm": 0.019632622599601746, |
|
"learning_rate": 2.6855426855426856e-06, |
|
"loss": 0.1146, |
|
"step": 9820 |
|
}, |
|
{ |
|
"epoch": 37.016216216216215, |
|
"grad_norm": 0.005031611770391464, |
|
"learning_rate": 2.676962676962677e-06, |
|
"loss": 0.0003, |
|
"step": 9830 |
|
}, |
|
{ |
|
"epoch": 37.01698841698842, |
|
"grad_norm": 0.004604416433721781, |
|
"learning_rate": 2.6683826683826687e-06, |
|
"loss": 0.3262, |
|
"step": 9840 |
|
}, |
|
{ |
|
"epoch": 37.017760617760615, |
|
"grad_norm": 0.0116556566208601, |
|
"learning_rate": 2.65980265980266e-06, |
|
"loss": 0.0015, |
|
"step": 9850 |
|
}, |
|
{ |
|
"epoch": 37.01853281853282, |
|
"grad_norm": 0.030262470245361328, |
|
"learning_rate": 2.6512226512226513e-06, |
|
"loss": 0.2633, |
|
"step": 9860 |
|
}, |
|
{ |
|
"epoch": 37.01930501930502, |
|
"grad_norm": 0.08174673467874527, |
|
"learning_rate": 2.642642642642643e-06, |
|
"loss": 0.0049, |
|
"step": 9870 |
|
}, |
|
{ |
|
"epoch": 37.02007722007722, |
|
"grad_norm": 0.029679138213396072, |
|
"learning_rate": 2.634062634062634e-06, |
|
"loss": 0.1781, |
|
"step": 9880 |
|
}, |
|
{ |
|
"epoch": 37.02007722007722, |
|
"eval_accuracy": 0.7333333333333333, |
|
"eval_loss": 1.863143801689148, |
|
"eval_runtime": 16.3783, |
|
"eval_samples_per_second": 2.748, |
|
"eval_steps_per_second": 0.916, |
|
"step": 9880 |
|
}, |
|
{ |
|
"epoch": 38.000772200772204, |
|
"grad_norm": 0.009378015995025635, |
|
"learning_rate": 2.6254826254826255e-06, |
|
"loss": 0.0003, |
|
"step": 9890 |
|
}, |
|
{ |
|
"epoch": 38.0015444015444, |
|
"grad_norm": 0.0015269636642187834, |
|
"learning_rate": 2.6169026169026174e-06, |
|
"loss": 0.0004, |
|
"step": 9900 |
|
}, |
|
{ |
|
"epoch": 38.002316602316604, |
|
"grad_norm": 0.012068642303347588, |
|
"learning_rate": 2.608322608322609e-06, |
|
"loss": 0.3438, |
|
"step": 9910 |
|
}, |
|
{ |
|
"epoch": 38.0030888030888, |
|
"grad_norm": 0.030750026926398277, |
|
"learning_rate": 2.5997425997426e-06, |
|
"loss": 0.001, |
|
"step": 9920 |
|
}, |
|
{ |
|
"epoch": 38.003861003861005, |
|
"grad_norm": 0.016554566100239754, |
|
"learning_rate": 2.5911625911625916e-06, |
|
"loss": 0.082, |
|
"step": 9930 |
|
}, |
|
{ |
|
"epoch": 38.0046332046332, |
|
"grad_norm": 0.012171209789812565, |
|
"learning_rate": 2.5825825825825827e-06, |
|
"loss": 0.2353, |
|
"step": 9940 |
|
}, |
|
{ |
|
"epoch": 38.005405405405405, |
|
"grad_norm": 0.10157763212919235, |
|
"learning_rate": 2.574002574002574e-06, |
|
"loss": 0.3692, |
|
"step": 9950 |
|
}, |
|
{ |
|
"epoch": 38.00617760617761, |
|
"grad_norm": 0.008098813705146313, |
|
"learning_rate": 2.5654225654225657e-06, |
|
"loss": 0.2255, |
|
"step": 9960 |
|
}, |
|
{ |
|
"epoch": 38.006949806949805, |
|
"grad_norm": 0.31319352984428406, |
|
"learning_rate": 2.556842556842557e-06, |
|
"loss": 0.0032, |
|
"step": 9970 |
|
}, |
|
{ |
|
"epoch": 38.00772200772201, |
|
"grad_norm": 17.32835578918457, |
|
"learning_rate": 2.5482625482625484e-06, |
|
"loss": 0.0031, |
|
"step": 9980 |
|
}, |
|
{ |
|
"epoch": 38.008494208494206, |
|
"grad_norm": 0.004962326493114233, |
|
"learning_rate": 2.53968253968254e-06, |
|
"loss": 0.0882, |
|
"step": 9990 |
|
}, |
|
{ |
|
"epoch": 38.00926640926641, |
|
"grad_norm": 0.007238659542053938, |
|
"learning_rate": 2.531102531102531e-06, |
|
"loss": 0.1908, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 38.01003861003861, |
|
"grad_norm": 0.003812581766396761, |
|
"learning_rate": 2.5225225225225225e-06, |
|
"loss": 0.0001, |
|
"step": 10010 |
|
}, |
|
{ |
|
"epoch": 38.01081081081081, |
|
"grad_norm": 0.020626194775104523, |
|
"learning_rate": 2.513942513942514e-06, |
|
"loss": 0.2302, |
|
"step": 10020 |
|
}, |
|
{ |
|
"epoch": 38.011583011583014, |
|
"grad_norm": 345.7996826171875, |
|
"learning_rate": 2.505362505362506e-06, |
|
"loss": 0.1319, |
|
"step": 10030 |
|
}, |
|
{ |
|
"epoch": 38.01235521235521, |
|
"grad_norm": 0.0005508098402060568, |
|
"learning_rate": 2.496782496782497e-06, |
|
"loss": 0.0003, |
|
"step": 10040 |
|
}, |
|
{ |
|
"epoch": 38.013127413127414, |
|
"grad_norm": 0.0054133799858391285, |
|
"learning_rate": 2.488202488202488e-06, |
|
"loss": 0.0004, |
|
"step": 10050 |
|
}, |
|
{ |
|
"epoch": 38.01389961389961, |
|
"grad_norm": 0.025228584185242653, |
|
"learning_rate": 2.47962247962248e-06, |
|
"loss": 0.3296, |
|
"step": 10060 |
|
}, |
|
{ |
|
"epoch": 38.014671814671814, |
|
"grad_norm": 0.04835504665970802, |
|
"learning_rate": 2.4710424710424712e-06, |
|
"loss": 0.0011, |
|
"step": 10070 |
|
}, |
|
{ |
|
"epoch": 38.01544401544402, |
|
"grad_norm": 17.836328506469727, |
|
"learning_rate": 2.4624624624624628e-06, |
|
"loss": 0.0019, |
|
"step": 10080 |
|
}, |
|
{ |
|
"epoch": 38.016216216216215, |
|
"grad_norm": 0.018395883962512016, |
|
"learning_rate": 2.453882453882454e-06, |
|
"loss": 0.0002, |
|
"step": 10090 |
|
}, |
|
{ |
|
"epoch": 38.01698841698842, |
|
"grad_norm": 0.01676626317203045, |
|
"learning_rate": 2.4453024453024454e-06, |
|
"loss": 0.1144, |
|
"step": 10100 |
|
}, |
|
{ |
|
"epoch": 38.017760617760615, |
|
"grad_norm": 0.007224744651466608, |
|
"learning_rate": 2.436722436722437e-06, |
|
"loss": 0.252, |
|
"step": 10110 |
|
}, |
|
{ |
|
"epoch": 38.01853281853282, |
|
"grad_norm": 0.006745734717696905, |
|
"learning_rate": 2.4281424281424284e-06, |
|
"loss": 0.0003, |
|
"step": 10120 |
|
}, |
|
{ |
|
"epoch": 38.01930501930502, |
|
"grad_norm": 0.012383128516376019, |
|
"learning_rate": 2.41956241956242e-06, |
|
"loss": 0.0005, |
|
"step": 10130 |
|
}, |
|
{ |
|
"epoch": 38.02007722007722, |
|
"grad_norm": 0.010990212671458721, |
|
"learning_rate": 2.410982410982411e-06, |
|
"loss": 0.4092, |
|
"step": 10140 |
|
}, |
|
{ |
|
"epoch": 38.02007722007722, |
|
"eval_accuracy": 0.7111111111111111, |
|
"eval_loss": 1.9994146823883057, |
|
"eval_runtime": 11.5839, |
|
"eval_samples_per_second": 3.885, |
|
"eval_steps_per_second": 1.295, |
|
"step": 10140 |
|
}, |
|
{ |
|
"epoch": 39.000772200772204, |
|
"grad_norm": 0.0006115052383393049, |
|
"learning_rate": 2.4024024024024026e-06, |
|
"loss": 0.0001, |
|
"step": 10150 |
|
}, |
|
{ |
|
"epoch": 39.0015444015444, |
|
"grad_norm": 0.01732112281024456, |
|
"learning_rate": 2.393822393822394e-06, |
|
"loss": 0.0002, |
|
"step": 10160 |
|
}, |
|
{ |
|
"epoch": 39.002316602316604, |
|
"grad_norm": 388.23370361328125, |
|
"learning_rate": 2.3852423852423852e-06, |
|
"loss": 0.1257, |
|
"step": 10170 |
|
}, |
|
{ |
|
"epoch": 39.0030888030888, |
|
"grad_norm": 0.04425964504480362, |
|
"learning_rate": 2.376662376662377e-06, |
|
"loss": 0.0003, |
|
"step": 10180 |
|
}, |
|
{ |
|
"epoch": 39.003861003861005, |
|
"grad_norm": 0.02979341335594654, |
|
"learning_rate": 2.3680823680823683e-06, |
|
"loss": 0.0002, |
|
"step": 10190 |
|
}, |
|
{ |
|
"epoch": 39.0046332046332, |
|
"grad_norm": 0.03975719213485718, |
|
"learning_rate": 2.35950235950236e-06, |
|
"loss": 0.4852, |
|
"step": 10200 |
|
}, |
|
{ |
|
"epoch": 39.005405405405405, |
|
"grad_norm": 0.005834504030644894, |
|
"learning_rate": 2.3509223509223513e-06, |
|
"loss": 0.0002, |
|
"step": 10210 |
|
}, |
|
{ |
|
"epoch": 39.00617760617761, |
|
"grad_norm": 0.0008963262080214918, |
|
"learning_rate": 2.3423423423423424e-06, |
|
"loss": 0.0004, |
|
"step": 10220 |
|
}, |
|
{ |
|
"epoch": 39.006949806949805, |
|
"grad_norm": 0.2953839600086212, |
|
"learning_rate": 2.333762333762334e-06, |
|
"loss": 0.0002, |
|
"step": 10230 |
|
}, |
|
{ |
|
"epoch": 39.00772200772201, |
|
"grad_norm": 0.005276846699416637, |
|
"learning_rate": 2.3251823251823255e-06, |
|
"loss": 0.0013, |
|
"step": 10240 |
|
}, |
|
{ |
|
"epoch": 39.008494208494206, |
|
"grad_norm": 0.00223553623072803, |
|
"learning_rate": 2.316602316602317e-06, |
|
"loss": 0.2671, |
|
"step": 10250 |
|
}, |
|
{ |
|
"epoch": 39.00926640926641, |
|
"grad_norm": 0.006985859479755163, |
|
"learning_rate": 2.308022308022308e-06, |
|
"loss": 0.3094, |
|
"step": 10260 |
|
}, |
|
{ |
|
"epoch": 39.01003861003861, |
|
"grad_norm": 0.007334560621529818, |
|
"learning_rate": 2.2994422994422996e-06, |
|
"loss": 0.1669, |
|
"step": 10270 |
|
}, |
|
{ |
|
"epoch": 39.01081081081081, |
|
"grad_norm": 0.02189444750547409, |
|
"learning_rate": 2.290862290862291e-06, |
|
"loss": 0.0002, |
|
"step": 10280 |
|
}, |
|
{ |
|
"epoch": 39.011583011583014, |
|
"grad_norm": 0.002752017229795456, |
|
"learning_rate": 2.2822822822822822e-06, |
|
"loss": 0.0001, |
|
"step": 10290 |
|
}, |
|
{ |
|
"epoch": 39.01235521235521, |
|
"grad_norm": 0.003848872147500515, |
|
"learning_rate": 2.2737022737022738e-06, |
|
"loss": 0.1079, |
|
"step": 10300 |
|
}, |
|
{ |
|
"epoch": 39.013127413127414, |
|
"grad_norm": 0.022609086707234383, |
|
"learning_rate": 2.2651222651222653e-06, |
|
"loss": 0.0002, |
|
"step": 10310 |
|
}, |
|
{ |
|
"epoch": 39.01389961389961, |
|
"grad_norm": 0.013301585800945759, |
|
"learning_rate": 2.256542256542257e-06, |
|
"loss": 0.1329, |
|
"step": 10320 |
|
}, |
|
{ |
|
"epoch": 39.014671814671814, |
|
"grad_norm": 0.010520892217755318, |
|
"learning_rate": 2.2479622479622483e-06, |
|
"loss": 0.1651, |
|
"step": 10330 |
|
}, |
|
{ |
|
"epoch": 39.01544401544402, |
|
"grad_norm": 0.01888640969991684, |
|
"learning_rate": 2.2393822393822394e-06, |
|
"loss": 0.0002, |
|
"step": 10340 |
|
}, |
|
{ |
|
"epoch": 39.016216216216215, |
|
"grad_norm": 0.005222676321864128, |
|
"learning_rate": 2.230802230802231e-06, |
|
"loss": 0.0056, |
|
"step": 10350 |
|
}, |
|
{ |
|
"epoch": 39.01698841698842, |
|
"grad_norm": 0.003466697409749031, |
|
"learning_rate": 2.222222222222222e-06, |
|
"loss": 0.1216, |
|
"step": 10360 |
|
}, |
|
{ |
|
"epoch": 39.017760617760615, |
|
"grad_norm": 0.012701363302767277, |
|
"learning_rate": 2.213642213642214e-06, |
|
"loss": 0.2632, |
|
"step": 10370 |
|
}, |
|
{ |
|
"epoch": 39.01853281853282, |
|
"grad_norm": 0.0034604999236762524, |
|
"learning_rate": 2.205062205062205e-06, |
|
"loss": 0.0002, |
|
"step": 10380 |
|
}, |
|
{ |
|
"epoch": 39.01930501930502, |
|
"grad_norm": 0.22704817354679108, |
|
"learning_rate": 2.1964821964821967e-06, |
|
"loss": 0.1519, |
|
"step": 10390 |
|
}, |
|
{ |
|
"epoch": 39.02007722007722, |
|
"grad_norm": 0.00676871370524168, |
|
"learning_rate": 2.187902187902188e-06, |
|
"loss": 0.232, |
|
"step": 10400 |
|
}, |
|
{ |
|
"epoch": 39.02007722007722, |
|
"eval_accuracy": 0.6888888888888889, |
|
"eval_loss": 1.9918631315231323, |
|
"eval_runtime": 13.6157, |
|
"eval_samples_per_second": 3.305, |
|
"eval_steps_per_second": 1.102, |
|
"step": 10400 |
|
}, |
|
{ |
|
"epoch": 40.000772200772204, |
|
"grad_norm": 0.0037049120292067528, |
|
"learning_rate": 2.1793221793221793e-06, |
|
"loss": 0.2593, |
|
"step": 10410 |
|
}, |
|
{ |
|
"epoch": 40.0015444015444, |
|
"grad_norm": 0.007936818525195122, |
|
"learning_rate": 2.170742170742171e-06, |
|
"loss": 0.0004, |
|
"step": 10420 |
|
}, |
|
{ |
|
"epoch": 40.002316602316604, |
|
"grad_norm": 0.008796462789177895, |
|
"learning_rate": 2.1621621621621623e-06, |
|
"loss": 0.0002, |
|
"step": 10430 |
|
}, |
|
{ |
|
"epoch": 40.0030888030888, |
|
"grad_norm": 0.0008883124683052301, |
|
"learning_rate": 2.153582153582154e-06, |
|
"loss": 0.0003, |
|
"step": 10440 |
|
}, |
|
{ |
|
"epoch": 40.003861003861005, |
|
"grad_norm": 0.021493464708328247, |
|
"learning_rate": 2.1450021450021454e-06, |
|
"loss": 0.0004, |
|
"step": 10450 |
|
}, |
|
{ |
|
"epoch": 40.0046332046332, |
|
"grad_norm": 0.013466846197843552, |
|
"learning_rate": 2.1364221364221365e-06, |
|
"loss": 0.0003, |
|
"step": 10460 |
|
}, |
|
{ |
|
"epoch": 40.005405405405405, |
|
"grad_norm": 0.0008845877600833774, |
|
"learning_rate": 2.127842127842128e-06, |
|
"loss": 0.0002, |
|
"step": 10470 |
|
}, |
|
{ |
|
"epoch": 40.00617760617761, |
|
"grad_norm": 0.004395687952637672, |
|
"learning_rate": 2.1192621192621195e-06, |
|
"loss": 0.0388, |
|
"step": 10480 |
|
}, |
|
{ |
|
"epoch": 40.006949806949805, |
|
"grad_norm": 0.0025481514167040586, |
|
"learning_rate": 2.110682110682111e-06, |
|
"loss": 0.0002, |
|
"step": 10490 |
|
}, |
|
{ |
|
"epoch": 40.00772200772201, |
|
"grad_norm": 0.05023454874753952, |
|
"learning_rate": 2.102102102102102e-06, |
|
"loss": 0.0584, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 40.008494208494206, |
|
"grad_norm": 0.0015958435833454132, |
|
"learning_rate": 2.0935220935220937e-06, |
|
"loss": 0.0004, |
|
"step": 10510 |
|
}, |
|
{ |
|
"epoch": 40.00926640926641, |
|
"grad_norm": 0.004534762352705002, |
|
"learning_rate": 2.084942084942085e-06, |
|
"loss": 0.001, |
|
"step": 10520 |
|
}, |
|
{ |
|
"epoch": 40.01003861003861, |
|
"grad_norm": 0.6501943469047546, |
|
"learning_rate": 2.0763620763620763e-06, |
|
"loss": 0.0005, |
|
"step": 10530 |
|
}, |
|
{ |
|
"epoch": 40.01081081081081, |
|
"grad_norm": 0.005923354998230934, |
|
"learning_rate": 2.067782067782068e-06, |
|
"loss": 0.4309, |
|
"step": 10540 |
|
}, |
|
{ |
|
"epoch": 40.011583011583014, |
|
"grad_norm": 0.007845559157431126, |
|
"learning_rate": 2.0592020592020594e-06, |
|
"loss": 0.0478, |
|
"step": 10550 |
|
}, |
|
{ |
|
"epoch": 40.01235521235521, |
|
"grad_norm": 0.0035400239285081625, |
|
"learning_rate": 2.050622050622051e-06, |
|
"loss": 0.0003, |
|
"step": 10560 |
|
}, |
|
{ |
|
"epoch": 40.013127413127414, |
|
"grad_norm": 0.0006449856446124613, |
|
"learning_rate": 2.0420420420420424e-06, |
|
"loss": 0.1536, |
|
"step": 10570 |
|
}, |
|
{ |
|
"epoch": 40.01389961389961, |
|
"grad_norm": 0.011894451454281807, |
|
"learning_rate": 2.0334620334620335e-06, |
|
"loss": 0.2582, |
|
"step": 10580 |
|
}, |
|
{ |
|
"epoch": 40.014671814671814, |
|
"grad_norm": 0.35729271173477173, |
|
"learning_rate": 2.024882024882025e-06, |
|
"loss": 0.0004, |
|
"step": 10590 |
|
}, |
|
{ |
|
"epoch": 40.01544401544402, |
|
"grad_norm": 410.946533203125, |
|
"learning_rate": 2.0163020163020166e-06, |
|
"loss": 0.3431, |
|
"step": 10600 |
|
}, |
|
{ |
|
"epoch": 40.016216216216215, |
|
"grad_norm": 0.003937250003218651, |
|
"learning_rate": 2.0077220077220077e-06, |
|
"loss": 0.0001, |
|
"step": 10610 |
|
}, |
|
{ |
|
"epoch": 40.01698841698842, |
|
"grad_norm": 38.59312438964844, |
|
"learning_rate": 1.9991419991419996e-06, |
|
"loss": 0.2471, |
|
"step": 10620 |
|
}, |
|
{ |
|
"epoch": 40.017760617760615, |
|
"grad_norm": 2.9301681518554688, |
|
"learning_rate": 1.9905619905619907e-06, |
|
"loss": 0.0768, |
|
"step": 10630 |
|
}, |
|
{ |
|
"epoch": 40.01853281853282, |
|
"grad_norm": 0.002492203377187252, |
|
"learning_rate": 1.9819819819819822e-06, |
|
"loss": 0.1803, |
|
"step": 10640 |
|
}, |
|
{ |
|
"epoch": 40.01930501930502, |
|
"grad_norm": 0.02368726022541523, |
|
"learning_rate": 1.9734019734019733e-06, |
|
"loss": 0.1065, |
|
"step": 10650 |
|
}, |
|
{ |
|
"epoch": 40.02007722007722, |
|
"grad_norm": 0.010710655711591244, |
|
"learning_rate": 1.964821964821965e-06, |
|
"loss": 0.2703, |
|
"step": 10660 |
|
}, |
|
{ |
|
"epoch": 40.02007722007722, |
|
"eval_accuracy": 0.7111111111111111, |
|
"eval_loss": 2.10081148147583, |
|
"eval_runtime": 13.0367, |
|
"eval_samples_per_second": 3.452, |
|
"eval_steps_per_second": 1.151, |
|
"step": 10660 |
|
}, |
|
{ |
|
"epoch": 41.000772200772204, |
|
"grad_norm": 0.003705637063831091, |
|
"learning_rate": 1.9562419562419564e-06, |
|
"loss": 0.4446, |
|
"step": 10670 |
|
}, |
|
{ |
|
"epoch": 41.0015444015444, |
|
"grad_norm": 0.006835234817117453, |
|
"learning_rate": 1.947661947661948e-06, |
|
"loss": 0.1434, |
|
"step": 10680 |
|
}, |
|
{ |
|
"epoch": 41.002316602316604, |
|
"grad_norm": 0.01129397563636303, |
|
"learning_rate": 1.9390819390819394e-06, |
|
"loss": 0.2595, |
|
"step": 10690 |
|
}, |
|
{ |
|
"epoch": 41.0030888030888, |
|
"grad_norm": 0.11355545371770859, |
|
"learning_rate": 1.9305019305019305e-06, |
|
"loss": 0.0321, |
|
"step": 10700 |
|
}, |
|
{ |
|
"epoch": 41.003861003861005, |
|
"grad_norm": 0.007223920896649361, |
|
"learning_rate": 1.921921921921922e-06, |
|
"loss": 0.0444, |
|
"step": 10710 |
|
}, |
|
{ |
|
"epoch": 41.0046332046332, |
|
"grad_norm": 0.022122787311673164, |
|
"learning_rate": 1.9133419133419136e-06, |
|
"loss": 0.0556, |
|
"step": 10720 |
|
}, |
|
{ |
|
"epoch": 41.005405405405405, |
|
"grad_norm": 0.0013790694065392017, |
|
"learning_rate": 1.904761904761905e-06, |
|
"loss": 0.0003, |
|
"step": 10730 |
|
}, |
|
{ |
|
"epoch": 41.00617760617761, |
|
"grad_norm": 0.003318269969895482, |
|
"learning_rate": 1.8961818961818964e-06, |
|
"loss": 0.0006, |
|
"step": 10740 |
|
}, |
|
{ |
|
"epoch": 41.006949806949805, |
|
"grad_norm": 0.06160283461213112, |
|
"learning_rate": 1.8876018876018877e-06, |
|
"loss": 0.1408, |
|
"step": 10750 |
|
}, |
|
{ |
|
"epoch": 41.00772200772201, |
|
"grad_norm": 0.0031962695065885782, |
|
"learning_rate": 1.8790218790218793e-06, |
|
"loss": 0.345, |
|
"step": 10760 |
|
}, |
|
{ |
|
"epoch": 41.008494208494206, |
|
"grad_norm": 0.008759942837059498, |
|
"learning_rate": 1.8704418704418706e-06, |
|
"loss": 0.0008, |
|
"step": 10770 |
|
}, |
|
{ |
|
"epoch": 41.00926640926641, |
|
"grad_norm": 0.007954655215144157, |
|
"learning_rate": 1.861861861861862e-06, |
|
"loss": 0.0002, |
|
"step": 10780 |
|
}, |
|
{ |
|
"epoch": 41.01003861003861, |
|
"grad_norm": 261.1466979980469, |
|
"learning_rate": 1.8532818532818534e-06, |
|
"loss": 0.2819, |
|
"step": 10790 |
|
}, |
|
{ |
|
"epoch": 41.01081081081081, |
|
"grad_norm": 0.440922349691391, |
|
"learning_rate": 1.844701844701845e-06, |
|
"loss": 0.1343, |
|
"step": 10800 |
|
}, |
|
{ |
|
"epoch": 41.011583011583014, |
|
"grad_norm": 0.02840583585202694, |
|
"learning_rate": 1.8361218361218363e-06, |
|
"loss": 0.0002, |
|
"step": 10810 |
|
}, |
|
{ |
|
"epoch": 41.01235521235521, |
|
"grad_norm": 0.7776057720184326, |
|
"learning_rate": 1.8275418275418278e-06, |
|
"loss": 0.0344, |
|
"step": 10820 |
|
}, |
|
{ |
|
"epoch": 41.013127413127414, |
|
"grad_norm": 0.0304377693682909, |
|
"learning_rate": 1.818961818961819e-06, |
|
"loss": 0.2329, |
|
"step": 10830 |
|
}, |
|
{ |
|
"epoch": 41.01389961389961, |
|
"grad_norm": 0.03479522094130516, |
|
"learning_rate": 1.8103818103818104e-06, |
|
"loss": 0.1126, |
|
"step": 10840 |
|
}, |
|
{ |
|
"epoch": 41.014671814671814, |
|
"grad_norm": 0.00659992964938283, |
|
"learning_rate": 1.801801801801802e-06, |
|
"loss": 0.0004, |
|
"step": 10850 |
|
}, |
|
{ |
|
"epoch": 41.01544401544402, |
|
"grad_norm": 0.012349890545010567, |
|
"learning_rate": 1.7932217932217933e-06, |
|
"loss": 0.0134, |
|
"step": 10860 |
|
}, |
|
{ |
|
"epoch": 41.016216216216215, |
|
"grad_norm": 0.017889924347400665, |
|
"learning_rate": 1.784641784641785e-06, |
|
"loss": 0.1972, |
|
"step": 10870 |
|
}, |
|
{ |
|
"epoch": 41.01698841698842, |
|
"grad_norm": 0.07884511351585388, |
|
"learning_rate": 1.7760617760617763e-06, |
|
"loss": 0.0936, |
|
"step": 10880 |
|
}, |
|
{ |
|
"epoch": 41.017760617760615, |
|
"grad_norm": 0.11204981803894043, |
|
"learning_rate": 1.7674817674817676e-06, |
|
"loss": 0.4485, |
|
"step": 10890 |
|
}, |
|
{ |
|
"epoch": 41.01853281853282, |
|
"grad_norm": 0.005154389888048172, |
|
"learning_rate": 1.758901758901759e-06, |
|
"loss": 0.164, |
|
"step": 10900 |
|
}, |
|
{ |
|
"epoch": 41.01930501930502, |
|
"grad_norm": 4.545119285583496, |
|
"learning_rate": 1.7503217503217505e-06, |
|
"loss": 0.1486, |
|
"step": 10910 |
|
}, |
|
{ |
|
"epoch": 41.02007722007722, |
|
"grad_norm": 0.13940872251987457, |
|
"learning_rate": 1.7417417417417418e-06, |
|
"loss": 0.5169, |
|
"step": 10920 |
|
}, |
|
{ |
|
"epoch": 41.02007722007722, |
|
"eval_accuracy": 0.6888888888888889, |
|
"eval_loss": 2.201923131942749, |
|
"eval_runtime": 11.5921, |
|
"eval_samples_per_second": 3.882, |
|
"eval_steps_per_second": 1.294, |
|
"step": 10920 |
|
}, |
|
{ |
|
"epoch": 42.000772200772204, |
|
"grad_norm": 0.009260579943656921, |
|
"learning_rate": 1.7331617331617335e-06, |
|
"loss": 0.2545, |
|
"step": 10930 |
|
}, |
|
{ |
|
"epoch": 42.0015444015444, |
|
"grad_norm": 0.01058141142129898, |
|
"learning_rate": 1.7245817245817248e-06, |
|
"loss": 0.0004, |
|
"step": 10940 |
|
}, |
|
{ |
|
"epoch": 42.002316602316604, |
|
"grad_norm": 63.95749282836914, |
|
"learning_rate": 1.7160017160017161e-06, |
|
"loss": 0.1117, |
|
"step": 10950 |
|
}, |
|
{ |
|
"epoch": 42.0030888030888, |
|
"grad_norm": 0.0007036329479888082, |
|
"learning_rate": 1.7074217074217074e-06, |
|
"loss": 0.0003, |
|
"step": 10960 |
|
}, |
|
{ |
|
"epoch": 42.003861003861005, |
|
"grad_norm": 0.025101276114583015, |
|
"learning_rate": 1.698841698841699e-06, |
|
"loss": 0.1805, |
|
"step": 10970 |
|
}, |
|
{ |
|
"epoch": 42.0046332046332, |
|
"grad_norm": 0.9655550122261047, |
|
"learning_rate": 1.6902616902616903e-06, |
|
"loss": 0.238, |
|
"step": 10980 |
|
}, |
|
{ |
|
"epoch": 42.005405405405405, |
|
"grad_norm": 0.11792098730802536, |
|
"learning_rate": 1.681681681681682e-06, |
|
"loss": 0.2002, |
|
"step": 10990 |
|
}, |
|
{ |
|
"epoch": 42.00617760617761, |
|
"grad_norm": 479.1819152832031, |
|
"learning_rate": 1.6731016731016733e-06, |
|
"loss": 0.1926, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 42.006949806949805, |
|
"grad_norm": 0.07615210115909576, |
|
"learning_rate": 1.6645216645216646e-06, |
|
"loss": 0.1492, |
|
"step": 11010 |
|
}, |
|
{ |
|
"epoch": 42.00772200772201, |
|
"grad_norm": 0.004804663825780153, |
|
"learning_rate": 1.655941655941656e-06, |
|
"loss": 0.3703, |
|
"step": 11020 |
|
}, |
|
{ |
|
"epoch": 42.008494208494206, |
|
"grad_norm": 0.006586713716387749, |
|
"learning_rate": 1.6473616473616475e-06, |
|
"loss": 0.0004, |
|
"step": 11030 |
|
}, |
|
{ |
|
"epoch": 42.00926640926641, |
|
"grad_norm": 18.117666244506836, |
|
"learning_rate": 1.6387816387816388e-06, |
|
"loss": 0.007, |
|
"step": 11040 |
|
}, |
|
{ |
|
"epoch": 42.01003861003861, |
|
"grad_norm": 0.028815440833568573, |
|
"learning_rate": 1.6302016302016305e-06, |
|
"loss": 0.2412, |
|
"step": 11050 |
|
}, |
|
{ |
|
"epoch": 42.01081081081081, |
|
"grad_norm": 41.1140022277832, |
|
"learning_rate": 1.6216216216216219e-06, |
|
"loss": 0.2599, |
|
"step": 11060 |
|
}, |
|
{ |
|
"epoch": 42.011583011583014, |
|
"grad_norm": 330.62469482421875, |
|
"learning_rate": 1.6130416130416132e-06, |
|
"loss": 0.2363, |
|
"step": 11070 |
|
}, |
|
{ |
|
"epoch": 42.01235521235521, |
|
"grad_norm": 0.011965152807533741, |
|
"learning_rate": 1.6044616044616047e-06, |
|
"loss": 0.0008, |
|
"step": 11080 |
|
}, |
|
{ |
|
"epoch": 42.013127413127414, |
|
"grad_norm": 81.2577896118164, |
|
"learning_rate": 1.595881595881596e-06, |
|
"loss": 0.0116, |
|
"step": 11090 |
|
}, |
|
{ |
|
"epoch": 42.01389961389961, |
|
"grad_norm": 0.005599725525826216, |
|
"learning_rate": 1.5873015873015873e-06, |
|
"loss": 0.0003, |
|
"step": 11100 |
|
}, |
|
{ |
|
"epoch": 42.014671814671814, |
|
"grad_norm": 0.03384312242269516, |
|
"learning_rate": 1.5787215787215786e-06, |
|
"loss": 0.1198, |
|
"step": 11110 |
|
}, |
|
{ |
|
"epoch": 42.01544401544402, |
|
"grad_norm": 0.019553786143660545, |
|
"learning_rate": 1.5701415701415704e-06, |
|
"loss": 0.274, |
|
"step": 11120 |
|
}, |
|
{ |
|
"epoch": 42.016216216216215, |
|
"grad_norm": 0.0044852448627352715, |
|
"learning_rate": 1.5615615615615617e-06, |
|
"loss": 0.2265, |
|
"step": 11130 |
|
}, |
|
{ |
|
"epoch": 42.01698841698842, |
|
"grad_norm": 0.008225970901548862, |
|
"learning_rate": 1.5529815529815532e-06, |
|
"loss": 0.0055, |
|
"step": 11140 |
|
}, |
|
{ |
|
"epoch": 42.017760617760615, |
|
"grad_norm": 0.006483301054686308, |
|
"learning_rate": 1.5444015444015445e-06, |
|
"loss": 0.0009, |
|
"step": 11150 |
|
}, |
|
{ |
|
"epoch": 42.01853281853282, |
|
"grad_norm": 0.4023190140724182, |
|
"learning_rate": 1.5358215358215358e-06, |
|
"loss": 0.226, |
|
"step": 11160 |
|
}, |
|
{ |
|
"epoch": 42.01930501930502, |
|
"grad_norm": 0.14850662648677826, |
|
"learning_rate": 1.5272415272415271e-06, |
|
"loss": 0.2314, |
|
"step": 11170 |
|
}, |
|
{ |
|
"epoch": 42.02007722007722, |
|
"grad_norm": 81.3932113647461, |
|
"learning_rate": 1.5186615186615189e-06, |
|
"loss": 0.8418, |
|
"step": 11180 |
|
}, |
|
{ |
|
"epoch": 42.02007722007722, |
|
"eval_accuracy": 0.6888888888888889, |
|
"eval_loss": 2.199957847595215, |
|
"eval_runtime": 11.6437, |
|
"eval_samples_per_second": 3.865, |
|
"eval_steps_per_second": 1.288, |
|
"step": 11180 |
|
}, |
|
{ |
|
"epoch": 43.000772200772204, |
|
"grad_norm": 0.10664477944374084, |
|
"learning_rate": 1.5100815100815102e-06, |
|
"loss": 0.2151, |
|
"step": 11190 |
|
}, |
|
{ |
|
"epoch": 43.0015444015444, |
|
"grad_norm": 0.012597335502505302, |
|
"learning_rate": 1.5015015015015017e-06, |
|
"loss": 0.0007, |
|
"step": 11200 |
|
}, |
|
{ |
|
"epoch": 43.002316602316604, |
|
"grad_norm": 0.3459191620349884, |
|
"learning_rate": 1.492921492921493e-06, |
|
"loss": 0.0005, |
|
"step": 11210 |
|
}, |
|
{ |
|
"epoch": 43.0030888030888, |
|
"grad_norm": 0.013864747248589993, |
|
"learning_rate": 1.4843414843414843e-06, |
|
"loss": 0.0008, |
|
"step": 11220 |
|
}, |
|
{ |
|
"epoch": 43.003861003861005, |
|
"grad_norm": 0.10407616198062897, |
|
"learning_rate": 1.4757614757614759e-06, |
|
"loss": 0.2126, |
|
"step": 11230 |
|
}, |
|
{ |
|
"epoch": 43.0046332046332, |
|
"grad_norm": 0.012360197491943836, |
|
"learning_rate": 1.4671814671814674e-06, |
|
"loss": 0.3865, |
|
"step": 11240 |
|
}, |
|
{ |
|
"epoch": 43.005405405405405, |
|
"grad_norm": 0.012420023791491985, |
|
"learning_rate": 1.4586014586014587e-06, |
|
"loss": 0.0007, |
|
"step": 11250 |
|
}, |
|
{ |
|
"epoch": 43.00617760617761, |
|
"grad_norm": 0.051371827721595764, |
|
"learning_rate": 1.4500214500214502e-06, |
|
"loss": 0.0005, |
|
"step": 11260 |
|
}, |
|
{ |
|
"epoch": 43.006949806949805, |
|
"grad_norm": 0.029028650373220444, |
|
"learning_rate": 1.4414414414414416e-06, |
|
"loss": 0.0012, |
|
"step": 11270 |
|
}, |
|
{ |
|
"epoch": 43.00772200772201, |
|
"grad_norm": 0.011852563358843327, |
|
"learning_rate": 1.4328614328614329e-06, |
|
"loss": 0.1394, |
|
"step": 11280 |
|
}, |
|
{ |
|
"epoch": 43.008494208494206, |
|
"grad_norm": 0.9856188297271729, |
|
"learning_rate": 1.4242814242814244e-06, |
|
"loss": 0.1234, |
|
"step": 11290 |
|
}, |
|
{ |
|
"epoch": 43.00926640926641, |
|
"grad_norm": 0.07063347846269608, |
|
"learning_rate": 1.415701415701416e-06, |
|
"loss": 0.1585, |
|
"step": 11300 |
|
}, |
|
{ |
|
"epoch": 43.01003861003861, |
|
"grad_norm": 0.002462083473801613, |
|
"learning_rate": 1.4071214071214072e-06, |
|
"loss": 0.034, |
|
"step": 11310 |
|
}, |
|
{ |
|
"epoch": 43.01081081081081, |
|
"grad_norm": 0.023495899513363838, |
|
"learning_rate": 1.3985413985413988e-06, |
|
"loss": 0.0026, |
|
"step": 11320 |
|
}, |
|
{ |
|
"epoch": 43.011583011583014, |
|
"grad_norm": 0.0035925237461924553, |
|
"learning_rate": 1.38996138996139e-06, |
|
"loss": 0.0002, |
|
"step": 11330 |
|
}, |
|
{ |
|
"epoch": 43.01235521235521, |
|
"grad_norm": 0.017117993906140327, |
|
"learning_rate": 1.3813813813813814e-06, |
|
"loss": 0.0005, |
|
"step": 11340 |
|
}, |
|
{ |
|
"epoch": 43.013127413127414, |
|
"grad_norm": 0.05065334215760231, |
|
"learning_rate": 1.372801372801373e-06, |
|
"loss": 0.0002, |
|
"step": 11350 |
|
}, |
|
{ |
|
"epoch": 43.01389961389961, |
|
"grad_norm": 0.21535246074199677, |
|
"learning_rate": 1.3642213642213642e-06, |
|
"loss": 0.0004, |
|
"step": 11360 |
|
}, |
|
{ |
|
"epoch": 43.014671814671814, |
|
"grad_norm": 207.9502716064453, |
|
"learning_rate": 1.3556413556413557e-06, |
|
"loss": 0.1807, |
|
"step": 11370 |
|
}, |
|
{ |
|
"epoch": 43.01544401544402, |
|
"grad_norm": 0.0034615122713148594, |
|
"learning_rate": 1.3470613470613473e-06, |
|
"loss": 0.1658, |
|
"step": 11380 |
|
}, |
|
{ |
|
"epoch": 43.016216216216215, |
|
"grad_norm": 0.0042229765094816685, |
|
"learning_rate": 1.3384813384813386e-06, |
|
"loss": 0.0003, |
|
"step": 11390 |
|
}, |
|
{ |
|
"epoch": 43.01698841698842, |
|
"grad_norm": 42.964447021484375, |
|
"learning_rate": 1.32990132990133e-06, |
|
"loss": 0.2509, |
|
"step": 11400 |
|
}, |
|
{ |
|
"epoch": 43.017760617760615, |
|
"grad_norm": 0.018056800588965416, |
|
"learning_rate": 1.3213213213213214e-06, |
|
"loss": 0.1674, |
|
"step": 11410 |
|
}, |
|
{ |
|
"epoch": 43.01853281853282, |
|
"grad_norm": 0.00773951830342412, |
|
"learning_rate": 1.3127413127413127e-06, |
|
"loss": 0.2136, |
|
"step": 11420 |
|
}, |
|
{ |
|
"epoch": 43.01930501930502, |
|
"grad_norm": 0.03975954279303551, |
|
"learning_rate": 1.3041613041613045e-06, |
|
"loss": 0.0003, |
|
"step": 11430 |
|
}, |
|
{ |
|
"epoch": 43.02007722007722, |
|
"grad_norm": 0.0015788354212418199, |
|
"learning_rate": 1.2955812955812958e-06, |
|
"loss": 0.0007, |
|
"step": 11440 |
|
}, |
|
{ |
|
"epoch": 43.02007722007722, |
|
"eval_accuracy": 0.7111111111111111, |
|
"eval_loss": 2.0410959720611572, |
|
"eval_runtime": 11.6235, |
|
"eval_samples_per_second": 3.871, |
|
"eval_steps_per_second": 1.29, |
|
"step": 11440 |
|
}, |
|
{ |
|
"epoch": 44.000772200772204, |
|
"grad_norm": 99.61504364013672, |
|
"learning_rate": 1.287001287001287e-06, |
|
"loss": 0.4639, |
|
"step": 11450 |
|
}, |
|
{ |
|
"epoch": 44.0015444015444, |
|
"grad_norm": 0.004829080309718847, |
|
"learning_rate": 1.2784212784212784e-06, |
|
"loss": 0.2582, |
|
"step": 11460 |
|
}, |
|
{ |
|
"epoch": 44.002316602316604, |
|
"grad_norm": 0.001873974921181798, |
|
"learning_rate": 1.26984126984127e-06, |
|
"loss": 0.0002, |
|
"step": 11470 |
|
}, |
|
{ |
|
"epoch": 44.0030888030888, |
|
"grad_norm": 0.015835706144571304, |
|
"learning_rate": 1.2612612612612613e-06, |
|
"loss": 0.0005, |
|
"step": 11480 |
|
}, |
|
{ |
|
"epoch": 44.003861003861005, |
|
"grad_norm": 0.0006254987674765289, |
|
"learning_rate": 1.252681252681253e-06, |
|
"loss": 0.2537, |
|
"step": 11490 |
|
}, |
|
{ |
|
"epoch": 44.0046332046332, |
|
"grad_norm": 0.005784200504422188, |
|
"learning_rate": 1.244101244101244e-06, |
|
"loss": 0.1615, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 44.005405405405405, |
|
"grad_norm": 0.005181388929486275, |
|
"learning_rate": 1.2355212355212356e-06, |
|
"loss": 0.0002, |
|
"step": 11510 |
|
}, |
|
{ |
|
"epoch": 44.00617760617761, |
|
"grad_norm": 0.019228698685765266, |
|
"learning_rate": 1.226941226941227e-06, |
|
"loss": 0.0914, |
|
"step": 11520 |
|
}, |
|
{ |
|
"epoch": 44.006949806949805, |
|
"grad_norm": 0.0022733949590474367, |
|
"learning_rate": 1.2183612183612185e-06, |
|
"loss": 0.0053, |
|
"step": 11530 |
|
}, |
|
{ |
|
"epoch": 44.00772200772201, |
|
"grad_norm": 0.02803795598447323, |
|
"learning_rate": 1.20978120978121e-06, |
|
"loss": 0.3945, |
|
"step": 11540 |
|
}, |
|
{ |
|
"epoch": 44.008494208494206, |
|
"grad_norm": 0.006438721902668476, |
|
"learning_rate": 1.2012012012012013e-06, |
|
"loss": 0.0002, |
|
"step": 11550 |
|
}, |
|
{ |
|
"epoch": 44.00926640926641, |
|
"grad_norm": 0.11115320026874542, |
|
"learning_rate": 1.1926211926211926e-06, |
|
"loss": 0.2235, |
|
"step": 11560 |
|
}, |
|
{ |
|
"epoch": 44.01003861003861, |
|
"grad_norm": 0.009124440141022205, |
|
"learning_rate": 1.1840411840411841e-06, |
|
"loss": 0.0353, |
|
"step": 11570 |
|
}, |
|
{ |
|
"epoch": 44.01081081081081, |
|
"grad_norm": 0.008701231330633163, |
|
"learning_rate": 1.1754611754611757e-06, |
|
"loss": 0.0008, |
|
"step": 11580 |
|
}, |
|
{ |
|
"epoch": 44.011583011583014, |
|
"grad_norm": 0.012650973163545132, |
|
"learning_rate": 1.166881166881167e-06, |
|
"loss": 0.0002, |
|
"step": 11590 |
|
}, |
|
{ |
|
"epoch": 44.01235521235521, |
|
"grad_norm": 0.01399000734090805, |
|
"learning_rate": 1.1583011583011585e-06, |
|
"loss": 0.0013, |
|
"step": 11600 |
|
}, |
|
{ |
|
"epoch": 44.013127413127414, |
|
"grad_norm": 5.8696746826171875, |
|
"learning_rate": 1.1497211497211498e-06, |
|
"loss": 0.0831, |
|
"step": 11610 |
|
}, |
|
{ |
|
"epoch": 44.01389961389961, |
|
"grad_norm": 0.004275842569768429, |
|
"learning_rate": 1.1411411411411411e-06, |
|
"loss": 0.0221, |
|
"step": 11620 |
|
}, |
|
{ |
|
"epoch": 44.014671814671814, |
|
"grad_norm": 0.06453493237495422, |
|
"learning_rate": 1.1325611325611326e-06, |
|
"loss": 0.1346, |
|
"step": 11630 |
|
}, |
|
{ |
|
"epoch": 44.01544401544402, |
|
"grad_norm": 0.009648929350078106, |
|
"learning_rate": 1.1239811239811242e-06, |
|
"loss": 0.0002, |
|
"step": 11640 |
|
}, |
|
{ |
|
"epoch": 44.016216216216215, |
|
"grad_norm": 0.022414736449718475, |
|
"learning_rate": 1.1154011154011155e-06, |
|
"loss": 0.0001, |
|
"step": 11650 |
|
}, |
|
{ |
|
"epoch": 44.01698841698842, |
|
"grad_norm": 0.004432376008480787, |
|
"learning_rate": 1.106821106821107e-06, |
|
"loss": 0.0003, |
|
"step": 11660 |
|
}, |
|
{ |
|
"epoch": 44.017760617760615, |
|
"grad_norm": 0.007916075177490711, |
|
"learning_rate": 1.0982410982410983e-06, |
|
"loss": 0.2268, |
|
"step": 11670 |
|
}, |
|
{ |
|
"epoch": 44.01853281853282, |
|
"grad_norm": 0.009835878387093544, |
|
"learning_rate": 1.0896610896610896e-06, |
|
"loss": 0.2344, |
|
"step": 11680 |
|
}, |
|
{ |
|
"epoch": 44.01930501930502, |
|
"grad_norm": 0.0064972443506121635, |
|
"learning_rate": 1.0810810810810812e-06, |
|
"loss": 0.0004, |
|
"step": 11690 |
|
}, |
|
{ |
|
"epoch": 44.02007722007722, |
|
"grad_norm": 0.0038432800211012363, |
|
"learning_rate": 1.0725010725010727e-06, |
|
"loss": 0.0001, |
|
"step": 11700 |
|
}, |
|
{ |
|
"epoch": 44.02007722007722, |
|
"eval_accuracy": 0.6888888888888889, |
|
"eval_loss": 2.108095169067383, |
|
"eval_runtime": 11.673, |
|
"eval_samples_per_second": 3.855, |
|
"eval_steps_per_second": 1.285, |
|
"step": 11700 |
|
}, |
|
{ |
|
"epoch": 45.000772200772204, |
|
"grad_norm": 0.003354213200509548, |
|
"learning_rate": 1.063921063921064e-06, |
|
"loss": 0.0873, |
|
"step": 11710 |
|
}, |
|
{ |
|
"epoch": 45.0015444015444, |
|
"grad_norm": 0.23387674987316132, |
|
"learning_rate": 1.0553410553410555e-06, |
|
"loss": 0.1457, |
|
"step": 11720 |
|
}, |
|
{ |
|
"epoch": 45.002316602316604, |
|
"grad_norm": 0.008299161680042744, |
|
"learning_rate": 1.0467610467610468e-06, |
|
"loss": 0.2289, |
|
"step": 11730 |
|
}, |
|
{ |
|
"epoch": 45.0030888030888, |
|
"grad_norm": 0.0024218843318521976, |
|
"learning_rate": 1.0381810381810382e-06, |
|
"loss": 0.0002, |
|
"step": 11740 |
|
}, |
|
{ |
|
"epoch": 45.003861003861005, |
|
"grad_norm": 0.0018756206845864654, |
|
"learning_rate": 1.0296010296010297e-06, |
|
"loss": 0.0014, |
|
"step": 11750 |
|
}, |
|
{ |
|
"epoch": 45.0046332046332, |
|
"grad_norm": 0.005605476908385754, |
|
"learning_rate": 1.0210210210210212e-06, |
|
"loss": 0.226, |
|
"step": 11760 |
|
}, |
|
{ |
|
"epoch": 45.005405405405405, |
|
"grad_norm": 0.011059476062655449, |
|
"learning_rate": 1.0124410124410125e-06, |
|
"loss": 0.0547, |
|
"step": 11770 |
|
}, |
|
{ |
|
"epoch": 45.00617760617761, |
|
"grad_norm": 0.0044801486656069756, |
|
"learning_rate": 1.0038610038610038e-06, |
|
"loss": 0.0009, |
|
"step": 11780 |
|
}, |
|
{ |
|
"epoch": 45.006949806949805, |
|
"grad_norm": 258.8080139160156, |
|
"learning_rate": 9.952809952809954e-07, |
|
"loss": 0.0366, |
|
"step": 11790 |
|
}, |
|
{ |
|
"epoch": 45.00772200772201, |
|
"grad_norm": 0.002683807397261262, |
|
"learning_rate": 9.867009867009867e-07, |
|
"loss": 0.2216, |
|
"step": 11800 |
|
}, |
|
{ |
|
"epoch": 45.008494208494206, |
|
"grad_norm": 0.005808249581605196, |
|
"learning_rate": 9.781209781209782e-07, |
|
"loss": 0.0002, |
|
"step": 11810 |
|
}, |
|
{ |
|
"epoch": 45.00926640926641, |
|
"grad_norm": 45.80361557006836, |
|
"learning_rate": 9.695409695409697e-07, |
|
"loss": 0.2395, |
|
"step": 11820 |
|
}, |
|
{ |
|
"epoch": 45.01003861003861, |
|
"grad_norm": 0.019828274846076965, |
|
"learning_rate": 9.60960960960961e-07, |
|
"loss": 0.0002, |
|
"step": 11830 |
|
}, |
|
{ |
|
"epoch": 45.01081081081081, |
|
"grad_norm": 0.04362059012055397, |
|
"learning_rate": 9.523809523809525e-07, |
|
"loss": 0.033, |
|
"step": 11840 |
|
}, |
|
{ |
|
"epoch": 45.011583011583014, |
|
"grad_norm": 0.003450556192547083, |
|
"learning_rate": 9.438009438009439e-07, |
|
"loss": 0.0145, |
|
"step": 11850 |
|
}, |
|
{ |
|
"epoch": 45.01235521235521, |
|
"grad_norm": 0.0010107405250892043, |
|
"learning_rate": 9.352209352209353e-07, |
|
"loss": 0.0518, |
|
"step": 11860 |
|
}, |
|
{ |
|
"epoch": 45.013127413127414, |
|
"grad_norm": 0.003476174082607031, |
|
"learning_rate": 9.266409266409267e-07, |
|
"loss": 0.0012, |
|
"step": 11870 |
|
}, |
|
{ |
|
"epoch": 45.01389961389961, |
|
"grad_norm": 0.4040607511997223, |
|
"learning_rate": 9.180609180609181e-07, |
|
"loss": 0.0005, |
|
"step": 11880 |
|
}, |
|
{ |
|
"epoch": 45.014671814671814, |
|
"grad_norm": 0.004233676008880138, |
|
"learning_rate": 9.094809094809096e-07, |
|
"loss": 0.0004, |
|
"step": 11890 |
|
}, |
|
{ |
|
"epoch": 45.01544401544402, |
|
"grad_norm": 0.01031399890780449, |
|
"learning_rate": 9.00900900900901e-07, |
|
"loss": 0.0005, |
|
"step": 11900 |
|
}, |
|
{ |
|
"epoch": 45.016216216216215, |
|
"grad_norm": 104.54281616210938, |
|
"learning_rate": 8.923208923208925e-07, |
|
"loss": 0.0774, |
|
"step": 11910 |
|
}, |
|
{ |
|
"epoch": 45.01698841698842, |
|
"grad_norm": 0.0005152209778316319, |
|
"learning_rate": 8.837408837408838e-07, |
|
"loss": 0.0003, |
|
"step": 11920 |
|
}, |
|
{ |
|
"epoch": 45.017760617760615, |
|
"grad_norm": 0.008937066420912743, |
|
"learning_rate": 8.751608751608752e-07, |
|
"loss": 0.0003, |
|
"step": 11930 |
|
}, |
|
{ |
|
"epoch": 45.01853281853282, |
|
"grad_norm": 0.020448965951800346, |
|
"learning_rate": 8.665808665808668e-07, |
|
"loss": 0.1503, |
|
"step": 11940 |
|
}, |
|
{ |
|
"epoch": 45.01930501930502, |
|
"grad_norm": 0.03731679543852806, |
|
"learning_rate": 8.580008580008581e-07, |
|
"loss": 0.0003, |
|
"step": 11950 |
|
}, |
|
{ |
|
"epoch": 45.02007722007722, |
|
"grad_norm": 0.02011745609343052, |
|
"learning_rate": 8.494208494208495e-07, |
|
"loss": 0.0004, |
|
"step": 11960 |
|
}, |
|
{ |
|
"epoch": 45.02007722007722, |
|
"eval_accuracy": 0.6888888888888889, |
|
"eval_loss": 2.182063341140747, |
|
"eval_runtime": 11.6012, |
|
"eval_samples_per_second": 3.879, |
|
"eval_steps_per_second": 1.293, |
|
"step": 11960 |
|
}, |
|
{ |
|
"epoch": 46.000772200772204, |
|
"grad_norm": 0.003135395934805274, |
|
"learning_rate": 8.40840840840841e-07, |
|
"loss": 0.0008, |
|
"step": 11970 |
|
}, |
|
{ |
|
"epoch": 46.0015444015444, |
|
"grad_norm": 0.002082926919683814, |
|
"learning_rate": 8.322608322608323e-07, |
|
"loss": 0.0002, |
|
"step": 11980 |
|
}, |
|
{ |
|
"epoch": 46.002316602316604, |
|
"grad_norm": 0.04637220501899719, |
|
"learning_rate": 8.236808236808237e-07, |
|
"loss": 0.0022, |
|
"step": 11990 |
|
}, |
|
{ |
|
"epoch": 46.0030888030888, |
|
"grad_norm": 0.00046908273361623287, |
|
"learning_rate": 8.151008151008153e-07, |
|
"loss": 0.0002, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 46.003861003861005, |
|
"grad_norm": 0.006788803730159998, |
|
"learning_rate": 8.065208065208066e-07, |
|
"loss": 0.2737, |
|
"step": 12010 |
|
}, |
|
{ |
|
"epoch": 46.0046332046332, |
|
"grad_norm": 0.07136689126491547, |
|
"learning_rate": 7.97940797940798e-07, |
|
"loss": 0.0005, |
|
"step": 12020 |
|
}, |
|
{ |
|
"epoch": 46.005405405405405, |
|
"grad_norm": 0.0029624011367559433, |
|
"learning_rate": 7.893607893607893e-07, |
|
"loss": 0.1519, |
|
"step": 12030 |
|
}, |
|
{ |
|
"epoch": 46.00617760617761, |
|
"grad_norm": 0.001701715518720448, |
|
"learning_rate": 7.807807807807808e-07, |
|
"loss": 0.0001, |
|
"step": 12040 |
|
}, |
|
{ |
|
"epoch": 46.006949806949805, |
|
"grad_norm": 0.0016193045303225517, |
|
"learning_rate": 7.722007722007723e-07, |
|
"loss": 0.047, |
|
"step": 12050 |
|
}, |
|
{ |
|
"epoch": 46.00772200772201, |
|
"grad_norm": 0.0045381346717476845, |
|
"learning_rate": 7.636207636207636e-07, |
|
"loss": 0.0001, |
|
"step": 12060 |
|
}, |
|
{ |
|
"epoch": 46.008494208494206, |
|
"grad_norm": 0.003102656453847885, |
|
"learning_rate": 7.550407550407551e-07, |
|
"loss": 0.3918, |
|
"step": 12070 |
|
}, |
|
{ |
|
"epoch": 46.00926640926641, |
|
"grad_norm": 0.0017835380276665092, |
|
"learning_rate": 7.464607464607465e-07, |
|
"loss": 0.239, |
|
"step": 12080 |
|
}, |
|
{ |
|
"epoch": 46.01003861003861, |
|
"grad_norm": 0.004557139705866575, |
|
"learning_rate": 7.378807378807379e-07, |
|
"loss": 0.1398, |
|
"step": 12090 |
|
}, |
|
{ |
|
"epoch": 46.01081081081081, |
|
"grad_norm": 0.013430671766400337, |
|
"learning_rate": 7.293007293007294e-07, |
|
"loss": 0.0001, |
|
"step": 12100 |
|
}, |
|
{ |
|
"epoch": 46.011583011583014, |
|
"grad_norm": 0.020614583045244217, |
|
"learning_rate": 7.207207207207208e-07, |
|
"loss": 0.2451, |
|
"step": 12110 |
|
}, |
|
{ |
|
"epoch": 46.01235521235521, |
|
"grad_norm": 0.024003766477108, |
|
"learning_rate": 7.121407121407122e-07, |
|
"loss": 0.0441, |
|
"step": 12120 |
|
}, |
|
{ |
|
"epoch": 46.013127413127414, |
|
"grad_norm": 0.007595748174935579, |
|
"learning_rate": 7.035607035607036e-07, |
|
"loss": 0.0862, |
|
"step": 12130 |
|
}, |
|
{ |
|
"epoch": 46.01389961389961, |
|
"grad_norm": 0.2669449746608734, |
|
"learning_rate": 6.94980694980695e-07, |
|
"loss": 0.0165, |
|
"step": 12140 |
|
}, |
|
{ |
|
"epoch": 46.014671814671814, |
|
"grad_norm": 0.004871792625635862, |
|
"learning_rate": 6.864006864006865e-07, |
|
"loss": 0.142, |
|
"step": 12150 |
|
}, |
|
{ |
|
"epoch": 46.01544401544402, |
|
"grad_norm": 0.027607223019003868, |
|
"learning_rate": 6.778206778206779e-07, |
|
"loss": 0.0002, |
|
"step": 12160 |
|
}, |
|
{ |
|
"epoch": 46.016216216216215, |
|
"grad_norm": 0.0004795089189428836, |
|
"learning_rate": 6.692406692406693e-07, |
|
"loss": 0.0916, |
|
"step": 12170 |
|
}, |
|
{ |
|
"epoch": 46.01698841698842, |
|
"grad_norm": 0.0060905334539711475, |
|
"learning_rate": 6.606606606606607e-07, |
|
"loss": 0.3611, |
|
"step": 12180 |
|
}, |
|
{ |
|
"epoch": 46.017760617760615, |
|
"grad_norm": 0.003681829432025552, |
|
"learning_rate": 6.520806520806522e-07, |
|
"loss": 0.0003, |
|
"step": 12190 |
|
}, |
|
{ |
|
"epoch": 46.01853281853282, |
|
"grad_norm": 0.008164509199559689, |
|
"learning_rate": 6.435006435006435e-07, |
|
"loss": 0.0977, |
|
"step": 12200 |
|
}, |
|
{ |
|
"epoch": 46.01930501930502, |
|
"grad_norm": 0.00053538748761639, |
|
"learning_rate": 6.34920634920635e-07, |
|
"loss": 0.0033, |
|
"step": 12210 |
|
}, |
|
{ |
|
"epoch": 46.02007722007722, |
|
"grad_norm": 0.010476024821400642, |
|
"learning_rate": 6.263406263406265e-07, |
|
"loss": 0.0013, |
|
"step": 12220 |
|
}, |
|
{ |
|
"epoch": 46.02007722007722, |
|
"eval_accuracy": 0.7111111111111111, |
|
"eval_loss": 2.131287097930908, |
|
"eval_runtime": 11.5156, |
|
"eval_samples_per_second": 3.908, |
|
"eval_steps_per_second": 1.303, |
|
"step": 12220 |
|
}, |
|
{ |
|
"epoch": 47.000772200772204, |
|
"grad_norm": 0.0032295091077685356, |
|
"learning_rate": 6.177606177606178e-07, |
|
"loss": 0.0295, |
|
"step": 12230 |
|
}, |
|
{ |
|
"epoch": 47.0015444015444, |
|
"grad_norm": 0.021106446161866188, |
|
"learning_rate": 6.091806091806092e-07, |
|
"loss": 0.048, |
|
"step": 12240 |
|
}, |
|
{ |
|
"epoch": 47.002316602316604, |
|
"grad_norm": 0.002815277548506856, |
|
"learning_rate": 6.006006006006006e-07, |
|
"loss": 0.0007, |
|
"step": 12250 |
|
}, |
|
{ |
|
"epoch": 47.0030888030888, |
|
"grad_norm": 0.001195929478853941, |
|
"learning_rate": 5.920205920205921e-07, |
|
"loss": 0.1884, |
|
"step": 12260 |
|
}, |
|
{ |
|
"epoch": 47.003861003861005, |
|
"grad_norm": 80.01729583740234, |
|
"learning_rate": 5.834405834405835e-07, |
|
"loss": 0.0094, |
|
"step": 12270 |
|
}, |
|
{ |
|
"epoch": 47.0046332046332, |
|
"grad_norm": 0.0021410963963717222, |
|
"learning_rate": 5.748605748605749e-07, |
|
"loss": 0.0002, |
|
"step": 12280 |
|
}, |
|
{ |
|
"epoch": 47.005405405405405, |
|
"grad_norm": 0.8117086887359619, |
|
"learning_rate": 5.662805662805663e-07, |
|
"loss": 0.1675, |
|
"step": 12290 |
|
}, |
|
{ |
|
"epoch": 47.00617760617761, |
|
"grad_norm": 0.008614442311227322, |
|
"learning_rate": 5.577005577005577e-07, |
|
"loss": 0.0004, |
|
"step": 12300 |
|
}, |
|
{ |
|
"epoch": 47.006949806949805, |
|
"grad_norm": 0.005176466889679432, |
|
"learning_rate": 5.491205491205492e-07, |
|
"loss": 0.0003, |
|
"step": 12310 |
|
}, |
|
{ |
|
"epoch": 47.00772200772201, |
|
"grad_norm": 0.0040098209865391254, |
|
"learning_rate": 5.405405405405406e-07, |
|
"loss": 0.1824, |
|
"step": 12320 |
|
}, |
|
{ |
|
"epoch": 47.008494208494206, |
|
"grad_norm": 0.0035810640547424555, |
|
"learning_rate": 5.31960531960532e-07, |
|
"loss": 0.0001, |
|
"step": 12330 |
|
}, |
|
{ |
|
"epoch": 47.00926640926641, |
|
"grad_norm": 0.043194353580474854, |
|
"learning_rate": 5.233805233805234e-07, |
|
"loss": 0.081, |
|
"step": 12340 |
|
}, |
|
{ |
|
"epoch": 47.01003861003861, |
|
"grad_norm": 0.016690198332071304, |
|
"learning_rate": 5.148005148005148e-07, |
|
"loss": 0.0004, |
|
"step": 12350 |
|
}, |
|
{ |
|
"epoch": 47.01081081081081, |
|
"grad_norm": 0.015656674280762672, |
|
"learning_rate": 5.062205062205063e-07, |
|
"loss": 0.0001, |
|
"step": 12360 |
|
}, |
|
{ |
|
"epoch": 47.011583011583014, |
|
"grad_norm": 0.008801618590950966, |
|
"learning_rate": 4.976404976404977e-07, |
|
"loss": 0.0001, |
|
"step": 12370 |
|
}, |
|
{ |
|
"epoch": 47.01235521235521, |
|
"grad_norm": 0.016372770071029663, |
|
"learning_rate": 4.890604890604891e-07, |
|
"loss": 0.0001, |
|
"step": 12380 |
|
}, |
|
{ |
|
"epoch": 47.013127413127414, |
|
"grad_norm": 0.0037778830155730247, |
|
"learning_rate": 4.804804804804805e-07, |
|
"loss": 0.0001, |
|
"step": 12390 |
|
}, |
|
{ |
|
"epoch": 47.01389961389961, |
|
"grad_norm": 0.09339027106761932, |
|
"learning_rate": 4.7190047190047194e-07, |
|
"loss": 0.0023, |
|
"step": 12400 |
|
}, |
|
{ |
|
"epoch": 47.014671814671814, |
|
"grad_norm": 482.0824279785156, |
|
"learning_rate": 4.6332046332046336e-07, |
|
"loss": 0.0273, |
|
"step": 12410 |
|
}, |
|
{ |
|
"epoch": 47.01544401544402, |
|
"grad_norm": 0.010961725376546383, |
|
"learning_rate": 4.547404547404548e-07, |
|
"loss": 0.0002, |
|
"step": 12420 |
|
}, |
|
{ |
|
"epoch": 47.016216216216215, |
|
"grad_norm": 0.015132254920899868, |
|
"learning_rate": 4.4616044616044625e-07, |
|
"loss": 0.0001, |
|
"step": 12430 |
|
}, |
|
{ |
|
"epoch": 47.01698841698842, |
|
"grad_norm": 0.0039853425696492195, |
|
"learning_rate": 4.375804375804376e-07, |
|
"loss": 0.0001, |
|
"step": 12440 |
|
}, |
|
{ |
|
"epoch": 47.017760617760615, |
|
"grad_norm": 0.010092932730913162, |
|
"learning_rate": 4.2900042900042903e-07, |
|
"loss": 0.2261, |
|
"step": 12450 |
|
}, |
|
{ |
|
"epoch": 47.01853281853282, |
|
"grad_norm": 0.007288198918104172, |
|
"learning_rate": 4.204204204204205e-07, |
|
"loss": 0.0001, |
|
"step": 12460 |
|
}, |
|
{ |
|
"epoch": 47.01930501930502, |
|
"grad_norm": 0.0723380446434021, |
|
"learning_rate": 4.1184041184041187e-07, |
|
"loss": 0.0003, |
|
"step": 12470 |
|
}, |
|
{ |
|
"epoch": 47.02007722007722, |
|
"grad_norm": 0.737527072429657, |
|
"learning_rate": 4.032604032604033e-07, |
|
"loss": 0.0003, |
|
"step": 12480 |
|
}, |
|
{ |
|
"epoch": 47.02007722007722, |
|
"eval_accuracy": 0.7111111111111111, |
|
"eval_loss": 2.2112810611724854, |
|
"eval_runtime": 11.7955, |
|
"eval_samples_per_second": 3.815, |
|
"eval_steps_per_second": 1.272, |
|
"step": 12480 |
|
}, |
|
{ |
|
"epoch": 48.000772200772204, |
|
"grad_norm": 0.018116367980837822, |
|
"learning_rate": 3.9468039468039466e-07, |
|
"loss": 0.0002, |
|
"step": 12490 |
|
}, |
|
{ |
|
"epoch": 48.0015444015444, |
|
"grad_norm": 1.851183295249939, |
|
"learning_rate": 3.8610038610038613e-07, |
|
"loss": 0.0031, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 48.002316602316604, |
|
"grad_norm": 0.007373946253210306, |
|
"learning_rate": 3.7752037752037755e-07, |
|
"loss": 0.0001, |
|
"step": 12510 |
|
}, |
|
{ |
|
"epoch": 48.0030888030888, |
|
"grad_norm": 0.031634677201509476, |
|
"learning_rate": 3.6894036894036897e-07, |
|
"loss": 0.0002, |
|
"step": 12520 |
|
}, |
|
{ |
|
"epoch": 48.003861003861005, |
|
"grad_norm": 0.014068067073822021, |
|
"learning_rate": 3.603603603603604e-07, |
|
"loss": 0.0002, |
|
"step": 12530 |
|
}, |
|
{ |
|
"epoch": 48.0046332046332, |
|
"grad_norm": 0.0021505013573914766, |
|
"learning_rate": 3.517803517803518e-07, |
|
"loss": 0.0002, |
|
"step": 12540 |
|
}, |
|
{ |
|
"epoch": 48.005405405405405, |
|
"grad_norm": 0.007775390055030584, |
|
"learning_rate": 3.4320034320034323e-07, |
|
"loss": 0.0002, |
|
"step": 12550 |
|
}, |
|
{ |
|
"epoch": 48.00617760617761, |
|
"grad_norm": 0.006418993230909109, |
|
"learning_rate": 3.3462033462033465e-07, |
|
"loss": 0.0001, |
|
"step": 12560 |
|
}, |
|
{ |
|
"epoch": 48.006949806949805, |
|
"grad_norm": 0.0038523671682924032, |
|
"learning_rate": 3.260403260403261e-07, |
|
"loss": 0.0039, |
|
"step": 12570 |
|
}, |
|
{ |
|
"epoch": 48.00772200772201, |
|
"grad_norm": 0.011759024113416672, |
|
"learning_rate": 3.174603174603175e-07, |
|
"loss": 0.0003, |
|
"step": 12580 |
|
}, |
|
{ |
|
"epoch": 48.008494208494206, |
|
"grad_norm": 0.006618797779083252, |
|
"learning_rate": 3.088803088803089e-07, |
|
"loss": 0.0004, |
|
"step": 12590 |
|
}, |
|
{ |
|
"epoch": 48.00926640926641, |
|
"grad_norm": 0.0052281878888607025, |
|
"learning_rate": 3.003003003003003e-07, |
|
"loss": 0.0001, |
|
"step": 12600 |
|
}, |
|
{ |
|
"epoch": 48.01003861003861, |
|
"grad_norm": 0.026694072410464287, |
|
"learning_rate": 2.9172029172029174e-07, |
|
"loss": 0.0005, |
|
"step": 12610 |
|
}, |
|
{ |
|
"epoch": 48.01081081081081, |
|
"grad_norm": 0.6997846364974976, |
|
"learning_rate": 2.8314028314028316e-07, |
|
"loss": 0.0002, |
|
"step": 12620 |
|
}, |
|
{ |
|
"epoch": 48.011583011583014, |
|
"grad_norm": 0.003470169845968485, |
|
"learning_rate": 2.745602745602746e-07, |
|
"loss": 0.0016, |
|
"step": 12630 |
|
}, |
|
{ |
|
"epoch": 48.01235521235521, |
|
"grad_norm": 1.1241451501846313, |
|
"learning_rate": 2.65980265980266e-07, |
|
"loss": 0.0005, |
|
"step": 12640 |
|
}, |
|
{ |
|
"epoch": 48.013127413127414, |
|
"grad_norm": 0.0048046838492155075, |
|
"learning_rate": 2.574002574002574e-07, |
|
"loss": 0.0001, |
|
"step": 12650 |
|
}, |
|
{ |
|
"epoch": 48.01389961389961, |
|
"grad_norm": 0.0028558846097439528, |
|
"learning_rate": 2.4882024882024884e-07, |
|
"loss": 0.0001, |
|
"step": 12660 |
|
}, |
|
{ |
|
"epoch": 48.014671814671814, |
|
"grad_norm": 0.0014227797510102391, |
|
"learning_rate": 2.4024024024024026e-07, |
|
"loss": 0.0002, |
|
"step": 12670 |
|
}, |
|
{ |
|
"epoch": 48.01544401544402, |
|
"grad_norm": 0.0037844879552721977, |
|
"learning_rate": 2.3166023166023168e-07, |
|
"loss": 0.017, |
|
"step": 12680 |
|
}, |
|
{ |
|
"epoch": 48.016216216216215, |
|
"grad_norm": 0.001309237559325993, |
|
"learning_rate": 2.2308022308022312e-07, |
|
"loss": 0.0012, |
|
"step": 12690 |
|
}, |
|
{ |
|
"epoch": 48.01698841698842, |
|
"grad_norm": 0.0035404835361987352, |
|
"learning_rate": 2.1450021450021452e-07, |
|
"loss": 0.0007, |
|
"step": 12700 |
|
}, |
|
{ |
|
"epoch": 48.017760617760615, |
|
"grad_norm": 0.01660049706697464, |
|
"learning_rate": 2.0592020592020594e-07, |
|
"loss": 0.005, |
|
"step": 12710 |
|
}, |
|
{ |
|
"epoch": 48.01853281853282, |
|
"grad_norm": 0.013520884327590466, |
|
"learning_rate": 1.9734019734019733e-07, |
|
"loss": 0.0122, |
|
"step": 12720 |
|
}, |
|
{ |
|
"epoch": 48.01930501930502, |
|
"grad_norm": 0.0025345338508486748, |
|
"learning_rate": 1.8876018876018877e-07, |
|
"loss": 0.1069, |
|
"step": 12730 |
|
}, |
|
{ |
|
"epoch": 48.02007722007722, |
|
"grad_norm": 101.20980072021484, |
|
"learning_rate": 1.801801801801802e-07, |
|
"loss": 0.83, |
|
"step": 12740 |
|
}, |
|
{ |
|
"epoch": 48.02007722007722, |
|
"eval_accuracy": 0.6888888888888889, |
|
"eval_loss": 2.204815626144409, |
|
"eval_runtime": 11.3918, |
|
"eval_samples_per_second": 3.95, |
|
"eval_steps_per_second": 1.317, |
|
"step": 12740 |
|
}, |
|
{ |
|
"epoch": 49.000772200772204, |
|
"grad_norm": 0.008942559361457825, |
|
"learning_rate": 1.7160017160017161e-07, |
|
"loss": 0.0002, |
|
"step": 12750 |
|
}, |
|
{ |
|
"epoch": 49.0015444015444, |
|
"grad_norm": 0.005645198281854391, |
|
"learning_rate": 1.6302016302016306e-07, |
|
"loss": 0.0036, |
|
"step": 12760 |
|
}, |
|
{ |
|
"epoch": 49.002316602316604, |
|
"grad_norm": 0.0036984614562243223, |
|
"learning_rate": 1.5444015444015445e-07, |
|
"loss": 0.0001, |
|
"step": 12770 |
|
}, |
|
{ |
|
"epoch": 49.0030888030888, |
|
"grad_norm": 0.004046537913382053, |
|
"learning_rate": 1.4586014586014587e-07, |
|
"loss": 0.0001, |
|
"step": 12780 |
|
}, |
|
{ |
|
"epoch": 49.003861003861005, |
|
"grad_norm": 0.0005429061129689217, |
|
"learning_rate": 1.372801372801373e-07, |
|
"loss": 0.0002, |
|
"step": 12790 |
|
}, |
|
{ |
|
"epoch": 49.0046332046332, |
|
"grad_norm": 0.002891113283112645, |
|
"learning_rate": 1.287001287001287e-07, |
|
"loss": 0.0002, |
|
"step": 12800 |
|
}, |
|
{ |
|
"epoch": 49.005405405405405, |
|
"grad_norm": 0.005318440031260252, |
|
"learning_rate": 1.2012012012012013e-07, |
|
"loss": 0.0001, |
|
"step": 12810 |
|
}, |
|
{ |
|
"epoch": 49.00617760617761, |
|
"grad_norm": 0.0029130184557288885, |
|
"learning_rate": 1.1154011154011156e-07, |
|
"loss": 0.0002, |
|
"step": 12820 |
|
}, |
|
{ |
|
"epoch": 49.006949806949805, |
|
"grad_norm": 0.7598032355308533, |
|
"learning_rate": 1.0296010296010297e-07, |
|
"loss": 0.0003, |
|
"step": 12830 |
|
}, |
|
{ |
|
"epoch": 49.00772200772201, |
|
"grad_norm": 0.02352285198867321, |
|
"learning_rate": 9.438009438009439e-08, |
|
"loss": 0.0002, |
|
"step": 12840 |
|
}, |
|
{ |
|
"epoch": 49.008494208494206, |
|
"grad_norm": 0.004137629177421331, |
|
"learning_rate": 8.580008580008581e-08, |
|
"loss": 0.0001, |
|
"step": 12850 |
|
}, |
|
{ |
|
"epoch": 49.00926640926641, |
|
"grad_norm": 0.007847619242966175, |
|
"learning_rate": 7.722007722007723e-08, |
|
"loss": 0.0001, |
|
"step": 12860 |
|
}, |
|
{ |
|
"epoch": 49.01003861003861, |
|
"grad_norm": 236.08055114746094, |
|
"learning_rate": 6.864006864006865e-08, |
|
"loss": 0.2852, |
|
"step": 12870 |
|
}, |
|
{ |
|
"epoch": 49.01081081081081, |
|
"grad_norm": 0.004942096769809723, |
|
"learning_rate": 6.006006006006006e-08, |
|
"loss": 0.0001, |
|
"step": 12880 |
|
}, |
|
{ |
|
"epoch": 49.011583011583014, |
|
"grad_norm": 0.009192113764584064, |
|
"learning_rate": 5.1480051480051484e-08, |
|
"loss": 0.2111, |
|
"step": 12890 |
|
}, |
|
{ |
|
"epoch": 49.01235521235521, |
|
"grad_norm": 0.002397160744294524, |
|
"learning_rate": 4.2900042900042903e-08, |
|
"loss": 0.0611, |
|
"step": 12900 |
|
}, |
|
{ |
|
"epoch": 49.013127413127414, |
|
"grad_norm": 0.016426341608166695, |
|
"learning_rate": 3.432003432003432e-08, |
|
"loss": 0.0002, |
|
"step": 12910 |
|
}, |
|
{ |
|
"epoch": 49.01389961389961, |
|
"grad_norm": 0.37930360436439514, |
|
"learning_rate": 2.5740025740025742e-08, |
|
"loss": 0.0005, |
|
"step": 12920 |
|
}, |
|
{ |
|
"epoch": 49.014671814671814, |
|
"grad_norm": 0.004664725624024868, |
|
"learning_rate": 1.716001716001716e-08, |
|
"loss": 0.0003, |
|
"step": 12930 |
|
}, |
|
{ |
|
"epoch": 49.01544401544402, |
|
"grad_norm": 0.01354283094406128, |
|
"learning_rate": 8.58000858000858e-09, |
|
"loss": 0.0001, |
|
"step": 12940 |
|
}, |
|
{ |
|
"epoch": 49.016216216216215, |
|
"grad_norm": 0.0005170258227735758, |
|
"learning_rate": 0.0, |
|
"loss": 0.0001, |
|
"step": 12950 |
|
}, |
|
{ |
|
"epoch": 49.016216216216215, |
|
"eval_accuracy": 0.6888888888888889, |
|
"eval_loss": 2.199702739715576, |
|
"eval_runtime": 13.0793, |
|
"eval_samples_per_second": 3.441, |
|
"eval_steps_per_second": 1.147, |
|
"step": 12950 |
|
}, |
|
{ |
|
"epoch": 49.016216216216215, |
|
"step": 12950, |
|
"total_flos": 1.7016105177038035e+20, |
|
"train_loss": 0.32982333966647276, |
|
"train_runtime": 21819.3604, |
|
"train_samples_per_second": 1.781, |
|
"train_steps_per_second": 0.594 |
|
}, |
|
{ |
|
"epoch": 49.016216216216215, |
|
"eval_accuracy": 0.7777777777777778, |
|
"eval_loss": 0.7423439621925354, |
|
"eval_runtime": 11.4389, |
|
"eval_samples_per_second": 3.934, |
|
"eval_steps_per_second": 1.311, |
|
"step": 12950 |
|
}, |
|
{ |
|
"epoch": 49.016216216216215, |
|
"eval_accuracy": 0.7777777777777778, |
|
"eval_loss": 0.7423441410064697, |
|
"eval_runtime": 11.3661, |
|
"eval_samples_per_second": 3.959, |
|
"eval_steps_per_second": 1.32, |
|
"step": 12950 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 12950, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 9223372036854775807, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.7016105177038035e+20, |
|
"train_batch_size": 3, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|