{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 3000000,
  "global_step": 2254,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {"epoch": 0.0044365572315882874, "grad_norm": 34.20283403950735, "learning_rate": 4.4247787610619474e-07, "loss": 1.6635, "step": 10},
    {"epoch": 0.008873114463176575, "grad_norm": 12.575384142861347, "learning_rate": 8.849557522123895e-07, "loss": 1.5066, "step": 20},
    {"epoch": 0.013309671694764862, "grad_norm": 5.263263199846598, "learning_rate": 1.3274336283185843e-06, "loss": 1.2307, "step": 30},
    {"epoch": 0.01774622892635315, "grad_norm": 4.982907507940082, "learning_rate": 1.769911504424779e-06, "loss": 1.0518, "step": 40},
    {"epoch": 0.022182786157941437, "grad_norm": 3.6874648219458925, "learning_rate": 2.212389380530974e-06, "loss": 0.9991, "step": 50},
    {"epoch": 0.026619343389529725, "grad_norm": 4.125478287085026, "learning_rate": 2.6548672566371687e-06, "loss": 0.9461, "step": 60},
    {"epoch": 0.031055900621118012, "grad_norm": 3.7527731246847797, "learning_rate": 3.097345132743363e-06, "loss": 0.9212, "step": 70},
    {"epoch": 0.0354924578527063, "grad_norm": 4.250041476792804, "learning_rate": 3.539823008849558e-06, "loss": 0.9086, "step": 80},
    {"epoch": 0.03992901508429459, "grad_norm": 4.884005781902834, "learning_rate": 3.982300884955752e-06, "loss": 0.9103, "step": 90},
    {"epoch": 0.044365572315882874, "grad_norm": 3.363574198575473, "learning_rate": 4.424778761061948e-06, "loss": 0.9075, "step": 100},
    {"epoch": 0.048802129547471165, "grad_norm": 4.481974877802558, "learning_rate": 4.867256637168142e-06, "loss": 0.8666, "step": 110},
    {"epoch": 0.05323868677905945, "grad_norm": 4.507301002428585, "learning_rate": 5.309734513274337e-06, "loss": 0.8747, "step": 120},
    {"epoch": 0.05767524401064774, "grad_norm": 4.360704218511151, "learning_rate": 5.752212389380532e-06, "loss": 0.8662, "step": 130},
    {"epoch": 0.062111801242236024, "grad_norm": 3.693651598803813, "learning_rate": 6.194690265486726e-06, "loss": 0.8903, "step": 140},
    {"epoch": 0.06654835847382432, "grad_norm": 4.347036121522918, "learning_rate": 6.6371681415929215e-06, "loss": 0.8777, "step": 150},
    {"epoch": 0.0709849157054126, "grad_norm": 4.180008806520496, "learning_rate": 7.079646017699116e-06, "loss": 0.8825, "step": 160},
    {"epoch": 0.07542147293700088, "grad_norm": 3.5743895818219964, "learning_rate": 7.5221238938053095e-06, "loss": 0.8432, "step": 170},
    {"epoch": 0.07985803016858918, "grad_norm": 3.178231587914838, "learning_rate": 7.964601769911505e-06, "loss": 0.8815, "step": 180},
    {"epoch": 0.08429458740017746, "grad_norm": 3.9776430514151713, "learning_rate": 8.4070796460177e-06, "loss": 0.842, "step": 190},
    {"epoch": 0.08873114463176575, "grad_norm": 3.525176682554, "learning_rate": 8.849557522123895e-06, "loss": 0.8823, "step": 200},
    {"epoch": 0.09316770186335403, "grad_norm": 3.6839303051450547, "learning_rate": 9.29203539823009e-06, "loss": 0.8879, "step": 210},
    {"epoch": 0.09760425909494233, "grad_norm": 3.707960198706698, "learning_rate": 9.734513274336284e-06, "loss": 0.8592, "step": 220},
    {"epoch": 0.10204081632653061, "grad_norm": 3.987885333720805, "learning_rate": 9.999904010783725e-06, "loss": 0.9191, "step": 230},
    {"epoch": 0.1064773735581189, "grad_norm": 4.161618022645433, "learning_rate": 9.998824174426658e-06, "loss": 0.876, "step": 240},
    {"epoch": 0.11091393078970718, "grad_norm": 3.936885714208245, "learning_rate": 9.99654477518325e-06, "loss": 0.8448, "step": 250},
    {"epoch": 0.11535048802129548, "grad_norm": 3.381832157299745, "learning_rate": 9.993066360038679e-06, "loss": 0.8693, "step": 260},
    {"epoch": 0.11978704525288376, "grad_norm": 3.4534900024735595, "learning_rate": 9.98838976370478e-06, "loss": 0.8907, "step": 270},
    {"epoch": 0.12422360248447205, "grad_norm": 3.286109931988662, "learning_rate": 9.982516108419746e-06, "loss": 0.859, "step": 280},
    {"epoch": 0.12866015971606035, "grad_norm": 3.8608367415146367, "learning_rate": 9.975446803678818e-06, "loss": 0.8497, "step": 290},
    {"epoch": 0.13309671694764863, "grad_norm": 3.3536728069117747, "learning_rate": 9.967183545896055e-06, "loss": 0.8744, "step": 300},
    {"epoch": 0.13753327417923691, "grad_norm": 2.9217222323855654, "learning_rate": 9.95772831799724e-06, "loss": 0.8552, "step": 310},
    {"epoch": 0.1419698314108252, "grad_norm": 3.166995337332963, "learning_rate": 9.94708338894405e-06, "loss": 0.8643, "step": 320},
    {"epoch": 0.14640638864241348, "grad_norm": 3.5968311661201553, "learning_rate": 9.935251313189564e-06, "loss": 0.8861, "step": 330},
    {"epoch": 0.15084294587400177, "grad_norm": 3.1984772961959127, "learning_rate": 9.922234930065286e-06, "loss": 0.8529, "step": 340},
    {"epoch": 0.15527950310559005, "grad_norm": 2.994176553345875, "learning_rate": 9.908037363099782e-06, "loss": 0.8777, "step": 350},
    {"epoch": 0.15971606033717836, "grad_norm": 2.8847550451762944, "learning_rate": 9.892662019269136e-06, "loss": 0.841, "step": 360},
    {"epoch": 0.16415261756876665, "grad_norm": 3.4925529215345326, "learning_rate": 9.876112588179378e-06, "loss": 0.8581, "step": 370},
    {"epoch": 0.16858917480035493, "grad_norm": 3.0392967138153875, "learning_rate": 9.858393041181096e-06, "loss": 0.8702, "step": 380},
    {"epoch": 0.1730257320319432, "grad_norm": 3.3458625683350536, "learning_rate": 9.839507630416436e-06, "loss": 0.8584, "step": 390},
    {"epoch": 0.1774622892635315, "grad_norm": 2.4613591773979904, "learning_rate": 9.819460887798714e-06, "loss": 0.865, "step": 400},
    {"epoch": 0.18189884649511978, "grad_norm": 2.9097688328420803, "learning_rate": 9.7982576239249e-06, "loss": 0.8241, "step": 410},
    {"epoch": 0.18633540372670807, "grad_norm": 2.7123773430443294, "learning_rate": 9.775902926921228e-06, "loss": 0.8497, "step": 420},
    {"epoch": 0.19077196095829635, "grad_norm": 3.15278672904484, "learning_rate": 9.7524021612222e-06, "loss": 0.8382, "step": 430},
    {"epoch": 0.19520851818988466, "grad_norm": 3.140823772708262, "learning_rate": 9.727760966283285e-06, "loss": 0.8762, "step": 440},
    {"epoch": 0.19964507542147295, "grad_norm": 3.0291324290256916, "learning_rate": 9.701985255227624e-06, "loss": 0.8603, "step": 450},
    {"epoch": 0.20408163265306123, "grad_norm": 3.244917065658077, "learning_rate": 9.675081213427076e-06, "loss": 0.8615, "step": 460},
    {"epoch": 0.2085181898846495, "grad_norm": 2.955073572383622, "learning_rate": 9.647055297017901e-06, "loss": 0.8563, "step": 470},
    {"epoch": 0.2129547471162378, "grad_norm": 2.9411304836471936, "learning_rate": 9.617914231351511e-06, "loss": 0.8516, "step": 480},
    {"epoch": 0.21739130434782608, "grad_norm": 3.2750932461773092, "learning_rate": 9.587665009380565e-06, "loss": 0.8692, "step": 490},
    {"epoch": 0.22182786157941436, "grad_norm": 3.504372310689358, "learning_rate": 9.556314889980906e-06, "loss": 0.8804, "step": 500},
    {"epoch": 0.22626441881100265, "grad_norm": 2.7396306271254836, "learning_rate": 9.523871396209633e-06, "loss": 0.8404, "step": 510},
    {"epoch": 0.23070097604259096, "grad_norm": 3.2247600207290357, "learning_rate": 9.49034231349982e-06, "loss": 0.869, "step": 520},
    {"epoch": 0.23513753327417924, "grad_norm": 2.8471166448572243, "learning_rate": 9.455735687792233e-06, "loss": 0.8304, "step": 530},
    {"epoch": 0.23957409050576753, "grad_norm": 3.257232995701795, "learning_rate": 9.420059823604573e-06, "loss": 0.884, "step": 540},
    {"epoch": 0.2440106477373558, "grad_norm": 3.244453444955561, "learning_rate": 9.383323282038632e-06, "loss": 0.8379, "step": 550},
    {"epoch": 0.2484472049689441, "grad_norm": 2.864674646057881, "learning_rate": 9.345534878725908e-06, "loss": 0.8443, "step": 560},
    {"epoch": 0.2528837622005324, "grad_norm": 3.1532607318799997, "learning_rate": 9.30670368171212e-06, "loss": 0.8446, "step": 570},
    {"epoch": 0.2573203194321207, "grad_norm": 3.0985282276684605, "learning_rate": 9.266839009281154e-06, "loss": 0.8463, "step": 580},
    {"epoch": 0.26175687666370895, "grad_norm": 3.2388655001812947, "learning_rate": 9.225950427718974e-06, "loss": 0.8666, "step": 590},
    {"epoch": 0.26619343389529726, "grad_norm": 3.6155166491443573, "learning_rate": 9.184047749018002e-06, "loss": 0.8382, "step": 600},
    {"epoch": 0.2706299911268855, "grad_norm": 2.4403938414282074, "learning_rate": 9.141141028522544e-06, "loss": 0.8188, "step": 610},
    {"epoch": 0.27506654835847383, "grad_norm": 2.4856078708290115, "learning_rate": 9.097240562515825e-06, "loss": 0.8465, "step": 620},
    {"epoch": 0.2795031055900621, "grad_norm": 2.920160366149781, "learning_rate": 9.052356885749191e-06, "loss": 0.8513, "step": 630},
    {"epoch": 0.2839396628216504, "grad_norm": 3.215108369697536, "learning_rate": 9.006500768914106e-06, "loss": 0.81, "step": 640},
    {"epoch": 0.2883762200532387, "grad_norm": 3.3125489624622357, "learning_rate": 8.959683216057512e-06, "loss": 0.8273, "step": 650},
    {"epoch": 0.29281277728482696, "grad_norm": 2.913737564979685, "learning_rate": 8.911915461941198e-06, "loss": 0.836, "step": 660},
    {"epoch": 0.2972493345164153, "grad_norm": 3.42492673444743, "learning_rate": 8.86320896934581e-06, "loss": 0.8254, "step": 670},
    {"epoch": 0.30168589174800353, "grad_norm": 3.1014982685234633, "learning_rate": 8.81357542632014e-06, "loss": 0.838, "step": 680},
    {"epoch": 0.30612244897959184, "grad_norm": 2.8652652554853693, "learning_rate": 8.763026743376349e-06, "loss": 0.8242, "step": 690},
    {"epoch": 0.3105590062111801, "grad_norm": 3.117617428014748, "learning_rate": 8.711575050631823e-06, "loss": 0.7902, "step": 700},
    {"epoch": 0.3149955634427684, "grad_norm": 2.8066840010172256, "learning_rate": 8.659232694898307e-06, "loss": 0.8398, "step": 710},
    {"epoch": 0.3194321206743567, "grad_norm": 3.1802863661171648, "learning_rate": 8.606012236719073e-06, "loss": 0.8487, "step": 720},
    {"epoch": 0.323868677905945, "grad_norm": 2.5919669683832645, "learning_rate": 8.551926447354759e-06, "loss": 0.8182, "step": 730},
    {"epoch": 0.3283052351375333, "grad_norm": 2.1927751988515887, "learning_rate": 8.496988305718672e-06, "loss": 0.8357, "step": 740},
    {"epoch": 0.33274179236912155, "grad_norm": 2.9601792065674766, "learning_rate": 8.44121099526225e-06, "loss": 0.8458, "step": 750},
    {"epoch": 0.33717834960070986, "grad_norm": 2.9727625766130052, "learning_rate": 8.384607900811442e-06, "loss": 0.8271, "step": 760},
    {"epoch": 0.3416149068322981, "grad_norm": 3.0013212096413393, "learning_rate": 8.327192605354766e-06, "loss": 0.7945, "step": 770},
    {"epoch": 0.3460514640638864, "grad_norm": 2.7912070631337715, "learning_rate": 8.268978886783807e-06, "loss": 0.8222, "step": 780},
    {"epoch": 0.35048802129547474, "grad_norm": 2.83451175997105, "learning_rate": 8.209980714586955e-06, "loss": 0.8321, "step": 790},
    {"epoch": 0.354924578527063, "grad_norm": 2.9155174928570027, "learning_rate": 8.150212246497165e-06, "loss": 0.816, "step": 800},
    {"epoch": 0.3593611357586513, "grad_norm": 3.4205616322803905, "learning_rate": 8.089687825094524e-06, "loss": 0.7996, "step": 810},
    {"epoch": 0.36379769299023956, "grad_norm": 3.1050076903443298, "learning_rate": 8.0284219743645e-06, "loss": 0.8067, "step": 820},
    {"epoch": 0.3682342502218279, "grad_norm": 2.912400492688149, "learning_rate": 7.96642939621261e-06, "loss": 0.8137, "step": 830},
    {"epoch": 0.37267080745341613, "grad_norm": 3.2683909275955094, "learning_rate": 7.903724966936442e-06, "loss": 0.8507, "step": 840},
    {"epoch": 0.37710736468500444, "grad_norm": 2.8879169834094385, "learning_rate": 7.84032373365578e-06, "loss": 0.8198, "step": 850},
    {"epoch": 0.3815439219165927, "grad_norm": 3.1376295436995663, "learning_rate": 7.776240910701788e-06, "loss": 0.8259, "step": 860},
    {"epoch": 0.385980479148181, "grad_norm": 2.7251763162480427, "learning_rate": 7.71149187596602e-06, "loss": 0.7873, "step": 870},
    {"epoch": 0.3904170363797693, "grad_norm": 2.856774279255724, "learning_rate": 7.646092167210217e-06, "loss": 0.7761, "step": 880},
    {"epoch": 0.3948535936113576, "grad_norm": 2.9462908501967187, "learning_rate": 7.580057478337717e-06, "loss": 0.7899, "step": 890},
    {"epoch": 0.3992901508429459, "grad_norm": 2.5526957889688866, "learning_rate": 7.5134036556274085e-06, "loss": 0.8128, "step": 900},
    {"epoch": 0.40372670807453415, "grad_norm": 2.8313232961928976, "learning_rate": 7.446146693931111e-06, "loss": 0.815, "step": 910},
    {"epoch": 0.40816326530612246, "grad_norm": 2.560445546914752, "learning_rate": 7.378302732835317e-06, "loss": 0.8283, "step": 920},
    {"epoch": 0.4125998225377107, "grad_norm": 3.1172570475169774, "learning_rate": 7.3098880527881755e-06, "loss": 0.7941, "step": 930},
    {"epoch": 0.417036379769299, "grad_norm": 3.480216407476258, "learning_rate": 7.2409190711927015e-06, "loss": 0.7987, "step": 940},
    {"epoch": 0.42147293700088734, "grad_norm": 3.2148941428286677, "learning_rate": 7.171412338467101e-06, "loss": 0.7864, "step": 950},
    {"epoch": 0.4259094942324756, "grad_norm": 3.026908707292069, "learning_rate": 7.1013845340731865e-06, "loss": 0.8066, "step": 960},
    {"epoch": 0.4303460514640639, "grad_norm": 3.13804883356682, "learning_rate": 7.030852462513827e-06, "loss": 0.7869, "step": 970},
    {"epoch": 0.43478260869565216, "grad_norm": 3.0995750225017336, "learning_rate": 6.959833049300376e-06, "loss": 0.794, "step": 980},
    {"epoch": 0.4392191659272405, "grad_norm": 2.452271023256726, "learning_rate": 6.888343336891088e-06, "loss": 0.8038, "step": 990},
    {"epoch": 0.44365572315882873, "grad_norm": 2.945520429899043, "learning_rate": 6.816400480601445e-06, "loss": 0.8273, "step": 1000},
    {"epoch": 0.44809228039041704, "grad_norm": 2.7654772587664347, "learning_rate": 6.744021744487422e-06, "loss": 0.8232, "step": 1010},
    {"epoch": 0.4525288376220053, "grad_norm": 2.574659419159971, "learning_rate": 6.671224497202637e-06, "loss": 0.7968, "step": 1020},
    {"epoch": 0.4569653948535936, "grad_norm": 2.6941300017602723, "learning_rate": 6.598026207830428e-06, "loss": 0.7774, "step": 1030},
    {"epoch": 0.4614019520851819, "grad_norm": 2.3049982572840797, "learning_rate": 6.524444441691796e-06, "loss": 0.8201, "step": 1040},
    {"epoch": 0.4658385093167702, "grad_norm": 2.8950877335554352, "learning_rate": 6.4504968561302905e-06, "loss": 0.7908, "step": 1050},
    {"epoch": 0.4702750665483585, "grad_norm": 2.721699004440637, "learning_rate": 6.376201196274778e-06, "loss": 0.8126, "step": 1060},
    {"epoch": 0.47471162377994675, "grad_norm": 2.3821339398566046, "learning_rate": 6.301575290781174e-06, "loss": 0.7828, "step": 1070},
    {"epoch": 0.47914818101153506, "grad_norm": 2.895185018848345, "learning_rate": 6.226637047554113e-06, "loss": 0.8102, "step": 1080},
    {"epoch": 0.4835847382431233, "grad_norm": 2.5424898208946183, "learning_rate": 6.1514044494496e-06, "loss": 0.8043, "step": 1090},
    {"epoch": 0.4880212954747116, "grad_norm": 2.5740094881555025, "learning_rate": 6.075895549959694e-06, "loss": 0.8092, "step": 1100},
    {"epoch": 0.49245785270629994, "grad_norm": 3.0317460337993998, "learning_rate": 6.000128468880223e-06, "loss": 0.769, "step": 1110},
    {"epoch": 0.4968944099378882, "grad_norm": 2.4350512141767795, "learning_rate": 5.924121387962594e-06, "loss": 0.781, "step": 1120},
    {"epoch": 0.5013309671694764, "grad_norm": 2.4910751483001157, "learning_rate": 5.847892546550738e-06, "loss": 0.8354, "step": 1130},
    {"epoch": 0.5057675244010648, "grad_norm": 2.4987734030771476, "learning_rate": 5.771460237204231e-06, "loss": 0.7981, "step": 1140},
    {"epoch": 0.5102040816326531, "grad_norm": 2.78796449183458, "learning_rate": 5.694842801308651e-06, "loss": 0.7749, "step": 1150},
    {"epoch": 0.5146406388642414, "grad_norm": 2.636798263235474, "learning_rate": 5.618058624674207e-06, "loss": 0.7738, "step": 1160},
    {"epoch": 0.5190771960958296, "grad_norm": 2.4250601704506556, "learning_rate": 5.541126133123721e-06, "loss": 0.8487, "step": 1170},
    {"epoch": 0.5235137533274179, "grad_norm": 3.0366629210786398, "learning_rate": 5.464063788070996e-06, "loss": 0.7813, "step": 1180},
    {"epoch": 0.5279503105590062, "grad_norm": 2.9568222566071674, "learning_rate": 5.386890082090652e-06, "loss": 0.7945, "step": 1190},
    {"epoch": 0.5323868677905945, "grad_norm": 2.9732986690318812, "learning_rate": 5.309623534480481e-06, "loss": 0.805, "step": 1200},
    {"epoch": 0.5368234250221828, "grad_norm": 2.449200180183246, "learning_rate": 5.232282686817392e-06, "loss": 0.8011, "step": 1210},
    {"epoch": 0.541259982253771, "grad_norm": 2.656104751785374, "learning_rate": 5.154886098507995e-06, "loss": 0.7727, "step": 1220},
    {"epoch": 0.5456965394853593, "grad_norm": 2.9067760420236737, "learning_rate": 5.077452342334939e-06, "loss": 0.7905, "step": 1230},
    {"epoch": 0.5501330967169477, "grad_norm": 2.527266541526558, "learning_rate": 5e-06, "loss": 0.7957, "step": 1240},
    {"epoch": 0.554569653948536, "grad_norm": 3.260917632804995, "learning_rate": 4.922547657665062e-06, "loss": 0.769, "step": 1250},
    {"epoch": 0.5590062111801242, "grad_norm": 2.9675971347845373, "learning_rate": 4.845113901492005e-06, "loss": 0.755, "step": 1260},
    {"epoch": 0.5634427684117125, "grad_norm": 2.8643410190667047, "learning_rate": 4.767717313182611e-06, "loss": 0.7949, "step": 1270},
    {"epoch": 0.5678793256433008, "grad_norm": 2.2606470678665485, "learning_rate": 4.69037646551952e-06, "loss": 0.7711, "step": 1280},
    {"epoch": 0.5723158828748891, "grad_norm": 2.654813528917904, "learning_rate": 4.613109917909349e-06, "loss": 0.7714, "step": 1290},
    {"epoch": 0.5767524401064774, "grad_norm": 2.741384900081702, "learning_rate": 4.535936211929005e-06, "loss": 0.7891, "step": 1300},
    {"epoch": 0.5811889973380656, "grad_norm": 2.5799332532291004, "learning_rate": 4.458873866876282e-06, "loss": 0.7865, "step": 1310},
    {"epoch": 0.5856255545696539, "grad_norm": 2.3867677434001044, "learning_rate": 4.3819413753257945e-06, "loss": 0.7656, "step": 1320},
    {"epoch": 0.5900621118012422, "grad_norm": 2.6115344238359217, "learning_rate": 4.305157198691351e-06, "loss": 0.7934, "step": 1330},
    {"epoch": 0.5944986690328306, "grad_norm": 3.0185681719969293, "learning_rate": 4.228539762795769e-06, "loss": 0.7669, "step": 1340},
    {"epoch": 0.5989352262644189, "grad_norm": 2.628488411245796, "learning_rate": 4.152107453449263e-06, "loss": 0.7445, "step": 1350},
    {"epoch": 0.6033717834960071, "grad_norm": 2.480295060210172, "learning_rate": 4.075878612037408e-06, "loss": 0.7765, "step": 1360},
    {"epoch": 0.6078083407275954, "grad_norm": 3.1433976467269416, "learning_rate": 3.999871531119779e-06, "loss": 0.7817, "step": 1370},
    {"epoch": 0.6122448979591837, "grad_norm": 2.2970916095173335, "learning_rate": 3.924104450040308e-06, "loss": 0.7862, "step": 1380},
    {"epoch": 0.616681455190772, "grad_norm": 2.64995989465673, "learning_rate": 3.848595550550401e-06, "loss": 0.7494, "step": 1390},
    {"epoch": 0.6211180124223602, "grad_norm": 2.4179592820902647, "learning_rate": 3.773362952445889e-06, "loss": 0.7389, "step": 1400},
    {"epoch": 0.6255545696539485, "grad_norm": 2.7575087976553023, "learning_rate": 3.6984247092188265e-06, "loss": 0.7748, "step": 1410},
    {"epoch": 0.6299911268855368, "grad_norm": 2.5672434080502753, "learning_rate": 3.623798803725223e-06, "loss": 0.7694, "step": 1420},
    {"epoch": 0.6344276841171251, "grad_norm": 2.370793395981591, "learning_rate": 3.5495031438697103e-06, "loss": 0.7638, "step": 1430},
    {"epoch": 0.6388642413487134, "grad_norm": 2.894475317512521, "learning_rate": 3.475555558308206e-06, "loss": 0.8022, "step": 1440},
    {"epoch": 0.6433007985803016, "grad_norm": 2.495989068116402, "learning_rate": 3.401973792169574e-06, "loss": 0.7626, "step": 1450},
    {"epoch": 0.64773735581189, "grad_norm": 2.3680512452024676, "learning_rate": 3.3287755027973634e-06, "loss": 0.7762, "step": 1460},
    {"epoch": 0.6521739130434783, "grad_norm": 2.865587957987949, "learning_rate": 3.2559782555125793e-06, "loss": 0.7756, "step": 1470},
    {"epoch": 0.6566104702750666, "grad_norm": 2.5614023023762273, "learning_rate": 3.1835995193985548e-06, "loss": 0.7917, "step": 1480},
    {"epoch": 0.6610470275066548, "grad_norm": 2.470759417948881, "learning_rate": 3.111656663108914e-06, "loss": 0.766, "step": 1490},
    {"epoch": 0.6654835847382431, "grad_norm": 2.6142907884168958, "learning_rate": 3.040166950699626e-06, "loss": 0.7798, "step": 1500},
    {"epoch": 0.6699201419698314, "grad_norm": 2.5564545984732208, "learning_rate": 2.969147537486175e-06, "loss": 0.7961, "step": 1510},
    {"epoch": 0.6743566992014197, "grad_norm": 2.771325911908224, "learning_rate": 2.898615465926814e-06, "loss": 0.7621, "step": 1520},
    {"epoch": 0.678793256433008, "grad_norm": 2.6108513029462337, "learning_rate": 2.828587661532901e-06, "loss": 0.7543, "step": 1530},
    {"epoch": 0.6832298136645962, "grad_norm": 2.421876420317442, "learning_rate": 2.7590809288073e-06, "loss": 0.7801, "step": 1540},
    {"epoch": 0.6876663708961845, "grad_norm": 2.2222956329711607, "learning_rate": 2.6901119472118253e-06, "loss": 0.7959, "step": 1550},
    {"epoch": 0.6921029281277729, "grad_norm": 2.5970672573393294, "learning_rate": 2.6216972671646846e-06, "loss": 0.778, "step": 1560},
    {"epoch": 0.6965394853593612, "grad_norm": 2.7539817190713, "learning_rate": 2.553853306068888e-06, "loss": 0.7742, "step": 1570},
    {"epoch": 0.7009760425909495, "grad_norm": 2.7061208568232935, "learning_rate": 2.4865963443725945e-06, "loss": 0.7607, "step": 1580},
    {"epoch": 0.7054125998225377, "grad_norm": 2.4787094896326, "learning_rate": 2.419942521662285e-06, "loss": 0.7273, "step": 1590},
    {"epoch": 0.709849157054126, "grad_norm": 2.680134848840613, "learning_rate": 2.3539078327897846e-06, "loss": 0.7493, "step": 1600},
    {"epoch": 0.7142857142857143, "grad_norm": 2.378836521235688, "learning_rate": 2.2885081240339813e-06, "loss": 0.7303, "step": 1610},
    {"epoch": 0.7187222715173026, "grad_norm": 2.419540388712507, "learning_rate": 2.223759089298214e-06, "loss": 0.7849, "step": 1620},
    {"epoch": 0.7231588287488908, "grad_norm": 2.7437113821072225, "learning_rate": 2.159676266344222e-06, "loss": 0.7491, "step": 1630},
    {"epoch": 0.7275953859804791, "grad_norm": 2.800814230902216, "learning_rate": 2.096275033063561e-06, "loss": 0.735, "step": 1640},
    {"epoch": 0.7320319432120674, "grad_norm": 2.5835441454701873, "learning_rate": 2.033570603787391e-06, "loss": 0.7848, "step": 1650},
    {"epoch": 0.7364685004436557, "grad_norm": 2.3927048837823848, "learning_rate": 1.9715780256355014e-06, "loss": 0.7785, "step": 1660},
    {"epoch": 0.7409050576752441, "grad_norm": 2.451375198464792, "learning_rate": 1.910312174905477e-06, "loss": 0.7737, "step": 1670},
    {"epoch": 0.7453416149068323, "grad_norm": 2.3689843619182165, "learning_rate": 1.849787753502838e-06, "loss": 0.7604, "step": 1680},
    {"epoch": 0.7497781721384206, "grad_norm": 2.3270088409766103, "learning_rate": 1.7900192854130465e-06, "loss": 0.7681, "step": 1690},
    {"epoch": 0.7542147293700089, "grad_norm": 2.4484436031124086, "learning_rate": 1.7310211132161936e-06, "loss": 0.7107, "step": 1700},
    {"epoch": 0.7586512866015972, "grad_norm": 2.8269432995587653, "learning_rate": 1.672807394645236e-06, "loss": 0.7169, "step": 1710},
    {"epoch": 0.7630878438331854, "grad_norm": 2.4522790771322596, "learning_rate": 1.6153920991885591e-06, "loss": 0.7465, "step": 1720},
    {"epoch": 0.7675244010647737, "grad_norm": 2.86172957277401, "learning_rate": 1.5587890047377512e-06, "loss": 0.7431, "step": 1730},
    {"epoch": 0.771960958296362, "grad_norm": 2.3353346131787838, "learning_rate": 1.50301169428133e-06, "loss": 0.788, "step": 1740},
    {"epoch": 0.7763975155279503, "grad_norm": 2.2356374657269957, "learning_rate": 1.4480735526452427e-06, "loss": 0.7722, "step": 1750},
    {"epoch": 0.7808340727595386, "grad_norm": 2.7719896017269257, "learning_rate": 1.3939877632809279e-06, "loss": 0.7281, "step": 1760},
    {"epoch": 0.7852706299911268, "grad_norm": 2.0933629026791696, "learning_rate": 1.340767305101694e-06, "loss": 0.7493, "step": 1770},
    {"epoch": 0.7897071872227152, "grad_norm": 2.5031162243937595, "learning_rate": 1.28842494936818e-06, "loss": 0.7484, "step": 1780},
    {"epoch": 0.7941437444543035, "grad_norm": 2.7578714943858844, "learning_rate": 1.2369732566236508e-06, "loss": 0.7525, "step": 1790},
    {"epoch": 0.7985803016858918, "grad_norm": 2.5586093386128406, "learning_rate": 1.1864245736798618e-06, "loss": 0.7577, "step": 1800},
    {"epoch": 0.80301685891748, "grad_norm": 2.8639096532826263, "learning_rate": 1.1367910306541918e-06, "loss": 0.764, "step": 1810},
    {"epoch": 0.8074534161490683, "grad_norm": 2.8829135266213175, "learning_rate": 1.088084538058804e-06, "loss": 0.7613, "step": 1820},
    {"epoch": 0.8118899733806566, "grad_norm": 2.1713022267735202, "learning_rate": 1.0403167839424883e-06, "loss": 0.7369, "step": 1830},
    {"epoch": 0.8163265306122449, "grad_norm": 1.895710946494562, "learning_rate": 9.934992310858944e-07, "loss": 0.7435, "step": 1840},
    {"epoch": 0.8207630878438332, "grad_norm": 2.2819940109062906, "learning_rate": 9.476431142508097e-07, "loss": 0.7413, "step": 1850},
    {"epoch": 0.8251996450754214, "grad_norm": 2.27074516610647, "learning_rate": 9.027594374841764e-07, "loss": 0.742, "step": 1860},
    {"epoch": 0.8296362023070097, "grad_norm": 2.9923428531081844, "learning_rate": 8.58858971477457e-07, "loss": 0.7529, "step": 1870},
    {"epoch": 0.834072759538598, "grad_norm": 2.297772259893077, "learning_rate": 8.159522509819995e-07, "loss": 0.7703, "step": 1880},
    {"epoch": 0.8385093167701864, "grad_norm": 2.7125609059674733, "learning_rate": 7.740495722810271e-07, "loss": 0.7242, "step": 1890},
    {"epoch": 0.8429458740017747, "grad_norm": 2.2166873258070523, "learning_rate": 7.33160990718847e-07, "loss": 0.7325, "step": 1900},
    {"epoch": 0.8473824312333629, "grad_norm": 3.181933616809904, "learning_rate": 6.932963182878821e-07, "loss": 0.7461, "step": 1910},
    {"epoch": 0.8518189884649512, "grad_norm": 2.457506471010056, "learning_rate": 6.544651212740915e-07, "loss": 0.7374, "step": 1920},
    {"epoch": 0.8562555456965395, "grad_norm": 2.427607748961866, "learning_rate": 6.166767179613691e-07, "loss": 0.7727, "step": 1930},
    {"epoch": 0.8606921029281278, "grad_norm": 3.432067630928332, "learning_rate": 5.799401763954287e-07, "loss": 0.7531, "step": 1940},
    {"epoch": 0.865128660159716, "grad_norm": 2.557627658159976, "learning_rate": 5.442643122077673e-07, "loss": 0.745, "step": 1950},
    {"epoch": 0.8695652173913043, "grad_norm": 2.5202626872451845, "learning_rate": 5.096576865001802e-07, "loss": 0.7603, "step": 1960},
    {"epoch": 0.8740017746228926, "grad_norm": 2.656418699241345, "learning_rate": 4.7612860379036674e-07, "loss": 0.7804, "step": 1970},
    {"epoch": 0.878438331854481, "grad_norm": 2.7085851646031665, "learning_rate": 4.436851100190953e-07, "loss": 0.7398, "step": 1980},
    {"epoch": 0.8828748890860693, "grad_norm": 2.293307857396717, "learning_rate": 4.123349906194357e-07, "loss": 0.7538, "step": 1990},
    {"epoch": 0.8873114463176575, "grad_norm": 2.886861570546567, "learning_rate": 3.820857686484908e-07, "loss": 0.7291, "step": 2000},
    {"epoch": 0.8917480035492458, "grad_norm": 2.455668193013591, "learning_rate": 3.5294470298209817e-07, "loss": 0.7799, "step": 2010},
    {"epoch": 0.8961845607808341, "grad_norm": 2.708948960335714, "learning_rate": 3.2491878657292643e-07, "loss": 0.7492, "step": 2020},
    {"epoch": 0.9006211180124224, "grad_norm": 2.5597091034095194, "learning_rate": 2.980147447723775e-07, "loss": 0.7392, "step": 2030},
    {"epoch": 0.9050576752440106, "grad_norm": 2.463909703586001, "learning_rate": 2.72239033716718e-07, "loss": 0.7255, "step": 2040},
    {"epoch": 0.9094942324755989, "grad_norm": 2.714509240372469, "learning_rate": 2.475978387778e-07, "loss": 0.7412, "step": 2050},
    {"epoch": 0.9139307897071872, "grad_norm": 3.0070268485752103, "learning_rate": 2.2409707307877226e-07, "loss": 0.756, "step": 2060},
    {"epoch": 0.9183673469387755, "grad_norm": 2.6554320065677093, "learning_rate": 2.0174237607510138e-07, "loss": 0.7385, "step": 2070},
    {"epoch": 0.9228039041703638, "grad_norm": 2.19585204977499, "learning_rate": 1.805391122012884e-07, "loss": 0.7528, "step": 2080},
    {"epoch": 0.927240461401952, "grad_norm": 2.246232816291186, "learning_rate": 1.6049236958356475e-07, "loss": 0.7765, "step": 2090},
    {"epoch": 0.9316770186335404, "grad_norm": 2.781850208748413, "learning_rate": 1.416069588189045e-07, "loss": 0.7558, "step": 2100},
    {"epoch": 0.9361135758651287, "grad_norm": 3.2985753891715945, "learning_rate": 1.2388741182062348e-07, "loss": 0.7419, "step": 2110},
    {"epoch": 0.940550133096717, "grad_norm": 3.112687270289018, "learning_rate": 1.0733798073086498e-07, "loss": 0.7531, "step": 2120},
    {"epoch": 0.9449866903283053, "grad_norm": 2.5616717115112784, "learning_rate": 9.1962636900218e-08, "loss": 0.7605, "step": 2130},
    {"epoch": 0.9494232475598935, "grad_norm": 2.672849952243094, "learning_rate": 7.776506993471323e-08, "loss": 0.7408, "step": 2140},
    {"epoch": 0.9538598047914818, "grad_norm": 2.9266883416397724, "learning_rate": 6.474868681043578e-08, "loss": 0.7539, "step": 2150},
    {"epoch": 0.9582963620230701, "grad_norm": 2.593151981582472, "learning_rate": 5.291661105595147e-08, "loss": 0.7456, "step": 2160},
    {"epoch": 0.9627329192546584, "grad_norm": 2.7307372540316868, "learning_rate": 4.227168200276077e-08, "loss": 0.7348, "step": 2170},
    {"epoch": 0.9671694764862466, "grad_norm": 2.40736535660914, "learning_rate": 3.2816454103945514e-08, "loss": 0.7683, "step": 2180},
    {"epoch": 0.9716060337178349, "grad_norm": 2.6188698813112774, "learning_rate": 2.455319632118147e-08, "loss": 0.7436, "step": 2190},
    {"epoch": 0.9760425909494232, "grad_norm": 2.5274565265510525, "learning_rate": 1.7483891580253877e-08, "loss": 0.7647, "step": 2200},
    {"epoch": 0.9804791481810116, "grad_norm": 2.144358539554466, "learning_rate": 1.161023629522029e-08, "loss": 0.7297, "step": 2210},
    {"epoch": 0.9849157054125999, "grad_norm": 2.4523679132502316, "learning_rate": 6.933639961322347e-09, "loss": 0.7642, "step": 2220},
    {"epoch": 0.9893522626441881, "grad_norm": 2.9622315500466687, "learning_rate": 3.4552248167507576e-09, "loss": 0.7555, "step": 2230},
    {"epoch": 0.9937888198757764, "grad_norm": 2.652778627419083, "learning_rate": 1.1758255733423928e-09, "loss": 0.7527, "step": 2240},
    {"epoch": 0.9982253771073647, "grad_norm": 2.1355065406932594, "learning_rate": 9.598921627607116e-11, "loss": 0.7696, "step": 2250},
    {"epoch": 1.0, "step": 2254, "total_flos": 3809428622016512.0, "train_loss": 0.811508896611069, "train_runtime": 11825.8532, "train_samples_per_second": 24.397, "train_steps_per_second": 0.191}
  ],
"logging_steps": 10, |
|
"max_steps": 2254, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 10000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 3809428622016512.0, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|