{ "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 500, "global_step": 1350, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.007407407407407408, "grad_norm": 1.5567783117294312, "learning_rate": 1.4814814814814815e-05, "loss": 1.5566, "step": 1 }, { "epoch": 0.037037037037037035, "grad_norm": 1.6362663507461548, "learning_rate": 7.407407407407407e-05, "loss": 1.5121, "step": 5 }, { "epoch": 0.07407407407407407, "grad_norm": 0.9970850944519043, "learning_rate": 0.00014814814814814815, "loss": 1.2449, "step": 10 }, { "epoch": 0.1111111111111111, "grad_norm": 0.688450813293457, "learning_rate": 0.0002222222222222222, "loss": 0.8674, "step": 15 }, { "epoch": 0.14814814814814814, "grad_norm": 0.4350931942462921, "learning_rate": 0.0002962962962962963, "loss": 0.7491, "step": 20 }, { "epoch": 0.18518518518518517, "grad_norm": 0.377754271030426, "learning_rate": 0.00037037037037037035, "loss": 0.6649, "step": 25 }, { "epoch": 0.2222222222222222, "grad_norm": 0.19140222668647766, "learning_rate": 0.0004444444444444444, "loss": 0.6335, "step": 30 }, { "epoch": 0.25925925925925924, "grad_norm": 0.16032004356384277, "learning_rate": 0.0005185185185185185, "loss": 0.6068, "step": 35 }, { "epoch": 0.2962962962962963, "grad_norm": 0.15596596896648407, "learning_rate": 0.0005925925925925926, "loss": 0.5878, "step": 40 }, { "epoch": 0.3333333333333333, "grad_norm": 0.12440178543329239, "learning_rate": 0.0006666666666666666, "loss": 0.564, "step": 45 }, { "epoch": 0.37037037037037035, "grad_norm": 0.13573868572711945, "learning_rate": 0.0007407407407407407, "loss": 0.56, "step": 50 }, { "epoch": 0.4074074074074074, "grad_norm": 0.10565729439258575, "learning_rate": 0.0008148148148148148, "loss": 0.5418, "step": 55 }, { "epoch": 0.4444444444444444, "grad_norm": 0.11647824943065643, "learning_rate": 0.0008888888888888888, "loss": 0.5356, "step": 60 }, { "epoch": 0.48148148148148145, "grad_norm": 0.10235206037759781, "learning_rate": 0.0009629629629629629, "loss": 0.5294, "step": 65 }, { "epoch": 0.5185185185185185, "grad_norm": 0.17172235250473022, "learning_rate": 0.001037037037037037, "loss": 0.5216, "step": 70 }, { "epoch": 0.5555555555555556, "grad_norm": 0.12478260695934296, "learning_rate": 0.0011111111111111111, "loss": 0.5218, "step": 75 }, { "epoch": 0.5925925925925926, "grad_norm": 0.11295782774686813, "learning_rate": 0.0011851851851851852, "loss": 0.5144, "step": 80 }, { "epoch": 0.6296296296296297, "grad_norm": 0.1066703274846077, "learning_rate": 0.0012592592592592592, "loss": 0.5154, "step": 85 }, { "epoch": 0.6666666666666666, "grad_norm": 0.0921861082315445, "learning_rate": 0.0013333333333333333, "loss": 0.5003, "step": 90 }, { "epoch": 0.7037037037037037, "grad_norm": 0.09984983503818512, "learning_rate": 0.0014074074074074076, "loss": 0.5042, "step": 95 }, { "epoch": 0.7407407407407407, "grad_norm": 0.13562288880348206, "learning_rate": 0.0014814814814814814, "loss": 0.5003, "step": 100 }, { "epoch": 0.7777777777777778, "grad_norm": 0.09528646618127823, "learning_rate": 0.0015555555555555557, "loss": 0.4996, "step": 105 }, { "epoch": 0.8148148148148148, "grad_norm": 0.14511393010616302, "learning_rate": 0.0016296296296296295, "loss": 0.4929, "step": 110 }, { "epoch": 0.8518518518518519, "grad_norm": 0.0997464582324028, "learning_rate": 0.0017037037037037038, "loss": 0.4906, "step": 115 }, { "epoch": 0.8888888888888888, "grad_norm": 0.14288371801376343, "learning_rate": 0.0017777777777777776, "loss": 0.4924, "step": 120 }, { "epoch": 0.9259259259259259, "grad_norm": 0.12957949936389923, "learning_rate": 0.001851851851851852, "loss": 0.4879, "step": 125 }, { "epoch": 0.9629629629629629, "grad_norm": 0.10427533835172653, "learning_rate": 0.0019259259259259258, "loss": 0.4856, "step": 130 }, { "epoch": 1.0, "grad_norm": 0.11492197960615158, "learning_rate": 0.002, "loss": 0.4861, "step": 135 }, { "epoch": 1.0, "eval_loss": 1.2494666576385498, "eval_runtime": 1.452, "eval_samples_per_second": 2.755, "eval_steps_per_second": 0.689, "step": 135 }, { "epoch": 1.037037037037037, "grad_norm": 0.20287227630615234, "learning_rate": 0.0019999164298554373, "loss": 0.4759, "step": 140 }, { "epoch": 1.074074074074074, "grad_norm": 0.10903054475784302, "learning_rate": 0.0019996657333896874, "loss": 0.4752, "step": 145 }, { "epoch": 1.1111111111111112, "grad_norm": 0.09817971289157867, "learning_rate": 0.00199924795250423, "loss": 0.4771, "step": 150 }, { "epoch": 1.1481481481481481, "grad_norm": 0.10313747823238373, "learning_rate": 0.001998663157027083, "loss": 0.471, "step": 155 }, { "epoch": 1.1851851851851851, "grad_norm": 0.10586988180875778, "learning_rate": 0.001997911444701132, "loss": 0.4675, "step": 160 }, { "epoch": 1.2222222222222223, "grad_norm": 0.10676765441894531, "learning_rate": 0.001996992941167792, "loss": 0.4743, "step": 165 }, { "epoch": 1.2592592592592593, "grad_norm": 0.09956058114767075, "learning_rate": 0.0019959077999460095, "loss": 0.4682, "step": 170 }, { "epoch": 1.2962962962962963, "grad_norm": 0.08940426260232925, "learning_rate": 0.0019946562024066015, "loss": 0.4667, "step": 175 }, { "epoch": 1.3333333333333333, "grad_norm": 0.09569676965475082, "learning_rate": 0.001993238357741943, "loss": 0.4742, "step": 180 }, { "epoch": 1.3703703703703702, "grad_norm": 0.1325625479221344, "learning_rate": 0.0019916545029310014, "loss": 0.4735, "step": 185 }, { "epoch": 1.4074074074074074, "grad_norm": 0.09514327347278595, "learning_rate": 0.0019899049026997273, "loss": 0.473, "step": 190 }, { "epoch": 1.4444444444444444, "grad_norm": 0.10093410313129425, "learning_rate": 0.001987989849476809, "loss": 0.4693, "step": 195 }, { "epoch": 1.4814814814814814, "grad_norm": 0.10901673138141632, "learning_rate": 0.0019859096633447963, "loss": 0.4666, "step": 200 }, { "epoch": 1.5185185185185186, "grad_norm": 0.10339543968439102, "learning_rate": 0.001983664691986601, "loss": 0.4712, "step": 205 }, { "epoch": 1.5555555555555556, "grad_norm": 0.0933491587638855, "learning_rate": 0.001981255310627385, "loss": 0.4692, "step": 210 }, { "epoch": 1.5925925925925926, "grad_norm": 0.09102772176265717, "learning_rate": 0.0019786819219718443, "loss": 0.4666, "step": 215 }, { "epoch": 1.6296296296296298, "grad_norm": 0.08935283869504929, "learning_rate": 0.0019759449561369035, "loss": 0.4605, "step": 220 }, { "epoch": 1.6666666666666665, "grad_norm": 0.10174907743930817, "learning_rate": 0.0019730448705798237, "loss": 0.465, "step": 225 }, { "epoch": 1.7037037037037037, "grad_norm": 0.09616223722696304, "learning_rate": 0.0019699821500217436, "loss": 0.4684, "step": 230 }, { "epoch": 1.7407407407407407, "grad_norm": 0.09224896878004074, "learning_rate": 0.001966757306366662, "loss": 0.468, "step": 235 }, { "epoch": 1.7777777777777777, "grad_norm": 0.09403350204229355, "learning_rate": 0.0019633708786158804, "loss": 0.4658, "step": 240 }, { "epoch": 1.8148148148148149, "grad_norm": 0.09764115512371063, "learning_rate": 0.001959823432777912, "loss": 0.4682, "step": 245 }, { "epoch": 1.8518518518518519, "grad_norm": 0.08690910041332245, "learning_rate": 0.0019561155617738796, "loss": 0.4679, "step": 250 }, { "epoch": 1.8888888888888888, "grad_norm": 0.10882115364074707, "learning_rate": 0.0019522478853384153, "loss": 0.46, "step": 255 }, { "epoch": 1.925925925925926, "grad_norm": 0.10638313740491867, "learning_rate": 0.0019482210499160765, "loss": 0.4601, "step": 260 }, { "epoch": 1.9629629629629628, "grad_norm": 0.1394616961479187, "learning_rate": 0.0019440357285533, "loss": 0.4649, "step": 265 }, { "epoch": 2.0, "grad_norm": 0.11232441663742065, "learning_rate": 0.0019396926207859084, "loss": 0.458, "step": 270 }, { "epoch": 2.0, "eval_loss": 1.2390342950820923, "eval_runtime": 1.4571, "eval_samples_per_second": 2.745, "eval_steps_per_second": 0.686, "step": 270 }, { "epoch": 2.037037037037037, "grad_norm": 0.11058547347784042, "learning_rate": 0.0019351924525221897, "loss": 0.4336, "step": 275 }, { "epoch": 2.074074074074074, "grad_norm": 0.10051153600215912, "learning_rate": 0.0019305359759215685, "loss": 0.4345, "step": 280 }, { "epoch": 2.111111111111111, "grad_norm": 0.09568148106336594, "learning_rate": 0.0019257239692688907, "loss": 0.4356, "step": 285 }, { "epoch": 2.148148148148148, "grad_norm": 0.09497468173503876, "learning_rate": 0.0019207572368443383, "loss": 0.44, "step": 290 }, { "epoch": 2.185185185185185, "grad_norm": 0.10157410800457001, "learning_rate": 0.001915636608789006, "loss": 0.439, "step": 295 }, { "epoch": 2.2222222222222223, "grad_norm": 0.1098947674036026, "learning_rate": 0.0019103629409661467, "loss": 0.44, "step": 300 }, { "epoch": 2.259259259259259, "grad_norm": 0.09858433902263641, "learning_rate": 0.0019049371148181253, "loss": 0.4466, "step": 305 }, { "epoch": 2.2962962962962963, "grad_norm": 0.09663781523704529, "learning_rate": 0.0018993600372190932, "loss": 0.4408, "step": 310 }, { "epoch": 2.3333333333333335, "grad_norm": 0.100925512611866, "learning_rate": 0.0018936326403234123, "loss": 0.4393, "step": 315 }, { "epoch": 2.3703703703703702, "grad_norm": 0.10806870460510254, "learning_rate": 0.0018877558814098562, "loss": 0.4393, "step": 320 }, { "epoch": 2.4074074074074074, "grad_norm": 0.10387130826711655, "learning_rate": 0.001881730742721608, "loss": 0.4445, "step": 325 }, { "epoch": 2.4444444444444446, "grad_norm": 0.09329431504011154, "learning_rate": 0.0018755582313020908, "loss": 0.439, "step": 330 }, { "epoch": 2.4814814814814814, "grad_norm": 0.09396202117204666, "learning_rate": 0.0018692393788266478, "loss": 0.4379, "step": 335 }, { "epoch": 2.5185185185185186, "grad_norm": 0.10305757075548172, "learning_rate": 0.0018627752414301084, "loss": 0.4412, "step": 340 }, { "epoch": 2.5555555555555554, "grad_norm": 0.1034080982208252, "learning_rate": 0.0018561668995302665, "loss": 0.4445, "step": 345 }, { "epoch": 2.5925925925925926, "grad_norm": 0.09808813035488129, "learning_rate": 0.0018494154576472975, "loss": 0.4454, "step": 350 }, { "epoch": 2.6296296296296298, "grad_norm": 0.0979294404387474, "learning_rate": 0.0018425220442191495, "loss": 0.4498, "step": 355 }, { "epoch": 2.6666666666666665, "grad_norm": 0.09154608845710754, "learning_rate": 0.0018354878114129364, "loss": 0.4416, "step": 360 }, { "epoch": 2.7037037037037037, "grad_norm": 0.0931951180100441, "learning_rate": 0.0018283139349323631, "loss": 0.4439, "step": 365 }, { "epoch": 2.7407407407407405, "grad_norm": 0.09334207326173782, "learning_rate": 0.0018210016138212187, "loss": 0.4443, "step": 370 }, { "epoch": 2.7777777777777777, "grad_norm": 0.092356376349926, "learning_rate": 0.0018135520702629675, "loss": 0.4429, "step": 375 }, { "epoch": 2.814814814814815, "grad_norm": 0.09269876778125763, "learning_rate": 0.0018059665493764742, "loss": 0.4443, "step": 380 }, { "epoch": 2.851851851851852, "grad_norm": 0.10383119434118271, "learning_rate": 0.0017982463190078929, "loss": 0.4436, "step": 385 }, { "epoch": 2.888888888888889, "grad_norm": 0.0941496342420578, "learning_rate": 0.0017903926695187593, "loss": 0.4456, "step": 390 }, { "epoch": 2.925925925925926, "grad_norm": 0.09286876022815704, "learning_rate": 0.0017824069135703197, "loss": 0.445, "step": 395 }, { "epoch": 2.962962962962963, "grad_norm": 0.09152144938707352, "learning_rate": 0.0017742903859041324, "loss": 0.4499, "step": 400 }, { "epoch": 3.0, "grad_norm": 0.09554123133420944, "learning_rate": 0.001766044443118978, "loss": 0.4423, "step": 405 }, { "epoch": 3.0, "eval_loss": 1.2548832893371582, "eval_runtime": 1.4432, "eval_samples_per_second": 2.772, "eval_steps_per_second": 0.693, "step": 405 }, { "epoch": 3.037037037037037, "grad_norm": 0.0980718806385994, "learning_rate": 0.001757670463444118, "loss": 0.4156, "step": 410 }, { "epoch": 3.074074074074074, "grad_norm": 0.09404606372117996, "learning_rate": 0.0017491698465089362, "loss": 0.4121, "step": 415 }, { "epoch": 3.111111111111111, "grad_norm": 0.11583345383405685, "learning_rate": 0.0017405440131090047, "loss": 0.4172, "step": 420 }, { "epoch": 3.148148148148148, "grad_norm": 0.09314367175102234, "learning_rate": 0.0017317944049686123, "loss": 0.4166, "step": 425 }, { "epoch": 3.185185185185185, "grad_norm": 0.1007940024137497, "learning_rate": 0.001722922484499793, "loss": 0.4123, "step": 430 }, { "epoch": 3.2222222222222223, "grad_norm": 0.0978422611951828, "learning_rate": 0.0017139297345578992, "loss": 0.4199, "step": 435 }, { "epoch": 3.259259259259259, "grad_norm": 0.09395431727170944, "learning_rate": 0.0017048176581937562, "loss": 0.4133, "step": 440 }, { "epoch": 3.2962962962962963, "grad_norm": 0.09783247113227844, "learning_rate": 0.0016955877784024418, "loss": 0.4253, "step": 445 }, { "epoch": 3.3333333333333335, "grad_norm": 0.0955362394452095, "learning_rate": 0.0016862416378687337, "loss": 0.4204, "step": 450 }, { "epoch": 3.3703703703703702, "grad_norm": 0.09792539477348328, "learning_rate": 0.001676780798709262, "loss": 0.4232, "step": 455 }, { "epoch": 3.4074074074074074, "grad_norm": 0.09116431325674057, "learning_rate": 0.0016672068422114196, "loss": 0.4211, "step": 460 }, { "epoch": 3.4444444444444446, "grad_norm": 0.1013198271393776, "learning_rate": 0.0016575213685690638, "loss": 0.4202, "step": 465 }, { "epoch": 3.4814814814814814, "grad_norm": 0.09579209238290787, "learning_rate": 0.0016477259966150588, "loss": 0.4268, "step": 470 }, { "epoch": 3.5185185185185186, "grad_norm": 0.09915994852781296, "learning_rate": 0.001637822363550706, "loss": 0.4274, "step": 475 }, { "epoch": 3.5555555555555554, "grad_norm": 0.09255506843328476, "learning_rate": 0.0016278121246720988, "loss": 0.4237, "step": 480 }, { "epoch": 3.5925925925925926, "grad_norm": 0.09609281271696091, "learning_rate": 0.001617696953093457, "loss": 0.4287, "step": 485 }, { "epoch": 3.6296296296296298, "grad_norm": 0.10273302346467972, "learning_rate": 0.0016074785394674836, "loss": 0.4301, "step": 490 }, { "epoch": 3.6666666666666665, "grad_norm": 0.09646812826395035, "learning_rate": 0.0015971585917027862, "loss": 0.4282, "step": 495 }, { "epoch": 3.7037037037037037, "grad_norm": 0.09089499711990356, "learning_rate": 0.001586738834678418, "loss": 0.425, "step": 500 }, { "epoch": 3.7407407407407405, "grad_norm": 0.08903223276138306, "learning_rate": 0.0015762210099555802, "loss": 0.4222, "step": 505 }, { "epoch": 3.7777777777777777, "grad_norm": 0.09616349637508392, "learning_rate": 0.0015656068754865387, "loss": 0.4243, "step": 510 }, { "epoch": 3.814814814814815, "grad_norm": 0.0883878841996193, "learning_rate": 0.001554898205320797, "loss": 0.4278, "step": 515 }, { "epoch": 3.851851851851852, "grad_norm": 0.0974721610546112, "learning_rate": 0.0015440967893085827, "loss": 0.4239, "step": 520 }, { "epoch": 3.888888888888889, "grad_norm": 0.09724871814250946, "learning_rate": 0.0015332044328016914, "loss": 0.4269, "step": 525 }, { "epoch": 3.925925925925926, "grad_norm": 0.09783729910850525, "learning_rate": 0.0015222229563517384, "loss": 0.4289, "step": 530 }, { "epoch": 3.962962962962963, "grad_norm": 0.09811478108167648, "learning_rate": 0.0015111541954058731, "loss": 0.4265, "step": 535 }, { "epoch": 4.0, "grad_norm": 0.09431273490190506, "learning_rate": 0.0015, "loss": 0.4244, "step": 540 }, { "epoch": 4.0, "eval_loss": 1.266510009765625, "eval_runtime": 1.4433, "eval_samples_per_second": 2.771, "eval_steps_per_second": 0.693, "step": 540 }, { "epoch": 4.037037037037037, "grad_norm": 0.09484298527240753, "learning_rate": 0.0014887622344495642, "loss": 0.3886, "step": 545 }, { "epoch": 4.074074074074074, "grad_norm": 0.09763394296169281, "learning_rate": 0.001477442777037949, "loss": 0.3867, "step": 550 }, { "epoch": 4.111111111111111, "grad_norm": 0.09587694704532623, "learning_rate": 0.001466043519702539, "loss": 0.3934, "step": 555 }, { "epoch": 4.148148148148148, "grad_norm": 0.09805364161729813, "learning_rate": 0.0014545663677185006, "loss": 0.3913, "step": 560 }, { "epoch": 4.185185185185185, "grad_norm": 0.09342525154352188, "learning_rate": 0.0014430132393803352, "loss": 0.3962, "step": 565 }, { "epoch": 4.222222222222222, "grad_norm": 0.09320686757564545, "learning_rate": 0.0014313860656812536, "loss": 0.393, "step": 570 }, { "epoch": 4.2592592592592595, "grad_norm": 0.09312278032302856, "learning_rate": 0.001419686789990429, "loss": 0.3956, "step": 575 }, { "epoch": 4.296296296296296, "grad_norm": 0.09693987667560577, "learning_rate": 0.0014079173677281835, "loss": 0.4003, "step": 580 }, { "epoch": 4.333333333333333, "grad_norm": 0.09647519141435623, "learning_rate": 0.001396079766039157, "loss": 0.4026, "step": 585 }, { "epoch": 4.37037037037037, "grad_norm": 0.09753508865833282, "learning_rate": 0.0013841759634635176, "loss": 0.4047, "step": 590 }, { "epoch": 4.407407407407407, "grad_norm": 0.09481139481067657, "learning_rate": 0.00137220794960627, "loss": 0.4028, "step": 595 }, { "epoch": 4.444444444444445, "grad_norm": 0.0947929173707962, "learning_rate": 0.0013601777248047106, "loss": 0.4018, "step": 600 }, { "epoch": 4.481481481481482, "grad_norm": 0.10154122114181519, "learning_rate": 0.0013480872997940906, "loss": 0.4019, "step": 605 }, { "epoch": 4.518518518518518, "grad_norm": 0.09206040948629379, "learning_rate": 0.0013359386953715423, "loss": 0.401, "step": 610 }, { "epoch": 4.555555555555555, "grad_norm": 0.0979878157377243, "learning_rate": 0.0013237339420583212, "loss": 0.4046, "step": 615 }, { "epoch": 4.592592592592593, "grad_norm": 0.09499992430210114, "learning_rate": 0.0013114750797604247, "loss": 0.4092, "step": 620 }, { "epoch": 4.62962962962963, "grad_norm": 0.096099354326725, "learning_rate": 0.0012991641574276419, "loss": 0.4066, "step": 625 }, { "epoch": 4.666666666666667, "grad_norm": 0.09620559960603714, "learning_rate": 0.0012868032327110904, "loss": 0.4075, "step": 630 }, { "epoch": 4.703703703703704, "grad_norm": 0.09676729887723923, "learning_rate": 0.0012743943716193016, "loss": 0.4026, "step": 635 }, { "epoch": 4.7407407407407405, "grad_norm": 0.09284202009439468, "learning_rate": 0.0012619396481729059, "loss": 0.4055, "step": 640 }, { "epoch": 4.777777777777778, "grad_norm": 0.09661979228258133, "learning_rate": 0.0012494411440579815, "loss": 0.4068, "step": 645 }, { "epoch": 4.814814814814815, "grad_norm": 0.09566741436719894, "learning_rate": 0.001236900948278119, "loss": 0.4021, "step": 650 }, { "epoch": 4.851851851851852, "grad_norm": 0.09348838031291962, "learning_rate": 0.0012243211568052678, "loss": 0.4024, "step": 655 }, { "epoch": 4.888888888888889, "grad_norm": 0.09663164615631104, "learning_rate": 0.0012117038722294108, "loss": 0.4062, "step": 660 }, { "epoch": 4.925925925925926, "grad_norm": 0.09402230381965637, "learning_rate": 0.0011990512034071405, "loss": 0.4028, "step": 665 }, { "epoch": 4.962962962962963, "grad_norm": 0.10056190937757492, "learning_rate": 0.0011863652651091822, "loss": 0.4074, "step": 670 }, { "epoch": 5.0, "grad_norm": 0.09207284450531006, "learning_rate": 0.0011736481776669307, "loss": 0.4051, "step": 675 }, { "epoch": 5.0, "eval_loss": 1.271437644958496, "eval_runtime": 1.4527, "eval_samples_per_second": 2.753, "eval_steps_per_second": 0.688, "step": 675 }, { "epoch": 5.037037037037037, "grad_norm": 0.10166580975055695, "learning_rate": 0.0011609020666180573, "loss": 0.369, "step": 680 }, { "epoch": 5.074074074074074, "grad_norm": 0.10062967240810394, "learning_rate": 0.001148129062351249, "loss": 0.3679, "step": 685 }, { "epoch": 5.111111111111111, "grad_norm": 0.09751473367214203, "learning_rate": 0.0011353312997501312, "loss": 0.3679, "step": 690 }, { "epoch": 5.148148148148148, "grad_norm": 0.0990433469414711, "learning_rate": 0.0011225109178364455, "loss": 0.3685, "step": 695 }, { "epoch": 5.185185185185185, "grad_norm": 0.099190853536129, "learning_rate": 0.0011096700594125316, "loss": 0.3736, "step": 700 }, { "epoch": 5.222222222222222, "grad_norm": 0.1061362773180008, "learning_rate": 0.0010968108707031792, "loss": 0.3751, "step": 705 }, { "epoch": 5.2592592592592595, "grad_norm": 0.11798923462629318, "learning_rate": 0.0010839355009969068, "loss": 0.3757, "step": 710 }, { "epoch": 5.296296296296296, "grad_norm": 0.09905427694320679, "learning_rate": 0.00107104610228673, "loss": 0.375, "step": 715 }, { "epoch": 5.333333333333333, "grad_norm": 0.1025015115737915, "learning_rate": 0.0010581448289104759, "loss": 0.3763, "step": 720 }, { "epoch": 5.37037037037037, "grad_norm": 0.1149529442191124, "learning_rate": 0.0010452338371907063, "loss": 0.3882, "step": 725 }, { "epoch": 5.407407407407407, "grad_norm": 0.12777069211006165, "learning_rate": 0.0010323152850743107, "loss": 0.3812, "step": 730 }, { "epoch": 5.444444444444445, "grad_norm": 0.10847526788711548, "learning_rate": 0.0010193913317718243, "loss": 0.3832, "step": 735 }, { "epoch": 5.481481481481482, "grad_norm": 0.11118727922439575, "learning_rate": 0.0010064641373965393, "loss": 0.384, "step": 740 }, { "epoch": 5.518518518518518, "grad_norm": 0.12322206795215607, "learning_rate": 0.0009935358626034607, "loss": 0.3826, "step": 745 }, { "epoch": 5.555555555555555, "grad_norm": 0.1018897145986557, "learning_rate": 0.0009806086682281757, "loss": 0.3828, "step": 750 }, { "epoch": 5.592592592592593, "grad_norm": 0.10415022075176239, "learning_rate": 0.0009676847149256894, "loss": 0.3834, "step": 755 }, { "epoch": 5.62962962962963, "grad_norm": 0.10056640207767487, "learning_rate": 0.0009547661628092937, "loss": 0.379, "step": 760 }, { "epoch": 5.666666666666667, "grad_norm": 0.13305579125881195, "learning_rate": 0.0009418551710895242, "loss": 0.3836, "step": 765 }, { "epoch": 5.703703703703704, "grad_norm": 0.1132049411535263, "learning_rate": 0.0009289538977132702, "loss": 0.3815, "step": 770 }, { "epoch": 5.7407407407407405, "grad_norm": 0.12073713541030884, "learning_rate": 0.000916064499003093, "loss": 0.384, "step": 775 }, { "epoch": 5.777777777777778, "grad_norm": 0.0997130498290062, "learning_rate": 0.000903189129296821, "loss": 0.3836, "step": 780 }, { "epoch": 5.814814814814815, "grad_norm": 0.1098228394985199, "learning_rate": 0.0008903299405874684, "loss": 0.3901, "step": 785 }, { "epoch": 5.851851851851852, "grad_norm": 0.10409754514694214, "learning_rate": 0.0008774890821635547, "loss": 0.3806, "step": 790 }, { "epoch": 5.888888888888889, "grad_norm": 0.09349874407052994, "learning_rate": 0.0008646687002498692, "loss": 0.3821, "step": 795 }, { "epoch": 5.925925925925926, "grad_norm": 0.0936858206987381, "learning_rate": 0.0008518709376487514, "loss": 0.3842, "step": 800 }, { "epoch": 5.962962962962963, "grad_norm": 0.09671808034181595, "learning_rate": 0.0008390979333819426, "loss": 0.3813, "step": 805 }, { "epoch": 6.0, "grad_norm": 0.10419617593288422, "learning_rate": 0.0008263518223330697, "loss": 0.3815, "step": 810 }, { "epoch": 6.0, "eval_loss": 1.2959271669387817, "eval_runtime": 1.4543, "eval_samples_per_second": 2.75, "eval_steps_per_second": 0.688, "step": 810 }, { "epoch": 6.037037037037037, "grad_norm": 0.10344818234443665, "learning_rate": 0.0008136347348908179, "loss": 0.3456, "step": 815 }, { "epoch": 6.074074074074074, "grad_norm": 0.11311297118663788, "learning_rate": 0.0008009487965928596, "loss": 0.3457, "step": 820 }, { "epoch": 6.111111111111111, "grad_norm": 0.1126752719283104, "learning_rate": 0.0007882961277705895, "loss": 0.3446, "step": 825 }, { "epoch": 6.148148148148148, "grad_norm": 0.10734600573778152, "learning_rate": 0.0007756788431947326, "loss": 0.3441, "step": 830 }, { "epoch": 6.185185185185185, "grad_norm": 0.10664735734462738, "learning_rate": 0.0007630990517218807, "loss": 0.3464, "step": 835 }, { "epoch": 6.222222222222222, "grad_norm": 0.10784178227186203, "learning_rate": 0.0007505588559420188, "loss": 0.3484, "step": 840 }, { "epoch": 6.2592592592592595, "grad_norm": 0.09735342860221863, "learning_rate": 0.0007380603518270941, "loss": 0.3473, "step": 845 }, { "epoch": 6.296296296296296, "grad_norm": 0.10385815799236298, "learning_rate": 0.0007256056283806986, "loss": 0.3493, "step": 850 }, { "epoch": 6.333333333333333, "grad_norm": 0.10252544283866882, "learning_rate": 0.0007131967672889101, "loss": 0.3531, "step": 855 }, { "epoch": 6.37037037037037, "grad_norm": 0.10720902681350708, "learning_rate": 0.0007008358425723586, "loss": 0.3543, "step": 860 }, { "epoch": 6.407407407407407, "grad_norm": 0.1041482537984848, "learning_rate": 0.0006885249202395753, "loss": 0.3508, "step": 865 }, { "epoch": 6.444444444444445, "grad_norm": 0.10373668372631073, "learning_rate": 0.000676266057941679, "loss": 0.3534, "step": 870 }, { "epoch": 6.481481481481482, "grad_norm": 0.10245037823915482, "learning_rate": 0.0006640613046284581, "loss": 0.3521, "step": 875 }, { "epoch": 6.518518518518518, "grad_norm": 0.10254081338644028, "learning_rate": 0.0006519127002059096, "loss": 0.3538, "step": 880 }, { "epoch": 6.555555555555555, "grad_norm": 0.10260630398988724, "learning_rate": 0.0006398222751952898, "loss": 0.3537, "step": 885 }, { "epoch": 6.592592592592593, "grad_norm": 0.10130611807107925, "learning_rate": 0.0006277920503937303, "loss": 0.3515, "step": 890 }, { "epoch": 6.62962962962963, "grad_norm": 0.10895968973636627, "learning_rate": 0.0006158240365364823, "loss": 0.3578, "step": 895 }, { "epoch": 6.666666666666667, "grad_norm": 0.09870075434446335, "learning_rate": 0.0006039202339608432, "loss": 0.3512, "step": 900 }, { "epoch": 6.703703703703704, "grad_norm": 0.10364022850990295, "learning_rate": 0.0005920826322718165, "loss": 0.3537, "step": 905 }, { "epoch": 6.7407407407407405, "grad_norm": 0.10551033914089203, "learning_rate": 0.000580313210009571, "loss": 0.3566, "step": 910 }, { "epoch": 6.777777777777778, "grad_norm": 0.10657215118408203, "learning_rate": 0.0005686139343187468, "loss": 0.3552, "step": 915 }, { "epoch": 6.814814814814815, "grad_norm": 0.10223071277141571, "learning_rate": 0.0005569867606196651, "loss": 0.3584, "step": 920 }, { "epoch": 6.851851851851852, "grad_norm": 0.10146836936473846, "learning_rate": 0.0005454336322814994, "loss": 0.3535, "step": 925 }, { "epoch": 6.888888888888889, "grad_norm": 0.1027546152472496, "learning_rate": 0.0005339564802974615, "loss": 0.3501, "step": 930 }, { "epoch": 6.925925925925926, "grad_norm": 0.10245343297719955, "learning_rate": 0.0005225572229620509, "loss": 0.3512, "step": 935 }, { "epoch": 6.962962962962963, "grad_norm": 0.10256995260715485, "learning_rate": 0.0005112377655504359, "loss": 0.3543, "step": 940 }, { "epoch": 7.0, "grad_norm": 0.10365499556064606, "learning_rate": 0.0005000000000000002, "loss": 0.3546, "step": 945 }, { "epoch": 7.0, "eval_loss": 1.3559757471084595, "eval_runtime": 1.4533, "eval_samples_per_second": 2.752, "eval_steps_per_second": 0.688, "step": 945 }, { "epoch": 7.037037037037037, "grad_norm": 0.11704354733228683, "learning_rate": 0.0004888458045941269, "loss": 0.32, "step": 950 }, { "epoch": 7.074074074074074, "grad_norm": 0.11646736413240433, "learning_rate": 0.0004777770436482617, "loss": 0.3177, "step": 955 }, { "epoch": 7.111111111111111, "grad_norm": 0.11117232590913773, "learning_rate": 0.000466795567198309, "loss": 0.3144, "step": 960 }, { "epoch": 7.148148148148148, "grad_norm": 0.10937786102294922, "learning_rate": 0.0004559032106914173, "loss": 0.3212, "step": 965 }, { "epoch": 7.185185185185185, "grad_norm": 0.11477228999137878, "learning_rate": 0.0004451017946792032, "loss": 0.3219, "step": 970 }, { "epoch": 7.222222222222222, "grad_norm": 0.10524013638496399, "learning_rate": 0.0004343931245134616, "loss": 0.3167, "step": 975 }, { "epoch": 7.2592592592592595, "grad_norm": 0.10898813605308533, "learning_rate": 0.0004237789900444197, "loss": 0.3204, "step": 980 }, { "epoch": 7.296296296296296, "grad_norm": 0.11245737224817276, "learning_rate": 0.0004132611653215822, "loss": 0.3216, "step": 985 }, { "epoch": 7.333333333333333, "grad_norm": 0.1109575405716896, "learning_rate": 0.00040284140829721405, "loss": 0.3211, "step": 990 }, { "epoch": 7.37037037037037, "grad_norm": 0.10827223211526871, "learning_rate": 0.00039252146053251637, "loss": 0.3224, "step": 995 }, { "epoch": 7.407407407407407, "grad_norm": 0.11438044160604477, "learning_rate": 0.00038230304690654306, "loss": 0.3243, "step": 1000 }, { "epoch": 7.444444444444445, "grad_norm": 0.1066223755478859, "learning_rate": 0.00037218787532790164, "loss": 0.3253, "step": 1005 }, { "epoch": 7.481481481481482, "grad_norm": 0.11840826272964478, "learning_rate": 0.0003621776364492939, "loss": 0.327, "step": 1010 }, { "epoch": 7.518518518518518, "grad_norm": 0.1115390807390213, "learning_rate": 0.0003522740033849411, "loss": 0.3261, "step": 1015 }, { "epoch": 7.555555555555555, "grad_norm": 0.1104753240942955, "learning_rate": 0.0003424786314309365, "loss": 0.3225, "step": 1020 }, { "epoch": 7.592592592592593, "grad_norm": 0.10727293789386749, "learning_rate": 0.00033279315778858033, "loss": 0.3237, "step": 1025 }, { "epoch": 7.62962962962963, "grad_norm": 0.11141736060380936, "learning_rate": 0.0003232192012907381, "loss": 0.3237, "step": 1030 }, { "epoch": 7.666666666666667, "grad_norm": 0.10661887377500534, "learning_rate": 0.0003137583621312665, "loss": 0.3257, "step": 1035 }, { "epoch": 7.703703703703704, "grad_norm": 0.11351703852415085, "learning_rate": 0.000304412221597558, "loss": 0.3236, "step": 1040 }, { "epoch": 7.7407407407407405, "grad_norm": 0.10875287652015686, "learning_rate": 0.0002951823418062439, "loss": 0.3268, "step": 1045 }, { "epoch": 7.777777777777778, "grad_norm": 0.1116643026471138, "learning_rate": 0.0002860702654421011, "loss": 0.3239, "step": 1050 }, { "epoch": 7.814814814814815, "grad_norm": 0.10941953957080841, "learning_rate": 0.0002770775155002071, "loss": 0.3257, "step": 1055 }, { "epoch": 7.851851851851852, "grad_norm": 0.10806366801261902, "learning_rate": 0.00026820559503138797, "loss": 0.3235, "step": 1060 }, { "epoch": 7.888888888888889, "grad_norm": 0.1069023609161377, "learning_rate": 0.0002594559868909956, "loss": 0.3293, "step": 1065 }, { "epoch": 7.925925925925926, "grad_norm": 0.10821282863616943, "learning_rate": 0.000250830153491064, "loss": 0.3267, "step": 1070 }, { "epoch": 7.962962962962963, "grad_norm": 0.1114460751414299, "learning_rate": 0.00024232953655588209, "loss": 0.3225, "step": 1075 }, { "epoch": 8.0, "grad_norm": 0.1069159284234047, "learning_rate": 0.0002339555568810221, "loss": 0.3233, "step": 1080 }, { "epoch": 8.0, "eval_loss": 1.4125341176986694, "eval_runtime": 1.4532, "eval_samples_per_second": 2.753, "eval_steps_per_second": 0.688, "step": 1080 }, { "epoch": 8.037037037037036, "grad_norm": 0.13299891352653503, "learning_rate": 0.00022570961409586754, "loss": 0.2956, "step": 1085 }, { "epoch": 8.074074074074074, "grad_norm": 0.11145825684070587, "learning_rate": 0.00021759308642968023, "loss": 0.2952, "step": 1090 }, { "epoch": 8.11111111111111, "grad_norm": 0.11993227154016495, "learning_rate": 0.00020960733048124082, "loss": 0.2951, "step": 1095 }, { "epoch": 8.148148148148149, "grad_norm": 0.11672375351190567, "learning_rate": 0.00020175368099210702, "loss": 0.2961, "step": 1100 }, { "epoch": 8.185185185185185, "grad_norm": 0.11903874576091766, "learning_rate": 0.00019403345062352572, "loss": 0.2981, "step": 1105 }, { "epoch": 8.222222222222221, "grad_norm": 0.11281085014343262, "learning_rate": 0.00018644792973703252, "loss": 0.2983, "step": 1110 }, { "epoch": 8.25925925925926, "grad_norm": 0.11996857821941376, "learning_rate": 0.00017899838617878162, "loss": 0.2993, "step": 1115 }, { "epoch": 8.296296296296296, "grad_norm": 0.11176000535488129, "learning_rate": 0.00017168606506763696, "loss": 0.2972, "step": 1120 }, { "epoch": 8.333333333333334, "grad_norm": 0.11802016943693161, "learning_rate": 0.00016451218858706373, "loss": 0.2977, "step": 1125 }, { "epoch": 8.37037037037037, "grad_norm": 0.11810103803873062, "learning_rate": 0.00015747795578085046, "loss": 0.3002, "step": 1130 }, { "epoch": 8.407407407407407, "grad_norm": 0.12034923583269119, "learning_rate": 0.0001505845423527027, "loss": 0.2961, "step": 1135 }, { "epoch": 8.444444444444445, "grad_norm": 0.11696241796016693, "learning_rate": 0.00014383310046973364, "loss": 0.2977, "step": 1140 }, { "epoch": 8.481481481481481, "grad_norm": 0.11664384603500366, "learning_rate": 0.00013722475856989158, "loss": 0.2952, "step": 1145 }, { "epoch": 8.518518518518519, "grad_norm": 0.117145836353302, "learning_rate": 0.00013076062117335218, "loss": 0.3016, "step": 1150 }, { "epoch": 8.555555555555555, "grad_norm": 0.11361709982156754, "learning_rate": 0.00012444176869790924, "loss": 0.2988, "step": 1155 }, { "epoch": 8.592592592592592, "grad_norm": 0.1155712679028511, "learning_rate": 0.00011826925727839199, "loss": 0.2977, "step": 1160 }, { "epoch": 8.62962962962963, "grad_norm": 0.1212184950709343, "learning_rate": 0.00011224411859014417, "loss": 0.298, "step": 1165 }, { "epoch": 8.666666666666666, "grad_norm": 0.11797547340393066, "learning_rate": 0.00010636735967658784, "loss": 0.3007, "step": 1170 }, { "epoch": 8.703703703703704, "grad_norm": 0.11559978872537613, "learning_rate": 0.00010063996278090704, "loss": 0.2958, "step": 1175 }, { "epoch": 8.74074074074074, "grad_norm": 0.11640750616788864, "learning_rate": 9.506288518187466e-05, "loss": 0.2966, "step": 1180 }, { "epoch": 8.777777777777779, "grad_norm": 0.11927841603755951, "learning_rate": 8.963705903385344e-05, "loss": 0.2953, "step": 1185 }, { "epoch": 8.814814814814815, "grad_norm": 0.11425163596868515, "learning_rate": 8.436339121099412e-05, "loss": 0.2989, "step": 1190 }, { "epoch": 8.851851851851851, "grad_norm": 0.11373434215784073, "learning_rate": 7.92427631556617e-05, "loss": 0.2984, "step": 1195 }, { "epoch": 8.88888888888889, "grad_norm": 0.11790241301059723, "learning_rate": 7.427603073110967e-05, "loss": 0.2972, "step": 1200 }, { "epoch": 8.925925925925926, "grad_norm": 0.11773653328418732, "learning_rate": 6.946402407843155e-05, "loss": 0.3012, "step": 1205 }, { "epoch": 8.962962962962964, "grad_norm": 0.12333875894546509, "learning_rate": 6.480754747781037e-05, "loss": 0.2959, "step": 1210 }, { "epoch": 9.0, "grad_norm": 0.11808749288320541, "learning_rate": 6.0307379214091684e-05, "loss": 0.2969, "step": 1215 }, { "epoch": 9.0, "eval_loss": 1.4809459447860718, "eval_runtime": 1.4606, "eval_samples_per_second": 2.739, "eval_steps_per_second": 0.685, "step": 1215 }, { "epoch": 9.037037037037036, "grad_norm": 0.1122315376996994, "learning_rate": 5.596427144670002e-05, "loss": 0.2797, "step": 1220 }, { "epoch": 9.074074074074074, "grad_norm": 0.12401442974805832, "learning_rate": 5.1778950083923526e-05, "loss": 0.2823, "step": 1225 }, { "epoch": 9.11111111111111, "grad_norm": 0.11285774409770966, "learning_rate": 4.775211466158469e-05, "loss": 0.2825, "step": 1230 }, { "epoch": 9.148148148148149, "grad_norm": 0.11427191644906998, "learning_rate": 4.3884438226120426e-05, "loss": 0.2815, "step": 1235 }, { "epoch": 9.185185185185185, "grad_norm": 0.11915598809719086, "learning_rate": 4.017656722208807e-05, "loss": 0.2806, "step": 1240 }, { "epoch": 9.222222222222221, "grad_norm": 0.11690084636211395, "learning_rate": 3.6629121384119666e-05, "loss": 0.2811, "step": 1245 }, { "epoch": 9.25925925925926, "grad_norm": 0.11301770061254501, "learning_rate": 3.324269363333799e-05, "loss": 0.2811, "step": 1250 }, { "epoch": 9.296296296296296, "grad_norm": 0.1201760470867157, "learning_rate": 3.0017849978256518e-05, "loss": 0.2826, "step": 1255 }, { "epoch": 9.333333333333334, "grad_norm": 0.11819873005151749, "learning_rate": 2.6955129420176194e-05, "loss": 0.2787, "step": 1260 }, { "epoch": 9.37037037037037, "grad_norm": 0.11501555889844894, "learning_rate": 2.4055043863096426e-05, "loss": 0.2803, "step": 1265 }, { "epoch": 9.407407407407407, "grad_norm": 0.11644010990858078, "learning_rate": 2.1318078028155885e-05, "loss": 0.2823, "step": 1270 }, { "epoch": 9.444444444444445, "grad_norm": 0.11667327582836151, "learning_rate": 1.874468937261531e-05, "loss": 0.2836, "step": 1275 }, { "epoch": 9.481481481481481, "grad_norm": 0.11722666025161743, "learning_rate": 1.6335308013398887e-05, "loss": 0.2837, "step": 1280 }, { "epoch": 9.518518518518519, "grad_norm": 0.11853759735822678, "learning_rate": 1.4090336655203539e-05, "loss": 0.2816, "step": 1285 }, { "epoch": 9.555555555555555, "grad_norm": 0.11931838095188141, "learning_rate": 1.2010150523190988e-05, "loss": 0.2831, "step": 1290 }, { "epoch": 9.592592592592592, "grad_norm": 0.11536389589309692, "learning_rate": 1.0095097300273027e-05, "loss": 0.2824, "step": 1295 }, { "epoch": 9.62962962962963, "grad_norm": 0.11747179180383682, "learning_rate": 8.345497068998897e-06, "loss": 0.2849, "step": 1300 }, { "epoch": 9.666666666666666, "grad_norm": 0.12529109418392181, "learning_rate": 6.761642258056977e-06, "loss": 0.2841, "step": 1305 }, { "epoch": 9.703703703703704, "grad_norm": 0.11867683380842209, "learning_rate": 5.343797593398536e-06, "loss": 0.2822, "step": 1310 }, { "epoch": 9.74074074074074, "grad_norm": 0.11928539723157883, "learning_rate": 4.092200053990691e-06, "loss": 0.2809, "step": 1315 }, { "epoch": 9.777777777777779, "grad_norm": 0.11743851751089096, "learning_rate": 3.007058832207976e-06, "loss": 0.2819, "step": 1320 }, { "epoch": 9.814814814814815, "grad_norm": 0.11580769717693329, "learning_rate": 2.088555298867978e-06, "loss": 0.2819, "step": 1325 }, { "epoch": 9.851851851851851, "grad_norm": 0.11537632346153259, "learning_rate": 1.3368429729168074e-06, "loss": 0.2814, "step": 1330 }, { "epoch": 9.88888888888889, "grad_norm": 0.1173846423625946, "learning_rate": 7.520474957699585e-07, "loss": 0.2832, "step": 1335 }, { "epoch": 9.925925925925926, "grad_norm": 0.11481310427188873, "learning_rate": 3.3426661031255026e-07, "loss": 0.285, "step": 1340 }, { "epoch": 9.962962962962964, "grad_norm": 0.11797128617763519, "learning_rate": 8.357014456272793e-08, "loss": 0.2824, "step": 1345 }, { "epoch": 10.0, "grad_norm": 0.11149542033672333, "learning_rate": 0.0, "loss": 0.2818, "step": 1350 }, { "epoch": 10.0, "eval_loss": 1.517443060874939, "eval_runtime": 1.4076, "eval_samples_per_second": 2.842, "eval_steps_per_second": 0.71, "step": 1350 }, { "epoch": 10.0, "step": 1350, "total_flos": 7.982333171800736e+18, "train_loss": 0.3983345487382677, "train_runtime": 11606.8402, "train_samples_per_second": 14.865, "train_steps_per_second": 0.116 } ], "logging_steps": 5, "max_steps": 1350, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 7.982333171800736e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }