diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,5944 @@ +{ + "best_metric": 2.4195876121520996, + "best_model_checkpoint": "./output/training_results/C016_random_sample_llama3-8b-base_pretrain_20240504_181744/checkpoint-1000", + "epoch": 4.0, + "eval_steps": 200, + "global_step": 4108, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0009737098344693282, + "grad_norm": 4.622597308389442, + "learning_rate": 7.5e-07, + "loss": 2.7686, + "step": 1 + }, + { + "epoch": 0.004868549172346641, + "grad_norm": 2.9010810501364586, + "learning_rate": 3e-06, + "loss": 2.7274, + "step": 5 + }, + { + "epoch": 0.009737098344693282, + "grad_norm": 2.6032598047562, + "learning_rate": 6e-06, + "loss": 2.667, + "step": 10 + }, + { + "epoch": 0.014605647517039922, + "grad_norm": 2.570724795196223, + "learning_rate": 9.75e-06, + "loss": 2.6551, + "step": 15 + }, + { + "epoch": 0.019474196689386564, + "grad_norm": 2.968267308985469, + "learning_rate": 1.3500000000000001e-05, + "loss": 2.643, + "step": 20 + }, + { + "epoch": 0.024342745861733205, + "grad_norm": 2.8739885340338285, + "learning_rate": 1.487975935855361e-05, + "loss": 2.6294, + "step": 25 + }, + { + "epoch": 0.029211295034079845, + "grad_norm": 3.023512383227986, + "learning_rate": 1.4681310547763171e-05, + "loss": 2.627, + "step": 30 + }, + { + "epoch": 0.034079844206426485, + "grad_norm": 2.528499970351483, + "learning_rate": 1.4485278864193794e-05, + "loss": 2.5955, + "step": 35 + }, + { + "epoch": 0.03894839337877313, + "grad_norm": 2.547095976749082, + "learning_rate": 1.4291637778717775e-05, + "loss": 2.6183, + "step": 40 + }, + { + "epoch": 0.043816942551119765, + "grad_norm": 1.9843941555908906, + "learning_rate": 1.4100361021341427e-05, + "loss": 2.5925, + "step": 45 + }, + { + "epoch": 0.04868549172346641, + "grad_norm": 2.0711055751945815, + "learning_rate": 1.3911422578987613e-05, + "loss": 2.5762, + "step": 50 + }, + { + "epoch": 0.053554040895813046, + "grad_norm": 2.1149009622981736, + "learning_rate": 1.3724796693294575e-05, + "loss": 2.6115, + "step": 55 + }, + { + "epoch": 0.05842259006815969, + "grad_norm": 2.125830374210055, + "learning_rate": 1.3540457858430796e-05, + "loss": 2.6139, + "step": 60 + }, + { + "epoch": 0.06329113924050633, + "grad_norm": 1.9410826947795146, + "learning_rate": 1.3394616487103794e-05, + "loss": 2.5748, + "step": 65 + }, + { + "epoch": 0.06815968841285297, + "grad_norm": 2.1512046862388146, + "learning_rate": 1.3214330866815837e-05, + "loss": 2.6165, + "step": 70 + }, + { + "epoch": 0.0730282375851996, + "grad_norm": 2.0456636080847157, + "learning_rate": 1.3036262206089142e-05, + "loss": 2.529, + "step": 75 + }, + { + "epoch": 0.07789678675754626, + "grad_norm": 2.232330041514811, + "learning_rate": 1.2860385939062166e-05, + "loss": 2.6501, + "step": 80 + }, + { + "epoch": 0.0827653359298929, + "grad_norm": 2.415240950467998, + "learning_rate": 1.2686677742137694e-05, + "loss": 2.5821, + "step": 85 + }, + { + "epoch": 0.08763388510223953, + "grad_norm": 2.4124279122793375, + "learning_rate": 1.2515113531889656e-05, + "loss": 2.5964, + "step": 90 + }, + { + "epoch": 0.09250243427458617, + "grad_norm": 2.2985857393283364, + "learning_rate": 1.2345669462985584e-05, + "loss": 2.6227, + "step": 95 + }, + { + "epoch": 0.09737098344693282, + "grad_norm": 1.984056163436905, + "learning_rate": 1.2178321926124443e-05, + "loss": 2.5646, + "step": 
100 + }, + { + "epoch": 0.10223953261927946, + "grad_norm": 1.9469124194664196, + "learning_rate": 1.201304754598983e-05, + "loss": 2.5677, + "step": 105 + }, + { + "epoch": 0.10710808179162609, + "grad_norm": 2.000422025073212, + "learning_rate": 1.1849823179218302e-05, + "loss": 2.6047, + "step": 110 + }, + { + "epoch": 0.11197663096397274, + "grad_norm": 1.8457596949821262, + "learning_rate": 1.168862591238295e-05, + "loss": 2.6201, + "step": 115 + }, + { + "epoch": 0.11684518013631938, + "grad_norm": 1.8876145847065753, + "learning_rate": 1.1529433059992029e-05, + "loss": 2.5327, + "step": 120 + }, + { + "epoch": 0.12171372930866602, + "grad_norm": 1.8657408134698463, + "learning_rate": 1.1372222162502398e-05, + "loss": 2.5759, + "step": 125 + }, + { + "epoch": 0.12658227848101267, + "grad_norm": 1.9689520876834024, + "learning_rate": 1.1216970984347951e-05, + "loss": 2.5822, + "step": 130 + }, + { + "epoch": 0.1314508276533593, + "grad_norm": 1.8787257890744393, + "learning_rate": 1.1063657511982824e-05, + "loss": 2.5703, + "step": 135 + }, + { + "epoch": 0.13631937682570594, + "grad_norm": 2.3946720962934673, + "learning_rate": 1.0912259951939131e-05, + "loss": 2.5048, + "step": 140 + }, + { + "epoch": 0.14118792599805258, + "grad_norm": 2.069484826525082, + "learning_rate": 1.0762756728899413e-05, + "loss": 2.5433, + "step": 145 + }, + { + "epoch": 0.1460564751703992, + "grad_norm": 1.9728419903336738, + "learning_rate": 1.0615126483783578e-05, + "loss": 2.5451, + "step": 150 + }, + { + "epoch": 0.15092502434274585, + "grad_norm": 1.8965209810159926, + "learning_rate": 1.0469348071850116e-05, + "loss": 2.5761, + "step": 155 + }, + { + "epoch": 0.15579357351509251, + "grad_norm": 1.9728913609239898, + "learning_rate": 1.0325400560811745e-05, + "loss": 2.5509, + "step": 160 + }, + { + "epoch": 0.16066212268743915, + "grad_norm": 1.7695814333016529, + "learning_rate": 1.0183263228965269e-05, + "loss": 2.5695, + "step": 165 + }, + { + "epoch": 0.1655306718597858, + "grad_norm": 1.9109787480384295, + "learning_rate": 1.0042915563335448e-05, + "loss": 2.5843, + "step": 170 + }, + { + "epoch": 0.17039922103213243, + "grad_norm": 1.8950613853479759, + "learning_rate": 9.904337257833037e-06, + "loss": 2.5447, + "step": 175 + }, + { + "epoch": 0.17526777020447906, + "grad_norm": 1.9462634859013892, + "learning_rate": 9.767508211426758e-06, + "loss": 2.5236, + "step": 180 + }, + { + "epoch": 0.1801363193768257, + "grad_norm": 1.9007929646893995, + "learning_rate": 9.632408526329031e-06, + "loss": 2.5511, + "step": 185 + }, + { + "epoch": 0.18500486854917234, + "grad_norm": 1.9203529678658229, + "learning_rate": 9.4990185061956e-06, + "loss": 2.5294, + "step": 190 + }, + { + "epoch": 0.189873417721519, + "grad_norm": 1.8492777850666573, + "learning_rate": 9.367318654338783e-06, + "loss": 2.5313, + "step": 195 + }, + { + "epoch": 0.19474196689386564, + "grad_norm": 2.053277986812429, + "learning_rate": 9.23728967195438e-06, + "loss": 2.5472, + "step": 200 + }, + { + "epoch": 0.19474196689386564, + "eval_loss": 2.526153564453125, + "eval_runtime": 86.8653, + "eval_samples_per_second": 84.027, + "eval_steps_per_second": 0.668, + "step": 200 + }, + { + "epoch": 0.19961051606621227, + "grad_norm": 1.8721064281522946, + "learning_rate": 9.108912456361987e-06, + "loss": 2.5221, + "step": 205 + }, + { + "epoch": 0.2044790652385589, + "grad_norm": 2.0199844942072707, + "learning_rate": 8.982168099258849e-06, + "loss": 2.5093, + "step": 210 + }, + { + "epoch": 0.20934761441090555, + "grad_norm": 
1.7890996151627292, + "learning_rate": 8.857037884987036e-06, + "loss": 2.5197, + "step": 215 + }, + { + "epoch": 0.21421616358325218, + "grad_norm": 1.8519229946590852, + "learning_rate": 8.733503288813807e-06, + "loss": 2.4945, + "step": 220 + }, + { + "epoch": 0.21908471275559882, + "grad_norm": 1.7635414105359053, + "learning_rate": 8.611545975225206e-06, + "loss": 2.508, + "step": 225 + }, + { + "epoch": 0.22395326192794549, + "grad_norm": 1.8628950690860595, + "learning_rate": 8.491147796232788e-06, + "loss": 2.5093, + "step": 230 + }, + { + "epoch": 0.22882181110029212, + "grad_norm": 1.8308684769690282, + "learning_rate": 8.372290789693232e-06, + "loss": 2.4912, + "step": 235 + }, + { + "epoch": 0.23369036027263876, + "grad_norm": 1.8771495332116028, + "learning_rate": 8.254957177640966e-06, + "loss": 2.5112, + "step": 240 + }, + { + "epoch": 0.2385589094449854, + "grad_norm": 1.848114881393312, + "learning_rate": 8.13912936463366e-06, + "loss": 2.5004, + "step": 245 + }, + { + "epoch": 0.24342745861733203, + "grad_norm": 1.8532877282661944, + "learning_rate": 8.024789936110332e-06, + "loss": 2.5063, + "step": 250 + }, + { + "epoch": 0.24829600778967867, + "grad_norm": 1.8072290437694474, + "learning_rate": 7.911921656762299e-06, + "loss": 2.4812, + "step": 255 + }, + { + "epoch": 0.25316455696202533, + "grad_norm": 1.7086014592127396, + "learning_rate": 7.800507468916596e-06, + "loss": 2.5107, + "step": 260 + }, + { + "epoch": 0.25803310613437197, + "grad_norm": 1.713632600119656, + "learning_rate": 7.690530490932004e-06, + "loss": 2.4908, + "step": 265 + }, + { + "epoch": 0.2629016553067186, + "grad_norm": 1.7625004775631927, + "learning_rate": 7.581974015607404e-06, + "loss": 2.5364, + "step": 270 + }, + { + "epoch": 0.26777020447906524, + "grad_norm": 1.73225200198116, + "learning_rate": 7.474821508602552e-06, + "loss": 2.4639, + "step": 275 + }, + { + "epoch": 0.2726387536514119, + "grad_norm": 1.7849989992249207, + "learning_rate": 7.369056606871157e-06, + "loss": 2.5102, + "step": 280 + }, + { + "epoch": 0.2775073028237585, + "grad_norm": 1.8370989545866505, + "learning_rate": 7.2646631171060325e-06, + "loss": 2.4668, + "step": 285 + }, + { + "epoch": 0.28237585199610515, + "grad_norm": 1.7638888058771318, + "learning_rate": 7.161625014196459e-06, + "loss": 2.4778, + "step": 290 + }, + { + "epoch": 0.2872444011684518, + "grad_norm": 1.840487665783078, + "learning_rate": 7.059926439697582e-06, + "loss": 2.4949, + "step": 295 + }, + { + "epoch": 0.2921129503407984, + "grad_norm": 1.7654501649580268, + "learning_rate": 6.959551700311679e-06, + "loss": 2.5254, + "step": 300 + }, + { + "epoch": 0.29698149951314506, + "grad_norm": 1.9353359489811188, + "learning_rate": 6.860485266381383e-06, + "loss": 2.4986, + "step": 305 + }, + { + "epoch": 0.3018500486854917, + "grad_norm": 1.9617969898380334, + "learning_rate": 6.7627117703947036e-06, + "loss": 2.5186, + "step": 310 + }, + { + "epoch": 0.30671859785783834, + "grad_norm": 1.702196140955499, + "learning_rate": 6.6662160055017e-06, + "loss": 2.4988, + "step": 315 + }, + { + "epoch": 0.31158714703018503, + "grad_norm": 1.6844416023733326, + "learning_rate": 6.570982924042871e-06, + "loss": 2.5118, + "step": 320 + }, + { + "epoch": 0.31645569620253167, + "grad_norm": 1.7294022438189514, + "learning_rate": 6.476997636089112e-06, + "loss": 2.5248, + "step": 325 + }, + { + "epoch": 0.3213242453748783, + "grad_norm": 2.0648141831567983, + "learning_rate": 6.384245407993103e-06, + "loss": 2.5276, + "step": 330 + }, + { + "epoch": 
0.32619279454722494, + "grad_norm": 1.728780446047325, + "learning_rate": 6.292711660952165e-06, + "loss": 2.4661, + "step": 335 + }, + { + "epoch": 0.3310613437195716, + "grad_norm": 1.7061810945106262, + "learning_rate": 6.202381969582487e-06, + "loss": 2.4577, + "step": 340 + }, + { + "epoch": 0.3359298928919182, + "grad_norm": 1.6870860598605661, + "learning_rate": 6.11324206050453e-06, + "loss": 2.4968, + "step": 345 + }, + { + "epoch": 0.34079844206426485, + "grad_norm": 1.7294693647713457, + "learning_rate": 6.025277810939709e-06, + "loss": 2.4665, + "step": 350 + }, + { + "epoch": 0.3456669912366115, + "grad_norm": 1.771451497626332, + "learning_rate": 5.9384752473181495e-06, + "loss": 2.4601, + "step": 355 + }, + { + "epoch": 0.3505355404089581, + "grad_norm": 1.7336392065891828, + "learning_rate": 5.8528205438975404e-06, + "loss": 2.4733, + "step": 360 + }, + { + "epoch": 0.35540408958130476, + "grad_norm": 1.751587409627844, + "learning_rate": 5.768300021392871e-06, + "loss": 2.4184, + "step": 365 + }, + { + "epoch": 0.3602726387536514, + "grad_norm": 1.735366924052735, + "learning_rate": 5.684900145617154e-06, + "loss": 2.4582, + "step": 370 + }, + { + "epoch": 0.36514118792599803, + "grad_norm": 1.8313267360344203, + "learning_rate": 5.6026075261329595e-06, + "loss": 2.4746, + "step": 375 + }, + { + "epoch": 0.37000973709834467, + "grad_norm": 1.8199109237326785, + "learning_rate": 5.521408914914653e-06, + "loss": 2.4395, + "step": 380 + }, + { + "epoch": 0.3748782862706913, + "grad_norm": 1.655319976322653, + "learning_rate": 5.441291205021376e-06, + "loss": 2.4805, + "step": 385 + }, + { + "epoch": 0.379746835443038, + "grad_norm": 1.6994219665911234, + "learning_rate": 5.3622414292806385e-06, + "loss": 2.4505, + "step": 390 + }, + { + "epoch": 0.38461538461538464, + "grad_norm": 1.7131089370429131, + "learning_rate": 5.2842467589823945e-06, + "loss": 2.5166, + "step": 395 + }, + { + "epoch": 0.3894839337877313, + "grad_norm": 1.7028942553739241, + "learning_rate": 5.207294502583657e-06, + "loss": 2.4431, + "step": 400 + }, + { + "epoch": 0.3894839337877313, + "eval_loss": 2.4732775688171387, + "eval_runtime": 85.4562, + "eval_samples_per_second": 85.412, + "eval_steps_per_second": 0.679, + "step": 400 + }, + { + "epoch": 0.3943524829600779, + "grad_norm": 1.772274911090255, + "learning_rate": 5.131372104423501e-06, + "loss": 2.5029, + "step": 405 + }, + { + "epoch": 0.39922103213242455, + "grad_norm": 1.6531175372154208, + "learning_rate": 5.0564671434483495e-06, + "loss": 2.5081, + "step": 410 + }, + { + "epoch": 0.4040895813047712, + "grad_norm": 1.649280502586864, + "learning_rate": 4.9825673319475865e-06, + "loss": 2.4378, + "step": 415 + }, + { + "epoch": 0.4089581304771178, + "grad_norm": 1.8019468764704318, + "learning_rate": 4.909660514299323e-06, + "loss": 2.5013, + "step": 420 + }, + { + "epoch": 0.41382667964946446, + "grad_norm": 2.001260057759529, + "learning_rate": 4.837734665726331e-06, + "loss": 2.4586, + "step": 425 + }, + { + "epoch": 0.4186952288218111, + "grad_norm": 1.7081822720906132, + "learning_rate": 4.766777891061954e-06, + "loss": 2.513, + "step": 430 + }, + { + "epoch": 0.42356377799415773, + "grad_norm": 1.8238918791581111, + "learning_rate": 4.696778423526121e-06, + "loss": 2.4336, + "step": 435 + }, + { + "epoch": 0.42843232716650437, + "grad_norm": 1.7162241591421625, + "learning_rate": 4.627724623511167e-06, + "loss": 2.4923, + "step": 440 + }, + { + "epoch": 0.433300876338851, + "grad_norm": 1.791273570594169, + "learning_rate": 
4.559604977377591e-06, + "loss": 2.4524, + "step": 445 + }, + { + "epoch": 0.43816942551119764, + "grad_norm": 1.585355030794431, + "learning_rate": 4.4924080962595615e-06, + "loss": 2.4547, + "step": 450 + }, + { + "epoch": 0.4430379746835443, + "grad_norm": 1.627715490257672, + "learning_rate": 4.426122714880177e-06, + "loss": 2.4271, + "step": 455 + }, + { + "epoch": 0.44790652385589097, + "grad_norm": 1.7001533187863673, + "learning_rate": 4.360737690376327e-06, + "loss": 2.5004, + "step": 460 + }, + { + "epoch": 0.4527750730282376, + "grad_norm": 1.645312039241492, + "learning_rate": 4.29624200113319e-06, + "loss": 2.4227, + "step": 465 + }, + { + "epoch": 0.45764362220058424, + "grad_norm": 1.6935805821575964, + "learning_rate": 4.232624745628264e-06, + "loss": 2.4804, + "step": 470 + }, + { + "epoch": 0.4625121713729309, + "grad_norm": 1.6013179989515345, + "learning_rate": 4.169875141284801e-06, + "loss": 2.4438, + "step": 475 + }, + { + "epoch": 0.4673807205452775, + "grad_norm": 1.6369493229279692, + "learning_rate": 4.107982523334686e-06, + "loss": 2.468, + "step": 480 + }, + { + "epoch": 0.47224926971762415, + "grad_norm": 1.5838983294803852, + "learning_rate": 4.046936343690647e-06, + "loss": 2.4468, + "step": 485 + }, + { + "epoch": 0.4771178188899708, + "grad_norm": 1.652021178997152, + "learning_rate": 3.986726169827688e-06, + "loss": 2.4484, + "step": 490 + }, + { + "epoch": 0.4819863680623174, + "grad_norm": 1.6100438156106431, + "learning_rate": 3.9391530173378065e-06, + "loss": 2.4699, + "step": 495 + }, + { + "epoch": 0.48685491723466406, + "grad_norm": 1.6357278449106312, + "learning_rate": 3.880421728549051e-06, + "loss": 2.4972, + "step": 500 + }, + { + "epoch": 0.4917234664070107, + "grad_norm": 1.665827366459226, + "learning_rate": 3.822497839855098e-06, + "loss": 2.4498, + "step": 505 + }, + { + "epoch": 0.49659201557935734, + "grad_norm": 1.6865163895540334, + "learning_rate": 3.7653713478280894e-06, + "loss": 2.4344, + "step": 510 + }, + { + "epoch": 0.501460564751704, + "grad_norm": 1.6041809903831323, + "learning_rate": 3.70903235936047e-06, + "loss": 2.4389, + "step": 515 + }, + { + "epoch": 0.5063291139240507, + "grad_norm": 1.5953703380549231, + "learning_rate": 3.653471090598976e-06, + "loss": 2.456, + "step": 520 + }, + { + "epoch": 0.5111976630963972, + "grad_norm": 1.651511590173226, + "learning_rate": 3.59867786588742e-06, + "loss": 2.4241, + "step": 525 + }, + { + "epoch": 0.5160662122687439, + "grad_norm": 1.7103129608293084, + "learning_rate": 3.5446431167182903e-06, + "loss": 2.4655, + "step": 530 + }, + { + "epoch": 0.5209347614410905, + "grad_norm": 1.6921307716416762, + "learning_rate": 3.49135738069308e-06, + "loss": 2.4682, + "step": 535 + }, + { + "epoch": 0.5258033106134372, + "grad_norm": 2.4654732646603064, + "learning_rate": 3.438811300491226e-06, + "loss": 2.4509, + "step": 540 + }, + { + "epoch": 0.5306718597857838, + "grad_norm": 1.8390744205983383, + "learning_rate": 3.386995622847693e-06, + "loss": 2.4649, + "step": 545 + }, + { + "epoch": 0.5355404089581305, + "grad_norm": 1.644182617363712, + "learning_rate": 3.3359011975390947e-06, + "loss": 2.4421, + "step": 550 + }, + { + "epoch": 0.5404089581304771, + "grad_norm": 1.6574734258059267, + "learning_rate": 3.2855189763782522e-06, + "loss": 2.4339, + "step": 555 + }, + { + "epoch": 0.5452775073028238, + "grad_norm": 1.645146888773675, + "learning_rate": 3.2358400122172194e-06, + "loss": 2.4573, + "step": 560 + }, + { + "epoch": 0.5501460564751705, + "grad_norm": 1.6192183546217223, 
+ "learning_rate": 3.1868554579586605e-06, + "loss": 2.4574, + "step": 565 + }, + { + "epoch": 0.555014605647517, + "grad_norm": 1.671822469835026, + "learning_rate": 3.1385565655755006e-06, + "loss": 2.4685, + "step": 570 + }, + { + "epoch": 0.5598831548198637, + "grad_norm": 1.6395167213969633, + "learning_rate": 3.0909346851388586e-06, + "loss": 2.4845, + "step": 575 + }, + { + "epoch": 0.5647517039922103, + "grad_norm": 1.6945968363900514, + "learning_rate": 3.0439812638541433e-06, + "loss": 2.4732, + "step": 580 + }, + { + "epoch": 0.569620253164557, + "grad_norm": 1.6428936915822159, + "learning_rate": 2.9976878451053044e-06, + "loss": 2.4622, + "step": 585 + }, + { + "epoch": 0.5744888023369036, + "grad_norm": 1.6443646175732205, + "learning_rate": 2.9520460675071095e-06, + "loss": 2.4466, + "step": 590 + }, + { + "epoch": 0.5793573515092503, + "grad_norm": 1.6462331908034773, + "learning_rate": 2.907047663965498e-06, + "loss": 2.466, + "step": 595 + }, + { + "epoch": 0.5842259006815969, + "grad_norm": 1.6579152402369843, + "learning_rate": 2.862684460745835e-06, + "loss": 2.4163, + "step": 600 + }, + { + "epoch": 0.5842259006815969, + "eval_loss": 2.4442696571350098, + "eval_runtime": 85.0459, + "eval_samples_per_second": 85.824, + "eval_steps_per_second": 0.682, + "step": 600 + }, + { + "epoch": 0.5890944498539435, + "grad_norm": 1.6546825117019377, + "learning_rate": 2.8189483765491077e-06, + "loss": 2.4857, + "step": 605 + }, + { + "epoch": 0.5939629990262901, + "grad_norm": 1.6868467390816666, + "learning_rate": 2.775831421595948e-06, + "loss": 2.4588, + "step": 610 + }, + { + "epoch": 0.5988315481986368, + "grad_norm": 1.631864642870465, + "learning_rate": 2.7333256967184713e-06, + "loss": 2.4542, + "step": 615 + }, + { + "epoch": 0.6037000973709834, + "grad_norm": 1.5765796237275709, + "learning_rate": 2.6914233924598045e-06, + "loss": 2.4277, + "step": 620 + }, + { + "epoch": 0.6085686465433301, + "grad_norm": 1.6329988363272994, + "learning_rate": 2.6501167881813393e-06, + "loss": 2.3847, + "step": 625 + }, + { + "epoch": 0.6134371957156767, + "grad_norm": 1.6799124493504514, + "learning_rate": 2.6093982511776034e-06, + "loss": 2.4285, + "step": 630 + }, + { + "epoch": 0.6183057448880234, + "grad_norm": 1.6239504885103953, + "learning_rate": 2.569260235798673e-06, + "loss": 2.4432, + "step": 635 + }, + { + "epoch": 0.6231742940603701, + "grad_norm": 1.6009220872039354, + "learning_rate": 2.5296952825801428e-06, + "loss": 2.4118, + "step": 640 + }, + { + "epoch": 0.6280428432327166, + "grad_norm": 1.668850128404625, + "learning_rate": 2.49069601738053e-06, + "loss": 2.4968, + "step": 645 + }, + { + "epoch": 0.6329113924050633, + "grad_norm": 1.6629353311695643, + "learning_rate": 2.45225515052612e-06, + "loss": 2.4499, + "step": 650 + }, + { + "epoch": 0.6377799415774099, + "grad_norm": 1.5929095731919567, + "learning_rate": 2.414365475963123e-06, + "loss": 2.4344, + "step": 655 + }, + { + "epoch": 0.6426484907497566, + "grad_norm": 1.6171386676286523, + "learning_rate": 2.3770198704171897e-06, + "loss": 2.4291, + "step": 660 + }, + { + "epoch": 0.6475170399221032, + "grad_norm": 1.6420238027175638, + "learning_rate": 2.3402112925601284e-06, + "loss": 2.4089, + "step": 665 + }, + { + "epoch": 0.6523855890944499, + "grad_norm": 1.6788147362161117, + "learning_rate": 2.30393278218386e-06, + "loss": 2.4573, + "step": 670 + }, + { + "epoch": 0.6572541382667965, + "grad_norm": 1.6870203522696865, + "learning_rate": 2.2681774593814984e-06, + "loss": 2.4088, + "step": 675 + }, + { 
+ "epoch": 0.6621226874391432, + "grad_norm": 1.635603955024262, + "learning_rate": 2.232938523735545e-06, + "loss": 2.4588, + "step": 680 + }, + { + "epoch": 0.6669912366114897, + "grad_norm": 1.6513072405624771, + "learning_rate": 2.1982092535131004e-06, + "loss": 2.4498, + "step": 685 + }, + { + "epoch": 0.6718597857838364, + "grad_norm": 1.6924700394115, + "learning_rate": 2.1639830048681073e-06, + "loss": 2.4365, + "step": 690 + }, + { + "epoch": 0.676728334956183, + "grad_norm": 1.6617480593936398, + "learning_rate": 2.130253211050499e-06, + "loss": 2.4644, + "step": 695 + }, + { + "epoch": 0.6815968841285297, + "grad_norm": 1.6227857828645276, + "learning_rate": 2.097013381622268e-06, + "loss": 2.4065, + "step": 700 + }, + { + "epoch": 0.6864654333008764, + "grad_norm": 1.6566150548613952, + "learning_rate": 2.0642571016803623e-06, + "loss": 2.4368, + "step": 705 + }, + { + "epoch": 0.691333982473223, + "grad_norm": 2.0603714204760255, + "learning_rate": 2.031978031086392e-06, + "loss": 2.4595, + "step": 710 + }, + { + "epoch": 0.6962025316455697, + "grad_norm": 1.6243505097480613, + "learning_rate": 2.0001699037030468e-06, + "loss": 2.4347, + "step": 715 + }, + { + "epoch": 0.7010710808179162, + "grad_norm": 1.7249749503578298, + "learning_rate": 1.968826526637239e-06, + "loss": 2.4648, + "step": 720 + }, + { + "epoch": 0.7059396299902629, + "grad_norm": 1.7262462882996954, + "learning_rate": 1.9379417794898855e-06, + "loss": 2.439, + "step": 725 + }, + { + "epoch": 0.7108081791626095, + "grad_norm": 1.6204017543437201, + "learning_rate": 1.907509613612265e-06, + "loss": 2.4204, + "step": 730 + }, + { + "epoch": 0.7156767283349562, + "grad_norm": 1.6204489322103508, + "learning_rate": 1.8775240513689499e-06, + "loss": 2.4594, + "step": 735 + }, + { + "epoch": 0.7205452775073028, + "grad_norm": 1.6106742671779046, + "learning_rate": 1.8479791854072195e-06, + "loss": 2.4215, + "step": 740 + }, + { + "epoch": 0.7254138266796495, + "grad_norm": 1.6278702344437153, + "learning_rate": 1.818869177932943e-06, + "loss": 2.4177, + "step": 745 + }, + { + "epoch": 0.7302823758519961, + "grad_norm": 1.702303345942966, + "learning_rate": 1.7901882599928386e-06, + "loss": 2.4412, + "step": 750 + }, + { + "epoch": 0.7351509250243428, + "grad_norm": 1.616327757952299, + "learning_rate": 1.7619307307631266e-06, + "loss": 2.4, + "step": 755 + }, + { + "epoch": 0.7400194741966893, + "grad_norm": 1.581113737981139, + "learning_rate": 1.734090956844456e-06, + "loss": 2.4072, + "step": 760 + }, + { + "epoch": 0.744888023369036, + "grad_norm": 1.556080290425103, + "learning_rate": 1.7066633715631195e-06, + "loss": 2.4384, + "step": 765 + }, + { + "epoch": 0.7497565725413826, + "grad_norm": 1.6038706235589455, + "learning_rate": 1.6796424742784764e-06, + "loss": 2.4738, + "step": 770 + }, + { + "epoch": 0.7546251217137293, + "grad_norm": 1.662078041711206, + "learning_rate": 1.6530228296965529e-06, + "loss": 2.4957, + "step": 775 + }, + { + "epoch": 0.759493670886076, + "grad_norm": 1.6328340091897662, + "learning_rate": 1.6267990671897495e-06, + "loss": 2.4816, + "step": 780 + }, + { + "epoch": 0.7643622200584226, + "grad_norm": 1.6438111553611983, + "learning_rate": 1.6009658801226467e-06, + "loss": 2.4171, + "step": 785 + }, + { + "epoch": 0.7692307692307693, + "grad_norm": 1.7211490194262304, + "learning_rate": 1.575518025183845e-06, + "loss": 2.4268, + "step": 790 + }, + { + "epoch": 0.7740993184031159, + "grad_norm": 1.7200203924352186, + "learning_rate": 1.5504503217237718e-06, + "loss": 2.4661, + 
"step": 795 + }, + { + "epoch": 0.7789678675754625, + "grad_norm": 1.6451751546473583, + "learning_rate": 1.5257576510984581e-06, + "loss": 2.4462, + "step": 800 + }, + { + "epoch": 0.7789678675754625, + "eval_loss": 2.4281296730041504, + "eval_runtime": 85.85, + "eval_samples_per_second": 85.02, + "eval_steps_per_second": 0.676, + "step": 800 + }, + { + "epoch": 0.7838364167478091, + "grad_norm": 1.7186162803254572, + "learning_rate": 1.5014349560191975e-06, + "loss": 2.4447, + "step": 805 + }, + { + "epoch": 0.7887049659201558, + "grad_norm": 1.6390532449357682, + "learning_rate": 1.4774772399080773e-06, + "loss": 2.3768, + "step": 810 + }, + { + "epoch": 0.7935735150925024, + "grad_norm": 1.6479758648696174, + "learning_rate": 1.4538795662592943e-06, + "loss": 2.4559, + "step": 815 + }, + { + "epoch": 0.7984420642648491, + "grad_norm": 1.6376458723290501, + "learning_rate": 1.4306370580062715e-06, + "loss": 2.4113, + "step": 820 + }, + { + "epoch": 0.8033106134371957, + "grad_norm": 1.6642605682645168, + "learning_rate": 1.4077448968944671e-06, + "loss": 2.4438, + "step": 825 + }, + { + "epoch": 0.8081791626095424, + "grad_norm": 1.5597665460776493, + "learning_rate": 1.3851983228598879e-06, + "loss": 2.4615, + "step": 830 + }, + { + "epoch": 0.813047711781889, + "grad_norm": 1.5998155659125195, + "learning_rate": 1.3629926334132263e-06, + "loss": 2.4245, + "step": 835 + }, + { + "epoch": 0.8179162609542356, + "grad_norm": 1.6231320102269173, + "learning_rate": 1.341123183029607e-06, + "loss": 2.4677, + "step": 840 + }, + { + "epoch": 0.8227848101265823, + "grad_norm": 1.587206597114467, + "learning_rate": 1.3195853825438637e-06, + "loss": 2.4268, + "step": 845 + }, + { + "epoch": 0.8276533592989289, + "grad_norm": 1.69258641098763, + "learning_rate": 1.2983746985513552e-06, + "loss": 2.4468, + "step": 850 + }, + { + "epoch": 0.8325219084712756, + "grad_norm": 1.6329982694488412, + "learning_rate": 1.2774866528142222e-06, + "loss": 2.4483, + "step": 855 + }, + { + "epoch": 0.8373904576436222, + "grad_norm": 1.6510229207341078, + "learning_rate": 1.2569168216730917e-06, + "loss": 2.4192, + "step": 860 + }, + { + "epoch": 0.8422590068159689, + "grad_norm": 1.5817582463558826, + "learning_rate": 1.2366608354641566e-06, + "loss": 2.4214, + "step": 865 + }, + { + "epoch": 0.8471275559883155, + "grad_norm": 2.4364871892272117, + "learning_rate": 1.216714377941609e-06, + "loss": 2.3713, + "step": 870 + }, + { + "epoch": 0.8519961051606622, + "grad_norm": 2.5372621502402186, + "learning_rate": 1.197073185705361e-06, + "loss": 2.415, + "step": 875 + }, + { + "epoch": 0.8568646543330087, + "grad_norm": 1.7440355879318754, + "learning_rate": 1.1777330476340439e-06, + "loss": 2.4468, + "step": 880 + }, + { + "epoch": 0.8617332035053554, + "grad_norm": 1.5959852776726504, + "learning_rate": 1.1624748993325157e-06, + "loss": 2.4416, + "step": 885 + }, + { + "epoch": 0.866601752677702, + "grad_norm": 1.5792041005336184, + "learning_rate": 1.1436662114572268e-06, + "loss": 2.4699, + "step": 890 + }, + { + "epoch": 0.8714703018500487, + "grad_norm": 1.6749035075354037, + "learning_rate": 1.1251470599677525e-06, + "loss": 2.3906, + "step": 895 + }, + { + "epoch": 0.8763388510223953, + "grad_norm": 1.6974831202238516, + "learning_rate": 1.106913427322351e-06, + "loss": 2.4473, + "step": 900 + }, + { + "epoch": 0.881207400194742, + "grad_norm": 1.629005550261729, + "learning_rate": 1.0889613456083674e-06, + "loss": 2.3865, + "step": 905 + }, + { + "epoch": 0.8860759493670886, + "grad_norm": 
1.541318076309618, + "learning_rate": 1.0712868960049544e-06, + "loss": 2.4382, + "step": 910 + }, + { + "epoch": 0.8909444985394352, + "grad_norm": 1.5752275103135045, + "learning_rate": 1.0538862082508074e-06, + "loss": 2.452, + "step": 915 + }, + { + "epoch": 0.8958130477117819, + "grad_norm": 1.699492645238764, + "learning_rate": 1.036755460116832e-06, + "loss": 2.4333, + "step": 920 + }, + { + "epoch": 0.9006815968841285, + "grad_norm": 1.6186519953923517, + "learning_rate": 1.019890876883744e-06, + "loss": 2.4319, + "step": 925 + }, + { + "epoch": 0.9055501460564752, + "grad_norm": 1.586657649597322, + "learning_rate": 1.0032887308245372e-06, + "loss": 2.3888, + "step": 930 + }, + { + "epoch": 0.9104186952288218, + "grad_norm": 1.6044497498274506, + "learning_rate": 9.869453406918023e-07, + "loss": 2.4039, + "step": 935 + }, + { + "epoch": 0.9152872444011685, + "grad_norm": 1.6723989958086949, + "learning_rate": 9.708570712098284e-07, + "loss": 2.4005, + "step": 940 + }, + { + "epoch": 0.9201557935735151, + "grad_norm": 1.6902859760264846, + "learning_rate": 9.550203325714876e-07, + "loss": 2.4261, + "step": 945 + }, + { + "epoch": 0.9250243427458618, + "grad_norm": 1.6544740657432804, + "learning_rate": 9.394315799398425e-07, + "loss": 2.4451, + "step": 950 + }, + { + "epoch": 0.9298928919182083, + "grad_norm": 1.54882787053935, + "learning_rate": 9.240873129544315e-07, + "loss": 2.444, + "step": 955 + }, + { + "epoch": 0.934761441090555, + "grad_norm": 1.5909955762857377, + "learning_rate": 9.089840752422235e-07, + "loss": 2.3899, + "step": 960 + }, + { + "epoch": 0.9396299902629016, + "grad_norm": 1.6145099358774662, + "learning_rate": 8.941184539331783e-07, + "loss": 2.4215, + "step": 965 + }, + { + "epoch": 0.9444985394352483, + "grad_norm": 1.6027897997067373, + "learning_rate": 8.794870791803967e-07, + "loss": 2.4725, + "step": 970 + }, + { + "epoch": 0.9493670886075949, + "grad_norm": 1.5457405929725805, + "learning_rate": 8.650866236848001e-07, + "loss": 2.4511, + "step": 975 + }, + { + "epoch": 0.9542356377799416, + "grad_norm": 1.5741657202863206, + "learning_rate": 8.509138022243367e-07, + "loss": 2.4241, + "step": 980 + }, + { + "epoch": 0.9591041869522883, + "grad_norm": 1.6377152121905334, + "learning_rate": 8.369653711876414e-07, + "loss": 2.4034, + "step": 985 + }, + { + "epoch": 0.9639727361246349, + "grad_norm": 1.5672375026688792, + "learning_rate": 8.232381281121472e-07, + "loss": 2.4561, + "step": 990 + }, + { + "epoch": 0.9688412852969815, + "grad_norm": 1.5788396321776008, + "learning_rate": 8.097289112265904e-07, + "loss": 2.4023, + "step": 995 + }, + { + "epoch": 0.9737098344693281, + "grad_norm": 1.6492145516163694, + "learning_rate": 7.964345989978902e-07, + "loss": 2.4353, + "step": 1000 + }, + { + "epoch": 0.9737098344693281, + "eval_loss": 2.4195876121520996, + "eval_runtime": 85.322, + "eval_samples_per_second": 85.547, + "eval_steps_per_second": 0.68, + "step": 1000 + }, + { + "epoch": 0.9785783836416748, + "grad_norm": 1.5879573832642313, + "learning_rate": 7.83352109682346e-07, + "loss": 2.4211, + "step": 1005 + }, + { + "epoch": 0.9834469328140214, + "grad_norm": 1.6569952022837342, + "learning_rate": 7.704784008811466e-07, + "loss": 2.4405, + "step": 1010 + }, + { + "epoch": 0.9883154819863681, + "grad_norm": 1.59377337317319, + "learning_rate": 7.5781046910014e-07, + "loss": 2.4239, + "step": 1015 + }, + { + "epoch": 0.9931840311587147, + "grad_norm": 1.6314113718182768, + "learning_rate": 7.453453493138208e-07, + "loss": 2.3844, + "step": 1020 + 
}, + { + "epoch": 0.9980525803310614, + "grad_norm": 1.6620011915571502, + "learning_rate": 7.330801145335265e-07, + "loss": 2.4181, + "step": 1025 + }, + { + "epoch": 1.002921129503408, + "grad_norm": 1.9251273144439607, + "learning_rate": 7.210118753797866e-07, + "loss": 2.3064, + "step": 1030 + }, + { + "epoch": 1.0077896786757545, + "grad_norm": 1.6577940198631569, + "learning_rate": 7.091377796588075e-07, + "loss": 2.1998, + "step": 1035 + }, + { + "epoch": 1.0126582278481013, + "grad_norm": 1.7530256550139145, + "learning_rate": 6.974550119430372e-07, + "loss": 2.2708, + "step": 1040 + }, + { + "epoch": 1.017526777020448, + "grad_norm": 1.6969458887579825, + "learning_rate": 6.859607931558122e-07, + "loss": 2.2173, + "step": 1045 + }, + { + "epoch": 1.0223953261927945, + "grad_norm": 1.7016018631062932, + "learning_rate": 6.746523801600123e-07, + "loss": 2.2381, + "step": 1050 + }, + { + "epoch": 1.0272638753651413, + "grad_norm": 1.7593644025263198, + "learning_rate": 6.635270653507262e-07, + "loss": 2.2734, + "step": 1055 + }, + { + "epoch": 1.0321324245374879, + "grad_norm": 1.654744797257189, + "learning_rate": 6.525821762518746e-07, + "loss": 2.2064, + "step": 1060 + }, + { + "epoch": 1.0370009737098345, + "grad_norm": 1.7421937503061258, + "learning_rate": 6.41815075116768e-07, + "loss": 2.2413, + "step": 1065 + }, + { + "epoch": 1.041869522882181, + "grad_norm": 1.6658497077947585, + "learning_rate": 6.312231585325566e-07, + "loss": 2.2099, + "step": 1070 + }, + { + "epoch": 1.0467380720545278, + "grad_norm": 1.6993685133537, + "learning_rate": 6.208038570285602e-07, + "loss": 2.2881, + "step": 1075 + }, + { + "epoch": 1.0516066212268744, + "grad_norm": 1.7131084770608689, + "learning_rate": 6.105546346884208e-07, + "loss": 2.2381, + "step": 1080 + }, + { + "epoch": 1.056475170399221, + "grad_norm": 1.7081524078028645, + "learning_rate": 6.004729887660704e-07, + "loss": 2.236, + "step": 1085 + }, + { + "epoch": 1.0613437195715676, + "grad_norm": 1.8169379524340188, + "learning_rate": 5.905564493054672e-07, + "loss": 2.2342, + "step": 1090 + }, + { + "epoch": 1.0662122687439144, + "grad_norm": 1.6685262696722296, + "learning_rate": 5.808025787640756e-07, + "loss": 2.222, + "step": 1095 + }, + { + "epoch": 1.071080817916261, + "grad_norm": 1.6952684753454728, + "learning_rate": 5.712089716400602e-07, + "loss": 2.212, + "step": 1100 + }, + { + "epoch": 1.0759493670886076, + "grad_norm": 1.7168463474300166, + "learning_rate": 5.617732541031583e-07, + "loss": 2.2186, + "step": 1105 + }, + { + "epoch": 1.0808179162609541, + "grad_norm": 1.750097139848988, + "learning_rate": 5.524930836292023e-07, + "loss": 2.2494, + "step": 1110 + }, + { + "epoch": 1.085686465433301, + "grad_norm": 1.7330864953187293, + "learning_rate": 5.433661486382679e-07, + "loss": 2.2102, + "step": 1115 + }, + { + "epoch": 1.0905550146056475, + "grad_norm": 1.6803544506498977, + "learning_rate": 5.343901681364102e-07, + "loss": 2.2419, + "step": 1120 + }, + { + "epoch": 1.095423563777994, + "grad_norm": 1.7013937838659918, + "learning_rate": 5.255628913609601e-07, + "loss": 2.2194, + "step": 1125 + }, + { + "epoch": 1.1002921129503407, + "grad_norm": 1.7495270131848708, + "learning_rate": 5.168820974293567e-07, + "loss": 2.2213, + "step": 1130 + }, + { + "epoch": 1.1051606621226875, + "grad_norm": 1.7709656106164202, + "learning_rate": 5.08345594991474e-07, + "loss": 2.2428, + "step": 1135 + }, + { + "epoch": 1.110029211295034, + "grad_norm": 1.7338624116926167, + "learning_rate": 4.999512218854316e-07, + 
"loss": 2.1828, + "step": 1140 + }, + { + "epoch": 1.1148977604673806, + "grad_norm": 1.6915851593817088, + "learning_rate": 4.916968447968384e-07, + "loss": 2.2286, + "step": 1145 + }, + { + "epoch": 1.1197663096397275, + "grad_norm": 1.7318300441723782, + "learning_rate": 4.835803589214607e-07, + "loss": 2.2443, + "step": 1150 + }, + { + "epoch": 1.124634858812074, + "grad_norm": 1.6773573534938775, + "learning_rate": 4.7559968763127113e-07, + "loss": 2.2427, + "step": 1155 + }, + { + "epoch": 1.1295034079844206, + "grad_norm": 1.7187378671025355, + "learning_rate": 4.67752782143862e-07, + "loss": 2.2726, + "step": 1160 + }, + { + "epoch": 1.1343719571567672, + "grad_norm": 1.740542786504217, + "learning_rate": 4.6003762119518e-07, + "loss": 2.1869, + "step": 1165 + }, + { + "epoch": 1.139240506329114, + "grad_norm": 1.6945482053223642, + "learning_rate": 4.524522107155746e-07, + "loss": 2.2248, + "step": 1170 + }, + { + "epoch": 1.1441090555014606, + "grad_norm": 1.6615466442429803, + "learning_rate": 4.4499458350910965e-07, + "loss": 2.2248, + "step": 1175 + }, + { + "epoch": 1.1489776046738072, + "grad_norm": 1.7112548018169453, + "learning_rate": 4.3766279893612986e-07, + "loss": 2.2471, + "step": 1180 + }, + { + "epoch": 1.1538461538461537, + "grad_norm": 1.860899105148827, + "learning_rate": 4.304549425990434e-07, + "loss": 2.2272, + "step": 1185 + }, + { + "epoch": 1.1587147030185005, + "grad_norm": 1.691810639062161, + "learning_rate": 4.23369126031296e-07, + "loss": 2.263, + "step": 1190 + }, + { + "epoch": 1.1635832521908471, + "grad_norm": 1.775044069455589, + "learning_rate": 4.1640348638951344e-07, + "loss": 2.2447, + "step": 1195 + }, + { + "epoch": 1.1684518013631937, + "grad_norm": 1.9319359346716891, + "learning_rate": 4.0955618614877875e-07, + "loss": 2.2111, + "step": 1200 + }, + { + "epoch": 1.1684518013631937, + "eval_loss": 2.429011106491089, + "eval_runtime": 85.6795, + "eval_samples_per_second": 85.19, + "eval_steps_per_second": 0.677, + "step": 1200 + }, + { + "epoch": 1.1733203505355405, + "grad_norm": 1.7574340980020409, + "learning_rate": 4.0282541280102037e-07, + "loss": 2.2332, + "step": 1205 + }, + { + "epoch": 1.178188899707887, + "grad_norm": 1.6980825882291957, + "learning_rate": 3.9620937855648926e-07, + "loss": 2.2235, + "step": 1210 + }, + { + "epoch": 1.1830574488802337, + "grad_norm": 1.7998015292888088, + "learning_rate": 3.897063200482918e-07, + "loss": 2.21, + "step": 1215 + }, + { + "epoch": 1.1879259980525803, + "grad_norm": 1.631324517663901, + "learning_rate": 3.833144980399548e-07, + "loss": 2.219, + "step": 1220 + }, + { + "epoch": 1.192794547224927, + "grad_norm": 1.6407460745986788, + "learning_rate": 3.770321971360005e-07, + "loss": 2.1846, + "step": 1225 + }, + { + "epoch": 1.1976630963972736, + "grad_norm": 1.700809400503958, + "learning_rate": 3.7085772549549546e-07, + "loss": 2.2605, + "step": 1230 + }, + { + "epoch": 1.2025316455696202, + "grad_norm": 1.6715825199948962, + "learning_rate": 3.647894145485631e-07, + "loss": 2.2213, + "step": 1235 + }, + { + "epoch": 1.2074001947419668, + "grad_norm": 1.6992864948362203, + "learning_rate": 3.588256187158179e-07, + "loss": 2.2256, + "step": 1240 + }, + { + "epoch": 1.2122687439143136, + "grad_norm": 1.7332695276540628, + "learning_rate": 3.5296471513070895e-07, + "loss": 2.2602, + "step": 1245 + }, + { + "epoch": 1.2171372930866602, + "grad_norm": 1.7489006914371872, + "learning_rate": 3.472051033647415e-07, + "loss": 2.2514, + "step": 1250 + }, + { + "epoch": 1.2220058422590068, + 
"grad_norm": 1.7282302399042118, + "learning_rate": 3.4154520515555147e-07, + "loss": 2.2403, + "step": 1255 + }, + { + "epoch": 1.2268743914313536, + "grad_norm": 1.702612549564712, + "learning_rate": 3.3598346413781214e-07, + "loss": 2.2181, + "step": 1260 + }, + { + "epoch": 1.2317429406037002, + "grad_norm": 1.7033226596241964, + "learning_rate": 3.305183455769435e-07, + "loss": 2.2761, + "step": 1265 + }, + { + "epoch": 1.2366114897760467, + "grad_norm": 1.750385373058268, + "learning_rate": 3.251483361056018e-07, + "loss": 2.2387, + "step": 1270 + }, + { + "epoch": 1.2414800389483933, + "grad_norm": 1.7510710962758804, + "learning_rate": 3.1987194346292783e-07, + "loss": 2.2642, + "step": 1275 + }, + { + "epoch": 1.24634858812074, + "grad_norm": 1.699193378636474, + "learning_rate": 3.1468769623652614e-07, + "loss": 2.2675, + "step": 1280 + }, + { + "epoch": 1.2512171372930867, + "grad_norm": 1.6670573040794279, + "learning_rate": 3.0959414360715174e-07, + "loss": 2.2596, + "step": 1285 + }, + { + "epoch": 1.2560856864654333, + "grad_norm": 1.7667142100191355, + "learning_rate": 3.045898550960847e-07, + "loss": 2.2037, + "step": 1290 + }, + { + "epoch": 1.2609542356377799, + "grad_norm": 1.7069461836357607, + "learning_rate": 2.9967342031515984e-07, + "loss": 2.1602, + "step": 1295 + }, + { + "epoch": 1.2658227848101267, + "grad_norm": 1.6514498602889256, + "learning_rate": 2.948434487194415e-07, + "loss": 2.2278, + "step": 1300 + }, + { + "epoch": 1.2706913339824732, + "grad_norm": 1.661661974641321, + "learning_rate": 2.900985693625059e-07, + "loss": 2.2009, + "step": 1305 + }, + { + "epoch": 1.2755598831548198, + "grad_norm": 1.774692203861925, + "learning_rate": 2.854374306543197e-07, + "loss": 2.2549, + "step": 1310 + }, + { + "epoch": 1.2804284323271666, + "grad_norm": 1.678761386598376, + "learning_rate": 2.8085870012168447e-07, + "loss": 2.2362, + "step": 1315 + }, + { + "epoch": 1.2852969814995132, + "grad_norm": 1.717206031840906, + "learning_rate": 2.763610641712288e-07, + "loss": 2.2259, + "step": 1320 + }, + { + "epoch": 1.2901655306718598, + "grad_norm": 1.725788802654923, + "learning_rate": 2.7194322785492024e-07, + "loss": 2.2743, + "step": 1325 + }, + { + "epoch": 1.2950340798442064, + "grad_norm": 1.7272128729108773, + "learning_rate": 2.6760391463808273e-07, + "loss": 2.2238, + "step": 1330 + }, + { + "epoch": 1.299902629016553, + "grad_norm": 1.673730997692294, + "learning_rate": 2.633418661698872e-07, + "loss": 2.213, + "step": 1335 + }, + { + "epoch": 1.3047711781888998, + "grad_norm": 1.67421773725042, + "learning_rate": 2.591558420563027e-07, + "loss": 2.2159, + "step": 1340 + }, + { + "epoch": 1.3096397273612463, + "grad_norm": 1.7181866282412452, + "learning_rate": 2.550446196354801e-07, + "loss": 2.1931, + "step": 1345 + }, + { + "epoch": 1.314508276533593, + "grad_norm": 1.9995904186048974, + "learning_rate": 2.5100699375554764e-07, + "loss": 2.2394, + "step": 1350 + }, + { + "epoch": 1.3193768257059397, + "grad_norm": 1.6972697466343885, + "learning_rate": 2.4704177655480044e-07, + "loss": 2.2046, + "step": 1355 + }, + { + "epoch": 1.3242453748782863, + "grad_norm": 1.7352888329956384, + "learning_rate": 2.431477972442557e-07, + "loss": 2.2582, + "step": 1360 + }, + { + "epoch": 1.3291139240506329, + "grad_norm": 1.7536755936521693, + "learning_rate": 2.3932390189255786e-07, + "loss": 2.2573, + "step": 1365 + }, + { + "epoch": 1.3339824732229795, + "grad_norm": 1.7063896078961662, + "learning_rate": 2.3556895321321132e-07, + "loss": 2.2665, + "step": 1370 
+ }, + { + "epoch": 1.3388510223953263, + "grad_norm": 1.6740223494854998, + "learning_rate": 2.3188183035411849e-07, + "loss": 2.2163, + "step": 1375 + }, + { + "epoch": 1.3437195715676729, + "grad_norm": 1.7492684308128166, + "learning_rate": 2.2826142868940236e-07, + "loss": 2.1842, + "step": 1380 + }, + { + "epoch": 1.3485881207400194, + "grad_norm": 1.7405438484291114, + "learning_rate": 2.2470665961349557e-07, + "loss": 2.1983, + "step": 1385 + }, + { + "epoch": 1.353456669912366, + "grad_norm": 1.764967629336457, + "learning_rate": 2.2121645033746942e-07, + "loss": 2.2207, + "step": 1390 + }, + { + "epoch": 1.3583252190847128, + "grad_norm": 1.7077892049631243, + "learning_rate": 2.1778974368759137e-07, + "loss": 2.2475, + "step": 1395 + }, + { + "epoch": 1.3631937682570594, + "grad_norm": 1.7009940340048213, + "learning_rate": 2.1442549790608077e-07, + "loss": 2.2503, + "step": 1400 + }, + { + "epoch": 1.3631937682570594, + "eval_loss": 2.428063154220581, + "eval_runtime": 85.6482, + "eval_samples_per_second": 85.221, + "eval_steps_per_second": 0.677, + "step": 1400 + }, + { + "epoch": 1.368062317429406, + "grad_norm": 1.6852036518237539, + "learning_rate": 2.1112268645405111e-07, + "loss": 2.2092, + "step": 1405 + }, + { + "epoch": 1.3729308666017528, + "grad_norm": 1.7762881669503894, + "learning_rate": 2.0788029781661377e-07, + "loss": 2.275, + "step": 1410 + }, + { + "epoch": 1.3777994157740994, + "grad_norm": 1.7187728321020497, + "learning_rate": 2.0469733531012517e-07, + "loss": 2.205, + "step": 1415 + }, + { + "epoch": 1.382667964946446, + "grad_norm": 1.75561614859316, + "learning_rate": 2.015728168915585e-07, + "loss": 2.2513, + "step": 1420 + }, + { + "epoch": 1.3875365141187925, + "grad_norm": 1.7108273702116936, + "learning_rate": 1.9850577496997802e-07, + "loss": 2.3129, + "step": 1425 + }, + { + "epoch": 1.3924050632911391, + "grad_norm": 1.7373174761439103, + "learning_rate": 1.9549525622009822e-07, + "loss": 2.3009, + "step": 1430 + }, + { + "epoch": 1.397273612463486, + "grad_norm": 1.6944981878481962, + "learning_rate": 1.9254032139790948e-07, + "loss": 2.2276, + "step": 1435 + }, + { + "epoch": 1.4021421616358325, + "grad_norm": 1.7020044804533232, + "learning_rate": 1.8964004515834934e-07, + "loss": 2.2515, + "step": 1440 + }, + { + "epoch": 1.407010710808179, + "grad_norm": 1.7104910024357038, + "learning_rate": 1.8679351587500074e-07, + "loss": 2.2176, + "step": 1445 + }, + { + "epoch": 1.4118792599805259, + "grad_norm": 1.706154278461277, + "learning_rate": 1.839998354618001e-07, + "loss": 2.2426, + "step": 1450 + }, + { + "epoch": 1.4167478091528725, + "grad_norm": 1.6546136448975106, + "learning_rate": 1.8125811919673288e-07, + "loss": 2.2729, + "step": 1455 + }, + { + "epoch": 1.421616358325219, + "grad_norm": 1.7298929182929772, + "learning_rate": 1.7856749554750352e-07, + "loss": 2.2213, + "step": 1460 + }, + { + "epoch": 1.4264849074975658, + "grad_norm": 1.6799052546726045, + "learning_rate": 1.7592710599915514e-07, + "loss": 2.2494, + "step": 1465 + }, + { + "epoch": 1.4313534566699124, + "grad_norm": 1.7192873393376786, + "learning_rate": 1.7333610488362632e-07, + "loss": 2.2219, + "step": 1470 + }, + { + "epoch": 1.436222005842259, + "grad_norm": 1.6974268468535265, + "learning_rate": 1.7079365921122288e-07, + "loss": 2.2159, + "step": 1475 + }, + { + "epoch": 1.4410905550146056, + "grad_norm": 1.736932687979258, + "learning_rate": 1.682989485039898e-07, + "loss": 2.2001, + "step": 1480 + }, + { + "epoch": 1.4459591041869522, + "grad_norm": 
1.7389015197229705, + "learning_rate": 1.65851164630961e-07, + "loss": 2.1815, + "step": 1485 + }, + { + "epoch": 1.450827653359299, + "grad_norm": 1.7402452727637348, + "learning_rate": 1.634495116452756e-07, + "loss": 2.274, + "step": 1490 + }, + { + "epoch": 1.4556962025316456, + "grad_norm": 1.689117128248476, + "learning_rate": 1.6109320562313633e-07, + "loss": 2.2296, + "step": 1495 + }, + { + "epoch": 1.4605647517039921, + "grad_norm": 1.7021757020476915, + "learning_rate": 1.587814745045978e-07, + "loss": 2.2488, + "step": 1500 + }, + { + "epoch": 1.465433300876339, + "grad_norm": 1.726585875668609, + "learning_rate": 1.565135579361648e-07, + "loss": 2.248, + "step": 1505 + }, + { + "epoch": 1.4703018500486855, + "grad_norm": 1.7896520417513544, + "learning_rate": 1.542887071151836e-07, + "loss": 2.2768, + "step": 1510 + }, + { + "epoch": 1.475170399221032, + "grad_norm": 1.8424772477578564, + "learning_rate": 1.5210618463601055e-07, + "loss": 2.2177, + "step": 1515 + }, + { + "epoch": 1.480038948393379, + "grad_norm": 1.7094835426748392, + "learning_rate": 1.4996526433793895e-07, + "loss": 2.1814, + "step": 1520 + }, + { + "epoch": 1.4849074975657255, + "grad_norm": 1.774002445036824, + "learning_rate": 1.478652311548687e-07, + "loss": 2.241, + "step": 1525 + }, + { + "epoch": 1.489776046738072, + "grad_norm": 1.7528924437568645, + "learning_rate": 1.4580538096670264e-07, + "loss": 2.2446, + "step": 1530 + }, + { + "epoch": 1.4946445959104186, + "grad_norm": 1.7450272228199162, + "learning_rate": 1.4378502045245165e-07, + "loss": 2.2098, + "step": 1535 + }, + { + "epoch": 1.4995131450827652, + "grad_norm": 1.71350388367754, + "learning_rate": 1.4180346694503253e-07, + "loss": 2.2407, + "step": 1540 + }, + { + "epoch": 1.504381694255112, + "grad_norm": 1.6655187619092102, + "learning_rate": 1.398600482877437e-07, + "loss": 2.2516, + "step": 1545 + }, + { + "epoch": 1.5092502434274586, + "grad_norm": 1.730038860188208, + "learning_rate": 1.379541026923992e-07, + "loss": 2.2244, + "step": 1550 + }, + { + "epoch": 1.5141187925998052, + "grad_norm": 1.7106880841934986, + "learning_rate": 1.360849785991099e-07, + "loss": 2.2359, + "step": 1555 + }, + { + "epoch": 1.518987341772152, + "grad_norm": 1.7507950020460012, + "learning_rate": 1.3425203453769053e-07, + "loss": 2.2067, + "step": 1560 + }, + { + "epoch": 1.5238558909444986, + "grad_norm": 1.6877714873976932, + "learning_rate": 1.3245463899068173e-07, + "loss": 2.2203, + "step": 1565 + }, + { + "epoch": 1.5287244401168452, + "grad_norm": 1.7990198522805025, + "learning_rate": 1.306921702579676e-07, + "loss": 2.2176, + "step": 1570 + }, + { + "epoch": 1.533592989289192, + "grad_norm": 1.7072472713071833, + "learning_rate": 1.2896401632297622e-07, + "loss": 2.2199, + "step": 1575 + }, + { + "epoch": 1.5384615384615383, + "grad_norm": 1.7137109372098451, + "learning_rate": 1.2726957472044431e-07, + "loss": 2.2593, + "step": 1580 + }, + { + "epoch": 1.5433300876338851, + "grad_norm": 1.7487536889121376, + "learning_rate": 1.2560825240573496e-07, + "loss": 2.2512, + "step": 1585 + }, + { + "epoch": 1.5481986368062317, + "grad_norm": 1.689517701653058, + "learning_rate": 1.23979465625689e-07, + "loss": 2.2594, + "step": 1590 + }, + { + "epoch": 1.5530671859785783, + "grad_norm": 1.7198840121589114, + "learning_rate": 1.223826397909982e-07, + "loss": 2.1724, + "step": 1595 + }, + { + "epoch": 1.557935735150925, + "grad_norm": 1.7280648830031222, + "learning_rate": 1.2081720935008362e-07, + "loss": 2.258, + "step": 1600 + }, + { + "epoch": 
1.557935735150925, + "eval_loss": 2.4271326065063477, + "eval_runtime": 85.3342, + "eval_samples_per_second": 85.534, + "eval_steps_per_second": 0.68, + "step": 1600 + }, + { + "epoch": 1.5628042843232717, + "grad_norm": 1.7166373586948793, + "learning_rate": 1.1928261766446461e-07, + "loss": 2.2507, + "step": 1605 + }, + { + "epoch": 1.5676728334956183, + "grad_norm": 1.6962264139269825, + "learning_rate": 1.1777831688560428e-07, + "loss": 2.214, + "step": 1610 + }, + { + "epoch": 1.572541382667965, + "grad_norm": 1.667890993114025, + "learning_rate": 1.1630376783321605e-07, + "loss": 2.2107, + "step": 1615 + }, + { + "epoch": 1.5774099318403116, + "grad_norm": 1.7705130739867874, + "learning_rate": 1.1485843987501698e-07, + "loss": 2.2298, + "step": 1620 + }, + { + "epoch": 1.5822784810126582, + "grad_norm": 1.7067117715143372, + "learning_rate": 1.134418108079144e-07, + "loss": 2.2568, + "step": 1625 + }, + { + "epoch": 1.587147030185005, + "grad_norm": 1.698686461270445, + "learning_rate": 1.1205336674061039e-07, + "loss": 2.256, + "step": 1630 + }, + { + "epoch": 1.5920155793573514, + "grad_norm": 1.6914580859058206, + "learning_rate": 1.1069260197761055e-07, + "loss": 2.2255, + "step": 1635 + }, + { + "epoch": 1.5968841285296982, + "grad_norm": 1.695162008456652, + "learning_rate": 1.0935901890462346e-07, + "loss": 2.1925, + "step": 1640 + }, + { + "epoch": 1.6017526777020448, + "grad_norm": 1.8353600777066885, + "learning_rate": 1.0805212787533523e-07, + "loss": 2.259, + "step": 1645 + }, + { + "epoch": 1.6066212268743914, + "grad_norm": 1.7146893423140706, + "learning_rate": 1.0677144709954816e-07, + "loss": 2.251, + "step": 1650 + }, + { + "epoch": 1.6114897760467382, + "grad_norm": 1.7479655666276084, + "learning_rate": 1.0551650253266659e-07, + "loss": 2.2297, + "step": 1655 + }, + { + "epoch": 1.6163583252190847, + "grad_norm": 1.7879147626083056, + "learning_rate": 1.0428682776651918e-07, + "loss": 2.1994, + "step": 1660 + }, + { + "epoch": 1.6212268743914313, + "grad_norm": 1.7970614657397905, + "learning_rate": 1.030819639215023e-07, + "loss": 2.2049, + "step": 1665 + }, + { + "epoch": 1.6260954235637781, + "grad_norm": 1.7518141901380913, + "learning_rate": 1.0190145954003192e-07, + "loss": 2.2203, + "step": 1670 + }, + { + "epoch": 1.6309639727361245, + "grad_norm": 1.7501918042535358, + "learning_rate": 1.0074487048129093e-07, + "loss": 2.2585, + "step": 1675 + }, + { + "epoch": 1.6358325219084713, + "grad_norm": 1.756138340727838, + "learning_rate": 9.961175981725802e-08, + "loss": 2.2441, + "step": 1680 + }, + { + "epoch": 1.6407010710808179, + "grad_norm": 1.6916112013631275, + "learning_rate": 9.850169773000545e-08, + "loss": 2.2229, + "step": 1685 + }, + { + "epoch": 1.6455696202531644, + "grad_norm": 1.7520093451689733, + "learning_rate": 9.741426141025332e-08, + "loss": 2.2418, + "step": 1690 + }, + { + "epoch": 1.6504381694255112, + "grad_norm": 1.721678894466571, + "learning_rate": 9.634903495716675e-08, + "loss": 2.215, + "step": 1695 + }, + { + "epoch": 1.6553067185978578, + "grad_norm": 1.7726004550340564, + "learning_rate": 9.530560927938334e-08, + "loss": 2.2609, + "step": 1700 + }, + { + "epoch": 1.6601752677702044, + "grad_norm": 1.7520211987580594, + "learning_rate": 9.428358199725911e-08, + "loss": 2.2388, + "step": 1705 + }, + { + "epoch": 1.6650438169425512, + "grad_norm": 1.8152868603337584, + "learning_rate": 9.328255734631862e-08, + "loss": 2.2346, + "step": 1710 + }, + { + "epoch": 1.6699123661148978, + "grad_norm": 1.7051369362258122, + 
"learning_rate": 9.230214608189946e-08, + "loss": 2.201, + "step": 1715 + }, + { + "epoch": 1.6747809152872444, + "grad_norm": 1.7253608618380802, + "learning_rate": 9.1341965384976e-08, + "loss": 2.2579, + "step": 1720 + }, + { + "epoch": 1.6796494644595912, + "grad_norm": 1.6955731274986756, + "learning_rate": 9.040163876915256e-08, + "loss": 2.2456, + "step": 1725 + }, + { + "epoch": 1.6845180136319375, + "grad_norm": 1.8315504305607004, + "learning_rate": 8.948079598881221e-08, + "loss": 2.2085, + "step": 1730 + }, + { + "epoch": 1.6893865628042843, + "grad_norm": 1.680264316266455, + "learning_rate": 8.857907294841052e-08, + "loss": 2.2427, + "step": 1735 + }, + { + "epoch": 1.694255111976631, + "grad_norm": 1.7344138245065615, + "learning_rate": 8.76961116129008e-08, + "loss": 2.2503, + "step": 1740 + }, + { + "epoch": 1.6991236611489775, + "grad_norm": 1.7400368127622114, + "learning_rate": 8.683155991928096e-08, + "loss": 2.2245, + "step": 1745 + }, + { + "epoch": 1.7039922103213243, + "grad_norm": 1.7436841437009907, + "learning_rate": 8.598507168924832e-08, + "loss": 2.229, + "step": 1750 + }, + { + "epoch": 1.7088607594936709, + "grad_norm": 1.718735182879372, + "learning_rate": 8.515630654295236e-08, + "loss": 2.214, + "step": 1755 + }, + { + "epoch": 1.7137293086660175, + "grad_norm": 2.4325656334712416, + "learning_rate": 8.43449298138329e-08, + "loss": 2.2273, + "step": 1760 + }, + { + "epoch": 1.7185978578383643, + "grad_norm": 1.898796758188917, + "learning_rate": 8.355061246453264e-08, + "loss": 2.2182, + "step": 1765 + }, + { + "epoch": 1.7234664070107109, + "grad_norm": 1.7705269671220705, + "learning_rate": 8.277303100387306e-08, + "loss": 2.2375, + "step": 1770 + }, + { + "epoch": 1.7283349561830574, + "grad_norm": 1.7467032834457301, + "learning_rate": 8.201186740488162e-08, + "loss": 2.2547, + "step": 1775 + }, + { + "epoch": 1.7332035053554042, + "grad_norm": 1.7519508473607348, + "learning_rate": 8.126680902385984e-08, + "loss": 2.2533, + "step": 1780 + }, + { + "epoch": 1.7380720545277506, + "grad_norm": 1.6602010890382983, + "learning_rate": 8.053754852048118e-08, + "loss": 2.2409, + "step": 1785 + }, + { + "epoch": 1.7429406037000974, + "grad_norm": 1.7182534815184765, + "learning_rate": 7.982378377890754e-08, + "loss": 2.2169, + "step": 1790 + }, + { + "epoch": 1.747809152872444, + "grad_norm": 1.7132799050190683, + "learning_rate": 7.912521782991345e-08, + "loss": 2.21, + "step": 1795 + }, + { + "epoch": 1.7526777020447906, + "grad_norm": 1.7492486479656935, + "learning_rate": 7.844155877400776e-08, + "loss": 2.254, + "step": 1800 + }, + { + "epoch": 1.7526777020447906, + "eval_loss": 2.4266231060028076, + "eval_runtime": 84.8728, + "eval_samples_per_second": 85.999, + "eval_steps_per_second": 0.683, + "step": 1800 + }, + { + "epoch": 1.7575462512171374, + "grad_norm": 1.7982513495954076, + "learning_rate": 7.777251970554109e-08, + "loss": 2.2446, + "step": 1805 + }, + { + "epoch": 1.762414800389484, + "grad_norm": 1.7235517571422887, + "learning_rate": 7.711781863779007e-08, + "loss": 2.1939, + "step": 1810 + }, + { + "epoch": 1.7672833495618305, + "grad_norm": 1.7506770945570296, + "learning_rate": 7.64771784290061e-08, + "loss": 2.2339, + "step": 1815 + }, + { + "epoch": 1.7721518987341773, + "grad_norm": 1.961410951418005, + "learning_rate": 7.58503267094196e-08, + "loss": 2.2686, + "step": 1820 + }, + { + "epoch": 1.7770204479065237, + "grad_norm": 1.8140605591823493, + "learning_rate": 7.523699580918899e-08, + "loss": 2.2136, + "step": 1825 + }, + { + 
"epoch": 1.7818889970788705, + "grad_norm": 1.733018609908853, + "learning_rate": 7.463692268728381e-08, + "loss": 2.165, + "step": 1830 + }, + { + "epoch": 1.7867575462512173, + "grad_norm": 1.6824331979801876, + "learning_rate": 7.404984886129268e-08, + "loss": 2.2369, + "step": 1835 + }, + { + "epoch": 1.7916260954235637, + "grad_norm": 1.8520431760674112, + "learning_rate": 7.347552033814525e-08, + "loss": 2.2612, + "step": 1840 + }, + { + "epoch": 1.7964946445959105, + "grad_norm": 1.7117045631055423, + "learning_rate": 7.291368754573846e-08, + "loss": 2.2425, + "step": 1845 + }, + { + "epoch": 1.801363193768257, + "grad_norm": 1.7052819169533668, + "learning_rate": 7.236410526545777e-08, + "loss": 2.196, + "step": 1850 + }, + { + "epoch": 1.8062317429406036, + "grad_norm": 1.6384344799557868, + "learning_rate": 7.182653256558277e-08, + "loss": 2.2563, + "step": 1855 + }, + { + "epoch": 1.8111002921129504, + "grad_norm": 1.726079611316107, + "learning_rate": 7.130073273556794e-08, + "loss": 2.2455, + "step": 1860 + }, + { + "epoch": 1.815968841285297, + "grad_norm": 1.8072162191497745, + "learning_rate": 7.07864732211891e-08, + "loss": 2.2181, + "step": 1865 + }, + { + "epoch": 1.8208373904576436, + "grad_norm": 1.720087653439122, + "learning_rate": 7.028352556054533e-08, + "loss": 2.2295, + "step": 1870 + }, + { + "epoch": 1.8257059396299904, + "grad_norm": 1.7654960116615692, + "learning_rate": 6.979166532090796e-08, + "loss": 2.2391, + "step": 1875 + }, + { + "epoch": 1.8305744888023368, + "grad_norm": 1.735053092580652, + "learning_rate": 6.931067203640622e-08, + "loss": 2.2372, + "step": 1880 + }, + { + "epoch": 1.8354430379746836, + "grad_norm": 0.8624508534667337, + "learning_rate": 6.884032914654115e-08, + "loss": 2.2893, + "step": 1885 + }, + { + "epoch": 1.8403115871470301, + "grad_norm": 1.7141597133248032, + "learning_rate": 6.838042393551797e-08, + "loss": 2.1609, + "step": 1890 + }, + { + "epoch": 1.8451801363193767, + "grad_norm": 1.7126007991228827, + "learning_rate": 6.793074747238838e-08, + "loss": 2.1911, + "step": 1895 + }, + { + "epoch": 1.8500486854917235, + "grad_norm": 1.7133412238684549, + "learning_rate": 6.749109455199282e-08, + "loss": 2.2345, + "step": 1900 + }, + { + "epoch": 1.85491723466407, + "grad_norm": 1.7274466496979972, + "learning_rate": 6.706126363669515e-08, + "loss": 2.2514, + "step": 1905 + }, + { + "epoch": 1.8597857838364167, + "grad_norm": 1.7189469301901343, + "learning_rate": 6.664105679889947e-08, + "loss": 2.2458, + "step": 1910 + }, + { + "epoch": 1.8646543330087635, + "grad_norm": 1.690182741334165, + "learning_rate": 6.623027966434113e-08, + "loss": 2.2038, + "step": 1915 + }, + { + "epoch": 1.86952288218111, + "grad_norm": 1.7617528535634877, + "learning_rate": 6.582874135614325e-08, + "loss": 2.1569, + "step": 1920 + }, + { + "epoch": 1.8743914313534566, + "grad_norm": 1.7229096608807917, + "learning_rate": 6.54362544396295e-08, + "loss": 2.2416, + "step": 1925 + }, + { + "epoch": 1.8792599805258035, + "grad_norm": 1.796873830830146, + "learning_rate": 6.505263486788538e-08, + "loss": 2.2388, + "step": 1930 + }, + { + "epoch": 1.8841285296981498, + "grad_norm": 1.6550586704575283, + "learning_rate": 6.46777019280588e-08, + "loss": 2.2445, + "step": 1935 + }, + { + "epoch": 1.8889970788704966, + "grad_norm": 1.7133733906940865, + "learning_rate": 6.431127818839212e-08, + "loss": 2.1907, + "step": 1940 + }, + { + "epoch": 1.8938656280428432, + "grad_norm": 1.7131092326344233, + "learning_rate": 6.395318944597702e-08, + "loss": 
2.2508, + "step": 1945 + }, + { + "epoch": 1.8987341772151898, + "grad_norm": 1.7975356797064426, + "learning_rate": 6.360326467522418e-08, + "loss": 2.2619, + "step": 1950 + }, + { + "epoch": 1.9036027263875366, + "grad_norm": 1.7927617071731754, + "learning_rate": 6.326133597703928e-08, + "loss": 2.2306, + "step": 1955 + }, + { + "epoch": 1.9084712755598832, + "grad_norm": 1.7327613795288057, + "learning_rate": 6.292723852869772e-08, + "loss": 2.2258, + "step": 1960 + }, + { + "epoch": 1.9133398247322297, + "grad_norm": 1.7114888287588472, + "learning_rate": 6.260081053440938e-08, + "loss": 2.2258, + "step": 1965 + }, + { + "epoch": 1.9182083739045765, + "grad_norm": 1.7617910174679117, + "learning_rate": 6.228189317656625e-08, + "loss": 2.2214, + "step": 1970 + }, + { + "epoch": 1.9230769230769231, + "grad_norm": 1.69813136105815, + "learning_rate": 6.197033056766434e-08, + "loss": 2.2189, + "step": 1975 + }, + { + "epoch": 1.9279454722492697, + "grad_norm": 1.668741197040639, + "learning_rate": 6.166596970289254e-08, + "loss": 2.2168, + "step": 1980 + }, + { + "epoch": 1.9328140214216165, + "grad_norm": 1.7177940007984969, + "learning_rate": 6.136866041338061e-08, + "loss": 2.2131, + "step": 1985 + }, + { + "epoch": 1.9376825705939629, + "grad_norm": 2.097413251733257, + "learning_rate": 6.107825532009846e-08, + "loss": 2.2431, + "step": 1990 + }, + { + "epoch": 1.9425511197663097, + "grad_norm": 1.6798259076762905, + "learning_rate": 6.079460978839945e-08, + "loss": 2.273, + "step": 1995 + }, + { + "epoch": 1.9474196689386563, + "grad_norm": 1.7260224966683506, + "learning_rate": 6.051758188319985e-08, + "loss": 2.2508, + "step": 2000 + }, + { + "epoch": 1.9474196689386563, + "eval_loss": 2.4265518188476562, + "eval_runtime": 85.2652, + "eval_samples_per_second": 85.604, + "eval_steps_per_second": 0.68, + "step": 2000 + }, + { + "epoch": 1.9522882181110028, + "grad_norm": 1.6953303266550364, + "learning_rate": 6.024703232478723e-08, + "loss": 2.2536, + "step": 2005 + }, + { + "epoch": 1.9571567672833496, + "grad_norm": 1.7537091815900079, + "learning_rate": 5.998282444525052e-08, + "loss": 2.1932, + "step": 2010 + }, + { + "epoch": 1.9620253164556962, + "grad_norm": 1.6913312559085918, + "learning_rate": 5.97248241455241e-08, + "loss": 2.2097, + "step": 2015 + }, + { + "epoch": 1.9668938656280428, + "grad_norm": 1.709640143824205, + "learning_rate": 5.947289985303898e-08, + "loss": 2.2739, + "step": 2020 + }, + { + "epoch": 1.9717624148003896, + "grad_norm": 1.769591500268775, + "learning_rate": 5.922692247997387e-08, + "loss": 2.2345, + "step": 2025 + }, + { + "epoch": 1.976630963972736, + "grad_norm": 1.7089545500913113, + "learning_rate": 5.898676538209882e-08, + "loss": 2.1989, + "step": 2030 + }, + { + "epoch": 1.9814995131450828, + "grad_norm": 1.7324755039666533, + "learning_rate": 5.8752304318204655e-08, + "loss": 2.2296, + "step": 2035 + }, + { + "epoch": 1.9863680623174296, + "grad_norm": 1.7254800842506772, + "learning_rate": 5.852341741011111e-08, + "loss": 2.193, + "step": 2040 + }, + { + "epoch": 1.991236611489776, + "grad_norm": 1.7451878538885703, + "learning_rate": 5.829998510324686e-08, + "loss": 2.2829, + "step": 2045 + }, + { + "epoch": 1.9961051606621227, + "grad_norm": 1.7607671547384287, + "learning_rate": 5.8081890127794496e-08, + "loss": 2.2133, + "step": 2050 + }, + { + "epoch": 2.0009737098344695, + "grad_norm": 1.7100416555620679, + "learning_rate": 5.7869017460393855e-08, + "loss": 2.233, + "step": 2055 + }, + { + "epoch": 2.005842259006816, + "grad_norm": 
1.746899480839477, + "learning_rate": 5.7661254286396653e-08, + "loss": 2.2424, + "step": 2060 + }, + { + "epoch": 2.0107108081791627, + "grad_norm": 1.7305460820757212, + "learning_rate": 5.74584899626664e-08, + "loss": 2.1762, + "step": 2065 + }, + { + "epoch": 2.015579357351509, + "grad_norm": 1.7063747134288176, + "learning_rate": 5.726061598091625e-08, + "loss": 2.2345, + "step": 2070 + }, + { + "epoch": 2.020447906523856, + "grad_norm": 1.7115321830413128, + "learning_rate": 5.706752593157899e-08, + "loss": 2.2415, + "step": 2075 + }, + { + "epoch": 2.0253164556962027, + "grad_norm": 1.7315759852916324, + "learning_rate": 5.687911546820225e-08, + "loss": 2.2013, + "step": 2080 + }, + { + "epoch": 2.030185004868549, + "grad_norm": 1.7053153190560344, + "learning_rate": 5.669528227236286e-08, + "loss": 2.1833, + "step": 2085 + }, + { + "epoch": 2.035053554040896, + "grad_norm": 1.7480977685627526, + "learning_rate": 5.6515926019093655e-08, + "loss": 2.211, + "step": 2090 + }, + { + "epoch": 2.0399221032132426, + "grad_norm": 1.7325131107565894, + "learning_rate": 5.6340948342816956e-08, + "loss": 2.1746, + "step": 2095 + }, + { + "epoch": 2.044790652385589, + "grad_norm": 1.7861322062259097, + "learning_rate": 5.617025280377817e-08, + "loss": 2.2137, + "step": 2100 + }, + { + "epoch": 2.049659201557936, + "grad_norm": 1.7407845229020498, + "learning_rate": 5.6003744854973425e-08, + "loss": 2.2327, + "step": 2105 + }, + { + "epoch": 2.0545277507302826, + "grad_norm": 1.7703593488238105, + "learning_rate": 5.584133180956534e-08, + "loss": 2.2133, + "step": 2110 + }, + { + "epoch": 2.059396299902629, + "grad_norm": 1.7217536220585616, + "learning_rate": 5.568292280878073e-08, + "loss": 2.1892, + "step": 2115 + }, + { + "epoch": 2.0642648490749758, + "grad_norm": 1.7276020192361135, + "learning_rate": 5.552842879028437e-08, + "loss": 2.2204, + "step": 2120 + }, + { + "epoch": 2.069133398247322, + "grad_norm": 1.727960780515485, + "learning_rate": 5.537776245702285e-08, + "loss": 2.1996, + "step": 2125 + }, + { + "epoch": 2.074001947419669, + "grad_norm": 1.7514773544281632, + "learning_rate": 5.523083824653292e-08, + "loss": 2.1812, + "step": 2130 + }, + { + "epoch": 2.0788704965920157, + "grad_norm": 1.7204243933149874, + "learning_rate": 5.5087572300708064e-08, + "loss": 2.2412, + "step": 2135 + }, + { + "epoch": 2.083739045764362, + "grad_norm": 1.7770689913358608, + "learning_rate": 5.49478824360182e-08, + "loss": 2.2175, + "step": 2140 + }, + { + "epoch": 2.088607594936709, + "grad_norm": 1.8118845495638691, + "learning_rate": 5.4811688114176284e-08, + "loss": 2.2016, + "step": 2145 + }, + { + "epoch": 2.0934761441090557, + "grad_norm": 1.746154499517824, + "learning_rate": 5.46789104132466e-08, + "loss": 2.2338, + "step": 2150 + }, + { + "epoch": 2.098344693281402, + "grad_norm": 1.7040405445808213, + "learning_rate": 5.454947199918886e-08, + "loss": 2.2298, + "step": 2155 + }, + { + "epoch": 2.103213242453749, + "grad_norm": 1.6905682803922102, + "learning_rate": 5.4423297097832996e-08, + "loss": 2.2539, + "step": 2160 + }, + { + "epoch": 2.108081791626095, + "grad_norm": 1.7302575758498189, + "learning_rate": 5.430031146727882e-08, + "loss": 2.2279, + "step": 2165 + }, + { + "epoch": 2.112950340798442, + "grad_norm": 1.771661817904592, + "learning_rate": 5.418044237071543e-08, + "loss": 2.1828, + "step": 2170 + }, + { + "epoch": 2.117818889970789, + "grad_norm": 1.726387016486399, + "learning_rate": 5.406361854965489e-08, + "loss": 2.2167, + "step": 2175 + }, + { + "epoch": 
2.122687439143135, + "grad_norm": 1.72736148684106, + "learning_rate": 5.394977019757503e-08, + "loss": 2.1627, + "step": 2180 + }, + { + "epoch": 2.127555988315482, + "grad_norm": 1.732688655222483, + "learning_rate": 5.3838828933965965e-08, + "loss": 2.2482, + "step": 2185 + }, + { + "epoch": 2.132424537487829, + "grad_norm": 1.765418024490754, + "learning_rate": 5.373072777877539e-08, + "loss": 2.2416, + "step": 2190 + }, + { + "epoch": 2.137293086660175, + "grad_norm": 1.7905409309741185, + "learning_rate": 5.3625401127247335e-08, + "loss": 2.2049, + "step": 2195 + }, + { + "epoch": 2.142161635832522, + "grad_norm": 1.7285450482838325, + "learning_rate": 5.3522784725149425e-08, + "loss": 2.2112, + "step": 2200 + }, + { + "epoch": 2.142161635832522, + "eval_loss": 2.4287049770355225, + "eval_runtime": 85.2698, + "eval_samples_per_second": 85.599, + "eval_steps_per_second": 0.68, + "step": 2200 + }, + { + "epoch": 2.1470301850048688, + "grad_norm": 1.7298233620729722, + "learning_rate": 5.3422815644383576e-08, + "loss": 2.2268, + "step": 2205 + }, + { + "epoch": 2.151898734177215, + "grad_norm": 1.7106243642725834, + "learning_rate": 5.332543225897528e-08, + "loss": 2.1823, + "step": 2210 + }, + { + "epoch": 2.156767283349562, + "grad_norm": 1.7093421731817005, + "learning_rate": 5.3230574221436374e-08, + "loss": 2.2282, + "step": 2215 + }, + { + "epoch": 2.1616358325219083, + "grad_norm": 1.7311568429821071, + "learning_rate": 5.313818243949664e-08, + "loss": 2.2213, + "step": 2220 + }, + { + "epoch": 2.166504381694255, + "grad_norm": 1.7674044638597066, + "learning_rate": 5.3048199053199294e-08, + "loss": 2.2254, + "step": 2225 + }, + { + "epoch": 2.171372930866602, + "grad_norm": 1.7115381111757135, + "learning_rate": 5.296056741235573e-08, + "loss": 2.2418, + "step": 2230 + }, + { + "epoch": 2.1762414800389482, + "grad_norm": 1.6790013492814684, + "learning_rate": 5.2875232054354564e-08, + "loss": 2.2243, + "step": 2235 + }, + { + "epoch": 2.181110029211295, + "grad_norm": 1.7513052402271196, + "learning_rate": 5.279213868232073e-08, + "loss": 2.247, + "step": 2240 + }, + { + "epoch": 2.185978578383642, + "grad_norm": 1.8041144340469755, + "learning_rate": 5.271123414361961e-08, + "loss": 2.2318, + "step": 2245 + }, + { + "epoch": 2.190847127555988, + "grad_norm": 1.711266621732132, + "learning_rate": 5.263246640870184e-08, + "loss": 2.2393, + "step": 2250 + }, + { + "epoch": 2.195715676728335, + "grad_norm": 1.6969264596911122, + "learning_rate": 5.2555784550284366e-08, + "loss": 2.2087, + "step": 2255 + }, + { + "epoch": 2.2005842259006814, + "grad_norm": 1.7651857140540494, + "learning_rate": 5.2481138722863035e-08, + "loss": 2.2069, + "step": 2260 + }, + { + "epoch": 2.205452775073028, + "grad_norm": 1.7817072494501394, + "learning_rate": 5.240848014255253e-08, + "loss": 2.2051, + "step": 2265 + }, + { + "epoch": 2.210321324245375, + "grad_norm": 1.7343663502264346, + "learning_rate": 5.233776106724918e-08, + "loss": 2.2389, + "step": 2270 + }, + { + "epoch": 2.2151898734177213, + "grad_norm": 1.6921255572364247, + "learning_rate": 5.2268934777112365e-08, + "loss": 2.2362, + "step": 2275 + }, + { + "epoch": 2.220058422590068, + "grad_norm": 1.7013539832560813, + "learning_rate": 5.220195555536015e-08, + "loss": 2.2519, + "step": 2280 + }, + { + "epoch": 2.224926971762415, + "grad_norm": 1.785593606024491, + "learning_rate": 5.213677866937508e-08, + "loss": 2.2231, + "step": 2285 + }, + { + "epoch": 2.2297955209347613, + "grad_norm": 1.7400661751479558, + "learning_rate": 
5.207336035211581e-08, + "loss": 2.2306, + "step": 2290 + }, + { + "epoch": 2.234664070107108, + "grad_norm": 1.7513785103045856, + "learning_rate": 5.201165778383047e-08, + "loss": 2.184, + "step": 2295 + }, + { + "epoch": 2.239532619279455, + "grad_norm": 1.8372816676992572, + "learning_rate": 5.19516290740677e-08, + "loss": 2.1913, + "step": 2300 + }, + { + "epoch": 2.2444011684518013, + "grad_norm": 1.7411317592151172, + "learning_rate": 5.189323324398133e-08, + "loss": 2.1749, + "step": 2305 + }, + { + "epoch": 2.249269717624148, + "grad_norm": 1.7262426588954412, + "learning_rate": 5.1836430208924614e-08, + "loss": 2.1902, + "step": 2310 + }, + { + "epoch": 2.2541382667964944, + "grad_norm": 1.7496058178900977, + "learning_rate": 5.1781180761330104e-08, + "loss": 2.2354, + "step": 2315 + }, + { + "epoch": 2.2590068159688412, + "grad_norm": 1.7515829381648897, + "learning_rate": 5.1727446553871265e-08, + "loss": 2.1717, + "step": 2320 + }, + { + "epoch": 2.263875365141188, + "grad_norm": 1.7101981117837521, + "learning_rate": 5.1675190082901985e-08, + "loss": 2.1983, + "step": 2325 + }, + { + "epoch": 2.2687439143135344, + "grad_norm": 1.674731414303109, + "learning_rate": 5.1624374672169996e-08, + "loss": 2.1964, + "step": 2330 + }, + { + "epoch": 2.273612463485881, + "grad_norm": 1.8194729895132122, + "learning_rate": 5.157496445680062e-08, + "loss": 2.208, + "step": 2335 + }, + { + "epoch": 2.278481012658228, + "grad_norm": 1.7198751813449369, + "learning_rate": 5.152692436754697e-08, + "loss": 2.2088, + "step": 2340 + }, + { + "epoch": 2.2833495618305744, + "grad_norm": 1.7052083287832007, + "learning_rate": 5.1480220115302935e-08, + "loss": 2.2412, + "step": 2345 + }, + { + "epoch": 2.288218111002921, + "grad_norm": 1.6994269312553734, + "learning_rate": 5.143481817587523e-08, + "loss": 2.2523, + "step": 2350 + }, + { + "epoch": 2.293086660175268, + "grad_norm": 1.7821059623881625, + "learning_rate": 5.139068577501104e-08, + "loss": 2.2306, + "step": 2355 + }, + { + "epoch": 2.2979552093476143, + "grad_norm": 1.7655828205712845, + "learning_rate": 5.134779087367746e-08, + "loss": 2.2192, + "step": 2360 + }, + { + "epoch": 2.302823758519961, + "grad_norm": 1.764331702411005, + "learning_rate": 5.130610215358936e-08, + "loss": 2.2528, + "step": 2365 + }, + { + "epoch": 2.3076923076923075, + "grad_norm": 1.814774977446024, + "learning_rate": 5.126558900298217e-08, + "loss": 2.2311, + "step": 2370 + }, + { + "epoch": 2.3125608568646543, + "grad_norm": 1.7749398004090056, + "learning_rate": 5.122622150262591e-08, + "loss": 2.1965, + "step": 2375 + }, + { + "epoch": 2.317429406037001, + "grad_norm": 1.7732754323290614, + "learning_rate": 5.118797041207741e-08, + "loss": 2.2307, + "step": 2380 + }, + { + "epoch": 2.3222979552093475, + "grad_norm": 1.7204661711193003, + "learning_rate": 5.1150807156166916e-08, + "loss": 2.1813, + "step": 2385 + }, + { + "epoch": 2.3271665043816943, + "grad_norm": 1.7830208677372077, + "learning_rate": 5.111470381171611e-08, + "loss": 2.1976, + "step": 2390 + }, + { + "epoch": 2.332035053554041, + "grad_norm": 1.7100457234343944, + "learning_rate": 5.1079633094484e-08, + "loss": 2.2421, + "step": 2395 + }, + { + "epoch": 2.3369036027263874, + "grad_norm": 1.7579822935018907, + "learning_rate": 5.104556834633745e-08, + "loss": 2.2063, + "step": 2400 + }, + { + "epoch": 2.3369036027263874, + "eval_loss": 2.4292638301849365, + "eval_runtime": 85.8612, + "eval_samples_per_second": 85.009, + "eval_steps_per_second": 0.676, + "step": 2400 + }, + { + "epoch": 
2.3417721518987342, + "grad_norm": 1.7345007813133775, + "learning_rate": 5.101248352264326e-08, + "loss": 2.2243, + "step": 2405 + }, + { + "epoch": 2.346640701071081, + "grad_norm": 1.7248955210954013, + "learning_rate": 5.098035317987838e-08, + "loss": 2.2096, + "step": 2410 + }, + { + "epoch": 2.3515092502434274, + "grad_norm": 1.7460382186199548, + "learning_rate": 5.0949152463455285e-08, + "loss": 2.2034, + "step": 2415 + }, + { + "epoch": 2.356377799415774, + "grad_norm": 1.7441544396124007, + "learning_rate": 5.09188570957593e-08, + "loss": 2.1584, + "step": 2420 + }, + { + "epoch": 2.3612463485881205, + "grad_norm": 1.7037153131860179, + "learning_rate": 5.0889443364394804e-08, + "loss": 2.222, + "step": 2425 + }, + { + "epoch": 2.3661148977604674, + "grad_norm": 1.7280060531644463, + "learning_rate": 5.0860888110637265e-08, + "loss": 2.2324, + "step": 2430 + }, + { + "epoch": 2.370983446932814, + "grad_norm": 1.7906585557070347, + "learning_rate": 5.083316871808814e-08, + "loss": 2.2047, + "step": 2435 + }, + { + "epoch": 2.3758519961051605, + "grad_norm": 1.7845861555659082, + "learning_rate": 5.080626310152955e-08, + "loss": 2.1902, + "step": 2440 + }, + { + "epoch": 2.3807205452775073, + "grad_norm": 1.6806913458915649, + "learning_rate": 5.078014969597595e-08, + "loss": 2.2, + "step": 2445 + }, + { + "epoch": 2.385589094449854, + "grad_norm": 1.7905906194923593, + "learning_rate": 5.075480744591971e-08, + "loss": 2.1989, + "step": 2450 + }, + { + "epoch": 2.3904576436222005, + "grad_norm": 1.7807103616717044, + "learning_rate": 5.073021579476786e-08, + "loss": 2.2194, + "step": 2455 + }, + { + "epoch": 2.3953261927945473, + "grad_norm": 1.6957268999562412, + "learning_rate": 5.070635467446715e-08, + "loss": 2.1982, + "step": 2460 + }, + { + "epoch": 2.400194741966894, + "grad_norm": 1.775675901114813, + "learning_rate": 5.0683204495314504e-08, + "loss": 2.213, + "step": 2465 + }, + { + "epoch": 2.4050632911392404, + "grad_norm": 1.7829403999278695, + "learning_rate": 5.06607461359503e-08, + "loss": 2.2226, + "step": 2470 + }, + { + "epoch": 2.4099318403115872, + "grad_norm": 1.766705522512629, + "learning_rate": 5.06389609335315e-08, + "loss": 2.1896, + "step": 2475 + }, + { + "epoch": 2.4148003894839336, + "grad_norm": 1.7827496357552255, + "learning_rate": 5.0617830674082116e-08, + "loss": 2.1947, + "step": 2480 + }, + { + "epoch": 2.4196689386562804, + "grad_norm": 1.7451999252246684, + "learning_rate": 5.059733758301827e-08, + "loss": 2.2393, + "step": 2485 + }, + { + "epoch": 2.424537487828627, + "grad_norm": 1.721359686507254, + "learning_rate": 5.057746431584517e-08, + "loss": 2.2388, + "step": 2490 + }, + { + "epoch": 2.4294060370009736, + "grad_norm": 1.7903488742098908, + "learning_rate": 5.055819394902345e-08, + "loss": 2.1765, + "step": 2495 + }, + { + "epoch": 2.4342745861733204, + "grad_norm": 1.7278059859253259, + "learning_rate": 5.053950997100227e-08, + "loss": 2.188, + "step": 2500 + }, + { + "epoch": 2.439143135345667, + "grad_norm": 1.723905932162126, + "learning_rate": 5.052139627341665e-08, + "loss": 2.1994, + "step": 2505 + }, + { + "epoch": 2.4440116845180135, + "grad_norm": 1.8796866769396487, + "learning_rate": 5.050383714244649e-08, + "loss": 2.2311, + "step": 2510 + }, + { + "epoch": 2.4488802336903603, + "grad_norm": 1.7459716379486228, + "learning_rate": 5.0486817250334816e-08, + "loss": 2.2485, + "step": 2515 + }, + { + "epoch": 2.453748782862707, + "grad_norm": 1.7365026105109531, + "learning_rate": 5.047032164706284e-08, + "loss": 2.1676, + 
"step": 2520 + }, + { + "epoch": 2.4586173320350535, + "grad_norm": 1.7081299835491885, + "learning_rate": 5.045433575217931e-08, + "loss": 2.1875, + "step": 2525 + }, + { + "epoch": 2.4634858812074003, + "grad_norm": 1.744755862782243, + "learning_rate": 5.043884534678184e-08, + "loss": 2.245, + "step": 2530 + }, + { + "epoch": 2.4683544303797467, + "grad_norm": 1.7422537825667799, + "learning_rate": 5.042383656564784e-08, + "loss": 2.217, + "step": 2535 + }, + { + "epoch": 2.4732229795520935, + "grad_norm": 1.734841704470881, + "learning_rate": 5.040929588951272e-08, + "loss": 2.2173, + "step": 2540 + }, + { + "epoch": 2.4780915287244403, + "grad_norm": 1.779810459600651, + "learning_rate": 5.039521013749303e-08, + "loss": 2.247, + "step": 2545 + }, + { + "epoch": 2.4829600778967866, + "grad_norm": 1.8117164085811932, + "learning_rate": 5.0381566459652284e-08, + "loss": 2.237, + "step": 2550 + }, + { + "epoch": 2.4878286270691334, + "grad_norm": 1.7287658099278131, + "learning_rate": 5.0368352329707235e-08, + "loss": 2.1902, + "step": 2555 + }, + { + "epoch": 2.49269717624148, + "grad_norm": 1.7649214315832387, + "learning_rate": 5.0355555537872345e-08, + "loss": 2.2109, + "step": 2560 + }, + { + "epoch": 2.4975657254138266, + "grad_norm": 1.662642992634986, + "learning_rate": 5.0343164183840344e-08, + "loss": 2.197, + "step": 2565 + }, + { + "epoch": 2.5024342745861734, + "grad_norm": 1.7337104838406339, + "learning_rate": 5.033116666989654e-08, + "loss": 2.2101, + "step": 2570 + }, + { + "epoch": 2.50730282375852, + "grad_norm": 1.8134178787153215, + "learning_rate": 5.031955169416503e-08, + "loss": 2.1644, + "step": 2575 + }, + { + "epoch": 2.5121713729308666, + "grad_norm": 1.780434641418463, + "learning_rate": 5.0308308243984355e-08, + "loss": 2.2137, + "step": 2580 + }, + { + "epoch": 2.5170399221032134, + "grad_norm": 1.744873235024722, + "learning_rate": 5.0297425589410844e-08, + "loss": 2.1802, + "step": 2585 + }, + { + "epoch": 2.5219084712755597, + "grad_norm": 1.7946463871437135, + "learning_rate": 5.0286893276847386e-08, + "loss": 2.2194, + "step": 2590 + }, + { + "epoch": 2.5267770204479065, + "grad_norm": 1.7550387629748228, + "learning_rate": 5.0276701122795665e-08, + "loss": 2.1656, + "step": 2595 + }, + { + "epoch": 2.5316455696202533, + "grad_norm": 1.7656891772283707, + "learning_rate": 5.02668392077299e-08, + "loss": 2.2544, + "step": 2600 + }, + { + "epoch": 2.5316455696202533, + "eval_loss": 2.429072141647339, + "eval_runtime": 85.9551, + "eval_samples_per_second": 84.916, + "eval_steps_per_second": 0.675, + "step": 2600 + }, + { + "epoch": 2.5365141187925997, + "grad_norm": 1.8122468666175253, + "learning_rate": 5.025729787009003e-08, + "loss": 2.2305, + "step": 2605 + }, + { + "epoch": 2.5413826679649465, + "grad_norm": 1.6973041191639477, + "learning_rate": 5.024806770039247e-08, + "loss": 2.2292, + "step": 2610 + }, + { + "epoch": 2.546251217137293, + "grad_norm": 1.729259299213179, + "learning_rate": 5.023913953545651e-08, + "loss": 2.2614, + "step": 2615 + }, + { + "epoch": 2.5511197663096397, + "grad_norm": 1.7611375878198783, + "learning_rate": 5.023050445274437e-08, + "loss": 2.2225, + "step": 2620 + }, + { + "epoch": 2.5559883154819865, + "grad_norm": 1.7936750412570266, + "learning_rate": 5.022215376481317e-08, + "loss": 2.2203, + "step": 2625 + }, + { + "epoch": 2.5608568646543333, + "grad_norm": 1.7697623039425605, + "learning_rate": 5.021407901387688e-08, + "loss": 2.1863, + "step": 2630 + }, + { + "epoch": 2.5657254138266796, + "grad_norm": 
1.7467899336813517, + "learning_rate": 5.02062719664764e-08, + "loss": 2.2103, + "step": 2635 + }, + { + "epoch": 2.5705939629990264, + "grad_norm": 1.7889652968951377, + "learning_rate": 5.019872460825613e-08, + "loss": 2.2229, + "step": 2640 + }, + { + "epoch": 2.575462512171373, + "grad_norm": 1.722371905671811, + "learning_rate": 5.019142913884503e-08, + "loss": 2.1491, + "step": 2645 + }, + { + "epoch": 2.5803310613437196, + "grad_norm": 1.7502842489092898, + "learning_rate": 5.018437796684058e-08, + "loss": 2.2225, + "step": 2650 + }, + { + "epoch": 2.5851996105160664, + "grad_norm": 1.733643511652611, + "learning_rate": 5.0177563704893894e-08, + "loss": 2.2308, + "step": 2655 + }, + { + "epoch": 2.5900681596884128, + "grad_norm": 1.7507169934067448, + "learning_rate": 5.0170979164894177e-08, + "loss": 2.2319, + "step": 2660 + }, + { + "epoch": 2.5949367088607596, + "grad_norm": 1.7597206213080367, + "learning_rate": 5.016461735325101e-08, + "loss": 2.2221, + "step": 2665 + }, + { + "epoch": 2.599805258033106, + "grad_norm": 1.7484941859889187, + "learning_rate": 5.0158471466272625e-08, + "loss": 2.2408, + "step": 2670 + }, + { + "epoch": 2.6046738072054527, + "grad_norm": 1.7575142192005702, + "learning_rate": 5.0152534885638713e-08, + "loss": 2.2199, + "step": 2675 + }, + { + "epoch": 2.6095423563777995, + "grad_norm": 1.7220313810152217, + "learning_rate": 5.014680117396598e-08, + "loss": 2.172, + "step": 2680 + }, + { + "epoch": 2.6144109055501463, + "grad_norm": 1.7246730790979932, + "learning_rate": 5.0141264070464985e-08, + "loss": 2.2053, + "step": 2685 + }, + { + "epoch": 2.6192794547224927, + "grad_norm": 1.733539469198201, + "learning_rate": 5.013591748668665e-08, + "loss": 2.1883, + "step": 2690 + }, + { + "epoch": 2.6241480038948395, + "grad_norm": 1.708722183137744, + "learning_rate": 5.0130755502356856e-08, + "loss": 2.2084, + "step": 2695 + }, + { + "epoch": 2.629016553067186, + "grad_norm": 1.743046661098172, + "learning_rate": 5.0125772361297664e-08, + "loss": 2.1743, + "step": 2700 + }, + { + "epoch": 2.6338851022395326, + "grad_norm": 1.8128256440501649, + "learning_rate": 5.0120962467433614e-08, + "loss": 2.2052, + "step": 2705 + }, + { + "epoch": 2.6387536514118795, + "grad_norm": 1.7757973503695443, + "learning_rate": 5.0116320380881606e-08, + "loss": 2.2504, + "step": 2710 + }, + { + "epoch": 2.643622200584226, + "grad_norm": 1.7774946391866604, + "learning_rate": 5.0111840814122985e-08, + "loss": 2.1926, + "step": 2715 + }, + { + "epoch": 2.6484907497565726, + "grad_norm": 1.7302618686374809, + "learning_rate": 5.010751862825623e-08, + "loss": 2.1849, + "step": 2720 + }, + { + "epoch": 2.653359298928919, + "grad_norm": 1.7071516593785698, + "learning_rate": 5.0103348829328986e-08, + "loss": 2.1692, + "step": 2725 + }, + { + "epoch": 2.6582278481012658, + "grad_norm": 1.721147660276721, + "learning_rate": 5.009932656474795e-08, + "loss": 2.2232, + "step": 2730 + }, + { + "epoch": 2.6630963972736126, + "grad_norm": 1.8757024025402924, + "learning_rate": 5.0095447119765244e-08, + "loss": 2.193, + "step": 2735 + }, + { + "epoch": 2.667964946445959, + "grad_norm": 1.7734949594595664, + "learning_rate": 5.009170591403991e-08, + "loss": 2.2176, + "step": 2740 + }, + { + "epoch": 2.6728334956183057, + "grad_norm": 1.7300923376938375, + "learning_rate": 5.0088098498273176e-08, + "loss": 2.2405, + "step": 2745 + }, + { + "epoch": 2.6777020447906525, + "grad_norm": 1.715381643484051, + "learning_rate": 5.008462055091624e-08, + "loss": 2.1716, + "step": 2750 + }, + { + 
"epoch": 2.682570593962999, + "grad_norm": 1.7201490901820933, + "learning_rate": 5.00812678749491e-08, + "loss": 2.2163, + "step": 2755 + }, + { + "epoch": 2.6874391431353457, + "grad_norm": 1.776422114245558, + "learning_rate": 5.0078036394729376e-08, + "loss": 2.2385, + "step": 2760 + }, + { + "epoch": 2.6923076923076925, + "grad_norm": 1.7434342909806673, + "learning_rate": 5.007492215290964e-08, + "loss": 2.2148, + "step": 2765 + }, + { + "epoch": 2.697176241480039, + "grad_norm": 1.7303585866040545, + "learning_rate": 5.0071921307422136e-08, + "loss": 2.2202, + "step": 2770 + }, + { + "epoch": 2.7020447906523857, + "grad_norm": 1.7653235238408616, + "learning_rate": 5.00690301285296e-08, + "loss": 2.2438, + "step": 2775 + }, + { + "epoch": 2.706913339824732, + "grad_norm": 1.732574656691103, + "learning_rate": 5.006624499594101e-08, + "loss": 2.1988, + "step": 2780 + }, + { + "epoch": 2.711781888997079, + "grad_norm": 1.7030324021473955, + "learning_rate": 5.0063562395991076e-08, + "loss": 2.2623, + "step": 2785 + }, + { + "epoch": 2.7166504381694256, + "grad_norm": 1.9331342916421579, + "learning_rate": 5.006097891888214e-08, + "loss": 2.1995, + "step": 2790 + }, + { + "epoch": 2.721518987341772, + "grad_norm": 1.7201710098478449, + "learning_rate": 5.005849125598762e-08, + "loss": 2.2411, + "step": 2795 + }, + { + "epoch": 2.726387536514119, + "grad_norm": 1.6978407566709903, + "learning_rate": 5.0056096197215594e-08, + "loss": 2.2024, + "step": 2800 + }, + { + "epoch": 2.726387536514119, + "eval_loss": 2.4288954734802246, + "eval_runtime": 85.7694, + "eval_samples_per_second": 85.1, + "eval_steps_per_second": 0.676, + "step": 2800 + }, + { + "epoch": 2.731256085686465, + "grad_norm": 1.7625821445707386, + "learning_rate": 5.005379062843146e-08, + "loss": 2.245, + "step": 2805 + }, + { + "epoch": 2.736124634858812, + "grad_norm": 1.7304216492048587, + "learning_rate": 5.005157152893869e-08, + "loss": 2.223, + "step": 2810 + }, + { + "epoch": 2.7409931840311588, + "grad_norm": 1.755183680113624, + "learning_rate": 5.0049435969016435e-08, + "loss": 2.2239, + "step": 2815 + }, + { + "epoch": 2.7458617332035056, + "grad_norm": 1.728272875903093, + "learning_rate": 5.004738110751295e-08, + "loss": 2.237, + "step": 2820 + }, + { + "epoch": 2.750730282375852, + "grad_norm": 1.764407536913778, + "learning_rate": 5.0045404189493816e-08, + "loss": 2.2193, + "step": 2825 + }, + { + "epoch": 2.7555988315481987, + "grad_norm": 1.7145100854618731, + "learning_rate": 5.0043502543943914e-08, + "loss": 2.1959, + "step": 2830 + }, + { + "epoch": 2.760467380720545, + "grad_norm": 1.7322275336580597, + "learning_rate": 5.0041673581522057e-08, + "loss": 2.1898, + "step": 2835 + }, + { + "epoch": 2.765335929892892, + "grad_norm": 1.7454769702317845, + "learning_rate": 5.003991479236733e-08, + "loss": 2.2852, + "step": 2840 + }, + { + "epoch": 2.7702044790652387, + "grad_norm": 1.8214614189910026, + "learning_rate": 5.003822374395615e-08, + "loss": 2.2305, + "step": 2845 + }, + { + "epoch": 2.775073028237585, + "grad_norm": 1.8310587465226418, + "learning_rate": 5.0036598079009014e-08, + "loss": 2.1533, + "step": 2850 + }, + { + "epoch": 2.779941577409932, + "grad_norm": 1.773580257235929, + "learning_rate": 5.003503551344602e-08, + "loss": 2.2247, + "step": 2855 + }, + { + "epoch": 2.7848101265822782, + "grad_norm": 1.6972419215418109, + "learning_rate": 5.003353383439017e-08, + "loss": 2.2464, + "step": 2860 + }, + { + "epoch": 2.789678675754625, + "grad_norm": 1.7541525971250715, + "learning_rate": 
5.003209089821755e-08, + "loss": 2.2041, + "step": 2865 + }, + { + "epoch": 2.794547224926972, + "grad_norm": 1.7726611140742992, + "learning_rate": 5.003070462865347e-08, + "loss": 2.2351, + "step": 2870 + }, + { + "epoch": 2.7994157740993186, + "grad_norm": 1.7257606879168454, + "learning_rate": 5.002937301491361e-08, + "loss": 2.1964, + "step": 2875 + }, + { + "epoch": 2.804284323271665, + "grad_norm": 1.7665569916717356, + "learning_rate": 5.002809410988936e-08, + "loss": 2.2198, + "step": 2880 + }, + { + "epoch": 2.809152872444012, + "grad_norm": 0.8460817495380386, + "learning_rate": 5.00268660283764e-08, + "loss": 2.2017, + "step": 2885 + }, + { + "epoch": 2.814021421616358, + "grad_norm": 1.7781068744183743, + "learning_rate": 5.002568694534567e-08, + "loss": 2.2113, + "step": 2890 + }, + { + "epoch": 2.818889970788705, + "grad_norm": 1.70436430018188, + "learning_rate": 5.0024555094256e-08, + "loss": 2.2296, + "step": 2895 + }, + { + "epoch": 2.8237585199610518, + "grad_norm": 1.801068375798501, + "learning_rate": 5.002346876540729e-08, + "loss": 2.2755, + "step": 2900 + }, + { + "epoch": 2.828627069133398, + "grad_norm": 1.7984300376552464, + "learning_rate": 5.0022426304333776e-08, + "loss": 2.2464, + "step": 2905 + }, + { + "epoch": 2.833495618305745, + "grad_norm": 1.7278702590224992, + "learning_rate": 5.002142611023626e-08, + "loss": 2.2223, + "step": 2910 + }, + { + "epoch": 2.8383641674780913, + "grad_norm": 1.7544765756817247, + "learning_rate": 5.0020466634452695e-08, + "loss": 2.2096, + "step": 2915 + }, + { + "epoch": 2.843232716650438, + "grad_norm": 1.7574601723708598, + "learning_rate": 5.001954637896626e-08, + "loss": 2.2034, + "step": 2920 + }, + { + "epoch": 2.848101265822785, + "grad_norm": 1.6850076585078704, + "learning_rate": 5.0018663894950185e-08, + "loss": 2.1788, + "step": 2925 + }, + { + "epoch": 2.8529698149951317, + "grad_norm": 1.734136831684721, + "learning_rate": 5.001781778134857e-08, + "loss": 2.171, + "step": 2930 + }, + { + "epoch": 2.857838364167478, + "grad_norm": 1.8530752594311186, + "learning_rate": 5.0017006683492444e-08, + "loss": 2.2323, + "step": 2935 + }, + { + "epoch": 2.862706913339825, + "grad_norm": 1.7452631968653791, + "learning_rate": 5.001622929175032e-08, + "loss": 2.2002, + "step": 2940 + }, + { + "epoch": 2.867575462512171, + "grad_norm": 1.765852324361209, + "learning_rate": 5.0015484340212554e-08, + "loss": 2.173, + "step": 2945 + }, + { + "epoch": 2.872444011684518, + "grad_norm": 1.7504024622386836, + "learning_rate": 5.0014770605408766e-08, + "loss": 2.1818, + "step": 2950 + }, + { + "epoch": 2.877312560856865, + "grad_norm": 1.709084582717418, + "learning_rate": 5.001408690505769e-08, + "loss": 2.2248, + "step": 2955 + }, + { + "epoch": 2.882181110029211, + "grad_norm": 1.726469521262587, + "learning_rate": 5.0013432096848647e-08, + "loss": 2.1754, + "step": 2960 + }, + { + "epoch": 2.887049659201558, + "grad_norm": 1.735487847062397, + "learning_rate": 5.001280507725414e-08, + "loss": 2.1754, + "step": 2965 + }, + { + "epoch": 2.8919182083739043, + "grad_norm": 1.7901732327186204, + "learning_rate": 5.0012204780372755e-08, + "loss": 2.2104, + "step": 2970 + }, + { + "epoch": 2.896786757546251, + "grad_norm": 1.7497305238142942, + "learning_rate": 5.001163017680177e-08, + "loss": 2.2131, + "step": 2975 + }, + { + "epoch": 2.901655306718598, + "grad_norm": 1.6912311894555403, + "learning_rate": 5.0011080272538895e-08, + "loss": 2.1984, + "step": 2980 + }, + { + "epoch": 2.9065238558909448, + "grad_norm": 
1.7550540502025136, + "learning_rate": 5.0010657486604285e-08, + "loss": 2.2325, + "step": 2985 + }, + { + "epoch": 2.911392405063291, + "grad_norm": 1.7830928527818988, + "learning_rate": 5.001014964512753e-08, + "loss": 2.2295, + "step": 2990 + }, + { + "epoch": 2.916260954235638, + "grad_norm": 1.7967938470353484, + "learning_rate": 5.000966389908247e-08, + "loss": 2.2238, + "step": 2995 + }, + { + "epoch": 2.9211295034079843, + "grad_norm": 1.73275794534534, + "learning_rate": 5.0009199379466084e-08, + "loss": 2.2074, + "step": 3000 + }, + { + "epoch": 2.9211295034079843, + "eval_loss": 2.4288480281829834, + "eval_runtime": 85.4779, + "eval_samples_per_second": 85.391, + "eval_steps_per_second": 0.679, + "step": 3000 + }, + { + "epoch": 2.925998052580331, + "grad_norm": 1.798848636728742, + "learning_rate": 5.000875524778925e-08, + "loss": 2.1889, + "step": 3005 + }, + { + "epoch": 2.930866601752678, + "grad_norm": 1.7669718423903236, + "learning_rate": 5.0008330695135104e-08, + "loss": 2.2008, + "step": 3010 + }, + { + "epoch": 2.9357351509250242, + "grad_norm": 1.7532733346720555, + "learning_rate": 5.000792494124235e-08, + "loss": 2.193, + "step": 3015 + }, + { + "epoch": 2.940603700097371, + "grad_norm": 1.7095158480632124, + "learning_rate": 5.00075372336131e-08, + "loss": 2.2389, + "step": 3020 + }, + { + "epoch": 2.9454722492697174, + "grad_norm": 1.7236728786130506, + "learning_rate": 5.0007166846644555e-08, + "loss": 2.2444, + "step": 3025 + }, + { + "epoch": 2.950340798442064, + "grad_norm": 1.7363740627339725, + "learning_rate": 5.000681308078417e-08, + "loss": 2.22, + "step": 3030 + }, + { + "epoch": 2.955209347614411, + "grad_norm": 1.728808934644446, + "learning_rate": 5.000647526170754e-08, + "loss": 2.2163, + "step": 3035 + }, + { + "epoch": 2.960077896786758, + "grad_norm": 1.7449413432648693, + "learning_rate": 5.000615273951875e-08, + "loss": 2.2779, + "step": 3040 + }, + { + "epoch": 2.964946445959104, + "grad_norm": 1.753647141186398, + "learning_rate": 5.000584488797249e-08, + "loss": 2.1997, + "step": 3045 + }, + { + "epoch": 2.969814995131451, + "grad_norm": 1.729320335247063, + "learning_rate": 5.000555110371747e-08, + "loss": 2.2329, + "step": 3050 + }, + { + "epoch": 2.9746835443037973, + "grad_norm": 1.7435846324509203, + "learning_rate": 5.000527080556074e-08, + "loss": 2.2455, + "step": 3055 + }, + { + "epoch": 2.979552093476144, + "grad_norm": 1.7544585151474008, + "learning_rate": 5.000500343375234e-08, + "loss": 2.2232, + "step": 3060 + }, + { + "epoch": 2.984420642648491, + "grad_norm": 1.8297856934072276, + "learning_rate": 5.00047484492898e-08, + "loss": 2.2025, + "step": 3065 + }, + { + "epoch": 2.9892891918208373, + "grad_norm": 1.8331124463008972, + "learning_rate": 5.000450533324217e-08, + "loss": 2.2346, + "step": 3070 + }, + { + "epoch": 2.994157740993184, + "grad_norm": 1.77523332941347, + "learning_rate": 5.0004273586092874e-08, + "loss": 2.2476, + "step": 3075 + }, + { + "epoch": 2.9990262901655305, + "grad_norm": 1.719560891287608, + "learning_rate": 5.0004052727101194e-08, + "loss": 2.2219, + "step": 3080 + }, + { + "epoch": 3.0038948393378773, + "grad_norm": 1.7451899184767314, + "learning_rate": 5.000384229368183e-08, + "loss": 2.267, + "step": 3085 + }, + { + "epoch": 3.008763388510224, + "grad_norm": 1.7132088078853227, + "learning_rate": 5.0003641840802084e-08, + "loss": 2.232, + "step": 3090 + }, + { + "epoch": 3.0136319376825704, + "grad_norm": 1.7074710408698504, + "learning_rate": 5.000345094039628e-08, + "loss": 2.2226, + "step": 
3095 + }, + { + "epoch": 3.0185004868549172, + "grad_norm": 1.7116453045970612, + "learning_rate": 5.000326918079705e-08, + "loss": 2.2132, + "step": 3100 + }, + { + "epoch": 3.023369036027264, + "grad_norm": 1.7478082412483065, + "learning_rate": 5.000309616618301e-08, + "loss": 2.2595, + "step": 3105 + }, + { + "epoch": 3.0282375851996104, + "grad_norm": 1.7390659451407764, + "learning_rate": 5.0002931516042464e-08, + "loss": 2.2532, + "step": 3110 + }, + { + "epoch": 3.033106134371957, + "grad_norm": 1.6629867778767704, + "learning_rate": 5.0002774864652723e-08, + "loss": 2.2157, + "step": 3115 + }, + { + "epoch": 3.037974683544304, + "grad_norm": 1.7430106410373127, + "learning_rate": 5.000262586057472e-08, + "loss": 2.2654, + "step": 3120 + }, + { + "epoch": 3.0428432327166504, + "grad_norm": 1.7699263854675031, + "learning_rate": 5.0002484166162455e-08, + "loss": 2.1838, + "step": 3125 + }, + { + "epoch": 3.047711781888997, + "grad_norm": 1.7970379405255692, + "learning_rate": 5.0002349457087e-08, + "loss": 2.1844, + "step": 3130 + }, + { + "epoch": 3.0525803310613435, + "grad_norm": 1.740063554373285, + "learning_rate": 5.00022214218746e-08, + "loss": 2.1975, + "step": 3135 + }, + { + "epoch": 3.0574488802336903, + "grad_norm": 1.7669764533755763, + "learning_rate": 5.000209976145863e-08, + "loss": 2.173, + "step": 3140 + }, + { + "epoch": 3.062317429406037, + "grad_norm": 1.7570288798515004, + "learning_rate": 5.000198418874495e-08, + "loss": 2.202, + "step": 3145 + }, + { + "epoch": 3.0671859785783835, + "grad_norm": 1.7640313096994278, + "learning_rate": 5.000187442819041e-08, + "loss": 2.2743, + "step": 3150 + }, + { + "epoch": 3.0720545277507303, + "grad_norm": 1.776697414652845, + "learning_rate": 5.000177021539416e-08, + "loss": 2.1953, + "step": 3155 + }, + { + "epoch": 3.076923076923077, + "grad_norm": 1.7469177371601763, + "learning_rate": 5.000167129670136e-08, + "loss": 2.2643, + "step": 3160 + }, + { + "epoch": 3.0817916260954235, + "grad_norm": 1.7981065248029078, + "learning_rate": 5.000157742881906e-08, + "loss": 2.2192, + "step": 3165 + }, + { + "epoch": 3.0866601752677703, + "grad_norm": 1.8048538458330963, + "learning_rate": 5.000148837844394e-08, + "loss": 2.2188, + "step": 3170 + }, + { + "epoch": 3.091528724440117, + "grad_norm": 1.77587620793656, + "learning_rate": 5.000140392190154e-08, + "loss": 2.205, + "step": 3175 + }, + { + "epoch": 3.0963972736124634, + "grad_norm": 1.7277030937813718, + "learning_rate": 5.000132384479669e-08, + "loss": 2.2418, + "step": 3180 + }, + { + "epoch": 3.1012658227848102, + "grad_norm": 1.7715455250981111, + "learning_rate": 5.0001247941674944e-08, + "loss": 2.195, + "step": 3185 + }, + { + "epoch": 3.1061343719571566, + "grad_norm": 1.7418270265834659, + "learning_rate": 5.00011760156946e-08, + "loss": 2.2188, + "step": 3190 + }, + { + "epoch": 3.1110029211295034, + "grad_norm": 1.7626475968666995, + "learning_rate": 5.000110787830913e-08, + "loss": 2.1969, + "step": 3195 + }, + { + "epoch": 3.11587147030185, + "grad_norm": 1.7554268739931094, + "learning_rate": 5.000104334895965e-08, + "loss": 2.2268, + "step": 3200 + }, + { + "epoch": 3.11587147030185, + "eval_loss": 2.4296844005584717, + "eval_runtime": 85.8007, + "eval_samples_per_second": 85.069, + "eval_steps_per_second": 0.676, + "step": 3200 + }, + { + "epoch": 3.1207400194741965, + "grad_norm": 1.768111175754245, + "learning_rate": 5.000098225477726e-08, + "loss": 2.1925, + "step": 3205 + }, + { + "epoch": 3.1256085686465434, + "grad_norm": 1.731673904293537, + 
"learning_rate": 5.000092443029491e-08, + "loss": 2.2291, + "step": 3210 + }, + { + "epoch": 3.13047711781889, + "grad_norm": 1.7474269730832317, + "learning_rate": 5.0000869717168615e-08, + "loss": 2.2431, + "step": 3215 + }, + { + "epoch": 3.1353456669912365, + "grad_norm": 1.7068082256216377, + "learning_rate": 5.000081796390766e-08, + "loss": 2.2197, + "step": 3220 + }, + { + "epoch": 3.1402142161635833, + "grad_norm": 1.791689784795618, + "learning_rate": 5.000076902561367e-08, + "loss": 2.1983, + "step": 3225 + }, + { + "epoch": 3.14508276533593, + "grad_norm": 1.7624390988374115, + "learning_rate": 5.000072276372817e-08, + "loss": 2.2693, + "step": 3230 + }, + { + "epoch": 3.1499513145082765, + "grad_norm": 1.8338687668629183, + "learning_rate": 5.0000679045788575e-08, + "loss": 2.1917, + "step": 3235 + }, + { + "epoch": 3.1548198636806233, + "grad_norm": 1.709244589398017, + "learning_rate": 5.000063774519218e-08, + "loss": 2.1911, + "step": 3240 + }, + { + "epoch": 3.1596884128529696, + "grad_norm": 1.7192540273943981, + "learning_rate": 5.0000598740968074e-08, + "loss": 2.1583, + "step": 3245 + }, + { + "epoch": 3.1645569620253164, + "grad_norm": 1.728010839757635, + "learning_rate": 5.000056191755672e-08, + "loss": 2.1972, + "step": 3250 + }, + { + "epoch": 3.1694255111976632, + "grad_norm": 1.788796272197145, + "learning_rate": 5.0000527164596915e-08, + "loss": 2.2011, + "step": 3255 + }, + { + "epoch": 3.1742940603700096, + "grad_norm": 1.7635432660617776, + "learning_rate": 5.000049437672004e-08, + "loss": 2.2152, + "step": 3260 + }, + { + "epoch": 3.1791626095423564, + "grad_norm": 1.7957521394770797, + "learning_rate": 5.000046345335129e-08, + "loss": 2.2182, + "step": 3265 + }, + { + "epoch": 3.184031158714703, + "grad_norm": 1.8009009243137868, + "learning_rate": 5.000043429851777e-08, + "loss": 2.2061, + "step": 3270 + }, + { + "epoch": 3.1888997078870496, + "grad_norm": 1.7948019721772699, + "learning_rate": 5.0000406820663126e-08, + "loss": 2.1857, + "step": 3275 + }, + { + "epoch": 3.1937682570593964, + "grad_norm": 1.8100815667189885, + "learning_rate": 5.0000380932468733e-08, + "loss": 2.1777, + "step": 3280 + }, + { + "epoch": 3.1986368062317427, + "grad_norm": 1.7827984808365251, + "learning_rate": 5.000035655068104e-08, + "loss": 2.2366, + "step": 3285 + }, + { + "epoch": 3.2035053554040895, + "grad_norm": 1.748286864117926, + "learning_rate": 5.0000333595944974e-08, + "loss": 2.207, + "step": 3290 + }, + { + "epoch": 3.2083739045764363, + "grad_norm": 1.703650849390858, + "learning_rate": 5.0000311992643325e-08, + "loss": 2.1896, + "step": 3295 + }, + { + "epoch": 3.2132424537487827, + "grad_norm": 1.7698280644275959, + "learning_rate": 5.0000291668741705e-08, + "loss": 2.1639, + "step": 3300 + }, + { + "epoch": 3.2181110029211295, + "grad_norm": 1.756366599744074, + "learning_rate": 5.000027255563917e-08, + "loss": 2.2012, + "step": 3305 + }, + { + "epoch": 3.2229795520934763, + "grad_norm": 1.759461002068701, + "learning_rate": 5.000025458802413e-08, + "loss": 2.2341, + "step": 3310 + }, + { + "epoch": 3.2278481012658227, + "grad_norm": 1.7494660148125987, + "learning_rate": 5.000023770373551e-08, + "loss": 2.207, + "step": 3315 + }, + { + "epoch": 3.2327166504381695, + "grad_norm": 1.7785100579340334, + "learning_rate": 5.000022184362899e-08, + "loss": 2.1999, + "step": 3320 + }, + { + "epoch": 3.2375851996105163, + "grad_norm": 1.7645954769969932, + "learning_rate": 5.0000206951448066e-08, + "loss": 2.2683, + "step": 3325 + }, + { + "epoch": 
3.2424537487828626, + "grad_norm": 1.7745088927616617, + "learning_rate": 5.000019297369995e-08, + "loss": 2.2251, + "step": 3330 + }, + { + "epoch": 3.2473222979552094, + "grad_norm": 1.8131533062966378, + "learning_rate": 5.0000179859535986e-08, + "loss": 2.2074, + "step": 3335 + }, + { + "epoch": 3.252190847127556, + "grad_norm": 1.744211372153281, + "learning_rate": 5.000016756063664e-08, + "loss": 2.172, + "step": 3340 + }, + { + "epoch": 3.2570593962999026, + "grad_norm": 1.7741244031891719, + "learning_rate": 5.000015603110066e-08, + "loss": 2.2425, + "step": 3345 + }, + { + "epoch": 3.2619279454722494, + "grad_norm": 1.7486152934920032, + "learning_rate": 5.0000145227338575e-08, + "loss": 2.2143, + "step": 3350 + }, + { + "epoch": 3.2667964946445958, + "grad_norm": 1.757112828413238, + "learning_rate": 5.000013510797011e-08, + "loss": 2.1976, + "step": 3355 + }, + { + "epoch": 3.2716650438169426, + "grad_norm": 1.7240716262707467, + "learning_rate": 5.000012563372563e-08, + "loss": 2.2618, + "step": 3360 + }, + { + "epoch": 3.2765335929892894, + "grad_norm": 1.7213155493984982, + "learning_rate": 5.0000116767351296e-08, + "loss": 2.2115, + "step": 3365 + }, + { + "epoch": 3.2814021421616357, + "grad_norm": 1.8311141582937929, + "learning_rate": 5.000010847351797e-08, + "loss": 2.1974, + "step": 3370 + }, + { + "epoch": 3.2862706913339825, + "grad_norm": 1.7864630099921617, + "learning_rate": 5.000010071873363e-08, + "loss": 2.238, + "step": 3375 + }, + { + "epoch": 3.291139240506329, + "grad_norm": 1.7883459145585372, + "learning_rate": 5.00000934712592e-08, + "loss": 2.2031, + "step": 3380 + }, + { + "epoch": 3.2960077896786757, + "grad_norm": 1.699599322514979, + "learning_rate": 5.000008670102778e-08, + "loss": 2.2327, + "step": 3385 + }, + { + "epoch": 3.3008763388510225, + "grad_norm": 1.7788870047828989, + "learning_rate": 5.0000080379567014e-08, + "loss": 2.1472, + "step": 3390 + }, + { + "epoch": 3.305744888023369, + "grad_norm": 1.7365094304977247, + "learning_rate": 5.000007447992463e-08, + "loss": 2.2068, + "step": 3395 + }, + { + "epoch": 3.3106134371957157, + "grad_norm": 1.8492451880826386, + "learning_rate": 5.0000068976596906e-08, + "loss": 2.1556, + "step": 3400 + }, + { + "epoch": 3.3106134371957157, + "eval_loss": 2.429415464401245, + "eval_runtime": 85.0239, + "eval_samples_per_second": 85.846, + "eval_steps_per_second": 0.682, + "step": 3400 + }, + { + "epoch": 3.3154819863680625, + "grad_norm": 1.769012775563444, + "learning_rate": 5.0000063845460134e-08, + "loss": 2.1791, + "step": 3405 + }, + { + "epoch": 3.320350535540409, + "grad_norm": 1.7426924775202128, + "learning_rate": 5.000005999315767e-08, + "loss": 2.2323, + "step": 3410 + }, + { + "epoch": 3.3252190847127556, + "grad_norm": 1.764543573084179, + "learning_rate": 5.000005547532745e-08, + "loss": 2.1965, + "step": 3415 + }, + { + "epoch": 3.3300876338851024, + "grad_norm": 1.8373323305401401, + "learning_rate": 5.0000051268933666e-08, + "loss": 2.2295, + "step": 3420 + }, + { + "epoch": 3.334956183057449, + "grad_norm": 1.713591544576181, + "learning_rate": 5.00000473545193e-08, + "loss": 2.1847, + "step": 3425 + }, + { + "epoch": 3.3398247322297956, + "grad_norm": 1.7559454568881032, + "learning_rate": 5.000004371371542e-08, + "loss": 2.2148, + "step": 3430 + }, + { + "epoch": 3.344693281402142, + "grad_norm": 1.7485006911314465, + "learning_rate": 5.000004032918764e-08, + "loss": 2.2065, + "step": 3435 + }, + { + "epoch": 3.3495618305744888, + "grad_norm": 1.7533601433403279, + "learning_rate": 
5.00000371845847e-08, + "loss": 2.1955, + "step": 3440 + }, + { + "epoch": 3.3544303797468356, + "grad_norm": 1.7543658707594614, + "learning_rate": 5.000003426448936e-08, + "loss": 2.2087, + "step": 3445 + }, + { + "epoch": 3.359298928919182, + "grad_norm": 1.7904175039614314, + "learning_rate": 5.0000031554371375e-08, + "loss": 2.272, + "step": 3450 + }, + { + "epoch": 3.3641674780915287, + "grad_norm": 1.7502721668385577, + "learning_rate": 5.000002904054251e-08, + "loss": 2.2101, + "step": 3455 + }, + { + "epoch": 3.3690360272638755, + "grad_norm": 1.706493138085354, + "learning_rate": 5.000002671011354e-08, + "loss": 2.1955, + "step": 3460 + }, + { + "epoch": 3.373904576436222, + "grad_norm": 1.7655034010253434, + "learning_rate": 5.0000024550953135e-08, + "loss": 2.2103, + "step": 3465 + }, + { + "epoch": 3.3787731256085687, + "grad_norm": 1.7580566741004937, + "learning_rate": 5.0000022551648575e-08, + "loss": 2.2243, + "step": 3470 + }, + { + "epoch": 3.3836416747809155, + "grad_norm": 1.7951986660353298, + "learning_rate": 5.000002070146821e-08, + "loss": 2.233, + "step": 3475 + }, + { + "epoch": 3.388510223953262, + "grad_norm": 1.7464673450399157, + "learning_rate": 5.000001899032566e-08, + "loss": 2.1938, + "step": 3480 + }, + { + "epoch": 3.3933787731256086, + "grad_norm": 1.7506731049190618, + "learning_rate": 5.0000017408745575e-08, + "loss": 2.2132, + "step": 3485 + }, + { + "epoch": 3.398247322297955, + "grad_norm": 1.7158313234275915, + "learning_rate": 5.0000015947831005e-08, + "loss": 2.2255, + "step": 3490 + }, + { + "epoch": 3.403115871470302, + "grad_norm": 1.7741448257648114, + "learning_rate": 5.000001459923229e-08, + "loss": 2.2561, + "step": 3495 + }, + { + "epoch": 3.4079844206426486, + "grad_norm": 1.7680229196146826, + "learning_rate": 5.0000013355117324e-08, + "loss": 2.2174, + "step": 3500 + }, + { + "epoch": 3.412852969814995, + "grad_norm": 1.7566868775329945, + "learning_rate": 5.000001220814333e-08, + "loss": 2.2049, + "step": 3505 + }, + { + "epoch": 3.4177215189873418, + "grad_norm": 1.7231310116084255, + "learning_rate": 5.0000011151429815e-08, + "loss": 2.243, + "step": 3510 + }, + { + "epoch": 3.4225900681596886, + "grad_norm": 1.7525419468509682, + "learning_rate": 5.0000010178532987e-08, + "loss": 2.2271, + "step": 3515 + }, + { + "epoch": 3.427458617332035, + "grad_norm": 1.7743431604185957, + "learning_rate": 5.000000928342124e-08, + "loss": 2.1975, + "step": 3520 + }, + { + "epoch": 3.4323271665043817, + "grad_norm": 1.738567333307858, + "learning_rate": 5.000000846045193e-08, + "loss": 2.1717, + "step": 3525 + }, + { + "epoch": 3.4371957156767285, + "grad_norm": 1.7930958206448022, + "learning_rate": 5.000000770434924e-08, + "loss": 2.1939, + "step": 3530 + }, + { + "epoch": 3.442064264849075, + "grad_norm": 1.7498436482409632, + "learning_rate": 5.0000007010183126e-08, + "loss": 2.213, + "step": 3535 + }, + { + "epoch": 3.4469328140214217, + "grad_norm": 1.7055196904583134, + "learning_rate": 5.0000006373349365e-08, + "loss": 2.2058, + "step": 3540 + }, + { + "epoch": 3.451801363193768, + "grad_norm": 1.7510250220237107, + "learning_rate": 5.00000057895505e-08, + "loss": 2.2337, + "step": 3545 + }, + { + "epoch": 3.456669912366115, + "grad_norm": 1.713344136042387, + "learning_rate": 5.0000005254777845e-08, + "loss": 2.1989, + "step": 3550 + }, + { + "epoch": 3.4615384615384617, + "grad_norm": 1.7371767245978642, + "learning_rate": 5.000000476529434e-08, + "loss": 2.233, + "step": 3555 + }, + { + "epoch": 3.466407010710808, + "grad_norm": 
1.7636410740956914, + "learning_rate": 5.000000431761833e-08, + "loss": 2.1615, + "step": 3560 + }, + { + "epoch": 3.471275559883155, + "grad_norm": 1.7144161091063708, + "learning_rate": 5.000000390850812e-08, + "loss": 2.2205, + "step": 3565 + }, + { + "epoch": 3.4761441090555016, + "grad_norm": 1.8042049963234217, + "learning_rate": 5.0000003534947425e-08, + "loss": 2.178, + "step": 3570 + }, + { + "epoch": 3.481012658227848, + "grad_norm": 1.7853951456456785, + "learning_rate": 5.00000031941315e-08, + "loss": 2.2185, + "step": 3575 + }, + { + "epoch": 3.485881207400195, + "grad_norm": 1.6707790882813032, + "learning_rate": 5.000000288345406e-08, + "loss": 2.1868, + "step": 3580 + }, + { + "epoch": 3.4907497565725416, + "grad_norm": 1.7236640638911755, + "learning_rate": 5.000000260049486e-08, + "loss": 2.2153, + "step": 3585 + }, + { + "epoch": 3.495618305744888, + "grad_norm": 1.7103907676750174, + "learning_rate": 5.0000002343008e-08, + "loss": 2.2806, + "step": 3590 + }, + { + "epoch": 3.5004868549172348, + "grad_norm": 1.7458900493895921, + "learning_rate": 5.000000210891081e-08, + "loss": 2.1832, + "step": 3595 + }, + { + "epoch": 3.505355404089581, + "grad_norm": 1.8597626382974297, + "learning_rate": 5.00000018962734e-08, + "loss": 2.1953, + "step": 3600 + }, + { + "epoch": 3.505355404089581, + "eval_loss": 2.4295620918273926, + "eval_runtime": 85.4888, + "eval_samples_per_second": 85.38, + "eval_steps_per_second": 0.678, + "step": 3600 + }, + { + "epoch": 3.510223953261928, + "grad_norm": 1.7712796854715114, + "learning_rate": 5.000000170330872e-08, + "loss": 2.1841, + "step": 3605 + }, + { + "epoch": 3.5150925024342747, + "grad_norm": 1.7646034875776253, + "learning_rate": 5.000000152836327e-08, + "loss": 2.2182, + "step": 3610 + }, + { + "epoch": 3.519961051606621, + "grad_norm": 1.7404348200627402, + "learning_rate": 5.000000136990825e-08, + "loss": 2.2245, + "step": 3615 + }, + { + "epoch": 3.524829600778968, + "grad_norm": 1.7098638272240443, + "learning_rate": 5.000000122653126e-08, + "loss": 2.2094, + "step": 3620 + }, + { + "epoch": 3.5296981499513143, + "grad_norm": 1.7920393050904846, + "learning_rate": 5.000000109692848e-08, + "loss": 2.2366, + "step": 3625 + }, + { + "epoch": 3.534566699123661, + "grad_norm": 1.746241513422538, + "learning_rate": 5.0000000979897296e-08, + "loss": 2.1925, + "step": 3630 + }, + { + "epoch": 3.539435248296008, + "grad_norm": 1.8183591426270245, + "learning_rate": 5.000000087432932e-08, + "loss": 2.2159, + "step": 3635 + }, + { + "epoch": 3.5443037974683547, + "grad_norm": 1.821208130477108, + "learning_rate": 5.0000000779203936e-08, + "loss": 2.2038, + "step": 3640 + }, + { + "epoch": 3.549172346640701, + "grad_norm": 1.7255936567126264, + "learning_rate": 5.000000069358209e-08, + "loss": 2.1901, + "step": 3645 + }, + { + "epoch": 3.554040895813048, + "grad_norm": 1.855947848851622, + "learning_rate": 5.0000000616600557e-08, + "loss": 2.2198, + "step": 3650 + }, + { + "epoch": 3.558909444985394, + "grad_norm": 1.8303786214402666, + "learning_rate": 5.000000054746654e-08, + "loss": 2.1946, + "step": 3655 + }, + { + "epoch": 3.563777994157741, + "grad_norm": 1.7392508415046033, + "learning_rate": 5.000000048545256e-08, + "loss": 2.1778, + "step": 3660 + }, + { + "epoch": 3.568646543330088, + "grad_norm": 1.8213274923689418, + "learning_rate": 5.0000000429891724e-08, + "loss": 2.1742, + "step": 3665 + }, + { + "epoch": 3.573515092502434, + "grad_norm": 1.8017867977855504, + "learning_rate": 5.0000000380173246e-08, + "loss": 2.1915, + 
"step": 3670 + }, + { + "epoch": 3.578383641674781, + "grad_norm": 1.7382969587337458, + "learning_rate": 5.000000033573829e-08, + "loss": 2.1693, + "step": 3675 + }, + { + "epoch": 3.5832521908471273, + "grad_norm": 1.6944625431597526, + "learning_rate": 5.0000000296076037e-08, + "loss": 2.1787, + "step": 3680 + }, + { + "epoch": 3.588120740019474, + "grad_norm": 1.726132620813893, + "learning_rate": 5.0000000260720054e-08, + "loss": 2.2046, + "step": 3685 + }, + { + "epoch": 3.592989289191821, + "grad_norm": 1.715704002261475, + "learning_rate": 5.0000000229244886e-08, + "loss": 2.2242, + "step": 3690 + }, + { + "epoch": 3.5978578383641677, + "grad_norm": 1.7355446090598186, + "learning_rate": 5.0000000201262854e-08, + "loss": 2.2162, + "step": 3695 + }, + { + "epoch": 3.602726387536514, + "grad_norm": 1.744140516288637, + "learning_rate": 5.0000000176421085e-08, + "loss": 2.1987, + "step": 3700 + }, + { + "epoch": 3.607594936708861, + "grad_norm": 1.801416253511192, + "learning_rate": 5.000000015439878e-08, + "loss": 2.2142, + "step": 3705 + }, + { + "epoch": 3.6124634858812072, + "grad_norm": 1.72380914141294, + "learning_rate": 5.000000013490458e-08, + "loss": 2.1938, + "step": 3710 + }, + { + "epoch": 3.617332035053554, + "grad_norm": 1.6829634226373547, + "learning_rate": 5.000000011767422e-08, + "loss": 2.2291, + "step": 3715 + }, + { + "epoch": 3.622200584225901, + "grad_norm": 1.71665602485241, + "learning_rate": 5.0000000102468287e-08, + "loss": 2.2024, + "step": 3720 + }, + { + "epoch": 3.627069133398247, + "grad_norm": 1.7001891231981483, + "learning_rate": 5.000000008907013e-08, + "loss": 2.2397, + "step": 3725 + }, + { + "epoch": 3.631937682570594, + "grad_norm": 1.7560852361384687, + "learning_rate": 5.000000007728397e-08, + "loss": 2.2149, + "step": 3730 + }, + { + "epoch": 3.6368062317429404, + "grad_norm": 1.8239331341308023, + "learning_rate": 5.0000000066933104e-08, + "loss": 2.2073, + "step": 3735 + }, + { + "epoch": 3.641674780915287, + "grad_norm": 1.7435406529125186, + "learning_rate": 5.000000005785826e-08, + "loss": 2.202, + "step": 3740 + }, + { + "epoch": 3.646543330087634, + "grad_norm": 1.769830204490899, + "learning_rate": 5.0000000049916066e-08, + "loss": 2.1874, + "step": 3745 + }, + { + "epoch": 3.651411879259981, + "grad_norm": 1.7365544971501097, + "learning_rate": 5.0000000042977675e-08, + "loss": 2.2312, + "step": 3750 + }, + { + "epoch": 3.656280428432327, + "grad_norm": 1.754918261553824, + "learning_rate": 5.000000003692742e-08, + "loss": 2.233, + "step": 3755 + }, + { + "epoch": 3.661148977604674, + "grad_norm": 1.7251476744734402, + "learning_rate": 5.000000003166163e-08, + "loss": 2.1785, + "step": 3760 + }, + { + "epoch": 3.6660175267770203, + "grad_norm": 1.6885151956951188, + "learning_rate": 5.000000002708756e-08, + "loss": 2.213, + "step": 3765 + }, + { + "epoch": 3.670886075949367, + "grad_norm": 1.8104164593434913, + "learning_rate": 5.000000002312232e-08, + "loss": 2.2031, + "step": 3770 + }, + { + "epoch": 3.675754625121714, + "grad_norm": 1.6964800041666812, + "learning_rate": 5.0000000019691985e-08, + "loss": 2.2106, + "step": 3775 + }, + { + "epoch": 3.6806231742940603, + "grad_norm": 1.7770021912734775, + "learning_rate": 5.0000000016730705e-08, + "loss": 2.1958, + "step": 3780 + }, + { + "epoch": 3.685491723466407, + "grad_norm": 1.7854463027276486, + "learning_rate": 5.000000001417995e-08, + "loss": 2.1748, + "step": 3785 + }, + { + "epoch": 3.6903602726387534, + "grad_norm": 1.7314209770363447, + "learning_rate": 
5.000000001198777e-08, + "loss": 2.2285, + "step": 3790 + }, + { + "epoch": 3.6952288218111002, + "grad_norm": 1.7315368458494336, + "learning_rate": 5.000000001010814e-08, + "loss": 2.2168, + "step": 3795 + }, + { + "epoch": 3.700097370983447, + "grad_norm": 1.7953711064417144, + "learning_rate": 5.0000000008500364e-08, + "loss": 2.2002, + "step": 3800 + }, + { + "epoch": 3.700097370983447, + "eval_loss": 2.4294352531433105, + "eval_runtime": 85.6309, + "eval_samples_per_second": 85.238, + "eval_steps_per_second": 0.677, + "step": 3800 + }, + { + "epoch": 3.704965920155794, + "grad_norm": 1.792810318597494, + "learning_rate": 5.000000000712853e-08, + "loss": 2.2033, + "step": 3805 + }, + { + "epoch": 3.70983446932814, + "grad_norm": 1.7519531928443899, + "learning_rate": 5.0000000005961005e-08, + "loss": 2.2504, + "step": 3810 + }, + { + "epoch": 3.714703018500487, + "grad_norm": 1.8276341931990958, + "learning_rate": 5.0000000004969983e-08, + "loss": 2.1868, + "step": 3815 + }, + { + "epoch": 3.7195715676728334, + "grad_norm": 1.7578731153653255, + "learning_rate": 5.0000000004131077e-08, + "loss": 2.2114, + "step": 3820 + }, + { + "epoch": 3.72444011684518, + "grad_norm": 1.7261756289309933, + "learning_rate": 5.000000000342294e-08, + "loss": 2.1714, + "step": 3825 + }, + { + "epoch": 3.729308666017527, + "grad_norm": 1.7847142724316984, + "learning_rate": 5.0000000002826924e-08, + "loss": 2.1744, + "step": 3830 + }, + { + "epoch": 3.7341772151898733, + "grad_norm": 1.7506323765963263, + "learning_rate": 5.0000000002326804e-08, + "loss": 2.2474, + "step": 3835 + }, + { + "epoch": 3.73904576436222, + "grad_norm": 1.7726668919984896, + "learning_rate": 5.000000000190845e-08, + "loss": 2.1832, + "step": 3840 + }, + { + "epoch": 3.7439143135345665, + "grad_norm": 1.791217388707096, + "learning_rate": 5.000000000155963e-08, + "loss": 2.2552, + "step": 3845 + }, + { + "epoch": 3.7487828627069133, + "grad_norm": 1.7260937822564961, + "learning_rate": 5.000000000126977e-08, + "loss": 2.2135, + "step": 3850 + }, + { + "epoch": 3.75365141187926, + "grad_norm": 1.8761923429228229, + "learning_rate": 5.000000000102974e-08, + "loss": 2.2143, + "step": 3855 + }, + { + "epoch": 3.7585199610516065, + "grad_norm": 1.8000916225190866, + "learning_rate": 5.000000000083169e-08, + "loss": 2.1963, + "step": 3860 + }, + { + "epoch": 3.7633885102239533, + "grad_norm": 1.8139182234780402, + "learning_rate": 5.0000000000668896e-08, + "loss": 2.2333, + "step": 3865 + }, + { + "epoch": 3.7682570593963, + "grad_norm": 1.779253821129295, + "learning_rate": 5.000000000053561e-08, + "loss": 2.2217, + "step": 3870 + }, + { + "epoch": 3.7731256085686464, + "grad_norm": 1.714027215371266, + "learning_rate": 5.0000000000426916e-08, + "loss": 2.2047, + "step": 3875 + }, + { + "epoch": 3.7779941577409932, + "grad_norm": 1.803155267019016, + "learning_rate": 5.000000000033866e-08, + "loss": 2.2063, + "step": 3880 + }, + { + "epoch": 3.78286270691334, + "grad_norm": 1.7914082013975081, + "learning_rate": 5.0000000000267316e-08, + "loss": 2.2199, + "step": 3885 + }, + { + "epoch": 3.7877312560856864, + "grad_norm": 1.7354663482603165, + "learning_rate": 5.00000000002099e-08, + "loss": 2.2118, + "step": 3890 + }, + { + "epoch": 3.792599805258033, + "grad_norm": 1.9003792042576655, + "learning_rate": 5.000000000016393e-08, + "loss": 2.1497, + "step": 3895 + }, + { + "epoch": 3.7974683544303796, + "grad_norm": 1.76200601661607, + "learning_rate": 5.0000000000127304e-08, + "loss": 2.1832, + "step": 3900 + }, + { + "epoch": 
3.8023369036027264, + "grad_norm": 1.831527213740064, + "learning_rate": 5.0000000000098273e-08, + "loss": 2.2266, + "step": 3905 + }, + { + "epoch": 3.807205452775073, + "grad_norm": 1.796422966992554, + "learning_rate": 5.000000000007539e-08, + "loss": 2.2171, + "step": 3910 + }, + { + "epoch": 3.8120740019474195, + "grad_norm": 1.8025926494413154, + "learning_rate": 5.000000000005746e-08, + "loss": 2.2325, + "step": 3915 + }, + { + "epoch": 3.8169425511197663, + "grad_norm": 1.793793977270209, + "learning_rate": 5.000000000004349e-08, + "loss": 2.2545, + "step": 3920 + }, + { + "epoch": 3.8218111002921127, + "grad_norm": 1.7928832433425952, + "learning_rate": 5.000000000003268e-08, + "loss": 2.182, + "step": 3925 + }, + { + "epoch": 3.8266796494644595, + "grad_norm": 1.7947342233214896, + "learning_rate": 5.000000000002437e-08, + "loss": 2.1897, + "step": 3930 + }, + { + "epoch": 3.8315481986368063, + "grad_norm": 1.7630875066863383, + "learning_rate": 5.000000000001803e-08, + "loss": 2.1764, + "step": 3935 + }, + { + "epoch": 3.836416747809153, + "grad_norm": 1.7899032248006117, + "learning_rate": 5.0000000000013226e-08, + "loss": 2.1868, + "step": 3940 + }, + { + "epoch": 3.8412852969814995, + "grad_norm": 1.7615217853308245, + "learning_rate": 5.000000000000961e-08, + "loss": 2.1874, + "step": 3945 + }, + { + "epoch": 3.8461538461538463, + "grad_norm": 1.8191785253367547, + "learning_rate": 5.000000000000692e-08, + "loss": 2.2096, + "step": 3950 + }, + { + "epoch": 3.8510223953261926, + "grad_norm": 1.7142116315629525, + "learning_rate": 5.0000000000004934e-08, + "loss": 2.1825, + "step": 3955 + }, + { + "epoch": 3.8558909444985394, + "grad_norm": 1.8193971497080461, + "learning_rate": 5.000000000000348e-08, + "loss": 2.1726, + "step": 3960 + }, + { + "epoch": 3.8607594936708862, + "grad_norm": 1.7556627121596253, + "learning_rate": 5.0000000000002426e-08, + "loss": 2.2139, + "step": 3965 + }, + { + "epoch": 3.8656280428432326, + "grad_norm": 1.744589516587126, + "learning_rate": 5.000000000000167e-08, + "loss": 2.2201, + "step": 3970 + }, + { + "epoch": 3.8704965920155794, + "grad_norm": 1.714747302449748, + "learning_rate": 5.0000000000001136e-08, + "loss": 2.2009, + "step": 3975 + }, + { + "epoch": 3.8753651411879257, + "grad_norm": 1.768710509140083, + "learning_rate": 5.000000000000076e-08, + "loss": 2.2031, + "step": 3980 + }, + { + "epoch": 3.8802336903602725, + "grad_norm": 1.7322811102830442, + "learning_rate": 5.00000000000005e-08, + "loss": 2.2165, + "step": 3985 + }, + { + "epoch": 3.8851022395326194, + "grad_norm": 1.8064547108979563, + "learning_rate": 5.000000000000032e-08, + "loss": 2.251, + "step": 3990 + }, + { + "epoch": 3.889970788704966, + "grad_norm": 1.7490086693064326, + "learning_rate": 5.000000000000021e-08, + "loss": 2.2078, + "step": 3995 + }, + { + "epoch": 3.8948393378773125, + "grad_norm": 1.8272955450221657, + "learning_rate": 5.000000000000013e-08, + "loss": 2.2437, + "step": 4000 + }, + { + "epoch": 3.8948393378773125, + "eval_loss": 2.4291422367095947, + "eval_runtime": 85.7591, + "eval_samples_per_second": 85.111, + "eval_steps_per_second": 0.676, + "step": 4000 + }, + { + "epoch": 3.8997078870496593, + "grad_norm": 1.7590084186341215, + "learning_rate": 5.000000000000008e-08, + "loss": 2.2069, + "step": 4005 + }, + { + "epoch": 3.9045764362220057, + "grad_norm": 1.7702717336824394, + "learning_rate": 5.0000000000000044e-08, + "loss": 2.1958, + "step": 4010 + }, + { + "epoch": 3.9094449853943525, + "grad_norm": 1.7719070457541741, + 
"learning_rate": 5.0000000000000024e-08, + "loss": 2.1862, + "step": 4015 + }, + { + "epoch": 3.9143135345666993, + "grad_norm": 1.729790978044211, + "learning_rate": 5.000000000000001e-08, + "loss": 2.201, + "step": 4020 + }, + { + "epoch": 3.9191820837390456, + "grad_norm": 1.751806658583263, + "learning_rate": 5.0000000000000004e-08, + "loss": 2.2071, + "step": 4025 + }, + { + "epoch": 3.9240506329113924, + "grad_norm": 1.7425275946141223, + "learning_rate": 5.0000000000000004e-08, + "loss": 2.2775, + "step": 4030 + }, + { + "epoch": 3.928919182083739, + "grad_norm": 1.792394595138285, + "learning_rate": 5e-08, + "loss": 2.1683, + "step": 4035 + }, + { + "epoch": 3.9337877312560856, + "grad_norm": 1.7949487182841233, + "learning_rate": 5e-08, + "loss": 2.1962, + "step": 4040 + }, + { + "epoch": 3.9386562804284324, + "grad_norm": 1.8476999521081328, + "learning_rate": 5e-08, + "loss": 2.2004, + "step": 4045 + }, + { + "epoch": 3.943524829600779, + "grad_norm": 1.8225182124751913, + "learning_rate": 5e-08, + "loss": 2.2092, + "step": 4050 + }, + { + "epoch": 3.9483933787731256, + "grad_norm": 1.819449204564278, + "learning_rate": 5e-08, + "loss": 2.2543, + "step": 4055 + }, + { + "epoch": 3.9532619279454724, + "grad_norm": 1.8290111060640115, + "learning_rate": 5e-08, + "loss": 2.1963, + "step": 4060 + }, + { + "epoch": 3.9581304771178187, + "grad_norm": 1.7296724483111061, + "learning_rate": 5e-08, + "loss": 2.2167, + "step": 4065 + }, + { + "epoch": 3.9629990262901655, + "grad_norm": 1.768342281901305, + "learning_rate": 5e-08, + "loss": 2.2145, + "step": 4070 + }, + { + "epoch": 3.9678675754625123, + "grad_norm": 1.7810841025261306, + "learning_rate": 5e-08, + "loss": 2.2205, + "step": 4075 + }, + { + "epoch": 3.9727361246348587, + "grad_norm": 1.7724437878330235, + "learning_rate": 5e-08, + "loss": 2.2227, + "step": 4080 + }, + { + "epoch": 3.9776046738072055, + "grad_norm": 1.7693706025925438, + "learning_rate": 5e-08, + "loss": 2.26, + "step": 4085 + }, + { + "epoch": 3.982473222979552, + "grad_norm": 1.8547893721966033, + "learning_rate": 5e-08, + "loss": 2.1705, + "step": 4090 + }, + { + "epoch": 3.9873417721518987, + "grad_norm": 1.7856750671515411, + "learning_rate": 5e-08, + "loss": 2.2304, + "step": 4095 + }, + { + "epoch": 3.9922103213242455, + "grad_norm": 1.789882341208583, + "learning_rate": 5e-08, + "loss": 2.2391, + "step": 4100 + }, + { + "epoch": 3.9970788704965923, + "grad_norm": 1.8424154832009545, + "learning_rate": 5e-08, + "loss": 2.2157, + "step": 4105 + }, + { + "epoch": 4.0, + "step": 4108, + "total_flos": 429332983971840.0, + "train_loss": 2.284057284167586, + "train_runtime": 15107.0718, + "train_samples_per_second": 17.393, + "train_steps_per_second": 0.272 + } + ], + "logging_steps": 5, + "max_steps": 4108, + "num_input_tokens_seen": 0, + "num_train_epochs": 4, + "save_steps": 200, + "total_flos": 429332983971840.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}