{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.16410929679166325, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0016410929679166325, "grad_norm": 0.1563968062400818, "learning_rate": 9.999992612842675e-06, "loss": 0.6605, "step": 5 }, { "epoch": 0.003282185935833265, "grad_norm": 0.15270280838012695, "learning_rate": 9.999970451392527e-06, "loss": 0.6491, "step": 10 }, { "epoch": 0.0049232789037498975, "grad_norm": 0.17153096199035645, "learning_rate": 9.999933515715042e-06, "loss": 0.6383, "step": 15 }, { "epoch": 0.00656437187166653, "grad_norm": 0.172921285033226, "learning_rate": 9.999881805919356e-06, "loss": 0.6506, "step": 20 }, { "epoch": 0.008205464839583163, "grad_norm": 0.16361959278583527, "learning_rate": 9.999815322158266e-06, "loss": 0.6067, "step": 25 }, { "epoch": 0.009846557807499795, "grad_norm": 0.12810567021369934, "learning_rate": 9.999734064628224e-06, "loss": 0.6157, "step": 30 }, { "epoch": 0.011487650775416428, "grad_norm": 0.13882791996002197, "learning_rate": 9.999638033569334e-06, "loss": 0.6163, "step": 35 }, { "epoch": 0.01312874374333306, "grad_norm": 0.08581311255693436, "learning_rate": 9.999527229265353e-06, "loss": 0.5795, "step": 40 }, { "epoch": 0.014769836711249693, "grad_norm": 0.09464729577302933, "learning_rate": 9.999401652043697e-06, "loss": 0.5908, "step": 45 }, { "epoch": 0.016410929679166325, "grad_norm": 0.08246736973524094, "learning_rate": 9.999261302275424e-06, "loss": 0.5681, "step": 50 }, { "epoch": 0.018052022647082958, "grad_norm": 0.07881084084510803, "learning_rate": 9.999106180375251e-06, "loss": 0.5566, "step": 55 }, { "epoch": 0.01969311561499959, "grad_norm": 0.07179544121026993, "learning_rate": 9.998936286801541e-06, "loss": 0.5806, "step": 60 }, { "epoch": 0.021334208582916223, "grad_norm": 0.11372455954551697, "learning_rate": 9.99875162205631e-06, "loss": 0.541, "step": 65 }, { "epoch": 0.022975301550832855, "grad_norm": 0.07621748745441437, "learning_rate": 9.998552186685211e-06, "loss": 0.5421, "step": 70 }, { "epoch": 0.024616394518749488, "grad_norm": 0.07502977550029755, "learning_rate": 9.998337981277552e-06, "loss": 0.5634, "step": 75 }, { "epoch": 0.02625748748666612, "grad_norm": 0.06710907071828842, "learning_rate": 9.998109006466281e-06, "loss": 0.5322, "step": 80 }, { "epoch": 0.027898580454582753, "grad_norm": 0.066213458776474, "learning_rate": 9.997865262927984e-06, "loss": 0.5474, "step": 85 }, { "epoch": 0.029539673422499385, "grad_norm": 0.0674639567732811, "learning_rate": 9.997606751382894e-06, "loss": 0.5525, "step": 90 }, { "epoch": 0.031180766390416018, "grad_norm": 0.07476690411567688, "learning_rate": 9.997333472594872e-06, "loss": 0.526, "step": 95 }, { "epoch": 0.03282185935833265, "grad_norm": 0.05499599874019623, "learning_rate": 9.997045427371423e-06, "loss": 0.5262, "step": 100 }, { "epoch": 0.03446295232624928, "grad_norm": 0.06396327167749405, "learning_rate": 9.996742616563682e-06, "loss": 0.5141, "step": 105 }, { "epoch": 0.036104045294165915, "grad_norm": 0.06143304333090782, "learning_rate": 9.99642504106641e-06, "loss": 0.5107, "step": 110 }, { "epoch": 0.03774513826208255, "grad_norm": 0.0630095973610878, "learning_rate": 9.996092701818004e-06, "loss": 0.5177, "step": 115 }, { "epoch": 0.03938623122999918, "grad_norm": 0.059215761721134186, "learning_rate": 9.995745599800476e-06, "loss": 0.5098, "step": 120 }, { "epoch": 0.04102732419791581, "grad_norm": 0.05862729996442795, "learning_rate": 9.995383736039465e-06, "loss": 0.5027, "step": 125 }, { "epoch": 0.042668417165832445, "grad_norm": 0.05443592369556427, "learning_rate": 9.995007111604232e-06, "loss": 0.4688, "step": 130 }, { "epoch": 0.04430951013374908, "grad_norm": 0.05443759262561798, "learning_rate": 9.994615727607648e-06, "loss": 0.4988, "step": 135 }, { "epoch": 0.04595060310166571, "grad_norm": 0.05553797259926796, "learning_rate": 9.994209585206201e-06, "loss": 0.4864, "step": 140 }, { "epoch": 0.04759169606958234, "grad_norm": 0.055965058505535126, "learning_rate": 9.993788685599985e-06, "loss": 0.4768, "step": 145 }, { "epoch": 0.049232789037498975, "grad_norm": 0.06474044173955917, "learning_rate": 9.993353030032701e-06, "loss": 0.4942, "step": 150 }, { "epoch": 0.05087388200541561, "grad_norm": 0.0566246323287487, "learning_rate": 9.992902619791652e-06, "loss": 0.4888, "step": 155 }, { "epoch": 0.05251497497333224, "grad_norm": 0.0555800199508667, "learning_rate": 9.992437456207738e-06, "loss": 0.4687, "step": 160 }, { "epoch": 0.05415606794124887, "grad_norm": 0.0563640259206295, "learning_rate": 9.991957540655453e-06, "loss": 0.4923, "step": 165 }, { "epoch": 0.055797160909165505, "grad_norm": 0.05194167420268059, "learning_rate": 9.991462874552882e-06, "loss": 0.4811, "step": 170 }, { "epoch": 0.05743825387708214, "grad_norm": 0.1205214262008667, "learning_rate": 9.990953459361696e-06, "loss": 0.4696, "step": 175 }, { "epoch": 0.05907934684499877, "grad_norm": 0.0542314276099205, "learning_rate": 9.990429296587148e-06, "loss": 0.4547, "step": 180 }, { "epoch": 0.0607204398129154, "grad_norm": 0.06125911697745323, "learning_rate": 9.989890387778065e-06, "loss": 0.4817, "step": 185 }, { "epoch": 0.062361532780832035, "grad_norm": 0.06071058660745621, "learning_rate": 9.98933673452685e-06, "loss": 0.4553, "step": 190 }, { "epoch": 0.06400262574874867, "grad_norm": 0.05769752338528633, "learning_rate": 9.98876833846947e-06, "loss": 0.4632, "step": 195 }, { "epoch": 0.0656437187166653, "grad_norm": 0.05761849135160446, "learning_rate": 9.988185201285461e-06, "loss": 0.4439, "step": 200 }, { "epoch": 0.06728481168458193, "grad_norm": 0.05703369528055191, "learning_rate": 9.987587324697912e-06, "loss": 0.4506, "step": 205 }, { "epoch": 0.06892590465249857, "grad_norm": 0.05599252134561539, "learning_rate": 9.986974710473467e-06, "loss": 0.4573, "step": 210 }, { "epoch": 0.0705669976204152, "grad_norm": 0.05374148488044739, "learning_rate": 9.986347360422316e-06, "loss": 0.4555, "step": 215 }, { "epoch": 0.07220809058833183, "grad_norm": 0.054009810090065, "learning_rate": 9.985705276398193e-06, "loss": 0.4436, "step": 220 }, { "epoch": 0.07384918355624846, "grad_norm": 0.07604236155748367, "learning_rate": 9.985048460298367e-06, "loss": 0.4583, "step": 225 }, { "epoch": 0.0754902765241651, "grad_norm": 0.052760086953639984, "learning_rate": 9.984376914063643e-06, "loss": 0.4409, "step": 230 }, { "epoch": 0.07713136949208173, "grad_norm": 0.06065182387828827, "learning_rate": 9.983690639678343e-06, "loss": 0.4637, "step": 235 }, { "epoch": 0.07877246245999836, "grad_norm": 0.06539740413427353, "learning_rate": 9.982989639170319e-06, "loss": 0.4636, "step": 240 }, { "epoch": 0.08041355542791499, "grad_norm": 0.06656944006681442, "learning_rate": 9.982273914610927e-06, "loss": 0.4487, "step": 245 }, { "epoch": 0.08205464839583163, "grad_norm": 0.05745495483279228, "learning_rate": 9.981543468115039e-06, "loss": 0.4393, "step": 250 }, { "epoch": 0.08369574136374826, "grad_norm": 0.06525252759456635, "learning_rate": 9.98079830184102e-06, "loss": 0.4713, "step": 255 }, { "epoch": 0.08533683433166489, "grad_norm": 0.0555146224796772, "learning_rate": 9.980038417990736e-06, "loss": 0.4278, "step": 260 }, { "epoch": 0.08697792729958152, "grad_norm": 0.0773826315999031, "learning_rate": 9.979263818809542e-06, "loss": 0.4527, "step": 265 }, { "epoch": 0.08861902026749816, "grad_norm": 0.06476614624261856, "learning_rate": 9.978474506586269e-06, "loss": 0.4404, "step": 270 }, { "epoch": 0.09026011323541479, "grad_norm": 0.06019666790962219, "learning_rate": 9.977670483653228e-06, "loss": 0.4414, "step": 275 }, { "epoch": 0.09190120620333142, "grad_norm": 0.06592460721731186, "learning_rate": 9.976851752386196e-06, "loss": 0.4313, "step": 280 }, { "epoch": 0.09354229917124805, "grad_norm": 0.06817147135734558, "learning_rate": 9.976018315204412e-06, "loss": 0.4357, "step": 285 }, { "epoch": 0.09518339213916469, "grad_norm": 0.06738044321537018, "learning_rate": 9.97517017457057e-06, "loss": 0.4511, "step": 290 }, { "epoch": 0.09682448510708132, "grad_norm": 0.07049284130334854, "learning_rate": 9.974307332990806e-06, "loss": 0.447, "step": 295 }, { "epoch": 0.09846557807499795, "grad_norm": 0.06609766185283661, "learning_rate": 9.973429793014703e-06, "loss": 0.4576, "step": 300 }, { "epoch": 0.10010667104291458, "grad_norm": 0.06007273495197296, "learning_rate": 9.972537557235267e-06, "loss": 0.4397, "step": 305 }, { "epoch": 0.10174776401083122, "grad_norm": 0.0659220889210701, "learning_rate": 9.971630628288935e-06, "loss": 0.4538, "step": 310 }, { "epoch": 0.10338885697874785, "grad_norm": 0.08680638670921326, "learning_rate": 9.970709008855557e-06, "loss": 0.442, "step": 315 }, { "epoch": 0.10502994994666448, "grad_norm": 0.06552395224571228, "learning_rate": 9.969772701658393e-06, "loss": 0.4195, "step": 320 }, { "epoch": 0.10667104291458111, "grad_norm": 0.06147119030356407, "learning_rate": 9.968821709464101e-06, "loss": 0.4419, "step": 325 }, { "epoch": 0.10831213588249775, "grad_norm": 0.06841282546520233, "learning_rate": 9.967856035082732e-06, "loss": 0.4313, "step": 330 }, { "epoch": 0.10995322885041438, "grad_norm": 0.07704174518585205, "learning_rate": 9.966875681367724e-06, "loss": 0.4373, "step": 335 }, { "epoch": 0.11159432181833101, "grad_norm": 0.06586287170648575, "learning_rate": 9.965880651215885e-06, "loss": 0.4475, "step": 340 }, { "epoch": 0.11323541478624764, "grad_norm": 0.13131344318389893, "learning_rate": 9.964870947567396e-06, "loss": 0.42, "step": 345 }, { "epoch": 0.11487650775416428, "grad_norm": 0.06351525336503983, "learning_rate": 9.963846573405791e-06, "loss": 0.4247, "step": 350 }, { "epoch": 0.11651760072208091, "grad_norm": 0.06860467791557312, "learning_rate": 9.962807531757955e-06, "loss": 0.432, "step": 355 }, { "epoch": 0.11815869368999754, "grad_norm": 0.06938762962818146, "learning_rate": 9.961753825694112e-06, "loss": 0.4324, "step": 360 }, { "epoch": 0.11979978665791417, "grad_norm": 0.06990928202867508, "learning_rate": 9.960685458327824e-06, "loss": 0.4136, "step": 365 }, { "epoch": 0.1214408796258308, "grad_norm": 0.06884902715682983, "learning_rate": 9.959602432815964e-06, "loss": 0.4301, "step": 370 }, { "epoch": 0.12308197259374744, "grad_norm": 0.06610533595085144, "learning_rate": 9.958504752358729e-06, "loss": 0.418, "step": 375 }, { "epoch": 0.12472306556166407, "grad_norm": 0.07162132859230042, "learning_rate": 9.957392420199612e-06, "loss": 0.4335, "step": 380 }, { "epoch": 0.1263641585295807, "grad_norm": 0.07637803256511688, "learning_rate": 9.956265439625401e-06, "loss": 0.4508, "step": 385 }, { "epoch": 0.12800525149749734, "grad_norm": 0.06776853650808334, "learning_rate": 9.955123813966172e-06, "loss": 0.4235, "step": 390 }, { "epoch": 0.12964634446541395, "grad_norm": 0.06781169027090073, "learning_rate": 9.953967546595272e-06, "loss": 0.4296, "step": 395 }, { "epoch": 0.1312874374333306, "grad_norm": 0.06995800137519836, "learning_rate": 9.952796640929309e-06, "loss": 0.4328, "step": 400 }, { "epoch": 0.13292853040124722, "grad_norm": 0.06544926762580872, "learning_rate": 9.951611100428151e-06, "loss": 0.4235, "step": 405 }, { "epoch": 0.13456962336916387, "grad_norm": 0.07403396815061569, "learning_rate": 9.95041092859491e-06, "loss": 0.4362, "step": 410 }, { "epoch": 0.13621071633708048, "grad_norm": 0.06964828819036484, "learning_rate": 9.949196128975925e-06, "loss": 0.4134, "step": 415 }, { "epoch": 0.13785180930499713, "grad_norm": 0.07269076257944107, "learning_rate": 9.947966705160765e-06, "loss": 0.4288, "step": 420 }, { "epoch": 0.13949290227291375, "grad_norm": 0.0716971680521965, "learning_rate": 9.946722660782209e-06, "loss": 0.4113, "step": 425 }, { "epoch": 0.1411339952408304, "grad_norm": 0.06757480651140213, "learning_rate": 9.945463999516236e-06, "loss": 0.4311, "step": 430 }, { "epoch": 0.14277508820874701, "grad_norm": 0.07381222397089005, "learning_rate": 9.944190725082019e-06, "loss": 0.4313, "step": 435 }, { "epoch": 0.14441618117666366, "grad_norm": 0.07273319363594055, "learning_rate": 9.94290284124191e-06, "loss": 0.4195, "step": 440 }, { "epoch": 0.14605727414458028, "grad_norm": 0.07356058806180954, "learning_rate": 9.941600351801426e-06, "loss": 0.425, "step": 445 }, { "epoch": 0.14769836711249693, "grad_norm": 0.07552187144756317, "learning_rate": 9.940283260609248e-06, "loss": 0.4295, "step": 450 }, { "epoch": 0.14933946008041354, "grad_norm": 0.0747319757938385, "learning_rate": 9.938951571557198e-06, "loss": 0.4426, "step": 455 }, { "epoch": 0.1509805530483302, "grad_norm": 0.06968298554420471, "learning_rate": 9.937605288580237e-06, "loss": 0.4244, "step": 460 }, { "epoch": 0.1526216460162468, "grad_norm": 0.07281242311000824, "learning_rate": 9.936244415656443e-06, "loss": 0.4263, "step": 465 }, { "epoch": 0.15426273898416346, "grad_norm": 0.07637212425470352, "learning_rate": 9.934868956807012e-06, "loss": 0.4217, "step": 470 }, { "epoch": 0.15590383195208007, "grad_norm": 0.07728656381368637, "learning_rate": 9.933478916096235e-06, "loss": 0.4214, "step": 475 }, { "epoch": 0.15754492491999672, "grad_norm": 0.09080182015895844, "learning_rate": 9.932074297631494e-06, "loss": 0.4143, "step": 480 }, { "epoch": 0.15918601788791334, "grad_norm": 0.07812851667404175, "learning_rate": 9.930655105563241e-06, "loss": 0.4307, "step": 485 }, { "epoch": 0.16082711085582999, "grad_norm": 0.0779787003993988, "learning_rate": 9.929221344084994e-06, "loss": 0.433, "step": 490 }, { "epoch": 0.1624682038237466, "grad_norm": 0.0747016966342926, "learning_rate": 9.927773017433325e-06, "loss": 0.4026, "step": 495 }, { "epoch": 0.16410929679166325, "grad_norm": 0.07794748246669769, "learning_rate": 9.926310129887836e-06, "loss": 0.4331, "step": 500 } ], "logging_steps": 5, "max_steps": 9138, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5.57326968815616e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }