{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 9123, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005480653293872629, "grad_norm": 9.609877586364746, "learning_rate": 4.999996294265421e-05, "loss": 5.868, "num_input_tokens_seen": 3944, "step": 5 }, { "epoch": 0.0010961306587745259, "grad_norm": 8.435359001159668, "learning_rate": 4.999985177072669e-05, "loss": 5.1519, "num_input_tokens_seen": 7552, "step": 10 }, { "epoch": 0.001644195988161789, "grad_norm": 4.555312156677246, "learning_rate": 4.999966648454702e-05, "loss": 4.5297, "num_input_tokens_seen": 10552, "step": 15 }, { "epoch": 0.0021922613175490518, "grad_norm": 5.34758186340332, "learning_rate": 4.9999407084664514e-05, "loss": 4.1016, "num_input_tokens_seen": 14720, "step": 20 }, { "epoch": 0.002740326646936315, "grad_norm": 4.284458160400391, "learning_rate": 4.999907357184816e-05, "loss": 4.0075, "num_input_tokens_seen": 17648, "step": 25 }, { "epoch": 0.003288391976323578, "grad_norm": 6.062355041503906, "learning_rate": 4.99986659470867e-05, "loss": 3.9682, "num_input_tokens_seen": 21192, "step": 30 }, { "epoch": 0.003836457305710841, "grad_norm": 3.1782262325286865, "learning_rate": 4.9998184211588574e-05, "loss": 3.6158, "num_input_tokens_seen": 24680, "step": 35 }, { "epoch": 0.0043845226350981035, "grad_norm": 4.492194652557373, "learning_rate": 4.999762836678192e-05, "loss": 4.4312, "num_input_tokens_seen": 27304, "step": 40 }, { "epoch": 0.004932587964485367, "grad_norm": 4.35511589050293, "learning_rate": 4.99969984143146e-05, "loss": 4.0391, "num_input_tokens_seen": 29824, "step": 45 }, { "epoch": 0.00548065329387263, "grad_norm": 4.070927619934082, "learning_rate": 4.999629435605416e-05, "loss": 3.9559, "num_input_tokens_seen": 32496, "step": 50 }, { "epoch": 0.006028718623259892, "grad_norm": 3.5581634044647217, "learning_rate": 4.9995516194087845e-05, 
"loss": 3.6342, "num_input_tokens_seen": 35624, "step": 55 }, { "epoch": 0.006576783952647156, "grad_norm": 3.646406888961792, "learning_rate": 4.999466393072258e-05, "loss": 3.8581, "num_input_tokens_seen": 38896, "step": 60 }, { "epoch": 0.007124849282034418, "grad_norm": 3.964329719543457, "learning_rate": 4.9993737568484967e-05, "loss": 4.0054, "num_input_tokens_seen": 42736, "step": 65 }, { "epoch": 0.007672914611421682, "grad_norm": 4.500335693359375, "learning_rate": 4.99927371101213e-05, "loss": 3.3325, "num_input_tokens_seen": 45256, "step": 70 }, { "epoch": 0.008220979940808944, "grad_norm": 4.3628315925598145, "learning_rate": 4.999166255859752e-05, "loss": 3.5725, "num_input_tokens_seen": 48576, "step": 75 }, { "epoch": 0.008769045270196207, "grad_norm": 3.4167840480804443, "learning_rate": 4.9990513917099225e-05, "loss": 3.7729, "num_input_tokens_seen": 52736, "step": 80 }, { "epoch": 0.00931711059958347, "grad_norm": 4.027678489685059, "learning_rate": 4.998929118903167e-05, "loss": 3.7879, "num_input_tokens_seen": 56256, "step": 85 }, { "epoch": 0.009865175928970734, "grad_norm": 4.3075056076049805, "learning_rate": 4.9987994378019746e-05, "loss": 3.5822, "num_input_tokens_seen": 59448, "step": 90 }, { "epoch": 0.010413241258357997, "grad_norm": 3.550978899002075, "learning_rate": 4.9986623487907955e-05, "loss": 3.8015, "num_input_tokens_seen": 63424, "step": 95 }, { "epoch": 0.01096130658774526, "grad_norm": 3.6582727432250977, "learning_rate": 4.998517852276042e-05, "loss": 3.7712, "num_input_tokens_seen": 66720, "step": 100 }, { "epoch": 0.011509371917132522, "grad_norm": 5.284353733062744, "learning_rate": 4.9983659486860865e-05, "loss": 3.5192, "num_input_tokens_seen": 69280, "step": 105 }, { "epoch": 0.012057437246519784, "grad_norm": 3.712407350540161, "learning_rate": 4.998206638471261e-05, "loss": 3.9006, "num_input_tokens_seen": 72488, "step": 110 }, { "epoch": 0.012605502575907049, "grad_norm": 5.380141258239746, "learning_rate": 
4.9980399221038544e-05, "loss": 3.7691, "num_input_tokens_seen": 75728, "step": 115 }, { "epoch": 0.013153567905294311, "grad_norm": 6.7210693359375, "learning_rate": 4.997865800078112e-05, "loss": 3.4306, "num_input_tokens_seen": 78456, "step": 120 }, { "epoch": 0.013701633234681574, "grad_norm": 3.6822457313537598, "learning_rate": 4.997684272910233e-05, "loss": 3.7098, "num_input_tokens_seen": 81912, "step": 125 }, { "epoch": 0.014249698564068837, "grad_norm": 4.587904453277588, "learning_rate": 4.997495341138373e-05, "loss": 3.7503, "num_input_tokens_seen": 85768, "step": 130 }, { "epoch": 0.0147977638934561, "grad_norm": 4.4221510887146, "learning_rate": 4.997299005322634e-05, "loss": 3.6916, "num_input_tokens_seen": 89744, "step": 135 }, { "epoch": 0.015345829222843364, "grad_norm": 4.955567359924316, "learning_rate": 4.9970952660450734e-05, "loss": 3.8345, "num_input_tokens_seen": 93584, "step": 140 }, { "epoch": 0.015893894552230625, "grad_norm": 3.8360307216644287, "learning_rate": 4.996884123909692e-05, "loss": 3.8622, "num_input_tokens_seen": 96880, "step": 145 }, { "epoch": 0.01644195988161789, "grad_norm": 4.293831825256348, "learning_rate": 4.996665579542439e-05, "loss": 3.6978, "num_input_tokens_seen": 99736, "step": 150 }, { "epoch": 0.016990025211005153, "grad_norm": 3.8615922927856445, "learning_rate": 4.99643963359121e-05, "loss": 3.7886, "num_input_tokens_seen": 102768, "step": 155 }, { "epoch": 0.017538090540392414, "grad_norm": 4.592337608337402, "learning_rate": 4.996206286725841e-05, "loss": 3.4776, "num_input_tokens_seen": 107960, "step": 160 }, { "epoch": 0.01808615586977968, "grad_norm": 5.695650577545166, "learning_rate": 4.995965539638108e-05, "loss": 3.9904, "num_input_tokens_seen": 110712, "step": 165 }, { "epoch": 0.01863422119916694, "grad_norm": 6.341024398803711, "learning_rate": 4.995717393041729e-05, "loss": 3.727, "num_input_tokens_seen": 114496, "step": 170 }, { "epoch": 0.019182286528554204, "grad_norm": 5.523504734039307, 
"learning_rate": 4.995461847672354e-05, "loss": 3.5366, "num_input_tokens_seen": 118408, "step": 175 }, { "epoch": 0.019730351857941468, "grad_norm": 4.576908111572266, "learning_rate": 4.995198904287572e-05, "loss": 3.4552, "num_input_tokens_seen": 122024, "step": 180 }, { "epoch": 0.02027841718732873, "grad_norm": 4.912643909454346, "learning_rate": 4.9949285636669e-05, "loss": 3.878, "num_input_tokens_seen": 125680, "step": 185 }, { "epoch": 0.020826482516715993, "grad_norm": 3.790379047393799, "learning_rate": 4.994650826611787e-05, "loss": 3.7852, "num_input_tokens_seen": 129056, "step": 190 }, { "epoch": 0.021374547846103254, "grad_norm": 4.877086162567139, "learning_rate": 4.9943656939456094e-05, "loss": 3.7977, "num_input_tokens_seen": 132072, "step": 195 }, { "epoch": 0.02192261317549052, "grad_norm": 4.675802230834961, "learning_rate": 4.994073166513667e-05, "loss": 3.6024, "num_input_tokens_seen": 134448, "step": 200 }, { "epoch": 0.022470678504877783, "grad_norm": 9.45524787902832, "learning_rate": 4.9937732451831845e-05, "loss": 3.9247, "num_input_tokens_seen": 137808, "step": 205 }, { "epoch": 0.023018743834265044, "grad_norm": 4.349103927612305, "learning_rate": 4.9934659308433024e-05, "loss": 3.5971, "num_input_tokens_seen": 140752, "step": 210 }, { "epoch": 0.023566809163652308, "grad_norm": 3.90029239654541, "learning_rate": 4.993151224405084e-05, "loss": 3.656, "num_input_tokens_seen": 143328, "step": 215 }, { "epoch": 0.02411487449303957, "grad_norm": 3.4128267765045166, "learning_rate": 4.992829126801502e-05, "loss": 3.7457, "num_input_tokens_seen": 146792, "step": 220 }, { "epoch": 0.024662939822426833, "grad_norm": 5.266091346740723, "learning_rate": 4.9924996389874435e-05, "loss": 3.3972, "num_input_tokens_seen": 150352, "step": 225 }, { "epoch": 0.025211005151814098, "grad_norm": 3.7570605278015137, "learning_rate": 4.992162761939704e-05, "loss": 2.8386, "num_input_tokens_seen": 153688, "step": 230 }, { "epoch": 0.02575907048120136, 
"grad_norm": 3.587785243988037, "learning_rate": 4.991818496656986e-05, "loss": 3.909, "num_input_tokens_seen": 156824, "step": 235 }, { "epoch": 0.026307135810588623, "grad_norm": 4.7243757247924805, "learning_rate": 4.991466844159893e-05, "loss": 3.7806, "num_input_tokens_seen": 159728, "step": 240 }, { "epoch": 0.026855201139975884, "grad_norm": 4.537757396697998, "learning_rate": 4.99110780549093e-05, "loss": 3.7949, "num_input_tokens_seen": 162456, "step": 245 }, { "epoch": 0.027403266469363148, "grad_norm": 5.187793731689453, "learning_rate": 4.990741381714498e-05, "loss": 3.7304, "num_input_tokens_seen": 165176, "step": 250 }, { "epoch": 0.027951331798750412, "grad_norm": 5.144887447357178, "learning_rate": 4.990367573916894e-05, "loss": 3.7232, "num_input_tokens_seen": 168824, "step": 255 }, { "epoch": 0.028499397128137673, "grad_norm": 5.238748550415039, "learning_rate": 4.989986383206302e-05, "loss": 3.5484, "num_input_tokens_seen": 172512, "step": 260 }, { "epoch": 0.029047462457524938, "grad_norm": 4.251674652099609, "learning_rate": 4.9895978107127975e-05, "loss": 3.3929, "num_input_tokens_seen": 175544, "step": 265 }, { "epoch": 0.0295955277869122, "grad_norm": 7.541206359863281, "learning_rate": 4.9892018575883354e-05, "loss": 3.5038, "num_input_tokens_seen": 178784, "step": 270 }, { "epoch": 0.030143593116299463, "grad_norm": 3.8806400299072266, "learning_rate": 4.988798525006755e-05, "loss": 3.9488, "num_input_tokens_seen": 181112, "step": 275 }, { "epoch": 0.030691658445686727, "grad_norm": 3.7918715476989746, "learning_rate": 4.988387814163771e-05, "loss": 3.4375, "num_input_tokens_seen": 185416, "step": 280 }, { "epoch": 0.031239723775073988, "grad_norm": 4.9953813552856445, "learning_rate": 4.9879697262769706e-05, "loss": 3.7866, "num_input_tokens_seen": 188528, "step": 285 }, { "epoch": 0.03178778910446125, "grad_norm": 4.683384418487549, "learning_rate": 4.9875442625858125e-05, "loss": 3.4738, "num_input_tokens_seen": 191472, "step": 290 }, { 
"epoch": 0.03233585443384852, "grad_norm": 3.5414726734161377, "learning_rate": 4.987111424351622e-05, "loss": 3.6306, "num_input_tokens_seen": 195416, "step": 295 }, { "epoch": 0.03288391976323578, "grad_norm": 6.5463547706604, "learning_rate": 4.9866712128575855e-05, "loss": 3.6409, "num_input_tokens_seen": 198576, "step": 300 }, { "epoch": 0.03343198509262304, "grad_norm": 4.8504180908203125, "learning_rate": 4.9862236294087485e-05, "loss": 3.9698, "num_input_tokens_seen": 201432, "step": 305 }, { "epoch": 0.033980050422010306, "grad_norm": 4.2637739181518555, "learning_rate": 4.98576867533201e-05, "loss": 3.4978, "num_input_tokens_seen": 204776, "step": 310 }, { "epoch": 0.03452811575139757, "grad_norm": 6.201929569244385, "learning_rate": 4.9853063519761234e-05, "loss": 3.5306, "num_input_tokens_seen": 207984, "step": 315 }, { "epoch": 0.03507618108078483, "grad_norm": 5.745614528656006, "learning_rate": 4.984836660711686e-05, "loss": 3.4114, "num_input_tokens_seen": 211304, "step": 320 }, { "epoch": 0.035624246410172096, "grad_norm": 7.258711338043213, "learning_rate": 4.9843596029311386e-05, "loss": 3.5909, "num_input_tokens_seen": 214680, "step": 325 }, { "epoch": 0.03617231173955936, "grad_norm": 5.421024799346924, "learning_rate": 4.9838751800487606e-05, "loss": 3.9625, "num_input_tokens_seen": 217472, "step": 330 }, { "epoch": 0.03672037706894662, "grad_norm": 4.33311653137207, "learning_rate": 4.983383393500667e-05, "loss": 3.1581, "num_input_tokens_seen": 220824, "step": 335 }, { "epoch": 0.03726844239833388, "grad_norm": 3.667479991912842, "learning_rate": 4.982884244744801e-05, "loss": 3.6578, "num_input_tokens_seen": 224464, "step": 340 }, { "epoch": 0.037816507727721146, "grad_norm": 4.797352313995361, "learning_rate": 4.982377735260933e-05, "loss": 3.4615, "num_input_tokens_seen": 228120, "step": 345 }, { "epoch": 0.03836457305710841, "grad_norm": 6.432485103607178, "learning_rate": 4.981863866550656e-05, "loss": 3.7862, "num_input_tokens_seen": 
231112, "step": 350 }, { "epoch": 0.03891263838649567, "grad_norm": 5.501232624053955, "learning_rate": 4.981342640137377e-05, "loss": 3.5962, "num_input_tokens_seen": 234456, "step": 355 }, { "epoch": 0.039460703715882936, "grad_norm": 4.993545055389404, "learning_rate": 4.9808140575663186e-05, "loss": 3.4178, "num_input_tokens_seen": 237744, "step": 360 }, { "epoch": 0.0400087690452702, "grad_norm": 4.6652421951293945, "learning_rate": 4.98027812040451e-05, "loss": 3.3215, "num_input_tokens_seen": 240240, "step": 365 }, { "epoch": 0.04055683437465746, "grad_norm": 7.660661220550537, "learning_rate": 4.979734830240784e-05, "loss": 3.4482, "num_input_tokens_seen": 243344, "step": 370 }, { "epoch": 0.041104899704044726, "grad_norm": 5.362435340881348, "learning_rate": 4.979184188685772e-05, "loss": 3.6152, "num_input_tokens_seen": 246928, "step": 375 }, { "epoch": 0.041652965033431986, "grad_norm": 4.019466876983643, "learning_rate": 4.9786261973718984e-05, "loss": 3.4659, "num_input_tokens_seen": 250592, "step": 380 }, { "epoch": 0.04220103036281925, "grad_norm": 3.5128304958343506, "learning_rate": 4.9780608579533774e-05, "loss": 3.369, "num_input_tokens_seen": 254136, "step": 385 }, { "epoch": 0.04274909569220651, "grad_norm": 5.328804969787598, "learning_rate": 4.9774881721062083e-05, "loss": 3.396, "num_input_tokens_seen": 257000, "step": 390 }, { "epoch": 0.043297161021593776, "grad_norm": 3.9344732761383057, "learning_rate": 4.976908141528168e-05, "loss": 3.5748, "num_input_tokens_seen": 259544, "step": 395 }, { "epoch": 0.04384522635098104, "grad_norm": 6.34092903137207, "learning_rate": 4.976320767938808e-05, "loss": 3.2784, "num_input_tokens_seen": 262648, "step": 400 }, { "epoch": 0.0443932916803683, "grad_norm": 6.228747367858887, "learning_rate": 4.975726053079448e-05, "loss": 3.7733, "num_input_tokens_seen": 265800, "step": 405 }, { "epoch": 0.044941357009755566, "grad_norm": 6.360103130340576, "learning_rate": 4.9751239987131735e-05, "loss": 3.3795, 
"num_input_tokens_seen": 268352, "step": 410 }, { "epoch": 0.045489422339142827, "grad_norm": 5.080907821655273, "learning_rate": 4.9745146066248275e-05, "loss": 3.4467, "num_input_tokens_seen": 271416, "step": 415 }, { "epoch": 0.04603748766853009, "grad_norm": 4.075165271759033, "learning_rate": 4.973897878621005e-05, "loss": 3.4581, "num_input_tokens_seen": 274912, "step": 420 }, { "epoch": 0.046585552997917355, "grad_norm": 4.517000675201416, "learning_rate": 4.973273816530051e-05, "loss": 3.3681, "num_input_tokens_seen": 279184, "step": 425 }, { "epoch": 0.047133618327304616, "grad_norm": 5.66272497177124, "learning_rate": 4.9726424222020527e-05, "loss": 3.8983, "num_input_tokens_seen": 283008, "step": 430 }, { "epoch": 0.04768168365669188, "grad_norm": 5.277008056640625, "learning_rate": 4.9720036975088334e-05, "loss": 3.8482, "num_input_tokens_seen": 285408, "step": 435 }, { "epoch": 0.04822974898607914, "grad_norm": 5.911515235900879, "learning_rate": 4.971357644343948e-05, "loss": 3.7086, "num_input_tokens_seen": 287672, "step": 440 }, { "epoch": 0.048777814315466406, "grad_norm": 5.71356725692749, "learning_rate": 4.9707042646226784e-05, "loss": 3.7235, "num_input_tokens_seen": 290608, "step": 445 }, { "epoch": 0.04932587964485367, "grad_norm": 4.606592178344727, "learning_rate": 4.9700435602820276e-05, "loss": 3.5481, "num_input_tokens_seen": 293688, "step": 450 }, { "epoch": 0.04987394497424093, "grad_norm": 5.814152240753174, "learning_rate": 4.969375533280708e-05, "loss": 3.38, "num_input_tokens_seen": 297160, "step": 455 }, { "epoch": 0.050422010303628195, "grad_norm": 5.669627666473389, "learning_rate": 4.968700185599147e-05, "loss": 3.5052, "num_input_tokens_seen": 300608, "step": 460 }, { "epoch": 0.050970075633015456, "grad_norm": 4.943079471588135, "learning_rate": 4.96801751923947e-05, "loss": 3.5689, "num_input_tokens_seen": 303680, "step": 465 }, { "epoch": 0.05151814096240272, "grad_norm": 5.5774664878845215, "learning_rate": 
4.9673275362255035e-05, "loss": 3.1872, "num_input_tokens_seen": 306664, "step": 470 }, { "epoch": 0.052066206291789985, "grad_norm": 5.742215633392334, "learning_rate": 4.966630238602761e-05, "loss": 3.873, "num_input_tokens_seen": 310024, "step": 475 }, { "epoch": 0.052614271621177246, "grad_norm": 5.4475507736206055, "learning_rate": 4.9659256284384434e-05, "loss": 3.5306, "num_input_tokens_seen": 313296, "step": 480 }, { "epoch": 0.05316233695056451, "grad_norm": 5.270495414733887, "learning_rate": 4.965213707821428e-05, "loss": 3.3911, "num_input_tokens_seen": 317528, "step": 485 }, { "epoch": 0.05371040227995177, "grad_norm": 4.345836639404297, "learning_rate": 4.964494478862267e-05, "loss": 3.338, "num_input_tokens_seen": 320224, "step": 490 }, { "epoch": 0.054258467609339035, "grad_norm": 8.715791702270508, "learning_rate": 4.963767943693178e-05, "loss": 3.6676, "num_input_tokens_seen": 323576, "step": 495 }, { "epoch": 0.054806532938726296, "grad_norm": 6.43541955947876, "learning_rate": 4.9630341044680375e-05, "loss": 3.4779, "num_input_tokens_seen": 326840, "step": 500 }, { "epoch": 0.05535459826811356, "grad_norm": 5.299740314483643, "learning_rate": 4.962292963362376e-05, "loss": 3.0794, "num_input_tokens_seen": 330400, "step": 505 }, { "epoch": 0.055902663597500825, "grad_norm": 5.377191543579102, "learning_rate": 4.9615445225733714e-05, "loss": 3.3778, "num_input_tokens_seen": 334264, "step": 510 }, { "epoch": 0.056450728926888086, "grad_norm": 4.671337127685547, "learning_rate": 4.9607887843198417e-05, "loss": 3.2423, "num_input_tokens_seen": 338632, "step": 515 }, { "epoch": 0.05699879425627535, "grad_norm": 4.917747497558594, "learning_rate": 4.960025750842241e-05, "loss": 3.2912, "num_input_tokens_seen": 341576, "step": 520 }, { "epoch": 0.057546859585662614, "grad_norm": 5.633148670196533, "learning_rate": 4.959255424402647e-05, "loss": 3.9649, "num_input_tokens_seen": 343752, "step": 525 }, { "epoch": 0.058094924915049875, "grad_norm": 
5.843842506408691, "learning_rate": 4.9584778072847605e-05, "loss": 3.5301, "num_input_tokens_seen": 346768, "step": 530 }, { "epoch": 0.058642990244437136, "grad_norm": 6.019566059112549, "learning_rate": 4.957692901793896e-05, "loss": 3.7123, "num_input_tokens_seen": 349488, "step": 535 }, { "epoch": 0.0591910555738244, "grad_norm": 5.83019495010376, "learning_rate": 4.9569007102569746e-05, "loss": 4.0987, "num_input_tokens_seen": 353448, "step": 540 }, { "epoch": 0.059739120903211665, "grad_norm": 7.744917392730713, "learning_rate": 4.9561012350225174e-05, "loss": 3.4271, "num_input_tokens_seen": 357336, "step": 545 }, { "epoch": 0.060287186232598926, "grad_norm": 6.845799922943115, "learning_rate": 4.955294478460638e-05, "loss": 3.7176, "num_input_tokens_seen": 361272, "step": 550 }, { "epoch": 0.06083525156198619, "grad_norm": 7.8909592628479, "learning_rate": 4.954480442963038e-05, "loss": 3.3092, "num_input_tokens_seen": 364048, "step": 555 }, { "epoch": 0.061383316891373454, "grad_norm": 6.57379674911499, "learning_rate": 4.953659130942997e-05, "loss": 4.0073, "num_input_tokens_seen": 368336, "step": 560 }, { "epoch": 0.061931382220760715, "grad_norm": 5.875579833984375, "learning_rate": 4.952830544835366e-05, "loss": 3.4651, "num_input_tokens_seen": 370824, "step": 565 }, { "epoch": 0.062479447550147976, "grad_norm": 5.310330867767334, "learning_rate": 4.951994687096562e-05, "loss": 3.8036, "num_input_tokens_seen": 374104, "step": 570 }, { "epoch": 0.06302751287953524, "grad_norm": 6.611202239990234, "learning_rate": 4.9511515602045563e-05, "loss": 3.2939, "num_input_tokens_seen": 376176, "step": 575 }, { "epoch": 0.0635755782089225, "grad_norm": 4.5933451652526855, "learning_rate": 4.950301166658875e-05, "loss": 3.529, "num_input_tokens_seen": 378600, "step": 580 }, { "epoch": 0.06412364353830977, "grad_norm": 5.080543518066406, "learning_rate": 4.9494435089805835e-05, "loss": 4.0958, "num_input_tokens_seen": 382584, "step": 585 }, { "epoch": 
0.06467170886769703, "grad_norm": 4.658755779266357, "learning_rate": 4.948578589712283e-05, "loss": 3.3213, "num_input_tokens_seen": 386376, "step": 590 }, { "epoch": 0.06521977419708429, "grad_norm": 5.556814670562744, "learning_rate": 4.9477064114181026e-05, "loss": 3.5986, "num_input_tokens_seen": 390784, "step": 595 }, { "epoch": 0.06576783952647156, "grad_norm": 6.1433491706848145, "learning_rate": 4.946826976683691e-05, "loss": 3.4305, "num_input_tokens_seen": 395104, "step": 600 }, { "epoch": 0.06631590485585882, "grad_norm": 4.176370143890381, "learning_rate": 4.9459402881162095e-05, "loss": 3.6053, "num_input_tokens_seen": 398072, "step": 605 }, { "epoch": 0.06686397018524608, "grad_norm": 4.746314525604248, "learning_rate": 4.945046348344325e-05, "loss": 3.4613, "num_input_tokens_seen": 401112, "step": 610 }, { "epoch": 0.06741203551463334, "grad_norm": 6.04541015625, "learning_rate": 4.9441451600182e-05, "loss": 3.3843, "num_input_tokens_seen": 404728, "step": 615 }, { "epoch": 0.06796010084402061, "grad_norm": 4.687957763671875, "learning_rate": 4.943236725809485e-05, "loss": 3.6494, "num_input_tokens_seen": 407824, "step": 620 }, { "epoch": 0.06850816617340787, "grad_norm": 5.392053604125977, "learning_rate": 4.942321048411314e-05, "loss": 3.7716, "num_input_tokens_seen": 410064, "step": 625 }, { "epoch": 0.06905623150279513, "grad_norm": 5.196096420288086, "learning_rate": 4.9413981305382936e-05, "loss": 3.7037, "num_input_tokens_seen": 413664, "step": 630 }, { "epoch": 0.0696042968321824, "grad_norm": 4.464987754821777, "learning_rate": 4.940467974926493e-05, "loss": 3.0886, "num_input_tokens_seen": 416752, "step": 635 }, { "epoch": 0.07015236216156966, "grad_norm": 4.81376838684082, "learning_rate": 4.939530584333441e-05, "loss": 3.11, "num_input_tokens_seen": 420552, "step": 640 }, { "epoch": 0.07070042749095692, "grad_norm": 5.184936046600342, "learning_rate": 4.938585961538115e-05, "loss": 3.1776, "num_input_tokens_seen": 423200, "step": 645 }, 
{ "epoch": 0.07124849282034419, "grad_norm": 7.05800724029541, "learning_rate": 4.9376341093409305e-05, "loss": 3.2882, "num_input_tokens_seen": 426840, "step": 650 }, { "epoch": 0.07179655814973145, "grad_norm": 7.437703609466553, "learning_rate": 4.9366750305637385e-05, "loss": 3.3796, "num_input_tokens_seen": 430168, "step": 655 }, { "epoch": 0.07234462347911871, "grad_norm": 7.665436744689941, "learning_rate": 4.9357087280498105e-05, "loss": 3.6646, "num_input_tokens_seen": 433080, "step": 660 }, { "epoch": 0.07289268880850597, "grad_norm": 7.2700324058532715, "learning_rate": 4.934735204663835e-05, "loss": 3.4558, "num_input_tokens_seen": 436600, "step": 665 }, { "epoch": 0.07344075413789324, "grad_norm": 4.932444095611572, "learning_rate": 4.9337544632919085e-05, "loss": 3.1135, "num_input_tokens_seen": 439552, "step": 670 }, { "epoch": 0.0739888194672805, "grad_norm": 6.515824794769287, "learning_rate": 4.9327665068415254e-05, "loss": 3.3952, "num_input_tokens_seen": 442776, "step": 675 }, { "epoch": 0.07453688479666776, "grad_norm": 6.392978668212891, "learning_rate": 4.931771338241566e-05, "loss": 3.5728, "num_input_tokens_seen": 445344, "step": 680 }, { "epoch": 0.07508495012605503, "grad_norm": 5.692570209503174, "learning_rate": 4.930768960442299e-05, "loss": 3.3921, "num_input_tokens_seen": 449360, "step": 685 }, { "epoch": 0.07563301545544229, "grad_norm": 10.294317245483398, "learning_rate": 4.929759376415358e-05, "loss": 3.6814, "num_input_tokens_seen": 452736, "step": 690 }, { "epoch": 0.07618108078482955, "grad_norm": 7.613968849182129, "learning_rate": 4.9287425891537454e-05, "loss": 3.5298, "num_input_tokens_seen": 455648, "step": 695 }, { "epoch": 0.07672914611421681, "grad_norm": 5.538883209228516, "learning_rate": 4.927718601671816e-05, "loss": 3.4538, "num_input_tokens_seen": 458256, "step": 700 }, { "epoch": 0.07727721144360408, "grad_norm": 5.105963706970215, "learning_rate": 4.926687417005268e-05, "loss": 3.3759, "num_input_tokens_seen": 
461984, "step": 705 }, { "epoch": 0.07782527677299134, "grad_norm": 5.424991130828857, "learning_rate": 4.925649038211142e-05, "loss": 3.4941, "num_input_tokens_seen": 465216, "step": 710 }, { "epoch": 0.0783733421023786, "grad_norm": 6.287330627441406, "learning_rate": 4.924603468367801e-05, "loss": 3.3536, "num_input_tokens_seen": 468496, "step": 715 }, { "epoch": 0.07892140743176587, "grad_norm": 7.270327568054199, "learning_rate": 4.923550710574929e-05, "loss": 3.1898, "num_input_tokens_seen": 471784, "step": 720 }, { "epoch": 0.07946947276115313, "grad_norm": 5.402751922607422, "learning_rate": 4.922490767953519e-05, "loss": 3.7645, "num_input_tokens_seen": 474928, "step": 725 }, { "epoch": 0.0800175380905404, "grad_norm": 5.472609996795654, "learning_rate": 4.921423643645863e-05, "loss": 3.5023, "num_input_tokens_seen": 479376, "step": 730 }, { "epoch": 0.08056560341992766, "grad_norm": 4.318566799163818, "learning_rate": 4.9203493408155455e-05, "loss": 3.1444, "num_input_tokens_seen": 482328, "step": 735 }, { "epoch": 0.08111366874931492, "grad_norm": 6.903258800506592, "learning_rate": 4.919267862647431e-05, "loss": 3.8837, "num_input_tokens_seen": 486248, "step": 740 }, { "epoch": 0.08166173407870218, "grad_norm": 4.821303844451904, "learning_rate": 4.918179212347657e-05, "loss": 3.7363, "num_input_tokens_seen": 489736, "step": 745 }, { "epoch": 0.08220979940808945, "grad_norm": 4.108252048492432, "learning_rate": 4.917083393143621e-05, "loss": 3.0709, "num_input_tokens_seen": 492784, "step": 750 }, { "epoch": 0.0827578647374767, "grad_norm": 6.259218215942383, "learning_rate": 4.915980408283977e-05, "loss": 3.4733, "num_input_tokens_seen": 496528, "step": 755 }, { "epoch": 0.08330593006686397, "grad_norm": 5.9338531494140625, "learning_rate": 4.91487026103862e-05, "loss": 3.8987, "num_input_tokens_seen": 500832, "step": 760 }, { "epoch": 0.08385399539625123, "grad_norm": 5.397777557373047, "learning_rate": 4.913752954698677e-05, "loss": 3.3764, 
"num_input_tokens_seen": 503744, "step": 765 }, { "epoch": 0.0844020607256385, "grad_norm": 5.536934852600098, "learning_rate": 4.912628492576503e-05, "loss": 3.7953, "num_input_tokens_seen": 507656, "step": 770 }, { "epoch": 0.08495012605502576, "grad_norm": 5.932541847229004, "learning_rate": 4.9114968780056635e-05, "loss": 3.4254, "num_input_tokens_seen": 511216, "step": 775 }, { "epoch": 0.08549819138441302, "grad_norm": 5.971353530883789, "learning_rate": 4.910358114340929e-05, "loss": 3.6466, "num_input_tokens_seen": 514328, "step": 780 }, { "epoch": 0.08604625671380028, "grad_norm": 8.010024070739746, "learning_rate": 4.9092122049582636e-05, "loss": 3.9475, "num_input_tokens_seen": 518200, "step": 785 }, { "epoch": 0.08659432204318755, "grad_norm": 6.520806312561035, "learning_rate": 4.9080591532548175e-05, "loss": 3.4056, "num_input_tokens_seen": 521704, "step": 790 }, { "epoch": 0.0871423873725748, "grad_norm": 5.646440029144287, "learning_rate": 4.9068989626489126e-05, "loss": 3.5912, "num_input_tokens_seen": 524456, "step": 795 }, { "epoch": 0.08769045270196207, "grad_norm": 4.937885284423828, "learning_rate": 4.9057316365800366e-05, "loss": 3.4854, "num_input_tokens_seen": 526920, "step": 800 }, { "epoch": 0.08823851803134934, "grad_norm": 6.204067230224609, "learning_rate": 4.904557178508829e-05, "loss": 3.3649, "num_input_tokens_seen": 530544, "step": 805 }, { "epoch": 0.0887865833607366, "grad_norm": 6.427296161651611, "learning_rate": 4.9033755919170733e-05, "loss": 3.8582, "num_input_tokens_seen": 532832, "step": 810 }, { "epoch": 0.08933464869012386, "grad_norm": 7.1010589599609375, "learning_rate": 4.9021868803076875e-05, "loss": 3.5353, "num_input_tokens_seen": 536056, "step": 815 }, { "epoch": 0.08988271401951113, "grad_norm": 4.813199043273926, "learning_rate": 4.900991047204712e-05, "loss": 3.2529, "num_input_tokens_seen": 539248, "step": 820 }, { "epoch": 0.09043077934889839, "grad_norm": 7.545267581939697, "learning_rate": 
4.899788096153297e-05, "loss": 3.0758, "num_input_tokens_seen": 543584, "step": 825 }, { "epoch": 0.09097884467828565, "grad_norm": 5.574884414672852, "learning_rate": 4.898578030719698e-05, "loss": 3.0291, "num_input_tokens_seen": 546792, "step": 830 }, { "epoch": 0.09152691000767292, "grad_norm": 5.587398529052734, "learning_rate": 4.897360854491259e-05, "loss": 3.2747, "num_input_tokens_seen": 549296, "step": 835 }, { "epoch": 0.09207497533706017, "grad_norm": 6.558215618133545, "learning_rate": 4.896136571076406e-05, "loss": 3.4765, "num_input_tokens_seen": 551784, "step": 840 }, { "epoch": 0.09262304066644744, "grad_norm": 5.221803188323975, "learning_rate": 4.894905184104634e-05, "loss": 3.3299, "num_input_tokens_seen": 555608, "step": 845 }, { "epoch": 0.09317110599583471, "grad_norm": null, "learning_rate": 4.8939149624187016e-05, "loss": 3.5208, "num_input_tokens_seen": 558848, "step": 850 }, { "epoch": 0.09371917132522196, "grad_norm": 5.915983200073242, "learning_rate": 4.8926707982580194e-05, "loss": 3.5031, "num_input_tokens_seen": 562384, "step": 855 }, { "epoch": 0.09426723665460923, "grad_norm": 6.868443965911865, "learning_rate": 4.891419540815006e-05, "loss": 3.5194, "num_input_tokens_seen": 565648, "step": 860 }, { "epoch": 0.09481530198399649, "grad_norm": 6.696837902069092, "learning_rate": 4.8901611937991244e-05, "loss": 3.4405, "num_input_tokens_seen": 568384, "step": 865 }, { "epoch": 0.09536336731338375, "grad_norm": 6.879650592803955, "learning_rate": 4.8888957609408535e-05, "loss": 3.2062, "num_input_tokens_seen": 571184, "step": 870 }, { "epoch": 0.09591143264277102, "grad_norm": 5.235931396484375, "learning_rate": 4.8876232459916805e-05, "loss": 3.351, "num_input_tokens_seen": 575328, "step": 875 }, { "epoch": 0.09645949797215828, "grad_norm": 6.496284008026123, "learning_rate": 4.886343652724088e-05, "loss": 3.3753, "num_input_tokens_seen": 578520, "step": 880 }, { "epoch": 0.09700756330154554, "grad_norm": 8.708456039428711, 
"learning_rate": 4.8850569849315414e-05, "loss": 3.4456, "num_input_tokens_seen": 581688, "step": 885 }, { "epoch": 0.09755562863093281, "grad_norm": 5.558722496032715, "learning_rate": 4.883763246428481e-05, "loss": 3.3753, "num_input_tokens_seen": 584736, "step": 890 }, { "epoch": 0.09810369396032007, "grad_norm": 6.443663597106934, "learning_rate": 4.882462441050308e-05, "loss": 3.5381, "num_input_tokens_seen": 587952, "step": 895 }, { "epoch": 0.09865175928970733, "grad_norm": 6.3144073486328125, "learning_rate": 4.881154572653373e-05, "loss": 3.5416, "num_input_tokens_seen": 590704, "step": 900 }, { "epoch": 0.0991998246190946, "grad_norm": 5.615172386169434, "learning_rate": 4.8798396451149676e-05, "loss": 3.5944, "num_input_tokens_seen": 593056, "step": 905 }, { "epoch": 0.09974788994848185, "grad_norm": 6.011329174041748, "learning_rate": 4.8785176623333094e-05, "loss": 3.2378, "num_input_tokens_seen": 596584, "step": 910 }, { "epoch": 0.10029595527786912, "grad_norm": 5.445102214813232, "learning_rate": 4.8771886282275324e-05, "loss": 3.6375, "num_input_tokens_seen": 600080, "step": 915 }, { "epoch": 0.10084402060725639, "grad_norm": 6.635453701019287, "learning_rate": 4.875852546737675e-05, "loss": 3.5498, "num_input_tokens_seen": 602696, "step": 920 }, { "epoch": 0.10139208593664364, "grad_norm": 5.236489772796631, "learning_rate": 4.874509421824667e-05, "loss": 3.4216, "num_input_tokens_seen": 606200, "step": 925 }, { "epoch": 0.10194015126603091, "grad_norm": 6.734245300292969, "learning_rate": 4.87315925747032e-05, "loss": 3.3747, "num_input_tokens_seen": 609848, "step": 930 }, { "epoch": 0.10248821659541818, "grad_norm": 6.802552223205566, "learning_rate": 4.871802057677315e-05, "loss": 3.2441, "num_input_tokens_seen": 613440, "step": 935 }, { "epoch": 0.10303628192480543, "grad_norm": 6.780172824859619, "learning_rate": 4.8704378264691894e-05, "loss": 3.4606, "num_input_tokens_seen": 617088, "step": 940 }, { "epoch": 0.1035843472541927, "grad_norm": 
6.527922630310059, "learning_rate": 4.869066567890327e-05, "loss": 3.4019, "num_input_tokens_seen": 619952, "step": 945 }, { "epoch": 0.10413241258357997, "grad_norm": 6.2412214279174805, "learning_rate": 4.867688286005944e-05, "loss": 3.2408, "num_input_tokens_seen": 623088, "step": 950 }, { "epoch": 0.10468047791296722, "grad_norm": 6.477228164672852, "learning_rate": 4.8663029849020775e-05, "loss": 3.2491, "num_input_tokens_seen": 626376, "step": 955 }, { "epoch": 0.10522854324235449, "grad_norm": 5.359529495239258, "learning_rate": 4.864910668685574e-05, "loss": 3.1534, "num_input_tokens_seen": 628800, "step": 960 }, { "epoch": 0.10577660857174175, "grad_norm": 5.2979960441589355, "learning_rate": 4.863511341484077e-05, "loss": 3.4653, "num_input_tokens_seen": 631312, "step": 965 }, { "epoch": 0.10632467390112901, "grad_norm": 12.67263126373291, "learning_rate": 4.8621050074460136e-05, "loss": 3.8407, "num_input_tokens_seen": 634144, "step": 970 }, { "epoch": 0.10687273923051628, "grad_norm": 4.020299434661865, "learning_rate": 4.860691670740587e-05, "loss": 3.6273, "num_input_tokens_seen": 637568, "step": 975 }, { "epoch": 0.10742080455990353, "grad_norm": 5.12907075881958, "learning_rate": 4.8592713355577555e-05, "loss": 2.9803, "num_input_tokens_seen": 640368, "step": 980 }, { "epoch": 0.1079688698892908, "grad_norm": 5.088891983032227, "learning_rate": 4.8578440061082275e-05, "loss": 3.0532, "num_input_tokens_seen": 643928, "step": 985 }, { "epoch": 0.10851693521867807, "grad_norm": 6.150454521179199, "learning_rate": 4.856409686623447e-05, "loss": 3.5733, "num_input_tokens_seen": 648192, "step": 990 }, { "epoch": 0.10906500054806532, "grad_norm": 6.601188659667969, "learning_rate": 4.85496838135558e-05, "loss": 3.4824, "num_input_tokens_seen": 652272, "step": 995 }, { "epoch": 0.10961306587745259, "grad_norm": 6.9974141120910645, "learning_rate": 4.8535200945775016e-05, "loss": 3.516, "num_input_tokens_seen": 655696, "step": 1000 }, { "epoch": 
0.11016113120683986, "grad_norm": 7.116706371307373, "learning_rate": 4.8520648305827855e-05, "loss": 3.4208, "num_input_tokens_seen": 658560, "step": 1005 }, { "epoch": 0.11070919653622711, "grad_norm": 5.209189414978027, "learning_rate": 4.850602593685689e-05, "loss": 3.353, "num_input_tokens_seen": 662152, "step": 1010 }, { "epoch": 0.11125726186561438, "grad_norm": 5.9092278480529785, "learning_rate": 4.8491333882211416e-05, "loss": 3.2833, "num_input_tokens_seen": 665968, "step": 1015 }, { "epoch": 0.11180532719500165, "grad_norm": 7.026948928833008, "learning_rate": 4.847657218544732e-05, "loss": 3.291, "num_input_tokens_seen": 668808, "step": 1020 }, { "epoch": 0.1123533925243889, "grad_norm": 6.154213905334473, "learning_rate": 4.8461740890326936e-05, "loss": 3.3035, "num_input_tokens_seen": 672280, "step": 1025 }, { "epoch": 0.11290145785377617, "grad_norm": 6.6929521560668945, "learning_rate": 4.844684004081895e-05, "loss": 3.6387, "num_input_tokens_seen": 675184, "step": 1030 }, { "epoch": 0.11344952318316344, "grad_norm": 5.449969291687012, "learning_rate": 4.843186968109823e-05, "loss": 3.1393, "num_input_tokens_seen": 677824, "step": 1035 }, { "epoch": 0.1139975885125507, "grad_norm": 3.6720149517059326, "learning_rate": 4.841682985554573e-05, "loss": 3.2646, "num_input_tokens_seen": 682856, "step": 1040 }, { "epoch": 0.11454565384193796, "grad_norm": 5.606584072113037, "learning_rate": 4.8401720608748324e-05, "loss": 3.3697, "num_input_tokens_seen": 687680, "step": 1045 }, { "epoch": 0.11509371917132523, "grad_norm": 5.044498920440674, "learning_rate": 4.83865419854987e-05, "loss": 3.3275, "num_input_tokens_seen": 690616, "step": 1050 }, { "epoch": 0.11564178450071248, "grad_norm": 5.938497543334961, "learning_rate": 4.83712940307952e-05, "loss": 3.1055, "num_input_tokens_seen": 693808, "step": 1055 }, { "epoch": 0.11618984983009975, "grad_norm": 7.216318607330322, "learning_rate": 4.8355976789841754e-05, "loss": 3.5388, "num_input_tokens_seen": 
696992, "step": 1060 }, { "epoch": 0.116737915159487, "grad_norm": 5.2063164710998535, "learning_rate": 4.834059030804764e-05, "loss": 3.3436, "num_input_tokens_seen": 700448, "step": 1065 }, { "epoch": 0.11728598048887427, "grad_norm": 6.457626819610596, "learning_rate": 4.832513463102745e-05, "loss": 3.281, "num_input_tokens_seen": 702928, "step": 1070 }, { "epoch": 0.11783404581826154, "grad_norm": 5.837212562561035, "learning_rate": 4.8309609804600886e-05, "loss": 3.3414, "num_input_tokens_seen": 707064, "step": 1075 }, { "epoch": 0.1183821111476488, "grad_norm": 5.227325439453125, "learning_rate": 4.829401587479265e-05, "loss": 3.0907, "num_input_tokens_seen": 711056, "step": 1080 }, { "epoch": 0.11893017647703606, "grad_norm": 7.185408115386963, "learning_rate": 4.8278352887832326e-05, "loss": 3.159, "num_input_tokens_seen": 714472, "step": 1085 }, { "epoch": 0.11947824180642333, "grad_norm": 7.311601638793945, "learning_rate": 4.82626208901542e-05, "loss": 3.5405, "num_input_tokens_seen": 717400, "step": 1090 }, { "epoch": 0.12002630713581058, "grad_norm": 4.9710693359375, "learning_rate": 4.824681992839717e-05, "loss": 3.3058, "num_input_tokens_seen": 720472, "step": 1095 }, { "epoch": 0.12057437246519785, "grad_norm": 4.5781779289245605, "learning_rate": 4.823095004940456e-05, "loss": 3.1374, "num_input_tokens_seen": 723808, "step": 1100 }, { "epoch": 0.12112243779458512, "grad_norm": 6.077118396759033, "learning_rate": 4.8215011300224027e-05, "loss": 3.1628, "num_input_tokens_seen": 727576, "step": 1105 }, { "epoch": 0.12167050312397237, "grad_norm": 6.6747870445251465, "learning_rate": 4.819900372810739e-05, "loss": 3.5095, "num_input_tokens_seen": 730536, "step": 1110 }, { "epoch": 0.12221856845335964, "grad_norm": 5.468014240264893, "learning_rate": 4.818292738051049e-05, "loss": 3.521, "num_input_tokens_seen": 733024, "step": 1115 }, { "epoch": 0.12276663378274691, "grad_norm": 6.263638019561768, "learning_rate": 4.816678230509308e-05, "loss": 3.2318, 
"num_input_tokens_seen": 736048, "step": 1120 }, { "epoch": 0.12331469911213416, "grad_norm": 5.998656272888184, "learning_rate": 4.8150568549718655e-05, "loss": 3.0286, "num_input_tokens_seen": 739264, "step": 1125 }, { "epoch": 0.12386276444152143, "grad_norm": 6.395206928253174, "learning_rate": 4.81342861624543e-05, "loss": 3.4223, "num_input_tokens_seen": 742008, "step": 1130 }, { "epoch": 0.1244108297709087, "grad_norm": 6.199779510498047, "learning_rate": 4.811793519157059e-05, "loss": 3.5237, "num_input_tokens_seen": 745064, "step": 1135 }, { "epoch": 0.12495889510029595, "grad_norm": 6.504228115081787, "learning_rate": 4.81015156855414e-05, "loss": 3.4249, "num_input_tokens_seen": 748104, "step": 1140 }, { "epoch": 0.1255069604296832, "grad_norm": 6.280592441558838, "learning_rate": 4.80850276930438e-05, "loss": 3.0411, "num_input_tokens_seen": 752032, "step": 1145 }, { "epoch": 0.1260550257590705, "grad_norm": 8.529096603393555, "learning_rate": 4.806847126295789e-05, "loss": 3.1457, "num_input_tokens_seen": 755400, "step": 1150 }, { "epoch": 0.12660309108845774, "grad_norm": 6.454196453094482, "learning_rate": 4.8051846444366676e-05, "loss": 3.0008, "num_input_tokens_seen": 758392, "step": 1155 }, { "epoch": 0.127151156417845, "grad_norm": 6.862017631530762, "learning_rate": 4.803515328655586e-05, "loss": 3.3972, "num_input_tokens_seen": 760824, "step": 1160 }, { "epoch": 0.12769922174723228, "grad_norm": 6.56373929977417, "learning_rate": 4.8018391839013784e-05, "loss": 3.4338, "num_input_tokens_seen": 763680, "step": 1165 }, { "epoch": 0.12824728707661953, "grad_norm": 5.431229114532471, "learning_rate": 4.800156215143124e-05, "loss": 3.2619, "num_input_tokens_seen": 767352, "step": 1170 }, { "epoch": 0.12879535240600679, "grad_norm": 5.761483192443848, "learning_rate": 4.7984664273701305e-05, "loss": 3.3616, "num_input_tokens_seen": 771096, "step": 1175 }, { "epoch": 0.12934341773539407, "grad_norm": 7.804869651794434, "learning_rate": 
4.796769825591921e-05, "loss": 3.2658, "num_input_tokens_seen": 774192, "step": 1180 }, { "epoch": 0.12989148306478132, "grad_norm": 5.688300609588623, "learning_rate": 4.7950664148382205e-05, "loss": 3.7069, "num_input_tokens_seen": 777712, "step": 1185 }, { "epoch": 0.13043954839416858, "grad_norm": 4.980658054351807, "learning_rate": 4.793356200158941e-05, "loss": 3.0386, "num_input_tokens_seen": 780680, "step": 1190 }, { "epoch": 0.13098761372355586, "grad_norm": 6.9450249671936035, "learning_rate": 4.791639186624162e-05, "loss": 3.4293, "num_input_tokens_seen": 783664, "step": 1195 }, { "epoch": 0.1315356790529431, "grad_norm": 6.7938408851623535, "learning_rate": 4.789915379324121e-05, "loss": 3.2908, "num_input_tokens_seen": 787480, "step": 1200 }, { "epoch": 0.13208374438233036, "grad_norm": 5.833454608917236, "learning_rate": 4.788184783369196e-05, "loss": 3.3431, "num_input_tokens_seen": 791560, "step": 1205 }, { "epoch": 0.13263180971171765, "grad_norm": 6.020946502685547, "learning_rate": 4.786447403889891e-05, "loss": 3.1235, "num_input_tokens_seen": 794600, "step": 1210 }, { "epoch": 0.1331798750411049, "grad_norm": 9.639689445495605, "learning_rate": 4.78470324603682e-05, "loss": 3.357, "num_input_tokens_seen": 796976, "step": 1215 }, { "epoch": 0.13372794037049215, "grad_norm": 5.102296829223633, "learning_rate": 4.782952314980691e-05, "loss": 3.4762, "num_input_tokens_seen": 801208, "step": 1220 }, { "epoch": 0.13427600569987944, "grad_norm": 6.015713214874268, "learning_rate": 4.781194615912292e-05, "loss": 3.2738, "num_input_tokens_seen": 804472, "step": 1225 }, { "epoch": 0.1348240710292667, "grad_norm": 7.88398551940918, "learning_rate": 4.7794301540424774e-05, "loss": 3.3333, "num_input_tokens_seen": 807568, "step": 1230 }, { "epoch": 0.13537213635865394, "grad_norm": 6.841670989990234, "learning_rate": 4.7776589346021486e-05, "loss": 3.5167, "num_input_tokens_seen": 811016, "step": 1235 }, { "epoch": 0.13592020168804123, "grad_norm": 
6.089728355407715, "learning_rate": 4.775880962842241e-05, "loss": 3.703, "num_input_tokens_seen": 814536, "step": 1240 }, { "epoch": 0.13646826701742848, "grad_norm": 6.35260009765625, "learning_rate": 4.774096244033707e-05, "loss": 3.1131, "num_input_tokens_seen": 817496, "step": 1245 }, { "epoch": 0.13701633234681573, "grad_norm": 5.8579254150390625, "learning_rate": 4.772304783467503e-05, "loss": 3.2992, "num_input_tokens_seen": 821712, "step": 1250 }, { "epoch": 0.13756439767620302, "grad_norm": 5.486454963684082, "learning_rate": 4.7705065864545695e-05, "loss": 3.1721, "num_input_tokens_seen": 824688, "step": 1255 }, { "epoch": 0.13811246300559027, "grad_norm": 6.544208526611328, "learning_rate": 4.7687016583258203e-05, "loss": 3.4493, "num_input_tokens_seen": 828400, "step": 1260 }, { "epoch": 0.13866052833497752, "grad_norm": 4.948637008666992, "learning_rate": 4.7668900044321236e-05, "loss": 3.0927, "num_input_tokens_seen": 831936, "step": 1265 }, { "epoch": 0.1392085936643648, "grad_norm": 6.64813756942749, "learning_rate": 4.7650716301442856e-05, "loss": 3.6065, "num_input_tokens_seen": 834912, "step": 1270 }, { "epoch": 0.13975665899375206, "grad_norm": 7.289310455322266, "learning_rate": 4.763246540853035e-05, "loss": 3.3871, "num_input_tokens_seen": 839072, "step": 1275 }, { "epoch": 0.1403047243231393, "grad_norm": 5.887922763824463, "learning_rate": 4.761414741969011e-05, "loss": 3.1424, "num_input_tokens_seen": 842568, "step": 1280 }, { "epoch": 0.1408527896525266, "grad_norm": 6.820570468902588, "learning_rate": 4.7595762389227406e-05, "loss": 3.0197, "num_input_tokens_seen": 845808, "step": 1285 }, { "epoch": 0.14140085498191385, "grad_norm": 6.593437671661377, "learning_rate": 4.757731037164628e-05, "loss": 3.2013, "num_input_tokens_seen": 849184, "step": 1290 }, { "epoch": 0.1419489203113011, "grad_norm": 8.89852523803711, "learning_rate": 4.7558791421649354e-05, "loss": 3.5085, "num_input_tokens_seen": 852392, "step": 1295 }, { "epoch": 
0.14249698564068838, "grad_norm": 7.368271827697754, "learning_rate": 4.754020559413768e-05, "loss": 3.3167, "num_input_tokens_seen": 855376, "step": 1300 }, { "epoch": 0.14304505097007564, "grad_norm": 5.54932975769043, "learning_rate": 4.752155294421056e-05, "loss": 3.0516, "num_input_tokens_seen": 858720, "step": 1305 }, { "epoch": 0.1435931162994629, "grad_norm": 8.180092811584473, "learning_rate": 4.750283352716543e-05, "loss": 3.4647, "num_input_tokens_seen": 861312, "step": 1310 }, { "epoch": 0.14414118162885015, "grad_norm": 6.608414173126221, "learning_rate": 4.748404739849763e-05, "loss": 3.3686, "num_input_tokens_seen": 864368, "step": 1315 }, { "epoch": 0.14468924695823743, "grad_norm": 6.880706787109375, "learning_rate": 4.746519461390029e-05, "loss": 3.0061, "num_input_tokens_seen": 868000, "step": 1320 }, { "epoch": 0.14523731228762468, "grad_norm": 4.034643650054932, "learning_rate": 4.744627522926414e-05, "loss": 3.3709, "num_input_tokens_seen": 871648, "step": 1325 }, { "epoch": 0.14578537761701194, "grad_norm": 5.335696220397949, "learning_rate": 4.742728930067736e-05, "loss": 3.0955, "num_input_tokens_seen": 875440, "step": 1330 }, { "epoch": 0.14633344294639922, "grad_norm": 8.005532264709473, "learning_rate": 4.7408236884425396e-05, "loss": 3.6277, "num_input_tokens_seen": 879208, "step": 1335 }, { "epoch": 0.14688150827578647, "grad_norm": 7.770083904266357, "learning_rate": 4.7389118036990795e-05, "loss": 3.5794, "num_input_tokens_seen": 882040, "step": 1340 }, { "epoch": 0.14742957360517372, "grad_norm": 6.539053916931152, "learning_rate": 4.736993281505307e-05, "loss": 3.2326, "num_input_tokens_seen": 884984, "step": 1345 }, { "epoch": 0.147977638934561, "grad_norm": 7.831300258636475, "learning_rate": 4.73506812754885e-05, "loss": 3.2767, "num_input_tokens_seen": 888128, "step": 1350 }, { "epoch": 0.14852570426394826, "grad_norm": 5.242404937744141, "learning_rate": 4.733136347536995e-05, "loss": 3.4698, "num_input_tokens_seen": 890520, 
"step": 1355 }, { "epoch": 0.14907376959333551, "grad_norm": 5.803912162780762, "learning_rate": 4.731197947196673e-05, "loss": 3.4711, "num_input_tokens_seen": 893464, "step": 1360 }, { "epoch": 0.1496218349227228, "grad_norm": 8.300127983093262, "learning_rate": 4.7292529322744416e-05, "loss": 3.2302, "num_input_tokens_seen": 897520, "step": 1365 }, { "epoch": 0.15016990025211005, "grad_norm": 5.02566385269165, "learning_rate": 4.7273013085364694e-05, "loss": 3.2959, "num_input_tokens_seen": 901416, "step": 1370 }, { "epoch": 0.1507179655814973, "grad_norm": 4.600845813751221, "learning_rate": 4.725343081768514e-05, "loss": 3.3303, "num_input_tokens_seen": 906432, "step": 1375 }, { "epoch": 0.15126603091088459, "grad_norm": 6.849578380584717, "learning_rate": 4.723378257775912e-05, "loss": 3.1125, "num_input_tokens_seen": 909264, "step": 1380 }, { "epoch": 0.15181409624027184, "grad_norm": 7.15298318862915, "learning_rate": 4.7214068423835566e-05, "loss": 3.2795, "num_input_tokens_seen": 912464, "step": 1385 }, { "epoch": 0.1523621615696591, "grad_norm": 5.415898323059082, "learning_rate": 4.7194288414358804e-05, "loss": 3.1385, "num_input_tokens_seen": 915960, "step": 1390 }, { "epoch": 0.15291022689904638, "grad_norm": 6.559721946716309, "learning_rate": 4.717444260796841e-05, "loss": 3.4027, "num_input_tokens_seen": 918984, "step": 1395 }, { "epoch": 0.15345829222843363, "grad_norm": 5.312758922576904, "learning_rate": 4.715453106349902e-05, "loss": 3.4349, "num_input_tokens_seen": 921912, "step": 1400 }, { "epoch": 0.15400635755782088, "grad_norm": 6.985774040222168, "learning_rate": 4.7134553839980143e-05, "loss": 3.7019, "num_input_tokens_seen": 925848, "step": 1405 }, { "epoch": 0.15455442288720816, "grad_norm": 6.191575527191162, "learning_rate": 4.711451099663603e-05, "loss": 3.4276, "num_input_tokens_seen": 929792, "step": 1410 }, { "epoch": 0.15510248821659542, "grad_norm": 6.040350437164307, "learning_rate": 4.709440259288542e-05, "loss": 2.9173, 
"num_input_tokens_seen": 932400, "step": 1415 }, { "epoch": 0.15565055354598267, "grad_norm": 6.164414405822754, "learning_rate": 4.707422868834146e-05, "loss": 3.1684, "num_input_tokens_seen": 935408, "step": 1420 }, { "epoch": 0.15619861887536995, "grad_norm": 7.248453140258789, "learning_rate": 4.705398934281145e-05, "loss": 3.6365, "num_input_tokens_seen": 938184, "step": 1425 }, { "epoch": 0.1567466842047572, "grad_norm": 5.813863754272461, "learning_rate": 4.70336846162967e-05, "loss": 3.405, "num_input_tokens_seen": 941272, "step": 1430 }, { "epoch": 0.15729474953414446, "grad_norm": 6.239504337310791, "learning_rate": 4.701331456899236e-05, "loss": 3.0722, "num_input_tokens_seen": 944728, "step": 1435 }, { "epoch": 0.15784281486353174, "grad_norm": 9.224727630615234, "learning_rate": 4.6992879261287226e-05, "loss": 3.2262, "num_input_tokens_seen": 947528, "step": 1440 }, { "epoch": 0.158390880192919, "grad_norm": 7.570671558380127, "learning_rate": 4.6972378753763545e-05, "loss": 3.2116, "num_input_tokens_seen": 950128, "step": 1445 }, { "epoch": 0.15893894552230625, "grad_norm": 4.781320095062256, "learning_rate": 4.6951813107196874e-05, "loss": 3.2953, "num_input_tokens_seen": 954336, "step": 1450 }, { "epoch": 0.15948701085169353, "grad_norm": 7.117349147796631, "learning_rate": 4.693118238255587e-05, "loss": 3.2755, "num_input_tokens_seen": 957704, "step": 1455 }, { "epoch": 0.1600350761810808, "grad_norm": 6.41115665435791, "learning_rate": 4.6910486641002136e-05, "loss": 3.2523, "num_input_tokens_seen": 960184, "step": 1460 }, { "epoch": 0.16058314151046804, "grad_norm": 8.865285873413086, "learning_rate": 4.688972594389001e-05, "loss": 3.3998, "num_input_tokens_seen": 963264, "step": 1465 }, { "epoch": 0.16113120683985532, "grad_norm": 4.722679615020752, "learning_rate": 4.6868900352766394e-05, "loss": 3.0958, "num_input_tokens_seen": 966536, "step": 1470 }, { "epoch": 0.16167927216924258, "grad_norm": 8.334817886352539, "learning_rate": 
4.6848009929370575e-05, "loss": 3.2969, "num_input_tokens_seen": 969008, "step": 1475 }, { "epoch": 0.16222733749862983, "grad_norm": 6.063559055328369, "learning_rate": 4.682705473563406e-05, "loss": 3.0186, "num_input_tokens_seen": 972168, "step": 1480 }, { "epoch": 0.1627754028280171, "grad_norm": 6.434414386749268, "learning_rate": 4.680603483368033e-05, "loss": 3.4689, "num_input_tokens_seen": 976096, "step": 1485 }, { "epoch": 0.16332346815740437, "grad_norm": 8.82730770111084, "learning_rate": 4.678495028582476e-05, "loss": 3.2562, "num_input_tokens_seen": 979080, "step": 1490 }, { "epoch": 0.16387153348679162, "grad_norm": 6.3244171142578125, "learning_rate": 4.676380115457431e-05, "loss": 3.0127, "num_input_tokens_seen": 981896, "step": 1495 }, { "epoch": 0.1644195988161789, "grad_norm": 6.033606052398682, "learning_rate": 4.674258750262745e-05, "loss": 3.1823, "num_input_tokens_seen": 985072, "step": 1500 }, { "epoch": 0.16496766414556616, "grad_norm": 4.211119174957275, "learning_rate": 4.6721309392873926e-05, "loss": 3.1351, "num_input_tokens_seen": 987448, "step": 1505 }, { "epoch": 0.1655157294749534, "grad_norm": 6.105933666229248, "learning_rate": 4.669996688839453e-05, "loss": 3.2884, "num_input_tokens_seen": 990840, "step": 1510 }, { "epoch": 0.16606379480434066, "grad_norm": 8.247055053710938, "learning_rate": 4.6678560052460994e-05, "loss": 3.1378, "num_input_tokens_seen": 994768, "step": 1515 }, { "epoch": 0.16661186013372795, "grad_norm": 5.653783798217773, "learning_rate": 4.6657088948535776e-05, "loss": 3.7376, "num_input_tokens_seen": 997840, "step": 1520 }, { "epoch": 0.1671599254631152, "grad_norm": 5.42575216293335, "learning_rate": 4.6635553640271835e-05, "loss": 3.4831, "num_input_tokens_seen": 1000536, "step": 1525 }, { "epoch": 0.16770799079250245, "grad_norm": 7.640921115875244, "learning_rate": 4.6613954191512474e-05, "loss": 3.5714, "num_input_tokens_seen": 1003952, "step": 1530 }, { "epoch": 0.16825605612188974, "grad_norm": 
5.931758880615234, "learning_rate": 4.6592290666291163e-05, "loss": 3.4493, "num_input_tokens_seen": 1006544, "step": 1535 }, { "epoch": 0.168804121451277, "grad_norm": 4.96866512298584, "learning_rate": 4.657056312883132e-05, "loss": 3.0963, "num_input_tokens_seen": 1009920, "step": 1540 }, { "epoch": 0.16935218678066424, "grad_norm": 7.009856224060059, "learning_rate": 4.6548771643546134e-05, "loss": 3.0819, "num_input_tokens_seen": 1012544, "step": 1545 }, { "epoch": 0.16990025211005153, "grad_norm": 6.719354629516602, "learning_rate": 4.652691627503837e-05, "loss": 3.3187, "num_input_tokens_seen": 1015248, "step": 1550 }, { "epoch": 0.17044831743943878, "grad_norm": 7.1751837730407715, "learning_rate": 4.650499708810018e-05, "loss": 3.6579, "num_input_tokens_seen": 1018720, "step": 1555 }, { "epoch": 0.17099638276882603, "grad_norm": 11.277824401855469, "learning_rate": 4.648301414771293e-05, "loss": 3.5192, "num_input_tokens_seen": 1021424, "step": 1560 }, { "epoch": 0.17154444809821331, "grad_norm": 9.307093620300293, "learning_rate": 4.646096751904696e-05, "loss": 3.2431, "num_input_tokens_seen": 1024192, "step": 1565 }, { "epoch": 0.17209251342760057, "grad_norm": 6.657312393188477, "learning_rate": 4.643885726746143e-05, "loss": 3.1878, "num_input_tokens_seen": 1027600, "step": 1570 }, { "epoch": 0.17264057875698782, "grad_norm": 5.908510208129883, "learning_rate": 4.641668345850414e-05, "loss": 3.67, "num_input_tokens_seen": 1030168, "step": 1575 }, { "epoch": 0.1731886440863751, "grad_norm": 6.540554046630859, "learning_rate": 4.639444615791128e-05, "loss": 2.9285, "num_input_tokens_seen": 1034472, "step": 1580 }, { "epoch": 0.17373670941576236, "grad_norm": 6.857239723205566, "learning_rate": 4.6372145431607264e-05, "loss": 3.3879, "num_input_tokens_seen": 1038520, "step": 1585 }, { "epoch": 0.1742847747451496, "grad_norm": 5.343799591064453, "learning_rate": 4.634978134570456e-05, "loss": 3.3824, "num_input_tokens_seen": 1041864, "step": 1590 }, { 
"epoch": 0.1748328400745369, "grad_norm": 5.971281051635742, "learning_rate": 4.632735396650346e-05, "loss": 3.5344, "num_input_tokens_seen": 1045192, "step": 1595 }, { "epoch": 0.17538090540392415, "grad_norm": 5.474274158477783, "learning_rate": 4.6304863360491906e-05, "loss": 3.0682, "num_input_tokens_seen": 1048680, "step": 1600 }, { "epoch": 0.1759289707333114, "grad_norm": 6.720623970031738, "learning_rate": 4.6282309594345266e-05, "loss": 3.0808, "num_input_tokens_seen": 1051776, "step": 1605 }, { "epoch": 0.17647703606269868, "grad_norm": 6.88260555267334, "learning_rate": 4.625969273492614e-05, "loss": 3.5346, "num_input_tokens_seen": 1054256, "step": 1610 }, { "epoch": 0.17702510139208594, "grad_norm": 6.154021263122559, "learning_rate": 4.623701284928421e-05, "loss": 3.2947, "num_input_tokens_seen": 1057536, "step": 1615 }, { "epoch": 0.1775731667214732, "grad_norm": 6.108212471008301, "learning_rate": 4.6214270004655985e-05, "loss": 3.3287, "num_input_tokens_seen": 1060872, "step": 1620 }, { "epoch": 0.17812123205086047, "grad_norm": 4.82647705078125, "learning_rate": 4.6191464268464614e-05, "loss": 3.3231, "num_input_tokens_seen": 1063536, "step": 1625 }, { "epoch": 0.17866929738024773, "grad_norm": 6.965377330780029, "learning_rate": 4.61685957083197e-05, "loss": 3.5096, "num_input_tokens_seen": 1066392, "step": 1630 }, { "epoch": 0.17921736270963498, "grad_norm": 7.133657455444336, "learning_rate": 4.6145664392017096e-05, "loss": 3.2534, "num_input_tokens_seen": 1068920, "step": 1635 }, { "epoch": 0.17976542803902226, "grad_norm": 8.859077453613281, "learning_rate": 4.6122670387538704e-05, "loss": 3.2012, "num_input_tokens_seen": 1071696, "step": 1640 }, { "epoch": 0.18031349336840952, "grad_norm": 6.119090557098389, "learning_rate": 4.6099613763052264e-05, "loss": 3.6088, "num_input_tokens_seen": 1074720, "step": 1645 }, { "epoch": 0.18086155869779677, "grad_norm": 6.804201126098633, "learning_rate": 4.607649458691115e-05, "loss": 3.2794, 
"num_input_tokens_seen": 1077944, "step": 1650 }, { "epoch": 0.18140962402718405, "grad_norm": 7.389477729797363, "learning_rate": 4.60533129276542e-05, "loss": 3.4432, "num_input_tokens_seen": 1080792, "step": 1655 }, { "epoch": 0.1819576893565713, "grad_norm": 5.930356502532959, "learning_rate": 4.6030068854005476e-05, "loss": 3.2158, "num_input_tokens_seen": 1083520, "step": 1660 }, { "epoch": 0.18250575468595856, "grad_norm": 6.847218036651611, "learning_rate": 4.6006762434874065e-05, "loss": 3.4395, "num_input_tokens_seen": 1086128, "step": 1665 }, { "epoch": 0.18305382001534584, "grad_norm": 9.511390686035156, "learning_rate": 4.598339373935389e-05, "loss": 3.2795, "num_input_tokens_seen": 1088560, "step": 1670 }, { "epoch": 0.1836018853447331, "grad_norm": 4.90114688873291, "learning_rate": 4.595996283672349e-05, "loss": 3.2474, "num_input_tokens_seen": 1091832, "step": 1675 }, { "epoch": 0.18414995067412035, "grad_norm": 9.29576301574707, "learning_rate": 4.5936469796445854e-05, "loss": 3.3011, "num_input_tokens_seen": 1095048, "step": 1680 }, { "epoch": 0.18469801600350763, "grad_norm": 6.643434524536133, "learning_rate": 4.5912914688168134e-05, "loss": 3.4029, "num_input_tokens_seen": 1097704, "step": 1685 }, { "epoch": 0.18524608133289489, "grad_norm": 4.961350440979004, "learning_rate": 4.5889297581721526e-05, "loss": 3.0958, "num_input_tokens_seen": 1100736, "step": 1690 }, { "epoch": 0.18579414666228214, "grad_norm": 7.057353496551514, "learning_rate": 4.5865618547121016e-05, "loss": 3.1003, "num_input_tokens_seen": 1104184, "step": 1695 }, { "epoch": 0.18634221199166942, "grad_norm": 3.688004970550537, "learning_rate": 4.584187765456516e-05, "loss": 3.5992, "num_input_tokens_seen": 1107880, "step": 1700 }, { "epoch": 0.18689027732105667, "grad_norm": 6.79044246673584, "learning_rate": 4.5818074974435935e-05, "loss": 3.5112, "num_input_tokens_seen": 1110728, "step": 1705 }, { "epoch": 0.18743834265044393, "grad_norm": 5.125957489013672, 
"learning_rate": 4.579421057729846e-05, "loss": 3.4606, "num_input_tokens_seen": 1113632, "step": 1710 }, { "epoch": 0.18798640797983118, "grad_norm": 6.708007335662842, "learning_rate": 4.577028453390084e-05, "loss": 3.4139, "num_input_tokens_seen": 1117248, "step": 1715 }, { "epoch": 0.18853447330921846, "grad_norm": 4.76835298538208, "learning_rate": 4.5746296915173924e-05, "loss": 3.4408, "num_input_tokens_seen": 1120600, "step": 1720 }, { "epoch": 0.18908253863860572, "grad_norm": 6.29659366607666, "learning_rate": 4.572224779223111e-05, "loss": 3.4817, "num_input_tokens_seen": 1123856, "step": 1725 }, { "epoch": 0.18963060396799297, "grad_norm": 9.75003433227539, "learning_rate": 4.569813723636813e-05, "loss": 3.5152, "num_input_tokens_seen": 1127872, "step": 1730 }, { "epoch": 0.19017866929738025, "grad_norm": 6.846242427825928, "learning_rate": 4.567396531906285e-05, "loss": 3.4197, "num_input_tokens_seen": 1131656, "step": 1735 }, { "epoch": 0.1907267346267675, "grad_norm": 6.956099033355713, "learning_rate": 4.564973211197503e-05, "loss": 3.5098, "num_input_tokens_seen": 1135160, "step": 1740 }, { "epoch": 0.19127479995615476, "grad_norm": 5.187982559204102, "learning_rate": 4.562543768694614e-05, "loss": 3.2708, "num_input_tokens_seen": 1137640, "step": 1745 }, { "epoch": 0.19182286528554204, "grad_norm": 6.0655035972595215, "learning_rate": 4.5601082115999126e-05, "loss": 3.1415, "num_input_tokens_seen": 1140624, "step": 1750 }, { "epoch": 0.1923709306149293, "grad_norm": 7.111659049987793, "learning_rate": 4.557666547133822e-05, "loss": 3.419, "num_input_tokens_seen": 1143352, "step": 1755 }, { "epoch": 0.19291899594431655, "grad_norm": 5.601785659790039, "learning_rate": 4.55521878253487e-05, "loss": 3.1537, "num_input_tokens_seen": 1146552, "step": 1760 }, { "epoch": 0.19346706127370383, "grad_norm": 5.885753154754639, "learning_rate": 4.5527649250596705e-05, "loss": 3.1606, "num_input_tokens_seen": 1150064, "step": 1765 }, { "epoch": 
0.1940151266030911, "grad_norm": 7.787903785705566, "learning_rate": 4.5503049819828975e-05, "loss": 3.5314, "num_input_tokens_seen": 1152720, "step": 1770 }, { "epoch": 0.19456319193247834, "grad_norm": 6.6935133934021, "learning_rate": 4.5478389605972695e-05, "loss": 3.2798, "num_input_tokens_seen": 1155704, "step": 1775 }, { "epoch": 0.19511125726186562, "grad_norm": 5.613322734832764, "learning_rate": 4.545366868213521e-05, "loss": 2.9432, "num_input_tokens_seen": 1159064, "step": 1780 }, { "epoch": 0.19565932259125288, "grad_norm": 5.332114219665527, "learning_rate": 4.542888712160389e-05, "loss": 3.417, "num_input_tokens_seen": 1162384, "step": 1785 }, { "epoch": 0.19620738792064013, "grad_norm": 5.810116291046143, "learning_rate": 4.540404499784582e-05, "loss": 3.4744, "num_input_tokens_seen": 1165168, "step": 1790 }, { "epoch": 0.1967554532500274, "grad_norm": 6.959201335906982, "learning_rate": 4.537914238450768e-05, "loss": 3.6205, "num_input_tokens_seen": 1168288, "step": 1795 }, { "epoch": 0.19730351857941467, "grad_norm": 7.266166687011719, "learning_rate": 4.535417935541543e-05, "loss": 3.5834, "num_input_tokens_seen": 1170536, "step": 1800 }, { "epoch": 0.19785158390880192, "grad_norm": 6.565328598022461, "learning_rate": 4.5329155984574154e-05, "loss": 3.094, "num_input_tokens_seen": 1174016, "step": 1805 }, { "epoch": 0.1983996492381892, "grad_norm": 6.1436944007873535, "learning_rate": 4.5304072346167846e-05, "loss": 3.6874, "num_input_tokens_seen": 1177584, "step": 1810 }, { "epoch": 0.19894771456757646, "grad_norm": 6.344284534454346, "learning_rate": 4.527892851455915e-05, "loss": 3.5916, "num_input_tokens_seen": 1180544, "step": 1815 }, { "epoch": 0.1994957798969637, "grad_norm": 6.047328472137451, "learning_rate": 4.5253724564289144e-05, "loss": 3.1019, "num_input_tokens_seen": 1184376, "step": 1820 }, { "epoch": 0.200043845226351, "grad_norm": 5.976099491119385, "learning_rate": 4.522846057007716e-05, "loss": 3.0793, "num_input_tokens_seen": 
1187280, "step": 1825 }, { "epoch": 0.20059191055573825, "grad_norm": 6.050201892852783, "learning_rate": 4.5203136606820515e-05, "loss": 3.1914, "num_input_tokens_seen": 1190952, "step": 1830 }, { "epoch": 0.2011399758851255, "grad_norm": 5.573675632476807, "learning_rate": 4.517775274959434e-05, "loss": 3.3849, "num_input_tokens_seen": 1194568, "step": 1835 }, { "epoch": 0.20168804121451278, "grad_norm": 10.978282928466797, "learning_rate": 4.5152309073651266e-05, "loss": 3.3821, "num_input_tokens_seen": 1197992, "step": 1840 }, { "epoch": 0.20223610654390003, "grad_norm": 6.215994358062744, "learning_rate": 4.512680565442133e-05, "loss": 2.9822, "num_input_tokens_seen": 1201456, "step": 1845 }, { "epoch": 0.2027841718732873, "grad_norm": 5.15269660949707, "learning_rate": 4.510124256751166e-05, "loss": 3.0034, "num_input_tokens_seen": 1205552, "step": 1850 }, { "epoch": 0.20333223720267457, "grad_norm": 8.590337753295898, "learning_rate": 4.507561988870624e-05, "loss": 3.3385, "num_input_tokens_seen": 1208496, "step": 1855 }, { "epoch": 0.20388030253206182, "grad_norm": 6.038626194000244, "learning_rate": 4.5049937693965764e-05, "loss": 3.3063, "num_input_tokens_seen": 1211856, "step": 1860 }, { "epoch": 0.20442836786144908, "grad_norm": 6.621918678283691, "learning_rate": 4.502419605942735e-05, "loss": 3.2243, "num_input_tokens_seen": 1216152, "step": 1865 }, { "epoch": 0.20497643319083636, "grad_norm": 6.029962062835693, "learning_rate": 4.499839506140433e-05, "loss": 3.4138, "num_input_tokens_seen": 1219840, "step": 1870 }, { "epoch": 0.20552449852022361, "grad_norm": 7.1330952644348145, "learning_rate": 4.497253477638602e-05, "loss": 3.3366, "num_input_tokens_seen": 1222888, "step": 1875 }, { "epoch": 0.20607256384961087, "grad_norm": 7.775686264038086, "learning_rate": 4.494661528103751e-05, "loss": 3.1706, "num_input_tokens_seen": 1227096, "step": 1880 }, { "epoch": 0.20662062917899815, "grad_norm": 8.789952278137207, "learning_rate": 
4.492063665219941e-05, "loss": 3.4648, "num_input_tokens_seen": 1230856, "step": 1885 }, { "epoch": 0.2071686945083854, "grad_norm": 7.492274284362793, "learning_rate": 4.489459896688764e-05, "loss": 3.6099, "num_input_tokens_seen": 1234160, "step": 1890 }, { "epoch": 0.20771675983777266, "grad_norm": 6.971865177154541, "learning_rate": 4.48685023022932e-05, "loss": 3.037, "num_input_tokens_seen": 1236904, "step": 1895 }, { "epoch": 0.20826482516715994, "grad_norm": 9.107683181762695, "learning_rate": 4.484234673578196e-05, "loss": 3.435, "num_input_tokens_seen": 1239936, "step": 1900 }, { "epoch": 0.2088128904965472, "grad_norm": 6.467232704162598, "learning_rate": 4.4816132344894354e-05, "loss": 3.6629, "num_input_tokens_seen": 1242952, "step": 1905 }, { "epoch": 0.20936095582593445, "grad_norm": 6.295756816864014, "learning_rate": 4.4789859207345274e-05, "loss": 3.1083, "num_input_tokens_seen": 1246560, "step": 1910 }, { "epoch": 0.2099090211553217, "grad_norm": 5.817240238189697, "learning_rate": 4.4763527401023724e-05, "loss": 3.2389, "num_input_tokens_seen": 1249904, "step": 1915 }, { "epoch": 0.21045708648470898, "grad_norm": 7.3531317710876465, "learning_rate": 4.473713700399266e-05, "loss": 3.1022, "num_input_tokens_seen": 1252272, "step": 1920 }, { "epoch": 0.21100515181409624, "grad_norm": 7.078802108764648, "learning_rate": 4.471068809448872e-05, "loss": 3.2372, "num_input_tokens_seen": 1255904, "step": 1925 }, { "epoch": 0.2115532171434835, "grad_norm": 5.776179313659668, "learning_rate": 4.468418075092201e-05, "loss": 3.2817, "num_input_tokens_seen": 1259024, "step": 1930 }, { "epoch": 0.21210128247287077, "grad_norm": 9.986640930175781, "learning_rate": 4.465761505187589e-05, "loss": 3.349, "num_input_tokens_seen": 1262584, "step": 1935 }, { "epoch": 0.21264934780225803, "grad_norm": 8.421146392822266, "learning_rate": 4.463099107610669e-05, "loss": 3.2711, "num_input_tokens_seen": 1266072, "step": 1940 }, { "epoch": 0.21319741313164528, "grad_norm": 
8.646468162536621, "learning_rate": 4.460430890254353e-05, "loss": 3.264, "num_input_tokens_seen": 1269528, "step": 1945 }, { "epoch": 0.21374547846103256, "grad_norm": 6.439562797546387, "learning_rate": 4.457756861028804e-05, "loss": 3.2899, "num_input_tokens_seen": 1272200, "step": 1950 }, { "epoch": 0.21429354379041982, "grad_norm": 8.170503616333008, "learning_rate": 4.455077027861417e-05, "loss": 3.3649, "num_input_tokens_seen": 1275360, "step": 1955 }, { "epoch": 0.21484160911980707, "grad_norm": 6.329521179199219, "learning_rate": 4.452391398696794e-05, "loss": 3.4714, "num_input_tokens_seen": 1278480, "step": 1960 }, { "epoch": 0.21538967444919435, "grad_norm": 7.618672847747803, "learning_rate": 4.449699981496714e-05, "loss": 3.1889, "num_input_tokens_seen": 1281312, "step": 1965 }, { "epoch": 0.2159377397785816, "grad_norm": 5.937787055969238, "learning_rate": 4.447002784240122e-05, "loss": 3.2998, "num_input_tokens_seen": 1284456, "step": 1970 }, { "epoch": 0.21648580510796886, "grad_norm": 6.004344463348389, "learning_rate": 4.444299814923096e-05, "loss": 3.5535, "num_input_tokens_seen": 1287512, "step": 1975 }, { "epoch": 0.21703387043735614, "grad_norm": 6.512199878692627, "learning_rate": 4.4415910815588235e-05, "loss": 3.4036, "num_input_tokens_seen": 1290336, "step": 1980 }, { "epoch": 0.2175819357667434, "grad_norm": 6.4987616539001465, "learning_rate": 4.438876592177584e-05, "loss": 3.6318, "num_input_tokens_seen": 1292832, "step": 1985 }, { "epoch": 0.21813000109613065, "grad_norm": 5.955297946929932, "learning_rate": 4.4361563548267186e-05, "loss": 3.4087, "num_input_tokens_seen": 1296336, "step": 1990 }, { "epoch": 0.21867806642551793, "grad_norm": 9.001585960388184, "learning_rate": 4.4334303775706087e-05, "loss": 3.0256, "num_input_tokens_seen": 1299928, "step": 1995 }, { "epoch": 0.21922613175490518, "grad_norm": 8.543002128601074, "learning_rate": 4.4306986684906534e-05, "loss": 3.0983, "num_input_tokens_seen": 1303344, "step": 2000 }, { 
"epoch": 0.21977419708429244, "grad_norm": 5.445712089538574, "learning_rate": 4.427961235685245e-05, "loss": 3.5193, "num_input_tokens_seen": 1306536, "step": 2005 }, { "epoch": 0.22032226241367972, "grad_norm": 4.273796558380127, "learning_rate": 4.4252180872697403e-05, "loss": 3.036, "num_input_tokens_seen": 1311056, "step": 2010 }, { "epoch": 0.22087032774306697, "grad_norm": 5.357060432434082, "learning_rate": 4.422469231376445e-05, "loss": 3.2927, "num_input_tokens_seen": 1314432, "step": 2015 }, { "epoch": 0.22141839307245423, "grad_norm": 6.554574012756348, "learning_rate": 4.4197146761545825e-05, "loss": 3.4088, "num_input_tokens_seen": 1317568, "step": 2020 }, { "epoch": 0.2219664584018415, "grad_norm": 5.920197486877441, "learning_rate": 4.4169544297702745e-05, "loss": 3.1075, "num_input_tokens_seen": 1321288, "step": 2025 }, { "epoch": 0.22251452373122876, "grad_norm": 5.399965763092041, "learning_rate": 4.414188500406513e-05, "loss": 3.023, "num_input_tokens_seen": 1324832, "step": 2030 }, { "epoch": 0.22306258906061602, "grad_norm": 4.449610710144043, "learning_rate": 4.411416896263137e-05, "loss": 3.2649, "num_input_tokens_seen": 1327992, "step": 2035 }, { "epoch": 0.2236106543900033, "grad_norm": 5.2429304122924805, "learning_rate": 4.408639625556812e-05, "loss": 3.2027, "num_input_tokens_seen": 1331448, "step": 2040 }, { "epoch": 0.22415871971939055, "grad_norm": 5.563135623931885, "learning_rate": 4.405856696520998e-05, "loss": 3.0106, "num_input_tokens_seen": 1334672, "step": 2045 }, { "epoch": 0.2247067850487778, "grad_norm": 9.401083946228027, "learning_rate": 4.403068117405933e-05, "loss": 3.5604, "num_input_tokens_seen": 1338664, "step": 2050 }, { "epoch": 0.2252548503781651, "grad_norm": 6.381105899810791, "learning_rate": 4.4002738964786047e-05, "loss": 3.1456, "num_input_tokens_seen": 1341320, "step": 2055 }, { "epoch": 0.22580291570755234, "grad_norm": 8.379097938537598, "learning_rate": 4.397474042022727e-05, "loss": 3.7295, 
"num_input_tokens_seen": 1344712, "step": 2060 }, { "epoch": 0.2263509810369396, "grad_norm": 5.414994239807129, "learning_rate": 4.394668562338711e-05, "loss": 3.2339, "num_input_tokens_seen": 1348704, "step": 2065 }, { "epoch": 0.22689904636632688, "grad_norm": 6.6783447265625, "learning_rate": 4.391857465743649e-05, "loss": 3.1633, "num_input_tokens_seen": 1352136, "step": 2070 }, { "epoch": 0.22744711169571413, "grad_norm": 6.781215667724609, "learning_rate": 4.389040760571284e-05, "loss": 3.2454, "num_input_tokens_seen": 1355704, "step": 2075 }, { "epoch": 0.2279951770251014, "grad_norm": 8.376158714294434, "learning_rate": 4.386218455171984e-05, "loss": 3.2688, "num_input_tokens_seen": 1358224, "step": 2080 }, { "epoch": 0.22854324235448867, "grad_norm": 6.815377712249756, "learning_rate": 4.383390557912722e-05, "loss": 3.2047, "num_input_tokens_seen": 1361624, "step": 2085 }, { "epoch": 0.22909130768387592, "grad_norm": 9.893330574035645, "learning_rate": 4.380557077177046e-05, "loss": 3.3861, "num_input_tokens_seen": 1365672, "step": 2090 }, { "epoch": 0.22963937301326318, "grad_norm": 5.984465599060059, "learning_rate": 4.3777180213650587e-05, "loss": 3.2901, "num_input_tokens_seen": 1368440, "step": 2095 }, { "epoch": 0.23018743834265046, "grad_norm": 8.21902847290039, "learning_rate": 4.37487339889339e-05, "loss": 3.135, "num_input_tokens_seen": 1370736, "step": 2100 }, { "epoch": 0.2307355036720377, "grad_norm": 7.617781639099121, "learning_rate": 4.3720232181951726e-05, "loss": 3.2967, "num_input_tokens_seen": 1373632, "step": 2105 }, { "epoch": 0.23128356900142497, "grad_norm": 5.901704788208008, "learning_rate": 4.3691674877200164e-05, "loss": 3.0304, "num_input_tokens_seen": 1376840, "step": 2110 }, { "epoch": 0.23183163433081222, "grad_norm": 7.1147074699401855, "learning_rate": 4.3663062159339855e-05, "loss": 3.2797, "num_input_tokens_seen": 1380024, "step": 2115 }, { "epoch": 0.2323796996601995, "grad_norm": 6.9793243408203125, "learning_rate": 
4.363439411319571e-05, "loss": 3.6079, "num_input_tokens_seen": 1382992, "step": 2120 }, { "epoch": 0.23292776498958676, "grad_norm": 5.454427242279053, "learning_rate": 4.360567082375666e-05, "loss": 3.1035, "num_input_tokens_seen": 1385936, "step": 2125 }, { "epoch": 0.233475830318974, "grad_norm": 9.776113510131836, "learning_rate": 4.3576892376175414e-05, "loss": 3.1049, "num_input_tokens_seen": 1389176, "step": 2130 }, { "epoch": 0.2340238956483613, "grad_norm": 5.588262557983398, "learning_rate": 4.3553829961575053e-05, "loss": 3.0589, "num_input_tokens_seen": 1392080, "step": 2135 }, { "epoch": 0.23457196097774854, "grad_norm": 7.208589553833008, "learning_rate": 4.352495244444449e-05, "loss": 3.3501, "num_input_tokens_seen": 1395360, "step": 2140 }, { "epoch": 0.2351200263071358, "grad_norm": 5.150116920471191, "learning_rate": 4.349602000846844e-05, "loss": 3.4204, "num_input_tokens_seen": 1398760, "step": 2145 }, { "epoch": 0.23566809163652308, "grad_norm": 7.456035137176514, "learning_rate": 4.346703273941965e-05, "loss": 2.9937, "num_input_tokens_seen": 1402384, "step": 2150 }, { "epoch": 0.23621615696591033, "grad_norm": 5.8624067306518555, "learning_rate": 4.3437990723233416e-05, "loss": 3.233, "num_input_tokens_seen": 1406152, "step": 2155 }, { "epoch": 0.2367642222952976, "grad_norm": 5.129085063934326, "learning_rate": 4.3408894046007354e-05, "loss": 3.3833, "num_input_tokens_seen": 1409704, "step": 2160 }, { "epoch": 0.23731228762468487, "grad_norm": 7.074642658233643, "learning_rate": 4.337974279400111e-05, "loss": 3.2288, "num_input_tokens_seen": 1412984, "step": 2165 }, { "epoch": 0.23786035295407212, "grad_norm": 7.073869228363037, "learning_rate": 4.335053705363611e-05, "loss": 3.1338, "num_input_tokens_seen": 1416232, "step": 2170 }, { "epoch": 0.23840841828345938, "grad_norm": 6.7071990966796875, "learning_rate": 4.332127691149535e-05, "loss": 3.1272, "num_input_tokens_seen": 1419904, "step": 2175 }, { "epoch": 0.23895648361284666, 
"grad_norm": 8.463297843933105, "learning_rate": 4.3291962454323076e-05, "loss": 3.3227, "num_input_tokens_seen": 1423048, "step": 2180 }, { "epoch": 0.2395045489422339, "grad_norm": 7.098794460296631, "learning_rate": 4.3262593769024576e-05, "loss": 3.1422, "num_input_tokens_seen": 1425568, "step": 2185 }, { "epoch": 0.24005261427162117, "grad_norm": 5.919711589813232, "learning_rate": 4.323317094266589e-05, "loss": 3.0584, "num_input_tokens_seen": 1429464, "step": 2190 }, { "epoch": 0.24060067960100845, "grad_norm": 5.311784267425537, "learning_rate": 4.320369406247356e-05, "loss": 2.8391, "num_input_tokens_seen": 1432832, "step": 2195 }, { "epoch": 0.2411487449303957, "grad_norm": 6.239211559295654, "learning_rate": 4.317416321583437e-05, "loss": 3.1701, "num_input_tokens_seen": 1435960, "step": 2200 }, { "epoch": 0.24169681025978296, "grad_norm": 9.268356323242188, "learning_rate": 4.314457849029513e-05, "loss": 3.3796, "num_input_tokens_seen": 1439752, "step": 2205 }, { "epoch": 0.24224487558917024, "grad_norm": 7.6005449295043945, "learning_rate": 4.311493997356234e-05, "loss": 3.189, "num_input_tokens_seen": 1442488, "step": 2210 }, { "epoch": 0.2427929409185575, "grad_norm": 6.128123760223389, "learning_rate": 4.308524775350198e-05, "loss": 3.2867, "num_input_tokens_seen": 1445800, "step": 2215 }, { "epoch": 0.24334100624794475, "grad_norm": 6.555956840515137, "learning_rate": 4.305550191813923e-05, "loss": 3.1985, "num_input_tokens_seen": 1448992, "step": 2220 }, { "epoch": 0.24388907157733203, "grad_norm": 6.0009446144104, "learning_rate": 4.302570255565825e-05, "loss": 3.1752, "num_input_tokens_seen": 1452104, "step": 2225 }, { "epoch": 0.24443713690671928, "grad_norm": 5.329344749450684, "learning_rate": 4.299584975440184e-05, "loss": 2.9533, "num_input_tokens_seen": 1457016, "step": 2230 }, { "epoch": 0.24498520223610654, "grad_norm": 4.869180202484131, "learning_rate": 4.296594360287126e-05, "loss": 2.9869, "num_input_tokens_seen": 1459624, "step": 
2235 }, { "epoch": 0.24553326756549382, "grad_norm": 6.4714202880859375, "learning_rate": 4.293598418972592e-05, "loss": 3.2594, "num_input_tokens_seen": 1462696, "step": 2240 }, { "epoch": 0.24608133289488107, "grad_norm": 10.35406494140625, "learning_rate": 4.2905971603783116e-05, "loss": 3.164, "num_input_tokens_seen": 1466832, "step": 2245 }, { "epoch": 0.24662939822426833, "grad_norm": 5.773983001708984, "learning_rate": 4.287590593401778e-05, "loss": 3.2342, "num_input_tokens_seen": 1470288, "step": 2250 }, { "epoch": 0.2471774635536556, "grad_norm": 5.758610248565674, "learning_rate": 4.284578726956225e-05, "loss": 3.38, "num_input_tokens_seen": 1473032, "step": 2255 }, { "epoch": 0.24772552888304286, "grad_norm": 7.092349529266357, "learning_rate": 4.2815615699705943e-05, "loss": 3.1884, "num_input_tokens_seen": 1476104, "step": 2260 }, { "epoch": 0.24827359421243012, "grad_norm": 8.047478675842285, "learning_rate": 4.2785391313895103e-05, "loss": 3.3215, "num_input_tokens_seen": 1479376, "step": 2265 }, { "epoch": 0.2488216595418174, "grad_norm": 7.5882439613342285, "learning_rate": 4.27551142017326e-05, "loss": 3.0476, "num_input_tokens_seen": 1482248, "step": 2270 }, { "epoch": 0.24936972487120465, "grad_norm": 5.922421932220459, "learning_rate": 4.2724784452977565e-05, "loss": 3.3373, "num_input_tokens_seen": 1485232, "step": 2275 }, { "epoch": 0.2499177902005919, "grad_norm": 6.161900520324707, "learning_rate": 4.26944021575452e-05, "loss": 3.0011, "num_input_tokens_seen": 1488896, "step": 2280 }, { "epoch": 0.2504658555299792, "grad_norm": 7.3562397956848145, "learning_rate": 4.2663967405506486e-05, "loss": 2.9991, "num_input_tokens_seen": 1492072, "step": 2285 }, { "epoch": 0.2510139208593664, "grad_norm": 6.788776397705078, "learning_rate": 4.263348028708792e-05, "loss": 2.9735, "num_input_tokens_seen": 1495224, "step": 2290 }, { "epoch": 0.2515619861887537, "grad_norm": 8.632386207580566, "learning_rate": 4.260294089267123e-05, "loss": 3.2221, 
"num_input_tokens_seen": 1498256, "step": 2295 }, { "epoch": 0.252110051518141, "grad_norm": 6.462652683258057, "learning_rate": 4.257234931279313e-05, "loss": 2.8929, "num_input_tokens_seen": 1501824, "step": 2300 }, { "epoch": 0.2526581168475282, "grad_norm": 7.380079746246338, "learning_rate": 4.254170563814505e-05, "loss": 3.2545, "num_input_tokens_seen": 1504768, "step": 2305 }, { "epoch": 0.2532061821769155, "grad_norm": 5.370420455932617, "learning_rate": 4.2511009959572826e-05, "loss": 3.4558, "num_input_tokens_seen": 1508056, "step": 2310 }, { "epoch": 0.25375424750630277, "grad_norm": 5.953249454498291, "learning_rate": 4.2480262368076504e-05, "loss": 3.2177, "num_input_tokens_seen": 1511920, "step": 2315 }, { "epoch": 0.25430231283569, "grad_norm": 5.694786548614502, "learning_rate": 4.244946295481001e-05, "loss": 3.2378, "num_input_tokens_seen": 1514936, "step": 2320 }, { "epoch": 0.2548503781650773, "grad_norm": 7.257277965545654, "learning_rate": 4.241861181108092e-05, "loss": 3.616, "num_input_tokens_seen": 1518416, "step": 2325 }, { "epoch": 0.25539844349446456, "grad_norm": 6.388315200805664, "learning_rate": 4.238770902835013e-05, "loss": 3.2898, "num_input_tokens_seen": 1521960, "step": 2330 }, { "epoch": 0.2559465088238518, "grad_norm": 8.813338279724121, "learning_rate": 4.235675469823166e-05, "loss": 3.4491, "num_input_tokens_seen": 1525312, "step": 2335 }, { "epoch": 0.25649457415323906, "grad_norm": 6.0403947830200195, "learning_rate": 4.232574891249234e-05, "loss": 3.0747, "num_input_tokens_seen": 1528632, "step": 2340 }, { "epoch": 0.25704263948262634, "grad_norm": 6.77452278137207, "learning_rate": 4.229469176305153e-05, "loss": 3.2356, "num_input_tokens_seen": 1532200, "step": 2345 }, { "epoch": 0.25759070481201357, "grad_norm": 6.781161785125732, "learning_rate": 4.2263583341980885e-05, "loss": 3.1273, "num_input_tokens_seen": 1535624, "step": 2350 }, { "epoch": 0.25813877014140085, "grad_norm": 6.070975303649902, "learning_rate": 
4.223242374150402e-05, "loss": 3.0905, "num_input_tokens_seen": 1538504, "step": 2355 }, { "epoch": 0.25868683547078813, "grad_norm": 6.770239353179932, "learning_rate": 4.220121305399634e-05, "loss": 3.2115, "num_input_tokens_seen": 1541520, "step": 2360 }, { "epoch": 0.25923490080017536, "grad_norm": 6.523434638977051, "learning_rate": 4.216995137198463e-05, "loss": 3.2605, "num_input_tokens_seen": 1545656, "step": 2365 }, { "epoch": 0.25978296612956264, "grad_norm": 6.475868225097656, "learning_rate": 4.213863878814691e-05, "loss": 3.2498, "num_input_tokens_seen": 1549464, "step": 2370 }, { "epoch": 0.2603310314589499, "grad_norm": 7.743395805358887, "learning_rate": 4.210727539531206e-05, "loss": 3.0166, "num_input_tokens_seen": 1553408, "step": 2375 }, { "epoch": 0.26087909678833715, "grad_norm": 6.206083297729492, "learning_rate": 4.207586128645963e-05, "loss": 3.2151, "num_input_tokens_seen": 1557112, "step": 2380 }, { "epoch": 0.26142716211772443, "grad_norm": 7.58196496963501, "learning_rate": 4.204439655471949e-05, "loss": 3.5573, "num_input_tokens_seen": 1560984, "step": 2385 }, { "epoch": 0.2619752274471117, "grad_norm": 8.101637840270996, "learning_rate": 4.201288129337158e-05, "loss": 3.4451, "num_input_tokens_seen": 1563808, "step": 2390 }, { "epoch": 0.26252329277649894, "grad_norm": 9.19637680053711, "learning_rate": 4.1981315595845684e-05, "loss": 3.191, "num_input_tokens_seen": 1567344, "step": 2395 }, { "epoch": 0.2630713581058862, "grad_norm": 7.602110862731934, "learning_rate": 4.194969955572105e-05, "loss": 3.7303, "num_input_tokens_seen": 1570104, "step": 2400 }, { "epoch": 0.2636194234352735, "grad_norm": 10.502030372619629, "learning_rate": 4.191803326672622e-05, "loss": 3.2205, "num_input_tokens_seen": 1572864, "step": 2405 }, { "epoch": 0.26416748876466073, "grad_norm": 5.903884410858154, "learning_rate": 4.188631682273868e-05, "loss": 3.5156, "num_input_tokens_seen": 1575720, "step": 2410 }, { "epoch": 0.264715554094048, "grad_norm": 
5.067075729370117, "learning_rate": 4.1854550317784604e-05, "loss": 3.1053, "num_input_tokens_seen": 1579008, "step": 2415 }, { "epoch": 0.2652636194234353, "grad_norm": 6.393657207489014, "learning_rate": 4.1822733846038584e-05, "loss": 3.1813, "num_input_tokens_seen": 1582216, "step": 2420 }, { "epoch": 0.2658116847528225, "grad_norm": 10.575018882751465, "learning_rate": 4.1790867501823345e-05, "loss": 3.7197, "num_input_tokens_seen": 1585440, "step": 2425 }, { "epoch": 0.2663597500822098, "grad_norm": 7.280240535736084, "learning_rate": 4.175895137960945e-05, "loss": 3.0196, "num_input_tokens_seen": 1588248, "step": 2430 }, { "epoch": 0.2669078154115971, "grad_norm": 6.695456504821777, "learning_rate": 4.172698557401503e-05, "loss": 2.9587, "num_input_tokens_seen": 1591288, "step": 2435 }, { "epoch": 0.2674558807409843, "grad_norm": 6.2725653648376465, "learning_rate": 4.169497017980555e-05, "loss": 3.3583, "num_input_tokens_seen": 1595056, "step": 2440 }, { "epoch": 0.2680039460703716, "grad_norm": 6.505600929260254, "learning_rate": 4.166290529189342e-05, "loss": 3.474, "num_input_tokens_seen": 1598096, "step": 2445 }, { "epoch": 0.26855201139975887, "grad_norm": 7.131421089172363, "learning_rate": 4.163079100533783e-05, "loss": 3.2172, "num_input_tokens_seen": 1602648, "step": 2450 }, { "epoch": 0.2691000767291461, "grad_norm": 5.818497657775879, "learning_rate": 4.1598627415344394e-05, "loss": 3.2497, "num_input_tokens_seen": 1605776, "step": 2455 }, { "epoch": 0.2696481420585334, "grad_norm": 8.350225448608398, "learning_rate": 4.156641461726489e-05, "loss": 3.2372, "num_input_tokens_seen": 1609960, "step": 2460 }, { "epoch": 0.27019620738792066, "grad_norm": 10.619945526123047, "learning_rate": 4.153415270659699e-05, "loss": 3.0958, "num_input_tokens_seen": 1612808, "step": 2465 }, { "epoch": 0.2707442727173079, "grad_norm": 6.475553035736084, "learning_rate": 4.150184177898394e-05, "loss": 3.4121, "num_input_tokens_seen": 1616104, "step": 2470 }, { 
"epoch": 0.27129233804669517, "grad_norm": 9.670978546142578, "learning_rate": 4.1469481930214335e-05, "loss": 3.1002, "num_input_tokens_seen": 1618920, "step": 2475 }, { "epoch": 0.27184040337608245, "grad_norm": 5.271237850189209, "learning_rate": 4.1437073256221784e-05, "loss": 3.1366, "num_input_tokens_seen": 1622272, "step": 2480 }, { "epoch": 0.2723884687054697, "grad_norm": 6.107699394226074, "learning_rate": 4.1404615853084626e-05, "loss": 3.5266, "num_input_tokens_seen": 1624928, "step": 2485 }, { "epoch": 0.27293653403485696, "grad_norm": 8.945226669311523, "learning_rate": 4.137210981702568e-05, "loss": 3.627, "num_input_tokens_seen": 1628632, "step": 2490 }, { "epoch": 0.27348459936424424, "grad_norm": 5.393161296844482, "learning_rate": 4.133955524441196e-05, "loss": 3.6371, "num_input_tokens_seen": 1631272, "step": 2495 }, { "epoch": 0.27403266469363147, "grad_norm": 7.735115051269531, "learning_rate": 4.130695223175434e-05, "loss": 3.4529, "num_input_tokens_seen": 1634272, "step": 2500 }, { "epoch": 0.27458073002301875, "grad_norm": 9.375452041625977, "learning_rate": 4.1274300875707295e-05, "loss": 3.2474, "num_input_tokens_seen": 1638000, "step": 2505 }, { "epoch": 0.27512879535240603, "grad_norm": 6.957891464233398, "learning_rate": 4.124160127306864e-05, "loss": 3.0279, "num_input_tokens_seen": 1641896, "step": 2510 }, { "epoch": 0.27567686068179326, "grad_norm": 6.637111663818359, "learning_rate": 4.120885352077922e-05, "loss": 3.5516, "num_input_tokens_seen": 1645288, "step": 2515 }, { "epoch": 0.27622492601118054, "grad_norm": 6.921294212341309, "learning_rate": 4.1176057715922624e-05, "loss": 3.2415, "num_input_tokens_seen": 1648800, "step": 2520 }, { "epoch": 0.2767729913405678, "grad_norm": 6.21347713470459, "learning_rate": 4.114321395572488e-05, "loss": 3.3217, "num_input_tokens_seen": 1652416, "step": 2525 }, { "epoch": 0.27732105666995505, "grad_norm": 7.985599040985107, "learning_rate": 4.111032233755418e-05, "loss": 3.0362, 
"num_input_tokens_seen": 1655720, "step": 2530 }, { "epoch": 0.27786912199934233, "grad_norm": 6.855371952056885, "learning_rate": 4.107738295892063e-05, "loss": 3.0962, "num_input_tokens_seen": 1659440, "step": 2535 }, { "epoch": 0.2784171873287296, "grad_norm": 7.123937129974365, "learning_rate": 4.104439591747591e-05, "loss": 3.102, "num_input_tokens_seen": 1662400, "step": 2540 }, { "epoch": 0.27896525265811684, "grad_norm": 6.53096866607666, "learning_rate": 4.101136131101297e-05, "loss": 2.9064, "num_input_tokens_seen": 1665336, "step": 2545 }, { "epoch": 0.2795133179875041, "grad_norm": 8.0481538772583, "learning_rate": 4.0978279237465825e-05, "loss": 3.103, "num_input_tokens_seen": 1668288, "step": 2550 }, { "epoch": 0.2800613833168914, "grad_norm": 4.704191207885742, "learning_rate": 4.094514979490917e-05, "loss": 2.9912, "num_input_tokens_seen": 1671840, "step": 2555 }, { "epoch": 0.2806094486462786, "grad_norm": 6.396568775177002, "learning_rate": 4.091197308155814e-05, "loss": 3.0125, "num_input_tokens_seen": 1675512, "step": 2560 }, { "epoch": 0.2811575139756659, "grad_norm": 6.377243518829346, "learning_rate": 4.087874919576801e-05, "loss": 2.9588, "num_input_tokens_seen": 1679232, "step": 2565 }, { "epoch": 0.2817055793050532, "grad_norm": 7.850512981414795, "learning_rate": 4.084547823603391e-05, "loss": 3.1181, "num_input_tokens_seen": 1682432, "step": 2570 }, { "epoch": 0.2822536446344404, "grad_norm": 7.351206302642822, "learning_rate": 4.08121603009905e-05, "loss": 3.2493, "num_input_tokens_seen": 1686064, "step": 2575 }, { "epoch": 0.2828017099638277, "grad_norm": 6.765766620635986, "learning_rate": 4.077879548941172e-05, "loss": 2.9447, "num_input_tokens_seen": 1689312, "step": 2580 }, { "epoch": 0.283349775293215, "grad_norm": 6.162474155426025, "learning_rate": 4.0745383900210514e-05, "loss": 3.0923, "num_input_tokens_seen": 1692976, "step": 2585 }, { "epoch": 0.2838978406226022, "grad_norm": 6.094540119171143, "learning_rate": 
4.071192563243843e-05, "loss": 3.4034, "num_input_tokens_seen": 1695344, "step": 2590 }, { "epoch": 0.2844459059519895, "grad_norm": 9.006319999694824, "learning_rate": 4.0678420785285446e-05, "loss": 3.3876, "num_input_tokens_seen": 1698336, "step": 2595 }, { "epoch": 0.28499397128137677, "grad_norm": 7.306302070617676, "learning_rate": 4.064486945807963e-05, "loss": 2.9591, "num_input_tokens_seen": 1703912, "step": 2600 }, { "epoch": 0.285542036610764, "grad_norm": 5.706150054931641, "learning_rate": 4.0611271750286805e-05, "loss": 3.0137, "num_input_tokens_seen": 1707664, "step": 2605 }, { "epoch": 0.2860901019401513, "grad_norm": 7.290525436401367, "learning_rate": 4.057762776151035e-05, "loss": 3.4755, "num_input_tokens_seen": 1710832, "step": 2610 }, { "epoch": 0.2866381672695385, "grad_norm": 7.548462867736816, "learning_rate": 4.054393759149081e-05, "loss": 3.1482, "num_input_tokens_seen": 1713616, "step": 2615 }, { "epoch": 0.2871862325989258, "grad_norm": 7.191598415374756, "learning_rate": 4.051020134010564e-05, "loss": 3.5189, "num_input_tokens_seen": 1717328, "step": 2620 }, { "epoch": 0.28773429792831307, "grad_norm": 5.576016426086426, "learning_rate": 4.0476419107368924e-05, "loss": 3.1058, "num_input_tokens_seen": 1720976, "step": 2625 }, { "epoch": 0.2882823632577003, "grad_norm": 5.512149333953857, "learning_rate": 4.044259099343104e-05, "loss": 3.3606, "num_input_tokens_seen": 1723840, "step": 2630 }, { "epoch": 0.2888304285870876, "grad_norm": 6.475109100341797, "learning_rate": 4.040871709857842e-05, "loss": 3.2876, "num_input_tokens_seen": 1726944, "step": 2635 }, { "epoch": 0.28937849391647485, "grad_norm": 6.24223518371582, "learning_rate": 4.037479752323317e-05, "loss": 3.2583, "num_input_tokens_seen": 1730056, "step": 2640 }, { "epoch": 0.2899265592458621, "grad_norm": 7.499751091003418, "learning_rate": 4.034083236795286e-05, "loss": 3.6548, "num_input_tokens_seen": 1733800, "step": 2645 }, { "epoch": 0.29047462457524936, "grad_norm": 
5.272352695465088, "learning_rate": 4.030682173343016e-05, "loss": 3.345, "num_input_tokens_seen": 1738176, "step": 2650 }, { "epoch": 0.29102268990463664, "grad_norm": 4.747354030609131, "learning_rate": 4.027276572049259e-05, "loss": 2.8691, "num_input_tokens_seen": 1742088, "step": 2655 }, { "epoch": 0.29157075523402387, "grad_norm": 4.695064544677734, "learning_rate": 4.0238664430102175e-05, "loss": 3.3259, "num_input_tokens_seen": 1746032, "step": 2660 }, { "epoch": 0.29211882056341115, "grad_norm": 5.169468402862549, "learning_rate": 4.020451796335518e-05, "loss": 3.193, "num_input_tokens_seen": 1749336, "step": 2665 }, { "epoch": 0.29266688589279843, "grad_norm": 6.7505340576171875, "learning_rate": 4.017032642148181e-05, "loss": 3.1603, "num_input_tokens_seen": 1752808, "step": 2670 }, { "epoch": 0.29321495122218566, "grad_norm": 8.776106834411621, "learning_rate": 4.0136089905845874e-05, "loss": 3.065, "num_input_tokens_seen": 1756768, "step": 2675 }, { "epoch": 0.29376301655157294, "grad_norm": 5.4388203620910645, "learning_rate": 4.010180851794453e-05, "loss": 3.3523, "num_input_tokens_seen": 1759960, "step": 2680 }, { "epoch": 0.2943110818809602, "grad_norm": 7.309511661529541, "learning_rate": 4.006748235940796e-05, "loss": 3.1897, "num_input_tokens_seen": 1763848, "step": 2685 }, { "epoch": 0.29485914721034745, "grad_norm": 7.108086109161377, "learning_rate": 4.003311153199908e-05, "loss": 3.2525, "num_input_tokens_seen": 1767224, "step": 2690 }, { "epoch": 0.29540721253973473, "grad_norm": 6.940639495849609, "learning_rate": 3.99986961376132e-05, "loss": 3.0928, "num_input_tokens_seen": 1770816, "step": 2695 }, { "epoch": 0.295955277869122, "grad_norm": 8.109939575195312, "learning_rate": 3.996423627827778e-05, "loss": 3.2992, "num_input_tokens_seen": 1775144, "step": 2700 }, { "epoch": 0.29650334319850924, "grad_norm": 8.848753929138184, "learning_rate": 3.9929732056152104e-05, "loss": 3.1256, "num_input_tokens_seen": 1777888, "step": 2705 }, { 
"epoch": 0.2970514085278965, "grad_norm": 6.489472389221191, "learning_rate": 3.989518357352695e-05, "loss": 3.0047, "num_input_tokens_seen": 1782160, "step": 2710 }, { "epoch": 0.2975994738572838, "grad_norm": 7.247778415679932, "learning_rate": 3.986059093282433e-05, "loss": 3.075, "num_input_tokens_seen": 1784824, "step": 2715 }, { "epoch": 0.29814753918667103, "grad_norm": 7.691065788269043, "learning_rate": 3.982595423659716e-05, "loss": 3.4486, "num_input_tokens_seen": 1788072, "step": 2720 }, { "epoch": 0.2986956045160583, "grad_norm": 7.700766086578369, "learning_rate": 3.979127358752897e-05, "loss": 3.4979, "num_input_tokens_seen": 1790944, "step": 2725 }, { "epoch": 0.2992436698454456, "grad_norm": 5.059070110321045, "learning_rate": 3.975654908843356e-05, "loss": 3.305, "num_input_tokens_seen": 1794368, "step": 2730 }, { "epoch": 0.2997917351748328, "grad_norm": 6.1541595458984375, "learning_rate": 3.972178084225478e-05, "loss": 3.2146, "num_input_tokens_seen": 1798760, "step": 2735 }, { "epoch": 0.3003398005042201, "grad_norm": 8.040989875793457, "learning_rate": 3.968696895206613e-05, "loss": 3.482, "num_input_tokens_seen": 1801512, "step": 2740 }, { "epoch": 0.3008878658336074, "grad_norm": 5.050278186798096, "learning_rate": 3.9652113521070513e-05, "loss": 3.3143, "num_input_tokens_seen": 1805240, "step": 2745 }, { "epoch": 0.3014359311629946, "grad_norm": 5.1891279220581055, "learning_rate": 3.9617214652599904e-05, "loss": 2.8368, "num_input_tokens_seen": 1809040, "step": 2750 }, { "epoch": 0.3019839964923819, "grad_norm": 6.89003849029541, "learning_rate": 3.958227245011506e-05, "loss": 3.3205, "num_input_tokens_seen": 1812536, "step": 2755 }, { "epoch": 0.30253206182176917, "grad_norm": 6.001296043395996, "learning_rate": 3.954728701720521e-05, "loss": 3.4753, "num_input_tokens_seen": 1816296, "step": 2760 }, { "epoch": 0.3030801271511564, "grad_norm": 4.202249050140381, "learning_rate": 3.951225845758773e-05, "loss": 3.3659, 
"num_input_tokens_seen": 1819896, "step": 2765 }, { "epoch": 0.3036281924805437, "grad_norm": 6.209683418273926, "learning_rate": 3.9477186875107865e-05, "loss": 3.5706, "num_input_tokens_seen": 1823960, "step": 2770 }, { "epoch": 0.30417625780993096, "grad_norm": 5.219339847564697, "learning_rate": 3.944207237373838e-05, "loss": 3.121, "num_input_tokens_seen": 1827176, "step": 2775 }, { "epoch": 0.3047243231393182, "grad_norm": 6.556133270263672, "learning_rate": 3.940691505757931e-05, "loss": 3.1289, "num_input_tokens_seen": 1830016, "step": 2780 }, { "epoch": 0.30527238846870547, "grad_norm": 5.480815887451172, "learning_rate": 3.9371715030857595e-05, "loss": 2.8851, "num_input_tokens_seen": 1833280, "step": 2785 }, { "epoch": 0.30582045379809275, "grad_norm": 4.781624794006348, "learning_rate": 3.933647239792679e-05, "loss": 3.066, "num_input_tokens_seen": 1836784, "step": 2790 }, { "epoch": 0.30636851912748, "grad_norm": 5.901027202606201, "learning_rate": 3.930118726326678e-05, "loss": 3.0618, "num_input_tokens_seen": 1840600, "step": 2795 }, { "epoch": 0.30691658445686726, "grad_norm": 4.3098649978637695, "learning_rate": 3.926585973148344e-05, "loss": 3.0273, "num_input_tokens_seen": 1844456, "step": 2800 }, { "epoch": 0.30746464978625454, "grad_norm": 7.2452521324157715, "learning_rate": 3.923048990730832e-05, "loss": 3.3328, "num_input_tokens_seen": 1847648, "step": 2805 }, { "epoch": 0.30801271511564177, "grad_norm": 9.102137565612793, "learning_rate": 3.9195077895598385e-05, "loss": 3.4577, "num_input_tokens_seen": 1851080, "step": 2810 }, { "epoch": 0.30856078044502905, "grad_norm": 7.165421009063721, "learning_rate": 3.9159623801335635e-05, "loss": 3.2345, "num_input_tokens_seen": 1854544, "step": 2815 }, { "epoch": 0.30910884577441633, "grad_norm": 6.918674468994141, "learning_rate": 3.912412772962685e-05, "loss": 3.3151, "num_input_tokens_seen": 1857488, "step": 2820 }, { "epoch": 0.30965691110380356, "grad_norm": 7.7270660400390625, 
"learning_rate": 3.908858978570324e-05, "loss": 3.0722, "num_input_tokens_seen": 1859744, "step": 2825 }, { "epoch": 0.31020497643319084, "grad_norm": 5.471165657043457, "learning_rate": 3.905301007492016e-05, "loss": 3.3752, "num_input_tokens_seen": 1862520, "step": 2830 }, { "epoch": 0.3107530417625781, "grad_norm": 8.547778129577637, "learning_rate": 3.9017388702756766e-05, "loss": 3.4572, "num_input_tokens_seen": 1865688, "step": 2835 }, { "epoch": 0.31130110709196535, "grad_norm": 5.8289289474487305, "learning_rate": 3.898172577481577e-05, "loss": 3.0442, "num_input_tokens_seen": 1869008, "step": 2840 }, { "epoch": 0.3118491724213526, "grad_norm": 5.646442413330078, "learning_rate": 3.894602139682301e-05, "loss": 3.3365, "num_input_tokens_seen": 1872200, "step": 2845 }, { "epoch": 0.3123972377507399, "grad_norm": 5.7611565589904785, "learning_rate": 3.891027567462727e-05, "loss": 3.0501, "num_input_tokens_seen": 1874936, "step": 2850 }, { "epoch": 0.31294530308012714, "grad_norm": 6.07964563369751, "learning_rate": 3.8874488714199874e-05, "loss": 3.1584, "num_input_tokens_seen": 1877880, "step": 2855 }, { "epoch": 0.3134933684095144, "grad_norm": 6.76899528503418, "learning_rate": 3.883866062163439e-05, "loss": 3.2215, "num_input_tokens_seen": 1880632, "step": 2860 }, { "epoch": 0.3140414337389017, "grad_norm": 9.11755657196045, "learning_rate": 3.880279150314636e-05, "loss": 3.4992, "num_input_tokens_seen": 1883792, "step": 2865 }, { "epoch": 0.3145894990682889, "grad_norm": 4.672335147857666, "learning_rate": 3.876688146507291e-05, "loss": 3.2378, "num_input_tokens_seen": 1887984, "step": 2870 }, { "epoch": 0.3151375643976762, "grad_norm": 8.21897029876709, "learning_rate": 3.873093061387251e-05, "loss": 3.4215, "num_input_tokens_seen": 1890952, "step": 2875 }, { "epoch": 0.3156856297270635, "grad_norm": 6.4296674728393555, "learning_rate": 3.869493905612461e-05, "loss": 3.1436, "num_input_tokens_seen": 1894376, "step": 2880 }, { "epoch": 0.3162336950564507, 
"grad_norm": 6.088110446929932, "learning_rate": 3.8658906898529325e-05, "loss": 3.1597, "num_input_tokens_seen": 1897632, "step": 2885 }, { "epoch": 0.316781760385838, "grad_norm": 7.144382953643799, "learning_rate": 3.8622834247907155e-05, "loss": 3.3071, "num_input_tokens_seen": 1899992, "step": 2890 }, { "epoch": 0.3173298257152253, "grad_norm": 5.95371675491333, "learning_rate": 3.858672121119863e-05, "loss": 3.1272, "num_input_tokens_seen": 1902928, "step": 2895 }, { "epoch": 0.3178778910446125, "grad_norm": 5.033254623413086, "learning_rate": 3.855056789546402e-05, "loss": 3.5104, "num_input_tokens_seen": 1905872, "step": 2900 }, { "epoch": 0.3184259563739998, "grad_norm": 9.2310209274292, "learning_rate": 3.8514374407883e-05, "loss": 3.22, "num_input_tokens_seen": 1910456, "step": 2905 }, { "epoch": 0.31897402170338707, "grad_norm": 13.305641174316406, "learning_rate": 3.847814085575432e-05, "loss": 3.5537, "num_input_tokens_seen": 1914432, "step": 2910 }, { "epoch": 0.3195220870327743, "grad_norm": 4.90524959564209, "learning_rate": 3.844186734649554e-05, "loss": 3.1428, "num_input_tokens_seen": 1917176, "step": 2915 }, { "epoch": 0.3200701523621616, "grad_norm": 7.605042457580566, "learning_rate": 3.840555398764265e-05, "loss": 2.6933, "num_input_tokens_seen": 1919488, "step": 2920 }, { "epoch": 0.32061821769154886, "grad_norm": 6.435617923736572, "learning_rate": 3.836920088684979e-05, "loss": 3.1942, "num_input_tokens_seen": 1922184, "step": 2925 }, { "epoch": 0.3211662830209361, "grad_norm": 5.5276288986206055, "learning_rate": 3.8332808151888906e-05, "loss": 3.3987, "num_input_tokens_seen": 1925760, "step": 2930 }, { "epoch": 0.32171434835032336, "grad_norm": 7.981554985046387, "learning_rate": 3.829637589064946e-05, "loss": 3.107, "num_input_tokens_seen": 1928024, "step": 2935 }, { "epoch": 0.32226241367971065, "grad_norm": 6.667475700378418, "learning_rate": 3.8259904211138074e-05, "loss": 2.8259, "num_input_tokens_seen": 1931992, "step": 2940 }, { 
"epoch": 0.3228104790090979, "grad_norm": 6.904677867889404, "learning_rate": 3.8223393221478257e-05, "loss": 3.3099, "num_input_tokens_seen": 1934432, "step": 2945 }, { "epoch": 0.32335854433848515, "grad_norm": 6.4357008934021, "learning_rate": 3.818684302991001e-05, "loss": 3.5156, "num_input_tokens_seen": 1938288, "step": 2950 }, { "epoch": 0.32390660966787244, "grad_norm": 6.910282611846924, "learning_rate": 3.8150253744789624e-05, "loss": 3.7432, "num_input_tokens_seen": 1941552, "step": 2955 }, { "epoch": 0.32445467499725966, "grad_norm": 6.355223178863525, "learning_rate": 3.811362547458919e-05, "loss": 3.3951, "num_input_tokens_seen": 1944848, "step": 2960 }, { "epoch": 0.32500274032664694, "grad_norm": 5.630364418029785, "learning_rate": 3.807695832789646e-05, "loss": 3.1733, "num_input_tokens_seen": 1947576, "step": 2965 }, { "epoch": 0.3255508056560342, "grad_norm": 7.782848358154297, "learning_rate": 3.80402524134144e-05, "loss": 2.9549, "num_input_tokens_seen": 1950920, "step": 2970 }, { "epoch": 0.32609887098542145, "grad_norm": 6.886142730712891, "learning_rate": 3.8003507839960895e-05, "loss": 3.1884, "num_input_tokens_seen": 1954424, "step": 2975 }, { "epoch": 0.32664693631480873, "grad_norm": 6.035950660705566, "learning_rate": 3.796672471646848e-05, "loss": 2.9874, "num_input_tokens_seen": 1957928, "step": 2980 }, { "epoch": 0.327195001644196, "grad_norm": 8.303248405456543, "learning_rate": 3.7929903151983934e-05, "loss": 3.4268, "num_input_tokens_seen": 1961240, "step": 2985 }, { "epoch": 0.32774306697358324, "grad_norm": 6.161063194274902, "learning_rate": 3.789304325566801e-05, "loss": 2.8965, "num_input_tokens_seen": 1963864, "step": 2990 }, { "epoch": 0.3282911323029705, "grad_norm": 5.629215717315674, "learning_rate": 3.7856145136795104e-05, "loss": 3.0241, "num_input_tokens_seen": 1967656, "step": 2995 }, { "epoch": 0.3288391976323578, "grad_norm": 9.494491577148438, "learning_rate": 3.781920890475294e-05, "loss": 3.2297, 
"num_input_tokens_seen": 1970608, "step": 3000 }, { "epoch": 0.32938726296174503, "grad_norm": 4.975097179412842, "learning_rate": 3.7782234669042186e-05, "loss": 3.1757, "num_input_tokens_seen": 1973664, "step": 3005 }, { "epoch": 0.3299353282911323, "grad_norm": 7.1082258224487305, "learning_rate": 3.7745222539276224e-05, "loss": 3.1921, "num_input_tokens_seen": 1976944, "step": 3010 }, { "epoch": 0.33048339362051954, "grad_norm": 11.492435455322266, "learning_rate": 3.770817262518076e-05, "loss": 3.1751, "num_input_tokens_seen": 1980160, "step": 3015 }, { "epoch": 0.3310314589499068, "grad_norm": 6.560080051422119, "learning_rate": 3.76710850365935e-05, "loss": 3.0906, "num_input_tokens_seen": 1983576, "step": 3020 }, { "epoch": 0.3315795242792941, "grad_norm": 7.438432216644287, "learning_rate": 3.763395988346386e-05, "loss": 3.1074, "num_input_tokens_seen": 1985784, "step": 3025 }, { "epoch": 0.33212758960868133, "grad_norm": 7.6575164794921875, "learning_rate": 3.759679727585262e-05, "loss": 3.1625, "num_input_tokens_seen": 1989344, "step": 3030 }, { "epoch": 0.3326756549380686, "grad_norm": 6.756874084472656, "learning_rate": 3.7559597323931566e-05, "loss": 3.2758, "num_input_tokens_seen": 1992304, "step": 3035 }, { "epoch": 0.3332237202674559, "grad_norm": 5.427942276000977, "learning_rate": 3.7522360137983235e-05, "loss": 3.1905, "num_input_tokens_seen": 1996120, "step": 3040 }, { "epoch": 0.3337717855968431, "grad_norm": 5.814554691314697, "learning_rate": 3.748508582840052e-05, "loss": 2.8693, "num_input_tokens_seen": 1999176, "step": 3045 }, { "epoch": 0.3343198509262304, "grad_norm": 7.720613956451416, "learning_rate": 3.744777450568638e-05, "loss": 3.3644, "num_input_tokens_seen": 2002112, "step": 3050 }, { "epoch": 0.3348679162556177, "grad_norm": 5.780377388000488, "learning_rate": 3.7410426280453505e-05, "loss": 2.8918, "num_input_tokens_seen": 2005800, "step": 3055 }, { "epoch": 0.3354159815850049, "grad_norm": 5.939544677734375, "learning_rate": 
3.737304126342398e-05, "loss": 3.0217, "num_input_tokens_seen": 2009192, "step": 3060 }, { "epoch": 0.3359640469143922, "grad_norm": 6.661081314086914, "learning_rate": 3.7335619565428964e-05, "loss": 3.2056, "num_input_tokens_seen": 2012280, "step": 3065 }, { "epoch": 0.33651211224377947, "grad_norm": 4.9228620529174805, "learning_rate": 3.729816129740836e-05, "loss": 3.106, "num_input_tokens_seen": 2014984, "step": 3070 }, { "epoch": 0.3370601775731667, "grad_norm": 6.285070896148682, "learning_rate": 3.726066657041051e-05, "loss": 3.1639, "num_input_tokens_seen": 2019048, "step": 3075 }, { "epoch": 0.337608242902554, "grad_norm": 6.625104904174805, "learning_rate": 3.7223135495591776e-05, "loss": 3.2258, "num_input_tokens_seen": 2022776, "step": 3080 }, { "epoch": 0.33815630823194126, "grad_norm": 8.347160339355469, "learning_rate": 3.718556818421636e-05, "loss": 3.4006, "num_input_tokens_seen": 2026304, "step": 3085 }, { "epoch": 0.3387043735613285, "grad_norm": 9.37065601348877, "learning_rate": 3.7147964747655836e-05, "loss": 3.2778, "num_input_tokens_seen": 2030200, "step": 3090 }, { "epoch": 0.33925243889071577, "grad_norm": 6.341724872589111, "learning_rate": 3.711032529738887e-05, "loss": 3.5654, "num_input_tokens_seen": 2033656, "step": 3095 }, { "epoch": 0.33980050422010305, "grad_norm": 6.54714298248291, "learning_rate": 3.7072649945000936e-05, "loss": 3.0664, "num_input_tokens_seen": 2037328, "step": 3100 }, { "epoch": 0.3403485695494903, "grad_norm": 6.289731979370117, "learning_rate": 3.703493880218391e-05, "loss": 2.8214, "num_input_tokens_seen": 2040488, "step": 3105 }, { "epoch": 0.34089663487887756, "grad_norm": 8.150530815124512, "learning_rate": 3.699719198073578e-05, "loss": 3.2654, "num_input_tokens_seen": 2043256, "step": 3110 }, { "epoch": 0.34144470020826484, "grad_norm": 7.053910255432129, "learning_rate": 3.6959409592560304e-05, "loss": 3.3008, "num_input_tokens_seen": 2046064, "step": 3115 }, { "epoch": 0.34199276553765207, 
"grad_norm": 5.083940505981445, "learning_rate": 3.69215917496667e-05, "loss": 3.0999, "num_input_tokens_seen": 2049568, "step": 3120 }, { "epoch": 0.34254083086703935, "grad_norm": 5.558229446411133, "learning_rate": 3.6883738564169254e-05, "loss": 3.4491, "num_input_tokens_seen": 2052400, "step": 3125 }, { "epoch": 0.34308889619642663, "grad_norm": 7.365407466888428, "learning_rate": 3.684585014828708e-05, "loss": 3.1569, "num_input_tokens_seen": 2055864, "step": 3130 }, { "epoch": 0.34363696152581386, "grad_norm": 7.316169738769531, "learning_rate": 3.680792661434368e-05, "loss": 3.1274, "num_input_tokens_seen": 2058856, "step": 3135 }, { "epoch": 0.34418502685520114, "grad_norm": 8.32957935333252, "learning_rate": 3.676996807476671e-05, "loss": 2.9842, "num_input_tokens_seen": 2062056, "step": 3140 }, { "epoch": 0.3447330921845884, "grad_norm": 7.238974094390869, "learning_rate": 3.673197464208759e-05, "loss": 3.1055, "num_input_tokens_seen": 2064760, "step": 3145 }, { "epoch": 0.34528115751397565, "grad_norm": 8.2353515625, "learning_rate": 3.669394642894118e-05, "loss": 2.7765, "num_input_tokens_seen": 2068440, "step": 3150 }, { "epoch": 0.3458292228433629, "grad_norm": 7.214339256286621, "learning_rate": 3.665588354806545e-05, "loss": 3.0102, "num_input_tokens_seen": 2072136, "step": 3155 }, { "epoch": 0.3463772881727502, "grad_norm": 6.484249114990234, "learning_rate": 3.661778611230114e-05, "loss": 3.2456, "num_input_tokens_seen": 2074560, "step": 3160 }, { "epoch": 0.34692535350213743, "grad_norm": 6.298303604125977, "learning_rate": 3.657965423459145e-05, "loss": 3.3588, "num_input_tokens_seen": 2077248, "step": 3165 }, { "epoch": 0.3474734188315247, "grad_norm": 8.595486640930176, "learning_rate": 3.6541488027981675e-05, "loss": 2.9303, "num_input_tokens_seen": 2080160, "step": 3170 }, { "epoch": 0.348021484160912, "grad_norm": 7.8414740562438965, "learning_rate": 3.650328760561887e-05, "loss": 3.5767, "num_input_tokens_seen": 2082320, "step": 3175 }, { 
"epoch": 0.3485695494902992, "grad_norm": 5.1522908210754395, "learning_rate": 3.646505308075154e-05, "loss": 3.1739, "num_input_tokens_seen": 2085104, "step": 3180 }, { "epoch": 0.3491176148196865, "grad_norm": 9.065922737121582, "learning_rate": 3.642678456672929e-05, "loss": 3.3567, "num_input_tokens_seen": 2087800, "step": 3185 }, { "epoch": 0.3496656801490738, "grad_norm": 11.175498962402344, "learning_rate": 3.638848217700248e-05, "loss": 3.3376, "num_input_tokens_seen": 2090776, "step": 3190 }, { "epoch": 0.350213745478461, "grad_norm": 7.90383768081665, "learning_rate": 3.63501460251219e-05, "loss": 2.9388, "num_input_tokens_seen": 2093152, "step": 3195 }, { "epoch": 0.3507618108078483, "grad_norm": 7.013014316558838, "learning_rate": 3.6311776224738435e-05, "loss": 3.0298, "num_input_tokens_seen": 2096192, "step": 3200 }, { "epoch": 0.3513098761372356, "grad_norm": 4.87260103225708, "learning_rate": 3.627337288960272e-05, "loss": 3.3596, "num_input_tokens_seen": 2100256, "step": 3205 }, { "epoch": 0.3518579414666228, "grad_norm": 7.644909858703613, "learning_rate": 3.6234936133564823e-05, "loss": 3.1154, "num_input_tokens_seen": 2102928, "step": 3210 }, { "epoch": 0.3524060067960101, "grad_norm": 5.678354263305664, "learning_rate": 3.619646607057386e-05, "loss": 2.8941, "num_input_tokens_seen": 2106944, "step": 3215 }, { "epoch": 0.35295407212539737, "grad_norm": 5.123593330383301, "learning_rate": 3.61579628146777e-05, "loss": 3.1417, "num_input_tokens_seen": 2111496, "step": 3220 }, { "epoch": 0.3535021374547846, "grad_norm": 5.542695999145508, "learning_rate": 3.611942648002265e-05, "loss": 3.1733, "num_input_tokens_seen": 2114960, "step": 3225 }, { "epoch": 0.3540502027841719, "grad_norm": 8.204092025756836, "learning_rate": 3.6080857180853025e-05, "loss": 3.4422, "num_input_tokens_seen": 2117528, "step": 3230 }, { "epoch": 0.35459826811355916, "grad_norm": 6.3048014640808105, "learning_rate": 3.6042255031510895e-05, "loss": 3.3049, 
"num_input_tokens_seen": 2121312, "step": 3235 }, { "epoch": 0.3551463334429464, "grad_norm": 8.287495613098145, "learning_rate": 3.600362014643573e-05, "loss": 3.2349, "num_input_tokens_seen": 2125296, "step": 3240 }, { "epoch": 0.35569439877233366, "grad_norm": 7.690340995788574, "learning_rate": 3.5964952640164016e-05, "loss": 3.4982, "num_input_tokens_seen": 2127944, "step": 3245 }, { "epoch": 0.35624246410172095, "grad_norm": 5.382369518280029, "learning_rate": 3.592625262732898e-05, "loss": 3.3248, "num_input_tokens_seen": 2131200, "step": 3250 }, { "epoch": 0.35679052943110817, "grad_norm": 7.964527606964111, "learning_rate": 3.58875202226602e-05, "loss": 3.2188, "num_input_tokens_seen": 2133648, "step": 3255 }, { "epoch": 0.35733859476049545, "grad_norm": 5.458812236785889, "learning_rate": 3.5848755540983286e-05, "loss": 3.3385, "num_input_tokens_seen": 2136960, "step": 3260 }, { "epoch": 0.35788666008988274, "grad_norm": 7.087930679321289, "learning_rate": 3.580995869721953e-05, "loss": 3.0703, "num_input_tokens_seen": 2140656, "step": 3265 }, { "epoch": 0.35843472541926996, "grad_norm": 6.762202262878418, "learning_rate": 3.577112980638557e-05, "loss": 2.9214, "num_input_tokens_seen": 2143360, "step": 3270 }, { "epoch": 0.35898279074865724, "grad_norm": 6.3621649742126465, "learning_rate": 3.573226898359308e-05, "loss": 3.4276, "num_input_tokens_seen": 2146456, "step": 3275 }, { "epoch": 0.3595308560780445, "grad_norm": 8.797203063964844, "learning_rate": 3.5693376344048344e-05, "loss": 3.0474, "num_input_tokens_seen": 2149336, "step": 3280 }, { "epoch": 0.36007892140743175, "grad_norm": 7.268299579620361, "learning_rate": 3.5654452003052033e-05, "loss": 2.8497, "num_input_tokens_seen": 2152960, "step": 3285 }, { "epoch": 0.36062698673681903, "grad_norm": 8.053544044494629, "learning_rate": 3.5615496075998744e-05, "loss": 3.6495, "num_input_tokens_seen": 2157104, "step": 3290 }, { "epoch": 0.3611750520662063, "grad_norm": 6.6186604499816895, 
"learning_rate": 3.5576508678376743e-05, "loss": 2.9909, "num_input_tokens_seen": 2159576, "step": 3295 }, { "epoch": 0.36172311739559354, "grad_norm": 6.244167327880859, "learning_rate": 3.55374899257676e-05, "loss": 3.064, "num_input_tokens_seen": 2163112, "step": 3300 }, { "epoch": 0.3622711827249808, "grad_norm": 7.658557891845703, "learning_rate": 3.549843993384582e-05, "loss": 3.1039, "num_input_tokens_seen": 2166048, "step": 3305 }, { "epoch": 0.3628192480543681, "grad_norm": 5.7698140144348145, "learning_rate": 3.545935881837852e-05, "loss": 2.9442, "num_input_tokens_seen": 2169192, "step": 3310 }, { "epoch": 0.36336731338375533, "grad_norm": 6.534774303436279, "learning_rate": 3.542024669522511e-05, "loss": 2.9845, "num_input_tokens_seen": 2172544, "step": 3315 }, { "epoch": 0.3639153787131426, "grad_norm": 5.373234748840332, "learning_rate": 3.538110368033689e-05, "loss": 3.0865, "num_input_tokens_seen": 2176280, "step": 3320 }, { "epoch": 0.3644634440425299, "grad_norm": 6.9778547286987305, "learning_rate": 3.5341929889756775e-05, "loss": 3.1341, "num_input_tokens_seen": 2179792, "step": 3325 }, { "epoch": 0.3650115093719171, "grad_norm": 10.10000991821289, "learning_rate": 3.530272543961888e-05, "loss": 3.3558, "num_input_tokens_seen": 2182776, "step": 3330 }, { "epoch": 0.3655595747013044, "grad_norm": 6.022150993347168, "learning_rate": 3.526349044614826e-05, "loss": 3.1005, "num_input_tokens_seen": 2186112, "step": 3335 }, { "epoch": 0.3661076400306917, "grad_norm": 6.781782150268555, "learning_rate": 3.522422502566047e-05, "loss": 3.3438, "num_input_tokens_seen": 2188600, "step": 3340 }, { "epoch": 0.3666557053600789, "grad_norm": 4.399787425994873, "learning_rate": 3.51849292945613e-05, "loss": 3.0477, "num_input_tokens_seen": 2191600, "step": 3345 }, { "epoch": 0.3672037706894662, "grad_norm": 6.852601528167725, "learning_rate": 3.51456033693464e-05, "loss": 2.8756, "num_input_tokens_seen": 2194544, "step": 3350 }, { "epoch": 0.3677518360188535, 
"grad_norm": 7.015017509460449, "learning_rate": 3.510624736660091e-05, "loss": 3.6253, "num_input_tokens_seen": 2198296, "step": 3355 }, { "epoch": 0.3682999013482407, "grad_norm": 4.540085792541504, "learning_rate": 3.506686140299915e-05, "loss": 2.9568, "num_input_tokens_seen": 2201384, "step": 3360 }, { "epoch": 0.368847966677628, "grad_norm": 9.393879890441895, "learning_rate": 3.502744559530426e-05, "loss": 3.1794, "num_input_tokens_seen": 2205720, "step": 3365 }, { "epoch": 0.36939603200701526, "grad_norm": 7.7508344650268555, "learning_rate": 3.498800006036788e-05, "loss": 3.0188, "num_input_tokens_seen": 2210344, "step": 3370 }, { "epoch": 0.3699440973364025, "grad_norm": 5.801796913146973, "learning_rate": 3.4948524915129726e-05, "loss": 3.1028, "num_input_tokens_seen": 2213264, "step": 3375 }, { "epoch": 0.37049216266578977, "grad_norm": 6.9859938621521, "learning_rate": 3.490902027661734e-05, "loss": 3.5774, "num_input_tokens_seen": 2216560, "step": 3380 }, { "epoch": 0.37104022799517705, "grad_norm": 5.871939659118652, "learning_rate": 3.4869486261945695e-05, "loss": 3.3648, "num_input_tokens_seen": 2219376, "step": 3385 }, { "epoch": 0.3715882933245643, "grad_norm": 6.051314830780029, "learning_rate": 3.482992298831682e-05, "loss": 3.2641, "num_input_tokens_seen": 2222568, "step": 3390 }, { "epoch": 0.37213635865395156, "grad_norm": 7.149409294128418, "learning_rate": 3.4790330573019524e-05, "loss": 3.0127, "num_input_tokens_seen": 2225232, "step": 3395 }, { "epoch": 0.37268442398333884, "grad_norm": 5.8362650871276855, "learning_rate": 3.4750709133429e-05, "loss": 3.2417, "num_input_tokens_seen": 2228360, "step": 3400 }, { "epoch": 0.37323248931272607, "grad_norm": 6.061380386352539, "learning_rate": 3.471105878700646e-05, "loss": 3.4256, "num_input_tokens_seen": 2231864, "step": 3405 }, { "epoch": 0.37378055464211335, "grad_norm": 7.543921947479248, "learning_rate": 3.467137965129884e-05, "loss": 3.1154, "num_input_tokens_seen": 2234400, "step": 
3410 }, { "epoch": 0.3743286199715006, "grad_norm": 4.8110151290893555, "learning_rate": 3.463167184393843e-05, "loss": 3.1221, "num_input_tokens_seen": 2238056, "step": 3415 }, { "epoch": 0.37487668530088786, "grad_norm": 7.194852352142334, "learning_rate": 3.459193548264248e-05, "loss": 3.4609, "num_input_tokens_seen": 2240472, "step": 3420 }, { "epoch": 0.37542475063027514, "grad_norm": 7.457151889801025, "learning_rate": 3.4552170685212936e-05, "loss": 3.1907, "num_input_tokens_seen": 2243944, "step": 3425 }, { "epoch": 0.37597281595966237, "grad_norm": 8.671926498413086, "learning_rate": 3.4512377569536025e-05, "loss": 3.0142, "num_input_tokens_seen": 2246376, "step": 3430 }, { "epoch": 0.37652088128904965, "grad_norm": 6.243984222412109, "learning_rate": 3.447255625358191e-05, "loss": 3.094, "num_input_tokens_seen": 2249288, "step": 3435 }, { "epoch": 0.37706894661843693, "grad_norm": 7.37971830368042, "learning_rate": 3.443270685540439e-05, "loss": 3.4606, "num_input_tokens_seen": 2252536, "step": 3440 }, { "epoch": 0.37761701194782415, "grad_norm": 6.270237445831299, "learning_rate": 3.43928294931405e-05, "loss": 3.1928, "num_input_tokens_seen": 2255576, "step": 3445 }, { "epoch": 0.37816507727721144, "grad_norm": 5.272236347198486, "learning_rate": 3.435292428501016e-05, "loss": 3.4196, "num_input_tokens_seen": 2258456, "step": 3450 }, { "epoch": 0.3787131426065987, "grad_norm": 6.378783226013184, "learning_rate": 3.431299134931587e-05, "loss": 3.3069, "num_input_tokens_seen": 2261160, "step": 3455 }, { "epoch": 0.37926120793598594, "grad_norm": 7.296474456787109, "learning_rate": 3.427303080444232e-05, "loss": 3.3306, "num_input_tokens_seen": 2263808, "step": 3460 }, { "epoch": 0.3798092732653732, "grad_norm": 6.654740333557129, "learning_rate": 3.423304276885605e-05, "loss": 2.871, "num_input_tokens_seen": 2267280, "step": 3465 }, { "epoch": 0.3803573385947605, "grad_norm": 7.27192497253418, "learning_rate": 3.419302736110508e-05, "loss": 3.3171, 
"num_input_tokens_seen": 2270632, "step": 3470 }, { "epoch": 0.38090540392414773, "grad_norm": 5.948354721069336, "learning_rate": 3.4152984699818614e-05, "loss": 3.4794, "num_input_tokens_seen": 2273960, "step": 3475 }, { "epoch": 0.381453469253535, "grad_norm": 6.537465572357178, "learning_rate": 3.4112914903706616e-05, "loss": 3.1609, "num_input_tokens_seen": 2277568, "step": 3480 }, { "epoch": 0.3820015345829223, "grad_norm": 13.15424919128418, "learning_rate": 3.4072818091559524e-05, "loss": 3.0777, "num_input_tokens_seen": 2279976, "step": 3485 }, { "epoch": 0.3825495999123095, "grad_norm": 5.581765174865723, "learning_rate": 3.403269438224784e-05, "loss": 3.1242, "num_input_tokens_seen": 2282912, "step": 3490 }, { "epoch": 0.3830976652416968, "grad_norm": 5.730728626251221, "learning_rate": 3.3992543894721825e-05, "loss": 3.2418, "num_input_tokens_seen": 2286272, "step": 3495 }, { "epoch": 0.3836457305710841, "grad_norm": 9.713155746459961, "learning_rate": 3.3952366748011114e-05, "loss": 3.17, "num_input_tokens_seen": 2289944, "step": 3500 }, { "epoch": 0.3841937959004713, "grad_norm": 6.645389556884766, "learning_rate": 3.391216306122439e-05, "loss": 3.3796, "num_input_tokens_seen": 2292688, "step": 3505 }, { "epoch": 0.3847418612298586, "grad_norm": 7.148984432220459, "learning_rate": 3.3871932953549005e-05, "loss": 3.282, "num_input_tokens_seen": 2295584, "step": 3510 }, { "epoch": 0.3852899265592459, "grad_norm": 5.25370979309082, "learning_rate": 3.3831676544250616e-05, "loss": 2.9293, "num_input_tokens_seen": 2298440, "step": 3515 }, { "epoch": 0.3858379918886331, "grad_norm": 5.668978214263916, "learning_rate": 3.3791393952672915e-05, "loss": 3.0635, "num_input_tokens_seen": 2301024, "step": 3520 }, { "epoch": 0.3863860572180204, "grad_norm": 4.52470064163208, "learning_rate": 3.375108529823715e-05, "loss": 3.0398, "num_input_tokens_seen": 2304392, "step": 3525 }, { "epoch": 0.38693412254740767, "grad_norm": 5.700072288513184, "learning_rate": 
3.371075070044186e-05, "loss": 3.0855, "num_input_tokens_seen": 2307688, "step": 3530 }, { "epoch": 0.3874821878767949, "grad_norm": 5.35679292678833, "learning_rate": 3.367039027886252e-05, "loss": 3.2953, "num_input_tokens_seen": 2312384, "step": 3535 }, { "epoch": 0.3880302532061822, "grad_norm": 6.735170841217041, "learning_rate": 3.363000415315111e-05, "loss": 3.1434, "num_input_tokens_seen": 2315864, "step": 3540 }, { "epoch": 0.38857831853556946, "grad_norm": 6.647335052490234, "learning_rate": 3.358959244303585e-05, "loss": 3.2033, "num_input_tokens_seen": 2319744, "step": 3545 }, { "epoch": 0.3891263838649567, "grad_norm": 6.841831684112549, "learning_rate": 3.354915526832082e-05, "loss": 3.3414, "num_input_tokens_seen": 2322856, "step": 3550 }, { "epoch": 0.38967444919434396, "grad_norm": 7.023780822753906, "learning_rate": 3.350869274888554e-05, "loss": 3.1525, "num_input_tokens_seen": 2326016, "step": 3555 }, { "epoch": 0.39022251452373125, "grad_norm": 8.96906852722168, "learning_rate": 3.3468205004684695e-05, "loss": 3.2852, "num_input_tokens_seen": 2330120, "step": 3560 }, { "epoch": 0.39077057985311847, "grad_norm": 7.874572277069092, "learning_rate": 3.3427692155747766e-05, "loss": 2.9457, "num_input_tokens_seen": 2332776, "step": 3565 }, { "epoch": 0.39131864518250575, "grad_norm": 6.962822914123535, "learning_rate": 3.338715432217865e-05, "loss": 3.0687, "num_input_tokens_seen": 2336856, "step": 3570 }, { "epoch": 0.39186671051189303, "grad_norm": 6.802676200866699, "learning_rate": 3.334659162415529e-05, "loss": 3.6562, "num_input_tokens_seen": 2339768, "step": 3575 }, { "epoch": 0.39241477584128026, "grad_norm": 7.828624725341797, "learning_rate": 3.3306004181929375e-05, "loss": 3.2111, "num_input_tokens_seen": 2342920, "step": 3580 }, { "epoch": 0.39296284117066754, "grad_norm": 7.1746320724487305, "learning_rate": 3.326539211582592e-05, "loss": 3.2333, "num_input_tokens_seen": 2346656, "step": 3585 }, { "epoch": 0.3935109065000548, 
"grad_norm": 7.000988006591797, "learning_rate": 3.3224755546242967e-05, "loss": 3.3291, "num_input_tokens_seen": 2351008, "step": 3590 }, { "epoch": 0.39405897182944205, "grad_norm": 6.557620048522949, "learning_rate": 3.3184094593651196e-05, "loss": 2.7686, "num_input_tokens_seen": 2354160, "step": 3595 }, { "epoch": 0.39460703715882933, "grad_norm": 7.011937618255615, "learning_rate": 3.314340937859356e-05, "loss": 3.4913, "num_input_tokens_seen": 2357464, "step": 3600 }, { "epoch": 0.3951551024882166, "grad_norm": 6.284838676452637, "learning_rate": 3.310270002168493e-05, "loss": 2.835, "num_input_tokens_seen": 2360488, "step": 3605 }, { "epoch": 0.39570316781760384, "grad_norm": 7.415198802947998, "learning_rate": 3.306196664361178e-05, "loss": 2.9347, "num_input_tokens_seen": 2363448, "step": 3610 }, { "epoch": 0.3962512331469911, "grad_norm": 7.382150650024414, "learning_rate": 3.302120936513177e-05, "loss": 3.3669, "num_input_tokens_seen": 2365800, "step": 3615 }, { "epoch": 0.3967992984763784, "grad_norm": 5.894745349884033, "learning_rate": 3.2980428307073435e-05, "loss": 2.8094, "num_input_tokens_seen": 2369016, "step": 3620 }, { "epoch": 0.39734736380576563, "grad_norm": 6.539662837982178, "learning_rate": 3.29396235903358e-05, "loss": 3.1544, "num_input_tokens_seen": 2372144, "step": 3625 }, { "epoch": 0.3978954291351529, "grad_norm": 6.1463799476623535, "learning_rate": 3.2898795335888005e-05, "loss": 3.2679, "num_input_tokens_seen": 2374656, "step": 3630 }, { "epoch": 0.3984434944645402, "grad_norm": 8.810948371887207, "learning_rate": 3.2857943664769e-05, "loss": 3.394, "num_input_tokens_seen": 2378056, "step": 3635 }, { "epoch": 0.3989915597939274, "grad_norm": 10.048519134521484, "learning_rate": 3.2817068698087164e-05, "loss": 3.4094, "num_input_tokens_seen": 2380792, "step": 3640 }, { "epoch": 0.3995396251233147, "grad_norm": 8.441570281982422, "learning_rate": 3.277617055701989e-05, "loss": 2.9142, "num_input_tokens_seen": 2383912, "step": 3645 
}, { "epoch": 0.400087690452702, "grad_norm": 5.723228931427002, "learning_rate": 3.273524936281331e-05, "loss": 3.2162, "num_input_tokens_seen": 2386592, "step": 3650 }, { "epoch": 0.4006357557820892, "grad_norm": 5.869374752044678, "learning_rate": 3.2694305236781904e-05, "loss": 3.301, "num_input_tokens_seen": 2390144, "step": 3655 }, { "epoch": 0.4011838211114765, "grad_norm": 6.342257499694824, "learning_rate": 3.26533383003081e-05, "loss": 3.2055, "num_input_tokens_seen": 2393872, "step": 3660 }, { "epoch": 0.4017318864408638, "grad_norm": 6.534188270568848, "learning_rate": 3.2612348674841995e-05, "loss": 3.0935, "num_input_tokens_seen": 2396648, "step": 3665 }, { "epoch": 0.402279951770251, "grad_norm": 7.0050272941589355, "learning_rate": 3.2571336481900926e-05, "loss": 3.2582, "num_input_tokens_seen": 2400328, "step": 3670 }, { "epoch": 0.4028280170996383, "grad_norm": 8.4814453125, "learning_rate": 3.253030184306912e-05, "loss": 3.3026, "num_input_tokens_seen": 2403080, "step": 3675 }, { "epoch": 0.40337608242902556, "grad_norm": 7.716960906982422, "learning_rate": 3.248924487999737e-05, "loss": 3.052, "num_input_tokens_seen": 2406352, "step": 3680 }, { "epoch": 0.4039241477584128, "grad_norm": 6.716127395629883, "learning_rate": 3.244816571440265e-05, "loss": 3.2428, "num_input_tokens_seen": 2409496, "step": 3685 }, { "epoch": 0.40447221308780007, "grad_norm": 8.213761329650879, "learning_rate": 3.240706446806773e-05, "loss": 2.9107, "num_input_tokens_seen": 2414032, "step": 3690 }, { "epoch": 0.40502027841718735, "grad_norm": 6.492610931396484, "learning_rate": 3.236594126284086e-05, "loss": 3.293, "num_input_tokens_seen": 2417472, "step": 3695 }, { "epoch": 0.4055683437465746, "grad_norm": 6.562194347381592, "learning_rate": 3.23247962206354e-05, "loss": 3.4693, "num_input_tokens_seen": 2420224, "step": 3700 }, { "epoch": 0.40611640907596186, "grad_norm": 6.379699230194092, "learning_rate": 3.228362946342942e-05, "loss": 3.2036, 
"num_input_tokens_seen": 2425376, "step": 3705 }, { "epoch": 0.40666447440534914, "grad_norm": 8.669161796569824, "learning_rate": 3.2242441113265395e-05, "loss": 3.3417, "num_input_tokens_seen": 2429616, "step": 3710 }, { "epoch": 0.40721253973473637, "grad_norm": 4.813148021697998, "learning_rate": 3.220123129224979e-05, "loss": 2.9484, "num_input_tokens_seen": 2433168, "step": 3715 }, { "epoch": 0.40776060506412365, "grad_norm": 6.526965141296387, "learning_rate": 3.216000012255273e-05, "loss": 3.5202, "num_input_tokens_seen": 2435880, "step": 3720 }, { "epoch": 0.40830867039351093, "grad_norm": 7.899510860443115, "learning_rate": 3.211874772640765e-05, "loss": 3.2844, "num_input_tokens_seen": 2439232, "step": 3725 }, { "epoch": 0.40885673572289816, "grad_norm": 6.932427406311035, "learning_rate": 3.2077474226110866e-05, "loss": 3.5213, "num_input_tokens_seen": 2443400, "step": 3730 }, { "epoch": 0.40940480105228544, "grad_norm": 6.4443793296813965, "learning_rate": 3.203617974402131e-05, "loss": 3.4504, "num_input_tokens_seen": 2446448, "step": 3735 }, { "epoch": 0.4099528663816727, "grad_norm": 6.693415641784668, "learning_rate": 3.199486440256009e-05, "loss": 3.6388, "num_input_tokens_seen": 2450016, "step": 3740 }, { "epoch": 0.41050093171105995, "grad_norm": 6.27035665512085, "learning_rate": 3.195352832421015e-05, "loss": 3.4589, "num_input_tokens_seen": 2452584, "step": 3745 }, { "epoch": 0.41104899704044723, "grad_norm": 6.987046241760254, "learning_rate": 3.191217163151593e-05, "loss": 3.484, "num_input_tokens_seen": 2455440, "step": 3750 }, { "epoch": 0.4115970623698345, "grad_norm": 5.9024200439453125, "learning_rate": 3.187079444708296e-05, "loss": 2.9859, "num_input_tokens_seen": 2459048, "step": 3755 }, { "epoch": 0.41214512769922174, "grad_norm": 5.624914646148682, "learning_rate": 3.182939689357753e-05, "loss": 3.317, "num_input_tokens_seen": 2463488, "step": 3760 }, { "epoch": 0.412693193028609, "grad_norm": 5.933727264404297, "learning_rate": 
3.1787979093726314e-05, "loss": 3.1318, "num_input_tokens_seen": 2466560, "step": 3765 }, { "epoch": 0.4132412583579963, "grad_norm": 8.507558822631836, "learning_rate": 3.1746541170316036e-05, "loss": 3.5896, "num_input_tokens_seen": 2469072, "step": 3770 }, { "epoch": 0.4137893236873835, "grad_norm": 6.940069198608398, "learning_rate": 3.1705083246193015e-05, "loss": 3.5636, "num_input_tokens_seen": 2471528, "step": 3775 }, { "epoch": 0.4143373890167708, "grad_norm": 7.710633277893066, "learning_rate": 3.166360544426293e-05, "loss": 3.373, "num_input_tokens_seen": 2474672, "step": 3780 }, { "epoch": 0.4148854543461581, "grad_norm": 6.710258960723877, "learning_rate": 3.1622107887490354e-05, "loss": 2.9773, "num_input_tokens_seen": 2478184, "step": 3785 }, { "epoch": 0.4154335196755453, "grad_norm": 6.593062400817871, "learning_rate": 3.158059069889843e-05, "loss": 3.1045, "num_input_tokens_seen": 2481016, "step": 3790 }, { "epoch": 0.4159815850049326, "grad_norm": 8.369247436523438, "learning_rate": 3.1539054001568493e-05, "loss": 2.7624, "num_input_tokens_seen": 2483976, "step": 3795 }, { "epoch": 0.4165296503343199, "grad_norm": 5.184842586517334, "learning_rate": 3.149749791863974e-05, "loss": 3.2427, "num_input_tokens_seen": 2486960, "step": 3800 }, { "epoch": 0.4170777156637071, "grad_norm": 5.449498653411865, "learning_rate": 3.145592257330881e-05, "loss": 3.3931, "num_input_tokens_seen": 2490928, "step": 3805 }, { "epoch": 0.4176257809930944, "grad_norm": 7.610599994659424, "learning_rate": 3.141432808882946e-05, "loss": 3.3562, "num_input_tokens_seen": 2494760, "step": 3810 }, { "epoch": 0.4181738463224816, "grad_norm": 6.789968490600586, "learning_rate": 3.13727145885122e-05, "loss": 2.823, "num_input_tokens_seen": 2498352, "step": 3815 }, { "epoch": 0.4187219116518689, "grad_norm": 6.654449462890625, "learning_rate": 3.133108219572388e-05, "loss": 3.2867, "num_input_tokens_seen": 2501440, "step": 3820 }, { "epoch": 0.4192699769812562, "grad_norm": 
6.487675189971924, "learning_rate": 3.1289431033887386e-05, "loss": 3.3113, "num_input_tokens_seen": 2504560, "step": 3825 }, { "epoch": 0.4198180423106434, "grad_norm": 7.911233901977539, "learning_rate": 3.1247761226481244e-05, "loss": 2.8476, "num_input_tokens_seen": 2507984, "step": 3830 }, { "epoch": 0.4203661076400307, "grad_norm": 7.292878150939941, "learning_rate": 3.120607289703925e-05, "loss": 2.9229, "num_input_tokens_seen": 2511632, "step": 3835 }, { "epoch": 0.42091417296941797, "grad_norm": 7.699312686920166, "learning_rate": 3.11643661691501e-05, "loss": 3.2728, "num_input_tokens_seen": 2514512, "step": 3840 }, { "epoch": 0.4214622382988052, "grad_norm": 7.424167156219482, "learning_rate": 3.112264116645705e-05, "loss": 3.0013, "num_input_tokens_seen": 2517840, "step": 3845 }, { "epoch": 0.4220103036281925, "grad_norm": 6.991738796234131, "learning_rate": 3.1080898012657536e-05, "loss": 2.9434, "num_input_tokens_seen": 2521296, "step": 3850 }, { "epoch": 0.42255836895757976, "grad_norm": 6.644684314727783, "learning_rate": 3.103913683150278e-05, "loss": 3.4346, "num_input_tokens_seen": 2523800, "step": 3855 }, { "epoch": 0.423106434286967, "grad_norm": 6.666325092315674, "learning_rate": 3.099735774679749e-05, "loss": 3.2123, "num_input_tokens_seen": 2526096, "step": 3860 }, { "epoch": 0.42365449961635426, "grad_norm": 9.987031936645508, "learning_rate": 3.09555608823994e-05, "loss": 3.2205, "num_input_tokens_seen": 2528464, "step": 3865 }, { "epoch": 0.42420256494574154, "grad_norm": 8.114043235778809, "learning_rate": 3.091374636221899e-05, "loss": 3.1648, "num_input_tokens_seen": 2530808, "step": 3870 }, { "epoch": 0.42475063027512877, "grad_norm": 7.4291229248046875, "learning_rate": 3.087191431021908e-05, "loss": 2.874, "num_input_tokens_seen": 2534400, "step": 3875 }, { "epoch": 0.42529869560451605, "grad_norm": 6.414401054382324, "learning_rate": 3.083006485041444e-05, "loss": 3.0927, "num_input_tokens_seen": 2538584, "step": 3880 }, { 
"epoch": 0.42584676093390333, "grad_norm": 12.14594554901123, "learning_rate": 3.078819810687147e-05, "loss": 3.1133, "num_input_tokens_seen": 2542184, "step": 3885 }, { "epoch": 0.42639482626329056, "grad_norm": 6.391221046447754, "learning_rate": 3.074631420370779e-05, "loss": 3.0244, "num_input_tokens_seen": 2545592, "step": 3890 }, { "epoch": 0.42694289159267784, "grad_norm": 6.802542686462402, "learning_rate": 3.0704413265091916e-05, "loss": 3.2812, "num_input_tokens_seen": 2548816, "step": 3895 }, { "epoch": 0.4274909569220651, "grad_norm": 7.281493186950684, "learning_rate": 3.066249541524285e-05, "loss": 3.3321, "num_input_tokens_seen": 2552352, "step": 3900 }, { "epoch": 0.42803902225145235, "grad_norm": 6.2967047691345215, "learning_rate": 3.0620560778429736e-05, "loss": 3.1571, "num_input_tokens_seen": 2556072, "step": 3905 }, { "epoch": 0.42858708758083963, "grad_norm": 5.46196174621582, "learning_rate": 3.0578609478971474e-05, "loss": 2.9312, "num_input_tokens_seen": 2559680, "step": 3910 }, { "epoch": 0.4291351529102269, "grad_norm": 6.703193664550781, "learning_rate": 3.0536641641236366e-05, "loss": 3.1173, "num_input_tokens_seen": 2564072, "step": 3915 }, { "epoch": 0.42968321823961414, "grad_norm": 6.250140190124512, "learning_rate": 3.0494657389641763e-05, "loss": 2.8173, "num_input_tokens_seen": 2567848, "step": 3920 }, { "epoch": 0.4302312835690014, "grad_norm": 8.19283676147461, "learning_rate": 3.0452656848653643e-05, "loss": 3.1555, "num_input_tokens_seen": 2570760, "step": 3925 }, { "epoch": 0.4307793488983887, "grad_norm": 4.393120288848877, "learning_rate": 3.041064014278629e-05, "loss": 3.3082, "num_input_tokens_seen": 2574112, "step": 3930 }, { "epoch": 0.43132741422777593, "grad_norm": 7.910434246063232, "learning_rate": 3.036860739660193e-05, "loss": 3.0528, "num_input_tokens_seen": 2578144, "step": 3935 }, { "epoch": 0.4318754795571632, "grad_norm": 8.536887168884277, "learning_rate": 3.0326558734710304e-05, "loss": 3.224, 
"num_input_tokens_seen": 2581008, "step": 3940 }, { "epoch": 0.4324235448865505, "grad_norm": 5.810432434082031, "learning_rate": 3.028449428176836e-05, "loss": 3.2157, "num_input_tokens_seen": 2583616, "step": 3945 }, { "epoch": 0.4329716102159377, "grad_norm": 7.819321632385254, "learning_rate": 3.024241416247987e-05, "loss": 3.3845, "num_input_tokens_seen": 2587680, "step": 3950 }, { "epoch": 0.433519675545325, "grad_norm": 7.583765506744385, "learning_rate": 3.0200318501595028e-05, "loss": 3.4347, "num_input_tokens_seen": 2590536, "step": 3955 }, { "epoch": 0.4340677408747123, "grad_norm": 6.201939105987549, "learning_rate": 3.01582074239101e-05, "loss": 3.0368, "num_input_tokens_seen": 2593560, "step": 3960 }, { "epoch": 0.4346158062040995, "grad_norm": 6.4165425300598145, "learning_rate": 3.0116081054267086e-05, "loss": 3.1866, "num_input_tokens_seen": 2597464, "step": 3965 }, { "epoch": 0.4351638715334868, "grad_norm": 5.670197486877441, "learning_rate": 3.007393951755329e-05, "loss": 3.1721, "num_input_tokens_seen": 2600616, "step": 3970 }, { "epoch": 0.43571193686287407, "grad_norm": 6.542341709136963, "learning_rate": 3.0031782938701004e-05, "loss": 3.1902, "num_input_tokens_seen": 2603832, "step": 3975 }, { "epoch": 0.4362600021922613, "grad_norm": 11.36231803894043, "learning_rate": 2.9989611442687087e-05, "loss": 3.1505, "num_input_tokens_seen": 2607032, "step": 3980 }, { "epoch": 0.4368080675216486, "grad_norm": 8.223766326904297, "learning_rate": 2.994742515453264e-05, "loss": 3.2596, "num_input_tokens_seen": 2609848, "step": 3985 }, { "epoch": 0.43735613285103586, "grad_norm": 6.220792770385742, "learning_rate": 2.9905224199302612e-05, "loss": 3.105, "num_input_tokens_seen": 2613072, "step": 3990 }, { "epoch": 0.4379041981804231, "grad_norm": 9.295598983764648, "learning_rate": 2.9863008702105444e-05, "loss": 3.5309, "num_input_tokens_seen": 2617216, "step": 3995 }, { "epoch": 0.43845226350981037, "grad_norm": 7.482667446136475, "learning_rate": 
2.9820778788092662e-05, "loss": 3.0894, "num_input_tokens_seen": 2620440, "step": 4000 }, { "epoch": 0.43900032883919765, "grad_norm": 8.263635635375977, "learning_rate": 2.9778534582458563e-05, "loss": 3.2592, "num_input_tokens_seen": 2624136, "step": 4005 }, { "epoch": 0.4395483941685849, "grad_norm": 6.1141180992126465, "learning_rate": 2.973627621043979e-05, "loss": 2.9611, "num_input_tokens_seen": 2628416, "step": 4010 }, { "epoch": 0.44009645949797216, "grad_norm": 5.068775653839111, "learning_rate": 2.969400379731499e-05, "loss": 3.2408, "num_input_tokens_seen": 2632360, "step": 4015 }, { "epoch": 0.44064452482735944, "grad_norm": 4.8074049949646, "learning_rate": 2.965171746840445e-05, "loss": 3.3503, "num_input_tokens_seen": 2635144, "step": 4020 }, { "epoch": 0.44119259015674667, "grad_norm": 5.924848556518555, "learning_rate": 2.9609417349069685e-05, "loss": 2.8347, "num_input_tokens_seen": 2638880, "step": 4025 }, { "epoch": 0.44174065548613395, "grad_norm": 6.371955871582031, "learning_rate": 2.9567103564713107e-05, "loss": 3.0076, "num_input_tokens_seen": 2642200, "step": 4030 }, { "epoch": 0.44228872081552123, "grad_norm": 6.616983890533447, "learning_rate": 2.952477624077764e-05, "loss": 3.1063, "num_input_tokens_seen": 2647008, "step": 4035 }, { "epoch": 0.44283678614490846, "grad_norm": 6.057950973510742, "learning_rate": 2.9482435502746363e-05, "loss": 2.9816, "num_input_tokens_seen": 2649824, "step": 4040 }, { "epoch": 0.44338485147429574, "grad_norm": 5.292036533355713, "learning_rate": 2.944008147614208e-05, "loss": 2.9774, "num_input_tokens_seen": 2652424, "step": 4045 }, { "epoch": 0.443932916803683, "grad_norm": 6.374473571777344, "learning_rate": 2.9397714286527034e-05, "loss": 2.9106, "num_input_tokens_seen": 2655792, "step": 4050 }, { "epoch": 0.44448098213307025, "grad_norm": 5.729962348937988, "learning_rate": 2.9355334059502472e-05, "loss": 3.1529, "num_input_tokens_seen": 2658608, "step": 4055 }, { "epoch": 0.4450290474624575, 
"grad_norm": 8.748932838439941, "learning_rate": 2.9312940920708277e-05, "loss": 3.236, "num_input_tokens_seen": 2661312, "step": 4060 }, { "epoch": 0.4455771127918448, "grad_norm": 8.778289794921875, "learning_rate": 2.927053499582264e-05, "loss": 3.1197, "num_input_tokens_seen": 2665256, "step": 4065 }, { "epoch": 0.44612517812123204, "grad_norm": 8.748550415039062, "learning_rate": 2.922811641056164e-05, "loss": 3.2486, "num_input_tokens_seen": 2669288, "step": 4070 }, { "epoch": 0.4466732434506193, "grad_norm": 5.559131145477295, "learning_rate": 2.9185685290678888e-05, "loss": 2.9932, "num_input_tokens_seen": 2672312, "step": 4075 }, { "epoch": 0.4472213087800066, "grad_norm": 5.6860575675964355, "learning_rate": 2.9143241761965155e-05, "loss": 3.1337, "num_input_tokens_seen": 2676312, "step": 4080 }, { "epoch": 0.4477693741093938, "grad_norm": 7.295080184936523, "learning_rate": 2.9100785950248015e-05, "loss": 2.9724, "num_input_tokens_seen": 2679592, "step": 4085 }, { "epoch": 0.4483174394387811, "grad_norm": 9.514237403869629, "learning_rate": 2.9058317981391437e-05, "loss": 3.1765, "num_input_tokens_seen": 2682472, "step": 4090 }, { "epoch": 0.4488655047681684, "grad_norm": 7.216882705688477, "learning_rate": 2.901583798129543e-05, "loss": 3.3707, "num_input_tokens_seen": 2685328, "step": 4095 }, { "epoch": 0.4494135700975556, "grad_norm": 7.9535298347473145, "learning_rate": 2.8973346075895695e-05, "loss": 3.4585, "num_input_tokens_seen": 2688080, "step": 4100 }, { "epoch": 0.4499616354269429, "grad_norm": 7.782059669494629, "learning_rate": 2.8930842391163192e-05, "loss": 2.9516, "num_input_tokens_seen": 2691112, "step": 4105 }, { "epoch": 0.4505097007563302, "grad_norm": 6.065903186798096, "learning_rate": 2.8888327053103836e-05, "loss": 3.0919, "num_input_tokens_seen": 2694328, "step": 4110 }, { "epoch": 0.4510577660857174, "grad_norm": 6.912715435028076, "learning_rate": 2.884580018775807e-05, "loss": 2.9052, "num_input_tokens_seen": 2696856, "step": 
4115 }, { "epoch": 0.4516058314151047, "grad_norm": 8.30929946899414, "learning_rate": 2.8803261921200503e-05, "loss": 3.3268, "num_input_tokens_seen": 2699968, "step": 4120 }, { "epoch": 0.45215389674449197, "grad_norm": 8.51347541809082, "learning_rate": 2.8760712379539567e-05, "loss": 3.3617, "num_input_tokens_seen": 2702416, "step": 4125 }, { "epoch": 0.4527019620738792, "grad_norm": 6.167294979095459, "learning_rate": 2.8718151688917105e-05, "loss": 3.1805, "num_input_tokens_seen": 2705440, "step": 4130 }, { "epoch": 0.4532500274032665, "grad_norm": 8.299149513244629, "learning_rate": 2.867557997550801e-05, "loss": 3.2122, "num_input_tokens_seen": 2708248, "step": 4135 }, { "epoch": 0.45379809273265376, "grad_norm": 8.19796085357666, "learning_rate": 2.8632997365519877e-05, "loss": 3.0817, "num_input_tokens_seen": 2712464, "step": 4140 }, { "epoch": 0.454346158062041, "grad_norm": 6.964700698852539, "learning_rate": 2.859040398519256e-05, "loss": 3.4051, "num_input_tokens_seen": 2715048, "step": 4145 }, { "epoch": 0.45489422339142827, "grad_norm": 6.310876846313477, "learning_rate": 2.8547799960797883e-05, "loss": 2.7846, "num_input_tokens_seen": 2718192, "step": 4150 }, { "epoch": 0.45544228872081555, "grad_norm": 6.786360263824463, "learning_rate": 2.8505185418639212e-05, "loss": 2.829, "num_input_tokens_seen": 2722064, "step": 4155 }, { "epoch": 0.4559903540502028, "grad_norm": 7.1503520011901855, "learning_rate": 2.8462560485051098e-05, "loss": 2.9883, "num_input_tokens_seen": 2725640, "step": 4160 }, { "epoch": 0.45653841937959005, "grad_norm": 5.350907802581787, "learning_rate": 2.841992528639888e-05, "loss": 3.0743, "num_input_tokens_seen": 2729992, "step": 4165 }, { "epoch": 0.45708648470897734, "grad_norm": 5.482122421264648, "learning_rate": 2.837727994907835e-05, "loss": 3.2459, "num_input_tokens_seen": 2733424, "step": 4170 }, { "epoch": 0.45763455003836456, "grad_norm": 4.941489219665527, "learning_rate": 2.833462459951534e-05, "loss": 3.2963, 
"num_input_tokens_seen": 2736656, "step": 4175 }, { "epoch": 0.45818261536775184, "grad_norm": 10.229253768920898, "learning_rate": 2.8291959364165387e-05, "loss": 3.2607, "num_input_tokens_seen": 2739808, "step": 4180 }, { "epoch": 0.4587306806971391, "grad_norm": 5.911849498748779, "learning_rate": 2.824928436951332e-05, "loss": 3.3887, "num_input_tokens_seen": 2742752, "step": 4185 }, { "epoch": 0.45927874602652635, "grad_norm": 6.14879846572876, "learning_rate": 2.8206599742072883e-05, "loss": 3.0095, "num_input_tokens_seen": 2746256, "step": 4190 }, { "epoch": 0.45982681135591363, "grad_norm": 6.8150529861450195, "learning_rate": 2.8163905608386415e-05, "loss": 3.0599, "num_input_tokens_seen": 2750736, "step": 4195 }, { "epoch": 0.4603748766853009, "grad_norm": 5.578204154968262, "learning_rate": 2.812120209502441e-05, "loss": 3.4177, "num_input_tokens_seen": 2753832, "step": 4200 }, { "epoch": 0.46092294201468814, "grad_norm": 7.075170040130615, "learning_rate": 2.8078489328585184e-05, "loss": 3.2787, "num_input_tokens_seen": 2757176, "step": 4205 }, { "epoch": 0.4614710073440754, "grad_norm": 7.633877754211426, "learning_rate": 2.803576743569447e-05, "loss": 3.2838, "num_input_tokens_seen": 2760632, "step": 4210 }, { "epoch": 0.46201907267346265, "grad_norm": 7.296063423156738, "learning_rate": 2.7993036543005073e-05, "loss": 3.2533, "num_input_tokens_seen": 2763160, "step": 4215 }, { "epoch": 0.46256713800284993, "grad_norm": 9.778048515319824, "learning_rate": 2.7950296777196454e-05, "loss": 3.2876, "num_input_tokens_seen": 2766304, "step": 4220 }, { "epoch": 0.4631152033322372, "grad_norm": 6.1279826164245605, "learning_rate": 2.7907548264974408e-05, "loss": 3.3613, "num_input_tokens_seen": 2769112, "step": 4225 }, { "epoch": 0.46366326866162444, "grad_norm": 7.0411458015441895, "learning_rate": 2.7864791133070655e-05, "loss": 2.9218, "num_input_tokens_seen": 2773120, "step": 4230 }, { "epoch": 0.4642113339910117, "grad_norm": 7.575366497039795, 
"learning_rate": 2.782202550824244e-05, "loss": 2.7816, "num_input_tokens_seen": 2775712, "step": 4235 }, { "epoch": 0.464759399320399, "grad_norm": 4.21223258972168, "learning_rate": 2.777925151727222e-05, "loss": 2.913, "num_input_tokens_seen": 2778872, "step": 4240 }, { "epoch": 0.46530746464978623, "grad_norm": 7.198635101318359, "learning_rate": 2.7736469286967244e-05, "loss": 3.3944, "num_input_tokens_seen": 2783424, "step": 4245 }, { "epoch": 0.4658555299791735, "grad_norm": 6.785750389099121, "learning_rate": 2.7693678944159168e-05, "loss": 3.0493, "num_input_tokens_seen": 2787720, "step": 4250 }, { "epoch": 0.4664035953085608, "grad_norm": 5.799097061157227, "learning_rate": 2.7650880615703735e-05, "loss": 3.043, "num_input_tokens_seen": 2790528, "step": 4255 }, { "epoch": 0.466951660637948, "grad_norm": 5.558688163757324, "learning_rate": 2.760807442848033e-05, "loss": 3.0476, "num_input_tokens_seen": 2794088, "step": 4260 }, { "epoch": 0.4674997259673353, "grad_norm": 7.959995269775391, "learning_rate": 2.7565260509391644e-05, "loss": 3.3705, "num_input_tokens_seen": 2797168, "step": 4265 }, { "epoch": 0.4680477912967226, "grad_norm": 5.836214542388916, "learning_rate": 2.7522438985363297e-05, "loss": 3.1173, "num_input_tokens_seen": 2799752, "step": 4270 }, { "epoch": 0.4685958566261098, "grad_norm": 5.6099348068237305, "learning_rate": 2.7479609983343457e-05, "loss": 3.4298, "num_input_tokens_seen": 2803560, "step": 4275 }, { "epoch": 0.4691439219554971, "grad_norm": 6.971024513244629, "learning_rate": 2.7436773630302448e-05, "loss": 3.4299, "num_input_tokens_seen": 2806360, "step": 4280 }, { "epoch": 0.46969198728488437, "grad_norm": 5.738091945648193, "learning_rate": 2.7393930053232393e-05, "loss": 3.0872, "num_input_tokens_seen": 2809408, "step": 4285 }, { "epoch": 0.4702400526142716, "grad_norm": 10.746182441711426, "learning_rate": 2.7351079379146844e-05, "loss": 3.5487, "num_input_tokens_seen": 2812752, "step": 4290 }, { "epoch": 
0.4707881179436589, "grad_norm": 6.557742595672607, "learning_rate": 2.7308221735080363e-05, "loss": 3.1006, "num_input_tokens_seen": 2816432, "step": 4295 }, { "epoch": 0.47133618327304616, "grad_norm": 7.124549865722656, "learning_rate": 2.726535724808821e-05, "loss": 3.2491, "num_input_tokens_seen": 2819608, "step": 4300 }, { "epoch": 0.4718842486024334, "grad_norm": 8.328391075134277, "learning_rate": 2.7222486045245905e-05, "loss": 2.9571, "num_input_tokens_seen": 2822304, "step": 4305 }, { "epoch": 0.47243231393182067, "grad_norm": 8.121037483215332, "learning_rate": 2.717960825364888e-05, "loss": 3.0946, "num_input_tokens_seen": 2826112, "step": 4310 }, { "epoch": 0.47298037926120795, "grad_norm": 7.5214715003967285, "learning_rate": 2.7136724000412122e-05, "loss": 3.2682, "num_input_tokens_seen": 2829640, "step": 4315 }, { "epoch": 0.4735284445905952, "grad_norm": 5.765413761138916, "learning_rate": 2.709383341266975e-05, "loss": 3.3871, "num_input_tokens_seen": 2832536, "step": 4320 }, { "epoch": 0.47407650991998246, "grad_norm": 7.573315143585205, "learning_rate": 2.7050936617574674e-05, "loss": 3.0505, "num_input_tokens_seen": 2835312, "step": 4325 }, { "epoch": 0.47462457524936974, "grad_norm": 5.444807052612305, "learning_rate": 2.70080337422982e-05, "loss": 3.1385, "num_input_tokens_seen": 2839520, "step": 4330 }, { "epoch": 0.47517264057875697, "grad_norm": 5.842774868011475, "learning_rate": 2.696512491402967e-05, "loss": 3.0295, "num_input_tokens_seen": 2842096, "step": 4335 }, { "epoch": 0.47572070590814425, "grad_norm": 6.1106157302856445, "learning_rate": 2.692221025997606e-05, "loss": 3.0393, "num_input_tokens_seen": 2845424, "step": 4340 }, { "epoch": 0.47626877123753153, "grad_norm": 7.988515377044678, "learning_rate": 2.687928990736163e-05, "loss": 3.3657, "num_input_tokens_seen": 2847648, "step": 4345 }, { "epoch": 0.47681683656691876, "grad_norm": 7.0514655113220215, "learning_rate": 2.683636398342753e-05, "loss": 3.4438, 
"num_input_tokens_seen": 2850432, "step": 4350 }, { "epoch": 0.47736490189630604, "grad_norm": 5.54784631729126, "learning_rate": 2.6793432615431406e-05, "loss": 2.9583, "num_input_tokens_seen": 2854176, "step": 4355 }, { "epoch": 0.4779129672256933, "grad_norm": 6.001830577850342, "learning_rate": 2.6750495930647083e-05, "loss": 3.4694, "num_input_tokens_seen": 2857368, "step": 4360 }, { "epoch": 0.47846103255508055, "grad_norm": 7.455556392669678, "learning_rate": 2.670755405636412e-05, "loss": 3.0839, "num_input_tokens_seen": 2860064, "step": 4365 }, { "epoch": 0.4790090978844678, "grad_norm": 6.409590721130371, "learning_rate": 2.6664607119887462e-05, "loss": 3.0962, "num_input_tokens_seen": 2863128, "step": 4370 }, { "epoch": 0.4795571632138551, "grad_norm": 5.903439044952393, "learning_rate": 2.6621655248537075e-05, "loss": 3.0613, "num_input_tokens_seen": 2866720, "step": 4375 }, { "epoch": 0.48010522854324233, "grad_norm": 7.286397457122803, "learning_rate": 2.657869856964754e-05, "loss": 2.9673, "num_input_tokens_seen": 2869568, "step": 4380 }, { "epoch": 0.4806532938726296, "grad_norm": 7.941439151763916, "learning_rate": 2.6535737210567707e-05, "loss": 3.3656, "num_input_tokens_seen": 2874584, "step": 4385 }, { "epoch": 0.4812013592020169, "grad_norm": 3.8733413219451904, "learning_rate": 2.6492771298660286e-05, "loss": 2.8012, "num_input_tokens_seen": 2879248, "step": 4390 }, { "epoch": 0.4817494245314041, "grad_norm": 4.492478370666504, "learning_rate": 2.6449800961301485e-05, "loss": 2.9495, "num_input_tokens_seen": 2882824, "step": 4395 }, { "epoch": 0.4822974898607914, "grad_norm": 7.726132392883301, "learning_rate": 2.640682632588064e-05, "loss": 3.1087, "num_input_tokens_seen": 2886440, "step": 4400 }, { "epoch": 0.4828455551901787, "grad_norm": 6.549642562866211, "learning_rate": 2.6363847519799822e-05, "loss": 2.985, "num_input_tokens_seen": 2889808, "step": 4405 }, { "epoch": 0.4833936205195659, "grad_norm": 8.789740562438965, "learning_rate": 
2.632086467047348e-05, "loss": 3.1352, "num_input_tokens_seen": 2893680, "step": 4410 }, { "epoch": 0.4839416858489532, "grad_norm": 8.024590492248535, "learning_rate": 2.6277877905328023e-05, "loss": 3.3008, "num_input_tokens_seen": 2895872, "step": 4415 }, { "epoch": 0.4844897511783405, "grad_norm": 6.235259532928467, "learning_rate": 2.623488735180149e-05, "loss": 3.1758, "num_input_tokens_seen": 2898680, "step": 4420 }, { "epoch": 0.4850378165077277, "grad_norm": 7.674651145935059, "learning_rate": 2.619189313734316e-05, "loss": 2.9519, "num_input_tokens_seen": 2903496, "step": 4425 }, { "epoch": 0.485585881837115, "grad_norm": 5.884274959564209, "learning_rate": 2.614889538941313e-05, "loss": 3.3259, "num_input_tokens_seen": 2906248, "step": 4430 }, { "epoch": 0.48613394716650227, "grad_norm": 5.681421279907227, "learning_rate": 2.610589423548201e-05, "loss": 3.4432, "num_input_tokens_seen": 2909352, "step": 4435 }, { "epoch": 0.4866820124958895, "grad_norm": 8.08205795288086, "learning_rate": 2.6062889803030477e-05, "loss": 3.6165, "num_input_tokens_seen": 2911960, "step": 4440 }, { "epoch": 0.4872300778252768, "grad_norm": 7.7329277992248535, "learning_rate": 2.601988221954894e-05, "loss": 3.2172, "num_input_tokens_seen": 2915256, "step": 4445 }, { "epoch": 0.48777814315466406, "grad_norm": 6.208625793457031, "learning_rate": 2.5976871612537164e-05, "loss": 3.2373, "num_input_tokens_seen": 2919040, "step": 4450 }, { "epoch": 0.4883262084840513, "grad_norm": 8.127032279968262, "learning_rate": 2.593385810950386e-05, "loss": 2.9402, "num_input_tokens_seen": 2922272, "step": 4455 }, { "epoch": 0.48887427381343856, "grad_norm": 6.481329441070557, "learning_rate": 2.589084183796632e-05, "loss": 3.0208, "num_input_tokens_seen": 2926072, "step": 4460 }, { "epoch": 0.48942233914282585, "grad_norm": 6.350535869598389, "learning_rate": 2.5847822925450055e-05, "loss": 3.1026, "num_input_tokens_seen": 2928760, "step": 4465 }, { "epoch": 0.4899704044722131, "grad_norm": 
7.3511457443237305, "learning_rate": 2.5804801499488407e-05, "loss": 2.9358, "num_input_tokens_seen": 2932088, "step": 4470 }, { "epoch": 0.49051846980160035, "grad_norm": 5.9759521484375, "learning_rate": 2.576177768762216e-05, "loss": 3.1564, "num_input_tokens_seen": 2935272, "step": 4475 }, { "epoch": 0.49106653513098764, "grad_norm": 7.138418674468994, "learning_rate": 2.5718751617399182e-05, "loss": 3.0998, "num_input_tokens_seen": 2938280, "step": 4480 }, { "epoch": 0.49161460046037486, "grad_norm": 10.551050186157227, "learning_rate": 2.5675723416374026e-05, "loss": 3.1874, "num_input_tokens_seen": 2941648, "step": 4485 }, { "epoch": 0.49216266578976214, "grad_norm": 6.085887432098389, "learning_rate": 2.5632693212107567e-05, "loss": 2.8506, "num_input_tokens_seen": 2944680, "step": 4490 }, { "epoch": 0.4927107311191494, "grad_norm": 6.314172267913818, "learning_rate": 2.5589661132166613e-05, "loss": 2.8206, "num_input_tokens_seen": 2948744, "step": 4495 }, { "epoch": 0.49325879644853665, "grad_norm": 6.3680853843688965, "learning_rate": 2.5546627304123545e-05, "loss": 2.85, "num_input_tokens_seen": 2951256, "step": 4500 }, { "epoch": 0.49380686177792393, "grad_norm": 6.314942359924316, "learning_rate": 2.5503591855555908e-05, "loss": 3.2021, "num_input_tokens_seen": 2954536, "step": 4505 }, { "epoch": 0.4943549271073112, "grad_norm": 6.349035739898682, "learning_rate": 2.546055491404607e-05, "loss": 2.9022, "num_input_tokens_seen": 2958112, "step": 4510 }, { "epoch": 0.49490299243669844, "grad_norm": 6.812668800354004, "learning_rate": 2.5417516607180825e-05, "loss": 3.2304, "num_input_tokens_seen": 2961024, "step": 4515 }, { "epoch": 0.4954510577660857, "grad_norm": 4.483590126037598, "learning_rate": 2.5374477062550984e-05, "loss": 2.8489, "num_input_tokens_seen": 2964344, "step": 4520 }, { "epoch": 0.495999123095473, "grad_norm": 6.769683837890625, "learning_rate": 2.5331436407751074e-05, "loss": 3.1946, "num_input_tokens_seen": 2967608, "step": 4525 }, 
{ "epoch": 0.49654718842486023, "grad_norm": 9.059048652648926, "learning_rate": 2.528839477037887e-05, "loss": 3.2895, "num_input_tokens_seen": 2970488, "step": 4530 }, { "epoch": 0.4970952537542475, "grad_norm": 9.555692672729492, "learning_rate": 2.5245352278035095e-05, "loss": 3.0595, "num_input_tokens_seen": 2973200, "step": 4535 }, { "epoch": 0.4976433190836348, "grad_norm": 8.808011054992676, "learning_rate": 2.520230905832298e-05, "loss": 3.1939, "num_input_tokens_seen": 2976576, "step": 4540 }, { "epoch": 0.498191384413022, "grad_norm": 7.059693336486816, "learning_rate": 2.515926523884792e-05, "loss": 3.3154, "num_input_tokens_seen": 2980624, "step": 4545 }, { "epoch": 0.4987394497424093, "grad_norm": 5.0204973220825195, "learning_rate": 2.5116220947217107e-05, "loss": 3.2012, "num_input_tokens_seen": 2983328, "step": 4550 }, { "epoch": 0.4992875150717966, "grad_norm": 8.473772048950195, "learning_rate": 2.507317631103911e-05, "loss": 3.3448, "num_input_tokens_seen": 2986664, "step": 4555 }, { "epoch": 0.4998355804011838, "grad_norm": 5.891829490661621, "learning_rate": 2.5030131457923512e-05, "loss": 3.0624, "num_input_tokens_seen": 2990088, "step": 4560 }, { "epoch": 0.500383645730571, "grad_norm": 8.812019348144531, "learning_rate": 2.498708651548057e-05, "loss": 3.1606, "num_input_tokens_seen": 2993152, "step": 4565 }, { "epoch": 0.5009317110599584, "grad_norm": 6.772736549377441, "learning_rate": 2.494404161132079e-05, "loss": 2.6401, "num_input_tokens_seen": 2996104, "step": 4570 }, { "epoch": 0.5014797763893456, "grad_norm": 6.640130996704102, "learning_rate": 2.490099687305455e-05, "loss": 2.8047, "num_input_tokens_seen": 3000664, "step": 4575 }, { "epoch": 0.5020278417187328, "grad_norm": 8.050363540649414, "learning_rate": 2.485795242829177e-05, "loss": 2.9757, "num_input_tokens_seen": 3004312, "step": 4580 }, { "epoch": 0.5025759070481202, "grad_norm": 7.689075469970703, "learning_rate": 2.481490840464147e-05, "loss": 3.6823, 
"num_input_tokens_seen": 3008056, "step": 4585 }, { "epoch": 0.5031239723775074, "grad_norm": 7.890453815460205, "learning_rate": 2.4771864929711414e-05, "loss": 3.5555, "num_input_tokens_seen": 3010640, "step": 4590 }, { "epoch": 0.5036720377068946, "grad_norm": 8.07981014251709, "learning_rate": 2.4728822131107784e-05, "loss": 2.9504, "num_input_tokens_seen": 3013752, "step": 4595 }, { "epoch": 0.504220103036282, "grad_norm": 5.753955364227295, "learning_rate": 2.468578013643472e-05, "loss": 3.1703, "num_input_tokens_seen": 3016248, "step": 4600 }, { "epoch": 0.5047681683656692, "grad_norm": 5.296700954437256, "learning_rate": 2.4642739073293978e-05, "loss": 2.8482, "num_input_tokens_seen": 3019256, "step": 4605 }, { "epoch": 0.5053162336950564, "grad_norm": 11.357376098632812, "learning_rate": 2.459969906928458e-05, "loss": 2.8125, "num_input_tokens_seen": 3021936, "step": 4610 }, { "epoch": 0.5058642990244437, "grad_norm": 9.2806396484375, "learning_rate": 2.4556660252002384e-05, "loss": 3.1294, "num_input_tokens_seen": 3025888, "step": 4615 }, { "epoch": 0.506412364353831, "grad_norm": 7.156399250030518, "learning_rate": 2.451362274903973e-05, "loss": 3.202, "num_input_tokens_seen": 3029752, "step": 4620 }, { "epoch": 0.5069604296832182, "grad_norm": 7.298778533935547, "learning_rate": 2.4470586687985077e-05, "loss": 3.2958, "num_input_tokens_seen": 3033576, "step": 4625 }, { "epoch": 0.5075084950126055, "grad_norm": 7.478179454803467, "learning_rate": 2.4427552196422602e-05, "loss": 3.1416, "num_input_tokens_seen": 3037016, "step": 4630 }, { "epoch": 0.5080565603419928, "grad_norm": 8.109244346618652, "learning_rate": 2.438451940193181e-05, "loss": 2.7633, "num_input_tokens_seen": 3040640, "step": 4635 }, { "epoch": 0.50860462567138, "grad_norm": 6.991682052612305, "learning_rate": 2.434148843208722e-05, "loss": 2.9995, "num_input_tokens_seen": 3043424, "step": 4640 }, { "epoch": 0.5091526910007673, "grad_norm": 5.315702438354492, "learning_rate": 
2.4298459414457896e-05, "loss": 2.9122, "num_input_tokens_seen": 3046672, "step": 4645 }, { "epoch": 0.5097007563301545, "grad_norm": 8.090765953063965, "learning_rate": 2.425543247660713e-05, "loss": 3.3741, "num_input_tokens_seen": 3049736, "step": 4650 }, { "epoch": 0.5102488216595418, "grad_norm": 9.288080215454102, "learning_rate": 2.4212407746092066e-05, "loss": 3.4609, "num_input_tokens_seen": 3053656, "step": 4655 }, { "epoch": 0.5107968869889291, "grad_norm": 5.754721164703369, "learning_rate": 2.4169385350463282e-05, "loss": 2.9946, "num_input_tokens_seen": 3056144, "step": 4660 }, { "epoch": 0.5113449523183163, "grad_norm": 6.588372230529785, "learning_rate": 2.412636541726444e-05, "loss": 3.0074, "num_input_tokens_seen": 3059712, "step": 4665 }, { "epoch": 0.5118930176477036, "grad_norm": 7.401770114898682, "learning_rate": 2.4083348074031904e-05, "loss": 3.4029, "num_input_tokens_seen": 3062288, "step": 4670 }, { "epoch": 0.5124410829770909, "grad_norm": 5.612600803375244, "learning_rate": 2.4040333448294364e-05, "loss": 3.2012, "num_input_tokens_seen": 3065728, "step": 4675 }, { "epoch": 0.5129891483064781, "grad_norm": 5.925127983093262, "learning_rate": 2.399732166757243e-05, "loss": 3.0461, "num_input_tokens_seen": 3068632, "step": 4680 }, { "epoch": 0.5135372136358654, "grad_norm": 8.738677978515625, "learning_rate": 2.3954312859378325e-05, "loss": 3.4782, "num_input_tokens_seen": 3070968, "step": 4685 }, { "epoch": 0.5140852789652527, "grad_norm": 9.27092170715332, "learning_rate": 2.3911307151215413e-05, "loss": 3.2625, "num_input_tokens_seen": 3074696, "step": 4690 }, { "epoch": 0.5146333442946399, "grad_norm": 5.855086326599121, "learning_rate": 2.3868304670577886e-05, "loss": 3.045, "num_input_tokens_seen": 3078584, "step": 4695 }, { "epoch": 0.5151814096240271, "grad_norm": 8.794078826904297, "learning_rate": 2.3825305544950374e-05, "loss": 2.7209, "num_input_tokens_seen": 3081624, "step": 4700 }, { "epoch": 0.5157294749534145, "grad_norm": 
7.675835132598877, "learning_rate": 2.3782309901807555e-05, "loss": 3.3431, "num_input_tokens_seen": 3084152, "step": 4705 }, { "epoch": 0.5162775402828017, "grad_norm": 7.583930969238281, "learning_rate": 2.3739317868613776e-05, "loss": 3.1141, "num_input_tokens_seen": 3087040, "step": 4710 }, { "epoch": 0.5168256056121889, "grad_norm": 7.561563968658447, "learning_rate": 2.369632957282269e-05, "loss": 3.4023, "num_input_tokens_seen": 3090352, "step": 4715 }, { "epoch": 0.5173736709415763, "grad_norm": 6.868551254272461, "learning_rate": 2.365334514187687e-05, "loss": 3.0766, "num_input_tokens_seen": 3093552, "step": 4720 }, { "epoch": 0.5179217362709635, "grad_norm": 5.663219928741455, "learning_rate": 2.3610364703207432e-05, "loss": 3.1136, "num_input_tokens_seen": 3097168, "step": 4725 }, { "epoch": 0.5184698016003507, "grad_norm": 7.611098766326904, "learning_rate": 2.3567388384233648e-05, "loss": 3.0911, "num_input_tokens_seen": 3101648, "step": 4730 }, { "epoch": 0.5190178669297381, "grad_norm": 6.850576877593994, "learning_rate": 2.352441631236259e-05, "loss": 2.9311, "num_input_tokens_seen": 3105888, "step": 4735 }, { "epoch": 0.5195659322591253, "grad_norm": 5.57901668548584, "learning_rate": 2.348144861498873e-05, "loss": 3.0239, "num_input_tokens_seen": 3110648, "step": 4740 }, { "epoch": 0.5201139975885125, "grad_norm": 6.950675010681152, "learning_rate": 2.343848541949356e-05, "loss": 3.053, "num_input_tokens_seen": 3113400, "step": 4745 }, { "epoch": 0.5206620629178998, "grad_norm": 5.661995887756348, "learning_rate": 2.3395526853245264e-05, "loss": 3.2619, "num_input_tokens_seen": 3117000, "step": 4750 }, { "epoch": 0.5212101282472871, "grad_norm": 6.956995010375977, "learning_rate": 2.3352573043598267e-05, "loss": 3.6572, "num_input_tokens_seen": 3121664, "step": 4755 }, { "epoch": 0.5217581935766743, "grad_norm": 4.707006454467773, "learning_rate": 2.3309624117892885e-05, "loss": 2.9066, "num_input_tokens_seen": 3124872, "step": 4760 }, { "epoch": 
0.5223062589060616, "grad_norm": 5.503338813781738, "learning_rate": 2.3266680203455004e-05, "loss": 3.2066, "num_input_tokens_seen": 3128760, "step": 4765 }, { "epoch": 0.5228543242354489, "grad_norm": 7.054602146148682, "learning_rate": 2.322374142759561e-05, "loss": 2.8683, "num_input_tokens_seen": 3131480, "step": 4770 }, { "epoch": 0.5234023895648361, "grad_norm": 8.06494140625, "learning_rate": 2.318080791761046e-05, "loss": 3.2634, "num_input_tokens_seen": 3135040, "step": 4775 }, { "epoch": 0.5239504548942234, "grad_norm": 8.718894958496094, "learning_rate": 2.313787980077972e-05, "loss": 3.3735, "num_input_tokens_seen": 3137816, "step": 4780 }, { "epoch": 0.5244985202236107, "grad_norm": 6.601426124572754, "learning_rate": 2.309495720436755e-05, "loss": 3.0622, "num_input_tokens_seen": 3141752, "step": 4785 }, { "epoch": 0.5250465855529979, "grad_norm": 7.08184814453125, "learning_rate": 2.305204025562174e-05, "loss": 2.6361, "num_input_tokens_seen": 3144792, "step": 4790 }, { "epoch": 0.5255946508823852, "grad_norm": 8.298012733459473, "learning_rate": 2.3009129081773366e-05, "loss": 2.8071, "num_input_tokens_seen": 3147904, "step": 4795 }, { "epoch": 0.5261427162117724, "grad_norm": 7.070413589477539, "learning_rate": 2.2966223810036357e-05, "loss": 3.2667, "num_input_tokens_seen": 3150344, "step": 4800 }, { "epoch": 0.5266907815411597, "grad_norm": 8.037806510925293, "learning_rate": 2.292332456760714e-05, "loss": 3.3148, "num_input_tokens_seen": 3154328, "step": 4805 }, { "epoch": 0.527238846870547, "grad_norm": 5.284430980682373, "learning_rate": 2.2880431481664306e-05, "loss": 2.6196, "num_input_tokens_seen": 3157392, "step": 4810 }, { "epoch": 0.5277869121999342, "grad_norm": 7.804793357849121, "learning_rate": 2.283754467936815e-05, "loss": 2.9899, "num_input_tokens_seen": 3160304, "step": 4815 }, { "epoch": 0.5283349775293215, "grad_norm": 8.394335746765137, "learning_rate": 2.279466428786035e-05, "loss": 3.2071, "num_input_tokens_seen": 3163736, 
"step": 4820 }, { "epoch": 0.5288830428587088, "grad_norm": 6.269372463226318, "learning_rate": 2.2751790434263608e-05, "loss": 3.1003, "num_input_tokens_seen": 3166368, "step": 4825 }, { "epoch": 0.529431108188096, "grad_norm": 7.112332820892334, "learning_rate": 2.2708923245681203e-05, "loss": 3.2725, "num_input_tokens_seen": 3169960, "step": 4830 }, { "epoch": 0.5299791735174832, "grad_norm": 8.58667278289795, "learning_rate": 2.266606284919667e-05, "loss": 2.7479, "num_input_tokens_seen": 3172744, "step": 4835 }, { "epoch": 0.5305272388468706, "grad_norm": 7.745898723602295, "learning_rate": 2.262320937187344e-05, "loss": 3.4911, "num_input_tokens_seen": 3175984, "step": 4840 }, { "epoch": 0.5310753041762578, "grad_norm": 6.885601997375488, "learning_rate": 2.258036294075438e-05, "loss": 2.8831, "num_input_tokens_seen": 3178800, "step": 4845 }, { "epoch": 0.531623369505645, "grad_norm": 6.387146472930908, "learning_rate": 2.2537523682861484e-05, "loss": 3.0745, "num_input_tokens_seen": 3182328, "step": 4850 }, { "epoch": 0.5321714348350324, "grad_norm": 4.868107795715332, "learning_rate": 2.249469172519551e-05, "loss": 3.0048, "num_input_tokens_seen": 3185912, "step": 4855 }, { "epoch": 0.5327195001644196, "grad_norm": 8.075777053833008, "learning_rate": 2.2451867194735542e-05, "loss": 3.3234, "num_input_tokens_seen": 3189352, "step": 4860 }, { "epoch": 0.5332675654938068, "grad_norm": 5.830811500549316, "learning_rate": 2.2409050218438645e-05, "loss": 3.0588, "num_input_tokens_seen": 3193072, "step": 4865 }, { "epoch": 0.5338156308231942, "grad_norm": 5.349551200866699, "learning_rate": 2.2366240923239514e-05, "loss": 2.7223, "num_input_tokens_seen": 3196104, "step": 4870 }, { "epoch": 0.5343636961525814, "grad_norm": 8.454142570495605, "learning_rate": 2.2323439436050054e-05, "loss": 3.1157, "num_input_tokens_seen": 3198648, "step": 4875 }, { "epoch": 0.5349117614819686, "grad_norm": 7.110290050506592, "learning_rate": 2.2280645883759006e-05, "loss": 3.0379, 
"num_input_tokens_seen": 3201056, "step": 4880 }, { "epoch": 0.535459826811356, "grad_norm": 5.1915154457092285, "learning_rate": 2.2237860393231634e-05, "loss": 3.575, "num_input_tokens_seen": 3203712, "step": 4885 }, { "epoch": 0.5360078921407432, "grad_norm": 8.497429847717285, "learning_rate": 2.219508309130927e-05, "loss": 2.9379, "num_input_tokens_seen": 3206288, "step": 4890 }, { "epoch": 0.5365559574701304, "grad_norm": 8.26462173461914, "learning_rate": 2.2152314104808956e-05, "loss": 3.1587, "num_input_tokens_seen": 3209928, "step": 4895 }, { "epoch": 0.5371040227995177, "grad_norm": 6.499933242797852, "learning_rate": 2.210955356052313e-05, "loss": 2.9181, "num_input_tokens_seen": 3213336, "step": 4900 }, { "epoch": 0.537652088128905, "grad_norm": 5.8398590087890625, "learning_rate": 2.2066801585219156e-05, "loss": 2.8303, "num_input_tokens_seen": 3216464, "step": 4905 }, { "epoch": 0.5382001534582922, "grad_norm": 6.813495635986328, "learning_rate": 2.2024058305639015e-05, "loss": 2.9079, "num_input_tokens_seen": 3221256, "step": 4910 }, { "epoch": 0.5387482187876795, "grad_norm": 8.064513206481934, "learning_rate": 2.198132384849891e-05, "loss": 3.2373, "num_input_tokens_seen": 3224320, "step": 4915 }, { "epoch": 0.5392962841170668, "grad_norm": 7.14154577255249, "learning_rate": 2.1938598340488886e-05, "loss": 3.0737, "num_input_tokens_seen": 3227128, "step": 4920 }, { "epoch": 0.539844349446454, "grad_norm": 6.514719009399414, "learning_rate": 2.1895881908272446e-05, "loss": 2.8825, "num_input_tokens_seen": 3230352, "step": 4925 }, { "epoch": 0.5403924147758413, "grad_norm": 7.076175212860107, "learning_rate": 2.1853174678486213e-05, "loss": 2.8721, "num_input_tokens_seen": 3234440, "step": 4930 }, { "epoch": 0.5409404801052285, "grad_norm": 5.526149749755859, "learning_rate": 2.1810476777739508e-05, "loss": 3.1112, "num_input_tokens_seen": 3238176, "step": 4935 }, { "epoch": 0.5414885454346158, "grad_norm": 8.458449363708496, "learning_rate": 
2.176778833261399e-05, "loss": 3.2798, "num_input_tokens_seen": 3241728, "step": 4940 }, { "epoch": 0.5420366107640031, "grad_norm": 7.216832160949707, "learning_rate": 2.1725109469663318e-05, "loss": 3.1847, "num_input_tokens_seen": 3244416, "step": 4945 }, { "epoch": 0.5425846760933903, "grad_norm": 5.6720147132873535, "learning_rate": 2.168244031541271e-05, "loss": 3.4552, "num_input_tokens_seen": 3247816, "step": 4950 }, { "epoch": 0.5431327414227776, "grad_norm": 7.452066898345947, "learning_rate": 2.163978099635861e-05, "loss": 2.958, "num_input_tokens_seen": 3250432, "step": 4955 }, { "epoch": 0.5436808067521649, "grad_norm": 6.589701175689697, "learning_rate": 2.159713163896832e-05, "loss": 3.4633, "num_input_tokens_seen": 3253376, "step": 4960 }, { "epoch": 0.5442288720815521, "grad_norm": 4.926830768585205, "learning_rate": 2.1554492369679598e-05, "loss": 3.0458, "num_input_tokens_seen": 3257640, "step": 4965 }, { "epoch": 0.5447769374109394, "grad_norm": 8.084177017211914, "learning_rate": 2.1511863314900275e-05, "loss": 2.992, "num_input_tokens_seen": 3261952, "step": 4970 }, { "epoch": 0.5453250027403267, "grad_norm": 5.291374683380127, "learning_rate": 2.146924460100795e-05, "loss": 2.5116, "num_input_tokens_seen": 3265912, "step": 4975 }, { "epoch": 0.5458730680697139, "grad_norm": 9.101826667785645, "learning_rate": 2.1426636354349523e-05, "loss": 3.0809, "num_input_tokens_seen": 3269624, "step": 4980 }, { "epoch": 0.5464211333991011, "grad_norm": 9.933355331420898, "learning_rate": 2.1384038701240865e-05, "loss": 2.6956, "num_input_tokens_seen": 3273112, "step": 4985 }, { "epoch": 0.5469691987284885, "grad_norm": 8.288704872131348, "learning_rate": 2.1341451767966475e-05, "loss": 3.319, "num_input_tokens_seen": 3275624, "step": 4990 }, { "epoch": 0.5475172640578757, "grad_norm": 6.39847469329834, "learning_rate": 2.129887568077904e-05, "loss": 3.0552, "num_input_tokens_seen": 3279792, "step": 4995 }, { "epoch": 0.5480653293872629, "grad_norm": 
6.739533424377441, "learning_rate": 2.12563105658991e-05, "loss": 3.1218, "num_input_tokens_seen": 3283560, "step": 5000 }, { "epoch": 0.5486133947166503, "grad_norm": 7.888918399810791, "learning_rate": 2.1213756549514674e-05, "loss": 3.0369, "num_input_tokens_seen": 3286504, "step": 5005 }, { "epoch": 0.5491614600460375, "grad_norm": 6.957367897033691, "learning_rate": 2.1171213757780873e-05, "loss": 2.9968, "num_input_tokens_seen": 3289512, "step": 5010 }, { "epoch": 0.5497095253754247, "grad_norm": 6.351596355438232, "learning_rate": 2.1128682316819522e-05, "loss": 3.0657, "num_input_tokens_seen": 3293512, "step": 5015 }, { "epoch": 0.5502575907048121, "grad_norm": 7.056116104125977, "learning_rate": 2.1086162352718825e-05, "loss": 3.029, "num_input_tokens_seen": 3298024, "step": 5020 }, { "epoch": 0.5508056560341993, "grad_norm": 6.343071937561035, "learning_rate": 2.1043653991532934e-05, "loss": 2.8398, "num_input_tokens_seen": 3301000, "step": 5025 }, { "epoch": 0.5513537213635865, "grad_norm": 8.5012788772583, "learning_rate": 2.1001157359281605e-05, "loss": 3.1406, "num_input_tokens_seen": 3304064, "step": 5030 }, { "epoch": 0.5519017866929739, "grad_norm": 5.8669819831848145, "learning_rate": 2.095867258194984e-05, "loss": 2.7844, "num_input_tokens_seen": 3308616, "step": 5035 }, { "epoch": 0.5524498520223611, "grad_norm": 6.373290061950684, "learning_rate": 2.0916199785487488e-05, "loss": 3.346, "num_input_tokens_seen": 3312128, "step": 5040 }, { "epoch": 0.5529979173517483, "grad_norm": 7.038343906402588, "learning_rate": 2.0873739095808865e-05, "loss": 3.1385, "num_input_tokens_seen": 3315040, "step": 5045 }, { "epoch": 0.5535459826811356, "grad_norm": 7.340169429779053, "learning_rate": 2.083129063879242e-05, "loss": 2.9194, "num_input_tokens_seen": 3319432, "step": 5050 }, { "epoch": 0.5540940480105229, "grad_norm": 5.199733734130859, "learning_rate": 2.0788854540280315e-05, "loss": 3.5487, "num_input_tokens_seen": 3322568, "step": 5055 }, { "epoch": 
0.5546421133399101, "grad_norm": 7.935201168060303, "learning_rate": 2.0746430926078086e-05, "loss": 2.8886, "num_input_tokens_seen": 3325536, "step": 5060 }, { "epoch": 0.5551901786692974, "grad_norm": 7.43034029006958, "learning_rate": 2.0704019921954264e-05, "loss": 3.0405, "num_input_tokens_seen": 3329312, "step": 5065 }, { "epoch": 0.5557382439986847, "grad_norm": 5.411002159118652, "learning_rate": 2.0661621653639987e-05, "loss": 3.1599, "num_input_tokens_seen": 3333232, "step": 5070 }, { "epoch": 0.5562863093280719, "grad_norm": 8.897222518920898, "learning_rate": 2.0619236246828622e-05, "loss": 2.8413, "num_input_tokens_seen": 3336312, "step": 5075 }, { "epoch": 0.5568343746574592, "grad_norm": 8.512425422668457, "learning_rate": 2.0576863827175447e-05, "loss": 2.9528, "num_input_tokens_seen": 3339344, "step": 5080 }, { "epoch": 0.5573824399868464, "grad_norm": 7.003962516784668, "learning_rate": 2.0534504520297203e-05, "loss": 3.3579, "num_input_tokens_seen": 3342520, "step": 5085 }, { "epoch": 0.5579305053162337, "grad_norm": 6.14302396774292, "learning_rate": 2.0492158451771767e-05, "loss": 3.3721, "num_input_tokens_seen": 3346272, "step": 5090 }, { "epoch": 0.558478570645621, "grad_norm": 8.199108123779297, "learning_rate": 2.0449825747137778e-05, "loss": 2.9852, "num_input_tokens_seen": 3350232, "step": 5095 }, { "epoch": 0.5590266359750082, "grad_norm": 7.849426746368408, "learning_rate": 2.0407506531894245e-05, "loss": 3.1338, "num_input_tokens_seen": 3353144, "step": 5100 }, { "epoch": 0.5595747013043955, "grad_norm": 6.752470016479492, "learning_rate": 2.0365200931500177e-05, "loss": 2.9589, "num_input_tokens_seen": 3356952, "step": 5105 }, { "epoch": 0.5601227666337828, "grad_norm": 7.846312046051025, "learning_rate": 2.0322909071374265e-05, "loss": 3.2629, "num_input_tokens_seen": 3360424, "step": 5110 }, { "epoch": 0.56067083196317, "grad_norm": 6.629732131958008, "learning_rate": 2.028063107689442e-05, "loss": 3.2232, "num_input_tokens_seen": 
3363544, "step": 5115 }, { "epoch": 0.5612188972925573, "grad_norm": 7.26005220413208, "learning_rate": 2.023836707339745e-05, "loss": 3.2771, "num_input_tokens_seen": 3366664, "step": 5120 }, { "epoch": 0.5617669626219446, "grad_norm": 7.383485317230225, "learning_rate": 2.0196117186178727e-05, "loss": 2.8273, "num_input_tokens_seen": 3369848, "step": 5125 }, { "epoch": 0.5623150279513318, "grad_norm": 7.374210357666016, "learning_rate": 2.015388154049173e-05, "loss": 3.2708, "num_input_tokens_seen": 3373208, "step": 5130 }, { "epoch": 0.562863093280719, "grad_norm": 6.803157329559326, "learning_rate": 2.0111660261547728e-05, "loss": 3.1036, "num_input_tokens_seen": 3376872, "step": 5135 }, { "epoch": 0.5634111586101064, "grad_norm": 6.192258358001709, "learning_rate": 2.006945347451541e-05, "loss": 3.0572, "num_input_tokens_seen": 3382136, "step": 5140 }, { "epoch": 0.5639592239394936, "grad_norm": 9.468875885009766, "learning_rate": 2.00272613045205e-05, "loss": 3.2346, "num_input_tokens_seen": 3385456, "step": 5145 }, { "epoch": 0.5645072892688808, "grad_norm": 6.274002552032471, "learning_rate": 1.9985083876645368e-05, "loss": 3.1731, "num_input_tokens_seen": 3388976, "step": 5150 }, { "epoch": 0.5650553545982682, "grad_norm": 5.550570487976074, "learning_rate": 1.994292131592872e-05, "loss": 3.2257, "num_input_tokens_seen": 3392736, "step": 5155 }, { "epoch": 0.5656034199276554, "grad_norm": 8.218210220336914, "learning_rate": 1.990077374736515e-05, "loss": 3.0855, "num_input_tokens_seen": 3396128, "step": 5160 }, { "epoch": 0.5661514852570426, "grad_norm": 7.721156597137451, "learning_rate": 1.9858641295904813e-05, "loss": 2.9721, "num_input_tokens_seen": 3399376, "step": 5165 }, { "epoch": 0.56669955058643, "grad_norm": 6.2414231300354, "learning_rate": 1.981652408645307e-05, "loss": 3.3822, "num_input_tokens_seen": 3401928, "step": 5170 }, { "epoch": 0.5672476159158172, "grad_norm": 8.496658325195312, "learning_rate": 1.9774422243870078e-05, "loss": 
3.0474, "num_input_tokens_seen": 3404744, "step": 5175 }, { "epoch": 0.5677956812452044, "grad_norm": 7.224369049072266, "learning_rate": 1.9732335892970427e-05, "loss": 3.259, "num_input_tokens_seen": 3407824, "step": 5180 }, { "epoch": 0.5683437465745917, "grad_norm": 9.386946678161621, "learning_rate": 1.969026515852281e-05, "loss": 3.0473, "num_input_tokens_seen": 3410608, "step": 5185 }, { "epoch": 0.568891811903979, "grad_norm": 8.189655303955078, "learning_rate": 1.96482101652496e-05, "loss": 3.3926, "num_input_tokens_seen": 3413592, "step": 5190 }, { "epoch": 0.5694398772333662, "grad_norm": 6.405150890350342, "learning_rate": 1.9606171037826502e-05, "loss": 2.9921, "num_input_tokens_seen": 3417320, "step": 5195 }, { "epoch": 0.5699879425627535, "grad_norm": 6.89292573928833, "learning_rate": 1.9564147900882213e-05, "loss": 2.9261, "num_input_tokens_seen": 3420888, "step": 5200 }, { "epoch": 0.5705360078921408, "grad_norm": 6.517080307006836, "learning_rate": 1.9522140878997995e-05, "loss": 3.3255, "num_input_tokens_seen": 3424336, "step": 5205 }, { "epoch": 0.571084073221528, "grad_norm": 8.910572052001953, "learning_rate": 1.9480150096707344e-05, "loss": 2.9723, "num_input_tokens_seen": 3428120, "step": 5210 }, { "epoch": 0.5716321385509152, "grad_norm": 8.455070495605469, "learning_rate": 1.943817567849563e-05, "loss": 3.0703, "num_input_tokens_seen": 3430880, "step": 5215 }, { "epoch": 0.5721802038803026, "grad_norm": 6.948888778686523, "learning_rate": 1.9396217748799682e-05, "loss": 2.9862, "num_input_tokens_seen": 3435560, "step": 5220 }, { "epoch": 0.5727282692096898, "grad_norm": 6.147201061248779, "learning_rate": 1.935427643200746e-05, "loss": 3.0719, "num_input_tokens_seen": 3438352, "step": 5225 }, { "epoch": 0.573276334539077, "grad_norm": 7.213772773742676, "learning_rate": 1.9312351852457686e-05, "loss": 2.9474, "num_input_tokens_seen": 3441216, "step": 5230 }, { "epoch": 0.5738243998684643, "grad_norm": 6.16003942489624, "learning_rate": 
1.9270444134439434e-05, "loss": 3.0849, "num_input_tokens_seen": 3444944, "step": 5235 }, { "epoch": 0.5743724651978516, "grad_norm": 7.64081335067749, "learning_rate": 1.9228553402191822e-05, "loss": 3.0799, "num_input_tokens_seen": 3449568, "step": 5240 }, { "epoch": 0.5749205305272388, "grad_norm": 7.353094577789307, "learning_rate": 1.91866797799036e-05, "loss": 3.3501, "num_input_tokens_seen": 3452544, "step": 5245 }, { "epoch": 0.5754685958566261, "grad_norm": 7.696213722229004, "learning_rate": 1.9144823391712785e-05, "loss": 3.2286, "num_input_tokens_seen": 3455600, "step": 5250 }, { "epoch": 0.5760166611860134, "grad_norm": 9.90982723236084, "learning_rate": 1.91029843617063e-05, "loss": 3.3799, "num_input_tokens_seen": 3458728, "step": 5255 }, { "epoch": 0.5765647265154006, "grad_norm": 6.676484107971191, "learning_rate": 1.9061162813919637e-05, "loss": 3.2611, "num_input_tokens_seen": 3461888, "step": 5260 }, { "epoch": 0.5771127918447879, "grad_norm": 6.546321868896484, "learning_rate": 1.9019358872336428e-05, "loss": 2.9518, "num_input_tokens_seen": 3464880, "step": 5265 }, { "epoch": 0.5776608571741751, "grad_norm": 5.9848151206970215, "learning_rate": 1.8977572660888122e-05, "loss": 3.1144, "num_input_tokens_seen": 3467712, "step": 5270 }, { "epoch": 0.5782089225035624, "grad_norm": 6.030148506164551, "learning_rate": 1.8935804303453612e-05, "loss": 3.0001, "num_input_tokens_seen": 3471760, "step": 5275 }, { "epoch": 0.5787569878329497, "grad_norm": 9.319378852844238, "learning_rate": 1.8894053923858857e-05, "loss": 2.7935, "num_input_tokens_seen": 3475928, "step": 5280 }, { "epoch": 0.5793050531623369, "grad_norm": 7.607476711273193, "learning_rate": 1.8852321645876507e-05, "loss": 2.9319, "num_input_tokens_seen": 3478968, "step": 5285 }, { "epoch": 0.5798531184917242, "grad_norm": 7.065295219421387, "learning_rate": 1.8810607593225567e-05, "loss": 2.9655, "num_input_tokens_seen": 3482160, "step": 5290 }, { "epoch": 0.5804011838211115, "grad_norm": 
6.528260707855225, "learning_rate": 1.8768911889571002e-05, "loss": 3.0625, "num_input_tokens_seen": 3486016, "step": 5295 }, { "epoch": 0.5809492491504987, "grad_norm": 8.56631851196289, "learning_rate": 1.8727234658523368e-05, "loss": 3.1642, "num_input_tokens_seen": 3488552, "step": 5300 }, { "epoch": 0.581497314479886, "grad_norm": 6.70935583114624, "learning_rate": 1.8685576023638495e-05, "loss": 2.908, "num_input_tokens_seen": 3492192, "step": 5305 }, { "epoch": 0.5820453798092733, "grad_norm": 9.139800071716309, "learning_rate": 1.864393610841704e-05, "loss": 3.0694, "num_input_tokens_seen": 3495032, "step": 5310 }, { "epoch": 0.5825934451386605, "grad_norm": 6.343008041381836, "learning_rate": 1.8602315036304175e-05, "loss": 2.939, "num_input_tokens_seen": 3498288, "step": 5315 }, { "epoch": 0.5831415104680477, "grad_norm": 6.961386203765869, "learning_rate": 1.8560712930689238e-05, "loss": 2.7722, "num_input_tokens_seen": 3501112, "step": 5320 }, { "epoch": 0.5836895757974351, "grad_norm": 8.582582473754883, "learning_rate": 1.851912991490531e-05, "loss": 3.0957, "num_input_tokens_seen": 3504384, "step": 5325 }, { "epoch": 0.5842376411268223, "grad_norm": 6.227029800415039, "learning_rate": 1.8477566112228878e-05, "loss": 3.2204, "num_input_tokens_seen": 3508024, "step": 5330 }, { "epoch": 0.5847857064562095, "grad_norm": 6.587297439575195, "learning_rate": 1.8436021645879494e-05, "loss": 3.1471, "num_input_tokens_seen": 3511392, "step": 5335 }, { "epoch": 0.5853337717855969, "grad_norm": 5.520746231079102, "learning_rate": 1.839449663901936e-05, "loss": 2.9406, "num_input_tokens_seen": 3514568, "step": 5340 }, { "epoch": 0.5858818371149841, "grad_norm": 5.80632209777832, "learning_rate": 1.8352991214752983e-05, "loss": 2.9652, "num_input_tokens_seen": 3517672, "step": 5345 }, { "epoch": 0.5864299024443713, "grad_norm": 4.704535484313965, "learning_rate": 1.8311505496126868e-05, "loss": 2.7212, "num_input_tokens_seen": 3522392, "step": 5350 }, { "epoch": 
0.5869779677737587, "grad_norm": 12.650748252868652, "learning_rate": 1.8270039606129045e-05, "loss": 3.7118, "num_input_tokens_seen": 3526336, "step": 5355 }, { "epoch": 0.5875260331031459, "grad_norm": 9.578808784484863, "learning_rate": 1.8228593667688772e-05, "loss": 3.2441, "num_input_tokens_seen": 3530656, "step": 5360 }, { "epoch": 0.5880740984325331, "grad_norm": 6.2789812088012695, "learning_rate": 1.818716780367618e-05, "loss": 2.7651, "num_input_tokens_seen": 3533184, "step": 5365 }, { "epoch": 0.5886221637619204, "grad_norm": 8.422161102294922, "learning_rate": 1.8145762136901874e-05, "loss": 3.3134, "num_input_tokens_seen": 3536976, "step": 5370 }, { "epoch": 0.5891702290913077, "grad_norm": 7.674281597137451, "learning_rate": 1.8104376790116572e-05, "loss": 3.1223, "num_input_tokens_seen": 3540496, "step": 5375 }, { "epoch": 0.5897182944206949, "grad_norm": 7.617640495300293, "learning_rate": 1.8063011886010777e-05, "loss": 3.4106, "num_input_tokens_seen": 3542952, "step": 5380 }, { "epoch": 0.5902663597500822, "grad_norm": 6.847158908843994, "learning_rate": 1.8021667547214367e-05, "loss": 3.4031, "num_input_tokens_seen": 3545952, "step": 5385 }, { "epoch": 0.5908144250794695, "grad_norm": 7.656712532043457, "learning_rate": 1.7980343896296243e-05, "loss": 3.1261, "num_input_tokens_seen": 3548960, "step": 5390 }, { "epoch": 0.5913624904088567, "grad_norm": 6.854838848114014, "learning_rate": 1.7939041055764015e-05, "loss": 2.8715, "num_input_tokens_seen": 3552888, "step": 5395 }, { "epoch": 0.591910555738244, "grad_norm": 7.809703350067139, "learning_rate": 1.789775914806357e-05, "loss": 3.0002, "num_input_tokens_seen": 3556448, "step": 5400 }, { "epoch": 0.5924586210676313, "grad_norm": 9.405502319335938, "learning_rate": 1.785649829557873e-05, "loss": 3.4519, "num_input_tokens_seen": 3560392, "step": 5405 }, { "epoch": 0.5930066863970185, "grad_norm": 9.429394721984863, "learning_rate": 1.781525862063092e-05, "loss": 3.2288, 
"num_input_tokens_seen": 3563680, "step": 5410 }, { "epoch": 0.5935547517264058, "grad_norm": 6.114898204803467, "learning_rate": 1.7774040245478767e-05, "loss": 3.3265, "num_input_tokens_seen": 3567200, "step": 5415 }, { "epoch": 0.594102817055793, "grad_norm": 6.565958499908447, "learning_rate": 1.7732843292317757e-05, "loss": 3.0318, "num_input_tokens_seen": 3570120, "step": 5420 }, { "epoch": 0.5946508823851803, "grad_norm": 7.470787048339844, "learning_rate": 1.7691667883279877e-05, "loss": 2.9758, "num_input_tokens_seen": 3573704, "step": 5425 }, { "epoch": 0.5951989477145676, "grad_norm": 6.305603504180908, "learning_rate": 1.7650514140433226e-05, "loss": 2.8946, "num_input_tokens_seen": 3577472, "step": 5430 }, { "epoch": 0.5957470130439548, "grad_norm": 7.486173629760742, "learning_rate": 1.760938218578168e-05, "loss": 3.0453, "num_input_tokens_seen": 3579928, "step": 5435 }, { "epoch": 0.5962950783733421, "grad_norm": 5.27332067489624, "learning_rate": 1.7568272141264542e-05, "loss": 3.0027, "num_input_tokens_seen": 3582744, "step": 5440 }, { "epoch": 0.5968431437027294, "grad_norm": 5.261857986450195, "learning_rate": 1.752718412875613e-05, "loss": 3.373, "num_input_tokens_seen": 3586344, "step": 5445 }, { "epoch": 0.5973912090321166, "grad_norm": 7.151644706726074, "learning_rate": 1.748611827006545e-05, "loss": 3.0059, "num_input_tokens_seen": 3590696, "step": 5450 }, { "epoch": 0.5979392743615038, "grad_norm": 6.867771148681641, "learning_rate": 1.7445074686935865e-05, "loss": 2.9594, "num_input_tokens_seen": 3593960, "step": 5455 }, { "epoch": 0.5984873396908912, "grad_norm": 10.243605613708496, "learning_rate": 1.740405350104466e-05, "loss": 3.1614, "num_input_tokens_seen": 3597248, "step": 5460 }, { "epoch": 0.5990354050202784, "grad_norm": 7.2442827224731445, "learning_rate": 1.736305483400273e-05, "loss": 3.444, "num_input_tokens_seen": 3600048, "step": 5465 }, { "epoch": 0.5995834703496656, "grad_norm": 8.634395599365234, "learning_rate": 
1.7322078807354232e-05, "loss": 3.6502, "num_input_tokens_seen": 3603160, "step": 5470 }, { "epoch": 0.600131535679053, "grad_norm": 7.339416027069092, "learning_rate": 1.728112554257618e-05, "loss": 2.9444, "num_input_tokens_seen": 3606976, "step": 5475 }, { "epoch": 0.6006796010084402, "grad_norm": 6.438117027282715, "learning_rate": 1.7240195161078112e-05, "loss": 2.7825, "num_input_tokens_seen": 3610368, "step": 5480 }, { "epoch": 0.6012276663378274, "grad_norm": 8.13581657409668, "learning_rate": 1.7199287784201752e-05, "loss": 3.1469, "num_input_tokens_seen": 3613240, "step": 5485 }, { "epoch": 0.6017757316672148, "grad_norm": 9.25243854522705, "learning_rate": 1.715840353322059e-05, "loss": 3.1494, "num_input_tokens_seen": 3616384, "step": 5490 }, { "epoch": 0.602323796996602, "grad_norm": 6.846777439117432, "learning_rate": 1.7117542529339564e-05, "loss": 3.0651, "num_input_tokens_seen": 3620600, "step": 5495 }, { "epoch": 0.6028718623259892, "grad_norm": 9.576505661010742, "learning_rate": 1.7076704893694725e-05, "loss": 3.2062, "num_input_tokens_seen": 3624184, "step": 5500 }, { "epoch": 0.6034199276553766, "grad_norm": 5.831842422485352, "learning_rate": 1.7035890747352812e-05, "loss": 2.9302, "num_input_tokens_seen": 3628160, "step": 5505 }, { "epoch": 0.6039679929847638, "grad_norm": 6.526121139526367, "learning_rate": 1.699510021131093e-05, "loss": 3.0619, "num_input_tokens_seen": 3632144, "step": 5510 }, { "epoch": 0.604516058314151, "grad_norm": 8.087743759155273, "learning_rate": 1.695433340649622e-05, "loss": 3.1402, "num_input_tokens_seen": 3635512, "step": 5515 }, { "epoch": 0.6050641236435383, "grad_norm": 4.840604305267334, "learning_rate": 1.6913590453765436e-05, "loss": 3.0223, "num_input_tokens_seen": 3638824, "step": 5520 }, { "epoch": 0.6056121889729256, "grad_norm": 7.919428825378418, "learning_rate": 1.687287147390463e-05, "loss": 2.7976, "num_input_tokens_seen": 3642704, "step": 5525 }, { "epoch": 0.6061602543023128, "grad_norm": 
5.97782039642334, "learning_rate": 1.6832176587628784e-05, "loss": 2.9795, "num_input_tokens_seen": 3645432, "step": 5530 }, { "epoch": 0.6067083196317001, "grad_norm": 7.9558539390563965, "learning_rate": 1.6791505915581474e-05, "loss": 3.0965, "num_input_tokens_seen": 3647912, "step": 5535 }, { "epoch": 0.6072563849610874, "grad_norm": 7.399658203125, "learning_rate": 1.675085957833446e-05, "loss": 3.0064, "num_input_tokens_seen": 3651176, "step": 5540 }, { "epoch": 0.6078044502904746, "grad_norm": 5.475082874298096, "learning_rate": 1.6710237696387364e-05, "loss": 3.0204, "num_input_tokens_seen": 3653864, "step": 5545 }, { "epoch": 0.6083525156198619, "grad_norm": 7.328055381774902, "learning_rate": 1.666964039016734e-05, "loss": 3.4209, "num_input_tokens_seen": 3656896, "step": 5550 }, { "epoch": 0.6089005809492491, "grad_norm": 6.844607353210449, "learning_rate": 1.6629067780028643e-05, "loss": 2.8587, "num_input_tokens_seen": 3660032, "step": 5555 }, { "epoch": 0.6094486462786364, "grad_norm": 8.957280158996582, "learning_rate": 1.6588519986252334e-05, "loss": 3.3932, "num_input_tokens_seen": 3662592, "step": 5560 }, { "epoch": 0.6099967116080237, "grad_norm": 6.236993789672852, "learning_rate": 1.6547997129045907e-05, "loss": 2.8217, "num_input_tokens_seen": 3665480, "step": 5565 }, { "epoch": 0.6105447769374109, "grad_norm": 6.7575201988220215, "learning_rate": 1.6507499328542926e-05, "loss": 3.1285, "num_input_tokens_seen": 3668296, "step": 5570 }, { "epoch": 0.6110928422667982, "grad_norm": 6.297115802764893, "learning_rate": 1.6467026704802652e-05, "loss": 3.0519, "num_input_tokens_seen": 3671088, "step": 5575 }, { "epoch": 0.6116409075961855, "grad_norm": 5.6386003494262695, "learning_rate": 1.6426579377809755e-05, "loss": 3.0005, "num_input_tokens_seen": 3674856, "step": 5580 }, { "epoch": 0.6121889729255727, "grad_norm": 5.507198333740234, "learning_rate": 1.6386157467473867e-05, "loss": 3.0995, "num_input_tokens_seen": 3677256, "step": 5585 }, { 
"epoch": 0.61273703825496, "grad_norm": 6.467530250549316, "learning_rate": 1.6345761093629276e-05, "loss": 3.1279, "num_input_tokens_seen": 3680248, "step": 5590 }, { "epoch": 0.6132851035843473, "grad_norm": 6.12019681930542, "learning_rate": 1.630539037603459e-05, "loss": 3.0768, "num_input_tokens_seen": 3683464, "step": 5595 }, { "epoch": 0.6138331689137345, "grad_norm": 6.198227882385254, "learning_rate": 1.626504543437234e-05, "loss": 3.1144, "num_input_tokens_seen": 3686448, "step": 5600 }, { "epoch": 0.6143812342431217, "grad_norm": 8.729185104370117, "learning_rate": 1.6224726388248622e-05, "loss": 3.2992, "num_input_tokens_seen": 3690360, "step": 5605 }, { "epoch": 0.6149292995725091, "grad_norm": 8.366303443908691, "learning_rate": 1.618443335719281e-05, "loss": 3.1796, "num_input_tokens_seen": 3693344, "step": 5610 }, { "epoch": 0.6154773649018963, "grad_norm": 5.997150897979736, "learning_rate": 1.614416646065711e-05, "loss": 3.0782, "num_input_tokens_seen": 3696488, "step": 5615 }, { "epoch": 0.6160254302312835, "grad_norm": 6.210281848907471, "learning_rate": 1.6103925818016257e-05, "loss": 3.0592, "num_input_tokens_seen": 3700080, "step": 5620 }, { "epoch": 0.6165734955606709, "grad_norm": 10.414953231811523, "learning_rate": 1.606371154856719e-05, "loss": 2.9467, "num_input_tokens_seen": 3703264, "step": 5625 }, { "epoch": 0.6171215608900581, "grad_norm": 6.666655540466309, "learning_rate": 1.6023523771528623e-05, "loss": 3.3406, "num_input_tokens_seen": 3706232, "step": 5630 }, { "epoch": 0.6176696262194453, "grad_norm": 6.776188373565674, "learning_rate": 1.5983362606040733e-05, "loss": 2.9584, "num_input_tokens_seen": 3709728, "step": 5635 }, { "epoch": 0.6182176915488327, "grad_norm": 6.977499008178711, "learning_rate": 1.5943228171164837e-05, "loss": 3.607, "num_input_tokens_seen": 3713824, "step": 5640 }, { "epoch": 0.6187657568782199, "grad_norm": 6.040121555328369, "learning_rate": 1.5903120585882974e-05, "loss": 3.4444, 
"num_input_tokens_seen": 3718048, "step": 5645 }, { "epoch": 0.6193138222076071, "grad_norm": 7.120656967163086, "learning_rate": 1.5863039969097592e-05, "loss": 3.3153, "num_input_tokens_seen": 3720360, "step": 5650 }, { "epoch": 0.6198618875369944, "grad_norm": 10.212481498718262, "learning_rate": 1.5822986439631207e-05, "loss": 3.0222, "num_input_tokens_seen": 3723136, "step": 5655 }, { "epoch": 0.6204099528663817, "grad_norm": 6.770248889923096, "learning_rate": 1.5782960116226007e-05, "loss": 2.9785, "num_input_tokens_seen": 3726064, "step": 5660 }, { "epoch": 0.6209580181957689, "grad_norm": 5.595423221588135, "learning_rate": 1.574296111754353e-05, "loss": 3.03, "num_input_tokens_seen": 3729800, "step": 5665 }, { "epoch": 0.6215060835251562, "grad_norm": 6.7276225090026855, "learning_rate": 1.5702989562164337e-05, "loss": 3.2465, "num_input_tokens_seen": 3733608, "step": 5670 }, { "epoch": 0.6220541488545435, "grad_norm": 7.501856327056885, "learning_rate": 1.5663045568587592e-05, "loss": 2.8702, "num_input_tokens_seen": 3736928, "step": 5675 }, { "epoch": 0.6226022141839307, "grad_norm": 4.790249824523926, "learning_rate": 1.562312925523076e-05, "loss": 3.0023, "num_input_tokens_seen": 3740256, "step": 5680 }, { "epoch": 0.623150279513318, "grad_norm": 6.182326316833496, "learning_rate": 1.5583240740429266e-05, "loss": 2.9844, "num_input_tokens_seen": 3743504, "step": 5685 }, { "epoch": 0.6236983448427053, "grad_norm": 8.316134452819824, "learning_rate": 1.5543380142436108e-05, "loss": 3.1194, "num_input_tokens_seen": 3746976, "step": 5690 }, { "epoch": 0.6242464101720925, "grad_norm": 4.825036525726318, "learning_rate": 1.5503547579421507e-05, "loss": 2.9029, "num_input_tokens_seen": 3749736, "step": 5695 }, { "epoch": 0.6247944755014798, "grad_norm": 5.379034996032715, "learning_rate": 1.5463743169472604e-05, "loss": 2.813, "num_input_tokens_seen": 3754312, "step": 5700 }, { "epoch": 0.625342540830867, "grad_norm": 7.649238586425781, "learning_rate": 
1.5423967030593054e-05, "loss": 2.9726, "num_input_tokens_seen": 3757320, "step": 5705 }, { "epoch": 0.6258906061602543, "grad_norm": 8.456625938415527, "learning_rate": 1.5384219280702707e-05, "loss": 2.9852, "num_input_tokens_seen": 3761320, "step": 5710 }, { "epoch": 0.6264386714896416, "grad_norm": 5.238711833953857, "learning_rate": 1.534450003763726e-05, "loss": 2.8722, "num_input_tokens_seen": 3764536, "step": 5715 }, { "epoch": 0.6269867368190288, "grad_norm": 7.77496337890625, "learning_rate": 1.5304809419147885e-05, "loss": 3.0119, "num_input_tokens_seen": 3766832, "step": 5720 }, { "epoch": 0.6275348021484161, "grad_norm": 6.092039108276367, "learning_rate": 1.526514754290089e-05, "loss": 3.1644, "num_input_tokens_seen": 3770960, "step": 5725 }, { "epoch": 0.6280828674778034, "grad_norm": 8.289813995361328, "learning_rate": 1.5225514526477408e-05, "loss": 3.0392, "num_input_tokens_seen": 3774184, "step": 5730 }, { "epoch": 0.6286309328071906, "grad_norm": 7.361676216125488, "learning_rate": 1.5185910487372973e-05, "loss": 2.9171, "num_input_tokens_seen": 3778784, "step": 5735 }, { "epoch": 0.6291789981365778, "grad_norm": 6.253126621246338, "learning_rate": 1.514633554299723e-05, "loss": 2.9294, "num_input_tokens_seen": 3781568, "step": 5740 }, { "epoch": 0.6297270634659652, "grad_norm": 10.453216552734375, "learning_rate": 1.5106789810673578e-05, "loss": 3.2064, "num_input_tokens_seen": 3784152, "step": 5745 }, { "epoch": 0.6302751287953524, "grad_norm": 7.798788547515869, "learning_rate": 1.506727340763881e-05, "loss": 2.9679, "num_input_tokens_seen": 3786864, "step": 5750 }, { "epoch": 0.6308231941247396, "grad_norm": 7.438601493835449, "learning_rate": 1.5027786451042758e-05, "loss": 2.9835, "num_input_tokens_seen": 3790360, "step": 5755 }, { "epoch": 0.631371259454127, "grad_norm": 8.202717781066895, "learning_rate": 1.498832905794797e-05, "loss": 3.1209, "num_input_tokens_seen": 3793160, "step": 5760 }, { "epoch": 0.6319193247835142, "grad_norm": 
7.448530673980713, "learning_rate": 1.4948901345329352e-05, "loss": 3.1779, "num_input_tokens_seen": 3797568, "step": 5765 }, { "epoch": 0.6324673901129014, "grad_norm": 5.029766082763672, "learning_rate": 1.4909503430073796e-05, "loss": 2.8519, "num_input_tokens_seen": 3801096, "step": 5770 }, { "epoch": 0.6330154554422888, "grad_norm": 5.234902858734131, "learning_rate": 1.48701354289799e-05, "loss": 3.1461, "num_input_tokens_seen": 3806256, "step": 5775 }, { "epoch": 0.633563520771676, "grad_norm": 8.089512825012207, "learning_rate": 1.4830797458757544e-05, "loss": 3.12, "num_input_tokens_seen": 3808880, "step": 5780 }, { "epoch": 0.6341115861010632, "grad_norm": 5.7707839012146, "learning_rate": 1.4791489636027583e-05, "loss": 2.7087, "num_input_tokens_seen": 3813584, "step": 5785 }, { "epoch": 0.6346596514304506, "grad_norm": 6.020088195800781, "learning_rate": 1.475221207732151e-05, "loss": 2.9224, "num_input_tokens_seen": 3816848, "step": 5790 }, { "epoch": 0.6352077167598378, "grad_norm": 6.976149082183838, "learning_rate": 1.4712964899081093e-05, "loss": 3.0359, "num_input_tokens_seen": 3820368, "step": 5795 }, { "epoch": 0.635755782089225, "grad_norm": 7.066904544830322, "learning_rate": 1.4673748217658026e-05, "loss": 3.0753, "num_input_tokens_seen": 3823064, "step": 5800 }, { "epoch": 0.6363038474186123, "grad_norm": 5.929400444030762, "learning_rate": 1.4634562149313607e-05, "loss": 3.1222, "num_input_tokens_seen": 3826048, "step": 5805 }, { "epoch": 0.6368519127479996, "grad_norm": 6.900379657745361, "learning_rate": 1.459540681021836e-05, "loss": 3.4275, "num_input_tokens_seen": 3829584, "step": 5810 }, { "epoch": 0.6373999780773868, "grad_norm": 6.451569080352783, "learning_rate": 1.4556282316451733e-05, "loss": 3.0381, "num_input_tokens_seen": 3832848, "step": 5815 }, { "epoch": 0.6379480434067741, "grad_norm": 6.459670066833496, "learning_rate": 1.4517188784001712e-05, "loss": 2.9231, "num_input_tokens_seen": 3835392, "step": 5820 }, { "epoch": 
0.6384961087361614, "grad_norm": 9.6491117477417, "learning_rate": 1.4478126328764496e-05, "loss": 3.1121, "num_input_tokens_seen": 3839016, "step": 5825 }, { "epoch": 0.6390441740655486, "grad_norm": 6.9248552322387695, "learning_rate": 1.4439095066544154e-05, "loss": 3.0439, "num_input_tokens_seen": 3841424, "step": 5830 }, { "epoch": 0.6395922393949359, "grad_norm": 8.927162170410156, "learning_rate": 1.44000951130523e-05, "loss": 2.9511, "num_input_tokens_seen": 3843624, "step": 5835 }, { "epoch": 0.6401403047243232, "grad_norm": 7.547786712646484, "learning_rate": 1.4361126583907708e-05, "loss": 3.2556, "num_input_tokens_seen": 3846024, "step": 5840 }, { "epoch": 0.6406883700537104, "grad_norm": 9.325125694274902, "learning_rate": 1.432218959463599e-05, "loss": 3.2518, "num_input_tokens_seen": 3849176, "step": 5845 }, { "epoch": 0.6412364353830977, "grad_norm": 7.831711292266846, "learning_rate": 1.4283284260669282e-05, "loss": 3.3252, "num_input_tokens_seen": 3851496, "step": 5850 }, { "epoch": 0.6417845007124849, "grad_norm": 5.674088001251221, "learning_rate": 1.4244410697345845e-05, "loss": 3.1402, "num_input_tokens_seen": 3854384, "step": 5855 }, { "epoch": 0.6423325660418722, "grad_norm": 5.759450912475586, "learning_rate": 1.4205569019909759e-05, "loss": 3.2573, "num_input_tokens_seen": 3857336, "step": 5860 }, { "epoch": 0.6428806313712595, "grad_norm": 6.425468921661377, "learning_rate": 1.4166759343510599e-05, "loss": 2.994, "num_input_tokens_seen": 3860008, "step": 5865 }, { "epoch": 0.6434286967006467, "grad_norm": 8.979571342468262, "learning_rate": 1.4127981783203049e-05, "loss": 2.8518, "num_input_tokens_seen": 3863232, "step": 5870 }, { "epoch": 0.643976762030034, "grad_norm": 7.848270416259766, "learning_rate": 1.4089236453946563e-05, "loss": 3.312, "num_input_tokens_seen": 3867768, "step": 5875 }, { "epoch": 0.6445248273594213, "grad_norm": 6.893942832946777, "learning_rate": 1.4050523470605099e-05, "loss": 3.0278, "num_input_tokens_seen": 
3870384, "step": 5880 }, { "epoch": 0.6450728926888085, "grad_norm": 6.547880172729492, "learning_rate": 1.4011842947946674e-05, "loss": 2.7762, "num_input_tokens_seen": 3873064, "step": 5885 }, { "epoch": 0.6456209580181957, "grad_norm": 8.624503135681152, "learning_rate": 1.397319500064308e-05, "loss": 2.8362, "num_input_tokens_seen": 3876656, "step": 5890 }, { "epoch": 0.6461690233475831, "grad_norm": 7.134870529174805, "learning_rate": 1.3934579743269561e-05, "loss": 2.6202, "num_input_tokens_seen": 3880296, "step": 5895 }, { "epoch": 0.6467170886769703, "grad_norm": 7.61886739730835, "learning_rate": 1.389599729030443e-05, "loss": 2.9104, "num_input_tokens_seen": 3883280, "step": 5900 }, { "epoch": 0.6472651540063575, "grad_norm": 6.761881351470947, "learning_rate": 1.3857447756128744e-05, "loss": 2.9658, "num_input_tokens_seen": 3885848, "step": 5905 }, { "epoch": 0.6478132193357449, "grad_norm": 9.020877838134766, "learning_rate": 1.381893125502598e-05, "loss": 3.1887, "num_input_tokens_seen": 3889168, "step": 5910 }, { "epoch": 0.6483612846651321, "grad_norm": 7.6226091384887695, "learning_rate": 1.3780447901181681e-05, "loss": 3.2913, "num_input_tokens_seen": 3892368, "step": 5915 }, { "epoch": 0.6489093499945193, "grad_norm": 6.327563285827637, "learning_rate": 1.374199780868311e-05, "loss": 2.868, "num_input_tokens_seen": 3895192, "step": 5920 }, { "epoch": 0.6494574153239067, "grad_norm": 7.200982093811035, "learning_rate": 1.3703581091518964e-05, "loss": 2.9841, "num_input_tokens_seen": 3899104, "step": 5925 }, { "epoch": 0.6500054806532939, "grad_norm": 7.297597885131836, "learning_rate": 1.3665197863578954e-05, "loss": 3.1225, "num_input_tokens_seen": 3901696, "step": 5930 }, { "epoch": 0.6505535459826811, "grad_norm": 6.203746318817139, "learning_rate": 1.3626848238653516e-05, "loss": 3.082, "num_input_tokens_seen": 3905192, "step": 5935 }, { "epoch": 0.6511016113120685, "grad_norm": 7.677253246307373, "learning_rate": 1.358853233043349e-05, "loss": 
3.2795, "num_input_tokens_seen": 3908456, "step": 5940 }, { "epoch": 0.6516496766414557, "grad_norm": 6.703474044799805, "learning_rate": 1.3550250252509744e-05, "loss": 3.123, "num_input_tokens_seen": 3910504, "step": 5945 }, { "epoch": 0.6521977419708429, "grad_norm": 7.855628967285156, "learning_rate": 1.3512002118372835e-05, "loss": 2.8393, "num_input_tokens_seen": 3913032, "step": 5950 }, { "epoch": 0.6527458073002302, "grad_norm": 7.922531604766846, "learning_rate": 1.3473788041412732e-05, "loss": 2.7007, "num_input_tokens_seen": 3916392, "step": 5955 }, { "epoch": 0.6532938726296175, "grad_norm": 10.957340240478516, "learning_rate": 1.3435608134918412e-05, "loss": 2.9213, "num_input_tokens_seen": 3919248, "step": 5960 }, { "epoch": 0.6538419379590047, "grad_norm": 5.184296607971191, "learning_rate": 1.3397462512077535e-05, "loss": 3.203, "num_input_tokens_seen": 3922528, "step": 5965 }, { "epoch": 0.654390003288392, "grad_norm": 8.037724494934082, "learning_rate": 1.3359351285976174e-05, "loss": 3.1737, "num_input_tokens_seen": 3925200, "step": 5970 }, { "epoch": 0.6549380686177793, "grad_norm": 7.275876045227051, "learning_rate": 1.3321274569598382e-05, "loss": 2.848, "num_input_tokens_seen": 3928128, "step": 5975 }, { "epoch": 0.6554861339471665, "grad_norm": 5.043073654174805, "learning_rate": 1.3283232475825916e-05, "loss": 2.8843, "num_input_tokens_seen": 3931696, "step": 5980 }, { "epoch": 0.6560341992765538, "grad_norm": 8.235861778259277, "learning_rate": 1.3245225117437918e-05, "loss": 3.3592, "num_input_tokens_seen": 3934656, "step": 5985 }, { "epoch": 0.656582264605941, "grad_norm": 7.135794162750244, "learning_rate": 1.3207252607110521e-05, "loss": 3.263, "num_input_tokens_seen": 3937536, "step": 5990 }, { "epoch": 0.6571303299353283, "grad_norm": 8.360773086547852, "learning_rate": 1.3169315057416564e-05, "loss": 3.1673, "num_input_tokens_seen": 3940200, "step": 5995 }, { "epoch": 0.6576783952647156, "grad_norm": 9.115818977355957, 
"learning_rate": 1.3131412580825236e-05, "loss": 3.1802, "num_input_tokens_seen": 3942688, "step": 6000 }, { "epoch": 0.6582264605941028, "grad_norm": 8.476052284240723, "learning_rate": 1.3093545289701747e-05, "loss": 3.1919, "num_input_tokens_seen": 3945760, "step": 6005 }, { "epoch": 0.6587745259234901, "grad_norm": 6.621984004974365, "learning_rate": 1.3055713296307016e-05, "loss": 2.8701, "num_input_tokens_seen": 3948512, "step": 6010 }, { "epoch": 0.6593225912528773, "grad_norm": 8.03313159942627, "learning_rate": 1.3017916712797293e-05, "loss": 3.3227, "num_input_tokens_seen": 3951520, "step": 6015 }, { "epoch": 0.6598706565822646, "grad_norm": 7.0439677238464355, "learning_rate": 1.2980155651223867e-05, "loss": 2.8738, "num_input_tokens_seen": 3955392, "step": 6020 }, { "epoch": 0.6604187219116519, "grad_norm": 7.3785529136657715, "learning_rate": 1.2942430223532703e-05, "loss": 3.3427, "num_input_tokens_seen": 3959592, "step": 6025 }, { "epoch": 0.6609667872410391, "grad_norm": 5.641672134399414, "learning_rate": 1.2904740541564159e-05, "loss": 3.0156, "num_input_tokens_seen": 3963064, "step": 6030 }, { "epoch": 0.6615148525704264, "grad_norm": 6.209802150726318, "learning_rate": 1.286708671705259e-05, "loss": 3.0553, "num_input_tokens_seen": 3965552, "step": 6035 }, { "epoch": 0.6620629178998136, "grad_norm": 6.092316627502441, "learning_rate": 1.2829468861626052e-05, "loss": 2.9092, "num_input_tokens_seen": 3968480, "step": 6040 }, { "epoch": 0.6626109832292009, "grad_norm": 10.323710441589355, "learning_rate": 1.2791887086805993e-05, "loss": 3.4687, "num_input_tokens_seen": 3971464, "step": 6045 }, { "epoch": 0.6631590485585882, "grad_norm": 6.506869792938232, "learning_rate": 1.2754341504006872e-05, "loss": 3.0349, "num_input_tokens_seen": 3975640, "step": 6050 }, { "epoch": 0.6637071138879754, "grad_norm": 6.929319381713867, "learning_rate": 1.2716832224535847e-05, "loss": 3.1761, "num_input_tokens_seen": 3978928, "step": 6055 }, { "epoch": 
0.6642551792173627, "grad_norm": 6.731025218963623, "learning_rate": 1.2679359359592488e-05, "loss": 2.7582, "num_input_tokens_seen": 3984016, "step": 6060 }, { "epoch": 0.66480324454675, "grad_norm": 7.775283336639404, "learning_rate": 1.2641923020268377e-05, "loss": 3.222, "num_input_tokens_seen": 3986544, "step": 6065 }, { "epoch": 0.6653513098761372, "grad_norm": 9.189234733581543, "learning_rate": 1.2604523317546813e-05, "loss": 2.7329, "num_input_tokens_seen": 3989440, "step": 6070 }, { "epoch": 0.6658993752055244, "grad_norm": 6.482409954071045, "learning_rate": 1.2567160362302515e-05, "loss": 3.0355, "num_input_tokens_seen": 3993928, "step": 6075 }, { "epoch": 0.6664474405349118, "grad_norm": 6.9843878746032715, "learning_rate": 1.2529834265301227e-05, "loss": 3.1331, "num_input_tokens_seen": 3997312, "step": 6080 }, { "epoch": 0.666995505864299, "grad_norm": 7.9999308586120605, "learning_rate": 1.2492545137199426e-05, "loss": 3.2756, "num_input_tokens_seen": 4000160, "step": 6085 }, { "epoch": 0.6675435711936862, "grad_norm": 5.13596773147583, "learning_rate": 1.2455293088544023e-05, "loss": 3.382, "num_input_tokens_seen": 4003720, "step": 6090 }, { "epoch": 0.6680916365230736, "grad_norm": 6.42021369934082, "learning_rate": 1.2418078229771973e-05, "loss": 2.9692, "num_input_tokens_seen": 4006680, "step": 6095 }, { "epoch": 0.6686397018524608, "grad_norm": 9.268325805664062, "learning_rate": 1.2380900671209984e-05, "loss": 2.9399, "num_input_tokens_seen": 4009632, "step": 6100 }, { "epoch": 0.669187767181848, "grad_norm": 5.049006938934326, "learning_rate": 1.2343760523074186e-05, "loss": 3.0858, "num_input_tokens_seen": 4012552, "step": 6105 }, { "epoch": 0.6697358325112354, "grad_norm": 6.255411148071289, "learning_rate": 1.2306657895469809e-05, "loss": 3.16, "num_input_tokens_seen": 4016240, "step": 6110 }, { "epoch": 0.6702838978406226, "grad_norm": 10.016054153442383, "learning_rate": 1.2269592898390833e-05, "loss": 3.0065, "num_input_tokens_seen": 
4019680, "step": 6115 }, { "epoch": 0.6708319631700098, "grad_norm": 7.499462604522705, "learning_rate": 1.223256564171971e-05, "loss": 3.3602, "num_input_tokens_seen": 4022288, "step": 6120 }, { "epoch": 0.6713800284993972, "grad_norm": 7.838258266448975, "learning_rate": 1.2195576235226977e-05, "loss": 2.7866, "num_input_tokens_seen": 4025216, "step": 6125 }, { "epoch": 0.6719280938287844, "grad_norm": 7.931380271911621, "learning_rate": 1.2158624788570965e-05, "loss": 3.4889, "num_input_tokens_seen": 4029376, "step": 6130 }, { "epoch": 0.6724761591581716, "grad_norm": 5.675364971160889, "learning_rate": 1.2121711411297498e-05, "loss": 3.3344, "num_input_tokens_seen": 4031616, "step": 6135 }, { "epoch": 0.6730242244875589, "grad_norm": 5.3835577964782715, "learning_rate": 1.2084836212839507e-05, "loss": 3.1429, "num_input_tokens_seen": 4034840, "step": 6140 }, { "epoch": 0.6735722898169462, "grad_norm": 7.542428016662598, "learning_rate": 1.2047999302516737e-05, "loss": 2.9853, "num_input_tokens_seen": 4037792, "step": 6145 }, { "epoch": 0.6741203551463334, "grad_norm": 7.841860771179199, "learning_rate": 1.2011200789535464e-05, "loss": 3.011, "num_input_tokens_seen": 4041272, "step": 6150 }, { "epoch": 0.6746684204757207, "grad_norm": 10.116206169128418, "learning_rate": 1.1974440782988094e-05, "loss": 3.1755, "num_input_tokens_seen": 4044360, "step": 6155 }, { "epoch": 0.675216485805108, "grad_norm": 6.566442489624023, "learning_rate": 1.1937719391852877e-05, "loss": 3.0532, "num_input_tokens_seen": 4047544, "step": 6160 }, { "epoch": 0.6757645511344952, "grad_norm": 6.767369747161865, "learning_rate": 1.1901036724993616e-05, "loss": 2.9114, "num_input_tokens_seen": 4050584, "step": 6165 }, { "epoch": 0.6763126164638825, "grad_norm": 5.782663822174072, "learning_rate": 1.1864392891159284e-05, "loss": 3.4902, "num_input_tokens_seen": 4053392, "step": 6170 }, { "epoch": 0.6768606817932697, "grad_norm": 7.807350158691406, "learning_rate": 1.1827787998983731e-05, 
"loss": 3.1896, "num_input_tokens_seen": 4056184, "step": 6175 }, { "epoch": 0.677408747122657, "grad_norm": 8.840995788574219, "learning_rate": 1.1791222156985382e-05, "loss": 3.4261, "num_input_tokens_seen": 4060616, "step": 6180 }, { "epoch": 0.6779568124520443, "grad_norm": 5.441840171813965, "learning_rate": 1.1754695473566877e-05, "loss": 2.8645, "num_input_tokens_seen": 4065008, "step": 6185 }, { "epoch": 0.6785048777814315, "grad_norm": 7.820642471313477, "learning_rate": 1.1718208057014768e-05, "loss": 3.1664, "num_input_tokens_seen": 4068872, "step": 6190 }, { "epoch": 0.6790529431108188, "grad_norm": 7.290872573852539, "learning_rate": 1.1681760015499201e-05, "loss": 3.4087, "num_input_tokens_seen": 4071376, "step": 6195 }, { "epoch": 0.6796010084402061, "grad_norm": 5.5174360275268555, "learning_rate": 1.1645351457073594e-05, "loss": 3.3074, "num_input_tokens_seen": 4074528, "step": 6200 }, { "epoch": 0.6801490737695933, "grad_norm": 6.114542484283447, "learning_rate": 1.1608982489674295e-05, "loss": 3.0535, "num_input_tokens_seen": 4077600, "step": 6205 }, { "epoch": 0.6806971390989806, "grad_norm": 8.515054702758789, "learning_rate": 1.1572653221120316e-05, "loss": 3.2291, "num_input_tokens_seen": 4080664, "step": 6210 }, { "epoch": 0.6812452044283679, "grad_norm": 8.11023235321045, "learning_rate": 1.1536363759112952e-05, "loss": 3.1448, "num_input_tokens_seen": 4083256, "step": 6215 }, { "epoch": 0.6817932697577551, "grad_norm": 7.834672927856445, "learning_rate": 1.1500114211235482e-05, "loss": 3.1213, "num_input_tokens_seen": 4085568, "step": 6220 }, { "epoch": 0.6823413350871423, "grad_norm": 6.758762836456299, "learning_rate": 1.146390468495289e-05, "loss": 3.0515, "num_input_tokens_seen": 4088248, "step": 6225 }, { "epoch": 0.6828894004165297, "grad_norm": 6.3487372398376465, "learning_rate": 1.1427735287611477e-05, "loss": 2.5775, "num_input_tokens_seen": 4090848, "step": 6230 }, { "epoch": 0.6834374657459169, "grad_norm": 5.81227445602417, 
"learning_rate": 1.1391606126438586e-05, "loss": 3.0297, "num_input_tokens_seen": 4094232, "step": 6235 }, { "epoch": 0.6839855310753041, "grad_norm": 7.857996463775635, "learning_rate": 1.1355517308542301e-05, "loss": 3.0582, "num_input_tokens_seen": 4097096, "step": 6240 }, { "epoch": 0.6845335964046915, "grad_norm": 5.819544792175293, "learning_rate": 1.1319468940911079e-05, "loss": 2.8814, "num_input_tokens_seen": 4099912, "step": 6245 }, { "epoch": 0.6850816617340787, "grad_norm": 9.14799976348877, "learning_rate": 1.1283461130413453e-05, "loss": 3.3229, "num_input_tokens_seen": 4102320, "step": 6250 }, { "epoch": 0.6856297270634659, "grad_norm": 7.087406158447266, "learning_rate": 1.1247493983797754e-05, "loss": 2.8581, "num_input_tokens_seen": 4106480, "step": 6255 }, { "epoch": 0.6861777923928533, "grad_norm": 7.298010349273682, "learning_rate": 1.1218749616158092e-05, "loss": 3.1186, "num_input_tokens_seen": 4110064, "step": 6260 }, { "epoch": 0.6867258577222405, "grad_norm": 6.6678290367126465, "learning_rate": 1.1182855933150582e-05, "loss": 2.971, "num_input_tokens_seen": 4113304, "step": 6265 }, { "epoch": 0.6872739230516277, "grad_norm": 8.044167518615723, "learning_rate": 1.1147003212277912e-05, "loss": 3.3036, "num_input_tokens_seen": 4115752, "step": 6270 }, { "epoch": 0.687821988381015, "grad_norm": 6.803138256072998, "learning_rate": 1.1111191559828627e-05, "loss": 2.7812, "num_input_tokens_seen": 4119488, "step": 6275 }, { "epoch": 0.6883700537104023, "grad_norm": 5.070322513580322, "learning_rate": 1.1075421081969502e-05, "loss": 3.152, "num_input_tokens_seen": 4122168, "step": 6280 }, { "epoch": 0.6889181190397895, "grad_norm": 6.463720321655273, "learning_rate": 1.1039691884745252e-05, "loss": 2.9657, "num_input_tokens_seen": 4125704, "step": 6285 }, { "epoch": 0.6894661843691768, "grad_norm": 9.405960083007812, "learning_rate": 1.1004004074078223e-05, "loss": 3.5484, "num_input_tokens_seen": 4128608, "step": 6290 }, { "epoch": 
0.6900142496985641, "grad_norm": 6.504082679748535, "learning_rate": 1.0968357755768051e-05, "loss": 2.7744, "num_input_tokens_seen": 4131416, "step": 6295 }, { "epoch": 0.6905623150279513, "grad_norm": 7.679104804992676, "learning_rate": 1.093275303549137e-05, "loss": 3.1396, "num_input_tokens_seen": 4135168, "step": 6300 }, { "epoch": 0.6911103803573386, "grad_norm": 10.499975204467773, "learning_rate": 1.0897190018801503e-05, "loss": 3.4244, "num_input_tokens_seen": 4138320, "step": 6305 }, { "epoch": 0.6916584456867259, "grad_norm": 5.967805862426758, "learning_rate": 1.0861668811128129e-05, "loss": 3.0676, "num_input_tokens_seen": 4140880, "step": 6310 }, { "epoch": 0.6922065110161131, "grad_norm": 6.552985668182373, "learning_rate": 1.0826189517776975e-05, "loss": 3.0805, "num_input_tokens_seen": 4143912, "step": 6315 }, { "epoch": 0.6927545763455004, "grad_norm": 8.34593677520752, "learning_rate": 1.0790752243929523e-05, "loss": 3.2587, "num_input_tokens_seen": 4147320, "step": 6320 }, { "epoch": 0.6933026416748876, "grad_norm": 6.536946773529053, "learning_rate": 1.0755357094642674e-05, "loss": 3.0053, "num_input_tokens_seen": 4150928, "step": 6325 }, { "epoch": 0.6938507070042749, "grad_norm": 7.138943672180176, "learning_rate": 1.0720004174848444e-05, "loss": 2.9898, "num_input_tokens_seen": 4154120, "step": 6330 }, { "epoch": 0.6943987723336622, "grad_norm": 9.60561466217041, "learning_rate": 1.0684693589353678e-05, "loss": 3.4849, "num_input_tokens_seen": 4156832, "step": 6335 }, { "epoch": 0.6949468376630494, "grad_norm": 8.691582679748535, "learning_rate": 1.0649425442839697e-05, "loss": 3.1178, "num_input_tokens_seen": 4159704, "step": 6340 }, { "epoch": 0.6954949029924367, "grad_norm": 8.004415512084961, "learning_rate": 1.0614199839862002e-05, "loss": 3.0848, "num_input_tokens_seen": 4162168, "step": 6345 }, { "epoch": 0.696042968321824, "grad_norm": 12.674962043762207, "learning_rate": 1.0579016884849999e-05, "loss": 3.4026, 
"num_input_tokens_seen": 4165384, "step": 6350 }, { "epoch": 0.6965910336512112, "grad_norm": 7.9511284828186035, "learning_rate": 1.0543876682106632e-05, "loss": 3.0329, "num_input_tokens_seen": 4168128, "step": 6355 }, { "epoch": 0.6971390989805984, "grad_norm": 9.268970489501953, "learning_rate": 1.0508779335808105e-05, "loss": 3.1994, "num_input_tokens_seen": 4171888, "step": 6360 }, { "epoch": 0.6976871643099858, "grad_norm": 6.21211051940918, "learning_rate": 1.04737249500036e-05, "loss": 3.1242, "num_input_tokens_seen": 4174896, "step": 6365 }, { "epoch": 0.698235229639373, "grad_norm": 7.668500900268555, "learning_rate": 1.04387136286149e-05, "loss": 3.0467, "num_input_tokens_seen": 4178504, "step": 6370 }, { "epoch": 0.6987832949687602, "grad_norm": 5.02815580368042, "learning_rate": 1.040374547543613e-05, "loss": 2.9279, "num_input_tokens_seen": 4182040, "step": 6375 }, { "epoch": 0.6993313602981476, "grad_norm": 5.940211772918701, "learning_rate": 1.0368820594133466e-05, "loss": 2.968, "num_input_tokens_seen": 4185880, "step": 6380 }, { "epoch": 0.6998794256275348, "grad_norm": 6.044907093048096, "learning_rate": 1.0333939088244771e-05, "loss": 3.3093, "num_input_tokens_seen": 4189000, "step": 6385 }, { "epoch": 0.700427490956922, "grad_norm": 6.427306652069092, "learning_rate": 1.0299101061179317e-05, "loss": 3.2814, "num_input_tokens_seen": 4191736, "step": 6390 }, { "epoch": 0.7009755562863094, "grad_norm": 7.336453914642334, "learning_rate": 1.0264306616217507e-05, "loss": 2.8437, "num_input_tokens_seen": 4194360, "step": 6395 }, { "epoch": 0.7015236216156966, "grad_norm": 7.562320709228516, "learning_rate": 1.0229555856510512e-05, "loss": 2.828, "num_input_tokens_seen": 4197920, "step": 6400 }, { "epoch": 0.7020716869450838, "grad_norm": 7.142042636871338, "learning_rate": 1.0194848885080011e-05, "loss": 3.1228, "num_input_tokens_seen": 4201984, "step": 6405 }, { "epoch": 0.7026197522744712, "grad_norm": 6.18742036819458, "learning_rate": 
1.0160185804817859e-05, "loss": 2.8393, "num_input_tokens_seen": 4205328, "step": 6410 }, { "epoch": 0.7031678176038584, "grad_norm": 7.195977687835693, "learning_rate": 1.0125566718485788e-05, "loss": 2.9868, "num_input_tokens_seen": 4208312, "step": 6415 }, { "epoch": 0.7037158829332456, "grad_norm": 10.329099655151367, "learning_rate": 1.0090991728715132e-05, "loss": 2.829, "num_input_tokens_seen": 4211312, "step": 6420 }, { "epoch": 0.7042639482626329, "grad_norm": 6.6712236404418945, "learning_rate": 1.0056460938006473e-05, "loss": 2.9549, "num_input_tokens_seen": 4213800, "step": 6425 }, { "epoch": 0.7048120135920202, "grad_norm": 4.803092002868652, "learning_rate": 1.0021974448729365e-05, "loss": 3.3355, "num_input_tokens_seen": 4217200, "step": 6430 }, { "epoch": 0.7053600789214074, "grad_norm": 6.527164459228516, "learning_rate": 9.987532363122018e-06, "loss": 2.9652, "num_input_tokens_seen": 4220768, "step": 6435 }, { "epoch": 0.7059081442507947, "grad_norm": 7.362782955169678, "learning_rate": 9.953134783291036e-06, "loss": 2.8684, "num_input_tokens_seen": 4224224, "step": 6440 }, { "epoch": 0.706456209580182, "grad_norm": 9.984780311584473, "learning_rate": 9.918781811211045e-06, "loss": 2.8968, "num_input_tokens_seen": 4229272, "step": 6445 }, { "epoch": 0.7070042749095692, "grad_norm": 6.219121932983398, "learning_rate": 9.884473548724441e-06, "loss": 3.1832, "num_input_tokens_seen": 4232096, "step": 6450 }, { "epoch": 0.7075523402389565, "grad_norm": 6.208556652069092, "learning_rate": 9.850210097541085e-06, "loss": 3.108, "num_input_tokens_seen": 4235496, "step": 6455 }, { "epoch": 0.7081004055683437, "grad_norm": 7.7808003425598145, "learning_rate": 9.81599155923798e-06, "loss": 3.0694, "num_input_tokens_seen": 4238320, "step": 6460 }, { "epoch": 0.708648470897731, "grad_norm": 8.587124824523926, "learning_rate": 9.781818035258972e-06, "loss": 3.1773, "num_input_tokens_seen": 4240792, "step": 6465 }, { "epoch": 0.7091965362271183, "grad_norm": 
11.057994842529297, "learning_rate": 9.747689626914483e-06, "loss": 3.4154, "num_input_tokens_seen": 4244904, "step": 6470 }, { "epoch": 0.7097446015565055, "grad_norm": 6.430279731750488, "learning_rate": 9.713606435381165e-06, "loss": 3.1772, "num_input_tokens_seen": 4247632, "step": 6475 }, { "epoch": 0.7102926668858928, "grad_norm": 7.846237659454346, "learning_rate": 9.679568561701615e-06, "loss": 2.9962, "num_input_tokens_seen": 4250768, "step": 6480 }, { "epoch": 0.7108407322152801, "grad_norm": 8.467151641845703, "learning_rate": 9.645576106784118e-06, "loss": 2.8687, "num_input_tokens_seen": 4253904, "step": 6485 }, { "epoch": 0.7113887975446673, "grad_norm": 16.991235733032227, "learning_rate": 9.611629171402273e-06, "loss": 3.1696, "num_input_tokens_seen": 4256768, "step": 6490 }, { "epoch": 0.7119368628740546, "grad_norm": 7.091182231903076, "learning_rate": 9.577727856194746e-06, "loss": 2.7567, "num_input_tokens_seen": 4260192, "step": 6495 }, { "epoch": 0.7124849282034419, "grad_norm": 7.963916778564453, "learning_rate": 9.543872261664952e-06, "loss": 2.9586, "num_input_tokens_seen": 4263560, "step": 6500 }, { "epoch": 0.7130329935328291, "grad_norm": 6.632905006408691, "learning_rate": 9.510062488180781e-06, "loss": 2.8122, "num_input_tokens_seen": 4266624, "step": 6505 }, { "epoch": 0.7135810588622163, "grad_norm": 8.157563209533691, "learning_rate": 9.476298635974265e-06, "loss": 2.9458, "num_input_tokens_seen": 4269488, "step": 6510 }, { "epoch": 0.7141291241916037, "grad_norm": 7.982326507568359, "learning_rate": 9.442580805141305e-06, "loss": 3.172, "num_input_tokens_seen": 4272592, "step": 6515 }, { "epoch": 0.7146771895209909, "grad_norm": 5.6351423263549805, "learning_rate": 9.408909095641363e-06, "loss": 3.139, "num_input_tokens_seen": 4275552, "step": 6520 }, { "epoch": 0.7152252548503781, "grad_norm": 7.883710861206055, "learning_rate": 9.375283607297175e-06, "loss": 3.3458, "num_input_tokens_seen": 4277912, "step": 6525 }, { "epoch": 
0.7157733201797655, "grad_norm": 5.036897659301758, "learning_rate": 9.341704439794441e-06, "loss": 2.9759, "num_input_tokens_seen": 4280520, "step": 6530 }, { "epoch": 0.7163213855091527, "grad_norm": 6.539727687835693, "learning_rate": 9.308171692681565e-06, "loss": 2.7201, "num_input_tokens_seen": 4284248, "step": 6535 }, { "epoch": 0.7168694508385399, "grad_norm": 7.108365058898926, "learning_rate": 9.274685465369303e-06, "loss": 3.1882, "num_input_tokens_seen": 4288664, "step": 6540 }, { "epoch": 0.7174175161679273, "grad_norm": 5.567689418792725, "learning_rate": 9.241245857130507e-06, "loss": 3.3889, "num_input_tokens_seen": 4292104, "step": 6545 }, { "epoch": 0.7179655814973145, "grad_norm": 7.539772033691406, "learning_rate": 9.207852967099841e-06, "loss": 3.2677, "num_input_tokens_seen": 4296664, "step": 6550 }, { "epoch": 0.7185136468267017, "grad_norm": 11.019807815551758, "learning_rate": 9.174506894273448e-06, "loss": 3.2587, "num_input_tokens_seen": 4298936, "step": 6555 }, { "epoch": 0.719061712156089, "grad_norm": 4.87662935256958, "learning_rate": 9.141207737508677e-06, "loss": 3.4056, "num_input_tokens_seen": 4301872, "step": 6560 }, { "epoch": 0.7196097774854763, "grad_norm": 7.396250247955322, "learning_rate": 9.107955595523812e-06, "loss": 3.0741, "num_input_tokens_seen": 4305096, "step": 6565 }, { "epoch": 0.7201578428148635, "grad_norm": 9.769874572753906, "learning_rate": 9.074750566897733e-06, "loss": 2.8083, "num_input_tokens_seen": 4309576, "step": 6570 }, { "epoch": 0.7207059081442508, "grad_norm": 7.023451805114746, "learning_rate": 9.041592750069652e-06, "loss": 3.162, "num_input_tokens_seen": 4313728, "step": 6575 }, { "epoch": 0.7212539734736381, "grad_norm": 7.67805814743042, "learning_rate": 9.008482243338841e-06, "loss": 3.1487, "num_input_tokens_seen": 4316864, "step": 6580 }, { "epoch": 0.7218020388030253, "grad_norm": 5.812924385070801, "learning_rate": 8.975419144864292e-06, "loss": 2.6071, "num_input_tokens_seen": 4320688, 
"step": 6585 }, { "epoch": 0.7223501041324126, "grad_norm": 9.005423545837402, "learning_rate": 8.94240355266445e-06, "loss": 3.2333, "num_input_tokens_seen": 4323184, "step": 6590 }, { "epoch": 0.7228981694617999, "grad_norm": 5.683709144592285, "learning_rate": 8.909435564616944e-06, "loss": 2.9484, "num_input_tokens_seen": 4326304, "step": 6595 }, { "epoch": 0.7234462347911871, "grad_norm": 9.263490676879883, "learning_rate": 8.876515278458265e-06, "loss": 3.2337, "num_input_tokens_seen": 4329120, "step": 6600 }, { "epoch": 0.7239943001205744, "grad_norm": 6.478157997131348, "learning_rate": 8.84364279178348e-06, "loss": 3.0925, "num_input_tokens_seen": 4332440, "step": 6605 }, { "epoch": 0.7245423654499616, "grad_norm": 8.741613388061523, "learning_rate": 8.810818202045962e-06, "loss": 3.3093, "num_input_tokens_seen": 4335440, "step": 6610 }, { "epoch": 0.7250904307793489, "grad_norm": 7.031724452972412, "learning_rate": 8.77804160655708e-06, "loss": 3.3767, "num_input_tokens_seen": 4337912, "step": 6615 }, { "epoch": 0.7256384961087362, "grad_norm": 8.763786315917969, "learning_rate": 8.745313102485923e-06, "loss": 3.201, "num_input_tokens_seen": 4341472, "step": 6620 }, { "epoch": 0.7261865614381234, "grad_norm": 5.877601623535156, "learning_rate": 8.712632786859021e-06, "loss": 2.7422, "num_input_tokens_seen": 4345304, "step": 6625 }, { "epoch": 0.7267346267675107, "grad_norm": 7.608758926391602, "learning_rate": 8.68000075656003e-06, "loss": 3.2688, "num_input_tokens_seen": 4348264, "step": 6630 }, { "epoch": 0.727282692096898, "grad_norm": 6.207149982452393, "learning_rate": 8.647417108329454e-06, "loss": 3.1522, "num_input_tokens_seen": 4352144, "step": 6635 }, { "epoch": 0.7278307574262852, "grad_norm": 6.543735504150391, "learning_rate": 8.61488193876439e-06, "loss": 2.968, "num_input_tokens_seen": 4355840, "step": 6640 }, { "epoch": 0.7283788227556725, "grad_norm": 7.882357597351074, "learning_rate": 8.582395344318197e-06, "loss": 2.8674, 
"num_input_tokens_seen": 4358640, "step": 6645 }, { "epoch": 0.7289268880850598, "grad_norm": 10.999910354614258, "learning_rate": 8.54995742130022e-06, "loss": 3.2327, "num_input_tokens_seen": 4361656, "step": 6650 }, { "epoch": 0.729474953414447, "grad_norm": 8.629473686218262, "learning_rate": 8.517568265875541e-06, "loss": 3.1042, "num_input_tokens_seen": 4363968, "step": 6655 }, { "epoch": 0.7300230187438342, "grad_norm": 8.353252410888672, "learning_rate": 8.485227974064647e-06, "loss": 2.7692, "num_input_tokens_seen": 4367200, "step": 6660 }, { "epoch": 0.7305710840732216, "grad_norm": 7.927604675292969, "learning_rate": 8.452936641743156e-06, "loss": 3.2321, "num_input_tokens_seen": 4370096, "step": 6665 }, { "epoch": 0.7311191494026088, "grad_norm": 5.507778644561768, "learning_rate": 8.42069436464157e-06, "loss": 3.1024, "num_input_tokens_seen": 4374264, "step": 6670 }, { "epoch": 0.731667214731996, "grad_norm": 6.3533172607421875, "learning_rate": 8.38850123834494e-06, "loss": 2.7559, "num_input_tokens_seen": 4378824, "step": 6675 }, { "epoch": 0.7322152800613834, "grad_norm": 6.395352840423584, "learning_rate": 8.356357358292601e-06, "loss": 3.243, "num_input_tokens_seen": 4382616, "step": 6680 }, { "epoch": 0.7327633453907706, "grad_norm": 8.324797630310059, "learning_rate": 8.32426281977792e-06, "loss": 3.6588, "num_input_tokens_seen": 4385488, "step": 6685 }, { "epoch": 0.7333114107201578, "grad_norm": 6.711746692657471, "learning_rate": 8.292217717947962e-06, "loss": 3.1062, "num_input_tokens_seen": 4388592, "step": 6690 }, { "epoch": 0.7338594760495452, "grad_norm": 11.369217872619629, "learning_rate": 8.26022214780324e-06, "loss": 3.0253, "num_input_tokens_seen": 4391640, "step": 6695 }, { "epoch": 0.7344075413789324, "grad_norm": 7.522586822509766, "learning_rate": 8.228276204197427e-06, "loss": 3.3273, "num_input_tokens_seen": 4394456, "step": 6700 }, { "epoch": 0.7349556067083196, "grad_norm": 7.1993207931518555, "learning_rate": 
8.196379981837071e-06, "loss": 2.9679, "num_input_tokens_seen": 4397352, "step": 6705 }, { "epoch": 0.735503672037707, "grad_norm": 9.711231231689453, "learning_rate": 8.164533575281316e-06, "loss": 3.5035, "num_input_tokens_seen": 4400744, "step": 6710 }, { "epoch": 0.7360517373670942, "grad_norm": 8.696206092834473, "learning_rate": 8.132737078941642e-06, "loss": 2.8264, "num_input_tokens_seen": 4404712, "step": 6715 }, { "epoch": 0.7365998026964814, "grad_norm": 8.558262825012207, "learning_rate": 8.100990587081536e-06, "loss": 3.0127, "num_input_tokens_seen": 4407448, "step": 6720 }, { "epoch": 0.7371478680258687, "grad_norm": 7.874935626983643, "learning_rate": 8.069294193816252e-06, "loss": 2.9852, "num_input_tokens_seen": 4410096, "step": 6725 }, { "epoch": 0.737695933355256, "grad_norm": 10.938785552978516, "learning_rate": 8.037647993112543e-06, "loss": 2.8523, "num_input_tokens_seen": 4413248, "step": 6730 }, { "epoch": 0.7382439986846432, "grad_norm": 6.2363786697387695, "learning_rate": 8.006052078788335e-06, "loss": 3.5423, "num_input_tokens_seen": 4417016, "step": 6735 }, { "epoch": 0.7387920640140305, "grad_norm": 7.439382553100586, "learning_rate": 7.974506544512478e-06, "loss": 3.0829, "num_input_tokens_seen": 4420144, "step": 6740 }, { "epoch": 0.7393401293434178, "grad_norm": 8.05595588684082, "learning_rate": 7.943011483804494e-06, "loss": 2.8291, "num_input_tokens_seen": 4422672, "step": 6745 }, { "epoch": 0.739888194672805, "grad_norm": 7.396727561950684, "learning_rate": 7.91156699003424e-06, "loss": 3.1015, "num_input_tokens_seen": 4425368, "step": 6750 }, { "epoch": 0.7404362600021923, "grad_norm": 5.773197650909424, "learning_rate": 7.880173156421661e-06, "loss": 3.0124, "num_input_tokens_seen": 4427720, "step": 6755 }, { "epoch": 0.7409843253315795, "grad_norm": 7.078009128570557, "learning_rate": 7.848830076036556e-06, "loss": 3.007, "num_input_tokens_seen": 4430872, "step": 6760 }, { "epoch": 0.7415323906609668, "grad_norm": 
6.219594478607178, "learning_rate": 7.817537841798216e-06, "loss": 3.0966, "num_input_tokens_seen": 4434816, "step": 6765 }, { "epoch": 0.7420804559903541, "grad_norm": 7.2829365730285645, "learning_rate": 7.786296546475213e-06, "loss": 3.4504, "num_input_tokens_seen": 4437960, "step": 6770 }, { "epoch": 0.7426285213197413, "grad_norm": 7.280004978179932, "learning_rate": 7.755106282685118e-06, "loss": 3.0042, "num_input_tokens_seen": 4440624, "step": 6775 }, { "epoch": 0.7431765866491286, "grad_norm": 6.213809490203857, "learning_rate": 7.723967142894195e-06, "loss": 3.0603, "num_input_tokens_seen": 4444120, "step": 6780 }, { "epoch": 0.7437246519785159, "grad_norm": 6.277675628662109, "learning_rate": 7.69287921941715e-06, "loss": 2.9716, "num_input_tokens_seen": 4447152, "step": 6785 }, { "epoch": 0.7442727173079031, "grad_norm": 8.690731048583984, "learning_rate": 7.661842604416863e-06, "loss": 3.2242, "num_input_tokens_seen": 4450720, "step": 6790 }, { "epoch": 0.7448207826372903, "grad_norm": 6.518171787261963, "learning_rate": 7.630857389904095e-06, "loss": 2.8793, "num_input_tokens_seen": 4454448, "step": 6795 }, { "epoch": 0.7453688479666777, "grad_norm": 10.606318473815918, "learning_rate": 7.599923667737227e-06, "loss": 2.9673, "num_input_tokens_seen": 4457816, "step": 6800 }, { "epoch": 0.7459169132960649, "grad_norm": 10.472159385681152, "learning_rate": 7.5690415296220035e-06, "loss": 3.0352, "num_input_tokens_seen": 4460936, "step": 6805 }, { "epoch": 0.7464649786254521, "grad_norm": 7.0004496574401855, "learning_rate": 7.538211067111223e-06, "loss": 3.165, "num_input_tokens_seen": 4463688, "step": 6810 }, { "epoch": 0.7470130439548394, "grad_norm": 7.692315101623535, "learning_rate": 7.5074323716044835e-06, "loss": 3.3064, "num_input_tokens_seen": 4466616, "step": 6815 }, { "epoch": 0.7475611092842267, "grad_norm": 5.7364702224731445, "learning_rate": 7.476705534347947e-06, "loss": 3.2443, "num_input_tokens_seen": 4470464, "step": 6820 }, { "epoch": 
0.7481091746136139, "grad_norm": 6.589802265167236, "learning_rate": 7.446030646434008e-06, "loss": 2.9859, "num_input_tokens_seen": 4472944, "step": 6825 }, { "epoch": 0.7486572399430012, "grad_norm": 8.241453170776367, "learning_rate": 7.4154077988010466e-06, "loss": 3.1194, "num_input_tokens_seen": 4475896, "step": 6830 }, { "epoch": 0.7492053052723885, "grad_norm": 7.177932262420654, "learning_rate": 7.3848370822332005e-06, "loss": 2.9095, "num_input_tokens_seen": 4478424, "step": 6835 }, { "epoch": 0.7497533706017757, "grad_norm": 6.683755397796631, "learning_rate": 7.354318587360029e-06, "loss": 2.8105, "num_input_tokens_seen": 4481120, "step": 6840 }, { "epoch": 0.7503014359311629, "grad_norm": 7.998584747314453, "learning_rate": 7.323852404656279e-06, "loss": 2.5817, "num_input_tokens_seen": 4484912, "step": 6845 }, { "epoch": 0.7508495012605503, "grad_norm": 5.244688034057617, "learning_rate": 7.293438624441637e-06, "loss": 3.1018, "num_input_tokens_seen": 4488416, "step": 6850 }, { "epoch": 0.7513975665899375, "grad_norm": 7.417481422424316, "learning_rate": 7.263077336880406e-06, "loss": 3.2385, "num_input_tokens_seen": 4491392, "step": 6855 }, { "epoch": 0.7519456319193247, "grad_norm": 5.952940464019775, "learning_rate": 7.232768631981285e-06, "loss": 2.5967, "num_input_tokens_seen": 4494608, "step": 6860 }, { "epoch": 0.7524936972487121, "grad_norm": 7.974299907684326, "learning_rate": 7.202512599597097e-06, "loss": 3.3131, "num_input_tokens_seen": 4497952, "step": 6865 }, { "epoch": 0.7530417625780993, "grad_norm": 10.40588092803955, "learning_rate": 7.172309329424495e-06, "loss": 2.8735, "num_input_tokens_seen": 4500792, "step": 6870 }, { "epoch": 0.7535898279074865, "grad_norm": 7.208824634552002, "learning_rate": 7.142158911003724e-06, "loss": 3.3135, "num_input_tokens_seen": 4504032, "step": 6875 }, { "epoch": 0.7541378932368739, "grad_norm": 7.409761428833008, "learning_rate": 7.112061433718339e-06, "loss": 2.955, "num_input_tokens_seen": 
4506784, "step": 6880 }, { "epoch": 0.7546859585662611, "grad_norm": 6.84408712387085, "learning_rate": 7.082016986794951e-06, "loss": 3.3193, "num_input_tokens_seen": 4510016, "step": 6885 }, { "epoch": 0.7552340238956483, "grad_norm": 5.721726417541504, "learning_rate": 7.052025659302952e-06, "loss": 3.1054, "num_input_tokens_seen": 4512496, "step": 6890 }, { "epoch": 0.7557820892250356, "grad_norm": 7.73302698135376, "learning_rate": 7.022087540154274e-06, "loss": 3.0514, "num_input_tokens_seen": 4515040, "step": 6895 }, { "epoch": 0.7563301545544229, "grad_norm": 8.347733497619629, "learning_rate": 6.992202718103086e-06, "loss": 2.9805, "num_input_tokens_seen": 4517944, "step": 6900 }, { "epoch": 0.7568782198838101, "grad_norm": 7.3970255851745605, "learning_rate": 6.962371281745561e-06, "loss": 3.3263, "num_input_tokens_seen": 4520568, "step": 6905 }, { "epoch": 0.7574262852131974, "grad_norm": 7.3923797607421875, "learning_rate": 6.932593319519618e-06, "loss": 3.2219, "num_input_tokens_seen": 4524592, "step": 6910 }, { "epoch": 0.7579743505425847, "grad_norm": 7.414371490478516, "learning_rate": 6.902868919704627e-06, "loss": 2.5203, "num_input_tokens_seen": 4528528, "step": 6915 }, { "epoch": 0.7585224158719719, "grad_norm": 7.776823043823242, "learning_rate": 6.873198170421175e-06, "loss": 3.1746, "num_input_tokens_seen": 4532008, "step": 6920 }, { "epoch": 0.7590704812013592, "grad_norm": 7.0230889320373535, "learning_rate": 6.84358115963081e-06, "loss": 3.0865, "num_input_tokens_seen": 4536232, "step": 6925 }, { "epoch": 0.7596185465307465, "grad_norm": 4.996485233306885, "learning_rate": 6.814017975135753e-06, "loss": 3.2363, "num_input_tokens_seen": 4539680, "step": 6930 }, { "epoch": 0.7601666118601337, "grad_norm": 9.683207511901855, "learning_rate": 6.784508704578646e-06, "loss": 3.2016, "num_input_tokens_seen": 4542848, "step": 6935 }, { "epoch": 0.760714677189521, "grad_norm": 5.796095848083496, "learning_rate": 6.755053435442324e-06, "loss": 
2.9563, "num_input_tokens_seen": 4547104, "step": 6940 }, { "epoch": 0.7612627425189082, "grad_norm": 7.686697959899902, "learning_rate": 6.725652255049508e-06, "loss": 2.7968, "num_input_tokens_seen": 4550392, "step": 6945 }, { "epoch": 0.7618108078482955, "grad_norm": 7.243149280548096, "learning_rate": 6.696305250562562e-06, "loss": 2.9016, "num_input_tokens_seen": 4553760, "step": 6950 }, { "epoch": 0.7623588731776828, "grad_norm": 5.771494388580322, "learning_rate": 6.667012508983278e-06, "loss": 3.1646, "num_input_tokens_seen": 4558080, "step": 6955 }, { "epoch": 0.76290693850707, "grad_norm": 7.9829816818237305, "learning_rate": 6.63777411715254e-06, "loss": 2.946, "num_input_tokens_seen": 4560904, "step": 6960 }, { "epoch": 0.7634550038364573, "grad_norm": 6.072175979614258, "learning_rate": 6.608590161750131e-06, "loss": 3.2183, "num_input_tokens_seen": 4563864, "step": 6965 }, { "epoch": 0.7640030691658446, "grad_norm": 6.895592212677002, "learning_rate": 6.579460729294429e-06, "loss": 3.2887, "num_input_tokens_seen": 4566800, "step": 6970 }, { "epoch": 0.7645511344952318, "grad_norm": 7.528575897216797, "learning_rate": 6.550385906142212e-06, "loss": 3.0147, "num_input_tokens_seen": 4569680, "step": 6975 }, { "epoch": 0.765099199824619, "grad_norm": 5.899028301239014, "learning_rate": 6.521365778488331e-06, "loss": 2.9008, "num_input_tokens_seen": 4573704, "step": 6980 }, { "epoch": 0.7656472651540064, "grad_norm": 7.313390254974365, "learning_rate": 6.492400432365503e-06, "loss": 3.1414, "num_input_tokens_seen": 4576368, "step": 6985 }, { "epoch": 0.7661953304833936, "grad_norm": 7.083227634429932, "learning_rate": 6.463489953644031e-06, "loss": 2.7539, "num_input_tokens_seen": 4578936, "step": 6990 }, { "epoch": 0.7667433958127808, "grad_norm": 7.272182941436768, "learning_rate": 6.434634428031558e-06, "loss": 3.1749, "num_input_tokens_seen": 4582096, "step": 6995 }, { "epoch": 0.7672914611421682, "grad_norm": 9.697888374328613, "learning_rate": 
6.405833941072834e-06, "loss": 3.1397, "num_input_tokens_seen": 4584400, "step": 7000 }, { "epoch": 0.7678395264715554, "grad_norm": 7.066343307495117, "learning_rate": 6.377088578149418e-06, "loss": 2.8686, "num_input_tokens_seen": 4587688, "step": 7005 }, { "epoch": 0.7683875918009426, "grad_norm": 5.80040979385376, "learning_rate": 6.348398424479454e-06, "loss": 2.7322, "num_input_tokens_seen": 4591120, "step": 7010 }, { "epoch": 0.76893565713033, "grad_norm": 8.803409576416016, "learning_rate": 6.319763565117432e-06, "loss": 3.2123, "num_input_tokens_seen": 4594456, "step": 7015 }, { "epoch": 0.7694837224597172, "grad_norm": 6.382712364196777, "learning_rate": 6.291184084953894e-06, "loss": 3.3465, "num_input_tokens_seen": 4597120, "step": 7020 }, { "epoch": 0.7700317877891044, "grad_norm": 6.3958740234375, "learning_rate": 6.2626600687152064e-06, "loss": 2.9045, "num_input_tokens_seen": 4599416, "step": 7025 }, { "epoch": 0.7705798531184918, "grad_norm": 5.454673767089844, "learning_rate": 6.234191600963335e-06, "loss": 3.1258, "num_input_tokens_seen": 4602760, "step": 7030 }, { "epoch": 0.771127918447879, "grad_norm": 4.992536544799805, "learning_rate": 6.205778766095533e-06, "loss": 3.0881, "num_input_tokens_seen": 4605312, "step": 7035 }, { "epoch": 0.7716759837772662, "grad_norm": 7.264188766479492, "learning_rate": 6.1774216483441394e-06, "loss": 3.117, "num_input_tokens_seen": 4608784, "step": 7040 }, { "epoch": 0.7722240491066535, "grad_norm": 7.106401443481445, "learning_rate": 6.149120331776329e-06, "loss": 2.8674, "num_input_tokens_seen": 4612728, "step": 7045 }, { "epoch": 0.7727721144360408, "grad_norm": 8.04111385345459, "learning_rate": 6.120874900293827e-06, "loss": 3.0187, "num_input_tokens_seen": 4616096, "step": 7050 }, { "epoch": 0.773320179765428, "grad_norm": 7.114358901977539, "learning_rate": 6.092685437632683e-06, "loss": 2.9277, "num_input_tokens_seen": 4619312, "step": 7055 }, { "epoch": 0.7738682450948153, "grad_norm": 
6.135927200317383, "learning_rate": 6.064552027363049e-06, "loss": 2.8, "num_input_tokens_seen": 4623080, "step": 7060 }, { "epoch": 0.7744163104242026, "grad_norm": 9.407398223876953, "learning_rate": 6.0364747528888734e-06, "loss": 2.8471, "num_input_tokens_seen": 4625720, "step": 7065 }, { "epoch": 0.7749643757535898, "grad_norm": 8.590024948120117, "learning_rate": 6.0084536974476995e-06, "loss": 3.1369, "num_input_tokens_seen": 4628368, "step": 7070 }, { "epoch": 0.7755124410829771, "grad_norm": 6.1918721199035645, "learning_rate": 5.980488944110408e-06, "loss": 2.9941, "num_input_tokens_seen": 4631128, "step": 7075 }, { "epoch": 0.7760605064123643, "grad_norm": 6.956912994384766, "learning_rate": 5.9525805757809524e-06, "loss": 3.3899, "num_input_tokens_seen": 4634672, "step": 7080 }, { "epoch": 0.7766085717417516, "grad_norm": 6.198210716247559, "learning_rate": 5.9247286751961366e-06, "loss": 3.165, "num_input_tokens_seen": 4638184, "step": 7085 }, { "epoch": 0.7771566370711389, "grad_norm": 6.877211570739746, "learning_rate": 5.896933324925372e-06, "loss": 3.1694, "num_input_tokens_seen": 4641976, "step": 7090 }, { "epoch": 0.7777047024005261, "grad_norm": 6.007309436798096, "learning_rate": 5.869194607370409e-06, "loss": 3.1036, "num_input_tokens_seen": 4645280, "step": 7095 }, { "epoch": 0.7782527677299134, "grad_norm": 7.9656572341918945, "learning_rate": 5.8415126047650955e-06, "loss": 3.2545, "num_input_tokens_seen": 4648904, "step": 7100 }, { "epoch": 0.7788008330593007, "grad_norm": 7.05634069442749, "learning_rate": 5.813887399175169e-06, "loss": 2.912, "num_input_tokens_seen": 4651232, "step": 7105 }, { "epoch": 0.7793488983886879, "grad_norm": 8.77833080291748, "learning_rate": 5.7863190724979695e-06, "loss": 3.0476, "num_input_tokens_seen": 4654288, "step": 7110 }, { "epoch": 0.7798969637180752, "grad_norm": 6.191843032836914, "learning_rate": 5.75880770646221e-06, "loss": 3.1158, "num_input_tokens_seen": 4657808, "step": 7115 }, { "epoch": 
0.7804450290474625, "grad_norm": 5.634969234466553, "learning_rate": 5.73135338262776e-06, "loss": 2.8591, "num_input_tokens_seen": 4661440, "step": 7120 }, { "epoch": 0.7809930943768497, "grad_norm": 6.004340648651123, "learning_rate": 5.7039561823853615e-06, "loss": 2.8518, "num_input_tokens_seen": 4665104, "step": 7125 }, { "epoch": 0.7815411597062369, "grad_norm": 7.3791680335998535, "learning_rate": 5.676616186956413e-06, "loss": 3.1628, "num_input_tokens_seen": 4668432, "step": 7130 }, { "epoch": 0.7820892250356243, "grad_norm": 9.166860580444336, "learning_rate": 5.649333477392735e-06, "loss": 3.3455, "num_input_tokens_seen": 4671688, "step": 7135 }, { "epoch": 0.7826372903650115, "grad_norm": 6.651597023010254, "learning_rate": 5.622108134576312e-06, "loss": 3.4196, "num_input_tokens_seen": 4675408, "step": 7140 }, { "epoch": 0.7831853556943987, "grad_norm": 7.5387797355651855, "learning_rate": 5.594940239219049e-06, "loss": 3.2571, "num_input_tokens_seen": 4678440, "step": 7145 }, { "epoch": 0.7837334210237861, "grad_norm": 9.256987571716309, "learning_rate": 5.5678298718625674e-06, "loss": 3.1553, "num_input_tokens_seen": 4681320, "step": 7150 }, { "epoch": 0.7842814863531733, "grad_norm": 8.727250099182129, "learning_rate": 5.54077711287792e-06, "loss": 3.2874, "num_input_tokens_seen": 4685024, "step": 7155 }, { "epoch": 0.7848295516825605, "grad_norm": 8.900041580200195, "learning_rate": 5.513782042465385e-06, "loss": 2.8368, "num_input_tokens_seen": 4687568, "step": 7160 }, { "epoch": 0.7853776170119479, "grad_norm": 10.776511192321777, "learning_rate": 5.4868447406542125e-06, "loss": 2.9062, "num_input_tokens_seen": 4690632, "step": 7165 }, { "epoch": 0.7859256823413351, "grad_norm": 6.669962406158447, "learning_rate": 5.459965287302396e-06, "loss": 3.3375, "num_input_tokens_seen": 4694528, "step": 7170 }, { "epoch": 0.7864737476707223, "grad_norm": 8.748539924621582, "learning_rate": 5.4331437620964235e-06, "loss": 3.1538, "num_input_tokens_seen": 
4697304, "step": 7175 }, { "epoch": 0.7870218130001096, "grad_norm": 6.20130729675293, "learning_rate": 5.406380244551077e-06, "loss": 3.3296, "num_input_tokens_seen": 4701400, "step": 7180 }, { "epoch": 0.7875698783294969, "grad_norm": 6.8918304443359375, "learning_rate": 5.379674814009133e-06, "loss": 2.9058, "num_input_tokens_seen": 4704688, "step": 7185 }, { "epoch": 0.7881179436588841, "grad_norm": 8.053811073303223, "learning_rate": 5.353027549641185e-06, "loss": 3.19, "num_input_tokens_seen": 4707832, "step": 7190 }, { "epoch": 0.7886660089882714, "grad_norm": 8.722176551818848, "learning_rate": 5.326438530445394e-06, "loss": 3.1039, "num_input_tokens_seen": 4711272, "step": 7195 }, { "epoch": 0.7892140743176587, "grad_norm": 8.22156810760498, "learning_rate": 5.299907835247228e-06, "loss": 2.9179, "num_input_tokens_seen": 4714584, "step": 7200 }, { "epoch": 0.7897621396470459, "grad_norm": 8.812997817993164, "learning_rate": 5.273435542699259e-06, "loss": 2.9421, "num_input_tokens_seen": 4717960, "step": 7205 }, { "epoch": 0.7903102049764332, "grad_norm": 7.295377731323242, "learning_rate": 5.247021731280927e-06, "loss": 3.1538, "num_input_tokens_seen": 4721208, "step": 7210 }, { "epoch": 0.7908582703058205, "grad_norm": 6.8964762687683105, "learning_rate": 5.220666479298283e-06, "loss": 2.9399, "num_input_tokens_seen": 4723760, "step": 7215 }, { "epoch": 0.7914063356352077, "grad_norm": 8.851302146911621, "learning_rate": 5.194369864883783e-06, "loss": 3.0368, "num_input_tokens_seen": 4727808, "step": 7220 }, { "epoch": 0.791954400964595, "grad_norm": 6.765636444091797, "learning_rate": 5.168131965996051e-06, "loss": 2.5498, "num_input_tokens_seen": 4730984, "step": 7225 }, { "epoch": 0.7925024662939822, "grad_norm": 6.0574750900268555, "learning_rate": 5.1419528604196385e-06, "loss": 2.9546, "num_input_tokens_seen": 4734472, "step": 7230 }, { "epoch": 0.7930505316233695, "grad_norm": 6.703484535217285, "learning_rate": 5.1158326257647855e-06, "loss": 
3.0816, "num_input_tokens_seen": 4736976, "step": 7235 }, { "epoch": 0.7935985969527568, "grad_norm": 5.429347038269043, "learning_rate": 5.089771339467236e-06, "loss": 2.8567, "num_input_tokens_seen": 4740592, "step": 7240 }, { "epoch": 0.794146662282144, "grad_norm": 6.805422306060791, "learning_rate": 5.06376907878795e-06, "loss": 3.0524, "num_input_tokens_seen": 4744232, "step": 7245 }, { "epoch": 0.7946947276115313, "grad_norm": 7.566915512084961, "learning_rate": 5.0378259208129054e-06, "loss": 2.7767, "num_input_tokens_seen": 4748392, "step": 7250 }, { "epoch": 0.7952427929409186, "grad_norm": 8.171722412109375, "learning_rate": 5.011941942452872e-06, "loss": 2.9925, "num_input_tokens_seen": 4751496, "step": 7255 }, { "epoch": 0.7957908582703058, "grad_norm": 9.192333221435547, "learning_rate": 4.986117220443173e-06, "loss": 3.3195, "num_input_tokens_seen": 4754624, "step": 7260 }, { "epoch": 0.796338923599693, "grad_norm": 6.089689254760742, "learning_rate": 4.960351831343452e-06, "loss": 3.3298, "num_input_tokens_seen": 4758304, "step": 7265 }, { "epoch": 0.7968869889290804, "grad_norm": 7.405531883239746, "learning_rate": 4.9346458515374785e-06, "loss": 3.3122, "num_input_tokens_seen": 4760592, "step": 7270 }, { "epoch": 0.7974350542584676, "grad_norm": 7.917971611022949, "learning_rate": 4.908999357232874e-06, "loss": 3.0276, "num_input_tokens_seen": 4763392, "step": 7275 }, { "epoch": 0.7979831195878548, "grad_norm": 8.550086975097656, "learning_rate": 4.8834124244609145e-06, "loss": 3.2591, "num_input_tokens_seen": 4766544, "step": 7280 }, { "epoch": 0.7985311849172422, "grad_norm": 7.939424514770508, "learning_rate": 4.857885129076317e-06, "loss": 2.8357, "num_input_tokens_seen": 4769408, "step": 7285 }, { "epoch": 0.7990792502466294, "grad_norm": 6.404162406921387, "learning_rate": 4.8324175467569845e-06, "loss": 3.0799, "num_input_tokens_seen": 4773344, "step": 7290 }, { "epoch": 0.7996273155760166, "grad_norm": 7.251323699951172, "learning_rate": 
4.807009753003791e-06, "loss": 3.1363, "num_input_tokens_seen": 4776640, "step": 7295 }, { "epoch": 0.800175380905404, "grad_norm": 8.667237281799316, "learning_rate": 4.781661823140366e-06, "loss": 3.2124, "num_input_tokens_seen": 4779376, "step": 7300 }, { "epoch": 0.8007234462347912, "grad_norm": 8.147212028503418, "learning_rate": 4.756373832312879e-06, "loss": 2.874, "num_input_tokens_seen": 4781952, "step": 7305 }, { "epoch": 0.8012715115641784, "grad_norm": 8.90487003326416, "learning_rate": 4.731145855489794e-06, "loss": 3.2025, "num_input_tokens_seen": 4784816, "step": 7310 }, { "epoch": 0.8018195768935658, "grad_norm": 7.192740440368652, "learning_rate": 4.70597796746165e-06, "loss": 2.9843, "num_input_tokens_seen": 4787592, "step": 7315 }, { "epoch": 0.802367642222953, "grad_norm": 6.346043586730957, "learning_rate": 4.6808702428408706e-06, "loss": 3.1331, "num_input_tokens_seen": 4790256, "step": 7320 }, { "epoch": 0.8029157075523402, "grad_norm": 8.076735496520996, "learning_rate": 4.655822756061503e-06, "loss": 3.1571, "num_input_tokens_seen": 4792768, "step": 7325 }, { "epoch": 0.8034637728817275, "grad_norm": 7.521450519561768, "learning_rate": 4.630835581379006e-06, "loss": 2.929, "num_input_tokens_seen": 4796152, "step": 7330 }, { "epoch": 0.8040118382111148, "grad_norm": 12.113771438598633, "learning_rate": 4.605908792870067e-06, "loss": 3.1268, "num_input_tokens_seen": 4798376, "step": 7335 }, { "epoch": 0.804559903540502, "grad_norm": 5.997092247009277, "learning_rate": 4.581042464432328e-06, "loss": 2.8665, "num_input_tokens_seen": 4802104, "step": 7340 }, { "epoch": 0.8051079688698893, "grad_norm": 6.922906875610352, "learning_rate": 4.556236669784197e-06, "loss": 3.3316, "num_input_tokens_seen": 4805648, "step": 7345 }, { "epoch": 0.8056560341992766, "grad_norm": 9.63893985748291, "learning_rate": 4.531491482464628e-06, "loss": 3.2614, "num_input_tokens_seen": 4810112, "step": 7350 }, { "epoch": 0.8062040995286638, "grad_norm": 
8.894881248474121, "learning_rate": 4.5068069758329e-06, "loss": 3.2695, "num_input_tokens_seen": 4813192, "step": 7355 }, { "epoch": 0.8067521648580511, "grad_norm": 6.436181545257568, "learning_rate": 4.482183223068387e-06, "loss": 2.8622, "num_input_tokens_seen": 4815768, "step": 7360 }, { "epoch": 0.8073002301874384, "grad_norm": 7.975905895233154, "learning_rate": 4.457620297170381e-06, "loss": 3.3166, "num_input_tokens_seen": 4819144, "step": 7365 }, { "epoch": 0.8078482955168256, "grad_norm": 7.515452861785889, "learning_rate": 4.433118270957818e-06, "loss": 2.5207, "num_input_tokens_seen": 4822152, "step": 7370 }, { "epoch": 0.8083963608462129, "grad_norm": 6.722434997558594, "learning_rate": 4.408677217069096e-06, "loss": 3.1815, "num_input_tokens_seen": 4825920, "step": 7375 }, { "epoch": 0.8089444261756001, "grad_norm": 6.1937031745910645, "learning_rate": 4.3842972079618765e-06, "loss": 3.0536, "num_input_tokens_seen": 4829224, "step": 7380 }, { "epoch": 0.8094924915049874, "grad_norm": 7.4900898933410645, "learning_rate": 4.359978315912827e-06, "loss": 2.9555, "num_input_tokens_seen": 4832576, "step": 7385 }, { "epoch": 0.8100405568343747, "grad_norm": 7.267132759094238, "learning_rate": 4.33572061301743e-06, "loss": 3.376, "num_input_tokens_seen": 4834896, "step": 7390 }, { "epoch": 0.8105886221637619, "grad_norm": 6.553824424743652, "learning_rate": 4.311524171189782e-06, "loss": 3.1203, "num_input_tokens_seen": 4838536, "step": 7395 }, { "epoch": 0.8111366874931492, "grad_norm": 6.04332971572876, "learning_rate": 4.28738906216235e-06, "loss": 2.898, "num_input_tokens_seen": 4842312, "step": 7400 }, { "epoch": 0.8116847528225365, "grad_norm": 6.300970077514648, "learning_rate": 4.263315357485775e-06, "loss": 3.2478, "num_input_tokens_seen": 4845640, "step": 7405 }, { "epoch": 0.8122328181519237, "grad_norm": 8.834260940551758, "learning_rate": 4.2393031285286796e-06, "loss": 3.1214, "num_input_tokens_seen": 4848880, "step": 7410 }, { "epoch": 
0.812780883481311, "grad_norm": 7.611583709716797, "learning_rate": 4.215352446477413e-06, "loss": 2.8593, "num_input_tokens_seen": 4852904, "step": 7415 }, { "epoch": 0.8133289488106983, "grad_norm": 5.708853244781494, "learning_rate": 4.191463382335867e-06, "loss": 3.1984, "num_input_tokens_seen": 4855720, "step": 7420 }, { "epoch": 0.8138770141400855, "grad_norm": 5.545560836791992, "learning_rate": 4.167636006925274e-06, "loss": 3.1826, "num_input_tokens_seen": 4859488, "step": 7425 }, { "epoch": 0.8144250794694727, "grad_norm": 9.735588073730469, "learning_rate": 4.143870390883978e-06, "loss": 2.8356, "num_input_tokens_seen": 4862808, "step": 7430 }, { "epoch": 0.8149731447988601, "grad_norm": 10.298928260803223, "learning_rate": 4.120166604667225e-06, "loss": 2.9738, "num_input_tokens_seen": 4866608, "step": 7435 }, { "epoch": 0.8155212101282473, "grad_norm": 8.623414039611816, "learning_rate": 4.096524718546974e-06, "loss": 3.0776, "num_input_tokens_seen": 4868832, "step": 7440 }, { "epoch": 0.8160692754576345, "grad_norm": 10.033533096313477, "learning_rate": 4.072944802611655e-06, "loss": 3.1786, "num_input_tokens_seen": 4872536, "step": 7445 }, { "epoch": 0.8166173407870219, "grad_norm": 8.511270523071289, "learning_rate": 4.0494269267660144e-06, "loss": 3.4183, "num_input_tokens_seen": 4876032, "step": 7450 }, { "epoch": 0.8171654061164091, "grad_norm": 6.882598876953125, "learning_rate": 4.025971160730846e-06, "loss": 3.0995, "num_input_tokens_seen": 4878536, "step": 7455 }, { "epoch": 0.8177134714457963, "grad_norm": 6.228262901306152, "learning_rate": 4.002577574042829e-06, "loss": 2.8603, "num_input_tokens_seen": 4880976, "step": 7460 }, { "epoch": 0.8182615367751837, "grad_norm": 9.165740013122559, "learning_rate": 3.9792462360542935e-06, "loss": 2.8565, "num_input_tokens_seen": 4884688, "step": 7465 }, { "epoch": 0.8188096021045709, "grad_norm": 7.1637701988220215, "learning_rate": 3.955977215933046e-06, "loss": 2.9947, "num_input_tokens_seen": 
4888200, "step": 7470 }, { "epoch": 0.8193576674339581, "grad_norm": 7.321343421936035, "learning_rate": 3.932770582662135e-06, "loss": 3.1105, "num_input_tokens_seen": 4890856, "step": 7475 }, { "epoch": 0.8199057327633454, "grad_norm": 7.804381847381592, "learning_rate": 3.9096264050396485e-06, "loss": 2.9519, "num_input_tokens_seen": 4893712, "step": 7480 }, { "epoch": 0.8204537980927327, "grad_norm": 6.569583415985107, "learning_rate": 3.886544751678547e-06, "loss": 3.0457, "num_input_tokens_seen": 4897104, "step": 7485 }, { "epoch": 0.8210018634221199, "grad_norm": 10.908699035644531, "learning_rate": 3.863525691006406e-06, "loss": 3.5541, "num_input_tokens_seen": 4900616, "step": 7490 }, { "epoch": 0.8215499287515072, "grad_norm": 8.427760124206543, "learning_rate": 3.840569291265242e-06, "loss": 2.9541, "num_input_tokens_seen": 4902848, "step": 7495 }, { "epoch": 0.8220979940808945, "grad_norm": 10.59475040435791, "learning_rate": 3.817675620511329e-06, "loss": 2.932, "num_input_tokens_seen": 4905424, "step": 7500 }, { "epoch": 0.8226460594102817, "grad_norm": 8.56042194366455, "learning_rate": 3.794844746614956e-06, "loss": 3.3314, "num_input_tokens_seen": 4908016, "step": 7505 }, { "epoch": 0.823194124739669, "grad_norm": 8.957588195800781, "learning_rate": 3.772076737260241e-06, "loss": 3.4287, "num_input_tokens_seen": 4912944, "step": 7510 }, { "epoch": 0.8237421900690562, "grad_norm": 8.641453742980957, "learning_rate": 3.7493716599449557e-06, "loss": 2.7836, "num_input_tokens_seen": 4915344, "step": 7515 }, { "epoch": 0.8242902553984435, "grad_norm": 9.905373573303223, "learning_rate": 3.726729581980287e-06, "loss": 3.3792, "num_input_tokens_seen": 4918280, "step": 7520 }, { "epoch": 0.8248383207278308, "grad_norm": 6.359044075012207, "learning_rate": 3.7041505704906554e-06, "loss": 2.6283, "num_input_tokens_seen": 4923056, "step": 7525 }, { "epoch": 0.825386386057218, "grad_norm": 8.611063957214355, "learning_rate": 3.681634692413527e-06, "loss": 
3.0805, "num_input_tokens_seen": 4925992, "step": 7530 }, { "epoch": 0.8259344513866053, "grad_norm": 6.022265911102295, "learning_rate": 3.659182014499199e-06, "loss": 2.9173, "num_input_tokens_seen": 4928312, "step": 7535 }, { "epoch": 0.8264825167159926, "grad_norm": 7.828344821929932, "learning_rate": 3.636792603310593e-06, "loss": 3.3786, "num_input_tokens_seen": 4931816, "step": 7540 }, { "epoch": 0.8270305820453798, "grad_norm": 9.197246551513672, "learning_rate": 3.6144665252230897e-06, "loss": 3.1869, "num_input_tokens_seen": 4934904, "step": 7545 }, { "epoch": 0.827578647374767, "grad_norm": 6.626698017120361, "learning_rate": 3.5922038464243e-06, "loss": 2.864, "num_input_tokens_seen": 4937320, "step": 7550 }, { "epoch": 0.8281267127041544, "grad_norm": 6.149302959442139, "learning_rate": 3.570004632913884e-06, "loss": 2.9841, "num_input_tokens_seen": 4940472, "step": 7555 }, { "epoch": 0.8286747780335416, "grad_norm": 5.897488117218018, "learning_rate": 3.5478689505033635e-06, "loss": 3.0083, "num_input_tokens_seen": 4943240, "step": 7560 }, { "epoch": 0.8292228433629288, "grad_norm": 5.379867076873779, "learning_rate": 3.5257968648159085e-06, "loss": 3.2044, "num_input_tokens_seen": 4947448, "step": 7565 }, { "epoch": 0.8297709086923162, "grad_norm": 8.127168655395508, "learning_rate": 3.503788441286143e-06, "loss": 3.0341, "num_input_tokens_seen": 4950720, "step": 7570 }, { "epoch": 0.8303189740217034, "grad_norm": 7.3780364990234375, "learning_rate": 3.4818437451599796e-06, "loss": 3.2321, "num_input_tokens_seen": 4954728, "step": 7575 }, { "epoch": 0.8308670393510906, "grad_norm": 6.4768757820129395, "learning_rate": 3.459962841494391e-06, "loss": 3.1017, "num_input_tokens_seen": 4957936, "step": 7580 }, { "epoch": 0.831415104680478, "grad_norm": 7.365682125091553, "learning_rate": 3.4381457951572245e-06, "loss": 2.8212, "num_input_tokens_seen": 4961240, "step": 7585 }, { "epoch": 0.8319631700098652, "grad_norm": 7.922868251800537, "learning_rate": 
3.41639267082704e-06, "loss": 2.8681, "num_input_tokens_seen": 4964016, "step": 7590 }, { "epoch": 0.8325112353392524, "grad_norm": 4.56962251663208, "learning_rate": 3.3947035329928768e-06, "loss": 3.0944, "num_input_tokens_seen": 4966208, "step": 7595 }, { "epoch": 0.8330593006686398, "grad_norm": 8.027546882629395, "learning_rate": 3.3730784459540755e-06, "loss": 2.62, "num_input_tokens_seen": 4969656, "step": 7600 }, { "epoch": 0.833607365998027, "grad_norm": 9.634477615356445, "learning_rate": 3.3515174738201204e-06, "loss": 3.0848, "num_input_tokens_seen": 4972656, "step": 7605 }, { "epoch": 0.8341554313274142, "grad_norm": 6.137497901916504, "learning_rate": 3.3300206805103902e-06, "loss": 2.8019, "num_input_tokens_seen": 4976816, "step": 7610 }, { "epoch": 0.8347034966568014, "grad_norm": 6.958483695983887, "learning_rate": 3.3085881297540143e-06, "loss": 3.1585, "num_input_tokens_seen": 4979448, "step": 7615 }, { "epoch": 0.8352515619861888, "grad_norm": 6.135876178741455, "learning_rate": 3.2872198850896763e-06, "loss": 3.4485, "num_input_tokens_seen": 4982096, "step": 7620 }, { "epoch": 0.835799627315576, "grad_norm": 5.784817218780518, "learning_rate": 3.265916009865405e-06, "loss": 2.5781, "num_input_tokens_seen": 4987624, "step": 7625 }, { "epoch": 0.8363476926449632, "grad_norm": 7.2112603187561035, "learning_rate": 3.2446765672384083e-06, "loss": 3.1842, "num_input_tokens_seen": 4991016, "step": 7630 }, { "epoch": 0.8368957579743506, "grad_norm": 8.30711555480957, "learning_rate": 3.223501620174871e-06, "loss": 2.8567, "num_input_tokens_seen": 4994496, "step": 7635 }, { "epoch": 0.8374438233037378, "grad_norm": 5.6931915283203125, "learning_rate": 3.2023912314497835e-06, "loss": 3.109, "num_input_tokens_seen": 4997176, "step": 7640 }, { "epoch": 0.837991888633125, "grad_norm": 7.178470611572266, "learning_rate": 3.18134546364674e-06, "loss": 3.1472, "num_input_tokens_seen": 5001168, "step": 7645 }, { "epoch": 0.8385399539625124, "grad_norm": 
6.247611045837402, "learning_rate": 3.160364379157771e-06, "loss": 3.0272, "num_input_tokens_seen": 5004928, "step": 7650 }, { "epoch": 0.8390880192918996, "grad_norm": 8.314835548400879, "learning_rate": 3.1394480401831376e-06, "loss": 3.1062, "num_input_tokens_seen": 5007976, "step": 7655 }, { "epoch": 0.8396360846212868, "grad_norm": 8.253650665283203, "learning_rate": 3.118596508731153e-06, "loss": 3.1373, "num_input_tokens_seen": 5010840, "step": 7660 }, { "epoch": 0.8401841499506741, "grad_norm": 8.37070083618164, "learning_rate": 3.0978098466180246e-06, "loss": 3.1474, "num_input_tokens_seen": 5013264, "step": 7665 }, { "epoch": 0.8407322152800614, "grad_norm": 7.3890700340271, "learning_rate": 3.0770881154676244e-06, "loss": 2.9336, "num_input_tokens_seen": 5016288, "step": 7670 }, { "epoch": 0.8412802806094486, "grad_norm": 9.55408000946045, "learning_rate": 3.056431376711341e-06, "loss": 3.1662, "num_input_tokens_seen": 5019184, "step": 7675 }, { "epoch": 0.8418283459388359, "grad_norm": 9.764185905456543, "learning_rate": 3.035839691587891e-06, "loss": 3.3416, "num_input_tokens_seen": 5022032, "step": 7680 }, { "epoch": 0.8423764112682232, "grad_norm": 6.572988510131836, "learning_rate": 3.015313121143132e-06, "loss": 3.44, "num_input_tokens_seen": 5025704, "step": 7685 }, { "epoch": 0.8429244765976104, "grad_norm": 6.35365629196167, "learning_rate": 2.994851726229872e-06, "loss": 2.8245, "num_input_tokens_seen": 5029360, "step": 7690 }, { "epoch": 0.8434725419269977, "grad_norm": 5.579585552215576, "learning_rate": 2.9744555675077195e-06, "loss": 2.9123, "num_input_tokens_seen": 5032232, "step": 7695 }, { "epoch": 0.844020607256385, "grad_norm": 9.263272285461426, "learning_rate": 2.9541247054428732e-06, "loss": 3.1231, "num_input_tokens_seen": 5034616, "step": 7700 }, { "epoch": 0.8445686725857722, "grad_norm": 6.095417022705078, "learning_rate": 2.933859200307948e-06, "loss": 2.822, "num_input_tokens_seen": 5037736, "step": 7705 }, { "epoch": 
0.8451167379151595, "grad_norm": 7.388354778289795, "learning_rate": 2.913659112181824e-06, "loss": 2.8813, "num_input_tokens_seen": 5040224, "step": 7710 }, { "epoch": 0.8456648032445467, "grad_norm": 5.476953983306885, "learning_rate": 2.893524500949424e-06, "loss": 2.9058, "num_input_tokens_seen": 5042920, "step": 7715 }, { "epoch": 0.846212868573934, "grad_norm": 8.243193626403809, "learning_rate": 2.8734554263015717e-06, "loss": 3.0815, "num_input_tokens_seen": 5046384, "step": 7720 }, { "epoch": 0.8467609339033213, "grad_norm": 5.285266399383545, "learning_rate": 2.853451947734795e-06, "loss": 2.8613, "num_input_tokens_seen": 5050096, "step": 7725 }, { "epoch": 0.8473089992327085, "grad_norm": 7.07433557510376, "learning_rate": 2.833514124551162e-06, "loss": 3.2751, "num_input_tokens_seen": 5053016, "step": 7730 }, { "epoch": 0.8478570645620958, "grad_norm": 7.447408676147461, "learning_rate": 2.8136420158580923e-06, "loss": 3.199, "num_input_tokens_seen": 5055816, "step": 7735 }, { "epoch": 0.8484051298914831, "grad_norm": 6.6446757316589355, "learning_rate": 2.793835680568202e-06, "loss": 2.9382, "num_input_tokens_seen": 5059872, "step": 7740 }, { "epoch": 0.8489531952208703, "grad_norm": 6.634135723114014, "learning_rate": 2.774095177399108e-06, "loss": 2.7486, "num_input_tokens_seen": 5063104, "step": 7745 }, { "epoch": 0.8495012605502575, "grad_norm": 6.349103927612305, "learning_rate": 2.75442056487325e-06, "loss": 2.8114, "num_input_tokens_seen": 5067312, "step": 7750 }, { "epoch": 0.8500493258796449, "grad_norm": 9.979939460754395, "learning_rate": 2.7348119013177605e-06, "loss": 3.0652, "num_input_tokens_seen": 5070232, "step": 7755 }, { "epoch": 0.8505973912090321, "grad_norm": 9.005098342895508, "learning_rate": 2.7152692448642297e-06, "loss": 2.7476, "num_input_tokens_seen": 5073736, "step": 7760 }, { "epoch": 0.8511454565384193, "grad_norm": 7.502773761749268, "learning_rate": 2.695792653448573e-06, "loss": 2.6705, "num_input_tokens_seen": 
5076032, "step": 7765 }, { "epoch": 0.8516935218678067, "grad_norm": 6.317687511444092, "learning_rate": 2.6763821848108634e-06, "loss": 2.7642, "num_input_tokens_seen": 5078736, "step": 7770 }, { "epoch": 0.8522415871971939, "grad_norm": 6.520786762237549, "learning_rate": 2.6570378964951322e-06, "loss": 2.9362, "num_input_tokens_seen": 5081560, "step": 7775 }, { "epoch": 0.8527896525265811, "grad_norm": 7.41638708114624, "learning_rate": 2.637759845849211e-06, "loss": 2.9981, "num_input_tokens_seen": 5084504, "step": 7780 }, { "epoch": 0.8533377178559685, "grad_norm": 7.572868824005127, "learning_rate": 2.6185480900245836e-06, "loss": 2.7595, "num_input_tokens_seen": 5088232, "step": 7785 }, { "epoch": 0.8538857831853557, "grad_norm": 6.104272842407227, "learning_rate": 2.5994026859761766e-06, "loss": 2.9084, "num_input_tokens_seen": 5090552, "step": 7790 }, { "epoch": 0.8544338485147429, "grad_norm": 8.887699127197266, "learning_rate": 2.5803236904622134e-06, "loss": 3.3633, "num_input_tokens_seen": 5093720, "step": 7795 }, { "epoch": 0.8549819138441302, "grad_norm": 7.048088550567627, "learning_rate": 2.5613111600440637e-06, "loss": 2.94, "num_input_tokens_seen": 5096984, "step": 7800 }, { "epoch": 0.8555299791735175, "grad_norm": 7.457699775695801, "learning_rate": 2.5423651510860292e-06, "loss": 2.9086, "num_input_tokens_seen": 5100088, "step": 7805 }, { "epoch": 0.8560780445029047, "grad_norm": 7.127599239349365, "learning_rate": 2.5234857197552197e-06, "loss": 3.2513, "num_input_tokens_seen": 5102776, "step": 7810 }, { "epoch": 0.856626109832292, "grad_norm": 6.716034412384033, "learning_rate": 2.5046729220213615e-06, "loss": 3.1929, "num_input_tokens_seen": 5106680, "step": 7815 }, { "epoch": 0.8571741751616793, "grad_norm": 8.033172607421875, "learning_rate": 2.4859268136566415e-06, "loss": 3.2828, "num_input_tokens_seen": 5110400, "step": 7820 }, { "epoch": 0.8577222404910665, "grad_norm": 7.232936859130859, "learning_rate": 2.4672474502355406e-06, 
"loss": 2.9178, "num_input_tokens_seen": 5113896, "step": 7825 }, { "epoch": 0.8582703058204538, "grad_norm": 7.433042526245117, "learning_rate": 2.4486348871346738e-06, "loss": 3.2398, "num_input_tokens_seen": 5116440, "step": 7830 }, { "epoch": 0.858818371149841, "grad_norm": 6.7432756423950195, "learning_rate": 2.4300891795326157e-06, "loss": 2.8448, "num_input_tokens_seen": 5119296, "step": 7835 }, { "epoch": 0.8593664364792283, "grad_norm": 6.955072402954102, "learning_rate": 2.4116103824097345e-06, "loss": 3.0554, "num_input_tokens_seen": 5122136, "step": 7840 }, { "epoch": 0.8599145018086156, "grad_norm": 7.900850296020508, "learning_rate": 2.3931985505480564e-06, "loss": 2.9951, "num_input_tokens_seen": 5125056, "step": 7845 }, { "epoch": 0.8604625671380028, "grad_norm": 5.292073726654053, "learning_rate": 2.374853738531063e-06, "loss": 3.1992, "num_input_tokens_seen": 5128688, "step": 7850 }, { "epoch": 0.8610106324673901, "grad_norm": 6.894753932952881, "learning_rate": 2.356576000743557e-06, "loss": 3.2569, "num_input_tokens_seen": 5132184, "step": 7855 }, { "epoch": 0.8615586977967774, "grad_norm": 6.101509094238281, "learning_rate": 2.3383653913714996e-06, "loss": 2.8422, "num_input_tokens_seen": 5136352, "step": 7860 }, { "epoch": 0.8621067631261646, "grad_norm": 6.467989444732666, "learning_rate": 2.3202219644018365e-06, "loss": 3.0615, "num_input_tokens_seen": 5139152, "step": 7865 }, { "epoch": 0.8626548284555519, "grad_norm": 6.982528209686279, "learning_rate": 2.3021457736223412e-06, "loss": 3.0371, "num_input_tokens_seen": 5142336, "step": 7870 }, { "epoch": 0.8632028937849392, "grad_norm": 5.719668388366699, "learning_rate": 2.2841368726214755e-06, "loss": 3.1793, "num_input_tokens_seen": 5145504, "step": 7875 }, { "epoch": 0.8637509591143264, "grad_norm": 6.815168380737305, "learning_rate": 2.2661953147882024e-06, "loss": 3.2501, "num_input_tokens_seen": 5148672, "step": 7880 }, { "epoch": 0.8642990244437136, "grad_norm": 6.836389541625977, 
"learning_rate": 2.2483211533118357e-06, "loss": 3.2825, "num_input_tokens_seen": 5152104, "step": 7885 }, { "epoch": 0.864847089773101, "grad_norm": 9.11992359161377, "learning_rate": 2.2305144411819052e-06, "loss": 3.1458, "num_input_tokens_seen": 5154840, "step": 7890 }, { "epoch": 0.8653951551024882, "grad_norm": 7.1421308517456055, "learning_rate": 2.212775231187966e-06, "loss": 3.2977, "num_input_tokens_seen": 5157496, "step": 7895 }, { "epoch": 0.8659432204318754, "grad_norm": 6.900385856628418, "learning_rate": 2.1951035759194605e-06, "loss": 2.9658, "num_input_tokens_seen": 5161824, "step": 7900 }, { "epoch": 0.8664912857612628, "grad_norm": 8.681853294372559, "learning_rate": 2.1774995277655556e-06, "loss": 2.9868, "num_input_tokens_seen": 5164840, "step": 7905 }, { "epoch": 0.86703935109065, "grad_norm": 6.421346187591553, "learning_rate": 2.1599631389150027e-06, "loss": 3.3, "num_input_tokens_seen": 5169320, "step": 7910 }, { "epoch": 0.8675874164200372, "grad_norm": 6.86265754699707, "learning_rate": 2.1424944613559537e-06, "loss": 3.1633, "num_input_tokens_seen": 5172784, "step": 7915 }, { "epoch": 0.8681354817494246, "grad_norm": 4.766587257385254, "learning_rate": 2.1250935468758446e-06, "loss": 3.2877, "num_input_tokens_seen": 5175600, "step": 7920 }, { "epoch": 0.8686835470788118, "grad_norm": 6.533714771270752, "learning_rate": 2.1077604470612106e-06, "loss": 2.9995, "num_input_tokens_seen": 5178624, "step": 7925 }, { "epoch": 0.869231612408199, "grad_norm": 7.438570022583008, "learning_rate": 2.0904952132975386e-06, "loss": 2.7973, "num_input_tokens_seen": 5181688, "step": 7930 }, { "epoch": 0.8697796777375864, "grad_norm": 7.600935459136963, "learning_rate": 2.0732978967691357e-06, "loss": 3.4927, "num_input_tokens_seen": 5184008, "step": 7935 }, { "epoch": 0.8703277430669736, "grad_norm": 10.930978775024414, "learning_rate": 2.0561685484589506e-06, "loss": 3.0121, "num_input_tokens_seen": 5187600, "step": 7940 }, { "epoch": 0.8708758083963608, 
"grad_norm": 8.671449661254883, "learning_rate": 2.0391072191484338e-06, "loss": 3.1692, "num_input_tokens_seen": 5190976, "step": 7945 }, { "epoch": 0.8714238737257481, "grad_norm": 9.432777404785156, "learning_rate": 2.0221139594174018e-06, "loss": 3.0802, "num_input_tokens_seen": 5193664, "step": 7950 }, { "epoch": 0.8719719390551354, "grad_norm": 8.096484184265137, "learning_rate": 2.0051888196438552e-06, "loss": 2.8438, "num_input_tokens_seen": 5196696, "step": 7955 }, { "epoch": 0.8725200043845226, "grad_norm": 8.458807945251465, "learning_rate": 1.988331850003855e-06, "loss": 3.4075, "num_input_tokens_seen": 5200640, "step": 7960 }, { "epoch": 0.8730680697139099, "grad_norm": 9.191377639770508, "learning_rate": 1.971543100471368e-06, "loss": 3.276, "num_input_tokens_seen": 5204240, "step": 7965 }, { "epoch": 0.8736161350432972, "grad_norm": 6.790607929229736, "learning_rate": 1.954822620818114e-06, "loss": 2.9706, "num_input_tokens_seen": 5208024, "step": 7970 }, { "epoch": 0.8741642003726844, "grad_norm": 7.511916637420654, "learning_rate": 1.938170460613417e-06, "loss": 2.8037, "num_input_tokens_seen": 5211272, "step": 7975 }, { "epoch": 0.8747122657020717, "grad_norm": 6.600817680358887, "learning_rate": 1.921586669224071e-06, "loss": 3.3576, "num_input_tokens_seen": 5215392, "step": 7980 }, { "epoch": 0.875260331031459, "grad_norm": 5.347980976104736, "learning_rate": 1.9050712958141758e-06, "loss": 3.3071, "num_input_tokens_seen": 5217928, "step": 7985 }, { "epoch": 0.8758083963608462, "grad_norm": 6.689899921417236, "learning_rate": 1.8886243893450061e-06, "loss": 3.2119, "num_input_tokens_seen": 5220984, "step": 7990 }, { "epoch": 0.8763564616902335, "grad_norm": 6.363076210021973, "learning_rate": 1.8722459985748563e-06, "loss": 2.9524, "num_input_tokens_seen": 5224504, "step": 7995 }, { "epoch": 0.8769045270196207, "grad_norm": 7.521759986877441, "learning_rate": 1.8559361720588974e-06, "loss": 3.1379, "num_input_tokens_seen": 5227336, "step": 8000 
}, { "epoch": 0.877452592349008, "grad_norm": 8.488334655761719, "learning_rate": 1.8396949581490463e-06, "loss": 3.2758, "num_input_tokens_seen": 5229968, "step": 8005 }, { "epoch": 0.8780006576783953, "grad_norm": 7.164643287658691, "learning_rate": 1.8235224049938049e-06, "loss": 3.0142, "num_input_tokens_seen": 5233280, "step": 8010 }, { "epoch": 0.8785487230077825, "grad_norm": 8.150335311889648, "learning_rate": 1.8074185605381239e-06, "loss": 3.2278, "num_input_tokens_seen": 5236408, "step": 8015 }, { "epoch": 0.8790967883371698, "grad_norm": 9.74315357208252, "learning_rate": 1.791383472523256e-06, "loss": 3.3009, "num_input_tokens_seen": 5240040, "step": 8020 }, { "epoch": 0.8796448536665571, "grad_norm": 6.548309326171875, "learning_rate": 1.7754171884866362e-06, "loss": 3.0949, "num_input_tokens_seen": 5243480, "step": 8025 }, { "epoch": 0.8801929189959443, "grad_norm": 6.918182373046875, "learning_rate": 1.7595197557617044e-06, "loss": 3.1496, "num_input_tokens_seen": 5246664, "step": 8030 }, { "epoch": 0.8807409843253315, "grad_norm": 6.263129711151123, "learning_rate": 1.7436912214777945e-06, "loss": 2.9099, "num_input_tokens_seen": 5249392, "step": 8035 }, { "epoch": 0.8812890496547189, "grad_norm": 8.55476188659668, "learning_rate": 1.7279316325599898e-06, "loss": 2.8569, "num_input_tokens_seen": 5252584, "step": 8040 }, { "epoch": 0.8818371149841061, "grad_norm": 7.661272048950195, "learning_rate": 1.7122410357289703e-06, "loss": 2.9037, "num_input_tokens_seen": 5256184, "step": 8045 }, { "epoch": 0.8823851803134933, "grad_norm": 5.52952766418457, "learning_rate": 1.6966194775008798e-06, "loss": 3.0452, "num_input_tokens_seen": 5260048, "step": 8050 }, { "epoch": 0.8829332456428807, "grad_norm": 8.354534149169922, "learning_rate": 1.6810670041872062e-06, "loss": 3.005, "num_input_tokens_seen": 5264288, "step": 8055 }, { "epoch": 0.8834813109722679, "grad_norm": 7.364735126495361, "learning_rate": 1.6655836618946151e-06, "loss": 3.1181, 
"num_input_tokens_seen": 5268000, "step": 8060 }, { "epoch": 0.8840293763016551, "grad_norm": 7.844119071960449, "learning_rate": 1.650169496524831e-06, "loss": 2.9376, "num_input_tokens_seen": 5270984, "step": 8065 }, { "epoch": 0.8845774416310425, "grad_norm": 5.87100076675415, "learning_rate": 1.6348245537745028e-06, "loss": 3.1916, "num_input_tokens_seen": 5274448, "step": 8070 }, { "epoch": 0.8851255069604297, "grad_norm": 7.44371223449707, "learning_rate": 1.6195488791350548e-06, "loss": 2.9924, "num_input_tokens_seen": 5277432, "step": 8075 }, { "epoch": 0.8856735722898169, "grad_norm": 6.34487771987915, "learning_rate": 1.6043425178925652e-06, "loss": 3.0224, "num_input_tokens_seen": 5279944, "step": 8080 }, { "epoch": 0.8862216376192042, "grad_norm": 5.726871490478516, "learning_rate": 1.5892055151276258e-06, "loss": 2.7579, "num_input_tokens_seen": 5283720, "step": 8085 }, { "epoch": 0.8867697029485915, "grad_norm": 9.92805004119873, "learning_rate": 1.574137915715207e-06, "loss": 3.0515, "num_input_tokens_seen": 5286392, "step": 8090 }, { "epoch": 0.8873177682779787, "grad_norm": 9.383995056152344, "learning_rate": 1.559139764324527e-06, "loss": 3.3639, "num_input_tokens_seen": 5289440, "step": 8095 }, { "epoch": 0.887865833607366, "grad_norm": 6.371479034423828, "learning_rate": 1.5442111054189246e-06, "loss": 3.0694, "num_input_tokens_seen": 5293168, "step": 8100 }, { "epoch": 0.8884138989367533, "grad_norm": 7.600619316101074, "learning_rate": 1.5293519832557113e-06, "loss": 3.1645, "num_input_tokens_seen": 5296272, "step": 8105 }, { "epoch": 0.8889619642661405, "grad_norm": 10.624588966369629, "learning_rate": 1.5145624418860637e-06, "loss": 2.9331, "num_input_tokens_seen": 5299248, "step": 8110 }, { "epoch": 0.8895100295955278, "grad_norm": 6.536969184875488, "learning_rate": 1.4998425251548654e-06, "loss": 2.962, "num_input_tokens_seen": 5302376, "step": 8115 }, { "epoch": 0.890058094924915, "grad_norm": 5.556844234466553, "learning_rate": 
1.4851922767006088e-06, "loss": 2.9318, "num_input_tokens_seen": 5305704, "step": 8120 }, { "epoch": 0.8906061602543023, "grad_norm": 7.522222995758057, "learning_rate": 1.4706117399552383e-06, "loss": 3.0438, "num_input_tokens_seen": 5308112, "step": 8125 }, { "epoch": 0.8911542255836896, "grad_norm": 9.176352500915527, "learning_rate": 1.4561009581440272e-06, "loss": 3.0732, "num_input_tokens_seen": 5310768, "step": 8130 }, { "epoch": 0.8917022909130768, "grad_norm": 6.739439010620117, "learning_rate": 1.441659974285467e-06, "loss": 3.0154, "num_input_tokens_seen": 5313544, "step": 8135 }, { "epoch": 0.8922503562424641, "grad_norm": 6.810214042663574, "learning_rate": 1.4272888311911176e-06, "loss": 3.0619, "num_input_tokens_seen": 5316352, "step": 8140 }, { "epoch": 0.8927984215718514, "grad_norm": 5.931697368621826, "learning_rate": 1.4129875714654905e-06, "loss": 3.3196, "num_input_tokens_seen": 5320160, "step": 8145 }, { "epoch": 0.8933464869012386, "grad_norm": 7.526365280151367, "learning_rate": 1.398756237505927e-06, "loss": 2.9404, "num_input_tokens_seen": 5323560, "step": 8150 }, { "epoch": 0.8938945522306259, "grad_norm": 6.762884616851807, "learning_rate": 1.3845948715024648e-06, "loss": 3.2493, "num_input_tokens_seen": 5326504, "step": 8155 }, { "epoch": 0.8944426175600132, "grad_norm": 4.969104290008545, "learning_rate": 1.37050351543771e-06, "loss": 3.3379, "num_input_tokens_seen": 5329424, "step": 8160 }, { "epoch": 0.8949906828894004, "grad_norm": 6.4593586921691895, "learning_rate": 1.3564822110867264e-06, "loss": 3.2228, "num_input_tokens_seen": 5332600, "step": 8165 }, { "epoch": 0.8955387482187877, "grad_norm": 7.721135139465332, "learning_rate": 1.3425310000169028e-06, "loss": 3.2133, "num_input_tokens_seen": 5335792, "step": 8170 }, { "epoch": 0.896086813548175, "grad_norm": 8.572230339050293, "learning_rate": 1.3286499235878214e-06, "loss": 3.1945, "num_input_tokens_seen": 5339616, "step": 8175 }, { "epoch": 0.8966348788775622, "grad_norm": 
7.773857593536377, "learning_rate": 1.3148390229511532e-06, "loss": 2.9125, "num_input_tokens_seen": 5342320, "step": 8180 }, { "epoch": 0.8971829442069494, "grad_norm": 7.451086521148682, "learning_rate": 1.3010983390505244e-06, "loss": 3.1514, "num_input_tokens_seen": 5345336, "step": 8185 }, { "epoch": 0.8977310095363368, "grad_norm": 7.28810453414917, "learning_rate": 1.2874279126213973e-06, "loss": 3.1191, "num_input_tokens_seen": 5348880, "step": 8190 }, { "epoch": 0.898279074865724, "grad_norm": 4.2049078941345215, "learning_rate": 1.2738277841909479e-06, "loss": 2.9685, "num_input_tokens_seen": 5352936, "step": 8195 }, { "epoch": 0.8988271401951112, "grad_norm": 7.404577732086182, "learning_rate": 1.2602979940779524e-06, "loss": 3.107, "num_input_tokens_seen": 5355952, "step": 8200 }, { "epoch": 0.8993752055244986, "grad_norm": 11.230597496032715, "learning_rate": 1.2468385823926481e-06, "loss": 2.9561, "num_input_tokens_seen": 5359608, "step": 8205 }, { "epoch": 0.8999232708538858, "grad_norm": 8.928146362304688, "learning_rate": 1.233449589036656e-06, "loss": 3.172, "num_input_tokens_seen": 5363024, "step": 8210 }, { "epoch": 0.900471336183273, "grad_norm": 5.939243316650391, "learning_rate": 1.2201310537028138e-06, "loss": 3.0996, "num_input_tokens_seen": 5366928, "step": 8215 }, { "epoch": 0.9010194015126604, "grad_norm": 7.374519348144531, "learning_rate": 1.206883015875085e-06, "loss": 3.0966, "num_input_tokens_seen": 5369984, "step": 8220 }, { "epoch": 0.9015674668420476, "grad_norm": 8.059386253356934, "learning_rate": 1.1937055148284444e-06, "loss": 3.0717, "num_input_tokens_seen": 5372632, "step": 8225 }, { "epoch": 0.9021155321714348, "grad_norm": 8.80373764038086, "learning_rate": 1.1805985896287452e-06, "loss": 3.1543, "num_input_tokens_seen": 5375544, "step": 8230 }, { "epoch": 0.9026635975008221, "grad_norm": 6.8497443199157715, "learning_rate": 1.1675622791326169e-06, "loss": 2.9531, "num_input_tokens_seen": 5378856, "step": 8235 }, { 
"epoch": 0.9032116628302094, "grad_norm": 7.791383266448975, "learning_rate": 1.1545966219873444e-06, "loss": 2.9187, "num_input_tokens_seen": 5382752, "step": 8240 }, { "epoch": 0.9037597281595966, "grad_norm": 6.825507640838623, "learning_rate": 1.1417016566307586e-06, "loss": 2.8782, "num_input_tokens_seen": 5386080, "step": 8245 }, { "epoch": 0.9043077934889839, "grad_norm": 6.135127544403076, "learning_rate": 1.1288774212911052e-06, "loss": 2.8879, "num_input_tokens_seen": 5389680, "step": 8250 }, { "epoch": 0.9048558588183712, "grad_norm": 8.292460441589355, "learning_rate": 1.1161239539869668e-06, "loss": 2.9108, "num_input_tokens_seen": 5393112, "step": 8255 }, { "epoch": 0.9054039241477584, "grad_norm": 6.192307949066162, "learning_rate": 1.1034412925271075e-06, "loss": 2.72, "num_input_tokens_seen": 5397056, "step": 8260 }, { "epoch": 0.9059519894771457, "grad_norm": 6.773381233215332, "learning_rate": 1.0908294745103882e-06, "loss": 2.7747, "num_input_tokens_seen": 5400928, "step": 8265 }, { "epoch": 0.906500054806533, "grad_norm": 9.411810874938965, "learning_rate": 1.078288537325653e-06, "loss": 3.1762, "num_input_tokens_seen": 5403744, "step": 8270 }, { "epoch": 0.9070481201359202, "grad_norm": 5.909646511077881, "learning_rate": 1.0658185181516094e-06, "loss": 2.9356, "num_input_tokens_seen": 5406888, "step": 8275 }, { "epoch": 0.9075961854653075, "grad_norm": 8.18594741821289, "learning_rate": 1.0534194539567194e-06, "loss": 3.0487, "num_input_tokens_seen": 5409856, "step": 8280 }, { "epoch": 0.9081442507946947, "grad_norm": 10.775045394897461, "learning_rate": 1.0410913814990985e-06, "loss": 2.8025, "num_input_tokens_seen": 5412416, "step": 8285 }, { "epoch": 0.908692316124082, "grad_norm": 8.237727165222168, "learning_rate": 1.0288343373263954e-06, "loss": 3.0227, "num_input_tokens_seen": 5415176, "step": 8290 }, { "epoch": 0.9092403814534693, "grad_norm": 7.0511884689331055, "learning_rate": 1.016648357775693e-06, "loss": 2.8189, 
"num_input_tokens_seen": 5418552, "step": 8295 }, { "epoch": 0.9097884467828565, "grad_norm": 6.959300518035889, "learning_rate": 1.004533478973399e-06, "loss": 3.3864, "num_input_tokens_seen": 5421712, "step": 8300 }, { "epoch": 0.9103365121122438, "grad_norm": 7.333334922790527, "learning_rate": 9.924897368351282e-07, "loss": 3.1543, "num_input_tokens_seen": 5425312, "step": 8305 }, { "epoch": 0.9108845774416311, "grad_norm": 7.005816459655762, "learning_rate": 9.805171670656117e-07, "loss": 3.1113, "num_input_tokens_seen": 5428680, "step": 8310 }, { "epoch": 0.9114326427710183, "grad_norm": 5.512388229370117, "learning_rate": 9.686158051585874e-07, "loss": 3.0001, "num_input_tokens_seen": 5431848, "step": 8315 }, { "epoch": 0.9119807081004055, "grad_norm": 6.378774642944336, "learning_rate": 9.56785686396683e-07, "loss": 3.1063, "num_input_tokens_seen": 5434648, "step": 8320 }, { "epoch": 0.9125287734297929, "grad_norm": 6.719765663146973, "learning_rate": 9.450268458513156e-07, "loss": 2.7967, "num_input_tokens_seen": 5438728, "step": 8325 }, { "epoch": 0.9130768387591801, "grad_norm": 8.518233299255371, "learning_rate": 9.333393183826089e-07, "loss": 2.7597, "num_input_tokens_seen": 5442232, "step": 8330 }, { "epoch": 0.9136249040885673, "grad_norm": 7.718142986297607, "learning_rate": 9.217231386392577e-07, "loss": 3.5149, "num_input_tokens_seen": 5445320, "step": 8335 }, { "epoch": 0.9141729694179547, "grad_norm": 7.286013603210449, "learning_rate": 9.101783410584458e-07, "loss": 3.2542, "num_input_tokens_seen": 5448280, "step": 8340 }, { "epoch": 0.9147210347473419, "grad_norm": 6.524003028869629, "learning_rate": 8.987049598657398e-07, "loss": 3.0042, "num_input_tokens_seen": 5452360, "step": 8345 }, { "epoch": 0.9152691000767291, "grad_norm": 6.262417316436768, "learning_rate": 8.87303029074979e-07, "loss": 2.6819, "num_input_tokens_seen": 5455872, "step": 8350 }, { "epoch": 0.9158171654061165, "grad_norm": 6.51323127746582, "learning_rate": 
8.75972582488191e-07, "loss": 3.1662, "num_input_tokens_seen": 5458616, "step": 8355 }, { "epoch": 0.9163652307355037, "grad_norm": 7.502628803253174, "learning_rate": 8.647136536954787e-07, "loss": 2.4922, "num_input_tokens_seen": 5461408, "step": 8360 }, { "epoch": 0.9169132960648909, "grad_norm": 6.768873691558838, "learning_rate": 8.535262760749202e-07, "loss": 2.7696, "num_input_tokens_seen": 5466664, "step": 8365 }, { "epoch": 0.9174613613942783, "grad_norm": 9.054154396057129, "learning_rate": 8.4241048279248e-07, "loss": 3.3125, "num_input_tokens_seen": 5469400, "step": 8370 }, { "epoch": 0.9180094267236655, "grad_norm": 7.729340076446533, "learning_rate": 8.313663068019007e-07, "loss": 3.383, "num_input_tokens_seen": 5472936, "step": 8375 }, { "epoch": 0.9185574920530527, "grad_norm": 8.844609260559082, "learning_rate": 8.203937808446083e-07, "loss": 2.7089, "num_input_tokens_seen": 5476176, "step": 8380 }, { "epoch": 0.91910555738244, "grad_norm": 7.043740272521973, "learning_rate": 8.094929374496185e-07, "loss": 3.2024, "num_input_tokens_seen": 5479576, "step": 8385 }, { "epoch": 0.9196536227118273, "grad_norm": 8.144498825073242, "learning_rate": 7.986638089334392e-07, "loss": 3.4681, "num_input_tokens_seen": 5483592, "step": 8390 }, { "epoch": 0.9202016880412145, "grad_norm": 7.295477867126465, "learning_rate": 7.879064273999731e-07, "loss": 3.3592, "num_input_tokens_seen": 5486736, "step": 8395 }, { "epoch": 0.9207497533706018, "grad_norm": 6.9401960372924805, "learning_rate": 7.772208247404128e-07, "loss": 2.8916, "num_input_tokens_seen": 5489720, "step": 8400 }, { "epoch": 0.9212978186999891, "grad_norm": 5.044391632080078, "learning_rate": 7.666070326331709e-07, "loss": 2.9984, "num_input_tokens_seen": 5494312, "step": 8405 }, { "epoch": 0.9218458840293763, "grad_norm": 7.426214218139648, "learning_rate": 7.560650825437637e-07, "loss": 2.6398, "num_input_tokens_seen": 5498536, "step": 8410 }, { "epoch": 0.9223939493587635, "grad_norm": 
6.066382884979248, "learning_rate": 7.455950057247252e-07, "loss": 3.0293, "num_input_tokens_seen": 5501256, "step": 8415 }, { "epoch": 0.9229420146881508, "grad_norm": 6.4779181480407715, "learning_rate": 7.351968332155152e-07, "loss": 3.0215, "num_input_tokens_seen": 5504440, "step": 8420 }, { "epoch": 0.9234900800175381, "grad_norm": 5.473248481750488, "learning_rate": 7.248705958424307e-07, "loss": 2.9114, "num_input_tokens_seen": 5507752, "step": 8425 }, { "epoch": 0.9240381453469253, "grad_norm": 7.87445592880249, "learning_rate": 7.146163242185033e-07, "loss": 3.0642, "num_input_tokens_seen": 5511168, "step": 8430 }, { "epoch": 0.9245862106763126, "grad_norm": 7.2715959548950195, "learning_rate": 7.044340487434242e-07, "loss": 3.0391, "num_input_tokens_seen": 5513984, "step": 8435 }, { "epoch": 0.9251342760056999, "grad_norm": 7.839521408081055, "learning_rate": 6.943237996034386e-07, "loss": 3.2316, "num_input_tokens_seen": 5516632, "step": 8440 }, { "epoch": 0.9256823413350871, "grad_norm": 7.8146820068359375, "learning_rate": 6.842856067712677e-07, "loss": 3.0688, "num_input_tokens_seen": 5520488, "step": 8445 }, { "epoch": 0.9262304066644744, "grad_norm": 7.480862140655518, "learning_rate": 6.743195000060154e-07, "loss": 2.8072, "num_input_tokens_seen": 5524136, "step": 8450 }, { "epoch": 0.9267784719938617, "grad_norm": 6.187289237976074, "learning_rate": 6.644255088530782e-07, "loss": 3.1597, "num_input_tokens_seen": 5528256, "step": 8455 }, { "epoch": 0.9273265373232489, "grad_norm": 7.108201026916504, "learning_rate": 6.546036626440599e-07, "loss": 2.8195, "num_input_tokens_seen": 5531368, "step": 8460 }, { "epoch": 0.9278746026526362, "grad_norm": 9.429540634155273, "learning_rate": 6.448539904966827e-07, "loss": 3.1321, "num_input_tokens_seen": 5534144, "step": 8465 }, { "epoch": 0.9284226679820234, "grad_norm": 6.745710849761963, "learning_rate": 6.351765213147037e-07, "loss": 2.8217, "num_input_tokens_seen": 5536848, "step": 8470 }, { "epoch": 
0.9289707333114107, "grad_norm": 6.650664806365967, "learning_rate": 6.255712837878347e-07, "loss": 3.1658, "num_input_tokens_seen": 5540136, "step": 8475 }, { "epoch": 0.929518798640798, "grad_norm": 7.63946008682251, "learning_rate": 6.160383063916419e-07, "loss": 3.1177, "num_input_tokens_seen": 5543192, "step": 8480 }, { "epoch": 0.9300668639701852, "grad_norm": 7.223082542419434, "learning_rate": 6.065776173874687e-07, "loss": 3.6049, "num_input_tokens_seen": 5547392, "step": 8485 }, { "epoch": 0.9306149292995725, "grad_norm": 7.673356533050537, "learning_rate": 5.971892448223576e-07, "loss": 2.8851, "num_input_tokens_seen": 5550056, "step": 8490 }, { "epoch": 0.9311629946289598, "grad_norm": 7.799294471740723, "learning_rate": 5.878732165289668e-07, "loss": 3.2135, "num_input_tokens_seen": 5552728, "step": 8495 }, { "epoch": 0.931711059958347, "grad_norm": 5.8991312980651855, "learning_rate": 5.786295601254765e-07, "loss": 3.5495, "num_input_tokens_seen": 5556008, "step": 8500 }, { "epoch": 0.9322591252877342, "grad_norm": 8.919817924499512, "learning_rate": 5.694583030155131e-07, "loss": 3.2696, "num_input_tokens_seen": 5558680, "step": 8505 }, { "epoch": 0.9328071906171216, "grad_norm": 6.0595293045043945, "learning_rate": 5.60359472388075e-07, "loss": 3.1983, "num_input_tokens_seen": 5561976, "step": 8510 }, { "epoch": 0.9333552559465088, "grad_norm": 7.8532185554504395, "learning_rate": 5.513330952174462e-07, "loss": 2.8831, "num_input_tokens_seen": 5565032, "step": 8515 }, { "epoch": 0.933903321275896, "grad_norm": 6.592312335968018, "learning_rate": 5.423791982631071e-07, "loss": 3.2783, "num_input_tokens_seen": 5567976, "step": 8520 }, { "epoch": 0.9344513866052834, "grad_norm": 5.455694198608398, "learning_rate": 5.334978080696773e-07, "loss": 2.3299, "num_input_tokens_seen": 5572544, "step": 8525 }, { "epoch": 0.9349994519346706, "grad_norm": 6.956151008605957, "learning_rate": 5.246889509668118e-07, "loss": 3.0221, "num_input_tokens_seen": 5575256, 
"step": 8530 }, { "epoch": 0.9355475172640578, "grad_norm": 7.278057098388672, "learning_rate": 5.159526530691378e-07, "loss": 3.2783, "num_input_tokens_seen": 5577928, "step": 8535 }, { "epoch": 0.9360955825934452, "grad_norm": 5.909106731414795, "learning_rate": 5.072889402761821e-07, "loss": 3.2452, "num_input_tokens_seen": 5580632, "step": 8540 }, { "epoch": 0.9366436479228324, "grad_norm": 6.952794075012207, "learning_rate": 4.986978382722773e-07, "loss": 3.0232, "num_input_tokens_seen": 5584824, "step": 8545 }, { "epoch": 0.9371917132522196, "grad_norm": 8.14654541015625, "learning_rate": 4.901793725264975e-07, "loss": 3.0803, "num_input_tokens_seen": 5589208, "step": 8550 }, { "epoch": 0.937739778581607, "grad_norm": 6.610713958740234, "learning_rate": 4.817335682925805e-07, "loss": 2.8802, "num_input_tokens_seen": 5592056, "step": 8555 }, { "epoch": 0.9382878439109942, "grad_norm": 10.567109107971191, "learning_rate": 4.73360450608859e-07, "loss": 3.3952, "num_input_tokens_seen": 5595120, "step": 8560 }, { "epoch": 0.9388359092403814, "grad_norm": 7.1954545974731445, "learning_rate": 4.6506004429817117e-07, "loss": 3.2835, "num_input_tokens_seen": 5598408, "step": 8565 }, { "epoch": 0.9393839745697687, "grad_norm": 7.200895309448242, "learning_rate": 4.568323739677971e-07, "loss": 3.2721, "num_input_tokens_seen": 5602328, "step": 8570 }, { "epoch": 0.939932039899156, "grad_norm": 7.637218952178955, "learning_rate": 4.486774640093894e-07, "loss": 3.0411, "num_input_tokens_seen": 5606096, "step": 8575 }, { "epoch": 0.9404801052285432, "grad_norm": 8.214374542236328, "learning_rate": 4.405953385988898e-07, "loss": 3.1399, "num_input_tokens_seen": 5608544, "step": 8580 }, { "epoch": 0.9410281705579305, "grad_norm": 7.163279056549072, "learning_rate": 4.325860216964711e-07, "loss": 2.7451, "num_input_tokens_seen": 5611872, "step": 8585 }, { "epoch": 0.9415762358873178, "grad_norm": 7.930347919464111, "learning_rate": 4.2464953704645647e-07, "loss": 2.9838, 
"num_input_tokens_seen": 5614440, "step": 8590 }, { "epoch": 0.942124301216705, "grad_norm": 4.849373817443848, "learning_rate": 4.167859081772446e-07, "loss": 2.9805, "num_input_tokens_seen": 5617856, "step": 8595 }, { "epoch": 0.9426723665460923, "grad_norm": 8.461563110351562, "learning_rate": 4.0899515840125966e-07, "loss": 3.2951, "num_input_tokens_seen": 5620824, "step": 8600 }, { "epoch": 0.9432204318754795, "grad_norm": 8.734384536743164, "learning_rate": 4.0127731081485987e-07, "loss": 3.3802, "num_input_tokens_seen": 5624696, "step": 8605 }, { "epoch": 0.9437684972048668, "grad_norm": 9.480766296386719, "learning_rate": 3.936323882982762e-07, "loss": 2.8742, "num_input_tokens_seen": 5628648, "step": 8610 }, { "epoch": 0.9443165625342541, "grad_norm": 8.393555641174316, "learning_rate": 3.8606041351555986e-07, "loss": 3.3445, "num_input_tokens_seen": 5631048, "step": 8615 }, { "epoch": 0.9448646278636413, "grad_norm": 5.754420757293701, "learning_rate": 3.785614089144879e-07, "loss": 3.2994, "num_input_tokens_seen": 5634840, "step": 8620 }, { "epoch": 0.9454126931930286, "grad_norm": 7.406842231750488, "learning_rate": 3.7113539672651853e-07, "loss": 3.2169, "num_input_tokens_seen": 5639056, "step": 8625 }, { "epoch": 0.9459607585224159, "grad_norm": 8.346644401550293, "learning_rate": 3.637823989667166e-07, "loss": 3.5016, "num_input_tokens_seen": 5642368, "step": 8630 }, { "epoch": 0.9465088238518031, "grad_norm": 6.256731033325195, "learning_rate": 3.565024374336895e-07, "loss": 2.9251, "num_input_tokens_seen": 5645288, "step": 8635 }, { "epoch": 0.9470568891811904, "grad_norm": 8.30922794342041, "learning_rate": 3.4929553370951496e-07, "loss": 2.897, "num_input_tokens_seen": 5648256, "step": 8640 }, { "epoch": 0.9476049545105777, "grad_norm": 5.839921951293945, "learning_rate": 3.421617091596996e-07, "loss": 3.0709, "num_input_tokens_seen": 5651456, "step": 8645 }, { "epoch": 0.9481530198399649, "grad_norm": 8.873268127441406, "learning_rate": 
3.3510098493308715e-07, "loss": 2.8349, "num_input_tokens_seen": 5654936, "step": 8650 }, { "epoch": 0.9487010851693521, "grad_norm": 7.447127342224121, "learning_rate": 3.2811338196181706e-07, "loss": 3.1457, "num_input_tokens_seen": 5658344, "step": 8655 }, { "epoch": 0.9492491504987395, "grad_norm": 7.901216506958008, "learning_rate": 3.211989209612437e-07, "loss": 3.0331, "num_input_tokens_seen": 5661088, "step": 8660 }, { "epoch": 0.9497972158281267, "grad_norm": 6.363575458526611, "learning_rate": 3.1435762242990053e-07, "loss": 3.0904, "num_input_tokens_seen": 5664544, "step": 8665 }, { "epoch": 0.9503452811575139, "grad_norm": 8.245457649230957, "learning_rate": 3.0758950664940833e-07, "loss": 2.9634, "num_input_tokens_seen": 5667704, "step": 8670 }, { "epoch": 0.9508933464869013, "grad_norm": 6.969222068786621, "learning_rate": 3.008945936844504e-07, "loss": 2.9006, "num_input_tokens_seen": 5671088, "step": 8675 }, { "epoch": 0.9514414118162885, "grad_norm": 9.956710815429688, "learning_rate": 2.942729033826752e-07, "loss": 3.3092, "num_input_tokens_seen": 5673784, "step": 8680 }, { "epoch": 0.9519894771456757, "grad_norm": 6.730470657348633, "learning_rate": 2.877244553746633e-07, "loss": 2.8794, "num_input_tokens_seen": 5677024, "step": 8685 }, { "epoch": 0.9525375424750631, "grad_norm": 7.628656387329102, "learning_rate": 2.8124926907386885e-07, "loss": 2.9683, "num_input_tokens_seen": 5680552, "step": 8690 }, { "epoch": 0.9530856078044503, "grad_norm": 8.587575912475586, "learning_rate": 2.748473636765475e-07, "loss": 3.0311, "num_input_tokens_seen": 5684128, "step": 8695 }, { "epoch": 0.9536336731338375, "grad_norm": 8.781567573547363, "learning_rate": 2.6851875816170655e-07, "loss": 2.9722, "num_input_tokens_seen": 5687784, "step": 8700 }, { "epoch": 0.9541817384632248, "grad_norm": 6.88287353515625, "learning_rate": 2.622634712910521e-07, "loss": 3.3128, "num_input_tokens_seen": 5690464, "step": 8705 }, { "epoch": 0.9547298037926121, "grad_norm": 
7.1090874671936035, "learning_rate": 2.560815216089335e-07, "loss": 3.0189, "num_input_tokens_seen": 5693312, "step": 8710 }, { "epoch": 0.9552778691219993, "grad_norm": 7.3000168800354, "learning_rate": 2.499729274422796e-07, "loss": 3.5534, "num_input_tokens_seen": 5697232, "step": 8715 }, { "epoch": 0.9558259344513866, "grad_norm": 8.97269344329834, "learning_rate": 2.439377069005544e-07, "loss": 3.5597, "num_input_tokens_seen": 5699808, "step": 8720 }, { "epoch": 0.9563739997807739, "grad_norm": 8.973227500915527, "learning_rate": 2.3797587787569852e-07, "loss": 3.0848, "num_input_tokens_seen": 5703784, "step": 8725 }, { "epoch": 0.9569220651101611, "grad_norm": 7.142612934112549, "learning_rate": 2.3208745804207398e-07, "loss": 2.8029, "num_input_tokens_seen": 5706344, "step": 8730 }, { "epoch": 0.9574701304395484, "grad_norm": 8.567402839660645, "learning_rate": 2.262724648564224e-07, "loss": 3.3482, "num_input_tokens_seen": 5710600, "step": 8735 }, { "epoch": 0.9580181957689357, "grad_norm": 11.277481079101562, "learning_rate": 2.2053091555779837e-07, "loss": 3.0415, "num_input_tokens_seen": 5714152, "step": 8740 }, { "epoch": 0.9585662610983229, "grad_norm": 7.343226432800293, "learning_rate": 2.1486282716752791e-07, "loss": 3.0087, "num_input_tokens_seen": 5716376, "step": 8745 }, { "epoch": 0.9591143264277102, "grad_norm": 6.354895114898682, "learning_rate": 2.0926821648915574e-07, "loss": 3.0672, "num_input_tokens_seen": 5719152, "step": 8750 }, { "epoch": 0.9596623917570974, "grad_norm": 7.212831497192383, "learning_rate": 2.0374710010839793e-07, "loss": 3.3, "num_input_tokens_seen": 5723064, "step": 8755 }, { "epoch": 0.9602104570864847, "grad_norm": 6.967692852020264, "learning_rate": 1.982994943930838e-07, "loss": 3.1401, "num_input_tokens_seen": 5725768, "step": 8760 }, { "epoch": 0.960758522415872, "grad_norm": 8.500665664672852, "learning_rate": 1.9292541549311983e-07, "loss": 3.2358, "num_input_tokens_seen": 5728104, "step": 8765 }, { "epoch": 
0.9613065877452592, "grad_norm": 7.204361915588379, "learning_rate": 1.876248793404367e-07, "loss": 2.9241, "num_input_tokens_seen": 5730688, "step": 8770 }, { "epoch": 0.9618546530746465, "grad_norm": 7.031684398651123, "learning_rate": 1.8239790164893412e-07, "loss": 3.2293, "num_input_tokens_seen": 5733936, "step": 8775 }, { "epoch": 0.9624027184040338, "grad_norm": 8.101325035095215, "learning_rate": 1.7724449791444997e-07, "loss": 2.7716, "num_input_tokens_seen": 5737880, "step": 8780 }, { "epoch": 0.962950783733421, "grad_norm": 6.74721622467041, "learning_rate": 1.721646834146967e-07, "loss": 2.715, "num_input_tokens_seen": 5741936, "step": 8785 }, { "epoch": 0.9634988490628082, "grad_norm": 9.26173210144043, "learning_rate": 1.671584732092335e-07, "loss": 2.8224, "num_input_tokens_seen": 5746160, "step": 8790 }, { "epoch": 0.9640469143921956, "grad_norm": 5.797330856323242, "learning_rate": 1.6222588213940792e-07, "loss": 3.3261, "num_input_tokens_seen": 5750696, "step": 8795 }, { "epoch": 0.9645949797215828, "grad_norm": 9.205500602722168, "learning_rate": 1.5736692482831995e-07, "loss": 2.9268, "num_input_tokens_seen": 5753384, "step": 8800 }, { "epoch": 0.96514304505097, "grad_norm": 6.270941257476807, "learning_rate": 1.5258161568077188e-07, "loss": 2.8041, "num_input_tokens_seen": 5756640, "step": 8805 }, { "epoch": 0.9656911103803574, "grad_norm": 7.947140693664551, "learning_rate": 1.4786996888323524e-07, "loss": 3.1006, "num_input_tokens_seen": 5759848, "step": 8810 }, { "epoch": 0.9662391757097446, "grad_norm": 8.765256881713867, "learning_rate": 1.4323199840380053e-07, "loss": 3.2065, "num_input_tokens_seen": 5763416, "step": 8815 }, { "epoch": 0.9667872410391318, "grad_norm": 5.335040092468262, "learning_rate": 1.3866771799213307e-07, "loss": 2.9768, "num_input_tokens_seen": 5766160, "step": 8820 }, { "epoch": 0.9673353063685192, "grad_norm": 5.483620643615723, "learning_rate": 1.3417714117944513e-07, "loss": 2.8682, "num_input_tokens_seen": 
5771024, "step": 8825 }, { "epoch": 0.9678833716979064, "grad_norm": 8.511704444885254, "learning_rate": 1.2976028127844597e-07, "loss": 3.1851, "num_input_tokens_seen": 5774632, "step": 8830 }, { "epoch": 0.9684314370272936, "grad_norm": 6.916325569152832, "learning_rate": 1.25417151383303e-07, "loss": 3.2018, "num_input_tokens_seen": 5778048, "step": 8835 }, { "epoch": 0.968979502356681, "grad_norm": 6.791527271270752, "learning_rate": 1.2114776436960294e-07, "loss": 3.1153, "num_input_tokens_seen": 5781288, "step": 8840 }, { "epoch": 0.9695275676860682, "grad_norm": 7.304278373718262, "learning_rate": 1.1695213289432406e-07, "loss": 2.7359, "num_input_tokens_seen": 5783776, "step": 8845 }, { "epoch": 0.9700756330154554, "grad_norm": 7.467769145965576, "learning_rate": 1.128302693957778e-07, "loss": 3.1941, "num_input_tokens_seen": 5786120, "step": 8850 }, { "epoch": 0.9706236983448427, "grad_norm": 8.969725608825684, "learning_rate": 1.0878218609359502e-07, "loss": 3.0654, "num_input_tokens_seen": 5789672, "step": 8855 }, { "epoch": 0.97117176367423, "grad_norm": 8.292722702026367, "learning_rate": 1.0480789498866772e-07, "loss": 2.9517, "num_input_tokens_seen": 5792480, "step": 8860 }, { "epoch": 0.9717198290036172, "grad_norm": 5.788974285125732, "learning_rate": 1.0090740786313502e-07, "loss": 2.9964, "num_input_tokens_seen": 5796848, "step": 8865 }, { "epoch": 0.9722678943330045, "grad_norm": 8.003725051879883, "learning_rate": 9.708073628033055e-08, "loss": 2.8592, "num_input_tokens_seen": 5801376, "step": 8870 }, { "epoch": 0.9728159596623918, "grad_norm": 6.711467742919922, "learning_rate": 9.332789158476018e-08, "loss": 2.9653, "num_input_tokens_seen": 5804480, "step": 8875 }, { "epoch": 0.973364024991779, "grad_norm": 5.3671417236328125, "learning_rate": 8.964888490205769e-08, "loss": 3.1577, "num_input_tokens_seen": 5807632, "step": 8880 }, { "epoch": 0.9739120903211663, "grad_norm": 6.408278942108154, "learning_rate": 8.604372713896247e-08, "loss": 
2.7764, "num_input_tokens_seen": 5810096, "step": 8885 }, { "epoch": 0.9744601556505536, "grad_norm": 8.041277885437012, "learning_rate": 8.251242898328071e-08, "loss": 3.2175, "num_input_tokens_seen": 5813808, "step": 8890 }, { "epoch": 0.9750082209799408, "grad_norm": 6.138535499572754, "learning_rate": 7.905500090385487e-08, "loss": 2.9364, "num_input_tokens_seen": 5816552, "step": 8895 }, { "epoch": 0.9755562863093281, "grad_norm": 8.328486442565918, "learning_rate": 7.567145315053314e-08, "loss": 3.163, "num_input_tokens_seen": 5820568, "step": 8900 }, { "epoch": 0.9761043516387153, "grad_norm": 9.473198890686035, "learning_rate": 7.236179575414448e-08, "loss": 3.2253, "num_input_tokens_seen": 5823808, "step": 8905 }, { "epoch": 0.9766524169681026, "grad_norm": 5.804590225219727, "learning_rate": 6.912603852645138e-08, "loss": 3.0782, "num_input_tokens_seen": 5826744, "step": 8910 }, { "epoch": 0.9772004822974899, "grad_norm": 5.613870620727539, "learning_rate": 6.596419106014163e-08, "loss": 2.9843, "num_input_tokens_seen": 5831144, "step": 8915 }, { "epoch": 0.9777485476268771, "grad_norm": 8.519886016845703, "learning_rate": 6.28762627287921e-08, "loss": 3.0685, "num_input_tokens_seen": 5834792, "step": 8920 }, { "epoch": 0.9782966129562644, "grad_norm": 7.168541431427002, "learning_rate": 5.986226268683282e-08, "loss": 3.2515, "num_input_tokens_seen": 5838368, "step": 8925 }, { "epoch": 0.9788446782856517, "grad_norm": 10.949654579162598, "learning_rate": 5.692219986953573e-08, "loss": 2.9654, "num_input_tokens_seen": 5842120, "step": 8930 }, { "epoch": 0.9793927436150389, "grad_norm": 6.906786918640137, "learning_rate": 5.4056082992973155e-08, "loss": 3.0675, "num_input_tokens_seen": 5845248, "step": 8935 }, { "epoch": 0.9799408089444261, "grad_norm": 5.457529067993164, "learning_rate": 5.1263920553998315e-08, "loss": 2.9989, "num_input_tokens_seen": 5848536, "step": 8940 }, { "epoch": 0.9804888742738135, "grad_norm": 9.393891334533691, "learning_rate": 
4.854572083022313e-08, "loss": 3.1355, "num_input_tokens_seen": 5851824, "step": 8945 }, { "epoch": 0.9810369396032007, "grad_norm": 8.42390251159668, "learning_rate": 4.5901491879984934e-08, "loss": 3.0677, "num_input_tokens_seen": 5855152, "step": 8950 }, { "epoch": 0.9815850049325879, "grad_norm": 7.749826908111572, "learning_rate": 4.3331241542340916e-08, "loss": 3.1391, "num_input_tokens_seen": 5858576, "step": 8955 }, { "epoch": 0.9821330702619753, "grad_norm": 8.214120864868164, "learning_rate": 4.083497743701259e-08, "loss": 2.8317, "num_input_tokens_seen": 5861528, "step": 8960 }, { "epoch": 0.9826811355913625, "grad_norm": 6.369811058044434, "learning_rate": 3.8412706964402465e-08, "loss": 2.9487, "num_input_tokens_seen": 5865128, "step": 8965 }, { "epoch": 0.9832292009207497, "grad_norm": 8.29269027709961, "learning_rate": 3.606443730554132e-08, "loss": 3.0666, "num_input_tokens_seen": 5867928, "step": 8970 }, { "epoch": 0.9837772662501371, "grad_norm": 7.444830417633057, "learning_rate": 3.379017542207707e-08, "loss": 3.0067, "num_input_tokens_seen": 5870968, "step": 8975 }, { "epoch": 0.9843253315795243, "grad_norm": 7.021453380584717, "learning_rate": 3.1589928056263704e-08, "loss": 3.1972, "num_input_tokens_seen": 5874496, "step": 8980 }, { "epoch": 0.9848733969089115, "grad_norm": 7.41176176071167, "learning_rate": 2.9463701730922388e-08, "loss": 2.826, "num_input_tokens_seen": 5878088, "step": 8985 }, { "epoch": 0.9854214622382989, "grad_norm": 9.515088081359863, "learning_rate": 2.7411502749441488e-08, "loss": 3.1693, "num_input_tokens_seen": 5881752, "step": 8990 }, { "epoch": 0.9859695275676861, "grad_norm": 8.658610343933105, "learning_rate": 2.5433337195743258e-08, "loss": 2.8453, "num_input_tokens_seen": 5884816, "step": 8995 }, { "epoch": 0.9865175928970733, "grad_norm": 7.5331830978393555, "learning_rate": 2.3529210934272738e-08, "loss": 2.8423, "num_input_tokens_seen": 5887864, "step": 9000 }, { "epoch": 0.9870656582264606, "grad_norm": 
8.601006507873535, "learning_rate": 2.2059222016279636e-08, "loss": 3.5074, "num_input_tokens_seen": 5892776, "step": 9005 }, { "epoch": 0.9876137235558479, "grad_norm": 9.700572967529297, "learning_rate": 2.0288380558580732e-08, "loss": 2.9729, "num_input_tokens_seen": 5895976, "step": 9010 }, { "epoch": 0.9881617888852351, "grad_norm": 7.793155193328857, "learning_rate": 1.859159364578089e-08, "loss": 3.1164, "num_input_tokens_seen": 5897952, "step": 9015 }, { "epoch": 0.9887098542146224, "grad_norm": 6.612551212310791, "learning_rate": 1.696886630815908e-08, "loss": 2.9729, "num_input_tokens_seen": 5901264, "step": 9020 }, { "epoch": 0.9892579195440097, "grad_norm": 7.382999897003174, "learning_rate": 1.5420203356431018e-08, "loss": 3.2611, "num_input_tokens_seen": 5904096, "step": 9025 }, { "epoch": 0.9898059848733969, "grad_norm": 6.810866832733154, "learning_rate": 1.3945609381743607e-08, "loss": 2.8127, "num_input_tokens_seen": 5907072, "step": 9030 }, { "epoch": 0.9903540502027842, "grad_norm": 7.927409648895264, "learning_rate": 1.2545088755658296e-08, "loss": 3.2365, "num_input_tokens_seen": 5910056, "step": 9035 }, { "epoch": 0.9909021155321714, "grad_norm": 7.214841842651367, "learning_rate": 1.121864563014552e-08, "loss": 3.0081, "num_input_tokens_seen": 5913112, "step": 9040 }, { "epoch": 0.9914501808615587, "grad_norm": 8.652878761291504, "learning_rate": 9.966283937559716e-09, "loss": 3.0332, "num_input_tokens_seen": 5916360, "step": 9045 }, { "epoch": 0.991998246190946, "grad_norm": 8.960352897644043, "learning_rate": 8.78800739063379e-09, "loss": 2.6109, "num_input_tokens_seen": 5918704, "step": 9050 }, { "epoch": 0.9925463115203332, "grad_norm": 7.337709903717041, "learning_rate": 7.683819482479094e-09, "loss": 2.7987, "num_input_tokens_seen": 5921928, "step": 9055 }, { "epoch": 0.9930943768497205, "grad_norm": 7.972464561462402, "learning_rate": 6.653723486549357e-09, "loss": 3.1164, "num_input_tokens_seen": 5924176, "step": 9060 }, { "epoch": 
0.9936424421791078, "grad_norm": 5.17326021194458, "learning_rate": 5.69772245666289e-09, "loss": 2.8857, "num_input_tokens_seen": 5927832, "step": 9065 }, { "epoch": 0.994190507508495, "grad_norm": 9.227761268615723, "learning_rate": 4.815819226960949e-09, "loss": 3.0089, "num_input_tokens_seen": 5931264, "step": 9070 }, { "epoch": 0.9947385728378823, "grad_norm": 8.926158905029297, "learning_rate": 4.008016411927162e-09, "loss": 3.3191, "num_input_tokens_seen": 5933904, "step": 9075 }, { "epoch": 0.9952866381672696, "grad_norm": 10.433160781860352, "learning_rate": 3.274316406362554e-09, "loss": 3.447, "num_input_tokens_seen": 5936464, "step": 9080 }, { "epoch": 0.9958347034966568, "grad_norm": 7.052779197692871, "learning_rate": 2.6147213853855436e-09, "loss": 3.0385, "num_input_tokens_seen": 5939544, "step": 9085 }, { "epoch": 0.996382768826044, "grad_norm": 5.819647789001465, "learning_rate": 2.0292333044236166e-09, "loss": 3.3745, "num_input_tokens_seen": 5943312, "step": 9090 }, { "epoch": 0.9969308341554314, "grad_norm": 7.4259748458862305, "learning_rate": 1.5178538992050018e-09, "loss": 2.8346, "num_input_tokens_seen": 5946248, "step": 9095 }, { "epoch": 0.9974788994848186, "grad_norm": 9.022146224975586, "learning_rate": 1.0805846857642188e-09, "loss": 2.969, "num_input_tokens_seen": 5949520, "step": 9100 }, { "epoch": 0.9980269648142058, "grad_norm": 7.631455898284912, "learning_rate": 7.174269604171002e-10, "loss": 3.0908, "num_input_tokens_seen": 5953392, "step": 9105 }, { "epoch": 0.9985750301435932, "grad_norm": 8.837788581848145, "learning_rate": 4.283817997829953e-10, "loss": 2.8613, "num_input_tokens_seen": 5957048, "step": 9110 }, { "epoch": 0.9991230954729804, "grad_norm": 6.420173645019531, "learning_rate": 2.1345006075979e-10, "loss": 2.8579, "num_input_tokens_seen": 5959744, "step": 9115 }, { "epoch": 0.9996711608023676, "grad_norm": 8.133180618286133, "learning_rate": 7.263238052668264e-11, "loss": 3.1424, "num_input_tokens_seen": 5962752, 
"step": 9120 } ], "logging_steps": 5, "max_steps": 9123, "num_input_tokens_seen": 5964208, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.722124677282202e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }