{ "best_metric": null, "best_model_checkpoint": null, "epoch": 29.0, "eval_steps": 500, "global_step": 900, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.16, "grad_norm": 2.2721662521362305, "learning_rate": 4.999643406399275e-05, "loss": 4.0376, "num_input_tokens_seen": 6208, "step": 5 }, { "epoch": 0.32, "grad_norm": 2.1794309616088867, "learning_rate": 4.998573727324295e-05, "loss": 3.9633, "num_input_tokens_seen": 11968, "step": 10 }, { "epoch": 0.48, "grad_norm": 1.3466308116912842, "learning_rate": 4.9967912679276316e-05, "loss": 3.7175, "num_input_tokens_seen": 18016, "step": 15 }, { "epoch": 0.64, "grad_norm": 1.3052715063095093, "learning_rate": 4.994296536700177e-05, "loss": 3.6465, "num_input_tokens_seen": 23760, "step": 20 }, { "epoch": 0.8, "grad_norm": 1.289433479309082, "learning_rate": 4.9910902453260824e-05, "loss": 3.6384, "num_input_tokens_seen": 29648, "step": 25 }, { "epoch": 0.96, "grad_norm": 1.0926741361618042, "learning_rate": 4.987173308479738e-05, "loss": 3.5703, "num_input_tokens_seen": 35968, "step": 30 }, { "epoch": 1.12, "grad_norm": 1.3285070657730103, "learning_rate": 4.982546843564834e-05, "loss": 3.5096, "num_input_tokens_seen": 42112, "step": 35 }, { "epoch": 1.28, "grad_norm": 1.691308856010437, "learning_rate": 4.977212170395598e-05, "loss": 3.2806, "num_input_tokens_seen": 48352, "step": 40 }, { "epoch": 1.44, "grad_norm": 1.3265893459320068, "learning_rate": 4.971170810820279e-05, "loss": 3.4431, "num_input_tokens_seen": 54144, "step": 45 }, { "epoch": 1.6, "grad_norm": 1.2742599248886108, "learning_rate": 4.964424488287009e-05, "loss": 3.2597, "num_input_tokens_seen": 60224, "step": 50 }, { "epoch": 1.76, "grad_norm": 1.2702339887619019, "learning_rate": 4.9569751273521454e-05, "loss": 3.275, "num_input_tokens_seen": 66512, "step": 55 }, { "epoch": 1.92, "grad_norm": 1.6329398155212402, "learning_rate": 4.948824853131236e-05, "loss": 3.1563, "num_input_tokens_seen": 72208, "step": 60 }, { "epoch": 2.08, "grad_norm": 1.348823070526123, "learning_rate": 4.939975990692789e-05, "loss": 3.1883, "num_input_tokens_seen": 78368, "step": 65 }, { "epoch": 2.24, "grad_norm": 1.441171646118164, "learning_rate": 4.930431064394977e-05, "loss": 3.2454, "num_input_tokens_seen": 84288, "step": 70 }, { "epoch": 2.4, "grad_norm": 1.3091288805007935, "learning_rate": 4.920192797165511e-05, "loss": 3.1905, "num_input_tokens_seen": 90464, "step": 75 }, { "epoch": 2.56, "grad_norm": 1.6830319166183472, "learning_rate": 4.909264109724853e-05, "loss": 3.0087, "num_input_tokens_seen": 96704, "step": 80 }, { "epoch": 2.7199999999999998, "grad_norm": 1.941502332687378, "learning_rate": 4.897648119753006e-05, "loss": 3.0016, "num_input_tokens_seen": 102352, "step": 85 }, { "epoch": 2.88, "grad_norm": 1.8244129419326782, "learning_rate": 4.885348141000122e-05, "loss": 3.1813, "num_input_tokens_seen": 108112, "step": 90 }, { "epoch": 3.04, "grad_norm": 1.6675082445144653, "learning_rate": 4.872367682341173e-05, "loss": 3.1158, "num_input_tokens_seen": 114240, "step": 95 }, { "epoch": 3.2, "grad_norm": 1.8476835489273071, "learning_rate": 4.858710446774951e-05, "loss": 2.9404, "num_input_tokens_seen": 119936, "step": 100 }, { "epoch": 3.36, "grad_norm": 1.8608956336975098, "learning_rate": 4.844380330367701e-05, "loss": 2.9749, "num_input_tokens_seen": 125984, "step": 105 }, { "epoch": 3.52, "grad_norm": 1.9425894021987915, "learning_rate": 4.829381421141671e-05, "loss": 2.9456, "num_input_tokens_seen": 131808, "step": 110 }, { "epoch": 3.68, "grad_norm": 2.114993095397949, "learning_rate": 4.8137179979088995e-05, "loss": 2.8505, "num_input_tokens_seen": 137792, "step": 115 }, { "epoch": 3.84, "grad_norm": 1.7963783740997314, "learning_rate": 4.7973945290505766e-05, "loss": 3.0044, "num_input_tokens_seen": 144336, "step": 120 }, { "epoch": 4.0, "grad_norm": 1.7597969770431519, "learning_rate": 4.780415671242334e-05, "loss": 3.0709, "num_input_tokens_seen": 150336, "step": 125 }, { "epoch": 4.16, "grad_norm": 1.8621456623077393, "learning_rate": 4.7627862681258037e-05, "loss": 2.912, "num_input_tokens_seen": 156768, "step": 130 }, { "epoch": 4.32, "grad_norm": 2.021226167678833, "learning_rate": 4.7445113489268544e-05, "loss": 2.8063, "num_input_tokens_seen": 163232, "step": 135 }, { "epoch": 4.48, "grad_norm": 2.2448067665100098, "learning_rate": 4.725596127020879e-05, "loss": 2.7714, "num_input_tokens_seen": 169616, "step": 140 }, { "epoch": 4.64, "grad_norm": 5.535823822021484, "learning_rate": 4.706045998445548e-05, "loss": 2.8277, "num_input_tokens_seen": 175664, "step": 145 }, { "epoch": 4.8, "grad_norm": 2.4096429347991943, "learning_rate": 4.685866540361456e-05, "loss": 2.7456, "num_input_tokens_seen": 181232, "step": 150 }, { "epoch": 4.96, "grad_norm": 2.699846029281616, "learning_rate": 4.665063509461097e-05, "loss": 2.7187, "num_input_tokens_seen": 186960, "step": 155 }, { "epoch": 5.12, "grad_norm": 2.661548376083374, "learning_rate": 4.643642840326627e-05, "loss": 2.7918, "num_input_tokens_seen": 192640, "step": 160 }, { "epoch": 5.28, "grad_norm": 2.395092725753784, "learning_rate": 4.621610643736878e-05, "loss": 2.6223, "num_input_tokens_seen": 198672, "step": 165 }, { "epoch": 5.44, "grad_norm": 2.329503059387207, "learning_rate": 4.598973204924097e-05, "loss": 2.654, "num_input_tokens_seen": 204976, "step": 170 }, { "epoch": 5.6, "grad_norm": 2.3466107845306396, "learning_rate": 4.5757369817809415e-05, "loss": 2.7321, "num_input_tokens_seen": 211168, "step": 175 }, { "epoch": 5.76, "grad_norm": 2.9179327487945557, "learning_rate": 4.551908603018191e-05, "loss": 2.71, "num_input_tokens_seen": 217072, "step": 180 }, { "epoch": 5.92, "grad_norm": 2.7011966705322266, "learning_rate": 4.527494866273753e-05, "loss": 2.7663, "num_input_tokens_seen": 223136, "step": 185 }, { "epoch": 6.08, "grad_norm": 2.610811948776245, "learning_rate": 4.502502736173462e-05, "loss": 2.4595, "num_input_tokens_seen": 229088, "step": 190 }, { "epoch": 6.24, "grad_norm": 2.6016838550567627, "learning_rate": 4.476939342344246e-05, "loss": 2.5734, "num_input_tokens_seen": 235184, "step": 195 }, { "epoch": 6.4, "grad_norm": 3.063948392868042, "learning_rate": 4.45081197738023e-05, "loss": 2.3804, "num_input_tokens_seen": 241152, "step": 200 }, { "epoch": 6.5600000000000005, "grad_norm": 3.9621613025665283, "learning_rate": 4.424128094762331e-05, "loss": 2.4395, "num_input_tokens_seen": 247136, "step": 205 }, { "epoch": 6.72, "grad_norm": 2.887089967727661, "learning_rate": 4.3968953067319777e-05, "loss": 2.3477, "num_input_tokens_seen": 253232, "step": 210 }, { "epoch": 6.88, "grad_norm": 2.8491384983062744, "learning_rate": 4.369121382119523e-05, "loss": 2.5377, "num_input_tokens_seen": 258992, "step": 215 }, { "epoch": 7.04, "grad_norm": 3.3222053050994873, "learning_rate": 4.340814244127993e-05, "loss": 2.5576, "num_input_tokens_seen": 265392, "step": 220 }, { "epoch": 7.2, "grad_norm": 3.4029548168182373, "learning_rate": 4.3119819680728e-05, "loss": 2.2721, "num_input_tokens_seen": 271344, "step": 225 }, { "epoch": 7.36, "grad_norm": 3.4295144081115723, "learning_rate": 4.282632779078051e-05, "loss": 2.4025, "num_input_tokens_seen": 277120, "step": 230 }, { "epoch": 7.52, "grad_norm": 3.2438597679138184, "learning_rate": 4.2527750497301323e-05, "loss": 2.2175, "num_input_tokens_seen": 283152, "step": 235 }, { "epoch": 7.68, "grad_norm": 3.3982954025268555, "learning_rate": 4.222417297689217e-05, "loss": 2.1803, "num_input_tokens_seen": 289136, "step": 240 }, { "epoch": 7.84, "grad_norm": 2.70150089263916, "learning_rate": 4.191568183259394e-05, "loss": 2.4197, "num_input_tokens_seen": 295968, "step": 245 }, { "epoch": 8.0, "grad_norm": 3.426187515258789, "learning_rate": 4.160236506918098e-05, "loss": 2.2351, "num_input_tokens_seen": 301472, "step": 250 }, { "epoch": 8.16, "grad_norm": 4.023019313812256, "learning_rate": 4.128431206805557e-05, "loss": 2.1748, "num_input_tokens_seen": 307216, "step": 255 }, { "epoch": 8.32, "grad_norm": 3.1614391803741455, "learning_rate": 4.096161356174959e-05, "loss": 2.1173, "num_input_tokens_seen": 313424, "step": 260 }, { "epoch": 8.48, "grad_norm": 3.4006361961364746, "learning_rate": 4.063436160804092e-05, "loss": 2.0966, "num_input_tokens_seen": 319472, "step": 265 }, { "epoch": 8.64, "grad_norm": 4.167815208435059, "learning_rate": 4.030264956369157e-05, "loss": 2.0825, "num_input_tokens_seen": 325504, "step": 270 }, { "epoch": 8.8, "grad_norm": 4.040388584136963, "learning_rate": 3.9966572057815373e-05, "loss": 2.0189, "num_input_tokens_seen": 331520, "step": 275 }, { "epoch": 8.96, "grad_norm": 5.056305885314941, "learning_rate": 3.962622496488269e-05, "loss": 2.0671, "num_input_tokens_seen": 337680, "step": 280 }, { "epoch": 9.12, "grad_norm": 4.466280937194824, "learning_rate": 3.928170537736981e-05, "loss": 2.0337, "num_input_tokens_seen": 343936, "step": 285 }, { "epoch": 9.28, "grad_norm": 4.8046698570251465, "learning_rate": 3.893311157806091e-05, "loss": 1.7892, "num_input_tokens_seen": 349632, "step": 290 }, { "epoch": 9.44, "grad_norm": 5.066511631011963, "learning_rate": 3.858054301201047e-05, "loss": 1.8558, "num_input_tokens_seen": 356080, "step": 295 }, { "epoch": 9.6, "grad_norm": 4.678648948669434, "learning_rate": 3.822410025817406e-05, "loss": 1.9685, "num_input_tokens_seen": 362048, "step": 300 }, { "epoch": 9.76, "grad_norm": 4.883659839630127, "learning_rate": 3.786388500071572e-05, "loss": 1.6883, "num_input_tokens_seen": 368128, "step": 305 }, { "epoch": 9.92, "grad_norm": 4.799614429473877, "learning_rate": 3.7500000000000003e-05, "loss": 1.8049, "num_input_tokens_seen": 373920, "step": 310 }, { "epoch": 10.08, "grad_norm": 4.910759925842285, "learning_rate": 3.713254906327703e-05, "loss": 1.8785, "num_input_tokens_seen": 379920, "step": 315 }, { "epoch": 10.24, "grad_norm": 6.2078447341918945, "learning_rate": 3.67616370150689e-05, "loss": 1.8797, "num_input_tokens_seen": 386464, "step": 320 }, { "epoch": 10.4, "grad_norm": 6.023489952087402, "learning_rate": 3.638736966726585e-05, "loss": 1.6139, "num_input_tokens_seen": 392608, "step": 325 }, { "epoch": 10.56, "grad_norm": 5.2762017250061035, "learning_rate": 3.600985378894086e-05, "loss": 1.4428, "num_input_tokens_seen": 398224, "step": 330 }, { "epoch": 10.72, "grad_norm": 6.009916305541992, "learning_rate": 3.562919707589102e-05, "loss": 1.6858, "num_input_tokens_seen": 404480, "step": 335 }, { "epoch": 10.88, "grad_norm": 5.862870216369629, "learning_rate": 3.5245508119914687e-05, "loss": 1.5669, "num_input_tokens_seen": 410448, "step": 340 }, { "epoch": 11.04, "grad_norm": 5.693241119384766, "learning_rate": 3.4858896377832966e-05, "loss": 1.5857, "num_input_tokens_seen": 416544, "step": 345 }, { "epoch": 11.2, "grad_norm": 7.012350082397461, "learning_rate": 3.44694721402644e-05, "loss": 1.4511, "num_input_tokens_seen": 422032, "step": 350 }, { "epoch": 11.36, "grad_norm": 6.417391777038574, "learning_rate": 3.407734650016187e-05, "loss": 1.4448, "num_input_tokens_seen": 428224, "step": 355 }, { "epoch": 11.52, "grad_norm": 5.746353626251221, "learning_rate": 3.3682631321120504e-05, "loss": 1.4648, "num_input_tokens_seen": 434368, "step": 360 }, { "epoch": 11.68, "grad_norm": 6.427619457244873, "learning_rate": 3.32854392054659e-05, "loss": 1.4896, "num_input_tokens_seen": 440816, "step": 365 }, { "epoch": 11.84, "grad_norm": 6.434299468994141, "learning_rate": 3.2885883462131394e-05, "loss": 1.2713, "num_input_tokens_seen": 446992, "step": 370 }, { "epoch": 12.0, "grad_norm": 6.6553521156311035, "learning_rate": 3.2484078074333954e-05, "loss": 1.4016, "num_input_tokens_seen": 452864, "step": 375 }, { "epoch": 12.16, "grad_norm": 6.007937431335449, "learning_rate": 3.2080137667057595e-05, "loss": 1.1953, "num_input_tokens_seen": 458800, "step": 380 }, { "epoch": 12.32, "grad_norm": 7.414127826690674, "learning_rate": 3.167417747435379e-05, "loss": 1.235, "num_input_tokens_seen": 465040, "step": 385 }, { "epoch": 12.48, "grad_norm": 7.6087565422058105, "learning_rate": 3.126631330646802e-05, "loss": 1.1375, "num_input_tokens_seen": 470544, "step": 390 }, { "epoch": 12.64, "grad_norm": 7.889481067657471, "learning_rate": 3.0856661516802054e-05, "loss": 1.1427, "num_input_tokens_seen": 476608, "step": 395 }, { "epoch": 12.8, "grad_norm": 5.883874416351318, "learning_rate": 3.0445338968721287e-05, "loss": 1.3749, "num_input_tokens_seen": 483136, "step": 400 }, { "epoch": 12.96, "grad_norm": 5.944591045379639, "learning_rate": 3.0032463002216505e-05, "loss": 1.2318, "num_input_tokens_seen": 488752, "step": 405 }, { "epoch": 13.12, "grad_norm": 5.7361931800842285, "learning_rate": 2.961815140042974e-05, "loss": 1.0918, "num_input_tokens_seen": 495040, "step": 410 }, { "epoch": 13.28, "grad_norm": 5.651491165161133, "learning_rate": 2.920252235605371e-05, "loss": 1.0229, "num_input_tokens_seen": 500816, "step": 415 }, { "epoch": 13.44, "grad_norm": 7.200684547424316, "learning_rate": 2.878569443761442e-05, "loss": 0.9925, "num_input_tokens_seen": 506336, "step": 420 }, { "epoch": 13.6, "grad_norm": 5.146182060241699, "learning_rate": 2.836778655564653e-05, "loss": 1.05, "num_input_tokens_seen": 512848, "step": 425 }, { "epoch": 13.76, "grad_norm": 5.351944923400879, "learning_rate": 2.7948917928771158e-05, "loss": 0.9813, "num_input_tokens_seen": 519024, "step": 430 }, { "epoch": 13.92, "grad_norm": 6.804362773895264, "learning_rate": 2.7529208049685807e-05, "loss": 1.1116, "num_input_tokens_seen": 525536, "step": 435 }, { "epoch": 14.08, "grad_norm": 5.708404064178467, "learning_rate": 2.7108776651076118e-05, "loss": 0.9063, "num_input_tokens_seen": 531200, "step": 440 }, { "epoch": 14.24, "grad_norm": 5.502604007720947, "learning_rate": 2.668774367145913e-05, "loss": 0.8812, "num_input_tokens_seen": 537232, "step": 445 }, { "epoch": 14.4, "grad_norm": 5.171998500823975, "learning_rate": 2.6266229220967818e-05, "loss": 0.7509, "num_input_tokens_seen": 543120, "step": 450 }, { "epoch": 14.56, "grad_norm": 5.874445915222168, "learning_rate": 2.584435354708671e-05, "loss": 1.0827, "num_input_tokens_seen": 549312, "step": 455 }, { "epoch": 14.72, "grad_norm": 6.16728401184082, "learning_rate": 2.5422237000348276e-05, "loss": 0.8113, "num_input_tokens_seen": 555408, "step": 460 }, { "epoch": 14.88, "grad_norm": 7.150355339050293, "learning_rate": 2.5e-05, "loss": 0.8095, "num_input_tokens_seen": 561648, "step": 465 }, { "epoch": 15.04, "grad_norm": 4.866459369659424, "learning_rate": 2.4577762999651726e-05, "loss": 0.9768, "num_input_tokens_seen": 568048, "step": 470 }, { "epoch": 15.2, "grad_norm": 5.219202995300293, "learning_rate": 2.4155646452913296e-05, "loss": 0.8017, "num_input_tokens_seen": 574352, "step": 475 }, { "epoch": 15.36, "grad_norm": 5.405744552612305, "learning_rate": 2.3733770779032184e-05, "loss": 0.8299, "num_input_tokens_seen": 580160, "step": 480 }, { "epoch": 15.52, "grad_norm": 6.140029430389404, "learning_rate": 2.331225632854087e-05, "loss": 0.581, "num_input_tokens_seen": 585952, "step": 485 }, { "epoch": 15.68, "grad_norm": 5.105503559112549, "learning_rate": 2.2891223348923884e-05, "loss": 0.6687, "num_input_tokens_seen": 591968, "step": 490 }, { "epoch": 15.84, "grad_norm": 6.098103046417236, "learning_rate": 2.24707919503142e-05, "loss": 0.8088, "num_input_tokens_seen": 598128, "step": 495 }, { "epoch": 16.0, "grad_norm": 6.3401780128479, "learning_rate": 2.2051082071228854e-05, "loss": 0.8223, "num_input_tokens_seen": 604192, "step": 500 }, { "epoch": 16.16, "grad_norm": 5.5078630447387695, "learning_rate": 2.1632213444353482e-05, "loss": 0.5949, "num_input_tokens_seen": 609872, "step": 505 }, { "epoch": 16.32, "grad_norm": 5.564432621002197, "learning_rate": 2.1214305562385592e-05, "loss": 0.705, "num_input_tokens_seen": 615584, "step": 510 }, { "epoch": 16.48, "grad_norm": 4.540548801422119, "learning_rate": 2.07974776439463e-05, "loss": 0.3912, "num_input_tokens_seen": 621536, "step": 515 }, { "epoch": 16.64, "grad_norm": 5.815915107727051, "learning_rate": 2.0381848599570276e-05, "loss": 0.6312, "num_input_tokens_seen": 627616, "step": 520 }, { "epoch": 16.8, "grad_norm": 5.4646501541137695, "learning_rate": 1.9967536997783494e-05, "loss": 0.8473, "num_input_tokens_seen": 633952, "step": 525 }, { "epoch": 16.96, "grad_norm": 6.268056869506836, "learning_rate": 1.9554661031278712e-05, "loss": 0.7956, "num_input_tokens_seen": 640720, "step": 530 }, { "epoch": 17.12, "grad_norm": 5.364756107330322, "learning_rate": 1.914333848319795e-05, "loss": 0.6384, "num_input_tokens_seen": 646976, "step": 535 }, { "epoch": 17.28, "grad_norm": 5.4850263595581055, "learning_rate": 1.8733686693531985e-05, "loss": 0.4768, "num_input_tokens_seen": 653296, "step": 540 }, { "epoch": 17.44, "grad_norm": 5.440878391265869, "learning_rate": 1.8325822525646208e-05, "loss": 0.6033, "num_input_tokens_seen": 659232, "step": 545 }, { "epoch": 17.6, "grad_norm": 7.1814374923706055, "learning_rate": 1.79198623329424e-05, "loss": 0.549, "num_input_tokens_seen": 665248, "step": 550 }, { "epoch": 17.76, "grad_norm": 4.951470851898193, "learning_rate": 1.7515921925666052e-05, "loss": 0.5032, "num_input_tokens_seen": 671104, "step": 555 }, { "epoch": 17.92, "grad_norm": 4.948657989501953, "learning_rate": 1.711411653786861e-05, "loss": 0.524, "num_input_tokens_seen": 676720, "step": 560 }, { "epoch": 18.08, "grad_norm": 4.3077898025512695, "learning_rate": 1.6714560794534108e-05, "loss": 0.5206, "num_input_tokens_seen": 683104, "step": 565 }, { "epoch": 18.24, "grad_norm": 4.8670220375061035, "learning_rate": 1.6317368678879495e-05, "loss": 0.4662, "num_input_tokens_seen": 688928, "step": 570 }, { "epoch": 18.4, "grad_norm": 5.523406028747559, "learning_rate": 1.5922653499838137e-05, "loss": 0.5093, "num_input_tokens_seen": 695072, "step": 575 }, { "epoch": 18.56, "grad_norm": 4.92819356918335, "learning_rate": 1.55305278597356e-05, "loss": 0.5162, "num_input_tokens_seen": 701248, "step": 580 }, { "epoch": 18.72, "grad_norm": 4.869935989379883, "learning_rate": 1.5141103622167041e-05, "loss": 0.5355, "num_input_tokens_seen": 707296, "step": 585 }, { "epoch": 18.88, "grad_norm": 5.4244818687438965, "learning_rate": 1.475449188008532e-05, "loss": 0.3836, "num_input_tokens_seen": 713120, "step": 590 }, { "epoch": 19.04, "grad_norm": 4.891148090362549, "learning_rate": 1.437080292410899e-05, "loss": 0.519, "num_input_tokens_seen": 719664, "step": 595 }, { "epoch": 19.2, "grad_norm": 4.736128330230713, "learning_rate": 1.399014621105914e-05, "loss": 0.4391, "num_input_tokens_seen": 726224, "step": 600 }, { "epoch": 19.36, "grad_norm": 4.512727737426758, "learning_rate": 1.361263033273415e-05, "loss": 0.4404, "num_input_tokens_seen": 732144, "step": 605 }, { "epoch": 19.52, "grad_norm": 4.712576389312744, "learning_rate": 1.3238362984931113e-05, "loss": 0.3758, "num_input_tokens_seen": 737824, "step": 610 }, { "epoch": 19.68, "grad_norm": 5.2509379386901855, "learning_rate": 1.286745093672298e-05, "loss": 0.483, "num_input_tokens_seen": 743984, "step": 615 }, { "epoch": 19.84, "grad_norm": 5.451502323150635, "learning_rate": 1.2500000000000006e-05, "loss": 0.4637, "num_input_tokens_seen": 750016, "step": 620 }, { "epoch": 20.0, "grad_norm": 5.625941753387451, "learning_rate": 1.2136114999284288e-05, "loss": 0.4252, "num_input_tokens_seen": 756160, "step": 625 }, { "epoch": 20.16, "grad_norm": 4.41165018081665, "learning_rate": 1.1775899741825947e-05, "loss": 0.3591, "num_input_tokens_seen": 762576, "step": 630 }, { "epoch": 20.32, "grad_norm": 4.646330833435059, "learning_rate": 1.141945698798954e-05, "loss": 0.5274, "num_input_tokens_seen": 768768, "step": 635 }, { "epoch": 20.48, "grad_norm": 3.5506694316864014, "learning_rate": 1.1066888421939093e-05, "loss": 0.4133, "num_input_tokens_seen": 774480, "step": 640 }, { "epoch": 20.64, "grad_norm": 4.34029483795166, "learning_rate": 1.0718294622630188e-05, "loss": 0.274, "num_input_tokens_seen": 780368, "step": 645 }, { "epoch": 20.8, "grad_norm": 4.8732476234436035, "learning_rate": 1.0373775035117305e-05, "loss": 0.3506, "num_input_tokens_seen": 786368, "step": 650 }, { "epoch": 20.96, "grad_norm": 3.6054153442382812, "learning_rate": 1.0033427942184622e-05, "loss": 0.3099, "num_input_tokens_seen": 792304, "step": 655 }, { "epoch": 21.12, "grad_norm": 4.9192914962768555, "learning_rate": 9.697350436308427e-06, "loss": 0.409, "num_input_tokens_seen": 798464, "step": 660 }, { "epoch": 21.28, "grad_norm": 3.9316163063049316, "learning_rate": 9.36563839195908e-06, "loss": 0.4555, "num_input_tokens_seen": 805152, "step": 665 }, { "epoch": 21.44, "grad_norm": 4.707286834716797, "learning_rate": 9.038386438250415e-06, "loss": 0.3132, "num_input_tokens_seen": 810848, "step": 670 }, { "epoch": 21.6, "grad_norm": 5.040837287902832, "learning_rate": 8.715687931944449e-06, "loss": 0.2986, "num_input_tokens_seen": 816704, "step": 675 }, { "epoch": 21.76, "grad_norm": 5.323684215545654, "learning_rate": 8.397634930819021e-06, "loss": 0.3246, "num_input_tokens_seen": 822272, "step": 680 }, { "epoch": 21.92, "grad_norm": 4.811425685882568, "learning_rate": 8.084318167406066e-06, "loss": 0.349, "num_input_tokens_seen": 828256, "step": 685 }, { "epoch": 22.08, "grad_norm": 3.185715675354004, "learning_rate": 7.775827023107838e-06, "loss": 0.2495, "num_input_tokens_seen": 834672, "step": 690 }, { "epoch": 22.24, "grad_norm": 5.034013748168945, "learning_rate": 7.472249502698686e-06, "loss": 0.4472, "num_input_tokens_seen": 841184, "step": 695 }, { "epoch": 22.4, "grad_norm": 3.9689395427703857, "learning_rate": 7.173672209219495e-06, "loss": 0.2427, "num_input_tokens_seen": 847104, "step": 700 }, { "epoch": 22.56, "grad_norm": 5.952221870422363, "learning_rate": 6.880180319272006e-06, "loss": 0.2333, "num_input_tokens_seen": 853040, "step": 705 }, { "epoch": 22.72, "grad_norm": 5.011849403381348, "learning_rate": 6.591857558720071e-06, "loss": 0.3539, "num_input_tokens_seen": 859040, "step": 710 }, { "epoch": 22.88, "grad_norm": 5.248806953430176, "learning_rate": 6.308786178804782e-06, "loss": 0.3516, "num_input_tokens_seen": 865168, "step": 715 }, { "epoch": 23.04, "grad_norm": 3.6265501976013184, "learning_rate": 6.031046932680226e-06, "loss": 0.2574, "num_input_tokens_seen": 870624, "step": 720 }, { "epoch": 23.2, "grad_norm": 4.028469562530518, "learning_rate": 5.758719052376693e-06, "loss": 0.2764, "num_input_tokens_seen": 876288, "step": 725 }, { "epoch": 23.36, "grad_norm": 4.448588848114014, "learning_rate": 5.491880226197707e-06, "loss": 0.2085, "num_input_tokens_seen": 881968, "step": 730 }, { "epoch": 23.52, "grad_norm": 4.58131742477417, "learning_rate": 5.23060657655754e-06, "loss": 0.2651, "num_input_tokens_seen": 888496, "step": 735 }, { "epoch": 23.68, "grad_norm": 3.284576416015625, "learning_rate": 4.9749726382653905e-06, "loss": 0.3042, "num_input_tokens_seen": 894880, "step": 740 }, { "epoch": 23.84, "grad_norm": 4.4178972244262695, "learning_rate": 4.725051337262476e-06, "loss": 0.334, "num_input_tokens_seen": 900624, "step": 745 }, { "epoch": 24.0, "grad_norm": 4.631526470184326, "learning_rate": 4.480913969818098e-06, "loss": 0.382, "num_input_tokens_seen": 906880, "step": 750 }, { "epoch": 24.16, "grad_norm": 3.961879014968872, "learning_rate": 4.242630182190594e-06, "loss": 0.2669, "num_input_tokens_seen": 913472, "step": 755 }, { "epoch": 24.32, "grad_norm": 4.3941779136657715, "learning_rate": 4.010267950759025e-06, "loss": 0.2149, "num_input_tokens_seen": 919360, "step": 760 }, { "epoch": 24.48, "grad_norm": 3.2143290042877197, "learning_rate": 3.7838935626312242e-06, "loss": 0.2215, "num_input_tokens_seen": 924928, "step": 765 }, { "epoch": 24.64, "grad_norm": 4.793003559112549, "learning_rate": 3.5635715967337223e-06, "loss": 0.2804, "num_input_tokens_seen": 930816, "step": 770 }, { "epoch": 24.8, "grad_norm": 3.6798858642578125, "learning_rate": 3.3493649053890326e-06, "loss": 0.348, "num_input_tokens_seen": 937088, "step": 775 }, { "epoch": 24.96, "grad_norm": 4.566779136657715, "learning_rate": 3.141334596385448e-06, "loss": 0.3526, "num_input_tokens_seen": 943536, "step": 780 }, { "epoch": 25.12, "grad_norm": 3.3694982528686523, "learning_rate": 2.939540015544523e-06, "loss": 0.1965, "num_input_tokens_seen": 949360, "step": 785 }, { "epoch": 25.28, "grad_norm": 3.4644641876220703, "learning_rate": 2.7440387297912123e-06, "loss": 0.2734, "num_input_tokens_seen": 955952, "step": 790 }, { "epoch": 25.44, "grad_norm": 3.234680652618408, "learning_rate": 2.5548865107314607e-06, "loss": 0.2059, "num_input_tokens_seen": 961696, "step": 795 }, { "epoch": 25.6, "grad_norm": 4.125905513763428, "learning_rate": 2.372137318741968e-06, "loss": 0.1972, "num_input_tokens_seen": 967456, "step": 800 }, { "epoch": 25.96, "grad_norm": 3.860513925552368, "learning_rate": 2.1958432875766653e-06, "loss": 0.3445, "num_input_tokens_seen": 973440, "step": 805 }, { "epoch": 26.12, "grad_norm": 3.0495705604553223, "learning_rate": 2.026054709494235e-06, "loss": 0.1877, "num_input_tokens_seen": 979008, "step": 810 }, { "epoch": 26.28, "grad_norm": 4.571675777435303, "learning_rate": 1.8628200209110131e-06, "loss": 0.2929, "num_input_tokens_seen": 984928, "step": 815 }, { "epoch": 26.44, "grad_norm": 3.220628261566162, "learning_rate": 1.7061857885832893e-06, "loss": 0.2057, "num_input_tokens_seen": 991088, "step": 820 }, { "epoch": 26.6, "grad_norm": 3.709768772125244, "learning_rate": 1.5561966963229897e-06, "loss": 0.2783, "num_input_tokens_seen": 997424, "step": 825 }, { "epoch": 26.76, "grad_norm": 3.6007933616638184, "learning_rate": 1.4128955322504966e-06, "loss": 0.1953, "num_input_tokens_seen": 1003680, "step": 830 }, { "epoch": 26.92, "grad_norm": 3.8975977897644043, "learning_rate": 1.2763231765882732e-06, "loss": 0.3187, "num_input_tokens_seen": 1009696, "step": 835 }, { "epoch": 27.08, "grad_norm": 3.8394689559936523, "learning_rate": 1.1465185899987797e-06, "loss": 0.3491, "num_input_tokens_seen": 1015648, "step": 840 }, { "epoch": 27.24, "grad_norm": 4.134805202484131, "learning_rate": 1.023518802469947e-06, "loss": 0.2368, "num_input_tokens_seen": 1021664, "step": 845 }, { "epoch": 27.4, "grad_norm": 2.897068738937378, "learning_rate": 9.073589027514789e-07, "loss": 0.2457, "num_input_tokens_seen": 1027696, "step": 850 }, { "epoch": 27.56, "grad_norm": 3.8189313411712646, "learning_rate": 7.980720283448956e-07, "loss": 0.3219, "num_input_tokens_seen": 1033904, "step": 855 }, { "epoch": 27.72, "grad_norm": 4.735554218292236, "learning_rate": 6.956893560502359e-07, "loss": 0.2515, "num_input_tokens_seen": 1039888, "step": 860 }, { "epoch": 27.88, "grad_norm": 3.0966691970825195, "learning_rate": 6.002400930721186e-07, "loss": 0.1924, "num_input_tokens_seen": 1045872, "step": 865 }, { "epoch": 28.04, "grad_norm": 4.062044620513916, "learning_rate": 5.117514686876379e-07, "loss": 0.2513, "num_input_tokens_seen": 1052192, "step": 870 }, { "epoch": 28.2, "grad_norm": 3.9572033882141113, "learning_rate": 4.302487264785521e-07, "loss": 0.2095, "num_input_tokens_seen": 1058272, "step": 875 }, { "epoch": 28.36, "grad_norm": 4.306464672088623, "learning_rate": 3.557551171299051e-07, "loss": 0.298, "num_input_tokens_seen": 1064976, "step": 880 }, { "epoch": 28.52, "grad_norm": 3.562727212905884, "learning_rate": 2.8829189179721547e-07, "loss": 0.1964, "num_input_tokens_seen": 1070944, "step": 885 }, { "epoch": 28.68, "grad_norm": 4.07358980178833, "learning_rate": 2.27878296044029e-07, "loss": 0.2523, "num_input_tokens_seen": 1076656, "step": 890 }, { "epoch": 28.84, "grad_norm": 4.010415554046631, "learning_rate": 1.7453156435165986e-07, "loss": 0.3446, "num_input_tokens_seen": 1082896, "step": 895 }, { "epoch": 29.0, "grad_norm": 2.8657336235046387, "learning_rate": 1.2826691520262114e-07, "loss": 0.1546, "num_input_tokens_seen": 1088576, "step": 900 } ], "logging_steps": 5, "max_steps": 930, "num_input_tokens_seen": 1088576, "num_train_epochs": 30, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 8622946508931072.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }