{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9998492272898606, "eval_steps": 500, "global_step": 4421, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0011307953260459858, "grad_norm": 0.205332413315773, "learning_rate": 2.5e-06, "loss": 0.9778, "step": 5 }, { "epoch": 0.0022615906520919715, "grad_norm": 0.2380959391593933, "learning_rate": 5e-06, "loss": 0.9816, "step": 10 }, { "epoch": 0.003392385978137957, "grad_norm": 0.22828762233257294, "learning_rate": 7.5e-06, "loss": 1.0123, "step": 15 }, { "epoch": 0.004523181304183943, "grad_norm": 0.1957542896270752, "learning_rate": 1e-05, "loss": 0.9404, "step": 20 }, { "epoch": 0.005653976630229929, "grad_norm": 0.2502771019935608, "learning_rate": 1.25e-05, "loss": 0.9604, "step": 25 }, { "epoch": 0.006784771956275914, "grad_norm": 0.24806493520736694, "learning_rate": 1.5e-05, "loss": 1.0407, "step": 30 }, { "epoch": 0.0079155672823219, "grad_norm": 0.28463977575302124, "learning_rate": 1.75e-05, "loss": 1.0461, "step": 35 }, { "epoch": 0.009046362608367886, "grad_norm": 0.2142462134361267, "learning_rate": 2e-05, "loss": 0.9104, "step": 40 }, { "epoch": 0.010177157934413872, "grad_norm": 0.21732334792613983, "learning_rate": 2.25e-05, "loss": 0.8991, "step": 45 }, { "epoch": 0.011307953260459858, "grad_norm": 0.2227325588464737, "learning_rate": 2.5e-05, "loss": 0.8901, "step": 50 }, { "epoch": 0.012438748586505842, "grad_norm": 0.19881105422973633, "learning_rate": 2.7500000000000004e-05, "loss": 0.8378, "step": 55 }, { "epoch": 0.013569543912551827, "grad_norm": 0.21935518085956573, "learning_rate": 3e-05, "loss": 0.8743, "step": 60 }, { "epoch": 0.014700339238597813, "grad_norm": 0.21730449795722961, "learning_rate": 3.2500000000000004e-05, "loss": 0.8588, "step": 65 }, { "epoch": 0.0158311345646438, "grad_norm": 0.23200418055057526, "learning_rate": 3.5e-05, "loss": 0.7527, "step": 70 }, { "epoch": 0.016961929890689786, "grad_norm": 0.20900775492191315, "learning_rate": 3.7500000000000003e-05, "loss": 0.8365, "step": 75 }, { "epoch": 0.018092725216735772, "grad_norm": 0.31192561984062195, "learning_rate": 4e-05, "loss": 0.7791, "step": 80 }, { "epoch": 0.019223520542781758, "grad_norm": 0.25915804505348206, "learning_rate": 4.25e-05, "loss": 0.8506, "step": 85 }, { "epoch": 0.020354315868827744, "grad_norm": 0.20527321100234985, "learning_rate": 4.5e-05, "loss": 0.8062, "step": 90 }, { "epoch": 0.02148511119487373, "grad_norm": 0.2385016530752182, "learning_rate": 4.75e-05, "loss": 0.7525, "step": 95 }, { "epoch": 0.022615906520919715, "grad_norm": 0.2394818663597107, "learning_rate": 5e-05, "loss": 0.7416, "step": 100 }, { "epoch": 0.023746701846965697, "grad_norm": 0.269607275724411, "learning_rate": 4.999983481113995e-05, "loss": 0.7653, "step": 105 }, { "epoch": 0.024877497173011683, "grad_norm": 0.21368731558322906, "learning_rate": 4.9999339246742786e-05, "loss": 0.75, "step": 110 }, { "epoch": 0.02600829249905767, "grad_norm": 0.25945496559143066, "learning_rate": 4.9998513313357435e-05, "loss": 0.7693, "step": 115 }, { "epoch": 0.027139087825103655, "grad_norm": 0.2617523968219757, "learning_rate": 4.999735702189871e-05, "loss": 0.7995, "step": 120 }, { "epoch": 0.02826988315114964, "grad_norm": 0.26992905139923096, "learning_rate": 4.999587038764713e-05, "loss": 0.7784, "step": 125 }, { "epoch": 0.029400678477195626, "grad_norm": 0.23823940753936768, "learning_rate": 4.999405343024871e-05, "loss": 0.7316, "step": 130 }, { "epoch": 0.030531473803241612, "grad_norm": 0.2858569920063019, "learning_rate": 4.9991906173714756e-05, "loss": 0.7796, "step": 135 }, { "epoch": 0.0316622691292876, "grad_norm": 0.25298023223876953, "learning_rate": 4.99894286464215e-05, "loss": 0.7169, "step": 140 }, { "epoch": 0.03279306445533359, "grad_norm": 0.35693949460983276, "learning_rate": 4.998662088110972e-05, "loss": 0.8062, "step": 145 }, { "epoch": 0.03392385978137957, "grad_norm": 0.42634308338165283, "learning_rate": 4.998348291488435e-05, "loss": 0.7035, "step": 150 }, { "epoch": 0.03505465510742556, "grad_norm": 0.34167715907096863, "learning_rate": 4.998001478921395e-05, "loss": 0.7683, "step": 155 }, { "epoch": 0.036185450433471544, "grad_norm": 0.2687824070453644, "learning_rate": 4.997621654993018e-05, "loss": 0.7816, "step": 160 }, { "epoch": 0.03731624575951753, "grad_norm": 0.2919199764728546, "learning_rate": 4.997208824722719e-05, "loss": 0.7392, "step": 165 }, { "epoch": 0.038447041085563516, "grad_norm": 0.24317045509815216, "learning_rate": 4.9967629935660944e-05, "loss": 0.6972, "step": 170 }, { "epoch": 0.0395778364116095, "grad_norm": 0.2556512951850891, "learning_rate": 4.9962841674148516e-05, "loss": 0.7431, "step": 175 }, { "epoch": 0.04070863173765549, "grad_norm": 0.35918310284614563, "learning_rate": 4.99577235259673e-05, "loss": 0.78, "step": 180 }, { "epoch": 0.04183942706370147, "grad_norm": 0.28553536534309387, "learning_rate": 4.9952275558754185e-05, "loss": 0.7467, "step": 185 }, { "epoch": 0.04297022238974746, "grad_norm": 0.25147977471351624, "learning_rate": 4.994649784450465e-05, "loss": 0.7579, "step": 190 }, { "epoch": 0.044101017715793445, "grad_norm": 0.3088456690311432, "learning_rate": 4.994039045957182e-05, "loss": 0.752, "step": 195 }, { "epoch": 0.04523181304183943, "grad_norm": 0.32329487800598145, "learning_rate": 4.993395348466544e-05, "loss": 0.7012, "step": 200 }, { "epoch": 0.046362608367885416, "grad_norm": 0.28732138872146606, "learning_rate": 4.992718700485085e-05, "loss": 0.7247, "step": 205 }, { "epoch": 0.047493403693931395, "grad_norm": 0.2657299339771271, "learning_rate": 4.99200911095478e-05, "loss": 0.7247, "step": 210 }, { "epoch": 0.04862419901997738, "grad_norm": 0.30124104022979736, "learning_rate": 4.991266589252933e-05, "loss": 0.7001, "step": 215 }, { "epoch": 0.049754994346023367, "grad_norm": 0.3533799946308136, "learning_rate": 4.990491145192049e-05, "loss": 0.7714, "step": 220 }, { "epoch": 0.05088578967206935, "grad_norm": 0.29441332817077637, "learning_rate": 4.989682789019706e-05, "loss": 0.7338, "step": 225 }, { "epoch": 0.05201658499811534, "grad_norm": 0.2670339345932007, "learning_rate": 4.988841531418418e-05, "loss": 0.719, "step": 230 }, { "epoch": 0.053147380324161324, "grad_norm": 0.44572877883911133, "learning_rate": 4.9879673835054955e-05, "loss": 0.7315, "step": 235 }, { "epoch": 0.05427817565020731, "grad_norm": 0.29553067684173584, "learning_rate": 4.9870603568328985e-05, "loss": 0.7495, "step": 240 }, { "epoch": 0.055408970976253295, "grad_norm": 0.26393231749534607, "learning_rate": 4.986120463387084e-05, "loss": 0.6637, "step": 245 }, { "epoch": 0.05653976630229928, "grad_norm": 0.35982418060302734, "learning_rate": 4.985147715588845e-05, "loss": 0.7571, "step": 250 }, { "epoch": 0.05767056162834527, "grad_norm": 0.38977113366127014, "learning_rate": 4.9841421262931506e-05, "loss": 0.7551, "step": 255 }, { "epoch": 0.05880135695439125, "grad_norm": 0.28935956954956055, "learning_rate": 4.983103708788972e-05, "loss": 0.7863, "step": 260 }, { "epoch": 0.05993215228043724, "grad_norm": 0.34443530440330505, "learning_rate": 4.98203247679911e-05, "loss": 0.8106, "step": 265 }, { "epoch": 0.061062947606483224, "grad_norm": 0.4763427674770355, "learning_rate": 4.980928444480011e-05, "loss": 0.7729, "step": 270 }, { "epoch": 0.06219374293252921, "grad_norm": 0.2860422730445862, "learning_rate": 4.9797916264215824e-05, "loss": 0.7593, "step": 275 }, { "epoch": 0.0633245382585752, "grad_norm": 0.28870680928230286, "learning_rate": 4.978622037647e-05, "loss": 0.7574, "step": 280 }, { "epoch": 0.06445533358462119, "grad_norm": 0.40277180075645447, "learning_rate": 4.9774196936125056e-05, "loss": 0.799, "step": 285 }, { "epoch": 0.06558612891066717, "grad_norm": 0.3290288746356964, "learning_rate": 4.9761846102072065e-05, "loss": 0.7519, "step": 290 }, { "epoch": 0.06671692423671316, "grad_norm": 0.3139791190624237, "learning_rate": 4.9749168037528635e-05, "loss": 0.6837, "step": 295 }, { "epoch": 0.06784771956275915, "grad_norm": 0.30802035331726074, "learning_rate": 4.9736162910036785e-05, "loss": 0.7662, "step": 300 }, { "epoch": 0.06897851488880513, "grad_norm": 0.34561124444007874, "learning_rate": 4.972283089146067e-05, "loss": 0.6897, "step": 305 }, { "epoch": 0.07010931021485112, "grad_norm": 0.3372039198875427, "learning_rate": 4.970917215798438e-05, "loss": 0.7344, "step": 310 }, { "epoch": 0.0712401055408971, "grad_norm": 0.41160914301872253, "learning_rate": 4.9695186890109567e-05, "loss": 0.832, "step": 315 }, { "epoch": 0.07237090086694309, "grad_norm": 0.2914057672023773, "learning_rate": 4.968087527265306e-05, "loss": 0.7113, "step": 320 }, { "epoch": 0.07350169619298907, "grad_norm": 0.3247675597667694, "learning_rate": 4.966623749474445e-05, "loss": 0.6996, "step": 325 }, { "epoch": 0.07463249151903506, "grad_norm": 0.435735285282135, "learning_rate": 4.9651273749823546e-05, "loss": 0.8236, "step": 330 }, { "epoch": 0.07576328684508105, "grad_norm": 0.3213053047657013, "learning_rate": 4.963598423563788e-05, "loss": 0.7012, "step": 335 }, { "epoch": 0.07689408217112703, "grad_norm": 0.3745056390762329, "learning_rate": 4.962036915424004e-05, "loss": 0.7018, "step": 340 }, { "epoch": 0.07802487749717302, "grad_norm": 0.28368842601776123, "learning_rate": 4.960442871198503e-05, "loss": 0.7084, "step": 345 }, { "epoch": 0.079155672823219, "grad_norm": 0.2621799409389496, "learning_rate": 4.958816311952752e-05, "loss": 0.7217, "step": 350 }, { "epoch": 0.08028646814926499, "grad_norm": 0.25561287999153137, "learning_rate": 4.95715725918191e-05, "loss": 0.7616, "step": 355 }, { "epoch": 0.08141726347531097, "grad_norm": 0.3495071828365326, "learning_rate": 4.9554657348105385e-05, "loss": 0.7061, "step": 360 }, { "epoch": 0.08254805880135696, "grad_norm": 0.3490068018436432, "learning_rate": 4.953741761192317e-05, "loss": 0.7809, "step": 365 }, { "epoch": 0.08367885412740295, "grad_norm": 0.39416739344596863, "learning_rate": 4.9519853611097434e-05, "loss": 0.7282, "step": 370 }, { "epoch": 0.08480964945344893, "grad_norm": 0.2763444185256958, "learning_rate": 4.950196557773837e-05, "loss": 0.7262, "step": 375 }, { "epoch": 0.08594044477949492, "grad_norm": 0.29107871651649475, "learning_rate": 4.948375374823828e-05, "loss": 0.7346, "step": 380 }, { "epoch": 0.0870712401055409, "grad_norm": 0.28965339064598083, "learning_rate": 4.946521836326847e-05, "loss": 0.6768, "step": 385 }, { "epoch": 0.08820203543158689, "grad_norm": 0.31072792410850525, "learning_rate": 4.9446359667776065e-05, "loss": 0.7277, "step": 390 }, { "epoch": 0.08933283075763288, "grad_norm": 0.2789427936077118, "learning_rate": 4.9427177910980794e-05, "loss": 0.7481, "step": 395 }, { "epoch": 0.09046362608367886, "grad_norm": 0.2573710083961487, "learning_rate": 4.9407673346371644e-05, "loss": 0.7077, "step": 400 }, { "epoch": 0.09159442140972485, "grad_norm": 0.4152914881706238, "learning_rate": 4.938784623170357e-05, "loss": 0.7233, "step": 405 }, { "epoch": 0.09272521673577083, "grad_norm": 0.30680012702941895, "learning_rate": 4.936769682899404e-05, "loss": 0.7353, "step": 410 }, { "epoch": 0.0938560120618168, "grad_norm": 0.30145958065986633, "learning_rate": 4.934722540451961e-05, "loss": 0.7001, "step": 415 }, { "epoch": 0.09498680738786279, "grad_norm": 0.31772518157958984, "learning_rate": 4.932643222881238e-05, "loss": 0.7183, "step": 420 }, { "epoch": 0.09611760271390878, "grad_norm": 0.3001084327697754, "learning_rate": 4.930531757665643e-05, "loss": 0.6898, "step": 425 }, { "epoch": 0.09724839803995476, "grad_norm": 0.2780250012874603, "learning_rate": 4.928388172708418e-05, "loss": 0.7782, "step": 430 }, { "epoch": 0.09837919336600075, "grad_norm": 0.28147390484809875, "learning_rate": 4.926212496337272e-05, "loss": 0.7311, "step": 435 }, { "epoch": 0.09950998869204673, "grad_norm": 0.4945797324180603, "learning_rate": 4.924004757304005e-05, "loss": 0.8001, "step": 440 }, { "epoch": 0.10064078401809272, "grad_norm": 0.3075043857097626, "learning_rate": 4.921764984784128e-05, "loss": 0.7233, "step": 445 }, { "epoch": 0.1017715793441387, "grad_norm": 0.3451552093029022, "learning_rate": 4.919493208376479e-05, "loss": 0.6629, "step": 450 }, { "epoch": 0.10290237467018469, "grad_norm": 0.28970155119895935, "learning_rate": 4.917189458102831e-05, "loss": 0.7793, "step": 455 }, { "epoch": 0.10403316999623068, "grad_norm": 0.2446502447128296, "learning_rate": 4.9148537644074936e-05, "loss": 0.6899, "step": 460 }, { "epoch": 0.10516396532227666, "grad_norm": 0.2791134715080261, "learning_rate": 4.912486158156912e-05, "loss": 0.69, "step": 465 }, { "epoch": 0.10629476064832265, "grad_norm": 0.35021790862083435, "learning_rate": 4.910086670639264e-05, "loss": 0.7497, "step": 470 }, { "epoch": 0.10742555597436863, "grad_norm": 0.27730756998062134, "learning_rate": 4.907655333564035e-05, "loss": 0.6799, "step": 475 }, { "epoch": 0.10855635130041462, "grad_norm": 0.3183215856552124, "learning_rate": 4.9051921790616095e-05, "loss": 0.723, "step": 480 }, { "epoch": 0.1096871466264606, "grad_norm": 0.31501445174217224, "learning_rate": 4.902697239682844e-05, "loss": 0.7611, "step": 485 }, { "epoch": 0.11081794195250659, "grad_norm": 0.30429741740226746, "learning_rate": 4.9001705483986314e-05, "loss": 0.7909, "step": 490 }, { "epoch": 0.11194873727855258, "grad_norm": 0.27980148792266846, "learning_rate": 4.8976121385994735e-05, "loss": 0.7085, "step": 495 }, { "epoch": 0.11307953260459856, "grad_norm": 0.2850303649902344, "learning_rate": 4.895022044095034e-05, "loss": 0.751, "step": 500 }, { "epoch": 0.11421032793064455, "grad_norm": 0.30970653891563416, "learning_rate": 4.892400299113693e-05, "loss": 0.6766, "step": 505 }, { "epoch": 0.11534112325669053, "grad_norm": 0.4121417999267578, "learning_rate": 4.8897469383020966e-05, "loss": 0.6824, "step": 510 }, { "epoch": 0.11647191858273652, "grad_norm": 0.3178861737251282, "learning_rate": 4.887061996724696e-05, "loss": 0.6798, "step": 515 }, { "epoch": 0.1176027139087825, "grad_norm": 0.3267967700958252, "learning_rate": 4.884345509863286e-05, "loss": 0.7661, "step": 520 }, { "epoch": 0.11873350923482849, "grad_norm": 0.3270506262779236, "learning_rate": 4.881597513616536e-05, "loss": 0.7321, "step": 525 }, { "epoch": 0.11986430456087448, "grad_norm": 0.3873696029186249, "learning_rate": 4.878818044299517e-05, "loss": 0.7278, "step": 530 }, { "epoch": 0.12099509988692046, "grad_norm": 0.3305418789386749, "learning_rate": 4.876007138643216e-05, "loss": 0.7304, "step": 535 }, { "epoch": 0.12212589521296645, "grad_norm": 0.26419228315353394, "learning_rate": 4.873164833794059e-05, "loss": 0.7248, "step": 540 }, { "epoch": 0.12325669053901243, "grad_norm": 0.3038617968559265, "learning_rate": 4.870291167313413e-05, "loss": 0.6681, "step": 545 }, { "epoch": 0.12438748586505842, "grad_norm": 0.2820129692554474, "learning_rate": 4.8673861771770934e-05, "loss": 0.7434, "step": 550 }, { "epoch": 0.12551828119110442, "grad_norm": 0.3421660363674164, "learning_rate": 4.8644499017748615e-05, "loss": 0.7266, "step": 555 }, { "epoch": 0.1266490765171504, "grad_norm": 0.3642486035823822, "learning_rate": 4.861482379909914e-05, "loss": 0.7421, "step": 560 }, { "epoch": 0.1277798718431964, "grad_norm": 0.35517194867134094, "learning_rate": 4.8584836507983786e-05, "loss": 0.7432, "step": 565 }, { "epoch": 0.12891066716924238, "grad_norm": 0.3161648213863373, "learning_rate": 4.855453754068784e-05, "loss": 0.7098, "step": 570 }, { "epoch": 0.13004146249528836, "grad_norm": 0.296561598777771, "learning_rate": 4.852392729761547e-05, "loss": 0.6641, "step": 575 }, { "epoch": 0.13117225782133435, "grad_norm": 0.323515921831131, "learning_rate": 4.849300618328435e-05, "loss": 0.7522, "step": 580 }, { "epoch": 0.13230305314738033, "grad_norm": 0.34789595007896423, "learning_rate": 4.8461774606320386e-05, "loss": 0.7712, "step": 585 }, { "epoch": 0.13343384847342632, "grad_norm": 0.3661488890647888, "learning_rate": 4.843023297945226e-05, "loss": 0.6862, "step": 590 }, { "epoch": 0.1345646437994723, "grad_norm": 0.43650659918785095, "learning_rate": 4.8398381719506e-05, "loss": 0.7003, "step": 595 }, { "epoch": 0.1356954391255183, "grad_norm": 0.38563141226768494, "learning_rate": 4.836622124739948e-05, "loss": 0.7094, "step": 600 }, { "epoch": 0.13682623445156428, "grad_norm": 0.30190715193748474, "learning_rate": 4.833375198813683e-05, "loss": 0.6664, "step": 605 }, { "epoch": 0.13795702977761026, "grad_norm": 0.35016635060310364, "learning_rate": 4.8300974370802855e-05, "loss": 0.6657, "step": 610 }, { "epoch": 0.13908782510365625, "grad_norm": 0.3495071530342102, "learning_rate": 4.8267888828557315e-05, "loss": 0.7689, "step": 615 }, { "epoch": 0.14021862042970223, "grad_norm": 0.2628171145915985, "learning_rate": 4.823449579862927e-05, "loss": 0.7278, "step": 620 }, { "epoch": 0.14134941575574822, "grad_norm": 0.3362691104412079, "learning_rate": 4.820079572231123e-05, "loss": 0.6934, "step": 625 }, { "epoch": 0.1424802110817942, "grad_norm": 0.32949429750442505, "learning_rate": 4.8166789044953385e-05, "loss": 0.6363, "step": 630 }, { "epoch": 0.1436110064078402, "grad_norm": 0.3482156991958618, "learning_rate": 4.813247621595766e-05, "loss": 0.6735, "step": 635 }, { "epoch": 0.14474180173388618, "grad_norm": 0.27361541986465454, "learning_rate": 4.809785768877183e-05, "loss": 0.6783, "step": 640 }, { "epoch": 0.14587259705993216, "grad_norm": 0.29385972023010254, "learning_rate": 4.80629339208835e-05, "loss": 0.6947, "step": 645 }, { "epoch": 0.14700339238597815, "grad_norm": 0.2907145023345947, "learning_rate": 4.802770537381407e-05, "loss": 0.6583, "step": 650 }, { "epoch": 0.14813418771202413, "grad_norm": 0.3557474613189697, "learning_rate": 4.799217251311261e-05, "loss": 0.6196, "step": 655 }, { "epoch": 0.14926498303807012, "grad_norm": 0.3381137251853943, "learning_rate": 4.795633580834974e-05, "loss": 0.6959, "step": 660 }, { "epoch": 0.1503957783641161, "grad_norm": 0.3507809042930603, "learning_rate": 4.792019573311142e-05, "loss": 0.7787, "step": 665 }, { "epoch": 0.1515265736901621, "grad_norm": 0.3603408634662628, "learning_rate": 4.7883752764992676e-05, "loss": 0.6956, "step": 670 }, { "epoch": 0.15265736901620808, "grad_norm": 0.3778272867202759, "learning_rate": 4.7847007385591295e-05, "loss": 0.6352, "step": 675 }, { "epoch": 0.15378816434225406, "grad_norm": 0.3363897502422333, "learning_rate": 4.7809960080501464e-05, "loss": 0.6615, "step": 680 }, { "epoch": 0.15491895966830005, "grad_norm": 0.32491081953048706, "learning_rate": 4.777261133930735e-05, "loss": 0.7499, "step": 685 }, { "epoch": 0.15604975499434603, "grad_norm": 0.318862646818161, "learning_rate": 4.773496165557663e-05, "loss": 0.725, "step": 690 }, { "epoch": 0.15718055032039202, "grad_norm": 0.45129063725471497, "learning_rate": 4.7697011526853976e-05, "loss": 0.7582, "step": 695 }, { "epoch": 0.158311345646438, "grad_norm": 0.3082630932331085, "learning_rate": 4.7658761454654454e-05, "loss": 0.834, "step": 700 }, { "epoch": 0.159442140972484, "grad_norm": 0.29232099652290344, "learning_rate": 4.762021194445695e-05, "loss": 0.688, "step": 705 }, { "epoch": 0.16057293629852998, "grad_norm": 0.304189532995224, "learning_rate": 4.758136350569743e-05, "loss": 0.6758, "step": 710 }, { "epoch": 0.16170373162457596, "grad_norm": 0.3389667570590973, "learning_rate": 4.754221665176223e-05, "loss": 0.6746, "step": 715 }, { "epoch": 0.16283452695062195, "grad_norm": 0.5311838388442993, "learning_rate": 4.7502771899981284e-05, "loss": 0.8003, "step": 720 }, { "epoch": 0.16396532227666794, "grad_norm": 0.26352110505104065, "learning_rate": 4.7463029771621294e-05, "loss": 0.6647, "step": 725 }, { "epoch": 0.16509611760271392, "grad_norm": 0.3928554058074951, "learning_rate": 4.74229907918788e-05, "loss": 0.7258, "step": 730 }, { "epoch": 0.1662269129287599, "grad_norm": 0.4840872883796692, "learning_rate": 4.738265548987327e-05, "loss": 0.7886, "step": 735 }, { "epoch": 0.1673577082548059, "grad_norm": 0.324370414018631, "learning_rate": 4.734202439864012e-05, "loss": 0.7031, "step": 740 }, { "epoch": 0.16848850358085188, "grad_norm": 0.30743566155433655, "learning_rate": 4.730109805512363e-05, "loss": 0.7228, "step": 745 }, { "epoch": 0.16961929890689786, "grad_norm": 0.3641277551651001, "learning_rate": 4.7259877000169896e-05, "loss": 0.7265, "step": 750 }, { "epoch": 0.17075009423294385, "grad_norm": 0.40837985277175903, "learning_rate": 4.721836177851963e-05, "loss": 0.7128, "step": 755 }, { "epoch": 0.17188088955898984, "grad_norm": 0.28167346119880676, "learning_rate": 4.717655293880102e-05, "loss": 0.6837, "step": 760 }, { "epoch": 0.17301168488503582, "grad_norm": 0.37647080421447754, "learning_rate": 4.713445103352241e-05, "loss": 0.7493, "step": 765 }, { "epoch": 0.1741424802110818, "grad_norm": 0.3222416043281555, "learning_rate": 4.7092056619065084e-05, "loss": 0.6314, "step": 770 }, { "epoch": 0.1752732755371278, "grad_norm": 0.29139477014541626, "learning_rate": 4.704937025567582e-05, "loss": 0.7274, "step": 775 }, { "epoch": 0.17640407086317378, "grad_norm": 0.3189648687839508, "learning_rate": 4.700639250745957e-05, "loss": 0.7202, "step": 780 }, { "epoch": 0.17753486618921976, "grad_norm": 0.26070472598075867, "learning_rate": 4.696312394237195e-05, "loss": 0.7426, "step": 785 }, { "epoch": 0.17866566151526575, "grad_norm": 0.384833961725235, "learning_rate": 4.691956513221174e-05, "loss": 0.7669, "step": 790 }, { "epoch": 0.17979645684131174, "grad_norm": 0.3161134421825409, "learning_rate": 4.6875716652613366e-05, "loss": 0.7224, "step": 795 }, { "epoch": 0.18092725216735772, "grad_norm": 0.40663212537765503, "learning_rate": 4.6831579083039265e-05, "loss": 0.7176, "step": 800 }, { "epoch": 0.1820580474934037, "grad_norm": 0.4073905646800995, "learning_rate": 4.6787153006772214e-05, "loss": 0.7454, "step": 805 }, { "epoch": 0.1831888428194497, "grad_norm": 0.36114805936813354, "learning_rate": 4.6742439010907645e-05, "loss": 0.7271, "step": 810 }, { "epoch": 0.18431963814549568, "grad_norm": 0.35414162278175354, "learning_rate": 4.6697437686345883e-05, "loss": 0.8134, "step": 815 }, { "epoch": 0.18545043347154166, "grad_norm": 0.3441600799560547, "learning_rate": 4.6652149627784324e-05, "loss": 0.7259, "step": 820 }, { "epoch": 0.18658122879758765, "grad_norm": 0.34488874673843384, "learning_rate": 4.660657543370958e-05, "loss": 0.7541, "step": 825 }, { "epoch": 0.1877120241236336, "grad_norm": 0.3300029933452606, "learning_rate": 4.65607157063896e-05, "loss": 0.7123, "step": 830 }, { "epoch": 0.1888428194496796, "grad_norm": 0.39021798968315125, "learning_rate": 4.651457105186566e-05, "loss": 0.7049, "step": 835 }, { "epoch": 0.18997361477572558, "grad_norm": 0.3784525394439697, "learning_rate": 4.646814207994441e-05, "loss": 0.7892, "step": 840 }, { "epoch": 0.19110441010177157, "grad_norm": 0.3650527000427246, "learning_rate": 4.642142940418973e-05, "loss": 0.7315, "step": 845 }, { "epoch": 0.19223520542781755, "grad_norm": 0.36192572116851807, "learning_rate": 4.637443364191474e-05, "loss": 0.6201, "step": 850 }, { "epoch": 0.19336600075386354, "grad_norm": 0.3428821265697479, "learning_rate": 4.6327155414173554e-05, "loss": 0.7248, "step": 855 }, { "epoch": 0.19449679607990952, "grad_norm": 0.2692446708679199, "learning_rate": 4.627959534575307e-05, "loss": 0.6986, "step": 860 }, { "epoch": 0.1956275914059555, "grad_norm": 0.33562323451042175, "learning_rate": 4.623175406516479e-05, "loss": 0.7553, "step": 865 }, { "epoch": 0.1967583867320015, "grad_norm": 0.332381010055542, "learning_rate": 4.618363220463644e-05, "loss": 0.7021, "step": 870 }, { "epoch": 0.19788918205804748, "grad_norm": 0.3331127166748047, "learning_rate": 4.6135230400103636e-05, "loss": 0.7278, "step": 875 }, { "epoch": 0.19901997738409347, "grad_norm": 0.32819780707359314, "learning_rate": 4.6086549291201485e-05, "loss": 0.7189, "step": 880 }, { "epoch": 0.20015077271013945, "grad_norm": 0.31646525859832764, "learning_rate": 4.603758952125615e-05, "loss": 0.6949, "step": 885 }, { "epoch": 0.20128156803618544, "grad_norm": 0.3622991740703583, "learning_rate": 4.5988351737276316e-05, "loss": 0.7193, "step": 890 }, { "epoch": 0.20241236336223142, "grad_norm": 0.3097212016582489, "learning_rate": 4.593883658994466e-05, "loss": 0.6913, "step": 895 }, { "epoch": 0.2035431586882774, "grad_norm": 0.3757197856903076, "learning_rate": 4.588904473360923e-05, "loss": 0.6859, "step": 900 }, { "epoch": 0.2046739540143234, "grad_norm": 0.3894336223602295, "learning_rate": 4.5838976826274826e-05, "loss": 0.7495, "step": 905 }, { "epoch": 0.20580474934036938, "grad_norm": 0.2777577042579651, "learning_rate": 4.578863352959429e-05, "loss": 0.7305, "step": 910 }, { "epoch": 0.20693554466641537, "grad_norm": 0.30092760920524597, "learning_rate": 4.573801550885979e-05, "loss": 0.6952, "step": 915 }, { "epoch": 0.20806633999246135, "grad_norm": 0.31918197870254517, "learning_rate": 4.568712343299394e-05, "loss": 0.6309, "step": 920 }, { "epoch": 0.20919713531850734, "grad_norm": 0.3190583884716034, "learning_rate": 4.563595797454109e-05, "loss": 0.6932, "step": 925 }, { "epoch": 0.21032793064455332, "grad_norm": 0.4575042128562927, "learning_rate": 4.558451980965832e-05, "loss": 0.7446, "step": 930 }, { "epoch": 0.2114587259705993, "grad_norm": 0.3298736810684204, "learning_rate": 4.553280961810658e-05, "loss": 0.7434, "step": 935 }, { "epoch": 0.2125895212966453, "grad_norm": 0.2681873142719269, "learning_rate": 4.548082808324169e-05, "loss": 0.7609, "step": 940 }, { "epoch": 0.21372031662269128, "grad_norm": 0.32544100284576416, "learning_rate": 4.542857589200527e-05, "loss": 0.7076, "step": 945 }, { "epoch": 0.21485111194873727, "grad_norm": 0.3351302444934845, "learning_rate": 4.537605373491573e-05, "loss": 0.7442, "step": 950 }, { "epoch": 0.21598190727478325, "grad_norm": 0.3408782482147217, "learning_rate": 4.532326230605908e-05, "loss": 0.6697, "step": 955 }, { "epoch": 0.21711270260082924, "grad_norm": 0.31308743357658386, "learning_rate": 4.52702023030798e-05, "loss": 0.6795, "step": 960 }, { "epoch": 0.21824349792687522, "grad_norm": 0.31887832283973694, "learning_rate": 4.521687442717161e-05, "loss": 0.6907, "step": 965 }, { "epoch": 0.2193742932529212, "grad_norm": 0.28720954060554504, "learning_rate": 4.516327938306818e-05, "loss": 0.6951, "step": 970 }, { "epoch": 0.2205050885789672, "grad_norm": 0.35572728514671326, "learning_rate": 4.510941787903385e-05, "loss": 0.6731, "step": 975 }, { "epoch": 0.22163588390501318, "grad_norm": 0.32665789127349854, "learning_rate": 4.505529062685426e-05, "loss": 0.6859, "step": 980 }, { "epoch": 0.22276667923105917, "grad_norm": 0.425155907869339, "learning_rate": 4.5000898341826935e-05, "loss": 0.7611, "step": 985 }, { "epoch": 0.22389747455710515, "grad_norm": 0.3223753273487091, "learning_rate": 4.494624174275185e-05, "loss": 0.6784, "step": 990 }, { "epoch": 0.22502826988315114, "grad_norm": 0.29629823565483093, "learning_rate": 4.48913215519219e-05, "loss": 0.7528, "step": 995 }, { "epoch": 0.22615906520919712, "grad_norm": 0.45501330494880676, "learning_rate": 4.483613849511337e-05, "loss": 0.7412, "step": 1000 }, { "epoch": 0.2272898605352431, "grad_norm": 0.47708141803741455, "learning_rate": 4.478069330157638e-05, "loss": 0.7186, "step": 1005 }, { "epoch": 0.2284206558612891, "grad_norm": 0.46172332763671875, "learning_rate": 4.472498670402519e-05, "loss": 0.7429, "step": 1010 }, { "epoch": 0.22955145118733508, "grad_norm": 0.2885262966156006, "learning_rate": 4.4669019438628545e-05, "loss": 0.6749, "step": 1015 }, { "epoch": 0.23068224651338107, "grad_norm": 0.3848798871040344, "learning_rate": 4.461279224499995e-05, "loss": 0.6889, "step": 1020 }, { "epoch": 0.23181304183942705, "grad_norm": 0.3475760519504547, "learning_rate": 4.455630586618788e-05, "loss": 0.7423, "step": 1025 }, { "epoch": 0.23294383716547304, "grad_norm": 0.3690018653869629, "learning_rate": 4.449956104866597e-05, "loss": 0.6995, "step": 1030 }, { "epoch": 0.23407463249151902, "grad_norm": 0.4979022741317749, "learning_rate": 4.444255854232318e-05, "loss": 0.7137, "step": 1035 }, { "epoch": 0.235205427817565, "grad_norm": 0.3002910017967224, "learning_rate": 4.438529910045381e-05, "loss": 0.6342, "step": 1040 }, { "epoch": 0.236336223143611, "grad_norm": 0.2860986292362213, "learning_rate": 4.432778347974764e-05, "loss": 0.6486, "step": 1045 }, { "epoch": 0.23746701846965698, "grad_norm": 0.3187776207923889, "learning_rate": 4.427001244027984e-05, "loss": 0.6935, "step": 1050 }, { "epoch": 0.23859781379570297, "grad_norm": 0.436594694852829, "learning_rate": 4.4211986745500976e-05, "loss": 0.7125, "step": 1055 }, { "epoch": 0.23972860912174895, "grad_norm": 0.25989067554473877, "learning_rate": 4.415370716222693e-05, "loss": 0.6699, "step": 1060 }, { "epoch": 0.24085940444779494, "grad_norm": 0.30455416440963745, "learning_rate": 4.4095174460628734e-05, "loss": 0.7244, "step": 1065 }, { "epoch": 0.24199019977384092, "grad_norm": 0.2574412226676941, "learning_rate": 4.40363894142224e-05, "loss": 0.6719, "step": 1070 }, { "epoch": 0.2431209950998869, "grad_norm": 0.2614154815673828, "learning_rate": 4.397735279985873e-05, "loss": 0.7, "step": 1075 }, { "epoch": 0.2442517904259329, "grad_norm": 0.32729870080947876, "learning_rate": 4.3918065397712983e-05, "loss": 0.6669, "step": 1080 }, { "epoch": 0.24538258575197888, "grad_norm": 0.5149984359741211, "learning_rate": 4.385852799127464e-05, "loss": 0.7371, "step": 1085 }, { "epoch": 0.24651338107802487, "grad_norm": 0.322007417678833, "learning_rate": 4.379874136733702e-05, "loss": 0.7595, "step": 1090 }, { "epoch": 0.24764417640407085, "grad_norm": 0.38709428906440735, "learning_rate": 4.373870631598683e-05, "loss": 0.7662, "step": 1095 }, { "epoch": 0.24877497173011684, "grad_norm": 0.3887243866920471, "learning_rate": 4.367842363059383e-05, "loss": 0.6608, "step": 1100 }, { "epoch": 0.24990576705616283, "grad_norm": 0.343573659658432, "learning_rate": 4.3617894107800275e-05, "loss": 0.7364, "step": 1105 }, { "epoch": 0.25103656238220884, "grad_norm": 0.3381284773349762, "learning_rate": 4.355711854751037e-05, "loss": 0.6939, "step": 1110 }, { "epoch": 0.2521673577082548, "grad_norm": 0.428345650434494, "learning_rate": 4.3496097752879764e-05, "loss": 0.7322, "step": 1115 }, { "epoch": 0.2532981530343008, "grad_norm": 0.3029363453388214, "learning_rate": 4.3434832530304906e-05, "loss": 0.6434, "step": 1120 }, { "epoch": 0.2544289483603468, "grad_norm": 0.32285043597221375, "learning_rate": 4.337332368941237e-05, "loss": 0.686, "step": 1125 }, { "epoch": 0.2555597436863928, "grad_norm": 0.2844852805137634, "learning_rate": 4.331157204304819e-05, "loss": 0.6786, "step": 1130 }, { "epoch": 0.25669053901243877, "grad_norm": 0.38639211654663086, "learning_rate": 4.324957840726708e-05, "loss": 0.669, "step": 1135 }, { "epoch": 0.25782133433848475, "grad_norm": 0.29250484704971313, "learning_rate": 4.3187343601321696e-05, "loss": 0.684, "step": 1140 }, { "epoch": 0.25895212966453074, "grad_norm": 0.3040000796318054, "learning_rate": 4.312486844765175e-05, "loss": 0.6721, "step": 1145 }, { "epoch": 0.2600829249905767, "grad_norm": 0.3095468580722809, "learning_rate": 4.3062153771873214e-05, "loss": 0.8026, "step": 1150 }, { "epoch": 0.2612137203166227, "grad_norm": 0.3532247543334961, "learning_rate": 4.299920040276735e-05, "loss": 0.7338, "step": 1155 }, { "epoch": 0.2623445156426687, "grad_norm": 0.3691394627094269, "learning_rate": 4.2936009172269766e-05, "loss": 0.6489, "step": 1160 }, { "epoch": 0.2634753109687147, "grad_norm": 0.3503078520298004, "learning_rate": 4.287258091545946e-05, "loss": 0.6705, "step": 1165 }, { "epoch": 0.26460610629476067, "grad_norm": 0.31756189465522766, "learning_rate": 4.280891647054775e-05, "loss": 0.6642, "step": 1170 }, { "epoch": 0.26573690162080665, "grad_norm": 0.27942630648612976, "learning_rate": 4.274501667886718e-05, "loss": 0.7139, "step": 1175 }, { "epoch": 0.26686769694685264, "grad_norm": 0.35604235529899597, "learning_rate": 4.268088238486048e-05, "loss": 0.8335, "step": 1180 }, { "epoch": 0.2679984922728986, "grad_norm": 0.3140622675418854, "learning_rate": 4.261651443606931e-05, "loss": 0.8127, "step": 1185 }, { "epoch": 0.2691292875989446, "grad_norm": 0.327470988035202, "learning_rate": 4.255191368312311e-05, "loss": 0.7311, "step": 1190 }, { "epoch": 0.2702600829249906, "grad_norm": 0.3089313805103302, "learning_rate": 4.2487080979727876e-05, "loss": 0.733, "step": 1195 }, { "epoch": 0.2713908782510366, "grad_norm": 0.3237866163253784, "learning_rate": 4.242201718265483e-05, "loss": 0.6754, "step": 1200 }, { "epoch": 0.27252167357708257, "grad_norm": 0.3597028851509094, "learning_rate": 4.235672315172912e-05, "loss": 0.741, "step": 1205 }, { "epoch": 0.27365246890312855, "grad_norm": 0.30509960651397705, "learning_rate": 4.229119974981848e-05, "loss": 0.7098, "step": 1210 }, { "epoch": 0.27478326422917454, "grad_norm": 0.37183189392089844, "learning_rate": 4.222544784282178e-05, "loss": 0.7037, "step": 1215 }, { "epoch": 0.2759140595552205, "grad_norm": 0.35368862748146057, "learning_rate": 4.2159468299657645e-05, "loss": 0.654, "step": 1220 }, { "epoch": 0.2770448548812665, "grad_norm": 0.3120376765727997, "learning_rate": 4.209326199225291e-05, "loss": 0.6845, "step": 1225 }, { "epoch": 0.2781756502073125, "grad_norm": 0.3322497308254242, "learning_rate": 4.202682979553112e-05, "loss": 0.738, "step": 1230 }, { "epoch": 0.2793064455333585, "grad_norm": 0.39859551191329956, "learning_rate": 4.1960172587401007e-05, "loss": 0.7208, "step": 1235 }, { "epoch": 0.28043724085940447, "grad_norm": 0.304196298122406, "learning_rate": 4.1893291248744794e-05, "loss": 0.6701, "step": 1240 }, { "epoch": 0.28156803618545045, "grad_norm": 0.30052655935287476, "learning_rate": 4.1826186663406685e-05, "loss": 0.7255, "step": 1245 }, { "epoch": 0.28269883151149644, "grad_norm": 0.3247777223587036, "learning_rate": 4.1758859718181054e-05, "loss": 0.7067, "step": 1250 }, { "epoch": 0.2838296268375424, "grad_norm": 0.39652687311172485, "learning_rate": 4.169131130280081e-05, "loss": 0.8056, "step": 1255 }, { "epoch": 0.2849604221635884, "grad_norm": 0.299211710691452, "learning_rate": 4.162354230992562e-05, "loss": 0.7158, "step": 1260 }, { "epoch": 0.2860912174896344, "grad_norm": 0.34312811493873596, "learning_rate": 4.155555363513009e-05, "loss": 0.6555, "step": 1265 }, { "epoch": 0.2872220128156804, "grad_norm": 0.34061411023139954, "learning_rate": 4.148734617689196e-05, "loss": 0.6973, "step": 1270 }, { "epoch": 0.28835280814172637, "grad_norm": 0.32622766494750977, "learning_rate": 4.1418920836580214e-05, "loss": 0.7034, "step": 1275 }, { "epoch": 0.28948360346777235, "grad_norm": 0.31413719058036804, "learning_rate": 4.135027851844316e-05, "loss": 0.6874, "step": 1280 }, { "epoch": 0.29061439879381834, "grad_norm": 0.3852449357509613, "learning_rate": 4.1281420129596504e-05, "loss": 0.6937, "step": 1285 }, { "epoch": 0.2917451941198643, "grad_norm": 0.25905337929725647, "learning_rate": 4.121234658001135e-05, "loss": 0.7273, "step": 1290 }, { "epoch": 0.2928759894459103, "grad_norm": 0.33746325969696045, "learning_rate": 4.114305878250218e-05, "loss": 0.6815, "step": 1295 }, { "epoch": 0.2940067847719563, "grad_norm": 0.36523139476776123, "learning_rate": 4.1073557652714755e-05, "loss": 0.6763, "step": 1300 }, { "epoch": 0.2951375800980023, "grad_norm": 0.4286907911300659, "learning_rate": 4.100384410911409e-05, "loss": 0.7807, "step": 1305 }, { "epoch": 0.29626837542404827, "grad_norm": 0.27938035130500793, "learning_rate": 4.0933919072972224e-05, "loss": 0.6515, "step": 1310 }, { "epoch": 0.29739917075009425, "grad_norm": 0.28958678245544434, "learning_rate": 4.086378346835614e-05, "loss": 0.6303, "step": 1315 }, { "epoch": 0.29852996607614024, "grad_norm": 0.31973332166671753, "learning_rate": 4.0793438222115477e-05, "loss": 0.733, "step": 1320 }, { "epoch": 0.2996607614021862, "grad_norm": 0.302673876285553, "learning_rate": 4.072288426387032e-05, "loss": 0.6551, "step": 1325 }, { "epoch": 0.3007915567282322, "grad_norm": 0.3454115092754364, "learning_rate": 4.065212252599889e-05, "loss": 0.6847, "step": 1330 }, { "epoch": 0.3019223520542782, "grad_norm": 0.32197806239128113, "learning_rate": 4.0581153943625266e-05, "loss": 0.7283, "step": 1335 }, { "epoch": 0.3030531473803242, "grad_norm": 0.2939291000366211, "learning_rate": 4.050997945460699e-05, "loss": 0.6519, "step": 1340 }, { "epoch": 0.30418394270637017, "grad_norm": 0.34127116203308105, "learning_rate": 4.043859999952266e-05, "loss": 0.7041, "step": 1345 }, { "epoch": 0.30531473803241616, "grad_norm": 0.3606717586517334, "learning_rate": 4.0367016521659564e-05, "loss": 0.6745, "step": 1350 }, { "epoch": 0.30644553335846214, "grad_norm": 0.3977923095226288, "learning_rate": 4.029522996700112e-05, "loss": 0.6635, "step": 1355 }, { "epoch": 0.3075763286845081, "grad_norm": 0.27561894059181213, "learning_rate": 4.0223241284214496e-05, "loss": 0.6661, "step": 1360 }, { "epoch": 0.3087071240105541, "grad_norm": 0.31549111008644104, "learning_rate": 4.015105142463794e-05, "loss": 0.6659, "step": 1365 }, { "epoch": 0.3098379193366001, "grad_norm": 0.32156458497047424, "learning_rate": 4.0078661342268314e-05, "loss": 0.6656, "step": 1370 }, { "epoch": 0.3109687146626461, "grad_norm": 0.33597517013549805, "learning_rate": 4.000607199374843e-05, "loss": 0.6291, "step": 1375 }, { "epoch": 0.31209950998869207, "grad_norm": 0.2836547791957855, "learning_rate": 3.9933284338354415e-05, "loss": 0.6936, "step": 1380 }, { "epoch": 0.31323030531473806, "grad_norm": 0.3355998396873474, "learning_rate": 3.986029933798308e-05, "loss": 0.6578, "step": 1385 }, { "epoch": 0.31436110064078404, "grad_norm": 0.3303869962692261, "learning_rate": 3.9787117957139116e-05, "loss": 0.6859, "step": 1390 }, { "epoch": 0.31549189596683, "grad_norm": 0.3788108825683594, "learning_rate": 3.9713741162922455e-05, "loss": 0.6997, "step": 1395 }, { "epoch": 0.316622691292876, "grad_norm": 0.33582428097724915, "learning_rate": 3.964016992501541e-05, "loss": 0.689, "step": 1400 }, { "epoch": 0.317753486618922, "grad_norm": 0.35693231225013733, "learning_rate": 3.956640521566989e-05, "loss": 0.676, "step": 1405 }, { "epoch": 0.318884281944968, "grad_norm": 0.3589436709880829, "learning_rate": 3.949244800969456e-05, "loss": 0.7545, "step": 1410 }, { "epoch": 0.32001507727101397, "grad_norm": 0.3047327399253845, "learning_rate": 3.941829928444194e-05, "loss": 0.6391, "step": 1415 }, { "epoch": 0.32114587259705996, "grad_norm": 0.292953759431839, "learning_rate": 3.9343960019795525e-05, "loss": 0.6886, "step": 1420 }, { "epoch": 0.32227666792310594, "grad_norm": 0.3644665777683258, "learning_rate": 3.926943119815675e-05, "loss": 0.7283, "step": 1425 }, { "epoch": 0.3234074632491519, "grad_norm": 0.3624630570411682, "learning_rate": 3.919471380443212e-05, "loss": 0.6566, "step": 1430 }, { "epoch": 0.3245382585751979, "grad_norm": 0.48623165488243103, "learning_rate": 3.911980882602011e-05, "loss": 0.8311, "step": 1435 }, { "epoch": 0.3256690539012439, "grad_norm": 0.3244991600513458, "learning_rate": 3.904471725279818e-05, "loss": 0.7087, "step": 1440 }, { "epoch": 0.3267998492272899, "grad_norm": 0.3399847149848938, "learning_rate": 3.8969440077109634e-05, "loss": 0.6146, "step": 1445 }, { "epoch": 0.32793064455333587, "grad_norm": 0.3181338310241699, "learning_rate": 3.889397829375052e-05, "loss": 0.7608, "step": 1450 }, { "epoch": 0.32906143987938186, "grad_norm": 0.5128947496414185, "learning_rate": 3.881833289995654e-05, "loss": 0.7225, "step": 1455 }, { "epoch": 0.33019223520542784, "grad_norm": 0.3176124095916748, "learning_rate": 3.874250489538981e-05, "loss": 0.7225, "step": 1460 }, { "epoch": 0.33132303053147383, "grad_norm": 0.3748844563961029, "learning_rate": 3.866649528212563e-05, "loss": 0.7188, "step": 1465 }, { "epoch": 0.3324538258575198, "grad_norm": 0.974604606628418, "learning_rate": 3.859030506463932e-05, "loss": 0.7509, "step": 1470 }, { "epoch": 0.3335846211835658, "grad_norm": 0.3221200704574585, "learning_rate": 3.851393524979291e-05, "loss": 0.6781, "step": 1475 }, { "epoch": 0.3347154165096118, "grad_norm": 0.33971571922302246, "learning_rate": 3.84373868468218e-05, "loss": 0.6711, "step": 1480 }, { "epoch": 0.33584621183565777, "grad_norm": 0.3183509409427643, "learning_rate": 3.836066086732145e-05, "loss": 0.6808, "step": 1485 }, { "epoch": 0.33697700716170376, "grad_norm": 0.2814907729625702, "learning_rate": 3.828375832523407e-05, "loss": 0.7171, "step": 1490 }, { "epoch": 0.33810780248774974, "grad_norm": 0.2738807797431946, "learning_rate": 3.820668023683507e-05, "loss": 0.7934, "step": 1495 }, { "epoch": 0.33923859781379573, "grad_norm": 0.3376060128211975, "learning_rate": 3.812942762071981e-05, "loss": 0.6045, "step": 1500 }, { "epoch": 0.3403693931398417, "grad_norm": 0.3851218819618225, "learning_rate": 3.8052001497790005e-05, "loss": 0.7214, "step": 1505 }, { "epoch": 0.3415001884658877, "grad_norm": 0.2853710949420929, "learning_rate": 3.7974402891240294e-05, "loss": 0.7312, "step": 1510 }, { "epoch": 0.3426309837919337, "grad_norm": 0.34209561347961426, "learning_rate": 3.78966328265447e-05, "loss": 0.66, "step": 1515 }, { "epoch": 0.34376177911797967, "grad_norm": 0.2967279851436615, "learning_rate": 3.7818692331443093e-05, "loss": 0.7354, "step": 1520 }, { "epoch": 0.34489257444402566, "grad_norm": 0.31301623582839966, "learning_rate": 3.7740582435927614e-05, "loss": 0.6634, "step": 1525 }, { "epoch": 0.34602336977007164, "grad_norm": 0.287758469581604, "learning_rate": 3.766230417222901e-05, "loss": 0.7688, "step": 1530 }, { "epoch": 0.34715416509611763, "grad_norm": 0.34585824608802795, "learning_rate": 3.7583858574803046e-05, "loss": 0.6542, "step": 1535 }, { "epoch": 0.3482849604221636, "grad_norm": 0.32640525698661804, "learning_rate": 3.7505246680316853e-05, "loss": 0.71, "step": 1540 }, { "epoch": 0.3494157557482096, "grad_norm": 0.2845459580421448, "learning_rate": 3.742646952763515e-05, "loss": 0.6233, "step": 1545 }, { "epoch": 0.3505465510742556, "grad_norm": 0.30241382122039795, "learning_rate": 3.7347528157806586e-05, "loss": 0.6739, "step": 1550 }, { "epoch": 0.35167734640030157, "grad_norm": 0.35119229555130005, "learning_rate": 3.726842361404996e-05, "loss": 0.72, "step": 1555 }, { "epoch": 0.35280814172634756, "grad_norm": 0.3631749153137207, "learning_rate": 3.718915694174042e-05, "loss": 0.6596, "step": 1560 }, { "epoch": 0.35393893705239354, "grad_norm": 0.258357971906662, "learning_rate": 3.7109729188395666e-05, "loss": 0.7037, "step": 1565 }, { "epoch": 0.35506973237843953, "grad_norm": 0.2907659113407135, "learning_rate": 3.703014140366209e-05, "loss": 0.6494, "step": 1570 }, { "epoch": 0.3562005277044855, "grad_norm": 0.309076189994812, "learning_rate": 3.695039463930093e-05, "loss": 0.6668, "step": 1575 }, { "epoch": 0.3573313230305315, "grad_norm": 0.33287695050239563, "learning_rate": 3.687048994917437e-05, "loss": 0.7215, "step": 1580 }, { "epoch": 0.3584621183565775, "grad_norm": 0.2877466082572937, "learning_rate": 3.679042838923157e-05, "loss": 0.6261, "step": 1585 }, { "epoch": 0.35959291368262347, "grad_norm": 0.26237618923187256, "learning_rate": 3.671021101749476e-05, "loss": 0.6966, "step": 1590 }, { "epoch": 0.36072370900866946, "grad_norm": 0.34308937191963196, "learning_rate": 3.6629838894045224e-05, "loss": 0.662, "step": 1595 }, { "epoch": 0.36185450433471544, "grad_norm": 0.337215393781662, "learning_rate": 3.654931308100934e-05, "loss": 0.7402, "step": 1600 }, { "epoch": 0.36298529966076143, "grad_norm": 0.4486747980117798, "learning_rate": 3.646863464254447e-05, "loss": 0.7111, "step": 1605 }, { "epoch": 0.3641160949868074, "grad_norm": 0.37535396218299866, "learning_rate": 3.638780464482497e-05, "loss": 0.7322, "step": 1610 }, { "epoch": 0.3652468903128534, "grad_norm": 0.4385060966014862, "learning_rate": 3.630682415602804e-05, "loss": 0.6517, "step": 1615 }, { "epoch": 0.3663776856388994, "grad_norm": 0.29366278648376465, "learning_rate": 3.6225694246319666e-05, "loss": 0.636, "step": 1620 }, { "epoch": 0.36750848096494537, "grad_norm": 0.3330417573451996, "learning_rate": 3.614441598784042e-05, "loss": 0.727, "step": 1625 }, { "epoch": 0.36863927629099136, "grad_norm": 0.3851955831050873, "learning_rate": 3.6062990454691334e-05, "loss": 0.7019, "step": 1630 }, { "epoch": 0.36977007161703734, "grad_norm": 0.4180035889148712, "learning_rate": 3.598141872291969e-05, "loss": 0.7318, "step": 1635 }, { "epoch": 0.37090086694308333, "grad_norm": 0.28281131386756897, "learning_rate": 3.589970187050481e-05, "loss": 0.7143, "step": 1640 }, { "epoch": 0.3720316622691293, "grad_norm": 0.35991495847702026, "learning_rate": 3.581784097734376e-05, "loss": 0.7144, "step": 1645 }, { "epoch": 0.3731624575951753, "grad_norm": 0.3908022940158844, "learning_rate": 3.5735837125237174e-05, "loss": 0.6779, "step": 1650 }, { "epoch": 0.3742932529212213, "grad_norm": 0.3579081594944, "learning_rate": 3.565369139787488e-05, "loss": 0.6774, "step": 1655 }, { "epoch": 0.3754240482472672, "grad_norm": 0.37918293476104736, "learning_rate": 3.5571404880821594e-05, "loss": 0.7551, "step": 1660 }, { "epoch": 0.3765548435733132, "grad_norm": 0.372585654258728, "learning_rate": 3.548897866150259e-05, "loss": 0.7081, "step": 1665 }, { "epoch": 0.3776856388993592, "grad_norm": 0.38565728068351746, "learning_rate": 3.540641382918934e-05, "loss": 0.6547, "step": 1670 }, { "epoch": 0.3788164342254052, "grad_norm": 0.3910474479198456, "learning_rate": 3.532371147498507e-05, "loss": 0.6847, "step": 1675 }, { "epoch": 0.37994722955145116, "grad_norm": 0.3123336732387543, "learning_rate": 3.524087269181039e-05, "loss": 0.6692, "step": 1680 }, { "epoch": 0.38107802487749715, "grad_norm": 0.3222855031490326, "learning_rate": 3.515789857438885e-05, "loss": 0.7101, "step": 1685 }, { "epoch": 0.38220882020354313, "grad_norm": 0.3308558762073517, "learning_rate": 3.507479021923241e-05, "loss": 0.7193, "step": 1690 }, { "epoch": 0.3833396155295891, "grad_norm": 0.36425960063934326, "learning_rate": 3.4991548724627054e-05, "loss": 0.6698, "step": 1695 }, { "epoch": 0.3844704108556351, "grad_norm": 0.3454649746417999, "learning_rate": 3.490817519061819e-05, "loss": 0.6996, "step": 1700 }, { "epoch": 0.3856012061816811, "grad_norm": 0.39363983273506165, "learning_rate": 3.4824670718996114e-05, "loss": 0.7256, "step": 1705 }, { "epoch": 0.3867320015077271, "grad_norm": 0.29884523153305054, "learning_rate": 3.4741036413281534e-05, "loss": 0.706, "step": 1710 }, { "epoch": 0.38786279683377306, "grad_norm": 0.6705525517463684, "learning_rate": 3.4657273378710874e-05, "loss": 0.7508, "step": 1715 }, { "epoch": 0.38899359215981905, "grad_norm": 0.31176072359085083, "learning_rate": 3.4573382722221776e-05, "loss": 0.6792, "step": 1720 }, { "epoch": 0.39012438748586503, "grad_norm": 0.37332355976104736, "learning_rate": 3.448936555243837e-05, "loss": 0.6805, "step": 1725 }, { "epoch": 0.391255182811911, "grad_norm": 0.4867086112499237, "learning_rate": 3.440522297965671e-05, "loss": 0.6306, "step": 1730 }, { "epoch": 0.392385978137957, "grad_norm": 0.32693204283714294, "learning_rate": 3.4320956115830046e-05, "loss": 0.719, "step": 1735 }, { "epoch": 0.393516773464003, "grad_norm": 0.2943226993083954, "learning_rate": 3.4236566074554157e-05, "loss": 0.7405, "step": 1740 }, { "epoch": 0.394647568790049, "grad_norm": 0.3139977753162384, "learning_rate": 3.415205397105261e-05, "loss": 0.7152, "step": 1745 }, { "epoch": 0.39577836411609496, "grad_norm": 0.33439525961875916, "learning_rate": 3.406742092216206e-05, "loss": 0.7017, "step": 1750 }, { "epoch": 0.39690915944214095, "grad_norm": 0.3081996440887451, "learning_rate": 3.398266804631744e-05, "loss": 0.6647, "step": 1755 }, { "epoch": 0.39803995476818693, "grad_norm": 0.3134262263774872, "learning_rate": 3.389779646353724e-05, "loss": 0.7313, "step": 1760 }, { "epoch": 0.3991707500942329, "grad_norm": 0.3375689685344696, "learning_rate": 3.381280729540866e-05, "loss": 0.6829, "step": 1765 }, { "epoch": 0.4003015454202789, "grad_norm": 0.38416242599487305, "learning_rate": 3.37277016650728e-05, "loss": 0.7534, "step": 1770 }, { "epoch": 0.4014323407463249, "grad_norm": 0.3711940050125122, "learning_rate": 3.364248069720982e-05, "loss": 0.6618, "step": 1775 }, { "epoch": 0.4025631360723709, "grad_norm": 0.338777631521225, "learning_rate": 3.3557145518024094e-05, "loss": 0.6692, "step": 1780 }, { "epoch": 0.40369393139841686, "grad_norm": 0.2786078155040741, "learning_rate": 3.3471697255229294e-05, "loss": 0.7504, "step": 1785 }, { "epoch": 0.40482472672446285, "grad_norm": 0.33004823327064514, "learning_rate": 3.338613703803351e-05, "loss": 0.7056, "step": 1790 }, { "epoch": 0.40595552205050883, "grad_norm": 0.3257131278514862, "learning_rate": 3.330046599712432e-05, "loss": 0.7102, "step": 1795 }, { "epoch": 0.4070863173765548, "grad_norm": 0.3138837516307831, "learning_rate": 3.321468526465386e-05, "loss": 0.6638, "step": 1800 }, { "epoch": 0.4082171127026008, "grad_norm": 0.3327350914478302, "learning_rate": 3.312879597422383e-05, "loss": 0.7355, "step": 1805 }, { "epoch": 0.4093479080286468, "grad_norm": 0.2875402569770813, "learning_rate": 3.304279926087055e-05, "loss": 0.7113, "step": 1810 }, { "epoch": 0.4104787033546928, "grad_norm": 0.5153040289878845, "learning_rate": 3.295669626104995e-05, "loss": 0.7401, "step": 1815 }, { "epoch": 0.41160949868073876, "grad_norm": 0.3518928587436676, "learning_rate": 3.287048811262254e-05, "loss": 0.6864, "step": 1820 }, { "epoch": 0.41274029400678475, "grad_norm": 0.3488028049468994, "learning_rate": 3.2784175954838376e-05, "loss": 0.6401, "step": 1825 }, { "epoch": 0.41387108933283073, "grad_norm": 0.37360239028930664, "learning_rate": 3.2697760928322016e-05, "loss": 0.7004, "step": 1830 }, { "epoch": 0.4150018846588767, "grad_norm": 0.3383936285972595, "learning_rate": 3.261124417505745e-05, "loss": 0.6563, "step": 1835 }, { "epoch": 0.4161326799849227, "grad_norm": 0.36131277680397034, "learning_rate": 3.252462683837297e-05, "loss": 0.6737, "step": 1840 }, { "epoch": 0.4172634753109687, "grad_norm": 0.3024144768714905, "learning_rate": 3.2437910062926116e-05, "loss": 0.6466, "step": 1845 }, { "epoch": 0.4183942706370147, "grad_norm": 0.6971142888069153, "learning_rate": 3.235109499468849e-05, "loss": 0.6927, "step": 1850 }, { "epoch": 0.41952506596306066, "grad_norm": 0.3525508642196655, "learning_rate": 3.226418278093069e-05, "loss": 0.7009, "step": 1855 }, { "epoch": 0.42065586128910665, "grad_norm": 0.3152811527252197, "learning_rate": 3.2177174570207066e-05, "loss": 0.7065, "step": 1860 }, { "epoch": 0.42178665661515263, "grad_norm": 0.2631702721118927, "learning_rate": 3.2090071512340584e-05, "loss": 0.6723, "step": 1865 }, { "epoch": 0.4229174519411986, "grad_norm": 0.35791584849357605, "learning_rate": 3.200287475840764e-05, "loss": 0.6927, "step": 1870 }, { "epoch": 0.4240482472672446, "grad_norm": 0.30266880989074707, "learning_rate": 3.191558546072283e-05, "loss": 0.6395, "step": 1875 }, { "epoch": 0.4251790425932906, "grad_norm": 0.27712151408195496, "learning_rate": 3.1828204772823705e-05, "loss": 0.6246, "step": 1880 }, { "epoch": 0.4263098379193366, "grad_norm": 0.4084063172340393, "learning_rate": 3.174073384945556e-05, "loss": 0.6993, "step": 1885 }, { "epoch": 0.42744063324538256, "grad_norm": 0.3760344088077545, "learning_rate": 3.1653173846556186e-05, "loss": 0.6413, "step": 1890 }, { "epoch": 0.42857142857142855, "grad_norm": 0.41881611943244934, "learning_rate": 3.156552592124054e-05, "loss": 0.7295, "step": 1895 }, { "epoch": 0.42970222389747453, "grad_norm": 0.3386279046535492, "learning_rate": 3.147779123178548e-05, "loss": 0.7482, "step": 1900 }, { "epoch": 0.4308330192235205, "grad_norm": 0.4601892828941345, "learning_rate": 3.138997093761449e-05, "loss": 0.7499, "step": 1905 }, { "epoch": 0.4319638145495665, "grad_norm": 0.4254579246044159, "learning_rate": 3.1302066199282295e-05, "loss": 0.7148, "step": 1910 }, { "epoch": 0.4330946098756125, "grad_norm": 0.3381584584712982, "learning_rate": 3.121407817845959e-05, "loss": 0.6117, "step": 1915 }, { "epoch": 0.4342254052016585, "grad_norm": 0.3117331266403198, "learning_rate": 3.112600803791764e-05, "loss": 0.6246, "step": 1920 }, { "epoch": 0.43535620052770446, "grad_norm": 0.4453639090061188, "learning_rate": 3.103785694151293e-05, "loss": 0.754, "step": 1925 }, { "epoch": 0.43648699585375045, "grad_norm": 0.4143831729888916, "learning_rate": 3.094962605417179e-05, "loss": 0.7966, "step": 1930 }, { "epoch": 0.43761779117979643, "grad_norm": 0.2990778684616089, "learning_rate": 3.086131654187501e-05, "loss": 0.6519, "step": 1935 }, { "epoch": 0.4387485865058424, "grad_norm": 0.3955526649951935, "learning_rate": 3.077292957164238e-05, "loss": 0.7048, "step": 1940 }, { "epoch": 0.4398793818318884, "grad_norm": 0.3522753119468689, "learning_rate": 3.068446631151736e-05, "loss": 0.7202, "step": 1945 }, { "epoch": 0.4410101771579344, "grad_norm": 0.3563268482685089, "learning_rate": 3.0595927930551524e-05, "loss": 0.7145, "step": 1950 }, { "epoch": 0.4421409724839804, "grad_norm": 0.38255730271339417, "learning_rate": 3.0507315598789237e-05, "loss": 0.7158, "step": 1955 }, { "epoch": 0.44327176781002636, "grad_norm": 0.3502512276172638, "learning_rate": 3.0418630487252087e-05, "loss": 0.644, "step": 1960 }, { "epoch": 0.44440256313607235, "grad_norm": 0.36824584007263184, "learning_rate": 3.0329873767923477e-05, "loss": 0.7561, "step": 1965 }, { "epoch": 0.44553335846211833, "grad_norm": 0.32158178091049194, "learning_rate": 3.0241046613733114e-05, "loss": 0.6694, "step": 1970 }, { "epoch": 0.4466641537881643, "grad_norm": 0.28382861614227295, "learning_rate": 3.01521501985415e-05, "loss": 0.6803, "step": 1975 }, { "epoch": 0.4477949491142103, "grad_norm": 0.3525499999523163, "learning_rate": 3.0063185697124446e-05, "loss": 0.7263, "step": 1980 }, { "epoch": 0.4489257444402563, "grad_norm": 0.2863157093524933, "learning_rate": 2.9974154285157497e-05, "loss": 0.7232, "step": 1985 }, { "epoch": 0.4500565397663023, "grad_norm": 0.3138844668865204, "learning_rate": 2.9885057139200468e-05, "loss": 0.6912, "step": 1990 }, { "epoch": 0.45118733509234826, "grad_norm": 0.33406513929367065, "learning_rate": 2.979589543668182e-05, "loss": 0.684, "step": 1995 }, { "epoch": 0.45231813041839425, "grad_norm": 0.3506259620189667, "learning_rate": 2.970667035588317e-05, "loss": 0.7522, "step": 2000 }, { "epoch": 0.45344892574444023, "grad_norm": 0.37139952182769775, "learning_rate": 2.9617383075923665e-05, "loss": 0.6471, "step": 2005 }, { "epoch": 0.4545797210704862, "grad_norm": 0.295625239610672, "learning_rate": 2.952803477674441e-05, "loss": 0.7209, "step": 2010 }, { "epoch": 0.4557105163965322, "grad_norm": 0.3062797486782074, "learning_rate": 2.9438626639092932e-05, "loss": 0.7059, "step": 2015 }, { "epoch": 0.4568413117225782, "grad_norm": 0.3885577917098999, "learning_rate": 2.9349159844507455e-05, "loss": 0.7319, "step": 2020 }, { "epoch": 0.4579721070486242, "grad_norm": 0.365987628698349, "learning_rate": 2.9259635575301436e-05, "loss": 0.6858, "step": 2025 }, { "epoch": 0.45910290237467016, "grad_norm": 0.32557693123817444, "learning_rate": 2.9170055014547825e-05, "loss": 0.622, "step": 2030 }, { "epoch": 0.46023369770071615, "grad_norm": 0.31643807888031006, "learning_rate": 2.908041934606347e-05, "loss": 0.6794, "step": 2035 }, { "epoch": 0.46136449302676213, "grad_norm": 0.3457587957382202, "learning_rate": 2.89907297543935e-05, "loss": 0.7015, "step": 2040 }, { "epoch": 0.4624952883528081, "grad_norm": 0.3037043809890747, "learning_rate": 2.8900987424795606e-05, "loss": 0.6773, "step": 2045 }, { "epoch": 0.4636260836788541, "grad_norm": 0.3223413825035095, "learning_rate": 2.8811193543224462e-05, "loss": 0.643, "step": 2050 }, { "epoch": 0.4647568790049001, "grad_norm": 0.5646958947181702, "learning_rate": 2.8721349296315963e-05, "loss": 0.6942, "step": 2055 }, { "epoch": 0.4658876743309461, "grad_norm": 0.3289279043674469, "learning_rate": 2.8631455871371614e-05, "loss": 0.6679, "step": 2060 }, { "epoch": 0.46701846965699206, "grad_norm": 0.4061075448989868, "learning_rate": 2.8541514456342815e-05, "loss": 0.7992, "step": 2065 }, { "epoch": 0.46814926498303805, "grad_norm": 0.37772536277770996, "learning_rate": 2.8451526239815134e-05, "loss": 0.6817, "step": 2070 }, { "epoch": 0.46928006030908404, "grad_norm": 0.31532320380210876, "learning_rate": 2.8361492410992662e-05, "loss": 0.6771, "step": 2075 }, { "epoch": 0.47041085563513, "grad_norm": 0.352198988199234, "learning_rate": 2.8271414159682224e-05, "loss": 0.6515, "step": 2080 }, { "epoch": 0.471541650961176, "grad_norm": 0.39696019887924194, "learning_rate": 2.8181292676277738e-05, "loss": 0.7276, "step": 2085 }, { "epoch": 0.472672446287222, "grad_norm": 0.4117799997329712, "learning_rate": 2.809112915174439e-05, "loss": 0.6333, "step": 2090 }, { "epoch": 0.473803241613268, "grad_norm": 0.36984243988990784, "learning_rate": 2.8000924777602965e-05, "loss": 0.7202, "step": 2095 }, { "epoch": 0.47493403693931396, "grad_norm": 0.3305279612541199, "learning_rate": 2.79106807459141e-05, "loss": 0.6418, "step": 2100 }, { "epoch": 0.47606483226535995, "grad_norm": 0.40777119994163513, "learning_rate": 2.7820398249262474e-05, "loss": 0.7948, "step": 2105 }, { "epoch": 0.47719562759140594, "grad_norm": 0.3309784233570099, "learning_rate": 2.7730078480741122e-05, "loss": 0.6319, "step": 2110 }, { "epoch": 0.4783264229174519, "grad_norm": 0.3214864134788513, "learning_rate": 2.7639722633935605e-05, "loss": 0.7008, "step": 2115 }, { "epoch": 0.4794572182434979, "grad_norm": 0.3192216157913208, "learning_rate": 2.754933190290826e-05, "loss": 0.6489, "step": 2120 }, { "epoch": 0.4805880135695439, "grad_norm": 0.31766754388809204, "learning_rate": 2.745890748218245e-05, "loss": 0.6728, "step": 2125 }, { "epoch": 0.4817188088955899, "grad_norm": 0.32393330335617065, "learning_rate": 2.736845056672671e-05, "loss": 0.6808, "step": 2130 }, { "epoch": 0.48284960422163586, "grad_norm": 0.3087853193283081, "learning_rate": 2.727796235193904e-05, "loss": 0.7033, "step": 2135 }, { "epoch": 0.48398039954768185, "grad_norm": 0.3951945900917053, "learning_rate": 2.7187444033631044e-05, "loss": 0.6537, "step": 2140 }, { "epoch": 0.48511119487372784, "grad_norm": 0.31923210620880127, "learning_rate": 2.709689680801213e-05, "loss": 0.6795, "step": 2145 }, { "epoch": 0.4862419901997738, "grad_norm": 0.4405725300312042, "learning_rate": 2.7006321871673752e-05, "loss": 0.7204, "step": 2150 }, { "epoch": 0.4873727855258198, "grad_norm": 0.36784470081329346, "learning_rate": 2.6915720421573538e-05, "loss": 0.698, "step": 2155 }, { "epoch": 0.4885035808518658, "grad_norm": 0.38032978773117065, "learning_rate": 2.682509365501953e-05, "loss": 0.74, "step": 2160 }, { "epoch": 0.4896343761779118, "grad_norm": 0.36600053310394287, "learning_rate": 2.6734442769654273e-05, "loss": 0.6317, "step": 2165 }, { "epoch": 0.49076517150395776, "grad_norm": 0.39383023977279663, "learning_rate": 2.6643768963439113e-05, "loss": 0.6807, "step": 2170 }, { "epoch": 0.49189596683000375, "grad_norm": 0.37128937244415283, "learning_rate": 2.6553073434638248e-05, "loss": 0.7359, "step": 2175 }, { "epoch": 0.49302676215604974, "grad_norm": 0.32236599922180176, "learning_rate": 2.6462357381802966e-05, "loss": 0.6154, "step": 2180 }, { "epoch": 0.4941575574820957, "grad_norm": 0.3519161343574524, "learning_rate": 2.6371622003755768e-05, "loss": 0.7197, "step": 2185 }, { "epoch": 0.4952883528081417, "grad_norm": 0.38883543014526367, "learning_rate": 2.628086849957455e-05, "loss": 0.7554, "step": 2190 }, { "epoch": 0.4964191481341877, "grad_norm": 0.34098756313323975, "learning_rate": 2.6190098068576763e-05, "loss": 0.7133, "step": 2195 }, { "epoch": 0.4975499434602337, "grad_norm": 0.36088091135025024, "learning_rate": 2.6099311910303502e-05, "loss": 0.6746, "step": 2200 }, { "epoch": 0.49868073878627966, "grad_norm": 0.38198113441467285, "learning_rate": 2.6008511224503728e-05, "loss": 0.6848, "step": 2205 }, { "epoch": 0.49981153411232565, "grad_norm": 0.3310260474681854, "learning_rate": 2.59176972111184e-05, "loss": 0.6657, "step": 2210 }, { "epoch": 0.5009423294383717, "grad_norm": 0.3948574364185333, "learning_rate": 2.582687107026458e-05, "loss": 0.6704, "step": 2215 }, { "epoch": 0.5020731247644177, "grad_norm": 0.31727057695388794, "learning_rate": 2.5736034002219594e-05, "loss": 0.6454, "step": 2220 }, { "epoch": 0.5032039200904637, "grad_norm": 0.33022522926330566, "learning_rate": 2.564518720740519e-05, "loss": 0.6928, "step": 2225 }, { "epoch": 0.5043347154165096, "grad_norm": 0.5317490696907043, "learning_rate": 2.555433188637164e-05, "loss": 0.823, "step": 2230 }, { "epoch": 0.5054655107425556, "grad_norm": 0.39583778381347656, "learning_rate": 2.54634692397819e-05, "loss": 0.7081, "step": 2235 }, { "epoch": 0.5065963060686016, "grad_norm": 0.36913448572158813, "learning_rate": 2.5372600468395723e-05, "loss": 0.6707, "step": 2240 }, { "epoch": 0.5077271013946476, "grad_norm": 0.33116042613983154, "learning_rate": 2.528172677305382e-05, "loss": 0.7008, "step": 2245 }, { "epoch": 0.5088578967206936, "grad_norm": 0.3586164116859436, "learning_rate": 2.5190849354661955e-05, "loss": 0.6895, "step": 2250 }, { "epoch": 0.5099886920467396, "grad_norm": 0.44672051072120667, "learning_rate": 2.50999694141751e-05, "loss": 0.7304, "step": 2255 }, { "epoch": 0.5111194873727856, "grad_norm": 0.4558676779270172, "learning_rate": 2.5009088152581565e-05, "loss": 0.7073, "step": 2260 }, { "epoch": 0.5122502826988315, "grad_norm": 0.31825345754623413, "learning_rate": 2.4918206770887102e-05, "loss": 0.7007, "step": 2265 }, { "epoch": 0.5133810780248775, "grad_norm": 0.41337841749191284, "learning_rate": 2.482732647009907e-05, "loss": 0.7995, "step": 2270 }, { "epoch": 0.5145118733509235, "grad_norm": 0.3080434799194336, "learning_rate": 2.473644845121051e-05, "loss": 0.7367, "step": 2275 }, { "epoch": 0.5156426686769695, "grad_norm": 0.35662513971328735, "learning_rate": 2.4645573915184354e-05, "loss": 0.6669, "step": 2280 }, { "epoch": 0.5167734640030155, "grad_norm": 0.41301533579826355, "learning_rate": 2.4554704062937467e-05, "loss": 0.6953, "step": 2285 }, { "epoch": 0.5179042593290615, "grad_norm": 0.42937204241752625, "learning_rate": 2.4463840095324834e-05, "loss": 0.6625, "step": 2290 }, { "epoch": 0.5190350546551075, "grad_norm": 0.32970476150512695, "learning_rate": 2.437298321312369e-05, "loss": 0.6823, "step": 2295 }, { "epoch": 0.5201658499811534, "grad_norm": 0.36597487330436707, "learning_rate": 2.428213461701759e-05, "loss": 0.6233, "step": 2300 }, { "epoch": 0.5212966453071994, "grad_norm": 0.31977376341819763, "learning_rate": 2.4191295507580648e-05, "loss": 0.6732, "step": 2305 }, { "epoch": 0.5224274406332454, "grad_norm": 0.3720978796482086, "learning_rate": 2.410046708526155e-05, "loss": 0.7449, "step": 2310 }, { "epoch": 0.5235582359592914, "grad_norm": 0.4317164421081543, "learning_rate": 2.4009650550367804e-05, "loss": 0.6818, "step": 2315 }, { "epoch": 0.5246890312853374, "grad_norm": 0.358803391456604, "learning_rate": 2.3918847103049792e-05, "loss": 0.7051, "step": 2320 }, { "epoch": 0.5258198266113834, "grad_norm": 0.37477102875709534, "learning_rate": 2.3828057943284932e-05, "loss": 0.6474, "step": 2325 }, { "epoch": 0.5269506219374294, "grad_norm": 0.3854588568210602, "learning_rate": 2.373728427086188e-05, "loss": 0.6464, "step": 2330 }, { "epoch": 0.5280814172634754, "grad_norm": 0.29804185032844543, "learning_rate": 2.3646527285364565e-05, "loss": 0.6824, "step": 2335 }, { "epoch": 0.5292122125895213, "grad_norm": 0.3477884829044342, "learning_rate": 2.3555788186156442e-05, "loss": 0.7401, "step": 2340 }, { "epoch": 0.5303430079155673, "grad_norm": 0.3655013144016266, "learning_rate": 2.346506817236457e-05, "loss": 0.6915, "step": 2345 }, { "epoch": 0.5314738032416133, "grad_norm": 0.31074225902557373, "learning_rate": 2.3374368442863814e-05, "loss": 0.7442, "step": 2350 }, { "epoch": 0.5326045985676593, "grad_norm": 0.38817688822746277, "learning_rate": 2.3283690196260967e-05, "loss": 0.7317, "step": 2355 }, { "epoch": 0.5337353938937053, "grad_norm": 0.2897610366344452, "learning_rate": 2.3193034630878907e-05, "loss": 0.6206, "step": 2360 }, { "epoch": 0.5348661892197513, "grad_norm": 0.38513097167015076, "learning_rate": 2.310240294474081e-05, "loss": 0.7794, "step": 2365 }, { "epoch": 0.5359969845457973, "grad_norm": 0.3019099533557892, "learning_rate": 2.3011796335554258e-05, "loss": 0.6191, "step": 2370 }, { "epoch": 0.5371277798718432, "grad_norm": 0.29924795031547546, "learning_rate": 2.2921216000695465e-05, "loss": 0.6881, "step": 2375 }, { "epoch": 0.5382585751978892, "grad_norm": 0.37753212451934814, "learning_rate": 2.2830663137193398e-05, "loss": 0.6226, "step": 2380 }, { "epoch": 0.5393893705239352, "grad_norm": 0.3259458839893341, "learning_rate": 2.274013894171401e-05, "loss": 0.7258, "step": 2385 }, { "epoch": 0.5405201658499812, "grad_norm": 0.3186294436454773, "learning_rate": 2.2649644610544392e-05, "loss": 0.7074, "step": 2390 }, { "epoch": 0.5416509611760272, "grad_norm": 0.328595370054245, "learning_rate": 2.255918133957697e-05, "loss": 0.6656, "step": 2395 }, { "epoch": 0.5427817565020732, "grad_norm": 0.34288713335990906, "learning_rate": 2.2468750324293717e-05, "loss": 0.6913, "step": 2400 }, { "epoch": 0.5439125518281192, "grad_norm": 0.34917885065078735, "learning_rate": 2.2378352759750333e-05, "loss": 0.6997, "step": 2405 }, { "epoch": 0.5450433471541651, "grad_norm": 0.38892245292663574, "learning_rate": 2.2287989840560485e-05, "loss": 0.6667, "step": 2410 }, { "epoch": 0.5461741424802111, "grad_norm": 0.41548117995262146, "learning_rate": 2.219766276087996e-05, "loss": 0.648, "step": 2415 }, { "epoch": 0.5473049378062571, "grad_norm": 0.37720760703086853, "learning_rate": 2.2107372714390974e-05, "loss": 0.7646, "step": 2420 }, { "epoch": 0.5484357331323031, "grad_norm": 0.32246890664100647, "learning_rate": 2.2017120894286287e-05, "loss": 0.6772, "step": 2425 }, { "epoch": 0.5495665284583491, "grad_norm": 0.35085204243659973, "learning_rate": 2.1926908493253527e-05, "loss": 0.658, "step": 2430 }, { "epoch": 0.5506973237843951, "grad_norm": 0.32103869318962097, "learning_rate": 2.1836736703459398e-05, "loss": 0.6576, "step": 2435 }, { "epoch": 0.551828119110441, "grad_norm": 0.30640605092048645, "learning_rate": 2.1746606716533907e-05, "loss": 0.7009, "step": 2440 }, { "epoch": 0.552958914436487, "grad_norm": 0.4351046681404114, "learning_rate": 2.1656519723554643e-05, "loss": 0.7124, "step": 2445 }, { "epoch": 0.554089709762533, "grad_norm": 0.3515176773071289, "learning_rate": 2.1566476915031013e-05, "loss": 0.7086, "step": 2450 }, { "epoch": 0.555220505088579, "grad_norm": 0.35644426941871643, "learning_rate": 2.1476479480888545e-05, "loss": 0.7245, "step": 2455 }, { "epoch": 0.556351300414625, "grad_norm": 0.49966442584991455, "learning_rate": 2.1386528610453104e-05, "loss": 0.7511, "step": 2460 }, { "epoch": 0.557482095740671, "grad_norm": 0.3358660340309143, "learning_rate": 2.129662549243523e-05, "loss": 0.6579, "step": 2465 }, { "epoch": 0.558612891066717, "grad_norm": 0.392120361328125, "learning_rate": 2.120677131491442e-05, "loss": 0.7838, "step": 2470 }, { "epoch": 0.559743686392763, "grad_norm": 0.3123244047164917, "learning_rate": 2.11169672653234e-05, "loss": 0.6543, "step": 2475 }, { "epoch": 0.5608744817188089, "grad_norm": 0.3226960301399231, "learning_rate": 2.1027214530432465e-05, "loss": 0.6582, "step": 2480 }, { "epoch": 0.5620052770448549, "grad_norm": 0.3497219681739807, "learning_rate": 2.0937514296333754e-05, "loss": 0.6815, "step": 2485 }, { "epoch": 0.5631360723709009, "grad_norm": 0.39245134592056274, "learning_rate": 2.0847867748425648e-05, "loss": 0.7226, "step": 2490 }, { "epoch": 0.5642668676969469, "grad_norm": 0.3870549499988556, "learning_rate": 2.0758276071397012e-05, "loss": 0.7073, "step": 2495 }, { "epoch": 0.5653976630229929, "grad_norm": 0.40596914291381836, "learning_rate": 2.0668740449211605e-05, "loss": 0.6929, "step": 2500 }, { "epoch": 0.5665284583490389, "grad_norm": 0.3204245865345001, "learning_rate": 2.0579262065092423e-05, "loss": 0.7193, "step": 2505 }, { "epoch": 0.5676592536750849, "grad_norm": 0.30433857440948486, "learning_rate": 2.048984210150604e-05, "loss": 0.6859, "step": 2510 }, { "epoch": 0.5687900490011308, "grad_norm": 0.392553448677063, "learning_rate": 2.0400481740147022e-05, "loss": 0.7217, "step": 2515 }, { "epoch": 0.5699208443271768, "grad_norm": 0.3402389585971832, "learning_rate": 2.0311182161922237e-05, "loss": 0.6868, "step": 2520 }, { "epoch": 0.5710516396532228, "grad_norm": 0.42901313304901123, "learning_rate": 2.022194454693536e-05, "loss": 0.6861, "step": 2525 }, { "epoch": 0.5721824349792688, "grad_norm": 0.34680864214897156, "learning_rate": 2.013277007447117e-05, "loss": 0.7805, "step": 2530 }, { "epoch": 0.5733132303053148, "grad_norm": 0.30028700828552246, "learning_rate": 2.0043659922980005e-05, "loss": 0.6454, "step": 2535 }, { "epoch": 0.5744440256313608, "grad_norm": 0.3310604691505432, "learning_rate": 1.995461527006225e-05, "loss": 0.6193, "step": 2540 }, { "epoch": 0.5755748209574068, "grad_norm": 0.3977152407169342, "learning_rate": 1.9865637292452636e-05, "loss": 0.7275, "step": 2545 }, { "epoch": 0.5767056162834527, "grad_norm": 0.42726007103919983, "learning_rate": 1.977672716600486e-05, "loss": 0.7321, "step": 2550 }, { "epoch": 0.5778364116094987, "grad_norm": 0.4253356158733368, "learning_rate": 1.968788606567589e-05, "loss": 0.7107, "step": 2555 }, { "epoch": 0.5789672069355447, "grad_norm": 0.3486230969429016, "learning_rate": 1.9599115165510544e-05, "loss": 0.6859, "step": 2560 }, { "epoch": 0.5800980022615907, "grad_norm": 0.3471638560295105, "learning_rate": 1.9510415638625932e-05, "loss": 0.656, "step": 2565 }, { "epoch": 0.5812287975876367, "grad_norm": 0.37314942479133606, "learning_rate": 1.942178865719593e-05, "loss": 0.6545, "step": 2570 }, { "epoch": 0.5823595929136827, "grad_norm": 0.3019452393054962, "learning_rate": 1.9333235392435774e-05, "loss": 0.6422, "step": 2575 }, { "epoch": 0.5834903882397287, "grad_norm": 0.30790606141090393, "learning_rate": 1.9244757014586458e-05, "loss": 0.6182, "step": 2580 }, { "epoch": 0.5846211835657746, "grad_norm": 0.3539668917655945, "learning_rate": 1.9156354692899405e-05, "loss": 0.6835, "step": 2585 }, { "epoch": 0.5857519788918206, "grad_norm": 0.3208529055118561, "learning_rate": 1.9068029595620884e-05, "loss": 0.6619, "step": 2590 }, { "epoch": 0.5868827742178666, "grad_norm": 0.49773553013801575, "learning_rate": 1.897978288997669e-05, "loss": 0.7187, "step": 2595 }, { "epoch": 0.5880135695439126, "grad_norm": 0.3386790156364441, "learning_rate": 1.889161574215663e-05, "loss": 0.6659, "step": 2600 }, { "epoch": 0.5891443648699586, "grad_norm": 0.3373807370662689, "learning_rate": 1.880352931729914e-05, "loss": 0.6461, "step": 2605 }, { "epoch": 0.5902751601960046, "grad_norm": 0.4661053717136383, "learning_rate": 1.8715524779475944e-05, "loss": 0.6994, "step": 2610 }, { "epoch": 0.5914059555220506, "grad_norm": 0.38146570324897766, "learning_rate": 1.862760329167655e-05, "loss": 0.6413, "step": 2615 }, { "epoch": 0.5925367508480965, "grad_norm": 0.3764294981956482, "learning_rate": 1.8539766015793006e-05, "loss": 0.6617, "step": 2620 }, { "epoch": 0.5936675461741425, "grad_norm": 0.35271722078323364, "learning_rate": 1.845201411260446e-05, "loss": 0.7036, "step": 2625 }, { "epoch": 0.5947983415001885, "grad_norm": 0.3613468110561371, "learning_rate": 1.8364348741761867e-05, "loss": 0.7361, "step": 2630 }, { "epoch": 0.5959291368262345, "grad_norm": 0.34245991706848145, "learning_rate": 1.8276771061772647e-05, "loss": 0.7073, "step": 2635 }, { "epoch": 0.5970599321522805, "grad_norm": 0.32761844992637634, "learning_rate": 1.8189282229985345e-05, "loss": 0.7661, "step": 2640 }, { "epoch": 0.5981907274783265, "grad_norm": 0.3382299542427063, "learning_rate": 1.8101883402574415e-05, "loss": 0.6813, "step": 2645 }, { "epoch": 0.5993215228043725, "grad_norm": 0.30160099267959595, "learning_rate": 1.8014575734524865e-05, "loss": 0.7183, "step": 2650 }, { "epoch": 0.6004523181304184, "grad_norm": 0.3124518096446991, "learning_rate": 1.7927360379617024e-05, "loss": 0.6506, "step": 2655 }, { "epoch": 0.6015831134564644, "grad_norm": 0.3907219469547272, "learning_rate": 1.78402384904113e-05, "loss": 0.6575, "step": 2660 }, { "epoch": 0.6027139087825104, "grad_norm": 0.35735592246055603, "learning_rate": 1.7753211218232938e-05, "loss": 0.6877, "step": 2665 }, { "epoch": 0.6038447041085564, "grad_norm": 0.40482988953590393, "learning_rate": 1.7666279713156815e-05, "loss": 0.6788, "step": 2670 }, { "epoch": 0.6049754994346024, "grad_norm": 0.40024474263191223, "learning_rate": 1.757944512399221e-05, "loss": 0.7644, "step": 2675 }, { "epoch": 0.6061062947606484, "grad_norm": 0.4042050242424011, "learning_rate": 1.7492708598267683e-05, "loss": 0.7347, "step": 2680 }, { "epoch": 0.6072370900866944, "grad_norm": 0.38071730732917786, "learning_rate": 1.7406071282215854e-05, "loss": 0.6841, "step": 2685 }, { "epoch": 0.6083678854127403, "grad_norm": 0.4327053427696228, "learning_rate": 1.7319534320758284e-05, "loss": 0.7712, "step": 2690 }, { "epoch": 0.6094986807387863, "grad_norm": 0.41496187448501587, "learning_rate": 1.7233098857490325e-05, "loss": 0.7306, "step": 2695 }, { "epoch": 0.6106294760648323, "grad_norm": 0.48277217149734497, "learning_rate": 1.714676603466605e-05, "loss": 0.6843, "step": 2700 }, { "epoch": 0.6117602713908783, "grad_norm": 0.4226689338684082, "learning_rate": 1.7060536993183084e-05, "loss": 0.6336, "step": 2705 }, { "epoch": 0.6128910667169243, "grad_norm": 0.3646908104419708, "learning_rate": 1.6974412872567597e-05, "loss": 0.6637, "step": 2710 }, { "epoch": 0.6140218620429703, "grad_norm": 0.30986544489860535, "learning_rate": 1.688839481095922e-05, "loss": 0.6905, "step": 2715 }, { "epoch": 0.6151526573690163, "grad_norm": 0.416892409324646, "learning_rate": 1.680248394509599e-05, "loss": 0.7408, "step": 2720 }, { "epoch": 0.6162834526950622, "grad_norm": 0.3360169231891632, "learning_rate": 1.6716681410299348e-05, "loss": 0.7591, "step": 2725 }, { "epoch": 0.6174142480211082, "grad_norm": 0.40622156858444214, "learning_rate": 1.6630988340459128e-05, "loss": 0.6792, "step": 2730 }, { "epoch": 0.6185450433471542, "grad_norm": 0.37062937021255493, "learning_rate": 1.654540586801858e-05, "loss": 0.6656, "step": 2735 }, { "epoch": 0.6196758386732002, "grad_norm": 0.29908493161201477, "learning_rate": 1.645993512395938e-05, "loss": 0.6576, "step": 2740 }, { "epoch": 0.6208066339992462, "grad_norm": 0.3012191951274872, "learning_rate": 1.6374577237786703e-05, "loss": 0.6174, "step": 2745 }, { "epoch": 0.6219374293252922, "grad_norm": 0.322457879781723, "learning_rate": 1.628933333751432e-05, "loss": 0.6562, "step": 2750 }, { "epoch": 0.6230682246513382, "grad_norm": 0.3631836771965027, "learning_rate": 1.6204204549649628e-05, "loss": 0.6264, "step": 2755 }, { "epoch": 0.6241990199773841, "grad_norm": 0.30250152945518494, "learning_rate": 1.6119191999178847e-05, "loss": 0.7027, "step": 2760 }, { "epoch": 0.6253298153034301, "grad_norm": 0.31643325090408325, "learning_rate": 1.6034296809552047e-05, "loss": 0.6767, "step": 2765 }, { "epoch": 0.6264606106294761, "grad_norm": 0.3336418569087982, "learning_rate": 1.594952010266843e-05, "loss": 0.67, "step": 2770 }, { "epoch": 0.6275914059555221, "grad_norm": 0.33178821206092834, "learning_rate": 1.5864862998861384e-05, "loss": 0.6477, "step": 2775 }, { "epoch": 0.6287222012815681, "grad_norm": 0.3760960102081299, "learning_rate": 1.5780326616883745e-05, "loss": 0.6692, "step": 2780 }, { "epoch": 0.6298529966076141, "grad_norm": 0.38543248176574707, "learning_rate": 1.5695912073893006e-05, "loss": 0.6762, "step": 2785 }, { "epoch": 0.63098379193366, "grad_norm": 0.39795583486557007, "learning_rate": 1.561162048543653e-05, "loss": 0.6861, "step": 2790 }, { "epoch": 0.632114587259706, "grad_norm": 0.30678924918174744, "learning_rate": 1.552745296543684e-05, "loss": 0.7045, "step": 2795 }, { "epoch": 0.633245382585752, "grad_norm": 0.39268332719802856, "learning_rate": 1.544341062617685e-05, "loss": 0.6791, "step": 2800 }, { "epoch": 0.634376177911798, "grad_norm": 0.3372342586517334, "learning_rate": 1.535949457828525e-05, "loss": 0.6737, "step": 2805 }, { "epoch": 0.635506973237844, "grad_norm": 0.3903445303440094, "learning_rate": 1.527570593072172e-05, "loss": 0.7094, "step": 2810 }, { "epoch": 0.63663776856389, "grad_norm": 0.3412375748157501, "learning_rate": 1.5192045790762354e-05, "loss": 0.7126, "step": 2815 }, { "epoch": 0.637768563889936, "grad_norm": 0.37893742322921753, "learning_rate": 1.5108515263985018e-05, "loss": 0.739, "step": 2820 }, { "epoch": 0.638899359215982, "grad_norm": 0.3894254267215729, "learning_rate": 1.502511545425469e-05, "loss": 0.7108, "step": 2825 }, { "epoch": 0.6400301545420279, "grad_norm": 0.3613717257976532, "learning_rate": 1.4941847463708958e-05, "loss": 0.672, "step": 2830 }, { "epoch": 0.6411609498680739, "grad_norm": 0.2811620235443115, "learning_rate": 1.4858712392743352e-05, "loss": 0.7129, "step": 2835 }, { "epoch": 0.6422917451941199, "grad_norm": 0.411286324262619, "learning_rate": 1.4775711339996896e-05, "loss": 0.6747, "step": 2840 }, { "epoch": 0.6434225405201659, "grad_norm": 0.39716291427612305, "learning_rate": 1.4692845402337523e-05, "loss": 0.7217, "step": 2845 }, { "epoch": 0.6445533358462119, "grad_norm": 0.3730713725090027, "learning_rate": 1.4610115674847619e-05, "loss": 0.6249, "step": 2850 }, { "epoch": 0.6456841311722579, "grad_norm": 0.3958978056907654, "learning_rate": 1.4527523250809545e-05, "loss": 0.6599, "step": 2855 }, { "epoch": 0.6468149264983039, "grad_norm": 0.2955171763896942, "learning_rate": 1.4445069221691148e-05, "loss": 0.6542, "step": 2860 }, { "epoch": 0.6479457218243498, "grad_norm": 0.45475757122039795, "learning_rate": 1.436275467713141e-05, "loss": 0.8182, "step": 2865 }, { "epoch": 0.6490765171503958, "grad_norm": 0.40360134840011597, "learning_rate": 1.428058070492599e-05, "loss": 0.6866, "step": 2870 }, { "epoch": 0.6502073124764418, "grad_norm": 0.3490990996360779, "learning_rate": 1.4198548391012878e-05, "loss": 0.6879, "step": 2875 }, { "epoch": 0.6513381078024878, "grad_norm": 0.3447786569595337, "learning_rate": 1.4116658819458025e-05, "loss": 0.6206, "step": 2880 }, { "epoch": 0.6524689031285338, "grad_norm": 0.4018050730228424, "learning_rate": 1.4034913072441015e-05, "loss": 0.6705, "step": 2885 }, { "epoch": 0.6535996984545798, "grad_norm": 0.31316670775413513, "learning_rate": 1.3953312230240801e-05, "loss": 0.7058, "step": 2890 }, { "epoch": 0.6547304937806258, "grad_norm": 0.4424934387207031, "learning_rate": 1.3871857371221389e-05, "loss": 0.6871, "step": 2895 }, { "epoch": 0.6558612891066717, "grad_norm": 0.36514902114868164, "learning_rate": 1.3790549571817615e-05, "loss": 0.6632, "step": 2900 }, { "epoch": 0.6569920844327177, "grad_norm": 0.3037571609020233, "learning_rate": 1.3709389906520875e-05, "loss": 0.6516, "step": 2905 }, { "epoch": 0.6581228797587637, "grad_norm": 0.3230195641517639, "learning_rate": 1.3628379447864997e-05, "loss": 0.7393, "step": 2910 }, { "epoch": 0.6592536750848097, "grad_norm": 0.425601601600647, "learning_rate": 1.3547519266411985e-05, "loss": 0.6665, "step": 2915 }, { "epoch": 0.6603844704108557, "grad_norm": 0.39113959670066833, "learning_rate": 1.3466810430737941e-05, "loss": 0.6772, "step": 2920 }, { "epoch": 0.6615152657369017, "grad_norm": 0.3463885188102722, "learning_rate": 1.3386254007418928e-05, "loss": 0.7132, "step": 2925 }, { "epoch": 0.6626460610629477, "grad_norm": 0.29994767904281616, "learning_rate": 1.3305851061016821e-05, "loss": 0.7092, "step": 2930 }, { "epoch": 0.6637768563889936, "grad_norm": 0.3709157407283783, "learning_rate": 1.3225602654065323e-05, "loss": 0.6795, "step": 2935 }, { "epoch": 0.6649076517150396, "grad_norm": 0.36443623900413513, "learning_rate": 1.3145509847055837e-05, "loss": 0.6979, "step": 2940 }, { "epoch": 0.6660384470410856, "grad_norm": 0.3367445468902588, "learning_rate": 1.3065573698423558e-05, "loss": 0.7412, "step": 2945 }, { "epoch": 0.6671692423671316, "grad_norm": 0.3487666845321655, "learning_rate": 1.2985795264533372e-05, "loss": 0.8255, "step": 2950 }, { "epoch": 0.6683000376931776, "grad_norm": 0.3769291937351227, "learning_rate": 1.2906175599665949e-05, "loss": 0.6697, "step": 2955 }, { "epoch": 0.6694308330192236, "grad_norm": 0.3350309431552887, "learning_rate": 1.2826715756003846e-05, "loss": 0.7478, "step": 2960 }, { "epoch": 0.6705616283452696, "grad_norm": 0.30802276730537415, "learning_rate": 1.2747416783617511e-05, "loss": 0.6233, "step": 2965 }, { "epoch": 0.6716924236713155, "grad_norm": 0.5399759411811829, "learning_rate": 1.2668279730451535e-05, "loss": 0.7359, "step": 2970 }, { "epoch": 0.6728232189973615, "grad_norm": 0.38919568061828613, "learning_rate": 1.2589305642310651e-05, "loss": 0.6935, "step": 2975 }, { "epoch": 0.6739540143234075, "grad_norm": 0.30821794271469116, "learning_rate": 1.2510495562846053e-05, "loss": 0.7083, "step": 2980 }, { "epoch": 0.6750848096494535, "grad_norm": 0.28666800260543823, "learning_rate": 1.2431850533541487e-05, "loss": 0.6569, "step": 2985 }, { "epoch": 0.6762156049754995, "grad_norm": 0.36479493975639343, "learning_rate": 1.2353371593699592e-05, "loss": 0.6867, "step": 2990 }, { "epoch": 0.6773464003015455, "grad_norm": 0.3713551461696625, "learning_rate": 1.22750597804281e-05, "loss": 0.68, "step": 2995 }, { "epoch": 0.6784771956275915, "grad_norm": 0.3654766380786896, "learning_rate": 1.2196916128626126e-05, "loss": 0.73, "step": 3000 }, { "epoch": 0.6796079909536374, "grad_norm": 0.2997152507305145, "learning_rate": 1.2118941670970551e-05, "loss": 0.6777, "step": 3005 }, { "epoch": 0.6807387862796834, "grad_norm": 0.3098806142807007, "learning_rate": 1.2041137437902297e-05, "loss": 0.709, "step": 3010 }, { "epoch": 0.6818695816057294, "grad_norm": 0.32077932357788086, "learning_rate": 1.1963504457612781e-05, "loss": 0.6451, "step": 3015 }, { "epoch": 0.6830003769317754, "grad_norm": 0.33692800998687744, "learning_rate": 1.1886043756030294e-05, "loss": 0.6855, "step": 3020 }, { "epoch": 0.6841311722578214, "grad_norm": 0.4159882664680481, "learning_rate": 1.1808756356806411e-05, "loss": 0.6746, "step": 3025 }, { "epoch": 0.6852619675838674, "grad_norm": 0.4914894104003906, "learning_rate": 1.1731643281302548e-05, "loss": 0.7548, "step": 3030 }, { "epoch": 0.6863927629099134, "grad_norm": 0.354419082403183, "learning_rate": 1.1654705548576364e-05, "loss": 0.7227, "step": 3035 }, { "epoch": 0.6875235582359593, "grad_norm": 0.42316100001335144, "learning_rate": 1.157794417536838e-05, "loss": 0.709, "step": 3040 }, { "epoch": 0.6886543535620053, "grad_norm": 0.35025471448898315, "learning_rate": 1.1501360176088494e-05, "loss": 0.6336, "step": 3045 }, { "epoch": 0.6897851488880513, "grad_norm": 0.33178970217704773, "learning_rate": 1.1424954562802598e-05, "loss": 0.616, "step": 3050 }, { "epoch": 0.6909159442140973, "grad_norm": 0.32804471254348755, "learning_rate": 1.1348728345219176e-05, "loss": 0.6617, "step": 3055 }, { "epoch": 0.6920467395401433, "grad_norm": 0.29233989119529724, "learning_rate": 1.127268253067598e-05, "loss": 0.6296, "step": 3060 }, { "epoch": 0.6931775348661893, "grad_norm": 0.3966659605503082, "learning_rate": 1.1196818124126729e-05, "loss": 0.6721, "step": 3065 }, { "epoch": 0.6943083301922353, "grad_norm": 0.34669914841651917, "learning_rate": 1.1121136128127812e-05, "loss": 0.6118, "step": 3070 }, { "epoch": 0.6954391255182812, "grad_norm": 0.33230316638946533, "learning_rate": 1.104563754282505e-05, "loss": 0.6855, "step": 3075 }, { "epoch": 0.6965699208443272, "grad_norm": 0.3585960865020752, "learning_rate": 1.0970323365940444e-05, "loss": 0.6893, "step": 3080 }, { "epoch": 0.6977007161703732, "grad_norm": 0.4158352315425873, "learning_rate": 1.0895194592759042e-05, "loss": 0.7072, "step": 3085 }, { "epoch": 0.6988315114964192, "grad_norm": 0.43162232637405396, "learning_rate": 1.082025221611577e-05, "loss": 0.7138, "step": 3090 }, { "epoch": 0.6999623068224652, "grad_norm": 0.3278350830078125, "learning_rate": 1.0745497226382267e-05, "loss": 0.6111, "step": 3095 }, { "epoch": 0.7010931021485112, "grad_norm": 0.44768843054771423, "learning_rate": 1.0670930611453874e-05, "loss": 0.6449, "step": 3100 }, { "epoch": 0.7022238974745572, "grad_norm": 0.48205995559692383, "learning_rate": 1.0596553356736507e-05, "loss": 0.6902, "step": 3105 }, { "epoch": 0.7033546928006031, "grad_norm": 0.3945559561252594, "learning_rate": 1.0522366445133686e-05, "loss": 0.6727, "step": 3110 }, { "epoch": 0.7044854881266491, "grad_norm": 0.33756589889526367, "learning_rate": 1.044837085703352e-05, "loss": 0.6969, "step": 3115 }, { "epoch": 0.7056162834526951, "grad_norm": 0.2790575325489044, "learning_rate": 1.0374567570295766e-05, "loss": 0.625, "step": 3120 }, { "epoch": 0.7067470787787411, "grad_norm": 0.4053255021572113, "learning_rate": 1.0300957560238875e-05, "loss": 0.7338, "step": 3125 }, { "epoch": 0.7078778741047871, "grad_norm": 0.3397720158100128, "learning_rate": 1.0227541799627136e-05, "loss": 0.6771, "step": 3130 }, { "epoch": 0.7090086694308331, "grad_norm": 0.3540814518928528, "learning_rate": 1.015432125865782e-05, "loss": 0.6582, "step": 3135 }, { "epoch": 0.7101394647568791, "grad_norm": 0.3383145034313202, "learning_rate": 1.0081296904948342e-05, "loss": 0.5987, "step": 3140 }, { "epoch": 0.711270260082925, "grad_norm": 0.4115695357322693, "learning_rate": 1.0008469703523493e-05, "loss": 0.6981, "step": 3145 }, { "epoch": 0.712401055408971, "grad_norm": 0.3034178912639618, "learning_rate": 9.935840616802645e-06, "loss": 0.6991, "step": 3150 }, { "epoch": 0.713531850735017, "grad_norm": 0.32083261013031006, "learning_rate": 9.863410604587095e-06, "loss": 0.6806, "step": 3155 }, { "epoch": 0.714662646061063, "grad_norm": 0.5665989518165588, "learning_rate": 9.791180624047322e-06, "loss": 0.7539, "step": 3160 }, { "epoch": 0.715793441387109, "grad_norm": 0.4178657829761505, "learning_rate": 9.719151629710386e-06, "loss": 0.6961, "step": 3165 }, { "epoch": 0.716924236713155, "grad_norm": 0.36418062448501587, "learning_rate": 9.647324573447291e-06, "loss": 0.7055, "step": 3170 }, { "epoch": 0.718055032039201, "grad_norm": 0.32820287346839905, "learning_rate": 9.575700404460386e-06, "loss": 0.6329, "step": 3175 }, { "epoch": 0.7191858273652469, "grad_norm": 0.5332444906234741, "learning_rate": 9.504280069270871e-06, "loss": 0.723, "step": 3180 }, { "epoch": 0.7203166226912929, "grad_norm": 0.5376641154289246, "learning_rate": 9.433064511706225e-06, "loss": 0.7362, "step": 3185 }, { "epoch": 0.7214474180173389, "grad_norm": 0.32007166743278503, "learning_rate": 9.362054672887819e-06, "loss": 0.6754, "step": 3190 }, { "epoch": 0.7225782133433849, "grad_norm": 0.37915733456611633, "learning_rate": 9.291251491218387e-06, "loss": 0.6565, "step": 3195 }, { "epoch": 0.7237090086694309, "grad_norm": 0.35389769077301025, "learning_rate": 9.220655902369665e-06, "loss": 0.6775, "step": 3200 }, { "epoch": 0.7248398039954769, "grad_norm": 0.5367900133132935, "learning_rate": 9.150268839270055e-06, "loss": 0.7366, "step": 3205 }, { "epoch": 0.7259705993215229, "grad_norm": 0.4069572985172272, "learning_rate": 9.080091232092247e-06, "loss": 0.6873, "step": 3210 }, { "epoch": 0.7271013946475688, "grad_norm": 0.5752056837081909, "learning_rate": 9.01012400824097e-06, "loss": 0.7199, "step": 3215 }, { "epoch": 0.7282321899736148, "grad_norm": 0.29914751648902893, "learning_rate": 8.940368092340682e-06, "loss": 0.7129, "step": 3220 }, { "epoch": 0.7293629852996608, "grad_norm": 0.33143705129623413, "learning_rate": 8.870824406223416e-06, "loss": 0.6581, "step": 3225 }, { "epoch": 0.7304937806257068, "grad_norm": 0.4050018787384033, "learning_rate": 8.801493868916536e-06, "loss": 0.6941, "step": 3230 }, { "epoch": 0.7316245759517528, "grad_norm": 0.33594897389411926, "learning_rate": 8.732377396630642e-06, "loss": 0.6639, "step": 3235 }, { "epoch": 0.7327553712777988, "grad_norm": 0.4058912694454193, "learning_rate": 8.663475902747445e-06, "loss": 0.7139, "step": 3240 }, { "epoch": 0.7338861666038448, "grad_norm": 0.5566866397857666, "learning_rate": 8.594790297807667e-06, "loss": 0.6765, "step": 3245 }, { "epoch": 0.7350169619298907, "grad_norm": 0.36217668652534485, "learning_rate": 8.526321489499067e-06, "loss": 0.6592, "step": 3250 }, { "epoch": 0.7361477572559367, "grad_norm": 0.3255480229854584, "learning_rate": 8.458070382644382e-06, "loss": 0.7567, "step": 3255 }, { "epoch": 0.7372785525819827, "grad_norm": 0.4506484866142273, "learning_rate": 8.390037879189422e-06, "loss": 0.6732, "step": 3260 }, { "epoch": 0.7384093479080287, "grad_norm": 0.4981943368911743, "learning_rate": 8.322224878191126e-06, "loss": 0.6665, "step": 3265 }, { "epoch": 0.7395401432340747, "grad_norm": 0.36179500818252563, "learning_rate": 8.25463227580567e-06, "loss": 0.6821, "step": 3270 }, { "epoch": 0.7406709385601207, "grad_norm": 0.34908345341682434, "learning_rate": 8.187260965276666e-06, "loss": 0.6194, "step": 3275 }, { "epoch": 0.7418017338861667, "grad_norm": 0.3363327980041504, "learning_rate": 8.120111836923283e-06, "loss": 0.6294, "step": 3280 }, { "epoch": 0.7429325292122126, "grad_norm": 0.34136900305747986, "learning_rate": 8.053185778128594e-06, "loss": 0.6208, "step": 3285 }, { "epoch": 0.7440633245382586, "grad_norm": 0.37522128224372864, "learning_rate": 7.986483673327724e-06, "loss": 0.6751, "step": 3290 }, { "epoch": 0.7451941198643046, "grad_norm": 0.34423232078552246, "learning_rate": 7.92000640399626e-06, "loss": 0.733, "step": 3295 }, { "epoch": 0.7463249151903506, "grad_norm": 0.4137992858886719, "learning_rate": 7.853754848638542e-06, "loss": 0.7044, "step": 3300 }, { "epoch": 0.7474557105163966, "grad_norm": 0.36555016040802, "learning_rate": 7.787729882776065e-06, "loss": 0.6735, "step": 3305 }, { "epoch": 0.7485865058424426, "grad_norm": 0.33875149488449097, "learning_rate": 7.721932378935973e-06, "loss": 0.732, "step": 3310 }, { "epoch": 0.7497173011684886, "grad_norm": 0.32351580262184143, "learning_rate": 7.656363206639409e-06, "loss": 0.7191, "step": 3315 }, { "epoch": 0.7508480964945344, "grad_norm": 0.48030319809913635, "learning_rate": 7.591023232390138e-06, "loss": 0.6972, "step": 3320 }, { "epoch": 0.7519788918205804, "grad_norm": 0.381740540266037, "learning_rate": 7.525913319663011e-06, "loss": 0.6752, "step": 3325 }, { "epoch": 0.7531096871466264, "grad_norm": 0.3707197308540344, "learning_rate": 7.461034328892621e-06, "loss": 0.6924, "step": 3330 }, { "epoch": 0.7542404824726724, "grad_norm": 0.4179406762123108, "learning_rate": 7.3963871174618945e-06, "loss": 0.6774, "step": 3335 }, { "epoch": 0.7553712777987184, "grad_norm": 0.3096112012863159, "learning_rate": 7.3319725396907485e-06, "loss": 0.6671, "step": 3340 }, { "epoch": 0.7565020731247644, "grad_norm": 0.37638741731643677, "learning_rate": 7.267791446824854e-06, "loss": 0.739, "step": 3345 }, { "epoch": 0.7576328684508103, "grad_norm": 0.33642110228538513, "learning_rate": 7.2038446870243195e-06, "loss": 0.6591, "step": 3350 }, { "epoch": 0.7587636637768563, "grad_norm": 0.3964068591594696, "learning_rate": 7.140133105352545e-06, "loss": 0.6936, "step": 3355 }, { "epoch": 0.7598944591029023, "grad_norm": 0.42048409581184387, "learning_rate": 7.076657543765008e-06, "loss": 0.729, "step": 3360 }, { "epoch": 0.7610252544289483, "grad_norm": 0.3949214518070221, "learning_rate": 7.013418841098174e-06, "loss": 0.7064, "step": 3365 }, { "epoch": 0.7621560497549943, "grad_norm": 0.38450565934181213, "learning_rate": 6.95041783305837e-06, "loss": 0.6666, "step": 3370 }, { "epoch": 0.7632868450810403, "grad_norm": 0.33812659978866577, "learning_rate": 6.887655352210765e-06, "loss": 0.6572, "step": 3375 }, { "epoch": 0.7644176404070863, "grad_norm": 0.373017281293869, "learning_rate": 6.825132227968378e-06, "loss": 0.7411, "step": 3380 }, { "epoch": 0.7655484357331322, "grad_norm": 0.36028534173965454, "learning_rate": 6.7628492865810995e-06, "loss": 0.6234, "step": 3385 }, { "epoch": 0.7666792310591782, "grad_norm": 0.3726188838481903, "learning_rate": 6.700807351124785e-06, "loss": 0.6261, "step": 3390 }, { "epoch": 0.7678100263852242, "grad_norm": 0.32167547941207886, "learning_rate": 6.639007241490347e-06, "loss": 0.7218, "step": 3395 }, { "epoch": 0.7689408217112702, "grad_norm": 0.3346633315086365, "learning_rate": 6.5774497743729734e-06, "loss": 0.6264, "step": 3400 }, { "epoch": 0.7700716170373162, "grad_norm": 0.5438939929008484, "learning_rate": 6.5161357632612745e-06, "loss": 0.6799, "step": 3405 }, { "epoch": 0.7712024123633622, "grad_norm": 0.7162203192710876, "learning_rate": 6.4550660184265866e-06, "loss": 0.7282, "step": 3410 }, { "epoch": 0.7723332076894082, "grad_norm": 0.3571074306964874, "learning_rate": 6.394241346912236e-06, "loss": 0.7061, "step": 3415 }, { "epoch": 0.7734640030154541, "grad_norm": 0.37110117077827454, "learning_rate": 6.333662552522865e-06, "loss": 0.6464, "step": 3420 }, { "epoch": 0.7745947983415001, "grad_norm": 0.42400380969047546, "learning_rate": 6.273330435813837e-06, "loss": 0.6814, "step": 3425 }, { "epoch": 0.7757255936675461, "grad_norm": 0.41441938281059265, "learning_rate": 6.213245794080641e-06, "loss": 0.6435, "step": 3430 }, { "epoch": 0.7768563889935921, "grad_norm": 0.38226518034935, "learning_rate": 6.153409421348358e-06, "loss": 0.6979, "step": 3435 }, { "epoch": 0.7779871843196381, "grad_norm": 0.28945887088775635, "learning_rate": 6.093822108361163e-06, "loss": 0.6509, "step": 3440 }, { "epoch": 0.7791179796456841, "grad_norm": 0.3852415680885315, "learning_rate": 6.034484642571866e-06, "loss": 0.7581, "step": 3445 }, { "epoch": 0.7802487749717301, "grad_norm": 0.36283373832702637, "learning_rate": 5.975397808131549e-06, "loss": 0.6021, "step": 3450 }, { "epoch": 0.781379570297776, "grad_norm": 0.3490721583366394, "learning_rate": 5.916562385879151e-06, "loss": 0.6571, "step": 3455 }, { "epoch": 0.782510365623822, "grad_norm": 0.32459717988967896, "learning_rate": 5.857979153331189e-06, "loss": 0.6211, "step": 3460 }, { "epoch": 0.783641160949868, "grad_norm": 0.37339502573013306, "learning_rate": 5.799648884671441e-06, "loss": 0.6819, "step": 3465 }, { "epoch": 0.784771956275914, "grad_norm": 0.35320839285850525, "learning_rate": 5.741572350740768e-06, "loss": 0.7348, "step": 3470 }, { "epoch": 0.78590275160196, "grad_norm": 0.2820529043674469, "learning_rate": 5.68375031902687e-06, "loss": 0.6302, "step": 3475 }, { "epoch": 0.787033546928006, "grad_norm": 0.35477685928344727, "learning_rate": 5.626183553654194e-06, "loss": 0.6241, "step": 3480 }, { "epoch": 0.788164342254052, "grad_norm": 0.36558765172958374, "learning_rate": 5.5688728153738155e-06, "loss": 0.6594, "step": 3485 }, { "epoch": 0.789295137580098, "grad_norm": 0.3570399880409241, "learning_rate": 5.511818861553364e-06, "loss": 0.6271, "step": 3490 }, { "epoch": 0.7904259329061439, "grad_norm": 0.4297529458999634, "learning_rate": 5.45502244616706e-06, "loss": 0.7279, "step": 3495 }, { "epoch": 0.7915567282321899, "grad_norm": 0.3277917504310608, "learning_rate": 5.398484319785688e-06, "loss": 0.7204, "step": 3500 }, { "epoch": 0.7926875235582359, "grad_norm": 0.35679319500923157, "learning_rate": 5.342205229566774e-06, "loss": 0.6979, "step": 3505 }, { "epoch": 0.7938183188842819, "grad_norm": 0.5490666627883911, "learning_rate": 5.286185919244599e-06, "loss": 0.7884, "step": 3510 }, { "epoch": 0.7949491142103279, "grad_norm": 0.3300570845603943, "learning_rate": 5.230427129120441e-06, "loss": 0.6661, "step": 3515 }, { "epoch": 0.7960799095363739, "grad_norm": 0.34464097023010254, "learning_rate": 5.174929596052791e-06, "loss": 0.729, "step": 3520 }, { "epoch": 0.7972107048624198, "grad_norm": 0.36439618468284607, "learning_rate": 5.119694053447566e-06, "loss": 0.6483, "step": 3525 }, { "epoch": 0.7983415001884658, "grad_norm": 0.3646329939365387, "learning_rate": 5.064721231248498e-06, "loss": 0.6497, "step": 3530 }, { "epoch": 0.7994722955145118, "grad_norm": 0.42587414383888245, "learning_rate": 5.010011855927393e-06, "loss": 0.6638, "step": 3535 }, { "epoch": 0.8006030908405578, "grad_norm": 0.3738311529159546, "learning_rate": 4.955566650474616e-06, "loss": 0.806, "step": 3540 }, { "epoch": 0.8017338861666038, "grad_norm": 0.4998151659965515, "learning_rate": 4.90138633438946e-06, "loss": 0.6658, "step": 3545 }, { "epoch": 0.8028646814926498, "grad_norm": 0.39495596289634705, "learning_rate": 4.847471623670713e-06, "loss": 0.7759, "step": 3550 }, { "epoch": 0.8039954768186958, "grad_norm": 0.38152778148651123, "learning_rate": 4.79382323080714e-06, "loss": 0.6445, "step": 3555 }, { "epoch": 0.8051262721447418, "grad_norm": 0.5026568174362183, "learning_rate": 4.740441864768086e-06, "loss": 0.7176, "step": 3560 }, { "epoch": 0.8062570674707877, "grad_norm": 0.3014233112335205, "learning_rate": 4.687328230994118e-06, "loss": 0.6597, "step": 3565 }, { "epoch": 0.8073878627968337, "grad_norm": 0.4386585056781769, "learning_rate": 4.634483031387676e-06, "loss": 0.7718, "step": 3570 }, { "epoch": 0.8085186581228797, "grad_norm": 0.3882271647453308, "learning_rate": 4.581906964303825e-06, "loss": 0.6668, "step": 3575 }, { "epoch": 0.8096494534489257, "grad_norm": 0.3510667681694031, "learning_rate": 4.529600724541022e-06, "loss": 0.7296, "step": 3580 }, { "epoch": 0.8107802487749717, "grad_norm": 0.5134342908859253, "learning_rate": 4.477565003331904e-06, "loss": 0.7208, "step": 3585 }, { "epoch": 0.8119110441010177, "grad_norm": 0.32369402050971985, "learning_rate": 4.4258004883342e-06, "loss": 0.6951, "step": 3590 }, { "epoch": 0.8130418394270637, "grad_norm": 0.4089120030403137, "learning_rate": 4.3743078636215935e-06, "loss": 0.6571, "step": 3595 }, { "epoch": 0.8141726347531096, "grad_norm": 0.3248507082462311, "learning_rate": 4.323087809674733e-06, "loss": 0.6267, "step": 3600 }, { "epoch": 0.8153034300791556, "grad_norm": 0.3590109348297119, "learning_rate": 4.2721410033722014e-06, "loss": 0.6919, "step": 3605 }, { "epoch": 0.8164342254052016, "grad_norm": 0.3924254775047302, "learning_rate": 4.221468117981592e-06, "loss": 0.6, "step": 3610 }, { "epoch": 0.8175650207312476, "grad_norm": 0.42247772216796875, "learning_rate": 4.1710698231505975e-06, "loss": 0.6375, "step": 3615 }, { "epoch": 0.8186958160572936, "grad_norm": 0.3658187985420227, "learning_rate": 4.120946784898156e-06, "loss": 0.7743, "step": 3620 }, { "epoch": 0.8198266113833396, "grad_norm": 0.39758992195129395, "learning_rate": 4.071099665605682e-06, "loss": 0.6259, "step": 3625 }, { "epoch": 0.8209574067093856, "grad_norm": 0.45203524827957153, "learning_rate": 4.021529124008278e-06, "loss": 0.7297, "step": 3630 }, { "epoch": 0.8220882020354315, "grad_norm": 0.43119361996650696, "learning_rate": 3.9722358151860515e-06, "loss": 0.6612, "step": 3635 }, { "epoch": 0.8232189973614775, "grad_norm": 0.41796061396598816, "learning_rate": 3.923220390555432e-06, "loss": 0.7526, "step": 3640 }, { "epoch": 0.8243497926875235, "grad_norm": 0.33241549134254456, "learning_rate": 3.87448349786059e-06, "loss": 0.6832, "step": 3645 }, { "epoch": 0.8254805880135695, "grad_norm": 0.3728543817996979, "learning_rate": 3.826025781164874e-06, "loss": 0.6604, "step": 3650 }, { "epoch": 0.8266113833396155, "grad_norm": 0.297720342874527, "learning_rate": 3.7778478808422753e-06, "loss": 0.7111, "step": 3655 }, { "epoch": 0.8277421786656615, "grad_norm": 0.3133184015750885, "learning_rate": 3.7299504335689905e-06, "loss": 0.6552, "step": 3660 }, { "epoch": 0.8288729739917075, "grad_norm": 0.3344557583332062, "learning_rate": 3.682334072314994e-06, "loss": 0.6516, "step": 3665 }, { "epoch": 0.8300037693177534, "grad_norm": 0.33505749702453613, "learning_rate": 3.6349994263356806e-06, "loss": 0.6788, "step": 3670 }, { "epoch": 0.8311345646437994, "grad_norm": 0.32801946997642517, "learning_rate": 3.587947121163551e-06, "loss": 0.6627, "step": 3675 }, { "epoch": 0.8322653599698454, "grad_norm": 0.3641601800918579, "learning_rate": 3.541177778599944e-06, "loss": 0.6904, "step": 3680 }, { "epoch": 0.8333961552958914, "grad_norm": 0.3527655005455017, "learning_rate": 3.494692016706799e-06, "loss": 0.7227, "step": 3685 }, { "epoch": 0.8345269506219374, "grad_norm": 0.32480356097221375, "learning_rate": 3.4484904497985167e-06, "loss": 0.6718, "step": 3690 }, { "epoch": 0.8356577459479834, "grad_norm": 0.40660572052001953, "learning_rate": 3.4025736884338326e-06, "loss": 0.7252, "step": 3695 }, { "epoch": 0.8367885412740294, "grad_norm": 0.3736969530582428, "learning_rate": 3.356942339407748e-06, "loss": 0.6344, "step": 3700 }, { "epoch": 0.8379193366000753, "grad_norm": 0.3524364233016968, "learning_rate": 3.311597005743508e-06, "loss": 0.6561, "step": 3705 }, { "epoch": 0.8390501319261213, "grad_norm": 0.346884548664093, "learning_rate": 3.26653828668462e-06, "loss": 0.7203, "step": 3710 }, { "epoch": 0.8401809272521673, "grad_norm": 0.3504559099674225, "learning_rate": 3.2217667776869716e-06, "loss": 0.6846, "step": 3715 }, { "epoch": 0.8413117225782133, "grad_norm": 0.4117507338523865, "learning_rate": 3.1772830704109108e-06, "loss": 0.7109, "step": 3720 }, { "epoch": 0.8424425179042593, "grad_norm": 0.35699552297592163, "learning_rate": 3.133087752713479e-06, "loss": 0.7086, "step": 3725 }, { "epoch": 0.8435733132303053, "grad_norm": 0.4353185296058655, "learning_rate": 3.089181408640612e-06, "loss": 0.6974, "step": 3730 }, { "epoch": 0.8447041085563513, "grad_norm": 0.35224634408950806, "learning_rate": 3.0455646184194137e-06, "loss": 0.695, "step": 3735 }, { "epoch": 0.8458349038823972, "grad_norm": 0.3479955792427063, "learning_rate": 3.0022379584505212e-06, "loss": 0.7459, "step": 3740 }, { "epoch": 0.8469656992084432, "grad_norm": 0.33437180519104004, "learning_rate": 2.9592020013004455e-06, "loss": 0.6236, "step": 3745 }, { "epoch": 0.8480964945344892, "grad_norm": 0.3484211266040802, "learning_rate": 2.9164573156940654e-06, "loss": 0.6564, "step": 3750 }, { "epoch": 0.8492272898605352, "grad_norm": 0.42290642857551575, "learning_rate": 2.874004466507041e-06, "loss": 0.8202, "step": 3755 }, { "epoch": 0.8503580851865812, "grad_norm": 0.3348793089389801, "learning_rate": 2.8318440147583862e-06, "loss": 0.6083, "step": 3760 }, { "epoch": 0.8514888805126272, "grad_norm": 0.45830124616622925, "learning_rate": 2.7899765176030627e-06, "loss": 0.6741, "step": 3765 }, { "epoch": 0.8526196758386732, "grad_norm": 0.40784764289855957, "learning_rate": 2.7484025283246034e-06, "loss": 0.6632, "step": 3770 }, { "epoch": 0.8537504711647191, "grad_norm": 0.31340643763542175, "learning_rate": 2.707122596327805e-06, "loss": 0.6891, "step": 3775 }, { "epoch": 0.8548812664907651, "grad_norm": 0.5138049721717834, "learning_rate": 2.6661372671314493e-06, "loss": 0.7407, "step": 3780 }, { "epoch": 0.8560120618168111, "grad_norm": 0.3300493359565735, "learning_rate": 2.6254470823611323e-06, "loss": 0.7163, "step": 3785 }, { "epoch": 0.8571428571428571, "grad_norm": 0.4111888110637665, "learning_rate": 2.585052579742059e-06, "loss": 0.7343, "step": 3790 }, { "epoch": 0.8582736524689031, "grad_norm": 0.3648470938205719, "learning_rate": 2.5449542930919864e-06, "loss": 0.6905, "step": 3795 }, { "epoch": 0.8594044477949491, "grad_norm": 0.3930950164794922, "learning_rate": 2.5051527523141356e-06, "loss": 0.6164, "step": 3800 }, { "epoch": 0.860535243120995, "grad_norm": 0.35205841064453125, "learning_rate": 2.465648483390193e-06, "loss": 0.6893, "step": 3805 }, { "epoch": 0.861666038447041, "grad_norm": 0.3441345989704132, "learning_rate": 2.4264420083733807e-06, "loss": 0.6441, "step": 3810 }, { "epoch": 0.862796833773087, "grad_norm": 0.3523414134979248, "learning_rate": 2.387533845381518e-06, "loss": 0.7179, "step": 3815 }, { "epoch": 0.863927629099133, "grad_norm": 0.4754193425178528, "learning_rate": 2.3489245085902194e-06, "loss": 0.7682, "step": 3820 }, { "epoch": 0.865058424425179, "grad_norm": 0.3973066508769989, "learning_rate": 2.310614508226078e-06, "loss": 0.6431, "step": 3825 }, { "epoch": 0.866189219751225, "grad_norm": 0.49921613931655884, "learning_rate": 2.2726043505599036e-06, "loss": 0.7379, "step": 3830 }, { "epoch": 0.867320015077271, "grad_norm": 0.3483542203903198, "learning_rate": 2.2348945379000783e-06, "loss": 0.6746, "step": 3835 }, { "epoch": 0.868450810403317, "grad_norm": 0.409015417098999, "learning_rate": 2.1974855685858663e-06, "loss": 0.6205, "step": 3840 }, { "epoch": 0.8695816057293629, "grad_norm": 0.38850805163383484, "learning_rate": 2.1603779369808757e-06, "loss": 0.6971, "step": 3845 }, { "epoch": 0.8707124010554089, "grad_norm": 0.39465731382369995, "learning_rate": 2.123572133466495e-06, "loss": 0.6327, "step": 3850 }, { "epoch": 0.8718431963814549, "grad_norm": 0.3345524072647095, "learning_rate": 2.087068644435425e-06, "loss": 0.6426, "step": 3855 }, { "epoch": 0.8729739917075009, "grad_norm": 0.28288835287094116, "learning_rate": 2.050867952285243e-06, "loss": 0.5873, "step": 3860 }, { "epoch": 0.8741047870335469, "grad_norm": 0.3995983898639679, "learning_rate": 2.0149705354120224e-06, "loss": 0.6867, "step": 3865 }, { "epoch": 0.8752355823595929, "grad_norm": 0.4070720076560974, "learning_rate": 1.9793768682040524e-06, "loss": 0.726, "step": 3870 }, { "epoch": 0.8763663776856389, "grad_norm": 0.37636858224868774, "learning_rate": 1.9440874210355065e-06, "loss": 0.6516, "step": 3875 }, { "epoch": 0.8774971730116848, "grad_norm": 0.2892749309539795, "learning_rate": 1.909102660260273e-06, "loss": 0.6692, "step": 3880 }, { "epoch": 0.8786279683377308, "grad_norm": 0.3219640851020813, "learning_rate": 1.8744230482057673e-06, "loss": 0.7656, "step": 3885 }, { "epoch": 0.8797587636637768, "grad_norm": 0.4004978835582733, "learning_rate": 1.8400490431668387e-06, "loss": 0.7057, "step": 3890 }, { "epoch": 0.8808895589898228, "grad_norm": 0.3124302327632904, "learning_rate": 1.805981099399709e-06, "loss": 0.6377, "step": 3895 }, { "epoch": 0.8820203543158688, "grad_norm": 0.3707364797592163, "learning_rate": 1.7722196671159542e-06, "loss": 0.6751, "step": 3900 }, { "epoch": 0.8831511496419148, "grad_norm": 0.38595885038375854, "learning_rate": 1.7387651924765796e-06, "loss": 0.6968, "step": 3905 }, { "epoch": 0.8842819449679608, "grad_norm": 0.3807552754878998, "learning_rate": 1.7056181175861025e-06, "loss": 0.7338, "step": 3910 }, { "epoch": 0.8854127402940067, "grad_norm": 0.40677499771118164, "learning_rate": 1.6727788804867277e-06, "loss": 0.713, "step": 3915 }, { "epoch": 0.8865435356200527, "grad_norm": 0.5259581804275513, "learning_rate": 1.6402479151525458e-06, "loss": 0.6833, "step": 3920 }, { "epoch": 0.8876743309460987, "grad_norm": 0.3447456359863281, "learning_rate": 1.6080256514838077e-06, "loss": 0.6712, "step": 3925 }, { "epoch": 0.8888051262721447, "grad_norm": 0.31114619970321655, "learning_rate": 1.5761125153012312e-06, "loss": 0.73, "step": 3930 }, { "epoch": 0.8899359215981907, "grad_norm": 0.29841819405555725, "learning_rate": 1.5445089283403768e-06, "loss": 0.6782, "step": 3935 }, { "epoch": 0.8910667169242367, "grad_norm": 0.45541536808013916, "learning_rate": 1.5132153082460908e-06, "loss": 0.7093, "step": 3940 }, { "epoch": 0.8921975122502827, "grad_norm": 0.31067731976509094, "learning_rate": 1.482232068566966e-06, "loss": 0.6212, "step": 3945 }, { "epoch": 0.8933283075763286, "grad_norm": 0.3661406934261322, "learning_rate": 1.4515596187498898e-06, "loss": 0.6728, "step": 3950 }, { "epoch": 0.8944591029023746, "grad_norm": 0.31056010723114014, "learning_rate": 1.4211983641346154e-06, "loss": 0.64, "step": 3955 }, { "epoch": 0.8955898982284206, "grad_norm": 0.3716438114643097, "learning_rate": 1.3911487059484362e-06, "loss": 0.7058, "step": 3960 }, { "epoch": 0.8967206935544666, "grad_norm": 0.3605138659477234, "learning_rate": 1.3614110413008474e-06, "loss": 0.7142, "step": 3965 }, { "epoch": 0.8978514888805126, "grad_norm": 0.39523765444755554, "learning_rate": 1.3319857631783227e-06, "loss": 0.667, "step": 3970 }, { "epoch": 0.8989822842065586, "grad_norm": 0.4720902144908905, "learning_rate": 1.302873260439122e-06, "loss": 0.7009, "step": 3975 }, { "epoch": 0.9001130795326046, "grad_norm": 0.39917027950286865, "learning_rate": 1.2740739178081274e-06, "loss": 0.6236, "step": 3980 }, { "epoch": 0.9012438748586505, "grad_norm": 0.38968703150749207, "learning_rate": 1.2455881158717874e-06, "loss": 0.6108, "step": 3985 }, { "epoch": 0.9023746701846965, "grad_norm": 0.3744681775569916, "learning_rate": 1.2174162310730764e-06, "loss": 0.674, "step": 3990 }, { "epoch": 0.9035054655107425, "grad_norm": 0.41147854924201965, "learning_rate": 1.1895586357065197e-06, "loss": 0.6971, "step": 3995 }, { "epoch": 0.9046362608367885, "grad_norm": 0.4496522843837738, "learning_rate": 1.1620156979132685e-06, "loss": 0.7027, "step": 4000 }, { "epoch": 0.9057670561628345, "grad_norm": 0.38566187024116516, "learning_rate": 1.134787781676236e-06, "loss": 0.6488, "step": 4005 }, { "epoch": 0.9068978514888805, "grad_norm": 0.3715657591819763, "learning_rate": 1.1078752468153042e-06, "loss": 0.6727, "step": 4010 }, { "epoch": 0.9080286468149265, "grad_norm": 0.3041117787361145, "learning_rate": 1.0812784489825507e-06, "loss": 0.6763, "step": 4015 }, { "epoch": 0.9091594421409724, "grad_norm": 0.40202027559280396, "learning_rate": 1.054997739657551e-06, "loss": 0.652, "step": 4020 }, { "epoch": 0.9102902374670184, "grad_norm": 0.41109445691108704, "learning_rate": 1.029033466142737e-06, "loss": 0.7183, "step": 4025 }, { "epoch": 0.9114210327930644, "grad_norm": 0.31253260374069214, "learning_rate": 1.0033859715588122e-06, "loss": 0.6929, "step": 4030 }, { "epoch": 0.9125518281191104, "grad_norm": 0.4093742072582245, "learning_rate": 9.780555948401994e-07, "loss": 0.7043, "step": 4035 }, { "epoch": 0.9136826234451564, "grad_norm": 0.3625013828277588, "learning_rate": 9.530426707305918e-07, "loss": 0.7268, "step": 4040 }, { "epoch": 0.9148134187712024, "grad_norm": 0.34964805841445923, "learning_rate": 9.283475297785005e-07, "loss": 0.6746, "step": 4045 }, { "epoch": 0.9159442140972484, "grad_norm": 0.3727870285511017, "learning_rate": 9.039704983328984e-07, "loss": 0.6868, "step": 4050 }, { "epoch": 0.9170750094232943, "grad_norm": 0.5575105547904968, "learning_rate": 8.799118985389126e-07, "loss": 0.7606, "step": 4055 }, { "epoch": 0.9182058047493403, "grad_norm": 0.38986077904701233, "learning_rate": 8.561720483335478e-07, "loss": 0.6885, "step": 4060 }, { "epoch": 0.9193366000753863, "grad_norm": 0.35799407958984375, "learning_rate": 8.327512614415195e-07, "loss": 0.6676, "step": 4065 }, { "epoch": 0.9204673954014323, "grad_norm": 0.3649112284183502, "learning_rate": 8.09649847371069e-07, "loss": 0.6487, "step": 4070 }, { "epoch": 0.9215981907274783, "grad_norm": 0.4163808226585388, "learning_rate": 7.868681114098914e-07, "loss": 0.6342, "step": 4075 }, { "epoch": 0.9227289860535243, "grad_norm": 0.34704506397247314, "learning_rate": 7.644063546211167e-07, "loss": 0.6623, "step": 4080 }, { "epoch": 0.9238597813795703, "grad_norm": 0.3512720465660095, "learning_rate": 7.422648738392934e-07, "loss": 0.6688, "step": 4085 }, { "epoch": 0.9249905767056162, "grad_norm": 0.33337023854255676, "learning_rate": 7.204439616665115e-07, "loss": 0.6587, "step": 4090 }, { "epoch": 0.9261213720316622, "grad_norm": 0.35821977257728577, "learning_rate": 6.989439064684911e-07, "loss": 0.6823, "step": 4095 }, { "epoch": 0.9272521673577082, "grad_norm": 0.46636807918548584, "learning_rate": 6.777649923708024e-07, "loss": 0.7261, "step": 4100 }, { "epoch": 0.9283829626837542, "grad_norm": 0.4258100688457489, "learning_rate": 6.569074992551022e-07, "loss": 0.6615, "step": 4105 }, { "epoch": 0.9295137580098002, "grad_norm": 0.39648309350013733, "learning_rate": 6.363717027554256e-07, "loss": 0.7147, "step": 4110 }, { "epoch": 0.9306445533358462, "grad_norm": 0.41724279522895813, "learning_rate": 6.161578742545665e-07, "loss": 0.6852, "step": 4115 }, { "epoch": 0.9317753486618922, "grad_norm": 0.3736780285835266, "learning_rate": 5.962662808804587e-07, "loss": 0.717, "step": 4120 }, { "epoch": 0.9329061439879381, "grad_norm": 0.3304630517959595, "learning_rate": 5.766971855026809e-07, "loss": 0.6539, "step": 4125 }, { "epoch": 0.9340369393139841, "grad_norm": 0.39884060621261597, "learning_rate": 5.574508467289518e-07, "loss": 0.7029, "step": 4130 }, { "epoch": 0.9351677346400301, "grad_norm": 0.49207261204719543, "learning_rate": 5.385275189017353e-07, "loss": 0.7092, "step": 4135 }, { "epoch": 0.9362985299660761, "grad_norm": 0.3424831032752991, "learning_rate": 5.199274520948677e-07, "loss": 0.6355, "step": 4140 }, { "epoch": 0.9374293252921221, "grad_norm": 0.32665055990219116, "learning_rate": 5.01650892110253e-07, "loss": 0.6958, "step": 4145 }, { "epoch": 0.9385601206181681, "grad_norm": 0.38883742690086365, "learning_rate": 4.836980804746261e-07, "loss": 0.6334, "step": 4150 }, { "epoch": 0.9396909159442141, "grad_norm": 0.3798101842403412, "learning_rate": 4.660692544363382e-07, "loss": 0.781, "step": 4155 }, { "epoch": 0.94082171127026, "grad_norm": 0.4334189295768738, "learning_rate": 4.487646469622464e-07, "loss": 0.7235, "step": 4160 }, { "epoch": 0.941952506596306, "grad_norm": 0.34717586636543274, "learning_rate": 4.31784486734621e-07, "loss": 0.669, "step": 4165 }, { "epoch": 0.943083301922352, "grad_norm": 0.3831476867198944, "learning_rate": 4.1512899814813156e-07, "loss": 0.6536, "step": 4170 }, { "epoch": 0.944214097248398, "grad_norm": 0.35673925280570984, "learning_rate": 3.9879840130686576e-07, "loss": 0.6853, "step": 4175 }, { "epoch": 0.945344892574444, "grad_norm": 0.48463523387908936, "learning_rate": 3.82792912021443e-07, "loss": 0.7454, "step": 4180 }, { "epoch": 0.94647568790049, "grad_norm": 0.36075058579444885, "learning_rate": 3.6711274180614153e-07, "loss": 0.6806, "step": 4185 }, { "epoch": 0.947606483226536, "grad_norm": 0.42450064420700073, "learning_rate": 3.517580978761148e-07, "loss": 0.7356, "step": 4190 }, { "epoch": 0.9487372785525819, "grad_norm": 0.36473605036735535, "learning_rate": 3.3672918314466007e-07, "loss": 0.717, "step": 4195 }, { "epoch": 0.9498680738786279, "grad_norm": 0.4312973916530609, "learning_rate": 3.220261962205179e-07, "loss": 0.6991, "step": 4200 }, { "epoch": 0.9509988692046739, "grad_norm": 0.3392401933670044, "learning_rate": 3.0764933140525475e-07, "loss": 0.6468, "step": 4205 }, { "epoch": 0.9521296645307199, "grad_norm": 0.426111102104187, "learning_rate": 2.935987786907124e-07, "loss": 0.7004, "step": 4210 }, { "epoch": 0.9532604598567659, "grad_norm": 0.3565954864025116, "learning_rate": 2.7987472375646804e-07, "loss": 0.7199, "step": 4215 }, { "epoch": 0.9543912551828119, "grad_norm": 0.3929762542247772, "learning_rate": 2.664773479674032e-07, "loss": 0.7348, "step": 4220 }, { "epoch": 0.9555220505088579, "grad_norm": 0.4081243872642517, "learning_rate": 2.5340682837129146e-07, "loss": 0.662, "step": 4225 }, { "epoch": 0.9566528458349038, "grad_norm": 0.3799093961715698, "learning_rate": 2.406633376964784e-07, "loss": 0.6571, "step": 4230 }, { "epoch": 0.9577836411609498, "grad_norm": 0.35904359817504883, "learning_rate": 2.2824704434957766e-07, "loss": 0.7287, "step": 4235 }, { "epoch": 0.9589144364869958, "grad_norm": 0.38494235277175903, "learning_rate": 2.1615811241325613e-07, "loss": 0.71, "step": 4240 }, { "epoch": 0.9600452318130418, "grad_norm": 0.4001871943473816, "learning_rate": 2.0439670164406345e-07, "loss": 0.7414, "step": 4245 }, { "epoch": 0.9611760271390878, "grad_norm": 0.33206847310066223, "learning_rate": 1.929629674703226e-07, "loss": 0.6673, "step": 4250 }, { "epoch": 0.9623068224651338, "grad_norm": 0.3849842846393585, "learning_rate": 1.8185706099007883e-07, "loss": 0.7487, "step": 4255 }, { "epoch": 0.9634376177911798, "grad_norm": 0.33372604846954346, "learning_rate": 1.7107912896908995e-07, "loss": 0.6522, "step": 4260 }, { "epoch": 0.9645684131172257, "grad_norm": 0.4485720992088318, "learning_rate": 1.6062931383890312e-07, "loss": 0.6499, "step": 4265 }, { "epoch": 0.9656992084432717, "grad_norm": 0.5238537788391113, "learning_rate": 1.5050775369495895e-07, "loss": 0.6708, "step": 4270 }, { "epoch": 0.9668300037693177, "grad_norm": 0.31111645698547363, "learning_rate": 1.4071458229478196e-07, "loss": 0.6394, "step": 4275 }, { "epoch": 0.9679607990953637, "grad_norm": 0.3432201147079468, "learning_rate": 1.3124992905619028e-07, "loss": 0.7097, "step": 4280 }, { "epoch": 0.9690915944214097, "grad_norm": 0.38018399477005005, "learning_rate": 1.2211391905561086e-07, "loss": 0.711, "step": 4285 }, { "epoch": 0.9702223897474557, "grad_norm": 0.36301785707473755, "learning_rate": 1.1330667302641151e-07, "loss": 0.664, "step": 4290 }, { "epoch": 0.9713531850735017, "grad_norm": 0.3279567360877991, "learning_rate": 1.0482830735730198e-07, "loss": 0.6311, "step": 4295 }, { "epoch": 0.9724839803995476, "grad_norm": 0.31279444694519043, "learning_rate": 9.66789340908103e-08, "loss": 0.6588, "step": 4300 }, { "epoch": 0.9736147757255936, "grad_norm": 0.38352543115615845, "learning_rate": 8.885866092178952e-08, "loss": 0.6798, "step": 4305 }, { "epoch": 0.9747455710516396, "grad_norm": 0.3573386073112488, "learning_rate": 8.136759119600213e-08, "loss": 0.6686, "step": 4310 }, { "epoch": 0.9758763663776856, "grad_norm": 0.3751896917819977, "learning_rate": 7.42058239087462e-08, "loss": 0.7003, "step": 4315 }, { "epoch": 0.9770071617037316, "grad_norm": 0.425658643245697, "learning_rate": 6.737345370355919e-08, "loss": 0.7152, "step": 4320 }, { "epoch": 0.9781379570297776, "grad_norm": 0.34428244829177856, "learning_rate": 6.087057087095504e-08, "loss": 0.7485, "step": 4325 }, { "epoch": 0.9792687523558236, "grad_norm": 0.4021622836589813, "learning_rate": 5.469726134723907e-08, "loss": 0.7458, "step": 4330 }, { "epoch": 0.9803995476818695, "grad_norm": 0.364872545003891, "learning_rate": 4.885360671336714e-08, "loss": 0.7113, "step": 4335 }, { "epoch": 0.9815303430079155, "grad_norm": 0.4006316363811493, "learning_rate": 4.3339684193871576e-08, "loss": 0.7263, "step": 4340 }, { "epoch": 0.9826611383339615, "grad_norm": 0.3257283866405487, "learning_rate": 3.8155566655839746e-08, "loss": 0.6425, "step": 4345 }, { "epoch": 0.9837919336600075, "grad_norm": 0.37064969539642334, "learning_rate": 3.330132260794538e-08, "loss": 0.678, "step": 4350 }, { "epoch": 0.9849227289860535, "grad_norm": 0.4389401078224182, "learning_rate": 2.8777016199554863e-08, "loss": 0.651, "step": 4355 }, { "epoch": 0.9860535243120995, "grad_norm": 0.4058115482330322, "learning_rate": 2.4582707219866772e-08, "loss": 0.7288, "step": 4360 }, { "epoch": 0.9871843196381455, "grad_norm": 0.36480987071990967, "learning_rate": 2.0718451097134773e-08, "loss": 0.7054, "step": 4365 }, { "epoch": 0.9883151149641914, "grad_norm": 0.33493801951408386, "learning_rate": 1.718429889792095e-08, "loss": 0.6635, "step": 4370 }, { "epoch": 0.9894459102902374, "grad_norm": 0.4736829698085785, "learning_rate": 1.3980297326432468e-08, "loss": 0.6585, "step": 4375 }, { "epoch": 0.9905767056162834, "grad_norm": 0.4238462746143341, "learning_rate": 1.110648872389708e-08, "loss": 0.6614, "step": 4380 }, { "epoch": 0.9917075009423294, "grad_norm": 0.32537785172462463, "learning_rate": 8.56291106801077e-09, "loss": 0.6681, "step": 4385 }, { "epoch": 0.9928382962683754, "grad_norm": 0.4619493782520294, "learning_rate": 6.349597972424293e-09, "loss": 0.6741, "step": 4390 }, { "epoch": 0.9939690915944214, "grad_norm": 0.35761797428131104, "learning_rate": 4.4665786863185014e-09, "loss": 0.6885, "step": 4395 }, { "epoch": 0.9950998869204674, "grad_norm": 0.3071589767932892, "learning_rate": 2.913878093990796e-09, "loss": 0.6676, "step": 4400 }, { "epoch": 0.9962306822465133, "grad_norm": 0.3391641080379486, "learning_rate": 1.6915167145525878e-09, "loss": 0.6981, "step": 4405 }, { "epoch": 0.9973614775725593, "grad_norm": 0.41276663541793823, "learning_rate": 7.995107016406378e-10, "loss": 0.6594, "step": 4410 }, { "epoch": 0.9984922728986053, "grad_norm": 0.4570743143558502, "learning_rate": 2.3787184321444335e-10, "loss": 0.7059, "step": 4415 }, { "epoch": 0.9996230682246513, "grad_norm": 0.3156117796897888, "learning_rate": 6.607561386928751e-12, "loss": 0.6465, "step": 4420 }, { "epoch": 0.9998492272898606, "step": 4421, "total_flos": 4.746119130111803e+18, "train_loss": 0.6650473316231887, "train_runtime": 22780.4973, "train_samples_per_second": 9.316, "train_steps_per_second": 0.194 } ], "logging_steps": 5, "max_steps": 4421, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.746119130111803e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }