diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,7492 @@ +{ + "best_metric": 0.9590113159486987, + "best_model_checkpoint": "output_classification_1280/hazard/checkpoint-10538", + "epoch": 8.0, + "eval_steps": 500, + "global_step": 10538, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.007591573353577529, + "grad_norm": 27.469635009765625, + "learning_rate": 5.0607287449392715e-08, + "loss": 1.6903, + "step": 10 + }, + { + "epoch": 0.015183146707155058, + "grad_norm": 29.22759437561035, + "learning_rate": 9.109311740890688e-08, + "loss": 1.6631, + "step": 20 + }, + { + "epoch": 0.022774720060732587, + "grad_norm": 22.48965835571289, + "learning_rate": 1.417004048582996e-07, + "loss": 1.7504, + "step": 30 + }, + { + "epoch": 0.030366293414310117, + "grad_norm": 30.96166229248047, + "learning_rate": 1.9230769230769234e-07, + "loss": 1.7496, + "step": 40 + }, + { + "epoch": 0.03795786676788764, + "grad_norm": 28.63855743408203, + "learning_rate": 2.4291497975708504e-07, + "loss": 1.6787, + "step": 50 + }, + { + "epoch": 0.045549440121465175, + "grad_norm": 31.33084487915039, + "learning_rate": 2.9352226720647774e-07, + "loss": 1.7747, + "step": 60 + }, + { + "epoch": 0.0531410134750427, + "grad_norm": 27.18292236328125, + "learning_rate": 3.390688259109312e-07, + "loss": 1.6577, + "step": 70 + }, + { + "epoch": 0.06073258682862023, + "grad_norm": 30.794124603271484, + "learning_rate": 3.896761133603239e-07, + "loss": 1.7097, + "step": 80 + }, + { + "epoch": 0.06832416018219777, + "grad_norm": 42.49530792236328, + "learning_rate": 4.402834008097166e-07, + "loss": 1.5986, + "step": 90 + }, + { + "epoch": 0.07591573353577528, + "grad_norm": 26.470556259155273, + "learning_rate": 4.908906882591093e-07, + "loss": 1.7342, + "step": 100 + }, + { + "epoch": 0.08350730688935282, + "grad_norm": 40.713924407958984, + "learning_rate": 5.414979757085021e-07, + "loss": 1.517, + "step": 110 + }, + { + "epoch": 0.09109888024293035, + "grad_norm": 53.97127914428711, + "learning_rate": 5.921052631578947e-07, + "loss": 1.3995, + "step": 120 + }, + { + "epoch": 0.09869045359650788, + "grad_norm": 45.6757698059082, + "learning_rate": 6.427125506072875e-07, + "loss": 1.2737, + "step": 130 + }, + { + "epoch": 0.1062820269500854, + "grad_norm": 35.03736114501953, + "learning_rate": 6.933198380566802e-07, + "loss": 1.3719, + "step": 140 + }, + { + "epoch": 0.11387360030366293, + "grad_norm": 26.410057067871094, + "learning_rate": 7.388663967611337e-07, + "loss": 1.1505, + "step": 150 + }, + { + "epoch": 0.12146517365724047, + "grad_norm": 30.611797332763672, + "learning_rate": 7.844129554655872e-07, + "loss": 1.3579, + "step": 160 + }, + { + "epoch": 0.12905674701081798, + "grad_norm": 36.64908981323242, + "learning_rate": 8.350202429149798e-07, + "loss": 1.2164, + "step": 170 + }, + { + "epoch": 0.13664832036439553, + "grad_norm": 47.913612365722656, + "learning_rate": 8.805668016194332e-07, + "loss": 1.1154, + "step": 180 + }, + { + "epoch": 0.14423989371797305, + "grad_norm": 71.07138061523438, + "learning_rate": 9.31174089068826e-07, + "loss": 1.2263, + "step": 190 + }, + { + "epoch": 0.15183146707155057, + "grad_norm": 46.60552978515625, + "learning_rate": 9.817813765182186e-07, + "loss": 1.3512, + "step": 200 + }, + { + "epoch": 0.15942304042512812, + "grad_norm": 39.1867561340332, + "learning_rate": 1.0323886639676114e-06, + "loss": 1.095, + "step": 210 + }, + { + "epoch": 0.16701461377870563, + "grad_norm": 48.71131896972656, + "learning_rate": 1.0829959514170041e-06, + "loss": 1.401, + "step": 220 + }, + { + "epoch": 0.17460618713228315, + "grad_norm": 46.5413703918457, + "learning_rate": 1.133603238866397e-06, + "loss": 1.4863, + "step": 230 + }, + { + "epoch": 0.1821977604858607, + "grad_norm": 45.00301742553711, + "learning_rate": 1.1842105263157894e-06, + "loss": 1.0144, + "step": 240 + }, + { + "epoch": 0.18978933383943822, + "grad_norm": 66.17977905273438, + "learning_rate": 1.2348178137651822e-06, + "loss": 1.2741, + "step": 250 + }, + { + "epoch": 0.19738090719301576, + "grad_norm": 71.00930786132812, + "learning_rate": 1.285425101214575e-06, + "loss": 1.3536, + "step": 260 + }, + { + "epoch": 0.20497248054659328, + "grad_norm": 66.67515563964844, + "learning_rate": 1.336032388663968e-06, + "loss": 1.247, + "step": 270 + }, + { + "epoch": 0.2125640539001708, + "grad_norm": 47.43987274169922, + "learning_rate": 1.3866396761133605e-06, + "loss": 1.2843, + "step": 280 + }, + { + "epoch": 0.22015562725374835, + "grad_norm": 41.783695220947266, + "learning_rate": 1.4372469635627532e-06, + "loss": 1.401, + "step": 290 + }, + { + "epoch": 0.22774720060732587, + "grad_norm": 61.59716796875, + "learning_rate": 1.4878542510121458e-06, + "loss": 1.134, + "step": 300 + }, + { + "epoch": 0.23533877396090339, + "grad_norm": 52.884761810302734, + "learning_rate": 1.5384615384615387e-06, + "loss": 1.2276, + "step": 310 + }, + { + "epoch": 0.24293034731448093, + "grad_norm": 43.76587677001953, + "learning_rate": 1.5890688259109313e-06, + "loss": 1.1747, + "step": 320 + }, + { + "epoch": 0.25052192066805845, + "grad_norm": 24.984729766845703, + "learning_rate": 1.639676113360324e-06, + "loss": 1.2557, + "step": 330 + }, + { + "epoch": 0.25811349402163597, + "grad_norm": 32.03645324707031, + "learning_rate": 1.6902834008097168e-06, + "loss": 1.0106, + "step": 340 + }, + { + "epoch": 0.2657050673752135, + "grad_norm": 37.198177337646484, + "learning_rate": 1.7408906882591095e-06, + "loss": 1.0211, + "step": 350 + }, + { + "epoch": 0.27329664072879106, + "grad_norm": 46.07294464111328, + "learning_rate": 1.791497975708502e-06, + "loss": 1.019, + "step": 360 + }, + { + "epoch": 0.2808882140823686, + "grad_norm": 61.57015609741211, + "learning_rate": 1.8370445344129556e-06, + "loss": 1.3087, + "step": 370 + }, + { + "epoch": 0.2884797874359461, + "grad_norm": 37.366268157958984, + "learning_rate": 1.8876518218623483e-06, + "loss": 0.9793, + "step": 380 + }, + { + "epoch": 0.2960713607895236, + "grad_norm": 25.108686447143555, + "learning_rate": 1.938259109311741e-06, + "loss": 1.2809, + "step": 390 + }, + { + "epoch": 0.30366293414310114, + "grad_norm": 41.831172943115234, + "learning_rate": 1.988866396761134e-06, + "loss": 1.04, + "step": 400 + }, + { + "epoch": 0.3112545074966787, + "grad_norm": 53.10079574584961, + "learning_rate": 2.0394736842105266e-06, + "loss": 1.1906, + "step": 410 + }, + { + "epoch": 0.31884608085025623, + "grad_norm": 38.19053649902344, + "learning_rate": 2.090080971659919e-06, + "loss": 1.1724, + "step": 420 + }, + { + "epoch": 0.32643765420383375, + "grad_norm": 43.842498779296875, + "learning_rate": 2.140688259109312e-06, + "loss": 1.1657, + "step": 430 + }, + { + "epoch": 0.33402922755741127, + "grad_norm": 54.60807418823242, + "learning_rate": 2.1912955465587044e-06, + "loss": 0.9103, + "step": 440 + }, + { + "epoch": 0.3416208009109888, + "grad_norm": 48.880218505859375, + "learning_rate": 2.241902834008097e-06, + "loss": 1.1537, + "step": 450 + }, + { + "epoch": 0.3492123742645663, + "grad_norm": 40.26908493041992, + "learning_rate": 2.2925101214574904e-06, + "loss": 1.0753, + "step": 460 + }, + { + "epoch": 0.3568039476181439, + "grad_norm": 65.76298522949219, + "learning_rate": 2.3431174089068827e-06, + "loss": 0.9305, + "step": 470 + }, + { + "epoch": 0.3643955209717214, + "grad_norm": 33.159881591796875, + "learning_rate": 2.3937246963562755e-06, + "loss": 0.9382, + "step": 480 + }, + { + "epoch": 0.3719870943252989, + "grad_norm": 32.02263259887695, + "learning_rate": 2.4443319838056682e-06, + "loss": 0.8098, + "step": 490 + }, + { + "epoch": 0.37957866767887644, + "grad_norm": 50.231842041015625, + "learning_rate": 2.494939271255061e-06, + "loss": 1.0712, + "step": 500 + }, + { + "epoch": 0.38717024103245395, + "grad_norm": 54.17763137817383, + "learning_rate": 2.5455465587044537e-06, + "loss": 0.9814, + "step": 510 + }, + { + "epoch": 0.39476181438603153, + "grad_norm": 25.580745697021484, + "learning_rate": 2.5961538461538465e-06, + "loss": 0.5809, + "step": 520 + }, + { + "epoch": 0.40235338773960905, + "grad_norm": 5.449360370635986, + "learning_rate": 2.646761133603239e-06, + "loss": 0.5567, + "step": 530 + }, + { + "epoch": 0.40994496109318657, + "grad_norm": 29.534494400024414, + "learning_rate": 2.697368421052632e-06, + "loss": 0.5823, + "step": 540 + }, + { + "epoch": 0.4175365344467641, + "grad_norm": 13.788243293762207, + "learning_rate": 2.7479757085020247e-06, + "loss": 0.9266, + "step": 550 + }, + { + "epoch": 0.4251281078003416, + "grad_norm": 32.08829879760742, + "learning_rate": 2.798582995951417e-06, + "loss": 0.432, + "step": 560 + }, + { + "epoch": 0.4327196811539191, + "grad_norm": 12.410125732421875, + "learning_rate": 2.8491902834008103e-06, + "loss": 0.7482, + "step": 570 + }, + { + "epoch": 0.4403112545074967, + "grad_norm": 24.522109985351562, + "learning_rate": 2.8997975708502026e-06, + "loss": 0.504, + "step": 580 + }, + { + "epoch": 0.4479028278610742, + "grad_norm": 17.949840545654297, + "learning_rate": 2.9504048582995953e-06, + "loss": 0.6836, + "step": 590 + }, + { + "epoch": 0.45549440121465173, + "grad_norm": 13.618581771850586, + "learning_rate": 3.001012145748988e-06, + "loss": 0.6873, + "step": 600 + }, + { + "epoch": 0.46308597456822925, + "grad_norm": 50.62519454956055, + "learning_rate": 3.0516194331983804e-06, + "loss": 0.4781, + "step": 610 + }, + { + "epoch": 0.47067754792180677, + "grad_norm": 27.868289947509766, + "learning_rate": 3.1022267206477736e-06, + "loss": 0.7148, + "step": 620 + }, + { + "epoch": 0.47826912127538435, + "grad_norm": 30.8429012298584, + "learning_rate": 3.1528340080971664e-06, + "loss": 0.591, + "step": 630 + }, + { + "epoch": 0.48586069462896186, + "grad_norm": 51.042518615722656, + "learning_rate": 3.2034412955465587e-06, + "loss": 0.5481, + "step": 640 + }, + { + "epoch": 0.4934522679825394, + "grad_norm": 42.53914260864258, + "learning_rate": 3.254048582995952e-06, + "loss": 0.6404, + "step": 650 + }, + { + "epoch": 0.5010438413361169, + "grad_norm": 28.016672134399414, + "learning_rate": 3.3046558704453446e-06, + "loss": 1.075, + "step": 660 + }, + { + "epoch": 0.5086354146896944, + "grad_norm": 26.764345169067383, + "learning_rate": 3.355263157894737e-06, + "loss": 0.5689, + "step": 670 + }, + { + "epoch": 0.5162269880432719, + "grad_norm": 10.721156120300293, + "learning_rate": 3.40587044534413e-06, + "loss": 0.302, + "step": 680 + }, + { + "epoch": 0.5238185613968495, + "grad_norm": 33.98798751831055, + "learning_rate": 3.4564777327935225e-06, + "loss": 0.3699, + "step": 690 + }, + { + "epoch": 0.531410134750427, + "grad_norm": 98.7930908203125, + "learning_rate": 3.5070850202429152e-06, + "loss": 0.5585, + "step": 700 + }, + { + "epoch": 0.5390017081040046, + "grad_norm": 17.008193969726562, + "learning_rate": 3.557692307692308e-06, + "loss": 0.5513, + "step": 710 + }, + { + "epoch": 0.5465932814575821, + "grad_norm": 0.9657185077667236, + "learning_rate": 3.6082995951417003e-06, + "loss": 0.3778, + "step": 720 + }, + { + "epoch": 0.5541848548111596, + "grad_norm": 22.920196533203125, + "learning_rate": 3.6589068825910935e-06, + "loss": 0.2108, + "step": 730 + }, + { + "epoch": 0.5617764281647372, + "grad_norm": 24.24422264099121, + "learning_rate": 3.7095141700404862e-06, + "loss": 0.774, + "step": 740 + }, + { + "epoch": 0.5693680015183147, + "grad_norm": 10.006725311279297, + "learning_rate": 3.7601214574898786e-06, + "loss": 0.3806, + "step": 750 + }, + { + "epoch": 0.5769595748718922, + "grad_norm": 25.408447265625, + "learning_rate": 3.8107287449392717e-06, + "loss": 0.3539, + "step": 760 + }, + { + "epoch": 0.5845511482254697, + "grad_norm": 1.4603581428527832, + "learning_rate": 3.8613360323886645e-06, + "loss": 0.2608, + "step": 770 + }, + { + "epoch": 0.5921427215790472, + "grad_norm": 16.798980712890625, + "learning_rate": 3.911943319838057e-06, + "loss": 0.3287, + "step": 780 + }, + { + "epoch": 0.5997342949326248, + "grad_norm": 11.706854820251465, + "learning_rate": 3.96255060728745e-06, + "loss": 0.5302, + "step": 790 + }, + { + "epoch": 0.6073258682862023, + "grad_norm": 20.42545509338379, + "learning_rate": 4.013157894736842e-06, + "loss": 0.489, + "step": 800 + }, + { + "epoch": 0.6149174416397798, + "grad_norm": 44.284629821777344, + "learning_rate": 4.063765182186235e-06, + "loss": 0.4183, + "step": 810 + }, + { + "epoch": 0.6225090149933574, + "grad_norm": 35.91806411743164, + "learning_rate": 4.114372469635628e-06, + "loss": 0.6323, + "step": 820 + }, + { + "epoch": 0.6301005883469349, + "grad_norm": 5.10564661026001, + "learning_rate": 4.16497975708502e-06, + "loss": 0.2137, + "step": 830 + }, + { + "epoch": 0.6376921617005125, + "grad_norm": 14.327881813049316, + "learning_rate": 4.215587044534413e-06, + "loss": 0.1283, + "step": 840 + }, + { + "epoch": 0.64528373505409, + "grad_norm": 0.4119018018245697, + "learning_rate": 4.2661943319838065e-06, + "loss": 0.5361, + "step": 850 + }, + { + "epoch": 0.6528753084076675, + "grad_norm": 0.33248305320739746, + "learning_rate": 4.316801619433199e-06, + "loss": 0.3669, + "step": 860 + }, + { + "epoch": 0.660466881761245, + "grad_norm": 0.44110462069511414, + "learning_rate": 4.367408906882591e-06, + "loss": 0.2979, + "step": 870 + }, + { + "epoch": 0.6680584551148225, + "grad_norm": 0.34030860662460327, + "learning_rate": 4.418016194331984e-06, + "loss": 0.2611, + "step": 880 + }, + { + "epoch": 0.6756500284684, + "grad_norm": 46.19267272949219, + "learning_rate": 4.468623481781377e-06, + "loss": 0.2948, + "step": 890 + }, + { + "epoch": 0.6832416018219776, + "grad_norm": 33.486717224121094, + "learning_rate": 4.51923076923077e-06, + "loss": 0.2593, + "step": 900 + }, + { + "epoch": 0.6908331751755551, + "grad_norm": 43.04954528808594, + "learning_rate": 4.569838056680162e-06, + "loss": 0.2791, + "step": 910 + }, + { + "epoch": 0.6984247485291326, + "grad_norm": 57.075809478759766, + "learning_rate": 4.6204453441295545e-06, + "loss": 0.2198, + "step": 920 + }, + { + "epoch": 0.7060163218827102, + "grad_norm": 52.269168853759766, + "learning_rate": 4.671052631578948e-06, + "loss": 0.4377, + "step": 930 + }, + { + "epoch": 0.7136078952362878, + "grad_norm": 0.06886545568704605, + "learning_rate": 4.72165991902834e-06, + "loss": 0.1961, + "step": 940 + }, + { + "epoch": 0.7211994685898653, + "grad_norm": 41.10899353027344, + "learning_rate": 4.772267206477733e-06, + "loss": 0.4847, + "step": 950 + }, + { + "epoch": 0.7287910419434428, + "grad_norm": 2.2750415802001953, + "learning_rate": 4.822874493927126e-06, + "loss": 0.2632, + "step": 960 + }, + { + "epoch": 0.7363826152970203, + "grad_norm": 0.374896764755249, + "learning_rate": 4.873481781376519e-06, + "loss": 0.2243, + "step": 970 + }, + { + "epoch": 0.7439741886505978, + "grad_norm": 29.88395118713379, + "learning_rate": 4.924089068825911e-06, + "loss": 0.256, + "step": 980 + }, + { + "epoch": 0.7515657620041754, + "grad_norm": 48.7998161315918, + "learning_rate": 4.974696356275304e-06, + "loss": 0.2969, + "step": 990 + }, + { + "epoch": 0.7591573353577529, + "grad_norm": 33.62395095825195, + "learning_rate": 5.025303643724697e-06, + "loss": 0.4137, + "step": 1000 + }, + { + "epoch": 0.7667489087113304, + "grad_norm": 37.788795471191406, + "learning_rate": 5.07591093117409e-06, + "loss": 0.2332, + "step": 1010 + }, + { + "epoch": 0.7743404820649079, + "grad_norm": 0.14387387037277222, + "learning_rate": 5.126518218623482e-06, + "loss": 0.2919, + "step": 1020 + }, + { + "epoch": 0.7819320554184854, + "grad_norm": 5.84027099609375, + "learning_rate": 5.177125506072875e-06, + "loss": 0.2009, + "step": 1030 + }, + { + "epoch": 0.7895236287720631, + "grad_norm": 0.04207382723689079, + "learning_rate": 5.227732793522268e-06, + "loss": 0.2878, + "step": 1040 + }, + { + "epoch": 0.7971152021256406, + "grad_norm": 45.870460510253906, + "learning_rate": 5.27834008097166e-06, + "loss": 0.2449, + "step": 1050 + }, + { + "epoch": 0.8047067754792181, + "grad_norm": 51.27897262573242, + "learning_rate": 5.328947368421054e-06, + "loss": 0.2529, + "step": 1060 + }, + { + "epoch": 0.8122983488327956, + "grad_norm": 0.5692594051361084, + "learning_rate": 5.379554655870446e-06, + "loss": 0.6134, + "step": 1070 + }, + { + "epoch": 0.8198899221863731, + "grad_norm": 0.09678292274475098, + "learning_rate": 5.430161943319839e-06, + "loss": 0.2444, + "step": 1080 + }, + { + "epoch": 0.8274814955399507, + "grad_norm": 1.1001735925674438, + "learning_rate": 5.480769230769232e-06, + "loss": 0.2606, + "step": 1090 + }, + { + "epoch": 0.8350730688935282, + "grad_norm": 0.183668851852417, + "learning_rate": 5.531376518218624e-06, + "loss": 0.4214, + "step": 1100 + }, + { + "epoch": 0.8426646422471057, + "grad_norm": 1.1151483058929443, + "learning_rate": 5.5819838056680164e-06, + "loss": 0.1742, + "step": 1110 + }, + { + "epoch": 0.8502562156006832, + "grad_norm": 4.986824989318848, + "learning_rate": 5.632591093117409e-06, + "loss": 0.2275, + "step": 1120 + }, + { + "epoch": 0.8578477889542607, + "grad_norm": 40.5273323059082, + "learning_rate": 5.683198380566802e-06, + "loss": 0.2895, + "step": 1130 + }, + { + "epoch": 0.8654393623078382, + "grad_norm": 27.60036849975586, + "learning_rate": 5.733805668016194e-06, + "loss": 0.1973, + "step": 1140 + }, + { + "epoch": 0.8730309356614159, + "grad_norm": 0.3474140763282776, + "learning_rate": 5.784412955465587e-06, + "loss": 0.2508, + "step": 1150 + }, + { + "epoch": 0.8806225090149934, + "grad_norm": 41.10483169555664, + "learning_rate": 5.835020242914981e-06, + "loss": 0.1403, + "step": 1160 + }, + { + "epoch": 0.8882140823685709, + "grad_norm": 38.43809509277344, + "learning_rate": 5.885627530364373e-06, + "loss": 0.1639, + "step": 1170 + }, + { + "epoch": 0.8958056557221484, + "grad_norm": 0.1039985790848732, + "learning_rate": 5.936234817813765e-06, + "loss": 0.3821, + "step": 1180 + }, + { + "epoch": 0.903397229075726, + "grad_norm": 7.511643409729004, + "learning_rate": 5.9868421052631585e-06, + "loss": 0.3217, + "step": 1190 + }, + { + "epoch": 0.9109888024293035, + "grad_norm": 1.0779646635055542, + "learning_rate": 6.037449392712551e-06, + "loss": 0.306, + "step": 1200 + }, + { + "epoch": 0.918580375782881, + "grad_norm": 36.478607177734375, + "learning_rate": 6.088056680161943e-06, + "loss": 0.2311, + "step": 1210 + }, + { + "epoch": 0.9261719491364585, + "grad_norm": 5.484299659729004, + "learning_rate": 6.138663967611337e-06, + "loss": 0.1957, + "step": 1220 + }, + { + "epoch": 0.933763522490036, + "grad_norm": 36.05448532104492, + "learning_rate": 6.1892712550607295e-06, + "loss": 0.1806, + "step": 1230 + }, + { + "epoch": 0.9413550958436135, + "grad_norm": 0.11734521389007568, + "learning_rate": 6.239878542510122e-06, + "loss": 0.1755, + "step": 1240 + }, + { + "epoch": 0.9489466691971912, + "grad_norm": 29.589279174804688, + "learning_rate": 6.290485829959515e-06, + "loss": 0.1282, + "step": 1250 + }, + { + "epoch": 0.9565382425507687, + "grad_norm": 0.0918528214097023, + "learning_rate": 6.341093117408907e-06, + "loss": 0.1178, + "step": 1260 + }, + { + "epoch": 0.9641298159043462, + "grad_norm": 23.708993911743164, + "learning_rate": 6.3917004048583e-06, + "loss": 0.4097, + "step": 1270 + }, + { + "epoch": 0.9717213892579237, + "grad_norm": 34.241607666015625, + "learning_rate": 6.442307692307693e-06, + "loss": 0.25, + "step": 1280 + }, + { + "epoch": 0.9793129626115012, + "grad_norm": 3.067420482635498, + "learning_rate": 6.492914979757086e-06, + "loss": 0.201, + "step": 1290 + }, + { + "epoch": 0.9869045359650788, + "grad_norm": 20.88066291809082, + "learning_rate": 6.543522267206478e-06, + "loss": 0.2371, + "step": 1300 + }, + { + "epoch": 0.9944961093186563, + "grad_norm": 463.48541259765625, + "learning_rate": 6.5941295546558715e-06, + "loss": 0.436, + "step": 1310 + }, + { + "epoch": 0.9998102106661606, + "eval_f1": 0.8680898011188399, + "eval_loss": 0.3059525787830353, + "eval_precision": 0.8342809981821465, + "eval_recall": 0.8694992412746586, + "eval_runtime": 75.6444, + "eval_samples_per_second": 17.424, + "eval_steps_per_second": 17.424, + "step": 1317 + }, + { + "epoch": 1.0020876826722338, + "grad_norm": 49.61616897583008, + "learning_rate": 6.644736842105264e-06, + "loss": 0.0671, + "step": 1320 + }, + { + "epoch": 1.0096792560258114, + "grad_norm": 0.10375616699457169, + "learning_rate": 6.695344129554656e-06, + "loss": 0.3414, + "step": 1330 + }, + { + "epoch": 1.0172708293793888, + "grad_norm": 1.4322081804275513, + "learning_rate": 6.745951417004049e-06, + "loss": 0.4417, + "step": 1340 + }, + { + "epoch": 1.0248624027329665, + "grad_norm": 33.353389739990234, + "learning_rate": 6.796558704453442e-06, + "loss": 0.2982, + "step": 1350 + }, + { + "epoch": 1.0324539760865439, + "grad_norm": 0.03437357768416405, + "learning_rate": 6.847165991902834e-06, + "loss": 0.1854, + "step": 1360 + }, + { + "epoch": 1.0400455494401215, + "grad_norm": 25.882795333862305, + "learning_rate": 6.897773279352226e-06, + "loss": 0.2332, + "step": 1370 + }, + { + "epoch": 1.047637122793699, + "grad_norm": 0.06532780081033707, + "learning_rate": 6.94838056680162e-06, + "loss": 0.0215, + "step": 1380 + }, + { + "epoch": 1.0552286961472765, + "grad_norm": 32.39247131347656, + "learning_rate": 6.998987854251013e-06, + "loss": 0.072, + "step": 1390 + }, + { + "epoch": 1.062820269500854, + "grad_norm": 0.027906494215130806, + "learning_rate": 7.049595141700405e-06, + "loss": 0.2047, + "step": 1400 + }, + { + "epoch": 1.0704118428544316, + "grad_norm": 0.6285625100135803, + "learning_rate": 7.100202429149798e-06, + "loss": 0.1842, + "step": 1410 + }, + { + "epoch": 1.0780034162080092, + "grad_norm": 45.90504837036133, + "learning_rate": 7.1508097165991906e-06, + "loss": 0.3873, + "step": 1420 + }, + { + "epoch": 1.0855949895615866, + "grad_norm": 0.1192048192024231, + "learning_rate": 7.201417004048583e-06, + "loss": 0.0922, + "step": 1430 + }, + { + "epoch": 1.0931865629151643, + "grad_norm": 0.01541421003639698, + "learning_rate": 7.252024291497977e-06, + "loss": 0.2405, + "step": 1440 + }, + { + "epoch": 1.1007781362687417, + "grad_norm": 1.1560391187667847, + "learning_rate": 7.302631578947369e-06, + "loss": 0.127, + "step": 1450 + }, + { + "epoch": 1.1083697096223193, + "grad_norm": 0.03278697654604912, + "learning_rate": 7.353238866396762e-06, + "loss": 0.1782, + "step": 1460 + }, + { + "epoch": 1.1159612829758967, + "grad_norm": 0.019922947511076927, + "learning_rate": 7.403846153846155e-06, + "loss": 0.0464, + "step": 1470 + }, + { + "epoch": 1.1235528563294743, + "grad_norm": 0.06464574486017227, + "learning_rate": 7.454453441295547e-06, + "loss": 0.1965, + "step": 1480 + }, + { + "epoch": 1.1311444296830517, + "grad_norm": 3.4782345294952393, + "learning_rate": 7.505060728744939e-06, + "loss": 0.2716, + "step": 1490 + }, + { + "epoch": 1.1387360030366294, + "grad_norm": 11.709443092346191, + "learning_rate": 7.555668016194333e-06, + "loss": 0.0861, + "step": 1500 + }, + { + "epoch": 1.146327576390207, + "grad_norm": 0.06341992318630219, + "learning_rate": 7.606275303643725e-06, + "loss": 0.2737, + "step": 1510 + }, + { + "epoch": 1.1539191497437844, + "grad_norm": 0.41259318590164185, + "learning_rate": 7.656882591093118e-06, + "loss": 0.0553, + "step": 1520 + }, + { + "epoch": 1.1615107230973618, + "grad_norm": 0.2754802107810974, + "learning_rate": 7.70748987854251e-06, + "loss": 0.1928, + "step": 1530 + }, + { + "epoch": 1.1691022964509394, + "grad_norm": 35.12890625, + "learning_rate": 7.758097165991903e-06, + "loss": 0.2975, + "step": 1540 + }, + { + "epoch": 1.176693869804517, + "grad_norm": 0.01568063162267208, + "learning_rate": 7.808704453441295e-06, + "loss": 0.0757, + "step": 1550 + }, + { + "epoch": 1.1842854431580945, + "grad_norm": 63.980228424072266, + "learning_rate": 7.859311740890689e-06, + "loss": 0.2807, + "step": 1560 + }, + { + "epoch": 1.191877016511672, + "grad_norm": 0.28402331471443176, + "learning_rate": 7.909919028340081e-06, + "loss": 0.0808, + "step": 1570 + }, + { + "epoch": 1.1994685898652495, + "grad_norm": 0.028258422389626503, + "learning_rate": 7.960526315789474e-06, + "loss": 0.1397, + "step": 1580 + }, + { + "epoch": 1.2070601632188271, + "grad_norm": 3.0772080421447754, + "learning_rate": 8.011133603238868e-06, + "loss": 0.0761, + "step": 1590 + }, + { + "epoch": 1.2146517365724046, + "grad_norm": 0.37224826216697693, + "learning_rate": 8.06174089068826e-06, + "loss": 0.2265, + "step": 1600 + }, + { + "epoch": 1.2222433099259822, + "grad_norm": 0.02686912938952446, + "learning_rate": 8.112348178137652e-06, + "loss": 0.0467, + "step": 1610 + }, + { + "epoch": 1.2298348832795596, + "grad_norm": 0.040963444858789444, + "learning_rate": 8.162955465587045e-06, + "loss": 0.3815, + "step": 1620 + }, + { + "epoch": 1.2374264566331372, + "grad_norm": 15.119370460510254, + "learning_rate": 8.213562753036439e-06, + "loss": 0.1005, + "step": 1630 + }, + { + "epoch": 1.2450180299867148, + "grad_norm": 35.875064849853516, + "learning_rate": 8.264170040485831e-06, + "loss": 0.3051, + "step": 1640 + }, + { + "epoch": 1.2526096033402923, + "grad_norm": 70.46387481689453, + "learning_rate": 8.314777327935223e-06, + "loss": 0.175, + "step": 1650 + }, + { + "epoch": 1.2602011766938699, + "grad_norm": 0.02256329357624054, + "learning_rate": 8.365384615384616e-06, + "loss": 0.2415, + "step": 1660 + }, + { + "epoch": 1.2677927500474473, + "grad_norm": 35.11568069458008, + "learning_rate": 8.415991902834008e-06, + "loss": 0.2629, + "step": 1670 + }, + { + "epoch": 1.275384323401025, + "grad_norm": 71.48613739013672, + "learning_rate": 8.4665991902834e-06, + "loss": 0.4151, + "step": 1680 + }, + { + "epoch": 1.2829758967546023, + "grad_norm": 78.90449523925781, + "learning_rate": 8.517206477732795e-06, + "loss": 0.1292, + "step": 1690 + }, + { + "epoch": 1.29056747010818, + "grad_norm": 31.373775482177734, + "learning_rate": 8.567813765182187e-06, + "loss": 0.26, + "step": 1700 + }, + { + "epoch": 1.2981590434617574, + "grad_norm": 0.049251481890678406, + "learning_rate": 8.61842105263158e-06, + "loss": 0.4099, + "step": 1710 + }, + { + "epoch": 1.305750616815335, + "grad_norm": 26.275672912597656, + "learning_rate": 8.669028340080973e-06, + "loss": 0.1674, + "step": 1720 + }, + { + "epoch": 1.3133421901689126, + "grad_norm": 56.808570861816406, + "learning_rate": 8.719635627530366e-06, + "loss": 0.2071, + "step": 1730 + }, + { + "epoch": 1.32093376352249, + "grad_norm": 12.969684600830078, + "learning_rate": 8.770242914979758e-06, + "loss": 0.3515, + "step": 1740 + }, + { + "epoch": 1.3285253368760674, + "grad_norm": 0.2686771750450134, + "learning_rate": 8.82085020242915e-06, + "loss": 0.128, + "step": 1750 + }, + { + "epoch": 1.336116910229645, + "grad_norm": 0.012039333581924438, + "learning_rate": 8.871457489878543e-06, + "loss": 0.1058, + "step": 1760 + }, + { + "epoch": 1.3437084835832227, + "grad_norm": 20.223878860473633, + "learning_rate": 8.922064777327935e-06, + "loss": 0.196, + "step": 1770 + }, + { + "epoch": 1.3513000569368, + "grad_norm": 0.014049122110009193, + "learning_rate": 8.972672064777329e-06, + "loss": 0.3733, + "step": 1780 + }, + { + "epoch": 1.3588916302903777, + "grad_norm": 42.03798294067383, + "learning_rate": 9.023279352226721e-06, + "loss": 0.1683, + "step": 1790 + }, + { + "epoch": 1.3664832036439551, + "grad_norm": 0.044906727969646454, + "learning_rate": 9.073886639676114e-06, + "loss": 0.2116, + "step": 1800 + }, + { + "epoch": 1.3740747769975328, + "grad_norm": 33.70309829711914, + "learning_rate": 9.124493927125508e-06, + "loss": 0.3049, + "step": 1810 + }, + { + "epoch": 1.3816663503511104, + "grad_norm": 8.82701301574707, + "learning_rate": 9.1751012145749e-06, + "loss": 0.0822, + "step": 1820 + }, + { + "epoch": 1.3892579237046878, + "grad_norm": 2.3878729343414307, + "learning_rate": 9.225708502024292e-06, + "loss": 0.0592, + "step": 1830 + }, + { + "epoch": 1.3968494970582652, + "grad_norm": 0.0016124140238389373, + "learning_rate": 9.276315789473686e-06, + "loss": 0.1208, + "step": 1840 + }, + { + "epoch": 1.4044410704118429, + "grad_norm": 0.13426095247268677, + "learning_rate": 9.326923076923079e-06, + "loss": 0.2488, + "step": 1850 + }, + { + "epoch": 1.4120326437654205, + "grad_norm": 95.8023681640625, + "learning_rate": 9.377530364372471e-06, + "loss": 0.3505, + "step": 1860 + }, + { + "epoch": 1.4196242171189979, + "grad_norm": 0.10023036599159241, + "learning_rate": 9.428137651821863e-06, + "loss": 0.2593, + "step": 1870 + }, + { + "epoch": 1.4272157904725755, + "grad_norm": 0.0036512434016913176, + "learning_rate": 9.478744939271256e-06, + "loss": 0.1653, + "step": 1880 + }, + { + "epoch": 1.434807363826153, + "grad_norm": 0.11651404201984406, + "learning_rate": 9.529352226720648e-06, + "loss": 0.1281, + "step": 1890 + }, + { + "epoch": 1.4423989371797306, + "grad_norm": 99.45907592773438, + "learning_rate": 9.57995951417004e-06, + "loss": 0.2001, + "step": 1900 + }, + { + "epoch": 1.449990510533308, + "grad_norm": 0.42387983202934265, + "learning_rate": 9.630566801619434e-06, + "loss": 0.2895, + "step": 1910 + }, + { + "epoch": 1.4575820838868856, + "grad_norm": 155.79856872558594, + "learning_rate": 9.681174089068827e-06, + "loss": 0.2749, + "step": 1920 + }, + { + "epoch": 1.465173657240463, + "grad_norm": 0.036998867988586426, + "learning_rate": 9.731781376518219e-06, + "loss": 0.3386, + "step": 1930 + }, + { + "epoch": 1.4727652305940406, + "grad_norm": 20.147798538208008, + "learning_rate": 9.782388663967613e-06, + "loss": 0.0259, + "step": 1940 + }, + { + "epoch": 1.4803568039476183, + "grad_norm": 0.6697649955749512, + "learning_rate": 9.832995951417005e-06, + "loss": 0.0671, + "step": 1950 + }, + { + "epoch": 1.4879483773011957, + "grad_norm": 34.21855545043945, + "learning_rate": 9.883603238866398e-06, + "loss": 0.4116, + "step": 1960 + }, + { + "epoch": 1.495539950654773, + "grad_norm": 55.607818603515625, + "learning_rate": 9.93421052631579e-06, + "loss": 0.2809, + "step": 1970 + }, + { + "epoch": 1.5031315240083507, + "grad_norm": 7.255304336547852, + "learning_rate": 9.984817813765182e-06, + "loss": 0.2086, + "step": 1980 + }, + { + "epoch": 1.5107230973619283, + "grad_norm": 0.03336051478981972, + "learning_rate": 9.999996175090899e-06, + "loss": 0.0513, + "step": 1990 + }, + { + "epoch": 1.5183146707155057, + "grad_norm": 0.016688983887434006, + "learning_rate": 9.999977440856317e-06, + "loss": 0.1644, + "step": 2000 + }, + { + "epoch": 1.5259062440690834, + "grad_norm": 25.093719482421875, + "learning_rate": 9.999943094820354e-06, + "loss": 0.2127, + "step": 2010 + }, + { + "epoch": 1.5334978174226608, + "grad_norm": 28.240819931030273, + "learning_rate": 9.999893137090254e-06, + "loss": 0.2039, + "step": 2020 + }, + { + "epoch": 1.5410893907762384, + "grad_norm": 0.2675958275794983, + "learning_rate": 9.999827567822e-06, + "loss": 0.1192, + "step": 2030 + }, + { + "epoch": 1.548680964129816, + "grad_norm": 0.0035021628718823195, + "learning_rate": 9.999746387220327e-06, + "loss": 0.4307, + "step": 2040 + }, + { + "epoch": 1.5562725374833934, + "grad_norm": 45.449134826660156, + "learning_rate": 9.999649595538705e-06, + "loss": 0.1564, + "step": 2050 + }, + { + "epoch": 1.5638641108369709, + "grad_norm": 28.17760467529297, + "learning_rate": 9.999537193079362e-06, + "loss": 0.3947, + "step": 2060 + }, + { + "epoch": 1.5714556841905485, + "grad_norm": 0.08233608305454254, + "learning_rate": 9.999409180193255e-06, + "loss": 0.2997, + "step": 2070 + }, + { + "epoch": 1.5790472575441261, + "grad_norm": 0.010642267763614655, + "learning_rate": 9.99926555728009e-06, + "loss": 0.0658, + "step": 2080 + }, + { + "epoch": 1.5866388308977035, + "grad_norm": 33.69260787963867, + "learning_rate": 9.999106324788313e-06, + "loss": 0.2578, + "step": 2090 + }, + { + "epoch": 1.594230404251281, + "grad_norm": 35.530982971191406, + "learning_rate": 9.998931483215103e-06, + "loss": 0.0085, + "step": 2100 + }, + { + "epoch": 1.6018219776048586, + "grad_norm": 0.02198372408747673, + "learning_rate": 9.998741033106385e-06, + "loss": 0.1038, + "step": 2110 + }, + { + "epoch": 1.6094135509584362, + "grad_norm": 3.9551048278808594, + "learning_rate": 9.998534975056814e-06, + "loss": 0.1167, + "step": 2120 + }, + { + "epoch": 1.6170051243120138, + "grad_norm": 1.1452088356018066, + "learning_rate": 9.998313309709782e-06, + "loss": 0.1636, + "step": 2130 + }, + { + "epoch": 1.6245966976655912, + "grad_norm": 45.56749725341797, + "learning_rate": 9.998076037757408e-06, + "loss": 0.2347, + "step": 2140 + }, + { + "epoch": 1.6321882710191686, + "grad_norm": 0.002319494029507041, + "learning_rate": 9.997823159940545e-06, + "loss": 0.0795, + "step": 2150 + }, + { + "epoch": 1.6397798443727463, + "grad_norm": 0.028734903782606125, + "learning_rate": 9.997554677048776e-06, + "loss": 0.2305, + "step": 2160 + }, + { + "epoch": 1.647371417726324, + "grad_norm": 0.004517258144915104, + "learning_rate": 9.997270589920399e-06, + "loss": 0.0011, + "step": 2170 + }, + { + "epoch": 1.6549629910799013, + "grad_norm": 1.5917277336120605, + "learning_rate": 9.996970899442444e-06, + "loss": 0.1614, + "step": 2180 + }, + { + "epoch": 1.6625545644334787, + "grad_norm": 0.05392596498131752, + "learning_rate": 9.996655606550657e-06, + "loss": 0.2937, + "step": 2190 + }, + { + "epoch": 1.6701461377870563, + "grad_norm": 39.229007720947266, + "learning_rate": 9.996324712229499e-06, + "loss": 0.1227, + "step": 2200 + }, + { + "epoch": 1.677737711140634, + "grad_norm": 19.827287673950195, + "learning_rate": 9.995978217512146e-06, + "loss": 0.1703, + "step": 2210 + }, + { + "epoch": 1.6853292844942114, + "grad_norm": 0.007869013585150242, + "learning_rate": 9.995616123480485e-06, + "loss": 0.298, + "step": 2220 + }, + { + "epoch": 1.692920857847789, + "grad_norm": 17.308448791503906, + "learning_rate": 9.99523843126511e-06, + "loss": 0.2699, + "step": 2230 + }, + { + "epoch": 1.7005124312013664, + "grad_norm": 0.07290565222501755, + "learning_rate": 9.994845142045315e-06, + "loss": 0.0798, + "step": 2240 + }, + { + "epoch": 1.708104004554944, + "grad_norm": 0.0642884150147438, + "learning_rate": 9.994436257049098e-06, + "loss": 0.3115, + "step": 2250 + }, + { + "epoch": 1.7156955779085217, + "grad_norm": 3.773754835128784, + "learning_rate": 9.994011777553152e-06, + "loss": 0.1151, + "step": 2260 + }, + { + "epoch": 1.723287151262099, + "grad_norm": 12.578306198120117, + "learning_rate": 9.99357170488286e-06, + "loss": 0.2351, + "step": 2270 + }, + { + "epoch": 1.7308787246156765, + "grad_norm": 0.12735772132873535, + "learning_rate": 9.993116040412289e-06, + "loss": 0.2368, + "step": 2280 + }, + { + "epoch": 1.7384702979692541, + "grad_norm": 37.49304962158203, + "learning_rate": 9.9926447855642e-06, + "loss": 0.1451, + "step": 2290 + }, + { + "epoch": 1.7460618713228317, + "grad_norm": 7.337117671966553, + "learning_rate": 9.992157941810027e-06, + "loss": 0.2029, + "step": 2300 + }, + { + "epoch": 1.7536534446764092, + "grad_norm": 93.44843292236328, + "learning_rate": 9.991655510669875e-06, + "loss": 0.2177, + "step": 2310 + }, + { + "epoch": 1.7612450180299866, + "grad_norm": 6.563670635223389, + "learning_rate": 9.991137493712524e-06, + "loss": 0.0768, + "step": 2320 + }, + { + "epoch": 1.7688365913835642, + "grad_norm": 0.021621128544211388, + "learning_rate": 9.990603892555417e-06, + "loss": 0.1178, + "step": 2330 + }, + { + "epoch": 1.7764281647371418, + "grad_norm": 0.022252781316637993, + "learning_rate": 9.990054708864655e-06, + "loss": 0.1944, + "step": 2340 + }, + { + "epoch": 1.7840197380907195, + "grad_norm": 21.766817092895508, + "learning_rate": 9.989489944355e-06, + "loss": 0.355, + "step": 2350 + }, + { + "epoch": 1.7916113114442969, + "grad_norm": 0.05736351013183594, + "learning_rate": 9.988909600789851e-06, + "loss": 0.1318, + "step": 2360 + }, + { + "epoch": 1.7992028847978743, + "grad_norm": 44.977779388427734, + "learning_rate": 9.988313679981263e-06, + "loss": 0.0222, + "step": 2370 + }, + { + "epoch": 1.806794458151452, + "grad_norm": 0.016255084425210953, + "learning_rate": 9.987702183789922e-06, + "loss": 0.1285, + "step": 2380 + }, + { + "epoch": 1.8143860315050295, + "grad_norm": 0.5945267081260681, + "learning_rate": 9.987075114125148e-06, + "loss": 0.3838, + "step": 2390 + }, + { + "epoch": 1.821977604858607, + "grad_norm": 0.004704204387962818, + "learning_rate": 9.986432472944887e-06, + "loss": 0.1587, + "step": 2400 + }, + { + "epoch": 1.8295691782121843, + "grad_norm": 0.07433657348155975, + "learning_rate": 9.985774262255708e-06, + "loss": 0.1604, + "step": 2410 + }, + { + "epoch": 1.837160751565762, + "grad_norm": 0.08134903013706207, + "learning_rate": 9.985100484112786e-06, + "loss": 0.2395, + "step": 2420 + }, + { + "epoch": 1.8447523249193396, + "grad_norm": 0.5896629095077515, + "learning_rate": 9.984411140619914e-06, + "loss": 0.0397, + "step": 2430 + }, + { + "epoch": 1.852343898272917, + "grad_norm": 0.0015955844428390265, + "learning_rate": 9.983706233929477e-06, + "loss": 0.2479, + "step": 2440 + }, + { + "epoch": 1.8599354716264946, + "grad_norm": 12.32898998260498, + "learning_rate": 9.982985766242458e-06, + "loss": 0.071, + "step": 2450 + }, + { + "epoch": 1.867527044980072, + "grad_norm": 0.17913532257080078, + "learning_rate": 9.98224973980843e-06, + "loss": 0.0426, + "step": 2460 + }, + { + "epoch": 1.8751186183336497, + "grad_norm": 0.06611054390668869, + "learning_rate": 9.981498156925539e-06, + "loss": 0.3534, + "step": 2470 + }, + { + "epoch": 1.8827101916872273, + "grad_norm": 2.170029640197754, + "learning_rate": 9.98073101994051e-06, + "loss": 0.1845, + "step": 2480 + }, + { + "epoch": 1.8903017650408047, + "grad_norm": 3.257478952407837, + "learning_rate": 9.979948331248633e-06, + "loss": 0.0038, + "step": 2490 + }, + { + "epoch": 1.8978933383943821, + "grad_norm": 138.6713409423828, + "learning_rate": 9.979150093293753e-06, + "loss": 0.1855, + "step": 2500 + }, + { + "epoch": 1.9054849117479598, + "grad_norm": 0.7939999103546143, + "learning_rate": 9.978336308568266e-06, + "loss": 0.2101, + "step": 2510 + }, + { + "epoch": 1.9130764851015374, + "grad_norm": 14.836468696594238, + "learning_rate": 9.977506979613118e-06, + "loss": 0.2692, + "step": 2520 + }, + { + "epoch": 1.9206680584551148, + "grad_norm": 0.4420275390148163, + "learning_rate": 9.97666210901778e-06, + "loss": 0.0356, + "step": 2530 + }, + { + "epoch": 1.9282596318086922, + "grad_norm": 4.923569679260254, + "learning_rate": 9.975801699420256e-06, + "loss": 0.1263, + "step": 2540 + }, + { + "epoch": 1.9358512051622698, + "grad_norm": 0.01419526245445013, + "learning_rate": 9.974925753507066e-06, + "loss": 0.0735, + "step": 2550 + }, + { + "epoch": 1.9434427785158475, + "grad_norm": 72.68999481201172, + "learning_rate": 9.974034274013242e-06, + "loss": 0.0418, + "step": 2560 + }, + { + "epoch": 1.951034351869425, + "grad_norm": 0.0027209515683352947, + "learning_rate": 9.973127263722317e-06, + "loss": 0.0042, + "step": 2570 + }, + { + "epoch": 1.9586259252230025, + "grad_norm": 0.015417971648275852, + "learning_rate": 9.972204725466316e-06, + "loss": 0.2174, + "step": 2580 + }, + { + "epoch": 1.96621749857658, + "grad_norm": 0.013561515137553215, + "learning_rate": 9.971266662125749e-06, + "loss": 0.0808, + "step": 2590 + }, + { + "epoch": 1.9738090719301575, + "grad_norm": 72.93014526367188, + "learning_rate": 9.9703130766296e-06, + "loss": 0.2353, + "step": 2600 + }, + { + "epoch": 1.9814006452837352, + "grad_norm": 0.028727278113365173, + "learning_rate": 9.96934397195532e-06, + "loss": 0.0344, + "step": 2610 + }, + { + "epoch": 1.9889922186373126, + "grad_norm": 6.5093770027160645, + "learning_rate": 9.96835935112882e-06, + "loss": 0.3215, + "step": 2620 + }, + { + "epoch": 1.99658379199089, + "grad_norm": 45.58213806152344, + "learning_rate": 9.96735921722445e-06, + "loss": 0.4849, + "step": 2630 + }, + { + "epoch": 1.9996204213323212, + "eval_f1": 0.9241639816476168, + "eval_loss": 0.15625236928462982, + "eval_precision": 0.9250778152019562, + "eval_recall": 0.9241274658573596, + "eval_runtime": 75.5915, + "eval_samples_per_second": 17.436, + "eval_steps_per_second": 17.436, + "step": 2634 + }, + { + "epoch": 2.0041753653444676, + "grad_norm": 0.07899657636880875, + "learning_rate": 9.966343573365005e-06, + "loss": 0.0937, + "step": 2640 + }, + { + "epoch": 2.0117669386980452, + "grad_norm": 1.1364494562149048, + "learning_rate": 9.965312422721705e-06, + "loss": 0.0372, + "step": 2650 + }, + { + "epoch": 2.019358512051623, + "grad_norm": 0.009463181719183922, + "learning_rate": 9.964265768514189e-06, + "loss": 0.1315, + "step": 2660 + }, + { + "epoch": 2.0269500854052, + "grad_norm": 152.41160583496094, + "learning_rate": 9.963203614010502e-06, + "loss": 0.1601, + "step": 2670 + }, + { + "epoch": 2.0345416587587777, + "grad_norm": 12.7033109664917, + "learning_rate": 9.962125962527088e-06, + "loss": 0.1492, + "step": 2680 + }, + { + "epoch": 2.0421332321123553, + "grad_norm": 0.1103023886680603, + "learning_rate": 9.961032817428779e-06, + "loss": 0.044, + "step": 2690 + }, + { + "epoch": 2.049724805465933, + "grad_norm": 0.04437507316470146, + "learning_rate": 9.959924182128784e-06, + "loss": 0.2004, + "step": 2700 + }, + { + "epoch": 2.05731637881951, + "grad_norm": 0.016279350966215134, + "learning_rate": 9.958800060088675e-06, + "loss": 0.0789, + "step": 2710 + }, + { + "epoch": 2.0649079521730878, + "grad_norm": 0.06195428967475891, + "learning_rate": 9.957660454818385e-06, + "loss": 0.1212, + "step": 2720 + }, + { + "epoch": 2.0724995255266654, + "grad_norm": 0.07117705792188644, + "learning_rate": 9.956505369876187e-06, + "loss": 0.1124, + "step": 2730 + }, + { + "epoch": 2.080091098880243, + "grad_norm": 0.0017620900180190802, + "learning_rate": 9.955334808868686e-06, + "loss": 0.2135, + "step": 2740 + }, + { + "epoch": 2.0876826722338206, + "grad_norm": 0.0784306600689888, + "learning_rate": 9.954148775450816e-06, + "loss": 0.0047, + "step": 2750 + }, + { + "epoch": 2.095274245587398, + "grad_norm": 0.014996266923844814, + "learning_rate": 9.952947273325815e-06, + "loss": 0.0063, + "step": 2760 + }, + { + "epoch": 2.1028658189409755, + "grad_norm": 3.2599010467529297, + "learning_rate": 9.951730306245222e-06, + "loss": 0.1602, + "step": 2770 + }, + { + "epoch": 2.110457392294553, + "grad_norm": 0.016863863915205002, + "learning_rate": 9.950497878008865e-06, + "loss": 0.0317, + "step": 2780 + }, + { + "epoch": 2.1180489656481307, + "grad_norm": 15.340392112731934, + "learning_rate": 9.949249992464847e-06, + "loss": 0.154, + "step": 2790 + }, + { + "epoch": 2.125640539001708, + "grad_norm": 4.341642379760742, + "learning_rate": 9.947986653509531e-06, + "loss": 0.0257, + "step": 2800 + }, + { + "epoch": 2.1332321123552855, + "grad_norm": 1.8507261276245117, + "learning_rate": 9.946707865087538e-06, + "loss": 0.1434, + "step": 2810 + }, + { + "epoch": 2.140823685708863, + "grad_norm": 0.16088451445102692, + "learning_rate": 9.94541363119172e-06, + "loss": 0.0837, + "step": 2820 + }, + { + "epoch": 2.148415259062441, + "grad_norm": 0.689831018447876, + "learning_rate": 9.944103955863162e-06, + "loss": 0.4116, + "step": 2830 + }, + { + "epoch": 2.1560068324160184, + "grad_norm": 1.8963958024978638, + "learning_rate": 9.94277884319116e-06, + "loss": 0.1837, + "step": 2840 + }, + { + "epoch": 2.1635984057695956, + "grad_norm": 0.024928750470280647, + "learning_rate": 9.941438297313215e-06, + "loss": 0.0743, + "step": 2850 + }, + { + "epoch": 2.1711899791231732, + "grad_norm": 0.006995880510658026, + "learning_rate": 9.940082322415008e-06, + "loss": 0.0001, + "step": 2860 + }, + { + "epoch": 2.178781552476751, + "grad_norm": 78.55364227294922, + "learning_rate": 9.938710922730404e-06, + "loss": 0.1252, + "step": 2870 + }, + { + "epoch": 2.1863731258303285, + "grad_norm": 0.013810686767101288, + "learning_rate": 9.937324102541424e-06, + "loss": 0.0243, + "step": 2880 + }, + { + "epoch": 2.1939646991839057, + "grad_norm": 0.007164845243096352, + "learning_rate": 9.935921866178242e-06, + "loss": 0.0583, + "step": 2890 + }, + { + "epoch": 2.2015562725374833, + "grad_norm": 0.0043396539986133575, + "learning_rate": 9.934504218019161e-06, + "loss": 0.0862, + "step": 2900 + }, + { + "epoch": 2.209147845891061, + "grad_norm": 0.007671877276152372, + "learning_rate": 9.933071162490613e-06, + "loss": 0.0016, + "step": 2910 + }, + { + "epoch": 2.2167394192446386, + "grad_norm": 327.8991394042969, + "learning_rate": 9.931622704067133e-06, + "loss": 0.1624, + "step": 2920 + }, + { + "epoch": 2.224330992598216, + "grad_norm": 16.11570167541504, + "learning_rate": 9.93015884727135e-06, + "loss": 0.2645, + "step": 2930 + }, + { + "epoch": 2.2319225659517934, + "grad_norm": 0.005082719959318638, + "learning_rate": 9.928679596673974e-06, + "loss": 0.0002, + "step": 2940 + }, + { + "epoch": 2.239514139305371, + "grad_norm": 0.01941937580704689, + "learning_rate": 9.927184956893778e-06, + "loss": 0.0612, + "step": 2950 + }, + { + "epoch": 2.2471057126589487, + "grad_norm": 19.174551010131836, + "learning_rate": 9.925674932597586e-06, + "loss": 0.2042, + "step": 2960 + }, + { + "epoch": 2.2546972860125263, + "grad_norm": 21.23321533203125, + "learning_rate": 9.924149528500259e-06, + "loss": 0.0703, + "step": 2970 + }, + { + "epoch": 2.2622888593661035, + "grad_norm": 0.11990063637495041, + "learning_rate": 9.922608749364684e-06, + "loss": 0.1142, + "step": 2980 + }, + { + "epoch": 2.269880432719681, + "grad_norm": 0.1152704656124115, + "learning_rate": 9.921052600001746e-06, + "loss": 0.102, + "step": 2990 + }, + { + "epoch": 2.2774720060732587, + "grad_norm": 36.8327751159668, + "learning_rate": 9.919481085270328e-06, + "loss": 0.0215, + "step": 3000 + }, + { + "epoch": 2.2850635794268364, + "grad_norm": 0.06316674500703812, + "learning_rate": 9.917894210077285e-06, + "loss": 0.1024, + "step": 3010 + }, + { + "epoch": 2.292655152780414, + "grad_norm": 0.04541470482945442, + "learning_rate": 9.916291979377436e-06, + "loss": 0.21, + "step": 3020 + }, + { + "epoch": 2.300246726133991, + "grad_norm": 2.5551743507385254, + "learning_rate": 9.914674398173548e-06, + "loss": 0.0009, + "step": 3030 + }, + { + "epoch": 2.307838299487569, + "grad_norm": 0.0514085479080677, + "learning_rate": 9.913041471516311e-06, + "loss": 0.0674, + "step": 3040 + }, + { + "epoch": 2.3154298728411464, + "grad_norm": 0.09069258719682693, + "learning_rate": 9.911393204504339e-06, + "loss": 0.1548, + "step": 3050 + }, + { + "epoch": 2.3230214461947236, + "grad_norm": 0.0353839211165905, + "learning_rate": 9.909729602284131e-06, + "loss": 0.1214, + "step": 3060 + }, + { + "epoch": 2.3306130195483012, + "grad_norm": 0.006493726279586554, + "learning_rate": 9.908050670050081e-06, + "loss": 0.0039, + "step": 3070 + }, + { + "epoch": 2.338204592901879, + "grad_norm": 0.009368511848151684, + "learning_rate": 9.906356413044443e-06, + "loss": 0.0779, + "step": 3080 + }, + { + "epoch": 2.3457961662554565, + "grad_norm": 0.011731524951756, + "learning_rate": 9.90464683655732e-06, + "loss": 0.1077, + "step": 3090 + }, + { + "epoch": 2.353387739609034, + "grad_norm": 63.11314392089844, + "learning_rate": 9.902921945926653e-06, + "loss": 0.2824, + "step": 3100 + }, + { + "epoch": 2.3609793129626113, + "grad_norm": 0.0035196368116885424, + "learning_rate": 9.901181746538196e-06, + "loss": 0.0024, + "step": 3110 + }, + { + "epoch": 2.368570886316189, + "grad_norm": 0.0374101847410202, + "learning_rate": 9.8994262438255e-06, + "loss": 0.012, + "step": 3120 + }, + { + "epoch": 2.3761624596697666, + "grad_norm": 16.60328483581543, + "learning_rate": 9.897833211571187e-06, + "loss": 0.466, + "step": 3130 + }, + { + "epoch": 2.383754033023344, + "grad_norm": 21.628568649291992, + "learning_rate": 9.896048647683e-06, + "loss": 0.1202, + "step": 3140 + }, + { + "epoch": 2.3913456063769214, + "grad_norm": 15.491986274719238, + "learning_rate": 9.894248796498034e-06, + "loss": 0.056, + "step": 3150 + }, + { + "epoch": 2.398937179730499, + "grad_norm": 0.009366643615067005, + "learning_rate": 9.892433663636095e-06, + "loss": 0.0003, + "step": 3160 + }, + { + "epoch": 2.4065287530840767, + "grad_norm": 75.25447082519531, + "learning_rate": 9.890603254764708e-06, + "loss": 0.1785, + "step": 3170 + }, + { + "epoch": 2.4141203264376543, + "grad_norm": 0.010000905022025108, + "learning_rate": 9.888757575599095e-06, + "loss": 0.1125, + "step": 3180 + }, + { + "epoch": 2.421711899791232, + "grad_norm": 0.21319662034511566, + "learning_rate": 9.886896631902156e-06, + "loss": 0.0575, + "step": 3190 + }, + { + "epoch": 2.429303473144809, + "grad_norm": 6.481915473937988, + "learning_rate": 9.885020429484457e-06, + "loss": 0.2689, + "step": 3200 + }, + { + "epoch": 2.4368950464983867, + "grad_norm": 0.20284566283226013, + "learning_rate": 9.8831289742042e-06, + "loss": 0.149, + "step": 3210 + }, + { + "epoch": 2.4444866198519644, + "grad_norm": 1.3910574913024902, + "learning_rate": 9.881222271967224e-06, + "loss": 0.0142, + "step": 3220 + }, + { + "epoch": 2.452078193205542, + "grad_norm": 0.09682253748178482, + "learning_rate": 9.879300328726958e-06, + "loss": 0.0021, + "step": 3230 + }, + { + "epoch": 2.459669766559119, + "grad_norm": 0.005042471457272768, + "learning_rate": 9.877363150484434e-06, + "loss": 0.2168, + "step": 3240 + }, + { + "epoch": 2.467261339912697, + "grad_norm": 64.47718811035156, + "learning_rate": 9.875410743288246e-06, + "loss": 0.1994, + "step": 3250 + }, + { + "epoch": 2.4748529132662744, + "grad_norm": 0.2548009753227234, + "learning_rate": 9.873443113234541e-06, + "loss": 0.2271, + "step": 3260 + }, + { + "epoch": 2.482444486619852, + "grad_norm": 0.008805714547634125, + "learning_rate": 9.871460266466996e-06, + "loss": 0.0827, + "step": 3270 + }, + { + "epoch": 2.4900360599734297, + "grad_norm": 0.05888598784804344, + "learning_rate": 9.8694622091768e-06, + "loss": 0.054, + "step": 3280 + }, + { + "epoch": 2.497627633327007, + "grad_norm": 0.004817333538085222, + "learning_rate": 9.867448947602637e-06, + "loss": 0.105, + "step": 3290 + }, + { + "epoch": 2.5052192066805845, + "grad_norm": 0.04850906506180763, + "learning_rate": 9.865420488030664e-06, + "loss": 0.2363, + "step": 3300 + }, + { + "epoch": 2.512810780034162, + "grad_norm": 0.14938922226428986, + "learning_rate": 9.86337683679449e-06, + "loss": 0.1593, + "step": 3310 + }, + { + "epoch": 2.5204023533877398, + "grad_norm": 18.9013729095459, + "learning_rate": 9.861318000275158e-06, + "loss": 0.2351, + "step": 3320 + }, + { + "epoch": 2.527993926741317, + "grad_norm": 0.025823410600423813, + "learning_rate": 9.85924398490113e-06, + "loss": 0.0022, + "step": 3330 + }, + { + "epoch": 2.5355855000948946, + "grad_norm": 28.33924674987793, + "learning_rate": 9.857154797148255e-06, + "loss": 0.2312, + "step": 3340 + }, + { + "epoch": 2.543177073448472, + "grad_norm": 0.001974069746211171, + "learning_rate": 9.855050443539761e-06, + "loss": 0.0002, + "step": 3350 + }, + { + "epoch": 2.55076864680205, + "grad_norm": 21.997047424316406, + "learning_rate": 9.852930930646228e-06, + "loss": 0.1257, + "step": 3360 + }, + { + "epoch": 2.5583602201556275, + "grad_norm": 0.48950299620628357, + "learning_rate": 9.850796265085567e-06, + "loss": 0.0062, + "step": 3370 + }, + { + "epoch": 2.5659517935092047, + "grad_norm": 8.470258712768555, + "learning_rate": 9.848646453523005e-06, + "loss": 0.0585, + "step": 3380 + }, + { + "epoch": 2.5735433668627823, + "grad_norm": 0.11571002751588821, + "learning_rate": 9.846481502671056e-06, + "loss": 0.0329, + "step": 3390 + }, + { + "epoch": 2.58113494021636, + "grad_norm": 11.877908706665039, + "learning_rate": 9.844301419289511e-06, + "loss": 0.2921, + "step": 3400 + }, + { + "epoch": 2.588726513569937, + "grad_norm": 36.33771896362305, + "learning_rate": 9.842106210185403e-06, + "loss": 0.2223, + "step": 3410 + }, + { + "epoch": 2.5963180869235147, + "grad_norm": 2.979523181915283, + "learning_rate": 9.839895882212997e-06, + "loss": 0.0653, + "step": 3420 + }, + { + "epoch": 2.6039096602770924, + "grad_norm": 0.013308779336512089, + "learning_rate": 9.837670442273768e-06, + "loss": 0.0735, + "step": 3430 + }, + { + "epoch": 2.61150123363067, + "grad_norm": 111.11514282226562, + "learning_rate": 9.835429897316367e-06, + "loss": 0.1495, + "step": 3440 + }, + { + "epoch": 2.6190928069842476, + "grad_norm": 0.007320565637201071, + "learning_rate": 9.833174254336618e-06, + "loss": 0.3018, + "step": 3450 + }, + { + "epoch": 2.6266843803378253, + "grad_norm": 0.010831023566424847, + "learning_rate": 9.830903520377482e-06, + "loss": 0.0203, + "step": 3460 + }, + { + "epoch": 2.6342759536914024, + "grad_norm": 18.389625549316406, + "learning_rate": 9.82861770252904e-06, + "loss": 0.1973, + "step": 3470 + }, + { + "epoch": 2.64186752704498, + "grad_norm": 12.364988327026367, + "learning_rate": 9.826316807928468e-06, + "loss": 0.0988, + "step": 3480 + }, + { + "epoch": 2.6494591003985577, + "grad_norm": 0.0008839545771479607, + "learning_rate": 9.824000843760028e-06, + "loss": 0.0552, + "step": 3490 + }, + { + "epoch": 2.657050673752135, + "grad_norm": 0.028787225484848022, + "learning_rate": 9.821669817255021e-06, + "loss": 0.1918, + "step": 3500 + }, + { + "epoch": 2.6646422471057125, + "grad_norm": 0.007524173706769943, + "learning_rate": 9.819323735691787e-06, + "loss": 0.0056, + "step": 3510 + }, + { + "epoch": 2.67223382045929, + "grad_norm": 7.9602837562561035, + "learning_rate": 9.816962606395668e-06, + "loss": 0.1273, + "step": 3520 + }, + { + "epoch": 2.6798253938128678, + "grad_norm": 15.868315696716309, + "learning_rate": 9.814586436738998e-06, + "loss": 0.0943, + "step": 3530 + }, + { + "epoch": 2.6874169671664454, + "grad_norm": 0.2785890996456146, + "learning_rate": 9.812195234141064e-06, + "loss": 0.1291, + "step": 3540 + }, + { + "epoch": 2.695008540520023, + "grad_norm": 63.62078857421875, + "learning_rate": 9.809789006068097e-06, + "loss": 0.0672, + "step": 3550 + }, + { + "epoch": 2.7026001138736, + "grad_norm": 2.8807220458984375, + "learning_rate": 9.807367760033245e-06, + "loss": 0.217, + "step": 3560 + }, + { + "epoch": 2.710191687227178, + "grad_norm": 36.00885009765625, + "learning_rate": 9.80493150359654e-06, + "loss": 0.1016, + "step": 3570 + }, + { + "epoch": 2.7177832605807555, + "grad_norm": 0.021623503416776657, + "learning_rate": 9.80248024436489e-06, + "loss": 0.2195, + "step": 3580 + }, + { + "epoch": 2.7253748339343327, + "grad_norm": 0.03640507906675339, + "learning_rate": 9.800013989992042e-06, + "loss": 0.0001, + "step": 3590 + }, + { + "epoch": 2.7329664072879103, + "grad_norm": 27.120119094848633, + "learning_rate": 9.797532748178566e-06, + "loss": 0.4964, + "step": 3600 + }, + { + "epoch": 2.740557980641488, + "grad_norm": 0.08877989649772644, + "learning_rate": 9.795036526671828e-06, + "loss": 0.0498, + "step": 3610 + }, + { + "epoch": 2.7481495539950656, + "grad_norm": 0.0727711170911789, + "learning_rate": 9.792525333265965e-06, + "loss": 0.1452, + "step": 3620 + }, + { + "epoch": 2.755741127348643, + "grad_norm": 0.21834716200828552, + "learning_rate": 9.789999175801866e-06, + "loss": 0.1315, + "step": 3630 + }, + { + "epoch": 2.763332700702221, + "grad_norm": 3.933009147644043, + "learning_rate": 9.787458062167135e-06, + "loss": 0.0726, + "step": 3640 + }, + { + "epoch": 2.770924274055798, + "grad_norm": 0.00495730759575963, + "learning_rate": 9.784902000296084e-06, + "loss": 0.0092, + "step": 3650 + }, + { + "epoch": 2.7785158474093756, + "grad_norm": 0.06244872510433197, + "learning_rate": 9.782330998169695e-06, + "loss": 0.2204, + "step": 3660 + }, + { + "epoch": 2.7861074207629533, + "grad_norm": 0.032471269369125366, + "learning_rate": 9.779745063815598e-06, + "loss": 0.0887, + "step": 3670 + }, + { + "epoch": 2.7936989941165304, + "grad_norm": 0.0014243993209674954, + "learning_rate": 9.777144205308049e-06, + "loss": 0.1105, + "step": 3680 + }, + { + "epoch": 2.801290567470108, + "grad_norm": 69.43852233886719, + "learning_rate": 9.774528430767902e-06, + "loss": 0.0603, + "step": 3690 + }, + { + "epoch": 2.8088821408236857, + "grad_norm": 0.06080542132258415, + "learning_rate": 9.771897748362583e-06, + "loss": 0.0163, + "step": 3700 + }, + { + "epoch": 2.8164737141772633, + "grad_norm": 0.09897174686193466, + "learning_rate": 9.769252166306066e-06, + "loss": 0.1167, + "step": 3710 + }, + { + "epoch": 2.824065287530841, + "grad_norm": 0.20604291558265686, + "learning_rate": 9.766591692858854e-06, + "loss": 0.0706, + "step": 3720 + }, + { + "epoch": 2.831656860884418, + "grad_norm": 32.105499267578125, + "learning_rate": 9.763916336327935e-06, + "loss": 0.5321, + "step": 3730 + }, + { + "epoch": 2.8392484342379958, + "grad_norm": 0.00609110202640295, + "learning_rate": 9.761226105066778e-06, + "loss": 0.0794, + "step": 3740 + }, + { + "epoch": 2.8468400075915734, + "grad_norm": 0.14252524077892303, + "learning_rate": 9.75852100747529e-06, + "loss": 0.1037, + "step": 3750 + }, + { + "epoch": 2.854431580945151, + "grad_norm": 0.0007404695497825742, + "learning_rate": 9.7558010519998e-06, + "loss": 0.0552, + "step": 3760 + }, + { + "epoch": 2.8620231542987282, + "grad_norm": 0.007310529239475727, + "learning_rate": 9.753066247133025e-06, + "loss": 0.009, + "step": 3770 + }, + { + "epoch": 2.869614727652306, + "grad_norm": 88.26655578613281, + "learning_rate": 9.750316601414051e-06, + "loss": 0.1008, + "step": 3780 + }, + { + "epoch": 2.8772063010058835, + "grad_norm": 0.01418048795312643, + "learning_rate": 9.7475521234283e-06, + "loss": 0.0262, + "step": 3790 + }, + { + "epoch": 2.884797874359461, + "grad_norm": 0.06487419456243515, + "learning_rate": 9.744772821807509e-06, + "loss": 0.1206, + "step": 3800 + }, + { + "epoch": 2.8923894477130387, + "grad_norm": 0.0070535228587687016, + "learning_rate": 9.741978705229697e-06, + "loss": 0.0897, + "step": 3810 + }, + { + "epoch": 2.899981021066616, + "grad_norm": 1.5489246845245361, + "learning_rate": 9.739169782419143e-06, + "loss": 0.0008, + "step": 3820 + }, + { + "epoch": 2.9075725944201936, + "grad_norm": 0.001165062771178782, + "learning_rate": 9.736346062146356e-06, + "loss": 0.0239, + "step": 3830 + }, + { + "epoch": 2.915164167773771, + "grad_norm": 0.0013667664024978876, + "learning_rate": 9.733507553228045e-06, + "loss": 0.0017, + "step": 3840 + }, + { + "epoch": 2.9227557411273484, + "grad_norm": 0.004272387828677893, + "learning_rate": 9.7306542645271e-06, + "loss": 0.1874, + "step": 3850 + }, + { + "epoch": 2.930347314480926, + "grad_norm": 0.032470703125, + "learning_rate": 9.727786204952554e-06, + "loss": 0.0128, + "step": 3860 + }, + { + "epoch": 2.9379388878345036, + "grad_norm": 0.010683764703571796, + "learning_rate": 9.724903383459566e-06, + "loss": 0.064, + "step": 3870 + }, + { + "epoch": 2.9455304611880813, + "grad_norm": 11.981929779052734, + "learning_rate": 9.722005809049382e-06, + "loss": 0.2962, + "step": 3880 + }, + { + "epoch": 2.953122034541659, + "grad_norm": 7.638548374176025, + "learning_rate": 9.719093490769315e-06, + "loss": 0.2084, + "step": 3890 + }, + { + "epoch": 2.9607136078952365, + "grad_norm": 0.0027020114939659834, + "learning_rate": 9.71616643771271e-06, + "loss": 0.0521, + "step": 3900 + }, + { + "epoch": 2.9683051812488137, + "grad_norm": 0.041696127504110336, + "learning_rate": 9.713224659018927e-06, + "loss": 0.1488, + "step": 3910 + }, + { + "epoch": 2.9758967546023913, + "grad_norm": 276.02947998046875, + "learning_rate": 9.710268163873298e-06, + "loss": 0.1649, + "step": 3920 + }, + { + "epoch": 2.983488327955969, + "grad_norm": 4.512789726257324, + "learning_rate": 9.707296961507107e-06, + "loss": 0.0364, + "step": 3930 + }, + { + "epoch": 2.991079901309546, + "grad_norm": 0.07038887590169907, + "learning_rate": 9.70431106119756e-06, + "loss": 0.046, + "step": 3940 + }, + { + "epoch": 2.998671474663124, + "grad_norm": 52.16018295288086, + "learning_rate": 9.701310472267757e-06, + "loss": 0.1439, + "step": 3950 + }, + { + "epoch": 2.999430631998482, + "eval_f1": 0.9468203897167411, + "eval_loss": 0.17982631921768188, + "eval_precision": 0.9451027269774426, + "eval_recall": 0.9468892261001517, + "eval_runtime": 75.734, + "eval_samples_per_second": 17.403, + "eval_steps_per_second": 17.403, + "step": 3951 + }, + { + "epoch": 3.0062630480167014, + "grad_norm": 0.003164840629324317, + "learning_rate": 9.69829520408666e-06, + "loss": 0.0687, + "step": 3960 + }, + { + "epoch": 3.013854621370279, + "grad_norm": 0.005421197507530451, + "learning_rate": 9.695265266069066e-06, + "loss": 0.1768, + "step": 3970 + }, + { + "epoch": 3.0214461947238567, + "grad_norm": 0.07668659836053848, + "learning_rate": 9.692220667675572e-06, + "loss": 0.0092, + "step": 3980 + }, + { + "epoch": 3.029037768077434, + "grad_norm": 0.0020935048814862967, + "learning_rate": 9.689161418412557e-06, + "loss": 0.2435, + "step": 3990 + }, + { + "epoch": 3.0366293414310115, + "grad_norm": 0.012631943449378014, + "learning_rate": 9.68608752783214e-06, + "loss": 0.0858, + "step": 4000 + }, + { + "epoch": 3.044220914784589, + "grad_norm": 0.005341747775673866, + "learning_rate": 9.682999005532161e-06, + "loss": 0.0094, + "step": 4010 + }, + { + "epoch": 3.0518124881381667, + "grad_norm": 0.02143806405365467, + "learning_rate": 9.67989586115614e-06, + "loss": 0.0031, + "step": 4020 + }, + { + "epoch": 3.0594040614917444, + "grad_norm": 13.902883529663086, + "learning_rate": 9.67677810439326e-06, + "loss": 0.0965, + "step": 4030 + }, + { + "epoch": 3.0669956348453216, + "grad_norm": 0.20893624424934387, + "learning_rate": 9.67364574497832e-06, + "loss": 0.107, + "step": 4040 + }, + { + "epoch": 3.074587208198899, + "grad_norm": 0.18238410353660583, + "learning_rate": 9.67049879269172e-06, + "loss": 0.001, + "step": 4050 + }, + { + "epoch": 3.082178781552477, + "grad_norm": 0.022665822878479958, + "learning_rate": 9.667337257359425e-06, + "loss": 0.1673, + "step": 4060 + }, + { + "epoch": 3.0897703549060545, + "grad_norm": 10.807044982910156, + "learning_rate": 9.664161148852932e-06, + "loss": 0.0674, + "step": 4070 + }, + { + "epoch": 3.0973619282596316, + "grad_norm": 0.0026043581310659647, + "learning_rate": 9.660970477089238e-06, + "loss": 0.0097, + "step": 4080 + }, + { + "epoch": 3.1049535016132093, + "grad_norm": 18.194334030151367, + "learning_rate": 9.657765252030815e-06, + "loss": 0.0064, + "step": 4090 + }, + { + "epoch": 3.112545074966787, + "grad_norm": 12.572392463684082, + "learning_rate": 9.654545483685578e-06, + "loss": 0.1343, + "step": 4100 + }, + { + "epoch": 3.1201366483203645, + "grad_norm": 0.0007624260615557432, + "learning_rate": 9.651311182106848e-06, + "loss": 0.0325, + "step": 4110 + }, + { + "epoch": 3.1277282216739417, + "grad_norm": 0.018368422985076904, + "learning_rate": 9.648062357393325e-06, + "loss": 0.0005, + "step": 4120 + }, + { + "epoch": 3.1353197950275193, + "grad_norm": 78.8929443359375, + "learning_rate": 9.644799019689056e-06, + "loss": 0.054, + "step": 4130 + }, + { + "epoch": 3.142911368381097, + "grad_norm": 0.010049775242805481, + "learning_rate": 9.641521179183403e-06, + "loss": 0.0157, + "step": 4140 + }, + { + "epoch": 3.1505029417346746, + "grad_norm": 91.76640319824219, + "learning_rate": 9.638228846111011e-06, + "loss": 0.1893, + "step": 4150 + }, + { + "epoch": 3.1580945150882522, + "grad_norm": 0.30123358964920044, + "learning_rate": 9.634922030751777e-06, + "loss": 0.2819, + "step": 4160 + }, + { + "epoch": 3.1656860884418294, + "grad_norm": 32.838623046875, + "learning_rate": 9.631600743430817e-06, + "loss": 0.2494, + "step": 4170 + }, + { + "epoch": 3.173277661795407, + "grad_norm": 0.1474120020866394, + "learning_rate": 9.628264994518431e-06, + "loss": 0.0401, + "step": 4180 + }, + { + "epoch": 3.1808692351489847, + "grad_norm": 0.16810506582260132, + "learning_rate": 9.624914794430078e-06, + "loss": 0.0668, + "step": 4190 + }, + { + "epoch": 3.1884608085025623, + "grad_norm": 1.5835288763046265, + "learning_rate": 9.621550153626338e-06, + "loss": 0.1177, + "step": 4200 + }, + { + "epoch": 3.1960523818561395, + "grad_norm": 0.00022748277115169913, + "learning_rate": 9.618171082612875e-06, + "loss": 0.006, + "step": 4210 + }, + { + "epoch": 3.203643955209717, + "grad_norm": 0.011720534414052963, + "learning_rate": 9.614777591940419e-06, + "loss": 0.0547, + "step": 4220 + }, + { + "epoch": 3.2112355285632947, + "grad_norm": 16.759693145751953, + "learning_rate": 9.611369692204712e-06, + "loss": 0.0687, + "step": 4230 + }, + { + "epoch": 3.2188271019168724, + "grad_norm": 13.746438026428223, + "learning_rate": 9.6079473940465e-06, + "loss": 0.1731, + "step": 4240 + }, + { + "epoch": 3.22641867527045, + "grad_norm": 1.0661725997924805, + "learning_rate": 9.604510708151472e-06, + "loss": 0.0012, + "step": 4250 + }, + { + "epoch": 3.234010248624027, + "grad_norm": 0.0051275817677378654, + "learning_rate": 9.601059645250253e-06, + "loss": 0.1559, + "step": 4260 + }, + { + "epoch": 3.241601821977605, + "grad_norm": 0.03845924511551857, + "learning_rate": 9.59759421611835e-06, + "loss": 0.0414, + "step": 4270 + }, + { + "epoch": 3.2491933953311825, + "grad_norm": 0.2744313180446625, + "learning_rate": 9.594114431576133e-06, + "loss": 0.2521, + "step": 4280 + }, + { + "epoch": 3.25678496868476, + "grad_norm": 0.06969039887189865, + "learning_rate": 9.590620302488792e-06, + "loss": 0.1007, + "step": 4290 + }, + { + "epoch": 3.2643765420383373, + "grad_norm": 0.044375017285346985, + "learning_rate": 9.587111839766303e-06, + "loss": 0.1706, + "step": 4300 + }, + { + "epoch": 3.271968115391915, + "grad_norm": 0.008467442356050014, + "learning_rate": 9.583589054363402e-06, + "loss": 0.0518, + "step": 4310 + }, + { + "epoch": 3.2795596887454925, + "grad_norm": 0.006757930386811495, + "learning_rate": 9.580051957279545e-06, + "loss": 0.1301, + "step": 4320 + }, + { + "epoch": 3.28715126209907, + "grad_norm": 0.22480565309524536, + "learning_rate": 9.57650055955887e-06, + "loss": 0.2225, + "step": 4330 + }, + { + "epoch": 3.294742835452648, + "grad_norm": 0.005938298534601927, + "learning_rate": 9.572934872290175e-06, + "loss": 0.1615, + "step": 4340 + }, + { + "epoch": 3.302334408806225, + "grad_norm": 0.031019240617752075, + "learning_rate": 9.569354906606864e-06, + "loss": 0.0292, + "step": 4350 + }, + { + "epoch": 3.3099259821598026, + "grad_norm": 0.058189138770103455, + "learning_rate": 9.565760673686936e-06, + "loss": 0.1437, + "step": 4360 + }, + { + "epoch": 3.3175175555133802, + "grad_norm": 18.81794548034668, + "learning_rate": 9.56215218475293e-06, + "loss": 0.1732, + "step": 4370 + }, + { + "epoch": 3.325109128866958, + "grad_norm": 0.037775713950395584, + "learning_rate": 9.558529451071896e-06, + "loss": 0.0048, + "step": 4380 + }, + { + "epoch": 3.332700702220535, + "grad_norm": 0.014422253705561161, + "learning_rate": 9.55489248395537e-06, + "loss": 0.0021, + "step": 4390 + }, + { + "epoch": 3.3402922755741127, + "grad_norm": 30.743995666503906, + "learning_rate": 9.551241294759322e-06, + "loss": 0.238, + "step": 4400 + }, + { + "epoch": 3.3478838489276903, + "grad_norm": 1.6870224475860596, + "learning_rate": 9.547575894884132e-06, + "loss": 0.09, + "step": 4410 + }, + { + "epoch": 3.355475422281268, + "grad_norm": 0.03549875691533089, + "learning_rate": 9.54389629577455e-06, + "loss": 0.163, + "step": 4420 + }, + { + "epoch": 3.3630669956348456, + "grad_norm": 0.12179459631443024, + "learning_rate": 9.540202508919663e-06, + "loss": 0.0025, + "step": 4430 + }, + { + "epoch": 3.3706585689884228, + "grad_norm": 0.000569705618545413, + "learning_rate": 9.536494545852854e-06, + "loss": 0.0433, + "step": 4440 + }, + { + "epoch": 3.3782501423420004, + "grad_norm": 0.0051111155189573765, + "learning_rate": 9.532772418151777e-06, + "loss": 0.1015, + "step": 4450 + }, + { + "epoch": 3.385841715695578, + "grad_norm": 0.0955556184053421, + "learning_rate": 9.529036137438304e-06, + "loss": 0.2303, + "step": 4460 + }, + { + "epoch": 3.393433289049155, + "grad_norm": 0.02819570153951645, + "learning_rate": 9.5252857153785e-06, + "loss": 0.0003, + "step": 4470 + }, + { + "epoch": 3.401024862402733, + "grad_norm": 0.005423153750598431, + "learning_rate": 9.521521163682593e-06, + "loss": 0.0102, + "step": 4480 + }, + { + "epoch": 3.4086164357563105, + "grad_norm": 0.8613097667694092, + "learning_rate": 9.517742494104918e-06, + "loss": 0.0005, + "step": 4490 + }, + { + "epoch": 3.416208009109888, + "grad_norm": 0.2508643567562103, + "learning_rate": 9.513949718443898e-06, + "loss": 0.0711, + "step": 4500 + }, + { + "epoch": 3.4237995824634657, + "grad_norm": 0.026635829359292984, + "learning_rate": 9.510142848541998e-06, + "loss": 0.0596, + "step": 4510 + }, + { + "epoch": 3.431391155817043, + "grad_norm": 0.0043787783943116665, + "learning_rate": 9.50632189628569e-06, + "loss": 0.3671, + "step": 4520 + }, + { + "epoch": 3.4389827291706205, + "grad_norm": 0.05850038304924965, + "learning_rate": 9.502486873605419e-06, + "loss": 0.1132, + "step": 4530 + }, + { + "epoch": 3.446574302524198, + "grad_norm": 157.52146911621094, + "learning_rate": 9.49863779247556e-06, + "loss": 0.1559, + "step": 4540 + }, + { + "epoch": 3.454165875877776, + "grad_norm": 0.02441789209842682, + "learning_rate": 9.494774664914385e-06, + "loss": 0.0658, + "step": 4550 + }, + { + "epoch": 3.461757449231353, + "grad_norm": 1.3454347848892212, + "learning_rate": 9.490897502984028e-06, + "loss": 0.0128, + "step": 4560 + }, + { + "epoch": 3.4693490225849306, + "grad_norm": 0.012022917158901691, + "learning_rate": 9.487006318790435e-06, + "loss": 0.0266, + "step": 4570 + }, + { + "epoch": 3.4769405959385082, + "grad_norm": 0.01288307923823595, + "learning_rate": 9.483101124483345e-06, + "loss": 0.0001, + "step": 4580 + }, + { + "epoch": 3.484532169292086, + "grad_norm": 26.168624877929688, + "learning_rate": 9.479181932256232e-06, + "loss": 0.0258, + "step": 4590 + }, + { + "epoch": 3.4921237426456635, + "grad_norm": 0.004901974927634001, + "learning_rate": 9.475248754346282e-06, + "loss": 0.1046, + "step": 4600 + }, + { + "epoch": 3.4997153159992407, + "grad_norm": 0.001919193658977747, + "learning_rate": 9.471301603034353e-06, + "loss": 0.0766, + "step": 4610 + }, + { + "epoch": 3.5073068893528183, + "grad_norm": 0.030080076307058334, + "learning_rate": 9.467340490644923e-06, + "loss": 0.0022, + "step": 4620 + }, + { + "epoch": 3.514898462706396, + "grad_norm": 0.041573066264390945, + "learning_rate": 9.463365429546073e-06, + "loss": 0.0357, + "step": 4630 + }, + { + "epoch": 3.5224900360599736, + "grad_norm": 30.251873016357422, + "learning_rate": 9.459376432149429e-06, + "loss": 0.0533, + "step": 4640 + }, + { + "epoch": 3.5300816094135508, + "grad_norm": 58.92287826538086, + "learning_rate": 9.455373510910135e-06, + "loss": 0.1241, + "step": 4650 + }, + { + "epoch": 3.5376731827671284, + "grad_norm": 0.015299913473427296, + "learning_rate": 9.45135667832681e-06, + "loss": 0.0672, + "step": 4660 + }, + { + "epoch": 3.545264756120706, + "grad_norm": 0.024773746728897095, + "learning_rate": 9.447325946941509e-06, + "loss": 0.0002, + "step": 4670 + }, + { + "epoch": 3.5528563294742836, + "grad_norm": 0.0013335061958059669, + "learning_rate": 9.443281329339682e-06, + "loss": 0.0002, + "step": 4680 + }, + { + "epoch": 3.5604479028278613, + "grad_norm": 0.003542415564879775, + "learning_rate": 9.439222838150141e-06, + "loss": 0.0053, + "step": 4690 + }, + { + "epoch": 3.5680394761814385, + "grad_norm": 0.004198325797915459, + "learning_rate": 9.435150486045019e-06, + "loss": 0.0021, + "step": 4700 + }, + { + "epoch": 3.575631049535016, + "grad_norm": 0.012465923093259335, + "learning_rate": 9.431064285739717e-06, + "loss": 0.391, + "step": 4710 + }, + { + "epoch": 3.5832226228885937, + "grad_norm": 19.51753044128418, + "learning_rate": 9.426964249992885e-06, + "loss": 0.0163, + "step": 4720 + }, + { + "epoch": 3.5908141962421714, + "grad_norm": 15.74682903289795, + "learning_rate": 9.42285039160637e-06, + "loss": 0.1393, + "step": 4730 + }, + { + "epoch": 3.5984057695957485, + "grad_norm": 0.001853258814662695, + "learning_rate": 9.418722723425179e-06, + "loss": 0.1333, + "step": 4740 + }, + { + "epoch": 3.605997342949326, + "grad_norm": 0.00429703202098608, + "learning_rate": 9.414581258337433e-06, + "loss": 0.041, + "step": 4750 + }, + { + "epoch": 3.613588916302904, + "grad_norm": 0.019961683079600334, + "learning_rate": 9.410426009274343e-06, + "loss": 0.0041, + "step": 4760 + }, + { + "epoch": 3.6211804896564814, + "grad_norm": 0.003665096592158079, + "learning_rate": 9.406256989210146e-06, + "loss": 0.1252, + "step": 4770 + }, + { + "epoch": 3.628772063010059, + "grad_norm": 59.87676239013672, + "learning_rate": 9.402074211162086e-06, + "loss": 0.2175, + "step": 4780 + }, + { + "epoch": 3.6363636363636362, + "grad_norm": 0.0013629102613776922, + "learning_rate": 9.397877688190362e-06, + "loss": 0.026, + "step": 4790 + }, + { + "epoch": 3.643955209717214, + "grad_norm": 0.004092271439731121, + "learning_rate": 9.39366743339809e-06, + "loss": 0.0061, + "step": 4800 + }, + { + "epoch": 3.6515467830707915, + "grad_norm": 0.06597864627838135, + "learning_rate": 9.38944345993126e-06, + "loss": 0.0974, + "step": 4810 + }, + { + "epoch": 3.6591383564243687, + "grad_norm": 0.0014479252276942134, + "learning_rate": 9.3852057809787e-06, + "loss": 0.1248, + "step": 4820 + }, + { + "epoch": 3.6667299297779463, + "grad_norm": 0.0007850687834434211, + "learning_rate": 9.380954409772029e-06, + "loss": 0.0674, + "step": 4830 + }, + { + "epoch": 3.674321503131524, + "grad_norm": 0.009199988096952438, + "learning_rate": 9.376689359585623e-06, + "loss": 0.0707, + "step": 4840 + }, + { + "epoch": 3.6819130764851016, + "grad_norm": 0.001353310770355165, + "learning_rate": 9.37241064373656e-06, + "loss": 0.0001, + "step": 4850 + }, + { + "epoch": 3.689504649838679, + "grad_norm": 0.0004105101979803294, + "learning_rate": 9.368118275584596e-06, + "loss": 0.0161, + "step": 4860 + }, + { + "epoch": 3.697096223192257, + "grad_norm": 0.005007717292755842, + "learning_rate": 9.36381226853211e-06, + "loss": 0.0854, + "step": 4870 + }, + { + "epoch": 3.704687796545834, + "grad_norm": 0.001610257662832737, + "learning_rate": 9.359492636024067e-06, + "loss": 0.0002, + "step": 4880 + }, + { + "epoch": 3.7122793698994117, + "grad_norm": 0.0029359892942011356, + "learning_rate": 9.35515939154798e-06, + "loss": 0.0001, + "step": 4890 + }, + { + "epoch": 3.7198709432529893, + "grad_norm": 0.016431191936135292, + "learning_rate": 9.350812548633862e-06, + "loss": 0.0407, + "step": 4900 + }, + { + "epoch": 3.7274625166065665, + "grad_norm": 0.00021083364845253527, + "learning_rate": 9.346452120854176e-06, + "loss": 0.0001, + "step": 4910 + }, + { + "epoch": 3.735054089960144, + "grad_norm": 0.0014973161742091179, + "learning_rate": 9.342078121823817e-06, + "loss": 0.2248, + "step": 4920 + }, + { + "epoch": 3.7426456633137217, + "grad_norm": 0.01354212500154972, + "learning_rate": 9.337690565200042e-06, + "loss": 0.07, + "step": 4930 + }, + { + "epoch": 3.7502372366672994, + "grad_norm": 0.07265155762434006, + "learning_rate": 9.333289464682452e-06, + "loss": 0.0486, + "step": 4940 + }, + { + "epoch": 3.757828810020877, + "grad_norm": 0.0004681596765294671, + "learning_rate": 9.328874834012925e-06, + "loss": 0.0063, + "step": 4950 + }, + { + "epoch": 3.7654203833744546, + "grad_norm": 0.01314933318644762, + "learning_rate": 9.324446686975592e-06, + "loss": 0.0853, + "step": 4960 + }, + { + "epoch": 3.773011956728032, + "grad_norm": 0.00873385276645422, + "learning_rate": 9.320005037396787e-06, + "loss": 0.0936, + "step": 4970 + }, + { + "epoch": 3.7806035300816094, + "grad_norm": 10.59278678894043, + "learning_rate": 9.315549899145001e-06, + "loss": 0.1606, + "step": 4980 + }, + { + "epoch": 3.788195103435187, + "grad_norm": 0.0031807045452296734, + "learning_rate": 9.311081286130846e-06, + "loss": 0.1216, + "step": 4990 + }, + { + "epoch": 3.7957866767887642, + "grad_norm": 0.15921778976917267, + "learning_rate": 9.306599212307001e-06, + "loss": 0.1834, + "step": 5000 + }, + { + "epoch": 3.803378250142342, + "grad_norm": 0.24746917188167572, + "learning_rate": 9.302103691668182e-06, + "loss": 0.0025, + "step": 5010 + }, + { + "epoch": 3.8109698234959195, + "grad_norm": 23.347986221313477, + "learning_rate": 9.297594738251085e-06, + "loss": 0.155, + "step": 5020 + }, + { + "epoch": 3.818561396849497, + "grad_norm": 10.753530502319336, + "learning_rate": 9.293072366134353e-06, + "loss": 0.1938, + "step": 5030 + }, + { + "epoch": 3.8261529702030748, + "grad_norm": 11.585359573364258, + "learning_rate": 9.288536589438523e-06, + "loss": 0.0768, + "step": 5040 + }, + { + "epoch": 3.833744543556652, + "grad_norm": 0.035775743424892426, + "learning_rate": 9.283987422325988e-06, + "loss": 0.0124, + "step": 5050 + }, + { + "epoch": 3.8413361169102296, + "grad_norm": 0.008631790988147259, + "learning_rate": 9.279424879000948e-06, + "loss": 0.0634, + "step": 5060 + }, + { + "epoch": 3.848927690263807, + "grad_norm": 8.152615547180176, + "learning_rate": 9.274848973709378e-06, + "loss": 0.0008, + "step": 5070 + }, + { + "epoch": 3.856519263617385, + "grad_norm": 0.00742849987000227, + "learning_rate": 9.270259720738962e-06, + "loss": 0.0023, + "step": 5080 + }, + { + "epoch": 3.864110836970962, + "grad_norm": 0.00474806921556592, + "learning_rate": 9.265657134419068e-06, + "loss": 0.0822, + "step": 5090 + }, + { + "epoch": 3.8717024103245397, + "grad_norm": 0.04680832102894783, + "learning_rate": 9.261041229120693e-06, + "loss": 0.4435, + "step": 5100 + }, + { + "epoch": 3.8792939836781173, + "grad_norm": 0.05589527264237404, + "learning_rate": 9.25641201925642e-06, + "loss": 0.0161, + "step": 5110 + }, + { + "epoch": 3.886885557031695, + "grad_norm": 0.0864788219332695, + "learning_rate": 9.251769519280377e-06, + "loss": 0.0042, + "step": 5120 + }, + { + "epoch": 3.8944771303852725, + "grad_norm": 0.0046981326304376125, + "learning_rate": 9.247113743688188e-06, + "loss": 0.0202, + "step": 5130 + }, + { + "epoch": 3.9020687037388497, + "grad_norm": 0.008091968484222889, + "learning_rate": 9.242444707016924e-06, + "loss": 0.0255, + "step": 5140 + }, + { + "epoch": 3.9096602770924274, + "grad_norm": 0.016733279451727867, + "learning_rate": 9.237762423845067e-06, + "loss": 0.0609, + "step": 5150 + }, + { + "epoch": 3.917251850446005, + "grad_norm": 110.93751525878906, + "learning_rate": 9.233066908792459e-06, + "loss": 0.0854, + "step": 5160 + }, + { + "epoch": 3.9248434237995826, + "grad_norm": 0.0014931544428691268, + "learning_rate": 9.228358176520256e-06, + "loss": 0.5116, + "step": 5170 + }, + { + "epoch": 3.93243499715316, + "grad_norm": 0.013354528695344925, + "learning_rate": 9.22363624173088e-06, + "loss": 0.1488, + "step": 5180 + }, + { + "epoch": 3.9400265705067374, + "grad_norm": 0.00550916837528348, + "learning_rate": 9.218901119167983e-06, + "loss": 0.3537, + "step": 5190 + }, + { + "epoch": 3.947618143860315, + "grad_norm": 29.100811004638672, + "learning_rate": 9.214152823616385e-06, + "loss": 0.2662, + "step": 5200 + }, + { + "epoch": 3.9552097172138927, + "grad_norm": 0.0014990021008998156, + "learning_rate": 9.209391369902048e-06, + "loss": 0.2909, + "step": 5210 + }, + { + "epoch": 3.9628012905674703, + "grad_norm": 0.2769727110862732, + "learning_rate": 9.20461677289201e-06, + "loss": 0.131, + "step": 5220 + }, + { + "epoch": 3.9703928639210475, + "grad_norm": 0.04668630287051201, + "learning_rate": 9.199829047494351e-06, + "loss": 0.001, + "step": 5230 + }, + { + "epoch": 3.977984437274625, + "grad_norm": 0.005737427622079849, + "learning_rate": 9.195028208658143e-06, + "loss": 0.1876, + "step": 5240 + }, + { + "epoch": 3.9855760106282028, + "grad_norm": 0.0012742755934596062, + "learning_rate": 9.190214271373399e-06, + "loss": 0.0296, + "step": 5250 + }, + { + "epoch": 3.99316758398178, + "grad_norm": 0.23183897137641907, + "learning_rate": 9.185387250671037e-06, + "loss": 0.0464, + "step": 5260 + }, + { + "epoch": 4.0, + "eval_f1": 0.9484240795008525, + "eval_loss": 0.1520499438047409, + "eval_precision": 0.9488593551067371, + "eval_recall": 0.9484066767830045, + "eval_runtime": 75.764, + "eval_samples_per_second": 17.396, + "eval_steps_per_second": 17.396, + "step": 5269 + }, + { + "epoch": 4.000759157335358, + "grad_norm": 0.015655217692255974, + "learning_rate": 9.18054716162282e-06, + "loss": 0.0778, + "step": 5270 + }, + { + "epoch": 4.008350730688935, + "grad_norm": 51.39549255371094, + "learning_rate": 9.175694019341321e-06, + "loss": 0.1821, + "step": 5280 + }, + { + "epoch": 4.015942304042513, + "grad_norm": 20.591053009033203, + "learning_rate": 9.170827838979864e-06, + "loss": 0.0411, + "step": 5290 + }, + { + "epoch": 4.0235338773960905, + "grad_norm": 0.00464022858068347, + "learning_rate": 9.165948635732487e-06, + "loss": 0.024, + "step": 5300 + }, + { + "epoch": 4.031125450749668, + "grad_norm": 0.9038947820663452, + "learning_rate": 9.161056424833888e-06, + "loss": 0.1133, + "step": 5310 + }, + { + "epoch": 4.038717024103246, + "grad_norm": 104.494384765625, + "learning_rate": 9.156151221559384e-06, + "loss": 0.0752, + "step": 5320 + }, + { + "epoch": 4.046308597456823, + "grad_norm": 0.003295379225164652, + "learning_rate": 9.151233041224851e-06, + "loss": 0.0697, + "step": 5330 + }, + { + "epoch": 4.0539001708104, + "grad_norm": 0.0672566145658493, + "learning_rate": 9.146301899186696e-06, + "loss": 0.0149, + "step": 5340 + }, + { + "epoch": 4.061491744163978, + "grad_norm": 0.020139316096901894, + "learning_rate": 9.141357810841785e-06, + "loss": 0.0004, + "step": 5350 + }, + { + "epoch": 4.069083317517555, + "grad_norm": 0.18405619263648987, + "learning_rate": 9.136400791627414e-06, + "loss": 0.0003, + "step": 5360 + }, + { + "epoch": 4.076674890871133, + "grad_norm": 0.011098051443696022, + "learning_rate": 9.131430857021252e-06, + "loss": 0.1502, + "step": 5370 + }, + { + "epoch": 4.084266464224711, + "grad_norm": 0.0007754967082291842, + "learning_rate": 9.126448022541296e-06, + "loss": 0.1435, + "step": 5380 + }, + { + "epoch": 4.091858037578288, + "grad_norm": 0.059689611196517944, + "learning_rate": 9.121452303745823e-06, + "loss": 0.2681, + "step": 5390 + }, + { + "epoch": 4.099449610931866, + "grad_norm": 23.187213897705078, + "learning_rate": 9.116443716233336e-06, + "loss": 0.0408, + "step": 5400 + }, + { + "epoch": 4.1070411842854435, + "grad_norm": 0.022440658882260323, + "learning_rate": 9.111422275642518e-06, + "loss": 0.0499, + "step": 5410 + }, + { + "epoch": 4.11463275763902, + "grad_norm": 0.04940136522054672, + "learning_rate": 9.10638799765219e-06, + "loss": 0.0007, + "step": 5420 + }, + { + "epoch": 4.122224330992598, + "grad_norm": 0.0109120924025774, + "learning_rate": 9.101340897981247e-06, + "loss": 0.0577, + "step": 5430 + }, + { + "epoch": 4.1298159043461755, + "grad_norm": 15.833015441894531, + "learning_rate": 9.096280992388629e-06, + "loss": 0.0016, + "step": 5440 + }, + { + "epoch": 4.137407477699753, + "grad_norm": 0.002290463075041771, + "learning_rate": 9.091208296673253e-06, + "loss": 0.0022, + "step": 5450 + }, + { + "epoch": 4.144999051053331, + "grad_norm": 0.006408984772861004, + "learning_rate": 9.086122826673976e-06, + "loss": 0.0004, + "step": 5460 + }, + { + "epoch": 4.152590624406908, + "grad_norm": 0.04329880699515343, + "learning_rate": 9.081024598269537e-06, + "loss": 0.0001, + "step": 5470 + }, + { + "epoch": 4.160182197760486, + "grad_norm": 0.0005604320904240012, + "learning_rate": 9.075913627378515e-06, + "loss": 0.1444, + "step": 5480 + }, + { + "epoch": 4.167773771114064, + "grad_norm": 0.0035607500467449427, + "learning_rate": 9.070789929959273e-06, + "loss": 0.0705, + "step": 5490 + }, + { + "epoch": 4.175365344467641, + "grad_norm": 21.509424209594727, + "learning_rate": 9.065653522009914e-06, + "loss": 0.0963, + "step": 5500 + }, + { + "epoch": 4.182956917821218, + "grad_norm": 0.040827080607414246, + "learning_rate": 9.060504419568226e-06, + "loss": 0.2367, + "step": 5510 + }, + { + "epoch": 4.190548491174796, + "grad_norm": 0.03268290311098099, + "learning_rate": 9.055342638711636e-06, + "loss": 0.1356, + "step": 5520 + }, + { + "epoch": 4.198140064528373, + "grad_norm": 0.02690727449953556, + "learning_rate": 9.050168195557152e-06, + "loss": 0.1927, + "step": 5530 + }, + { + "epoch": 4.205731637881951, + "grad_norm": 0.0010843976633623242, + "learning_rate": 9.044981106261327e-06, + "loss": 0.03, + "step": 5540 + }, + { + "epoch": 4.2133232112355286, + "grad_norm": 0.017938513308763504, + "learning_rate": 9.039781387020195e-06, + "loss": 0.0011, + "step": 5550 + }, + { + "epoch": 4.220914784589106, + "grad_norm": 0.11831680685281754, + "learning_rate": 9.034569054069222e-06, + "loss": 0.0028, + "step": 5560 + }, + { + "epoch": 4.228506357942684, + "grad_norm": 0.0017340222839266062, + "learning_rate": 9.029344123683269e-06, + "loss": 0.0004, + "step": 5570 + }, + { + "epoch": 4.236097931296261, + "grad_norm": 45.62750244140625, + "learning_rate": 9.024106612176519e-06, + "loss": 0.199, + "step": 5580 + }, + { + "epoch": 4.243689504649839, + "grad_norm": 0.00023749677347950637, + "learning_rate": 9.019382108477498e-06, + "loss": 0.0737, + "step": 5590 + }, + { + "epoch": 4.251281078003416, + "grad_norm": 0.0017125029116868973, + "learning_rate": 9.014120737927479e-06, + "loss": 0.0038, + "step": 5600 + }, + { + "epoch": 4.258872651356993, + "grad_norm": 0.005647186189889908, + "learning_rate": 9.008846833789777e-06, + "loss": 0.0524, + "step": 5610 + }, + { + "epoch": 4.266464224710571, + "grad_norm": 0.02812052331864834, + "learning_rate": 9.003560412531492e-06, + "loss": 0.0008, + "step": 5620 + }, + { + "epoch": 4.274055798064149, + "grad_norm": 0.004697522614151239, + "learning_rate": 8.99826149065881e-06, + "loss": 0.022, + "step": 5630 + }, + { + "epoch": 4.281647371417726, + "grad_norm": 0.000999168842099607, + "learning_rate": 8.992950084716952e-06, + "loss": 0.0255, + "step": 5640 + }, + { + "epoch": 4.289238944771304, + "grad_norm": 0.00024819112149998546, + "learning_rate": 8.987626211290112e-06, + "loss": 0.1814, + "step": 5650 + }, + { + "epoch": 4.296830518124882, + "grad_norm": 15.028079986572266, + "learning_rate": 8.982289887001419e-06, + "loss": 0.0483, + "step": 5660 + }, + { + "epoch": 4.304422091478459, + "grad_norm": 0.012629321776330471, + "learning_rate": 8.976941128512873e-06, + "loss": 0.0727, + "step": 5670 + }, + { + "epoch": 4.312013664832037, + "grad_norm": 0.02232271246612072, + "learning_rate": 8.9715799525253e-06, + "loss": 0.1076, + "step": 5680 + }, + { + "epoch": 4.319605238185614, + "grad_norm": 0.013221162371337414, + "learning_rate": 8.966206375778302e-06, + "loss": 0.1304, + "step": 5690 + }, + { + "epoch": 4.327196811539191, + "grad_norm": 20.240745544433594, + "learning_rate": 8.960820415050193e-06, + "loss": 0.0818, + "step": 5700 + }, + { + "epoch": 4.334788384892769, + "grad_norm": 0.9472859501838684, + "learning_rate": 8.955422087157962e-06, + "loss": 0.0875, + "step": 5710 + }, + { + "epoch": 4.3423799582463465, + "grad_norm": 0.24365593492984772, + "learning_rate": 8.950011408957206e-06, + "loss": 0.0052, + "step": 5720 + }, + { + "epoch": 4.349971531599924, + "grad_norm": 0.5765083432197571, + "learning_rate": 8.944588397342093e-06, + "loss": 0.3057, + "step": 5730 + }, + { + "epoch": 4.357563104953502, + "grad_norm": 36.48699951171875, + "learning_rate": 8.939153069245291e-06, + "loss": 0.1687, + "step": 5740 + }, + { + "epoch": 4.365154678307079, + "grad_norm": 0.011977112852036953, + "learning_rate": 8.933705441637931e-06, + "loss": 0.0129, + "step": 5750 + }, + { + "epoch": 4.372746251660657, + "grad_norm": 0.049162607640028, + "learning_rate": 8.928245531529546e-06, + "loss": 0.0747, + "step": 5760 + }, + { + "epoch": 4.380337825014234, + "grad_norm": 0.006424940191209316, + "learning_rate": 8.922773355968018e-06, + "loss": 0.0001, + "step": 5770 + }, + { + "epoch": 4.387929398367811, + "grad_norm": 0.0021049147471785545, + "learning_rate": 8.91728893203953e-06, + "loss": 0.0011, + "step": 5780 + }, + { + "epoch": 4.395520971721389, + "grad_norm": 0.005935146939009428, + "learning_rate": 8.911792276868502e-06, + "loss": 0.0685, + "step": 5790 + }, + { + "epoch": 4.403112545074967, + "grad_norm": 0.16192130744457245, + "learning_rate": 8.906283407617555e-06, + "loss": 0.0789, + "step": 5800 + }, + { + "epoch": 4.410704118428544, + "grad_norm": 0.0363471657037735, + "learning_rate": 8.900762341487439e-06, + "loss": 0.0003, + "step": 5810 + }, + { + "epoch": 4.418295691782122, + "grad_norm": 0.03035406582057476, + "learning_rate": 8.895229095716988e-06, + "loss": 0.0004, + "step": 5820 + }, + { + "epoch": 4.4258872651356995, + "grad_norm": 0.0051777479238808155, + "learning_rate": 8.889683687583067e-06, + "loss": 0.0974, + "step": 5830 + }, + { + "epoch": 4.433478838489277, + "grad_norm": 0.001428132993169129, + "learning_rate": 8.884126134400516e-06, + "loss": 0.0104, + "step": 5840 + }, + { + "epoch": 4.441070411842855, + "grad_norm": 0.029337646439671516, + "learning_rate": 8.8785564535221e-06, + "loss": 0.1961, + "step": 5850 + }, + { + "epoch": 4.448661985196432, + "grad_norm": 103.57210540771484, + "learning_rate": 8.872974662338443e-06, + "loss": 0.0941, + "step": 5860 + }, + { + "epoch": 4.456253558550009, + "grad_norm": 0.006421659607440233, + "learning_rate": 8.86738077827799e-06, + "loss": 0.0586, + "step": 5870 + }, + { + "epoch": 4.463845131903587, + "grad_norm": 0.21757641434669495, + "learning_rate": 8.861774818806939e-06, + "loss": 0.1107, + "step": 5880 + }, + { + "epoch": 4.471436705257164, + "grad_norm": 0.2700095474720001, + "learning_rate": 8.856156801429196e-06, + "loss": 0.1388, + "step": 5890 + }, + { + "epoch": 4.479028278610742, + "grad_norm": 0.0029901862144470215, + "learning_rate": 8.850526743686314e-06, + "loss": 0.1908, + "step": 5900 + }, + { + "epoch": 4.48661985196432, + "grad_norm": 0.008274559862911701, + "learning_rate": 8.844884663157441e-06, + "loss": 0.0842, + "step": 5910 + }, + { + "epoch": 4.494211425317897, + "grad_norm": 0.006725401151925325, + "learning_rate": 8.83923057745926e-06, + "loss": 0.0003, + "step": 5920 + }, + { + "epoch": 4.501802998671475, + "grad_norm": 13.423134803771973, + "learning_rate": 8.833564504245953e-06, + "loss": 0.0658, + "step": 5930 + }, + { + "epoch": 4.509394572025053, + "grad_norm": 0.047781139612197876, + "learning_rate": 8.827886461209114e-06, + "loss": 0.0008, + "step": 5940 + }, + { + "epoch": 4.516986145378629, + "grad_norm": 0.0009586279047653079, + "learning_rate": 8.82219646607772e-06, + "loss": 0.0003, + "step": 5950 + }, + { + "epoch": 4.524577718732207, + "grad_norm": 0.07489871978759766, + "learning_rate": 8.816494536618069e-06, + "loss": 0.0003, + "step": 5960 + }, + { + "epoch": 4.532169292085785, + "grad_norm": 0.015722280368208885, + "learning_rate": 8.810780690633715e-06, + "loss": 0.1269, + "step": 5970 + }, + { + "epoch": 4.539760865439362, + "grad_norm": 0.01760883256793022, + "learning_rate": 8.805054945965429e-06, + "loss": 0.0659, + "step": 5980 + }, + { + "epoch": 4.54735243879294, + "grad_norm": 0.03223474696278572, + "learning_rate": 8.799317320491125e-06, + "loss": 0.0005, + "step": 5990 + }, + { + "epoch": 4.5549440121465175, + "grad_norm": 0.0017072842456400394, + "learning_rate": 8.793567832125823e-06, + "loss": 0.1485, + "step": 6000 + }, + { + "epoch": 4.562535585500095, + "grad_norm": 0.0031113557051867247, + "learning_rate": 8.787806498821572e-06, + "loss": 0.0058, + "step": 6010 + }, + { + "epoch": 4.570127158853673, + "grad_norm": 0.016612514853477478, + "learning_rate": 8.782033338567414e-06, + "loss": 0.0294, + "step": 6020 + }, + { + "epoch": 4.57771873220725, + "grad_norm": 0.010033627972006798, + "learning_rate": 8.776248369389319e-06, + "loss": 0.064, + "step": 6030 + }, + { + "epoch": 4.585310305560828, + "grad_norm": 0.007523770444095135, + "learning_rate": 8.770451609350123e-06, + "loss": 0.1784, + "step": 6040 + }, + { + "epoch": 4.592901878914405, + "grad_norm": 0.0006488583167083561, + "learning_rate": 8.764643076549481e-06, + "loss": 0.0001, + "step": 6050 + }, + { + "epoch": 4.600493452267982, + "grad_norm": 49.0224494934082, + "learning_rate": 8.75882278912381e-06, + "loss": 0.1479, + "step": 6060 + }, + { + "epoch": 4.60808502562156, + "grad_norm": 0.05112855136394501, + "learning_rate": 8.752990765246222e-06, + "loss": 0.0742, + "step": 6070 + }, + { + "epoch": 4.615676598975138, + "grad_norm": 0.007768516894429922, + "learning_rate": 8.747147023126486e-06, + "loss": 0.0547, + "step": 6080 + }, + { + "epoch": 4.623268172328715, + "grad_norm": 0.03929920494556427, + "learning_rate": 8.741291581010945e-06, + "loss": 0.0005, + "step": 6090 + }, + { + "epoch": 4.630859745682293, + "grad_norm": 0.0333462730050087, + "learning_rate": 8.735424457182483e-06, + "loss": 0.0912, + "step": 6100 + }, + { + "epoch": 4.6384513190358705, + "grad_norm": 0.0021920499857515097, + "learning_rate": 8.729545669960459e-06, + "loss": 0.0025, + "step": 6110 + }, + { + "epoch": 4.646042892389447, + "grad_norm": 0.24167831242084503, + "learning_rate": 8.723655237700646e-06, + "loss": 0.0184, + "step": 6120 + }, + { + "epoch": 4.653634465743025, + "grad_norm": 0.01909787394106388, + "learning_rate": 8.71775317879518e-06, + "loss": 0.0001, + "step": 6130 + }, + { + "epoch": 4.6612260390966025, + "grad_norm": 97.6840591430664, + "learning_rate": 8.711839511672497e-06, + "loss": 0.0578, + "step": 6140 + }, + { + "epoch": 4.66881761245018, + "grad_norm": 0.000244935043156147, + "learning_rate": 8.705914254797283e-06, + "loss": 0.1423, + "step": 6150 + }, + { + "epoch": 4.676409185803758, + "grad_norm": 0.0006741081597283483, + "learning_rate": 8.699977426670403e-06, + "loss": 0.0306, + "step": 6160 + }, + { + "epoch": 4.684000759157335, + "grad_norm": 0.001535810879431665, + "learning_rate": 8.69402904582886e-06, + "loss": 0.0496, + "step": 6170 + }, + { + "epoch": 4.691592332510913, + "grad_norm": 0.4821704030036926, + "learning_rate": 8.688069130845725e-06, + "loss": 0.0443, + "step": 6180 + }, + { + "epoch": 4.699183905864491, + "grad_norm": 0.002279536332935095, + "learning_rate": 8.682097700330086e-06, + "loss": 0.0222, + "step": 6190 + }, + { + "epoch": 4.706775479218068, + "grad_norm": 0.009520245715975761, + "learning_rate": 8.67611477292698e-06, + "loss": 0.1731, + "step": 6200 + }, + { + "epoch": 4.714367052571646, + "grad_norm": 0.1851215660572052, + "learning_rate": 8.67012036731735e-06, + "loss": 0.0629, + "step": 6210 + }, + { + "epoch": 4.721958625925223, + "grad_norm": 0.12576204538345337, + "learning_rate": 8.664114502217975e-06, + "loss": 0.0448, + "step": 6220 + }, + { + "epoch": 4.7295501992788, + "grad_norm": 0.015547769144177437, + "learning_rate": 8.65809719638141e-06, + "loss": 0.0147, + "step": 6230 + }, + { + "epoch": 4.737141772632378, + "grad_norm": 0.2670181095600128, + "learning_rate": 8.65206846859594e-06, + "loss": 0.0005, + "step": 6240 + }, + { + "epoch": 4.7447333459859555, + "grad_norm": 0.028395511209964752, + "learning_rate": 8.646028337685509e-06, + "loss": 0.05, + "step": 6250 + }, + { + "epoch": 4.752324919339533, + "grad_norm": 0.018742332234978676, + "learning_rate": 8.639976822509666e-06, + "loss": 0.2398, + "step": 6260 + }, + { + "epoch": 4.759916492693111, + "grad_norm": 12.270938873291016, + "learning_rate": 8.633913941963507e-06, + "loss": 0.313, + "step": 6270 + }, + { + "epoch": 4.767508066046688, + "grad_norm": 0.07293716818094254, + "learning_rate": 8.627839714977618e-06, + "loss": 0.0008, + "step": 6280 + }, + { + "epoch": 4.775099639400266, + "grad_norm": 0.06347032636404037, + "learning_rate": 8.621754160518005e-06, + "loss": 0.0221, + "step": 6290 + }, + { + "epoch": 4.782691212753843, + "grad_norm": 0.0011452403850853443, + "learning_rate": 8.615657297586051e-06, + "loss": 0.1013, + "step": 6300 + }, + { + "epoch": 4.79028278610742, + "grad_norm": 0.0021203244104981422, + "learning_rate": 8.609549145218442e-06, + "loss": 0.0007, + "step": 6310 + }, + { + "epoch": 4.797874359460998, + "grad_norm": 0.006574318744242191, + "learning_rate": 8.603429722487117e-06, + "loss": 0.0725, + "step": 6320 + }, + { + "epoch": 4.805465932814576, + "grad_norm": 0.00014791313151363283, + "learning_rate": 8.597299048499206e-06, + "loss": 0.0532, + "step": 6330 + }, + { + "epoch": 4.813057506168153, + "grad_norm": 0.12207093834877014, + "learning_rate": 8.591157142396966e-06, + "loss": 0.1137, + "step": 6340 + }, + { + "epoch": 4.820649079521731, + "grad_norm": 0.027442127466201782, + "learning_rate": 8.58500402335773e-06, + "loss": 0.0812, + "step": 6350 + }, + { + "epoch": 4.828240652875309, + "grad_norm": 0.00018395182269159704, + "learning_rate": 8.578839710593836e-06, + "loss": 0.1686, + "step": 6360 + }, + { + "epoch": 4.835832226228886, + "grad_norm": 0.06821048259735107, + "learning_rate": 8.57266422335258e-06, + "loss": 0.0005, + "step": 6370 + }, + { + "epoch": 4.843423799582464, + "grad_norm": 9.863347804639488e-05, + "learning_rate": 8.56647758091614e-06, + "loss": 0.0005, + "step": 6380 + }, + { + "epoch": 4.8510153729360415, + "grad_norm": 0.0016949453856796026, + "learning_rate": 8.560279802601533e-06, + "loss": 0.1504, + "step": 6390 + }, + { + "epoch": 4.858606946289618, + "grad_norm": 0.0009430780191905797, + "learning_rate": 8.554070907760544e-06, + "loss": 0.0, + "step": 6400 + }, + { + "epoch": 4.866198519643196, + "grad_norm": 0.02552955597639084, + "learning_rate": 8.547850915779662e-06, + "loss": 0.0001, + "step": 6410 + }, + { + "epoch": 4.8737900929967735, + "grad_norm": 0.014719455502927303, + "learning_rate": 8.541619846080039e-06, + "loss": 0.15, + "step": 6420 + }, + { + "epoch": 4.881381666350351, + "grad_norm": 0.09882048517465591, + "learning_rate": 8.535377718117399e-06, + "loss": 0.0569, + "step": 6430 + }, + { + "epoch": 4.888973239703929, + "grad_norm": 0.22454605996608734, + "learning_rate": 8.52912455138201e-06, + "loss": 0.1482, + "step": 6440 + }, + { + "epoch": 4.896564813057506, + "grad_norm": 0.08625132590532303, + "learning_rate": 8.52286036539859e-06, + "loss": 0.0011, + "step": 6450 + }, + { + "epoch": 4.904156386411084, + "grad_norm": 0.03739362582564354, + "learning_rate": 8.51658517972628e-06, + "loss": 0.1778, + "step": 6460 + }, + { + "epoch": 4.911747959764662, + "grad_norm": 0.21021807193756104, + "learning_rate": 8.510299013958559e-06, + "loss": 0.0011, + "step": 6470 + }, + { + "epoch": 4.919339533118238, + "grad_norm": 0.04205634444952011, + "learning_rate": 8.504001887723185e-06, + "loss": 0.0787, + "step": 6480 + }, + { + "epoch": 4.926931106471816, + "grad_norm": 0.09222347289323807, + "learning_rate": 8.497693820682146e-06, + "loss": 0.0006, + "step": 6490 + }, + { + "epoch": 4.934522679825394, + "grad_norm": 0.1209307536482811, + "learning_rate": 8.491374832531591e-06, + "loss": 0.053, + "step": 6500 + }, + { + "epoch": 4.942114253178971, + "grad_norm": 0.009995940141379833, + "learning_rate": 8.485044943001763e-06, + "loss": 0.0096, + "step": 6510 + }, + { + "epoch": 4.949705826532549, + "grad_norm": 0.018289346247911453, + "learning_rate": 8.47870417185695e-06, + "loss": 0.0012, + "step": 6520 + }, + { + "epoch": 4.9572973998861265, + "grad_norm": 65.71520233154297, + "learning_rate": 8.472352538895411e-06, + "loss": 0.1783, + "step": 6530 + }, + { + "epoch": 4.964888973239704, + "grad_norm": 57.22151184082031, + "learning_rate": 8.465990063949323e-06, + "loss": 0.1034, + "step": 6540 + }, + { + "epoch": 4.972480546593282, + "grad_norm": 0.003517146920785308, + "learning_rate": 8.459616766884713e-06, + "loss": 0.0024, + "step": 6550 + }, + { + "epoch": 4.980072119946859, + "grad_norm": 0.0020259765442460775, + "learning_rate": 8.453232667601403e-06, + "loss": 0.0001, + "step": 6560 + }, + { + "epoch": 4.987663693300436, + "grad_norm": 0.007150826510041952, + "learning_rate": 8.44683778603294e-06, + "loss": 0.1704, + "step": 6570 + }, + { + "epoch": 4.995255266654014, + "grad_norm": 0.0018830208573490381, + "learning_rate": 8.440432142146535e-06, + "loss": 0.0349, + "step": 6580 + }, + { + "epoch": 4.99981021066616, + "eval_f1": 0.9552939310725507, + "eval_loss": 0.15884605050086975, + "eval_precision": 0.9567644368540595, + "eval_recall": 0.9552352048558422, + "eval_runtime": 75.7698, + "eval_samples_per_second": 17.395, + "eval_steps_per_second": 17.395, + "step": 6586 + }, + { + "epoch": 5.002846840007591, + "grad_norm": 0.002661398844793439, + "learning_rate": 8.434015755943013e-06, + "loss": 0.0002, + "step": 6590 + }, + { + "epoch": 5.010438413361169, + "grad_norm": 0.015321805141866207, + "learning_rate": 8.427588647456727e-06, + "loss": 0.0006, + "step": 6600 + }, + { + "epoch": 5.018029986714747, + "grad_norm": 0.015539165586233139, + "learning_rate": 8.42115083675552e-06, + "loss": 0.128, + "step": 6610 + }, + { + "epoch": 5.025621560068324, + "grad_norm": 5.927582263946533, + "learning_rate": 8.414702343940647e-06, + "loss": 0.0743, + "step": 6620 + }, + { + "epoch": 5.033213133421902, + "grad_norm": 0.0004428077954798937, + "learning_rate": 8.408243189146714e-06, + "loss": 0.0764, + "step": 6630 + }, + { + "epoch": 5.0408047067754795, + "grad_norm": 13.519503593444824, + "learning_rate": 8.401773392541621e-06, + "loss": 0.0837, + "step": 6640 + }, + { + "epoch": 5.048396280129057, + "grad_norm": 0.0011204121401533484, + "learning_rate": 8.395292974326497e-06, + "loss": 0.0001, + "step": 6650 + }, + { + "epoch": 5.055987853482634, + "grad_norm": 0.005702109541743994, + "learning_rate": 8.388801954735632e-06, + "loss": 0.0003, + "step": 6660 + }, + { + "epoch": 5.0635794268362115, + "grad_norm": 0.009877257980406284, + "learning_rate": 8.38230035403642e-06, + "loss": 0.0001, + "step": 6670 + }, + { + "epoch": 5.071171000189789, + "grad_norm": 0.0006185189704410732, + "learning_rate": 8.375788192529292e-06, + "loss": 0.0002, + "step": 6680 + }, + { + "epoch": 5.078762573543367, + "grad_norm": 0.0004436051531229168, + "learning_rate": 8.369265490547653e-06, + "loss": 0.0004, + "step": 6690 + }, + { + "epoch": 5.086354146896944, + "grad_norm": 0.016778159886598587, + "learning_rate": 8.362732268457824e-06, + "loss": 0.1505, + "step": 6700 + }, + { + "epoch": 5.093945720250522, + "grad_norm": 0.13505133986473083, + "learning_rate": 8.356188546658966e-06, + "loss": 0.0825, + "step": 6710 + }, + { + "epoch": 5.1015372936041, + "grad_norm": 0.015829697251319885, + "learning_rate": 8.34963434558303e-06, + "loss": 0.106, + "step": 6720 + }, + { + "epoch": 5.109128866957677, + "grad_norm": 0.006577119696885347, + "learning_rate": 8.343069685694687e-06, + "loss": 0.1537, + "step": 6730 + }, + { + "epoch": 5.116720440311255, + "grad_norm": 0.0571792870759964, + "learning_rate": 8.33649458749126e-06, + "loss": 0.023, + "step": 6740 + }, + { + "epoch": 5.124312013664832, + "grad_norm": 0.13444474339485168, + "learning_rate": 8.329909071502668e-06, + "loss": 0.1881, + "step": 6750 + }, + { + "epoch": 5.131903587018409, + "grad_norm": 0.011354477144777775, + "learning_rate": 8.32331315829136e-06, + "loss": 0.2186, + "step": 6760 + }, + { + "epoch": 5.139495160371987, + "grad_norm": 0.11647947877645493, + "learning_rate": 8.31670686845224e-06, + "loss": 0.0005, + "step": 6770 + }, + { + "epoch": 5.147086733725565, + "grad_norm": 0.03318728879094124, + "learning_rate": 8.310090222612623e-06, + "loss": 0.0004, + "step": 6780 + }, + { + "epoch": 5.154678307079142, + "grad_norm": 0.0020830295979976654, + "learning_rate": 8.303463241432156e-06, + "loss": 0.0738, + "step": 6790 + }, + { + "epoch": 5.16226988043272, + "grad_norm": 0.18546123802661896, + "learning_rate": 8.296825945602749e-06, + "loss": 0.225, + "step": 6800 + }, + { + "epoch": 5.1698614537862975, + "grad_norm": 0.013226731680333614, + "learning_rate": 8.290178355848528e-06, + "loss": 0.0024, + "step": 6810 + }, + { + "epoch": 5.177453027139875, + "grad_norm": 0.0015887143090367317, + "learning_rate": 8.283520492925758e-06, + "loss": 0.1161, + "step": 6820 + }, + { + "epoch": 5.185044600493452, + "grad_norm": 12.341133117675781, + "learning_rate": 8.276852377622777e-06, + "loss": 0.0333, + "step": 6830 + }, + { + "epoch": 5.1926361738470295, + "grad_norm": 0.48488712310791016, + "learning_rate": 8.270174030759939e-06, + "loss": 0.0025, + "step": 6840 + }, + { + "epoch": 5.200227747200607, + "grad_norm": 0.09974020719528198, + "learning_rate": 8.263485473189542e-06, + "loss": 0.0003, + "step": 6850 + }, + { + "epoch": 5.207819320554185, + "grad_norm": 0.005017921794205904, + "learning_rate": 8.256786725795767e-06, + "loss": 0.0707, + "step": 6860 + }, + { + "epoch": 5.215410893907762, + "grad_norm": 16.735441207885742, + "learning_rate": 8.250077809494612e-06, + "loss": 0.1761, + "step": 6870 + }, + { + "epoch": 5.22300246726134, + "grad_norm": 0.08619498461484909, + "learning_rate": 8.243358745233822e-06, + "loss": 0.0025, + "step": 6880 + }, + { + "epoch": 5.230594040614918, + "grad_norm": 0.008258694782853127, + "learning_rate": 8.236629553992837e-06, + "loss": 0.1096, + "step": 6890 + }, + { + "epoch": 5.238185613968495, + "grad_norm": 0.032047972083091736, + "learning_rate": 8.229890256782705e-06, + "loss": 0.0774, + "step": 6900 + }, + { + "epoch": 5.245777187322073, + "grad_norm": 0.12164535373449326, + "learning_rate": 8.223140874646039e-06, + "loss": 0.041, + "step": 6910 + }, + { + "epoch": 5.25336876067565, + "grad_norm": 0.30879223346710205, + "learning_rate": 8.216381428656935e-06, + "loss": 0.0008, + "step": 6920 + }, + { + "epoch": 5.260960334029227, + "grad_norm": 0.011329672299325466, + "learning_rate": 8.209611939920912e-06, + "loss": 0.0507, + "step": 6930 + }, + { + "epoch": 5.268551907382805, + "grad_norm": 0.0024318841751664877, + "learning_rate": 8.202832429574851e-06, + "loss": 0.0511, + "step": 6940 + }, + { + "epoch": 5.2761434807363825, + "grad_norm": 0.06363888084888458, + "learning_rate": 8.196042918786923e-06, + "loss": 0.0418, + "step": 6950 + }, + { + "epoch": 5.28373505408996, + "grad_norm": 0.006296386010944843, + "learning_rate": 8.189243428756518e-06, + "loss": 0.0013, + "step": 6960 + }, + { + "epoch": 5.291326627443538, + "grad_norm": 1.5055712461471558, + "learning_rate": 8.182433980714191e-06, + "loss": 0.0003, + "step": 6970 + }, + { + "epoch": 5.298918200797115, + "grad_norm": 0.04809055104851723, + "learning_rate": 8.175614595921589e-06, + "loss": 0.0001, + "step": 6980 + }, + { + "epoch": 5.306509774150693, + "grad_norm": 0.0006017005071043968, + "learning_rate": 8.168785295671385e-06, + "loss": 0.0001, + "step": 6990 + }, + { + "epoch": 5.314101347504271, + "grad_norm": 0.05823567882180214, + "learning_rate": 8.161946101287205e-06, + "loss": 0.1, + "step": 7000 + }, + { + "epoch": 5.321692920857847, + "grad_norm": 0.21126702427864075, + "learning_rate": 8.155097034123582e-06, + "loss": 0.0012, + "step": 7010 + }, + { + "epoch": 5.329284494211425, + "grad_norm": 0.005064593628048897, + "learning_rate": 8.148238115565865e-06, + "loss": 0.2162, + "step": 7020 + }, + { + "epoch": 5.336876067565003, + "grad_norm": 0.03429802507162094, + "learning_rate": 8.141369367030165e-06, + "loss": 0.0068, + "step": 7030 + }, + { + "epoch": 5.34446764091858, + "grad_norm": 0.019597377628087997, + "learning_rate": 8.134490809963285e-06, + "loss": 0.0447, + "step": 7040 + }, + { + "epoch": 5.352059214272158, + "grad_norm": 3.237245559692383, + "learning_rate": 8.127602465842656e-06, + "loss": 0.0408, + "step": 7050 + }, + { + "epoch": 5.3596507876257355, + "grad_norm": 0.1109641045331955, + "learning_rate": 8.12070435617627e-06, + "loss": 0.0041, + "step": 7060 + }, + { + "epoch": 5.367242360979313, + "grad_norm": 1.6172115802764893, + "learning_rate": 8.113796502502605e-06, + "loss": 0.0008, + "step": 7070 + }, + { + "epoch": 5.374833934332891, + "grad_norm": 0.0019253261853009462, + "learning_rate": 8.106878926390565e-06, + "loss": 0.0106, + "step": 7080 + }, + { + "epoch": 5.382425507686468, + "grad_norm": 0.010185305029153824, + "learning_rate": 8.099951649439415e-06, + "loss": 0.17, + "step": 7090 + }, + { + "epoch": 5.390017081040045, + "grad_norm": 0.00028460906469263136, + "learning_rate": 8.093014693278705e-06, + "loss": 0.0814, + "step": 7100 + }, + { + "epoch": 5.397608654393623, + "grad_norm": 0.09348779916763306, + "learning_rate": 8.08606807956821e-06, + "loss": 0.0562, + "step": 7110 + }, + { + "epoch": 5.4052002277472, + "grad_norm": 0.01985323429107666, + "learning_rate": 8.079111829997861e-06, + "loss": 0.0004, + "step": 7120 + }, + { + "epoch": 5.412791801100778, + "grad_norm": 0.084492027759552, + "learning_rate": 8.072145966287668e-06, + "loss": 0.0393, + "step": 7130 + }, + { + "epoch": 5.420383374454356, + "grad_norm": 0.008949169889092445, + "learning_rate": 8.06517051018767e-06, + "loss": 0.0027, + "step": 7140 + }, + { + "epoch": 5.427974947807933, + "grad_norm": 0.010001681745052338, + "learning_rate": 8.058185483477849e-06, + "loss": 0.0002, + "step": 7150 + }, + { + "epoch": 5.435566521161511, + "grad_norm": 0.00013484137889463454, + "learning_rate": 8.051190907968077e-06, + "loss": 0.0617, + "step": 7160 + }, + { + "epoch": 5.443158094515089, + "grad_norm": 0.028125835582613945, + "learning_rate": 8.044186805498033e-06, + "loss": 0.0003, + "step": 7170 + }, + { + "epoch": 5.450749667868665, + "grad_norm": 0.011845303699374199, + "learning_rate": 8.037173197937149e-06, + "loss": 0.0002, + "step": 7180 + }, + { + "epoch": 5.458341241222243, + "grad_norm": 0.021918371319770813, + "learning_rate": 8.030150107184535e-06, + "loss": 0.0003, + "step": 7190 + }, + { + "epoch": 5.465932814575821, + "grad_norm": 0.002744874684140086, + "learning_rate": 8.023117555168907e-06, + "loss": 0.0174, + "step": 7200 + }, + { + "epoch": 5.473524387929398, + "grad_norm": 0.0008592222584411502, + "learning_rate": 8.016075563848524e-06, + "loss": 0.0001, + "step": 7210 + }, + { + "epoch": 5.481115961282976, + "grad_norm": 0.0009818489197641611, + "learning_rate": 8.009024155211125e-06, + "loss": 0.0001, + "step": 7220 + }, + { + "epoch": 5.4887075346365535, + "grad_norm": 0.0036790217272937298, + "learning_rate": 8.001963351273843e-06, + "loss": 0.0001, + "step": 7230 + }, + { + "epoch": 5.496299107990131, + "grad_norm": 0.009668831713497639, + "learning_rate": 7.994893174083151e-06, + "loss": 0.0663, + "step": 7240 + }, + { + "epoch": 5.503890681343709, + "grad_norm": 0.008087705820798874, + "learning_rate": 7.98781364571479e-06, + "loss": 0.0, + "step": 7250 + }, + { + "epoch": 5.511482254697286, + "grad_norm": 0.001750052673742175, + "learning_rate": 7.980724788273698e-06, + "loss": 0.0001, + "step": 7260 + }, + { + "epoch": 5.519073828050864, + "grad_norm": 0.0040147858671844006, + "learning_rate": 7.973626623893942e-06, + "loss": 0.1629, + "step": 7270 + }, + { + "epoch": 5.526665401404441, + "grad_norm": 0.005076427478343248, + "learning_rate": 7.96651917473865e-06, + "loss": 0.0001, + "step": 7280 + }, + { + "epoch": 5.534256974758018, + "grad_norm": 0.022049933671951294, + "learning_rate": 7.959402462999934e-06, + "loss": 0.0001, + "step": 7290 + }, + { + "epoch": 5.541848548111596, + "grad_norm": 7.288018226623535, + "learning_rate": 7.952276510898838e-06, + "loss": 0.0612, + "step": 7300 + }, + { + "epoch": 5.549440121465174, + "grad_norm": 0.48564571142196655, + "learning_rate": 7.945141340685249e-06, + "loss": 0.0001, + "step": 7310 + }, + { + "epoch": 5.557031694818751, + "grad_norm": 0.0020839564967900515, + "learning_rate": 7.937996974637839e-06, + "loss": 0.0002, + "step": 7320 + }, + { + "epoch": 5.564623268172329, + "grad_norm": 0.0012567265657708049, + "learning_rate": 7.930843435063996e-06, + "loss": 0.0003, + "step": 7330 + }, + { + "epoch": 5.5722148415259065, + "grad_norm": 0.0036961582954972982, + "learning_rate": 7.923680744299747e-06, + "loss": 0.0876, + "step": 7340 + }, + { + "epoch": 5.579806414879484, + "grad_norm": 31.300655364990234, + "learning_rate": 7.916508924709693e-06, + "loss": 0.151, + "step": 7350 + }, + { + "epoch": 5.587397988233061, + "grad_norm": 0.008196866139769554, + "learning_rate": 7.909327998686942e-06, + "loss": 0.0001, + "step": 7360 + }, + { + "epoch": 5.5949895615866385, + "grad_norm": 10.782143592834473, + "learning_rate": 7.902137988653032e-06, + "loss": 0.0539, + "step": 7370 + }, + { + "epoch": 5.602581134940216, + "grad_norm": 0.004750726278871298, + "learning_rate": 7.894938917057866e-06, + "loss": 0.0385, + "step": 7380 + }, + { + "epoch": 5.610172708293794, + "grad_norm": 0.04581161588430405, + "learning_rate": 7.887730806379641e-06, + "loss": 0.2684, + "step": 7390 + }, + { + "epoch": 5.617764281647371, + "grad_norm": 0.026009181514382362, + "learning_rate": 7.880513679124777e-06, + "loss": 0.1283, + "step": 7400 + }, + { + "epoch": 5.625355855000949, + "grad_norm": 2.0138673782348633, + "learning_rate": 7.873287557827846e-06, + "loss": 0.0004, + "step": 7410 + }, + { + "epoch": 5.632947428354527, + "grad_norm": 0.14630401134490967, + "learning_rate": 7.866052465051506e-06, + "loss": 0.0503, + "step": 7420 + }, + { + "epoch": 5.640539001708104, + "grad_norm": 0.0008778591873124242, + "learning_rate": 7.858808423386422e-06, + "loss": 0.0032, + "step": 7430 + }, + { + "epoch": 5.648130575061682, + "grad_norm": 0.004400940611958504, + "learning_rate": 7.851555455451208e-06, + "loss": 0.0002, + "step": 7440 + }, + { + "epoch": 5.6557221484152596, + "grad_norm": 0.002588229486718774, + "learning_rate": 7.844293583892341e-06, + "loss": 0.0055, + "step": 7450 + }, + { + "epoch": 5.663313721768836, + "grad_norm": 0.0016362261958420277, + "learning_rate": 7.837022831384107e-06, + "loss": 0.0001, + "step": 7460 + }, + { + "epoch": 5.670905295122414, + "grad_norm": 0.006628331728279591, + "learning_rate": 7.829743220628515e-06, + "loss": 0.0001, + "step": 7470 + }, + { + "epoch": 5.6784968684759916, + "grad_norm": 0.0015720854280516505, + "learning_rate": 7.822454774355233e-06, + "loss": 0.1205, + "step": 7480 + }, + { + "epoch": 5.686088441829569, + "grad_norm": 0.005687546916306019, + "learning_rate": 7.815157515321521e-06, + "loss": 0.1584, + "step": 7490 + }, + { + "epoch": 5.693680015183147, + "grad_norm": 0.0018359271343797445, + "learning_rate": 7.807851466312152e-06, + "loss": 0.0833, + "step": 7500 + }, + { + "epoch": 5.701271588536724, + "grad_norm": 0.004786277189850807, + "learning_rate": 7.80053665013935e-06, + "loss": 0.0004, + "step": 7510 + }, + { + "epoch": 5.708863161890302, + "grad_norm": 0.14934459328651428, + "learning_rate": 7.793213089642705e-06, + "loss": 0.0678, + "step": 7520 + }, + { + "epoch": 5.716454735243879, + "grad_norm": 0.002186194993555546, + "learning_rate": 7.785880807689119e-06, + "loss": 0.014, + "step": 7530 + }, + { + "epoch": 5.7240463085974564, + "grad_norm": 0.007107855286449194, + "learning_rate": 7.778539827172717e-06, + "loss": 0.0021, + "step": 7540 + }, + { + "epoch": 5.731637881951034, + "grad_norm": 0.00156366394367069, + "learning_rate": 7.771190171014789e-06, + "loss": 0.0299, + "step": 7550 + }, + { + "epoch": 5.739229455304612, + "grad_norm": 0.006057819351553917, + "learning_rate": 7.763831862163715e-06, + "loss": 0.3021, + "step": 7560 + }, + { + "epoch": 5.746821028658189, + "grad_norm": 0.1267128884792328, + "learning_rate": 7.756464923594889e-06, + "loss": 0.1477, + "step": 7570 + }, + { + "epoch": 5.754412602011767, + "grad_norm": 0.003787196008488536, + "learning_rate": 7.74908937831065e-06, + "loss": 0.0012, + "step": 7580 + }, + { + "epoch": 5.762004175365345, + "grad_norm": 0.004670240916311741, + "learning_rate": 7.741705249340212e-06, + "loss": 0.0001, + "step": 7590 + }, + { + "epoch": 5.769595748718922, + "grad_norm": 0.0031925721559673548, + "learning_rate": 7.734312559739591e-06, + "loss": 0.1256, + "step": 7600 + }, + { + "epoch": 5.7771873220725, + "grad_norm": 0.05346198379993439, + "learning_rate": 7.726911332591533e-06, + "loss": 0.0297, + "step": 7610 + }, + { + "epoch": 5.7847788954260775, + "grad_norm": 9.102517127990723, + "learning_rate": 7.719501591005435e-06, + "loss": 0.0291, + "step": 7620 + }, + { + "epoch": 5.792370468779654, + "grad_norm": 0.012199531309306622, + "learning_rate": 7.71208335811729e-06, + "loss": 0.0015, + "step": 7630 + }, + { + "epoch": 5.799962042133232, + "grad_norm": 0.0010750379879027605, + "learning_rate": 7.704656657089594e-06, + "loss": 0.0002, + "step": 7640 + }, + { + "epoch": 5.8075536154868095, + "grad_norm": 0.0029223288875073195, + "learning_rate": 7.697221511111289e-06, + "loss": 0.0404, + "step": 7650 + }, + { + "epoch": 5.815145188840387, + "grad_norm": 0.030176958069205284, + "learning_rate": 7.689777943397684e-06, + "loss": 0.0002, + "step": 7660 + }, + { + "epoch": 5.822736762193965, + "grad_norm": 0.01166499499231577, + "learning_rate": 7.682325977190386e-06, + "loss": 0.0381, + "step": 7670 + }, + { + "epoch": 5.830328335547542, + "grad_norm": 32.26509475708008, + "learning_rate": 7.674865635757219e-06, + "loss": 0.0993, + "step": 7680 + }, + { + "epoch": 5.83791990890112, + "grad_norm": 19.091943740844727, + "learning_rate": 7.667396942392165e-06, + "loss": 0.0492, + "step": 7690 + }, + { + "epoch": 5.845511482254698, + "grad_norm": 0.01752518303692341, + "learning_rate": 7.659919920415282e-06, + "loss": 0.0053, + "step": 7700 + }, + { + "epoch": 5.853103055608274, + "grad_norm": 0.0013000709004700184, + "learning_rate": 7.652434593172629e-06, + "loss": 0.2842, + "step": 7710 + }, + { + "epoch": 5.860694628961852, + "grad_norm": 76.4178695678711, + "learning_rate": 7.6449409840362e-06, + "loss": 0.019, + "step": 7720 + }, + { + "epoch": 5.86828620231543, + "grad_norm": 49.07400894165039, + "learning_rate": 7.63743911640385e-06, + "loss": 0.0412, + "step": 7730 + }, + { + "epoch": 5.875877775669007, + "grad_norm": 0.018517136573791504, + "learning_rate": 7.629929013699215e-06, + "loss": 0.0113, + "step": 7740 + }, + { + "epoch": 5.883469349022585, + "grad_norm": 0.0009308361331932247, + "learning_rate": 7.622410699371651e-06, + "loss": 0.0975, + "step": 7750 + }, + { + "epoch": 5.8910609223761625, + "grad_norm": 0.002873294521123171, + "learning_rate": 7.614884196896146e-06, + "loss": 0.0001, + "step": 7760 + }, + { + "epoch": 5.89865249572974, + "grad_norm": 0.5766377449035645, + "learning_rate": 7.607349529773263e-06, + "loss": 0.0894, + "step": 7770 + }, + { + "epoch": 5.906244069083318, + "grad_norm": 0.33659154176712036, + "learning_rate": 7.599806721529048e-06, + "loss": 0.026, + "step": 7780 + }, + { + "epoch": 5.913835642436895, + "grad_norm": 0.06800296902656555, + "learning_rate": 7.592255795714978e-06, + "loss": 0.001, + "step": 7790 + }, + { + "epoch": 5.921427215790473, + "grad_norm": 0.010890863835811615, + "learning_rate": 7.5846967759078646e-06, + "loss": 0.0515, + "step": 7800 + }, + { + "epoch": 5.92901878914405, + "grad_norm": 0.0007496042526327074, + "learning_rate": 7.577129685709802e-06, + "loss": 0.0196, + "step": 7810 + }, + { + "epoch": 5.936610362497627, + "grad_norm": 0.12547799944877625, + "learning_rate": 7.569554548748076e-06, + "loss": 0.0212, + "step": 7820 + }, + { + "epoch": 5.944201935851205, + "grad_norm": 0.000410243752412498, + "learning_rate": 7.561971388675101e-06, + "loss": 0.0001, + "step": 7830 + }, + { + "epoch": 5.951793509204783, + "grad_norm": 0.0626864954829216, + "learning_rate": 7.554380229168341e-06, + "loss": 0.1047, + "step": 7840 + }, + { + "epoch": 5.95938508255836, + "grad_norm": 0.0048113660886883736, + "learning_rate": 7.546781093930238e-06, + "loss": 0.0166, + "step": 7850 + }, + { + "epoch": 5.966976655911938, + "grad_norm": 0.04934828728437424, + "learning_rate": 7.539174006688137e-06, + "loss": 0.1765, + "step": 7860 + }, + { + "epoch": 5.974568229265516, + "grad_norm": 3.118401527404785, + "learning_rate": 7.531558991194214e-06, + "loss": 0.0369, + "step": 7870 + }, + { + "epoch": 5.982159802619093, + "grad_norm": 33.45072937011719, + "learning_rate": 7.523936071225395e-06, + "loss": 0.1186, + "step": 7880 + }, + { + "epoch": 5.98975137597267, + "grad_norm": 0.09529292583465576, + "learning_rate": 7.516305270583291e-06, + "loss": 0.0382, + "step": 7890 + }, + { + "epoch": 5.997342949326248, + "grad_norm": 0.09993643313646317, + "learning_rate": 7.50866661309412e-06, + "loss": 0.1966, + "step": 7900 + }, + { + "epoch": 5.999620421332321, + "eval_f1": 0.9453778934602862, + "eval_loss": 0.17724575102329254, + "eval_precision": 0.9455308702748206, + "eval_recall": 0.9453717754172989, + "eval_runtime": 75.7194, + "eval_samples_per_second": 17.406, + "eval_steps_per_second": 17.406, + "step": 7903 + }, + { + "epoch": 6.004934522679825, + "grad_norm": 0.5747145414352417, + "learning_rate": 7.5010201226086285e-06, + "loss": 0.0792, + "step": 7910 + }, + { + "epoch": 6.012526096033403, + "grad_norm": 15.592010498046875, + "learning_rate": 7.493365823002023e-06, + "loss": 0.066, + "step": 7920 + }, + { + "epoch": 6.0201176693869805, + "grad_norm": 0.002133031841367483, + "learning_rate": 7.4857037381738924e-06, + "loss": 0.0001, + "step": 7930 + }, + { + "epoch": 6.027709242740558, + "grad_norm": 0.006577716208994389, + "learning_rate": 7.478033892048134e-06, + "loss": 0.0005, + "step": 7940 + }, + { + "epoch": 6.035300816094136, + "grad_norm": 0.0061035482212901115, + "learning_rate": 7.470356308572879e-06, + "loss": 0.0, + "step": 7950 + }, + { + "epoch": 6.042892389447713, + "grad_norm": 0.0037885792553424835, + "learning_rate": 7.462671011720417e-06, + "loss": 0.0001, + "step": 7960 + }, + { + "epoch": 6.050483962801291, + "grad_norm": 0.010262789204716682, + "learning_rate": 7.454978025487121e-06, + "loss": 0.0007, + "step": 7970 + }, + { + "epoch": 6.058075536154868, + "grad_norm": 0.0021226617973297834, + "learning_rate": 7.447277373893373e-06, + "loss": 0.0386, + "step": 7980 + }, + { + "epoch": 6.065667109508445, + "grad_norm": 0.00850209966301918, + "learning_rate": 7.439569080983493e-06, + "loss": 0.0008, + "step": 7990 + }, + { + "epoch": 6.073258682862023, + "grad_norm": 0.004618831444531679, + "learning_rate": 7.431853170825658e-06, + "loss": 0.0, + "step": 8000 + }, + { + "epoch": 6.080850256215601, + "grad_norm": 0.0010309051722288132, + "learning_rate": 7.424129667511824e-06, + "loss": 0.0174, + "step": 8010 + }, + { + "epoch": 6.088441829569178, + "grad_norm": 0.005731165409088135, + "learning_rate": 7.4163985951576616e-06, + "loss": 0.0099, + "step": 8020 + }, + { + "epoch": 6.096033402922756, + "grad_norm": 2.437437057495117, + "learning_rate": 7.408659977902474e-06, + "loss": 0.159, + "step": 8030 + }, + { + "epoch": 6.1036249762763335, + "grad_norm": 0.008021681569516659, + "learning_rate": 7.400913839909119e-06, + "loss": 0.0002, + "step": 8040 + }, + { + "epoch": 6.111216549629911, + "grad_norm": 0.0012970505049452186, + "learning_rate": 7.3931602053639414e-06, + "loss": 0.0527, + "step": 8050 + }, + { + "epoch": 6.118808122983489, + "grad_norm": 0.031485993415117264, + "learning_rate": 7.385399098476691e-06, + "loss": 0.0416, + "step": 8060 + }, + { + "epoch": 6.1263996963370655, + "grad_norm": 0.037826113402843475, + "learning_rate": 7.377630543480447e-06, + "loss": 0.0064, + "step": 8070 + }, + { + "epoch": 6.133991269690643, + "grad_norm": 0.007939423434436321, + "learning_rate": 7.369854564631549e-06, + "loss": 0.0004, + "step": 8080 + }, + { + "epoch": 6.141582843044221, + "grad_norm": 0.011576803401112556, + "learning_rate": 7.3620711862095116e-06, + "loss": 0.0003, + "step": 8090 + }, + { + "epoch": 6.149174416397798, + "grad_norm": 0.01118936575949192, + "learning_rate": 7.354280432516957e-06, + "loss": 0.0002, + "step": 8100 + }, + { + "epoch": 6.156765989751376, + "grad_norm": 0.001931383740156889, + "learning_rate": 7.346482327879535e-06, + "loss": 0.0009, + "step": 8110 + }, + { + "epoch": 6.164357563104954, + "grad_norm": 0.005506934132426977, + "learning_rate": 7.338676896645848e-06, + "loss": 0.0567, + "step": 8120 + }, + { + "epoch": 6.171949136458531, + "grad_norm": 0.07792196422815323, + "learning_rate": 7.330864163187372e-06, + "loss": 0.0003, + "step": 8130 + }, + { + "epoch": 6.179540709812109, + "grad_norm": 0.06636549532413483, + "learning_rate": 7.323044151898388e-06, + "loss": 0.0658, + "step": 8140 + }, + { + "epoch": 6.1871322831656865, + "grad_norm": 0.0012724515981972218, + "learning_rate": 7.3152168871959e-06, + "loss": 0.0605, + "step": 8150 + }, + { + "epoch": 6.194723856519263, + "grad_norm": 0.0033073413651436567, + "learning_rate": 7.307382393519556e-06, + "loss": 0.0732, + "step": 8160 + }, + { + "epoch": 6.202315429872841, + "grad_norm": 0.00361923361197114, + "learning_rate": 7.299540695331579e-06, + "loss": 0.0054, + "step": 8170 + }, + { + "epoch": 6.2099070032264185, + "grad_norm": 0.0007601641118526459, + "learning_rate": 7.291691817116686e-06, + "loss": 0.0001, + "step": 8180 + }, + { + "epoch": 6.217498576579996, + "grad_norm": 0.0025373934768140316, + "learning_rate": 7.283835783382015e-06, + "loss": 0.0567, + "step": 8190 + }, + { + "epoch": 6.225090149933574, + "grad_norm": 0.0037624204996973276, + "learning_rate": 7.275972618657041e-06, + "loss": 0.0001, + "step": 8200 + }, + { + "epoch": 6.232681723287151, + "grad_norm": 0.002659817226231098, + "learning_rate": 7.268102347493511e-06, + "loss": 0.0727, + "step": 8210 + }, + { + "epoch": 6.240273296640729, + "grad_norm": 0.08516960591077805, + "learning_rate": 7.260224994465357e-06, + "loss": 0.001, + "step": 8220 + }, + { + "epoch": 6.247864869994307, + "grad_norm": 0.03827419877052307, + "learning_rate": 7.252340584168624e-06, + "loss": 0.0023, + "step": 8230 + }, + { + "epoch": 6.255456443347883, + "grad_norm": 0.0027726832777261734, + "learning_rate": 7.2444491412213914e-06, + "loss": 0.0536, + "step": 8240 + }, + { + "epoch": 6.263048016701461, + "grad_norm": 0.0064014289528131485, + "learning_rate": 7.236550690263702e-06, + "loss": 0.001, + "step": 8250 + }, + { + "epoch": 6.270639590055039, + "grad_norm": 0.005650675855576992, + "learning_rate": 7.228645255957472e-06, + "loss": 0.2206, + "step": 8260 + }, + { + "epoch": 6.278231163408616, + "grad_norm": 21.262990951538086, + "learning_rate": 7.2207328629864285e-06, + "loss": 0.0884, + "step": 8270 + }, + { + "epoch": 6.285822736762194, + "grad_norm": 0.03092315047979355, + "learning_rate": 7.212813536056025e-06, + "loss": 0.0684, + "step": 8280 + }, + { + "epoch": 6.293414310115772, + "grad_norm": 0.00995034258812666, + "learning_rate": 7.2048872998933665e-06, + "loss": 0.0003, + "step": 8290 + }, + { + "epoch": 6.301005883469349, + "grad_norm": 0.08173485100269318, + "learning_rate": 7.196954179247127e-06, + "loss": 0.0699, + "step": 8300 + }, + { + "epoch": 6.308597456822927, + "grad_norm": 0.15706369280815125, + "learning_rate": 7.189014198887478e-06, + "loss": 0.0419, + "step": 8310 + }, + { + "epoch": 6.3161890301765045, + "grad_norm": 0.44603389501571655, + "learning_rate": 7.181067383606015e-06, + "loss": 0.0374, + "step": 8320 + }, + { + "epoch": 6.323780603530081, + "grad_norm": 89.45038604736328, + "learning_rate": 7.173113758215667e-06, + "loss": 0.0231, + "step": 8330 + }, + { + "epoch": 6.331372176883659, + "grad_norm": 0.07431600242853165, + "learning_rate": 7.165153347550631e-06, + "loss": 0.007, + "step": 8340 + }, + { + "epoch": 6.3389637502372365, + "grad_norm": 0.00812879391014576, + "learning_rate": 7.15718617646629e-06, + "loss": 0.1122, + "step": 8350 + }, + { + "epoch": 6.346555323590814, + "grad_norm": 0.4049533009529114, + "learning_rate": 7.149212269839132e-06, + "loss": 0.0532, + "step": 8360 + }, + { + "epoch": 6.354146896944392, + "grad_norm": 0.403401255607605, + "learning_rate": 7.141231652566681e-06, + "loss": 0.0008, + "step": 8370 + }, + { + "epoch": 6.361738470297969, + "grad_norm": 0.8025851249694824, + "learning_rate": 7.133244349567411e-06, + "loss": 0.0221, + "step": 8380 + }, + { + "epoch": 6.369330043651547, + "grad_norm": 0.06498798727989197, + "learning_rate": 7.125250385780673e-06, + "loss": 0.0621, + "step": 8390 + }, + { + "epoch": 6.376921617005125, + "grad_norm": 0.0010519091738387942, + "learning_rate": 7.1172497861666124e-06, + "loss": 0.0404, + "step": 8400 + }, + { + "epoch": 6.384513190358702, + "grad_norm": 0.01423695683479309, + "learning_rate": 7.109242575706099e-06, + "loss": 0.0314, + "step": 8410 + }, + { + "epoch": 6.392104763712279, + "grad_norm": 0.8802148103713989, + "learning_rate": 7.10122877940064e-06, + "loss": 0.013, + "step": 8420 + }, + { + "epoch": 6.399696337065857, + "grad_norm": 0.037081677466630936, + "learning_rate": 7.093208422272309e-06, + "loss": 0.0005, + "step": 8430 + }, + { + "epoch": 6.407287910419434, + "grad_norm": 0.0005525704473257065, + "learning_rate": 7.085181529363661e-06, + "loss": 0.0972, + "step": 8440 + }, + { + "epoch": 6.414879483773012, + "grad_norm": 0.018398938700556755, + "learning_rate": 7.077148125737661e-06, + "loss": 0.1108, + "step": 8450 + }, + { + "epoch": 6.4224710571265895, + "grad_norm": 0.040173228830099106, + "learning_rate": 7.069108236477604e-06, + "loss": 0.0002, + "step": 8460 + }, + { + "epoch": 6.430062630480167, + "grad_norm": 0.009616430848836899, + "learning_rate": 7.061061886687035e-06, + "loss": 0.0013, + "step": 8470 + }, + { + "epoch": 6.437654203833745, + "grad_norm": 78.41429901123047, + "learning_rate": 7.053009101489667e-06, + "loss": 0.1232, + "step": 8480 + }, + { + "epoch": 6.445245777187322, + "grad_norm": 0.000696105882525444, + "learning_rate": 7.044949906029314e-06, + "loss": 0.0066, + "step": 8490 + }, + { + "epoch": 6.4528373505409, + "grad_norm": 0.012759624980390072, + "learning_rate": 7.036884325469797e-06, + "loss": 0.205, + "step": 8500 + }, + { + "epoch": 6.460428923894477, + "grad_norm": 51.840309143066406, + "learning_rate": 7.028812384994883e-06, + "loss": 0.1227, + "step": 8510 + }, + { + "epoch": 6.468020497248054, + "grad_norm": 0.0019890512339770794, + "learning_rate": 7.0207341098081875e-06, + "loss": 0.1419, + "step": 8520 + }, + { + "epoch": 6.475612070601632, + "grad_norm": 0.003854219801723957, + "learning_rate": 7.012649525133112e-06, + "loss": 0.0714, + "step": 8530 + }, + { + "epoch": 6.48320364395521, + "grad_norm": 0.06946977972984314, + "learning_rate": 7.004558656212754e-06, + "loss": 0.0004, + "step": 8540 + }, + { + "epoch": 6.490795217308787, + "grad_norm": 0.003731220494955778, + "learning_rate": 6.9964615283098405e-06, + "loss": 0.0017, + "step": 8550 + }, + { + "epoch": 6.498386790662365, + "grad_norm": 0.002791723469272256, + "learning_rate": 6.988358166706631e-06, + "loss": 0.0403, + "step": 8560 + }, + { + "epoch": 6.5059783640159425, + "grad_norm": 4.053121089935303, + "learning_rate": 6.980248596704856e-06, + "loss": 0.0008, + "step": 8570 + }, + { + "epoch": 6.51356993736952, + "grad_norm": 0.0038540286477655172, + "learning_rate": 6.97213284362563e-06, + "loss": 0.0003, + "step": 8580 + }, + { + "epoch": 6.521161510723097, + "grad_norm": 0.0033889245241880417, + "learning_rate": 6.96401093280937e-06, + "loss": 0.0505, + "step": 8590 + }, + { + "epoch": 6.5287530840766745, + "grad_norm": 0.0008385963155888021, + "learning_rate": 6.9558828896157225e-06, + "loss": 0.0001, + "step": 8600 + }, + { + "epoch": 6.536344657430252, + "grad_norm": 0.05049284175038338, + "learning_rate": 6.947748739423483e-06, + "loss": 0.0776, + "step": 8610 + }, + { + "epoch": 6.54393623078383, + "grad_norm": 0.014165320433676243, + "learning_rate": 6.939608507630513e-06, + "loss": 0.0339, + "step": 8620 + }, + { + "epoch": 6.551527804137407, + "grad_norm": 24.47572898864746, + "learning_rate": 6.931462219653662e-06, + "loss": 0.1604, + "step": 8630 + }, + { + "epoch": 6.559119377490985, + "grad_norm": 0.07809809595346451, + "learning_rate": 6.923309900928693e-06, + "loss": 0.0003, + "step": 8640 + }, + { + "epoch": 6.566710950844563, + "grad_norm": 0.08131968230009079, + "learning_rate": 6.915151576910194e-06, + "loss": 0.0097, + "step": 8650 + }, + { + "epoch": 6.57430252419814, + "grad_norm": 106.42731475830078, + "learning_rate": 6.906987273071509e-06, + "loss": 0.0111, + "step": 8660 + }, + { + "epoch": 6.581894097551718, + "grad_norm": 0.0046349032782018185, + "learning_rate": 6.898817014904653e-06, + "loss": 0.0601, + "step": 8670 + }, + { + "epoch": 6.589485670905296, + "grad_norm": 0.00192779372446239, + "learning_rate": 6.890640827920226e-06, + "loss": 0.0349, + "step": 8680 + }, + { + "epoch": 6.597077244258872, + "grad_norm": 0.0012624857481569052, + "learning_rate": 6.882458737647346e-06, + "loss": 0.0009, + "step": 8690 + }, + { + "epoch": 6.60466881761245, + "grad_norm": 0.00019073448493145406, + "learning_rate": 6.874270769633564e-06, + "loss": 0.0001, + "step": 8700 + }, + { + "epoch": 6.612260390966028, + "grad_norm": 0.03901955857872963, + "learning_rate": 6.866076949444781e-06, + "loss": 0.238, + "step": 8710 + }, + { + "epoch": 6.619851964319605, + "grad_norm": 0.05632855370640755, + "learning_rate": 6.857877302665169e-06, + "loss": 0.0435, + "step": 8720 + }, + { + "epoch": 6.627443537673183, + "grad_norm": 0.01720161736011505, + "learning_rate": 6.8496718548970956e-06, + "loss": 0.0208, + "step": 8730 + }, + { + "epoch": 6.6350351110267605, + "grad_norm": 0.00398442754521966, + "learning_rate": 6.8414606317610435e-06, + "loss": 0.0012, + "step": 8740 + }, + { + "epoch": 6.642626684380338, + "grad_norm": 0.02426181733608246, + "learning_rate": 6.833243658895521e-06, + "loss": 0.0004, + "step": 8750 + }, + { + "epoch": 6.650218257733916, + "grad_norm": 14.350150108337402, + "learning_rate": 6.825020961956995e-06, + "loss": 0.0823, + "step": 8760 + }, + { + "epoch": 6.6578098310874925, + "grad_norm": 0.0016744782915338874, + "learning_rate": 6.816792566619805e-06, + "loss": 0.1436, + "step": 8770 + }, + { + "epoch": 6.66540140444107, + "grad_norm": 0.020618196576833725, + "learning_rate": 6.808558498576081e-06, + "loss": 0.0006, + "step": 8780 + }, + { + "epoch": 6.672992977794648, + "grad_norm": 0.13271041214466095, + "learning_rate": 6.800318783535665e-06, + "loss": 0.0074, + "step": 8790 + }, + { + "epoch": 6.680584551148225, + "grad_norm": 0.020608441904187202, + "learning_rate": 6.792073447226034e-06, + "loss": 0.0002, + "step": 8800 + }, + { + "epoch": 6.688176124501803, + "grad_norm": 0.0014845712576061487, + "learning_rate": 6.7838225153922125e-06, + "loss": 0.0004, + "step": 8810 + }, + { + "epoch": 6.695767697855381, + "grad_norm": 0.06566622108221054, + "learning_rate": 6.775566013796699e-06, + "loss": 0.055, + "step": 8820 + }, + { + "epoch": 6.703359271208958, + "grad_norm": 0.13233526051044464, + "learning_rate": 6.767303968219383e-06, + "loss": 0.051, + "step": 8830 + }, + { + "epoch": 6.710950844562536, + "grad_norm": 12.247241020202637, + "learning_rate": 6.759036404457465e-06, + "loss": 0.171, + "step": 8840 + }, + { + "epoch": 6.7185424179161135, + "grad_norm": 0.06808517873287201, + "learning_rate": 6.750763348325371e-06, + "loss": 0.1818, + "step": 8850 + }, + { + "epoch": 6.726133991269691, + "grad_norm": 0.011621583253145218, + "learning_rate": 6.7424848256546825e-06, + "loss": 0.0119, + "step": 8860 + }, + { + "epoch": 6.733725564623268, + "grad_norm": 22.450834274291992, + "learning_rate": 6.734200862294045e-06, + "loss": 0.176, + "step": 8870 + }, + { + "epoch": 6.7413171379768455, + "grad_norm": 11.976455688476562, + "learning_rate": 6.725911484109094e-06, + "loss": 0.0507, + "step": 8880 + }, + { + "epoch": 6.748908711330423, + "grad_norm": 0.042554713785648346, + "learning_rate": 6.717616716982369e-06, + "loss": 0.0004, + "step": 8890 + }, + { + "epoch": 6.756500284684001, + "grad_norm": 0.0029066246934235096, + "learning_rate": 6.7093165868132415e-06, + "loss": 0.0066, + "step": 8900 + }, + { + "epoch": 6.764091858037578, + "grad_norm": 0.31371551752090454, + "learning_rate": 6.701011119517824e-06, + "loss": 0.0311, + "step": 8910 + }, + { + "epoch": 6.771683431391156, + "grad_norm": 0.025408325716853142, + "learning_rate": 6.692700341028893e-06, + "loss": 0.0002, + "step": 8920 + }, + { + "epoch": 6.779275004744734, + "grad_norm": 0.6896237730979919, + "learning_rate": 6.684384277295813e-06, + "loss": 0.003, + "step": 8930 + }, + { + "epoch": 6.78686657809831, + "grad_norm": 0.0014387418050318956, + "learning_rate": 6.676062954284447e-06, + "loss": 0.1432, + "step": 8940 + }, + { + "epoch": 6.794458151451888, + "grad_norm": 0.012326021678745747, + "learning_rate": 6.667736397977079e-06, + "loss": 0.0131, + "step": 8950 + }, + { + "epoch": 6.802049724805466, + "grad_norm": 0.010481426492333412, + "learning_rate": 6.659404634372338e-06, + "loss": 0.0027, + "step": 8960 + }, + { + "epoch": 6.809641298159043, + "grad_norm": 0.11520393937826157, + "learning_rate": 6.6510676894851065e-06, + "loss": 0.0008, + "step": 8970 + }, + { + "epoch": 6.817232871512621, + "grad_norm": 14.105742454528809, + "learning_rate": 6.6427255893464495e-06, + "loss": 0.1792, + "step": 8980 + }, + { + "epoch": 6.8248244448661985, + "grad_norm": 0.012812143191695213, + "learning_rate": 6.634378360003525e-06, + "loss": 0.0001, + "step": 8990 + }, + { + "epoch": 6.832416018219776, + "grad_norm": 0.0041709113866090775, + "learning_rate": 6.62602602751951e-06, + "loss": 0.0001, + "step": 9000 + }, + { + "epoch": 6.840007591573354, + "grad_norm": 0.0038161997217684984, + "learning_rate": 6.6176686179735095e-06, + "loss": 0.0665, + "step": 9010 + }, + { + "epoch": 6.847599164926931, + "grad_norm": 0.30405986309051514, + "learning_rate": 6.6093061574604875e-06, + "loss": 0.0624, + "step": 9020 + }, + { + "epoch": 6.855190738280509, + "grad_norm": 0.001419481704942882, + "learning_rate": 6.600938672091178e-06, + "loss": 0.0001, + "step": 9030 + }, + { + "epoch": 6.862782311634086, + "grad_norm": 0.005425265524536371, + "learning_rate": 6.592566187992e-06, + "loss": 0.0115, + "step": 9040 + }, + { + "epoch": 6.870373884987663, + "grad_norm": 0.009964833967387676, + "learning_rate": 6.584188731304984e-06, + "loss": 0.0001, + "step": 9050 + }, + { + "epoch": 6.877965458341241, + "grad_norm": 17.450939178466797, + "learning_rate": 6.575806328187684e-06, + "loss": 0.0065, + "step": 9060 + }, + { + "epoch": 6.885557031694819, + "grad_norm": 0.5963069796562195, + "learning_rate": 6.567419004813105e-06, + "loss": 0.162, + "step": 9070 + }, + { + "epoch": 6.893148605048396, + "grad_norm": 0.002563629997894168, + "learning_rate": 6.559026787369608e-06, + "loss": 0.0006, + "step": 9080 + }, + { + "epoch": 6.900740178401974, + "grad_norm": 0.0032906217966228724, + "learning_rate": 6.550629702060836e-06, + "loss": 0.0576, + "step": 9090 + }, + { + "epoch": 6.908331751755552, + "grad_norm": 0.00252812379039824, + "learning_rate": 6.542227775105636e-06, + "loss": 0.0003, + "step": 9100 + }, + { + "epoch": 6.915923325109129, + "grad_norm": 0.13027949631214142, + "learning_rate": 6.533821032737968e-06, + "loss": 0.1393, + "step": 9110 + }, + { + "epoch": 6.923514898462706, + "grad_norm": 0.0013868529349565506, + "learning_rate": 6.525409501206828e-06, + "loss": 0.0003, + "step": 9120 + }, + { + "epoch": 6.931106471816284, + "grad_norm": 0.0035531616304069757, + "learning_rate": 6.516993206776167e-06, + "loss": 0.0516, + "step": 9130 + }, + { + "epoch": 6.938698045169861, + "grad_norm": 0.02282761037349701, + "learning_rate": 6.508572175724809e-06, + "loss": 0.0811, + "step": 9140 + }, + { + "epoch": 6.946289618523439, + "grad_norm": 29.90252685546875, + "learning_rate": 6.500146434346363e-06, + "loss": 0.065, + "step": 9150 + }, + { + "epoch": 6.9538811918770165, + "grad_norm": 0.14673539996147156, + "learning_rate": 6.4917160089491475e-06, + "loss": 0.0004, + "step": 9160 + }, + { + "epoch": 6.961472765230594, + "grad_norm": 2.630889892578125, + "learning_rate": 6.483280925856108e-06, + "loss": 0.039, + "step": 9170 + }, + { + "epoch": 6.969064338584172, + "grad_norm": 0.005536849144846201, + "learning_rate": 6.474841211404732e-06, + "loss": 0.0212, + "step": 9180 + }, + { + "epoch": 6.976655911937749, + "grad_norm": 13.343396186828613, + "learning_rate": 6.466396891946967e-06, + "loss": 0.1344, + "step": 9190 + }, + { + "epoch": 6.984247485291327, + "grad_norm": 13.473750114440918, + "learning_rate": 6.457947993849138e-06, + "loss": 0.0461, + "step": 9200 + }, + { + "epoch": 6.991839058644905, + "grad_norm": 0.10873476415872574, + "learning_rate": 6.4494945434918695e-06, + "loss": 0.0008, + "step": 9210 + }, + { + "epoch": 6.999430631998481, + "grad_norm": 0.08516258746385574, + "learning_rate": 6.441036567269999e-06, + "loss": 0.0006, + "step": 9220 + }, + { + "epoch": 6.999430631998481, + "eval_f1": 0.9574758853469025, + "eval_loss": 0.15473049879074097, + "eval_precision": 0.9565893515212521, + "eval_recall": 0.9575113808801214, + "eval_runtime": 75.8291, + "eval_samples_per_second": 17.381, + "eval_steps_per_second": 17.381, + "step": 9220 + }, + { + "epoch": 7.007022205352059, + "grad_norm": 0.4523492455482483, + "learning_rate": 6.432574091592495e-06, + "loss": 0.0557, + "step": 9230 + }, + { + "epoch": 7.014613778705637, + "grad_norm": 0.03357968479394913, + "learning_rate": 6.424107142882371e-06, + "loss": 0.0008, + "step": 9240 + }, + { + "epoch": 7.022205352059214, + "grad_norm": 46.4831657409668, + "learning_rate": 6.415635747576613e-06, + "loss": 0.0078, + "step": 9250 + }, + { + "epoch": 7.029796925412792, + "grad_norm": 0.010737122967839241, + "learning_rate": 6.40715993212609e-06, + "loss": 0.0003, + "step": 9260 + }, + { + "epoch": 7.0373884987663695, + "grad_norm": 21.10315704345703, + "learning_rate": 6.398679722995468e-06, + "loss": 0.1309, + "step": 9270 + }, + { + "epoch": 7.044980072119947, + "grad_norm": 0.010574131272733212, + "learning_rate": 6.3901951466631355e-06, + "loss": 0.0138, + "step": 9280 + }, + { + "epoch": 7.052571645473525, + "grad_norm": 0.0182713121175766, + "learning_rate": 6.381706229621117e-06, + "loss": 0.0002, + "step": 9290 + }, + { + "epoch": 7.0601632188271015, + "grad_norm": 0.10783802717924118, + "learning_rate": 6.373212998374989e-06, + "loss": 0.0337, + "step": 9300 + }, + { + "epoch": 7.067754792180679, + "grad_norm": 0.001446128822863102, + "learning_rate": 6.364715479443798e-06, + "loss": 0.0007, + "step": 9310 + }, + { + "epoch": 7.075346365534257, + "grad_norm": 0.00694943917915225, + "learning_rate": 6.356213699359982e-06, + "loss": 0.071, + "step": 9320 + }, + { + "epoch": 7.082937938887834, + "grad_norm": 0.09859494864940643, + "learning_rate": 6.347707684669278e-06, + "loss": 0.0005, + "step": 9330 + }, + { + "epoch": 7.090529512241412, + "grad_norm": 0.0008273068233393133, + "learning_rate": 6.33919746193065e-06, + "loss": 0.0, + "step": 9340 + }, + { + "epoch": 7.09812108559499, + "grad_norm": 0.0038316529244184494, + "learning_rate": 6.330683057716198e-06, + "loss": 0.0002, + "step": 9350 + }, + { + "epoch": 7.105712658948567, + "grad_norm": 0.0030708136036992073, + "learning_rate": 6.322164498611081e-06, + "loss": 0.0444, + "step": 9360 + }, + { + "epoch": 7.113304232302145, + "grad_norm": 0.0017414516769349575, + "learning_rate": 6.313641811213429e-06, + "loss": 0.0001, + "step": 9370 + }, + { + "epoch": 7.1208958056557226, + "grad_norm": 0.0035761166363954544, + "learning_rate": 6.305115022134262e-06, + "loss": 0.0001, + "step": 9380 + }, + { + "epoch": 7.128487379009299, + "grad_norm": 0.006457789335399866, + "learning_rate": 6.296584157997408e-06, + "loss": 0.0, + "step": 9390 + }, + { + "epoch": 7.136078952362877, + "grad_norm": 0.002314153825864196, + "learning_rate": 6.288049245439419e-06, + "loss": 0.0, + "step": 9400 + }, + { + "epoch": 7.1436705257164546, + "grad_norm": 0.008694717660546303, + "learning_rate": 6.279510311109487e-06, + "loss": 0.0001, + "step": 9410 + }, + { + "epoch": 7.151262099070032, + "grad_norm": 0.0009509180672466755, + "learning_rate": 6.270967381669362e-06, + "loss": 0.0001, + "step": 9420 + }, + { + "epoch": 7.15885367242361, + "grad_norm": 0.009006676264107227, + "learning_rate": 6.262420483793267e-06, + "loss": 0.0605, + "step": 9430 + }, + { + "epoch": 7.1664452457771874, + "grad_norm": 0.048271872103214264, + "learning_rate": 6.253869644167816e-06, + "loss": 0.3191, + "step": 9440 + }, + { + "epoch": 7.174036819130765, + "grad_norm": 0.001320886891335249, + "learning_rate": 6.245314889491933e-06, + "loss": 0.0066, + "step": 9450 + }, + { + "epoch": 7.181628392484343, + "grad_norm": 0.4799332916736603, + "learning_rate": 6.236756246476765e-06, + "loss": 0.0261, + "step": 9460 + }, + { + "epoch": 7.18921996583792, + "grad_norm": 1.649972677230835, + "learning_rate": 6.228193741845598e-06, + "loss": 0.001, + "step": 9470 + }, + { + "epoch": 7.196811539191497, + "grad_norm": 0.001544089405797422, + "learning_rate": 6.219627402333779e-06, + "loss": 0.0001, + "step": 9480 + }, + { + "epoch": 7.204403112545075, + "grad_norm": 0.0058356523513793945, + "learning_rate": 6.211057254688625e-06, + "loss": 0.0005, + "step": 9490 + }, + { + "epoch": 7.211994685898652, + "grad_norm": 0.001541537931188941, + "learning_rate": 6.202483325669345e-06, + "loss": 0.0001, + "step": 9500 + }, + { + "epoch": 7.21958625925223, + "grad_norm": 0.002176716923713684, + "learning_rate": 6.193905642046957e-06, + "loss": 0.0472, + "step": 9510 + }, + { + "epoch": 7.227177832605808, + "grad_norm": 1.9937260150909424, + "learning_rate": 6.1853242306041995e-06, + "loss": 0.1573, + "step": 9520 + }, + { + "epoch": 7.234769405959385, + "grad_norm": 0.005575578194111586, + "learning_rate": 6.176739118135451e-06, + "loss": 0.004, + "step": 9530 + }, + { + "epoch": 7.242360979312963, + "grad_norm": 0.000481792347272858, + "learning_rate": 6.168150331446647e-06, + "loss": 0.0001, + "step": 9540 + }, + { + "epoch": 7.2499525526665405, + "grad_norm": 17.994680404663086, + "learning_rate": 6.159557897355198e-06, + "loss": 0.1026, + "step": 9550 + }, + { + "epoch": 7.257544126020118, + "grad_norm": 0.002096704440191388, + "learning_rate": 6.1509618426898934e-06, + "loss": 0.0004, + "step": 9560 + }, + { + "epoch": 7.265135699373695, + "grad_norm": 0.8841345906257629, + "learning_rate": 6.142362194290839e-06, + "loss": 0.0017, + "step": 9570 + }, + { + "epoch": 7.2727272727272725, + "grad_norm": 0.0021465634927153587, + "learning_rate": 6.133758979009355e-06, + "loss": 0.0001, + "step": 9580 + }, + { + "epoch": 7.28031884608085, + "grad_norm": 0.000766513985581696, + "learning_rate": 6.1251522237078996e-06, + "loss": 0.2186, + "step": 9590 + }, + { + "epoch": 7.287910419434428, + "grad_norm": 0.01812721975147724, + "learning_rate": 6.116541955259986e-06, + "loss": 0.0007, + "step": 9600 + }, + { + "epoch": 7.295501992788005, + "grad_norm": 0.00034479115856811404, + "learning_rate": 6.1079282005500965e-06, + "loss": 0.0055, + "step": 9610 + }, + { + "epoch": 7.303093566141583, + "grad_norm": 0.0008322893991135061, + "learning_rate": 6.099310986473595e-06, + "loss": 0.1915, + "step": 9620 + }, + { + "epoch": 7.310685139495161, + "grad_norm": 0.001017669215798378, + "learning_rate": 6.090690339936651e-06, + "loss": 0.0001, + "step": 9630 + }, + { + "epoch": 7.318276712848738, + "grad_norm": 0.003790239803493023, + "learning_rate": 6.082066287856152e-06, + "loss": 0.0001, + "step": 9640 + }, + { + "epoch": 7.325868286202315, + "grad_norm": 0.001801560982130468, + "learning_rate": 6.073438857159617e-06, + "loss": 0.0404, + "step": 9650 + }, + { + "epoch": 7.333459859555893, + "grad_norm": 0.0027911756187677383, + "learning_rate": 6.064808074785112e-06, + "loss": 0.0215, + "step": 9660 + }, + { + "epoch": 7.34105143290947, + "grad_norm": 0.001065615564584732, + "learning_rate": 6.056173967681172e-06, + "loss": 0.0, + "step": 9670 + }, + { + "epoch": 7.348643006263048, + "grad_norm": 0.0008436132338829339, + "learning_rate": 6.047536562806712e-06, + "loss": 0.0001, + "step": 9680 + }, + { + "epoch": 7.3562345796166255, + "grad_norm": 0.8050636053085327, + "learning_rate": 6.038895887130942e-06, + "loss": 0.068, + "step": 9690 + }, + { + "epoch": 7.363826152970203, + "grad_norm": 0.011237557046115398, + "learning_rate": 6.030251967633288e-06, + "loss": 0.0001, + "step": 9700 + }, + { + "epoch": 7.371417726323781, + "grad_norm": 0.0008242133189924061, + "learning_rate": 6.021604831303303e-06, + "loss": 0.0963, + "step": 9710 + }, + { + "epoch": 7.379009299677358, + "grad_norm": 0.9633244276046753, + "learning_rate": 6.012954505140582e-06, + "loss": 0.0032, + "step": 9720 + }, + { + "epoch": 7.386600873030936, + "grad_norm": 33.064613342285156, + "learning_rate": 6.004301016154683e-06, + "loss": 0.0926, + "step": 9730 + }, + { + "epoch": 7.394192446384513, + "grad_norm": 0.010244650766253471, + "learning_rate": 5.995644391365038e-06, + "loss": 0.0, + "step": 9740 + }, + { + "epoch": 7.40178401973809, + "grad_norm": 0.0010498914634808898, + "learning_rate": 5.98698465780087e-06, + "loss": 0.2515, + "step": 9750 + }, + { + "epoch": 7.409375593091668, + "grad_norm": 0.005540487356483936, + "learning_rate": 5.978321842501108e-06, + "loss": 0.0001, + "step": 9760 + }, + { + "epoch": 7.416967166445246, + "grad_norm": 0.001410833327099681, + "learning_rate": 5.9696559725143054e-06, + "loss": 0.0024, + "step": 9770 + }, + { + "epoch": 7.424558739798823, + "grad_norm": 0.11642355471849442, + "learning_rate": 5.960987074898553e-06, + "loss": 0.0004, + "step": 9780 + }, + { + "epoch": 7.432150313152401, + "grad_norm": 0.029217828065156937, + "learning_rate": 5.952315176721395e-06, + "loss": 0.0002, + "step": 9790 + }, + { + "epoch": 7.439741886505979, + "grad_norm": 0.057612184435129166, + "learning_rate": 5.943640305059742e-06, + "loss": 0.0455, + "step": 9800 + }, + { + "epoch": 7.447333459859556, + "grad_norm": 30.20539665222168, + "learning_rate": 5.9349624869997915e-06, + "loss": 0.152, + "step": 9810 + }, + { + "epoch": 7.454925033213134, + "grad_norm": 0.011167285032570362, + "learning_rate": 5.926281749636941e-06, + "loss": 0.0013, + "step": 9820 + }, + { + "epoch": 7.462516606566711, + "grad_norm": 0.01445252075791359, + "learning_rate": 5.9175981200757026e-06, + "loss": 0.0275, + "step": 9830 + }, + { + "epoch": 7.470108179920288, + "grad_norm": 0.0006470708176493645, + "learning_rate": 5.908911625429617e-06, + "loss": 0.0004, + "step": 9840 + }, + { + "epoch": 7.477699753273866, + "grad_norm": 0.010150356218218803, + "learning_rate": 5.900222292821173e-06, + "loss": 0.0572, + "step": 9850 + }, + { + "epoch": 7.4852913266274435, + "grad_norm": 0.05601394549012184, + "learning_rate": 5.89153014938172e-06, + "loss": 0.0004, + "step": 9860 + }, + { + "epoch": 7.492882899981021, + "grad_norm": 0.007213375996798277, + "learning_rate": 5.8828352222513866e-06, + "loss": 0.0184, + "step": 9870 + }, + { + "epoch": 7.500474473334599, + "grad_norm": 0.005943207535892725, + "learning_rate": 5.874137538578984e-06, + "loss": 0.0519, + "step": 9880 + }, + { + "epoch": 7.508066046688176, + "grad_norm": 0.005052383989095688, + "learning_rate": 5.865437125521943e-06, + "loss": 0.091, + "step": 9890 + }, + { + "epoch": 7.515657620041754, + "grad_norm": 0.000759047397878021, + "learning_rate": 5.856734010246207e-06, + "loss": 0.0, + "step": 9900 + }, + { + "epoch": 7.523249193395332, + "grad_norm": 0.004873152356594801, + "learning_rate": 5.848028219926162e-06, + "loss": 0.0001, + "step": 9910 + }, + { + "epoch": 7.530840766748908, + "grad_norm": 0.0005250478279776871, + "learning_rate": 5.839319781744543e-06, + "loss": 0.0, + "step": 9920 + }, + { + "epoch": 7.538432340102486, + "grad_norm": 0.0007055936730466783, + "learning_rate": 5.830608722892352e-06, + "loss": 0.0001, + "step": 9930 + }, + { + "epoch": 7.546023913456064, + "grad_norm": 0.0024068867787718773, + "learning_rate": 5.821895070568781e-06, + "loss": 0.0001, + "step": 9940 + }, + { + "epoch": 7.553615486809641, + "grad_norm": 183.14315795898438, + "learning_rate": 5.813178851981112e-06, + "loss": 0.1222, + "step": 9950 + }, + { + "epoch": 7.561207060163219, + "grad_norm": 0.8877391219139099, + "learning_rate": 5.804460094344642e-06, + "loss": 0.0002, + "step": 9960 + }, + { + "epoch": 7.5687986335167965, + "grad_norm": 0.006915534846484661, + "learning_rate": 5.795738824882596e-06, + "loss": 0.0001, + "step": 9970 + }, + { + "epoch": 7.576390206870374, + "grad_norm": 9.879432678222656, + "learning_rate": 5.787015070826044e-06, + "loss": 0.0076, + "step": 9980 + }, + { + "epoch": 7.583981780223952, + "grad_norm": 0.004392684902995825, + "learning_rate": 5.77828885941381e-06, + "loss": 0.0001, + "step": 9990 + }, + { + "epoch": 7.5915733535775285, + "grad_norm": 0.000951431633438915, + "learning_rate": 5.769560217892395e-06, + "loss": 0.0002, + "step": 10000 + }, + { + "epoch": 7.599164926931106, + "grad_norm": 0.0021181986667215824, + "learning_rate": 5.760829173515883e-06, + "loss": 0.0002, + "step": 10010 + }, + { + "epoch": 7.606756500284684, + "grad_norm": 0.006260419264435768, + "learning_rate": 5.752095753545864e-06, + "loss": 0.0577, + "step": 10020 + }, + { + "epoch": 7.614348073638261, + "grad_norm": 0.0006751982145942748, + "learning_rate": 5.743359985251348e-06, + "loss": 0.0292, + "step": 10030 + }, + { + "epoch": 7.621939646991839, + "grad_norm": 0.00024200859479606152, + "learning_rate": 5.734621895908668e-06, + "loss": 0.0238, + "step": 10040 + }, + { + "epoch": 7.629531220345417, + "grad_norm": 0.002035447396337986, + "learning_rate": 5.725881512801413e-06, + "loss": 0.0002, + "step": 10050 + }, + { + "epoch": 7.637122793698994, + "grad_norm": 0.0007019038312137127, + "learning_rate": 5.717138863220333e-06, + "loss": 0.0982, + "step": 10060 + }, + { + "epoch": 7.644714367052572, + "grad_norm": 0.0009322810219600797, + "learning_rate": 5.7083939744632514e-06, + "loss": 0.0001, + "step": 10070 + }, + { + "epoch": 7.6523059404061495, + "grad_norm": 0.011389588937163353, + "learning_rate": 5.699646873834983e-06, + "loss": 0.0691, + "step": 10080 + }, + { + "epoch": 7.659897513759727, + "grad_norm": 0.01710079051554203, + "learning_rate": 5.690897588647253e-06, + "loss": 0.0005, + "step": 10090 + }, + { + "epoch": 7.667489087113304, + "grad_norm": 9.926609992980957, + "learning_rate": 5.6821461462186045e-06, + "loss": 0.0352, + "step": 10100 + }, + { + "epoch": 7.6750806604668815, + "grad_norm": 0.00021108197688590735, + "learning_rate": 5.673392573874316e-06, + "loss": 0.0005, + "step": 10110 + }, + { + "epoch": 7.682672233820459, + "grad_norm": 0.001629292848519981, + "learning_rate": 5.6646368989463185e-06, + "loss": 0.0479, + "step": 10120 + }, + { + "epoch": 7.690263807174037, + "grad_norm": 0.12789593636989594, + "learning_rate": 5.655879148773107e-06, + "loss": 0.0183, + "step": 10130 + }, + { + "epoch": 7.697855380527614, + "grad_norm": 0.001387747353874147, + "learning_rate": 5.647119350699655e-06, + "loss": 0.0116, + "step": 10140 + }, + { + "epoch": 7.705446953881192, + "grad_norm": 0.0015600691549479961, + "learning_rate": 5.638357532077331e-06, + "loss": 0.0316, + "step": 10150 + }, + { + "epoch": 7.71303852723477, + "grad_norm": 0.0008326400420628488, + "learning_rate": 5.629593720263816e-06, + "loss": 0.0002, + "step": 10160 + }, + { + "epoch": 7.720630100588347, + "grad_norm": 0.023590516299009323, + "learning_rate": 5.620827942623008e-06, + "loss": 0.0008, + "step": 10170 + }, + { + "epoch": 7.728221673941924, + "grad_norm": 0.000754083099309355, + "learning_rate": 5.612060226524948e-06, + "loss": 0.0365, + "step": 10180 + }, + { + "epoch": 7.735813247295502, + "grad_norm": 0.011727853678166866, + "learning_rate": 5.603290599345726e-06, + "loss": 0.0438, + "step": 10190 + }, + { + "epoch": 7.743404820649079, + "grad_norm": 0.20062032341957092, + "learning_rate": 5.5945190884674065e-06, + "loss": 0.0056, + "step": 10200 + }, + { + "epoch": 7.750996394002657, + "grad_norm": 0.30250805616378784, + "learning_rate": 5.585745721277923e-06, + "loss": 0.1501, + "step": 10210 + }, + { + "epoch": 7.758587967356235, + "grad_norm": 0.00017410292639397085, + "learning_rate": 5.5769705251710175e-06, + "loss": 0.0002, + "step": 10220 + }, + { + "epoch": 7.766179540709812, + "grad_norm": 0.011902794241905212, + "learning_rate": 5.568193527546135e-06, + "loss": 0.0001, + "step": 10230 + }, + { + "epoch": 7.77377111406339, + "grad_norm": 0.3667079508304596, + "learning_rate": 5.559414755808348e-06, + "loss": 0.0394, + "step": 10240 + }, + { + "epoch": 7.7813626874169675, + "grad_norm": 0.001953916857019067, + "learning_rate": 5.550634237368269e-06, + "loss": 0.0006, + "step": 10250 + }, + { + "epoch": 7.788954260770545, + "grad_norm": 0.0013212488265708089, + "learning_rate": 5.541851999641964e-06, + "loss": 0.0004, + "step": 10260 + }, + { + "epoch": 7.796545834124123, + "grad_norm": 0.00039594716508872807, + "learning_rate": 5.533068070050867e-06, + "loss": 0.0322, + "step": 10270 + }, + { + "epoch": 7.8041374074776995, + "grad_norm": 0.000754969718400389, + "learning_rate": 5.524282476021692e-06, + "loss": 0.1497, + "step": 10280 + }, + { + "epoch": 7.811728980831277, + "grad_norm": 0.035513028502464294, + "learning_rate": 5.515495244986356e-06, + "loss": 0.0081, + "step": 10290 + }, + { + "epoch": 7.819320554184855, + "grad_norm": 0.0016785170882940292, + "learning_rate": 5.5067064043818815e-06, + "loss": 0.0001, + "step": 10300 + }, + { + "epoch": 7.826912127538432, + "grad_norm": 8.234527194872499e-05, + "learning_rate": 5.49791598165032e-06, + "loss": 0.0001, + "step": 10310 + }, + { + "epoch": 7.83450370089201, + "grad_norm": 0.0006789985345676541, + "learning_rate": 5.489124004238662e-06, + "loss": 0.0393, + "step": 10320 + }, + { + "epoch": 7.842095274245588, + "grad_norm": 0.0023299374151974916, + "learning_rate": 5.480330499598754e-06, + "loss": 0.0046, + "step": 10330 + }, + { + "epoch": 7.849686847599165, + "grad_norm": 0.007388091180473566, + "learning_rate": 5.471535495187207e-06, + "loss": 0.0001, + "step": 10340 + }, + { + "epoch": 7.857278420952742, + "grad_norm": 0.00018302824173588306, + "learning_rate": 5.462739018465318e-06, + "loss": 0.1768, + "step": 10350 + }, + { + "epoch": 7.86486999430632, + "grad_norm": 0.00418035127222538, + "learning_rate": 5.45394109689898e-06, + "loss": 0.0165, + "step": 10360 + }, + { + "epoch": 7.872461567659897, + "grad_norm": 0.01187161449342966, + "learning_rate": 5.445141757958599e-06, + "loss": 0.0004, + "step": 10370 + }, + { + "epoch": 7.880053141013475, + "grad_norm": 0.08545250445604324, + "learning_rate": 5.436341029119004e-06, + "loss": 0.06, + "step": 10380 + }, + { + "epoch": 7.8876447143670525, + "grad_norm": 0.004683859180659056, + "learning_rate": 5.427538937859368e-06, + "loss": 0.2187, + "step": 10390 + }, + { + "epoch": 7.89523628772063, + "grad_norm": 0.0011295732110738754, + "learning_rate": 5.418735511663112e-06, + "loss": 0.0002, + "step": 10400 + }, + { + "epoch": 7.902827861074208, + "grad_norm": 0.0021211670245975256, + "learning_rate": 5.409930778017828e-06, + "loss": 0.0425, + "step": 10410 + }, + { + "epoch": 7.910419434427785, + "grad_norm": 0.0004632298951037228, + "learning_rate": 5.401124764415192e-06, + "loss": 0.0001, + "step": 10420 + }, + { + "epoch": 7.918011007781363, + "grad_norm": 0.03465382754802704, + "learning_rate": 5.392317498350876e-06, + "loss": 0.07, + "step": 10430 + }, + { + "epoch": 7.925602581134941, + "grad_norm": 0.0012545166537165642, + "learning_rate": 5.38350900732446e-06, + "loss": 0.0003, + "step": 10440 + }, + { + "epoch": 7.933194154488517, + "grad_norm": 0.0008013694896362722, + "learning_rate": 5.374699318839352e-06, + "loss": 0.0001, + "step": 10450 + }, + { + "epoch": 7.940785727842095, + "grad_norm": 0.01796998642385006, + "learning_rate": 5.365888460402695e-06, + "loss": 0.0001, + "step": 10460 + }, + { + "epoch": 7.948377301195673, + "grad_norm": 0.06785059720277786, + "learning_rate": 5.357076459525291e-06, + "loss": 0.0002, + "step": 10470 + }, + { + "epoch": 7.95596887454925, + "grad_norm": 0.001381418784148991, + "learning_rate": 5.348263343721503e-06, + "loss": 0.0001, + "step": 10480 + }, + { + "epoch": 7.963560447902828, + "grad_norm": 0.06072179973125458, + "learning_rate": 5.339449140509179e-06, + "loss": 0.0002, + "step": 10490 + }, + { + "epoch": 7.9711520212564055, + "grad_norm": 0.024496397003531456, + "learning_rate": 5.330633877409561e-06, + "loss": 0.1215, + "step": 10500 + }, + { + "epoch": 7.978743594609983, + "grad_norm": 0.0315159372985363, + "learning_rate": 5.3218175819472e-06, + "loss": 0.0001, + "step": 10510 + }, + { + "epoch": 7.986335167963561, + "grad_norm": 0.0015373720088973641, + "learning_rate": 5.313000281649872e-06, + "loss": 0.0003, + "step": 10520 + }, + { + "epoch": 7.9939267413171375, + "grad_norm": 0.12398699671030045, + "learning_rate": 5.304182004048488e-06, + "loss": 0.0002, + "step": 10530 + }, + { + "epoch": 8.0, + "eval_f1": 0.9590113159486987, + "eval_loss": 0.15769141912460327, + "eval_precision": 0.9585736334342291, + "eval_recall": 0.9590288315629742, + "eval_runtime": 75.7332, + "eval_samples_per_second": 17.403, + "eval_steps_per_second": 17.403, + "step": 10538 + } + ], + "logging_steps": 10, + "max_steps": 19755, + "num_input_tokens_seen": 0, + "num_train_epochs": 15, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.701261509159456e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}