diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,10123 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9999924371336737, + "eval_steps": 1000, + "global_step": 12396, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0008067057414760194, + "grad_norm": 0.6473819017410278, + "learning_rate": 4.838709677419355e-07, + "loss": 2.5238, + "num_input_tokens_seen": 350316, + "step": 10 + }, + { + "epoch": 0.0016134114829520387, + "grad_norm": 0.6671866774559021, + "learning_rate": 9.67741935483871e-07, + "loss": 2.5305, + "num_input_tokens_seen": 706016, + "step": 20 + }, + { + "epoch": 0.0024201172244280584, + "grad_norm": 0.5423910021781921, + "learning_rate": 1.4516129032258064e-06, + "loss": 2.5469, + "num_input_tokens_seen": 1029048, + "step": 30 + }, + { + "epoch": 0.0032268229659040775, + "grad_norm": 0.5091773867607117, + "learning_rate": 1.935483870967742e-06, + "loss": 2.4654, + "num_input_tokens_seen": 1335348, + "step": 40 + }, + { + "epoch": 0.004033528707380097, + "grad_norm": 0.46149584650993347, + "learning_rate": 2.4193548387096776e-06, + "loss": 2.4348, + "num_input_tokens_seen": 1704148, + "step": 50 + }, + { + "epoch": 0.004840234448856117, + "grad_norm": 0.4362267255783081, + "learning_rate": 2.9032258064516128e-06, + "loss": 2.4508, + "num_input_tokens_seen": 2054768, + "step": 60 + }, + { + "epoch": 0.005646940190332136, + "grad_norm": 0.38532692193984985, + "learning_rate": 3.3870967741935484e-06, + "loss": 2.4029, + "num_input_tokens_seen": 2406312, + "step": 70 + }, + { + "epoch": 0.006453645931808155, + "grad_norm": 0.3635064959526062, + "learning_rate": 3.870967741935484e-06, + "loss": 2.4462, + "num_input_tokens_seen": 2760872, + "step": 80 + }, + { + "epoch": 0.007260351673284175, + "grad_norm": 0.39128369092941284, + "learning_rate": 4.35483870967742e-06, + "loss": 2.3645, + "num_input_tokens_seen": 3107736, + "step": 90 + }, + { + "epoch": 0.008067057414760194, + "grad_norm": 0.4147037863731384, + "learning_rate": 4.838709677419355e-06, + "loss": 2.3476, + "num_input_tokens_seen": 3475576, + "step": 100 + }, + { + "epoch": 0.008873763156236213, + "grad_norm": 0.3666001260280609, + "learning_rate": 5.322580645161291e-06, + "loss": 2.4149, + "num_input_tokens_seen": 3855216, + "step": 110 + }, + { + "epoch": 0.009680468897712234, + "grad_norm": 0.35516056418418884, + "learning_rate": 5.8064516129032256e-06, + "loss": 2.3853, + "num_input_tokens_seen": 4199752, + "step": 120 + }, + { + "epoch": 0.010487174639188253, + "grad_norm": 0.3447749614715576, + "learning_rate": 6.290322580645162e-06, + "loss": 2.3515, + "num_input_tokens_seen": 4539728, + "step": 130 + }, + { + "epoch": 0.011293880380664272, + "grad_norm": 0.36747947335243225, + "learning_rate": 6.774193548387097e-06, + "loss": 2.3733, + "num_input_tokens_seen": 4883460, + "step": 140 + }, + { + "epoch": 0.012100586122140291, + "grad_norm": 0.33421263098716736, + "learning_rate": 7.258064516129032e-06, + "loss": 2.3517, + "num_input_tokens_seen": 5208404, + "step": 150 + }, + { + "epoch": 0.01290729186361631, + "grad_norm": 0.3581472337245941, + "learning_rate": 7.741935483870968e-06, + "loss": 2.3197, + "num_input_tokens_seen": 5565648, + "step": 160 + }, + { + "epoch": 0.01371399760509233, + "grad_norm": 0.3424622118473053, + "learning_rate": 8.225806451612904e-06, + "loss": 2.2875, + "num_input_tokens_seen": 5932124, + "step": 170 + }, + { + "epoch": 0.01452070334656835, + "grad_norm": 0.35327455401420593, + "learning_rate": 8.70967741935484e-06, + "loss": 2.3161, + "num_input_tokens_seen": 6294120, + "step": 180 + }, + { + "epoch": 0.015327409088044369, + "grad_norm": 0.35110658407211304, + "learning_rate": 9.193548387096775e-06, + "loss": 2.3089, + "num_input_tokens_seen": 6655836, + "step": 190 + }, + { + "epoch": 0.016134114829520388, + "grad_norm": 0.3083972632884979, + "learning_rate": 9.67741935483871e-06, + "loss": 2.2615, + "num_input_tokens_seen": 7006032, + "step": 200 + }, + { + "epoch": 0.01694082057099641, + "grad_norm": 0.33634695410728455, + "learning_rate": 1.0161290322580644e-05, + "loss": 2.3292, + "num_input_tokens_seen": 7346560, + "step": 210 + }, + { + "epoch": 0.017747526312472426, + "grad_norm": 0.33328714966773987, + "learning_rate": 1.0645161290322582e-05, + "loss": 2.2443, + "num_input_tokens_seen": 7695720, + "step": 220 + }, + { + "epoch": 0.018554232053948447, + "grad_norm": 0.3662545680999756, + "learning_rate": 1.1129032258064517e-05, + "loss": 2.2721, + "num_input_tokens_seen": 8057116, + "step": 230 + }, + { + "epoch": 0.019360937795424468, + "grad_norm": 0.35266175866127014, + "learning_rate": 1.1612903225806451e-05, + "loss": 2.2645, + "num_input_tokens_seen": 8392512, + "step": 240 + }, + { + "epoch": 0.020167643536900485, + "grad_norm": 0.3189757168292999, + "learning_rate": 1.2096774193548387e-05, + "loss": 2.2282, + "num_input_tokens_seen": 8739404, + "step": 250 + }, + { + "epoch": 0.020974349278376506, + "grad_norm": 0.3413700461387634, + "learning_rate": 1.2580645161290324e-05, + "loss": 2.265, + "num_input_tokens_seen": 9092044, + "step": 260 + }, + { + "epoch": 0.021781055019852523, + "grad_norm": 0.33349013328552246, + "learning_rate": 1.3064516129032258e-05, + "loss": 2.201, + "num_input_tokens_seen": 9411188, + "step": 270 + }, + { + "epoch": 0.022587760761328544, + "grad_norm": 0.3217449188232422, + "learning_rate": 1.3548387096774194e-05, + "loss": 2.2008, + "num_input_tokens_seen": 9770648, + "step": 280 + }, + { + "epoch": 0.023394466502804565, + "grad_norm": 0.33256444334983826, + "learning_rate": 1.403225806451613e-05, + "loss": 2.1969, + "num_input_tokens_seen": 10122184, + "step": 290 + }, + { + "epoch": 0.024201172244280582, + "grad_norm": 0.32968392968177795, + "learning_rate": 1.4516129032258065e-05, + "loss": 2.2431, + "num_input_tokens_seen": 10469264, + "step": 300 + }, + { + "epoch": 0.025007877985756603, + "grad_norm": 0.29683488607406616, + "learning_rate": 1.5e-05, + "loss": 2.2201, + "num_input_tokens_seen": 10825668, + "step": 310 + }, + { + "epoch": 0.02581458372723262, + "grad_norm": 0.3019179403781891, + "learning_rate": 1.5483870967741936e-05, + "loss": 2.1465, + "num_input_tokens_seen": 11224912, + "step": 320 + }, + { + "epoch": 0.02662128946870864, + "grad_norm": 0.35323160886764526, + "learning_rate": 1.596774193548387e-05, + "loss": 2.1791, + "num_input_tokens_seen": 11570236, + "step": 330 + }, + { + "epoch": 0.02742799521018466, + "grad_norm": 0.3326450288295746, + "learning_rate": 1.6451612903225807e-05, + "loss": 2.0943, + "num_input_tokens_seen": 11909600, + "step": 340 + }, + { + "epoch": 0.02823470095166068, + "grad_norm": 0.33823785185813904, + "learning_rate": 1.6935483870967744e-05, + "loss": 2.1687, + "num_input_tokens_seen": 12268976, + "step": 350 + }, + { + "epoch": 0.0290414066931367, + "grad_norm": 0.31576064229011536, + "learning_rate": 1.741935483870968e-05, + "loss": 2.0925, + "num_input_tokens_seen": 12630560, + "step": 360 + }, + { + "epoch": 0.029848112434612717, + "grad_norm": 0.33903634548187256, + "learning_rate": 1.7903225806451616e-05, + "loss": 2.1164, + "num_input_tokens_seen": 12968760, + "step": 370 + }, + { + "epoch": 0.030654818176088738, + "grad_norm": 0.33790096640586853, + "learning_rate": 1.838709677419355e-05, + "loss": 2.0834, + "num_input_tokens_seen": 13320408, + "step": 380 + }, + { + "epoch": 0.03146152391756476, + "grad_norm": 0.32104986906051636, + "learning_rate": 1.8870967741935484e-05, + "loss": 2.1008, + "num_input_tokens_seen": 13678172, + "step": 390 + }, + { + "epoch": 0.032268229659040776, + "grad_norm": 0.3284147083759308, + "learning_rate": 1.935483870967742e-05, + "loss": 2.1674, + "num_input_tokens_seen": 14048132, + "step": 400 + }, + { + "epoch": 0.03307493540051679, + "grad_norm": 0.31699198484420776, + "learning_rate": 1.9838709677419355e-05, + "loss": 2.1186, + "num_input_tokens_seen": 14376172, + "step": 410 + }, + { + "epoch": 0.03388164114199282, + "grad_norm": 0.3431970775127411, + "learning_rate": 2.032258064516129e-05, + "loss": 2.0956, + "num_input_tokens_seen": 14729936, + "step": 420 + }, + { + "epoch": 0.034688346883468835, + "grad_norm": 0.35230588912963867, + "learning_rate": 2.080645161290323e-05, + "loss": 2.0643, + "num_input_tokens_seen": 15082004, + "step": 430 + }, + { + "epoch": 0.03549505262494485, + "grad_norm": 0.3975660800933838, + "learning_rate": 2.1290322580645163e-05, + "loss": 2.1215, + "num_input_tokens_seen": 15450700, + "step": 440 + }, + { + "epoch": 0.036301758366420876, + "grad_norm": 0.31235337257385254, + "learning_rate": 2.1774193548387097e-05, + "loss": 1.9926, + "num_input_tokens_seen": 15786772, + "step": 450 + }, + { + "epoch": 0.037108464107896894, + "grad_norm": 0.33642396330833435, + "learning_rate": 2.2258064516129034e-05, + "loss": 2.0398, + "num_input_tokens_seen": 16132004, + "step": 460 + }, + { + "epoch": 0.03791516984937291, + "grad_norm": 0.3371775448322296, + "learning_rate": 2.274193548387097e-05, + "loss": 2.124, + "num_input_tokens_seen": 16524448, + "step": 470 + }, + { + "epoch": 0.038721875590848935, + "grad_norm": 0.31219518184661865, + "learning_rate": 2.3225806451612902e-05, + "loss": 2.0735, + "num_input_tokens_seen": 16888128, + "step": 480 + }, + { + "epoch": 0.03952858133232495, + "grad_norm": 0.2977409362792969, + "learning_rate": 2.370967741935484e-05, + "loss": 2.0295, + "num_input_tokens_seen": 17227520, + "step": 490 + }, + { + "epoch": 0.04033528707380097, + "grad_norm": 0.28311657905578613, + "learning_rate": 2.4193548387096773e-05, + "loss": 2.0672, + "num_input_tokens_seen": 17584540, + "step": 500 + }, + { + "epoch": 0.04114199281527699, + "grad_norm": 0.28878408670425415, + "learning_rate": 2.467741935483871e-05, + "loss": 1.9962, + "num_input_tokens_seen": 17945160, + "step": 510 + }, + { + "epoch": 0.04194869855675301, + "grad_norm": 0.3487817049026489, + "learning_rate": 2.5161290322580648e-05, + "loss": 2.062, + "num_input_tokens_seen": 18314364, + "step": 520 + }, + { + "epoch": 0.04275540429822903, + "grad_norm": 0.29766845703125, + "learning_rate": 2.5645161290322582e-05, + "loss": 2.0474, + "num_input_tokens_seen": 18629644, + "step": 530 + }, + { + "epoch": 0.043562110039705046, + "grad_norm": 1.0644810199737549, + "learning_rate": 2.6129032258064516e-05, + "loss": 2.0612, + "num_input_tokens_seen": 18969332, + "step": 540 + }, + { + "epoch": 0.04436881578118107, + "grad_norm": 0.3535769581794739, + "learning_rate": 2.6612903225806453e-05, + "loss": 1.9915, + "num_input_tokens_seen": 19331132, + "step": 550 + }, + { + "epoch": 0.04517552152265709, + "grad_norm": 0.3135438561439514, + "learning_rate": 2.7096774193548387e-05, + "loss": 1.9903, + "num_input_tokens_seen": 19675048, + "step": 560 + }, + { + "epoch": 0.045982227264133105, + "grad_norm": 0.3067088723182678, + "learning_rate": 2.758064516129032e-05, + "loss": 2.0101, + "num_input_tokens_seen": 20024728, + "step": 570 + }, + { + "epoch": 0.04678893300560913, + "grad_norm": 0.39044031500816345, + "learning_rate": 2.806451612903226e-05, + "loss": 2.0239, + "num_input_tokens_seen": 20366444, + "step": 580 + }, + { + "epoch": 0.047595638747085146, + "grad_norm": 0.28235116600990295, + "learning_rate": 2.8548387096774196e-05, + "loss": 1.984, + "num_input_tokens_seen": 20728484, + "step": 590 + }, + { + "epoch": 0.048402344488561164, + "grad_norm": 0.3268643021583557, + "learning_rate": 2.903225806451613e-05, + "loss": 2.0046, + "num_input_tokens_seen": 21053188, + "step": 600 + }, + { + "epoch": 0.04920905023003718, + "grad_norm": 0.3217363953590393, + "learning_rate": 2.9516129032258067e-05, + "loss": 2.0211, + "num_input_tokens_seen": 21401976, + "step": 610 + }, + { + "epoch": 0.050015755971513205, + "grad_norm": 0.3585478961467743, + "learning_rate": 3e-05, + "loss": 2.003, + "num_input_tokens_seen": 21721508, + "step": 620 + }, + { + "epoch": 0.05082246171298922, + "grad_norm": 0.31601133942604065, + "learning_rate": 3e-05, + "loss": 1.9809, + "num_input_tokens_seen": 22052500, + "step": 630 + }, + { + "epoch": 0.05162916745446524, + "grad_norm": 0.33022111654281616, + "learning_rate": 3e-05, + "loss": 2.0097, + "num_input_tokens_seen": 22410848, + "step": 640 + }, + { + "epoch": 0.052435873195941264, + "grad_norm": 0.30348455905914307, + "learning_rate": 3e-05, + "loss": 1.9627, + "num_input_tokens_seen": 22769696, + "step": 650 + }, + { + "epoch": 0.05324257893741728, + "grad_norm": 0.31722530722618103, + "learning_rate": 3e-05, + "loss": 1.9734, + "num_input_tokens_seen": 23141500, + "step": 660 + }, + { + "epoch": 0.0540492846788933, + "grad_norm": 0.3014906644821167, + "learning_rate": 3e-05, + "loss": 1.9408, + "num_input_tokens_seen": 23484440, + "step": 670 + }, + { + "epoch": 0.05485599042036932, + "grad_norm": 0.34274202585220337, + "learning_rate": 3e-05, + "loss": 1.9022, + "num_input_tokens_seen": 23826764, + "step": 680 + }, + { + "epoch": 0.05566269616184534, + "grad_norm": 0.31761202216148376, + "learning_rate": 3e-05, + "loss": 1.98, + "num_input_tokens_seen": 24161792, + "step": 690 + }, + { + "epoch": 0.05646940190332136, + "grad_norm": 0.37271755933761597, + "learning_rate": 3e-05, + "loss": 1.9989, + "num_input_tokens_seen": 24503600, + "step": 700 + }, + { + "epoch": 0.057276107644797375, + "grad_norm": 0.3486018180847168, + "learning_rate": 3e-05, + "loss": 1.9389, + "num_input_tokens_seen": 24820152, + "step": 710 + }, + { + "epoch": 0.0580828133862734, + "grad_norm": 0.3391967713832855, + "learning_rate": 3e-05, + "loss": 1.9054, + "num_input_tokens_seen": 25190404, + "step": 720 + }, + { + "epoch": 0.058889519127749416, + "grad_norm": 0.3023532032966614, + "learning_rate": 3e-05, + "loss": 1.9632, + "num_input_tokens_seen": 25536472, + "step": 730 + }, + { + "epoch": 0.059696224869225434, + "grad_norm": 3234.732666015625, + "learning_rate": 3e-05, + "loss": 2.018, + "num_input_tokens_seen": 25890504, + "step": 740 + }, + { + "epoch": 0.06050293061070146, + "grad_norm": 2243.118408203125, + "learning_rate": 3e-05, + "loss": 1.9958, + "num_input_tokens_seen": 26256376, + "step": 750 + }, + { + "epoch": 0.061309636352177475, + "grad_norm": 0.356982946395874, + "learning_rate": 3e-05, + "loss": 1.9637, + "num_input_tokens_seen": 26638344, + "step": 760 + }, + { + "epoch": 0.06211634209365349, + "grad_norm": 0.296735942363739, + "learning_rate": 3e-05, + "loss": 1.9099, + "num_input_tokens_seen": 26962544, + "step": 770 + }, + { + "epoch": 0.06292304783512952, + "grad_norm": 0.29231005907058716, + "learning_rate": 3e-05, + "loss": 1.8578, + "num_input_tokens_seen": 27308112, + "step": 780 + }, + { + "epoch": 0.06372975357660553, + "grad_norm": 0.31055736541748047, + "learning_rate": 3e-05, + "loss": 1.8843, + "num_input_tokens_seen": 27676552, + "step": 790 + }, + { + "epoch": 0.06453645931808155, + "grad_norm": 0.3155435025691986, + "learning_rate": 3e-05, + "loss": 1.9495, + "num_input_tokens_seen": 28056316, + "step": 800 + }, + { + "epoch": 0.06534316505955758, + "grad_norm": 0.3073548972606659, + "learning_rate": 3e-05, + "loss": 1.9373, + "num_input_tokens_seen": 28404224, + "step": 810 + }, + { + "epoch": 0.06614987080103359, + "grad_norm": 0.30792343616485596, + "learning_rate": 3e-05, + "loss": 1.8845, + "num_input_tokens_seen": 28749544, + "step": 820 + }, + { + "epoch": 0.06695657654250961, + "grad_norm": 0.27868854999542236, + "learning_rate": 3e-05, + "loss": 1.8979, + "num_input_tokens_seen": 29086400, + "step": 830 + }, + { + "epoch": 0.06776328228398563, + "grad_norm": 0.2984897792339325, + "learning_rate": 3e-05, + "loss": 1.8669, + "num_input_tokens_seen": 29427836, + "step": 840 + }, + { + "epoch": 0.06856998802546165, + "grad_norm": 0.31874415278434753, + "learning_rate": 3e-05, + "loss": 1.8357, + "num_input_tokens_seen": 29777592, + "step": 850 + }, + { + "epoch": 0.06937669376693767, + "grad_norm": 0.32049503922462463, + "learning_rate": 3e-05, + "loss": 1.8856, + "num_input_tokens_seen": 30132364, + "step": 860 + }, + { + "epoch": 0.0701833995084137, + "grad_norm": 0.6638230085372925, + "learning_rate": 3e-05, + "loss": 1.8624, + "num_input_tokens_seen": 30474616, + "step": 870 + }, + { + "epoch": 0.0709901052498897, + "grad_norm": 0.293955534696579, + "learning_rate": 3e-05, + "loss": 1.8969, + "num_input_tokens_seen": 30836880, + "step": 880 + }, + { + "epoch": 0.07179681099136573, + "grad_norm": 0.34990906715393066, + "learning_rate": 3e-05, + "loss": 1.8845, + "num_input_tokens_seen": 31210872, + "step": 890 + }, + { + "epoch": 0.07260351673284175, + "grad_norm": 0.31150537729263306, + "learning_rate": 3e-05, + "loss": 1.913, + "num_input_tokens_seen": 31593820, + "step": 900 + }, + { + "epoch": 0.07341022247431776, + "grad_norm": 0.3393719792366028, + "learning_rate": 3e-05, + "loss": 1.8884, + "num_input_tokens_seen": 31948328, + "step": 910 + }, + { + "epoch": 0.07421692821579379, + "grad_norm": 0.2771390378475189, + "learning_rate": 3e-05, + "loss": 1.8384, + "num_input_tokens_seen": 32279500, + "step": 920 + }, + { + "epoch": 0.07502363395726981, + "grad_norm": 0.29383623600006104, + "learning_rate": 3e-05, + "loss": 1.7929, + "num_input_tokens_seen": 32639840, + "step": 930 + }, + { + "epoch": 0.07583033969874582, + "grad_norm": 0.2876071333885193, + "learning_rate": 3e-05, + "loss": 1.8682, + "num_input_tokens_seen": 32976672, + "step": 940 + }, + { + "epoch": 0.07663704544022185, + "grad_norm": 0.2755143940448761, + "learning_rate": 3e-05, + "loss": 1.8409, + "num_input_tokens_seen": 33315328, + "step": 950 + }, + { + "epoch": 0.07744375118169787, + "grad_norm": 0.31250065565109253, + "learning_rate": 3e-05, + "loss": 1.8467, + "num_input_tokens_seen": 33691124, + "step": 960 + }, + { + "epoch": 0.07825045692317388, + "grad_norm": 0.3030893802642822, + "learning_rate": 3e-05, + "loss": 1.8342, + "num_input_tokens_seen": 34049184, + "step": 970 + }, + { + "epoch": 0.0790571626646499, + "grad_norm": 0.2992667555809021, + "learning_rate": 3e-05, + "loss": 1.8645, + "num_input_tokens_seen": 34402068, + "step": 980 + }, + { + "epoch": 0.07986386840612592, + "grad_norm": 0.2903348505496979, + "learning_rate": 3e-05, + "loss": 1.8328, + "num_input_tokens_seen": 34749788, + "step": 990 + }, + { + "epoch": 0.08067057414760194, + "grad_norm": 0.30363109707832336, + "learning_rate": 3e-05, + "loss": 1.8808, + "num_input_tokens_seen": 35147692, + "step": 1000 + }, + { + "epoch": 0.08067057414760194, + "eval_gen_len": 636.465, + "eval_loss": 1.788309931755066, + "eval_rouge1": 24.1946, + "eval_rouge2": 12.2099, + "eval_rougeL": 20.4185, + "eval_rougeLsum": 22.251, + "eval_runtime": 1680.7996, + "eval_samples_per_second": 0.119, + "eval_steps_per_second": 0.03, + "num_input_tokens_seen": 35147692, + "step": 1000 + }, + { + "epoch": 0.08147727988907796, + "grad_norm": 0.31382158398628235, + "learning_rate": 3e-05, + "loss": 1.8377, + "num_input_tokens_seen": 35457728, + "step": 1010 + }, + { + "epoch": 0.08228398563055397, + "grad_norm": 0.3211570680141449, + "learning_rate": 3e-05, + "loss": 1.791, + "num_input_tokens_seen": 35838912, + "step": 1020 + }, + { + "epoch": 0.08309069137203, + "grad_norm": 0.3069629669189453, + "learning_rate": 3e-05, + "loss": 1.8453, + "num_input_tokens_seen": 36194004, + "step": 1030 + }, + { + "epoch": 0.08389739711350602, + "grad_norm": 0.2732415497303009, + "learning_rate": 3e-05, + "loss": 1.7939, + "num_input_tokens_seen": 36530872, + "step": 1040 + }, + { + "epoch": 0.08470410285498203, + "grad_norm": 0.31079530715942383, + "learning_rate": 3e-05, + "loss": 1.7718, + "num_input_tokens_seen": 36900376, + "step": 1050 + }, + { + "epoch": 0.08551080859645806, + "grad_norm": 0.28770914673805237, + "learning_rate": 3e-05, + "loss": 1.8129, + "num_input_tokens_seen": 37248588, + "step": 1060 + }, + { + "epoch": 0.08631751433793408, + "grad_norm": 0.31988024711608887, + "learning_rate": 3e-05, + "loss": 1.8535, + "num_input_tokens_seen": 37626604, + "step": 1070 + }, + { + "epoch": 0.08712422007941009, + "grad_norm": 0.2785434126853943, + "learning_rate": 3e-05, + "loss": 1.8293, + "num_input_tokens_seen": 37968388, + "step": 1080 + }, + { + "epoch": 0.08793092582088612, + "grad_norm": 0.3427545726299286, + "learning_rate": 3e-05, + "loss": 1.7788, + "num_input_tokens_seen": 38308276, + "step": 1090 + }, + { + "epoch": 0.08873763156236214, + "grad_norm": 0.3006548583507538, + "learning_rate": 3e-05, + "loss": 1.7762, + "num_input_tokens_seen": 38669908, + "step": 1100 + }, + { + "epoch": 0.08954433730383815, + "grad_norm": 0.32136908173561096, + "learning_rate": 3e-05, + "loss": 1.845, + "num_input_tokens_seen": 39013520, + "step": 1110 + }, + { + "epoch": 0.09035104304531417, + "grad_norm": 0.34362053871154785, + "learning_rate": 3e-05, + "loss": 1.8068, + "num_input_tokens_seen": 39354192, + "step": 1120 + }, + { + "epoch": 0.0911577487867902, + "grad_norm": 0.3446958661079407, + "learning_rate": 3e-05, + "loss": 1.8646, + "num_input_tokens_seen": 39704092, + "step": 1130 + }, + { + "epoch": 0.09196445452826621, + "grad_norm": 0.3206467032432556, + "learning_rate": 3e-05, + "loss": 1.8266, + "num_input_tokens_seen": 40064272, + "step": 1140 + }, + { + "epoch": 0.09277116026974223, + "grad_norm": 0.2903178036212921, + "learning_rate": 3e-05, + "loss": 1.8034, + "num_input_tokens_seen": 40447744, + "step": 1150 + }, + { + "epoch": 0.09357786601121826, + "grad_norm": 0.29461219906806946, + "learning_rate": 3e-05, + "loss": 1.8363, + "num_input_tokens_seen": 40784056, + "step": 1160 + }, + { + "epoch": 0.09438457175269427, + "grad_norm": 0.32987499237060547, + "learning_rate": 3e-05, + "loss": 1.8514, + "num_input_tokens_seen": 41142580, + "step": 1170 + }, + { + "epoch": 0.09519127749417029, + "grad_norm": 0.31194567680358887, + "learning_rate": 3e-05, + "loss": 1.8027, + "num_input_tokens_seen": 41471144, + "step": 1180 + }, + { + "epoch": 0.0959979832356463, + "grad_norm": 0.2921917736530304, + "learning_rate": 3e-05, + "loss": 1.8098, + "num_input_tokens_seen": 41810900, + "step": 1190 + }, + { + "epoch": 0.09680468897712233, + "grad_norm": 0.2785918116569519, + "learning_rate": 3e-05, + "loss": 1.8202, + "num_input_tokens_seen": 42140460, + "step": 1200 + }, + { + "epoch": 0.09761139471859835, + "grad_norm": 0.3230614960193634, + "learning_rate": 3e-05, + "loss": 1.7923, + "num_input_tokens_seen": 42482488, + "step": 1210 + }, + { + "epoch": 0.09841810046007436, + "grad_norm": 0.2865009009838104, + "learning_rate": 3e-05, + "loss": 1.7968, + "num_input_tokens_seen": 42810344, + "step": 1220 + }, + { + "epoch": 0.09922480620155039, + "grad_norm": 0.32666832208633423, + "learning_rate": 3e-05, + "loss": 1.7991, + "num_input_tokens_seen": 43154724, + "step": 1230 + }, + { + "epoch": 0.10003151194302641, + "grad_norm": 0.28828418254852295, + "learning_rate": 3e-05, + "loss": 1.7948, + "num_input_tokens_seen": 43514588, + "step": 1240 + }, + { + "epoch": 0.10083821768450242, + "grad_norm": 0.2931421101093292, + "learning_rate": 3e-05, + "loss": 1.7972, + "num_input_tokens_seen": 43860916, + "step": 1250 + }, + { + "epoch": 0.10164492342597845, + "grad_norm": 0.3084103465080261, + "learning_rate": 3e-05, + "loss": 1.7792, + "num_input_tokens_seen": 44227052, + "step": 1260 + }, + { + "epoch": 0.10245162916745447, + "grad_norm": 0.27955740690231323, + "learning_rate": 3e-05, + "loss": 1.761, + "num_input_tokens_seen": 44614048, + "step": 1270 + }, + { + "epoch": 0.10325833490893048, + "grad_norm": 0.2971053421497345, + "learning_rate": 3e-05, + "loss": 1.8307, + "num_input_tokens_seen": 44971956, + "step": 1280 + }, + { + "epoch": 0.1040650406504065, + "grad_norm": 0.3030679225921631, + "learning_rate": 3e-05, + "loss": 1.8344, + "num_input_tokens_seen": 45324808, + "step": 1290 + }, + { + "epoch": 0.10487174639188253, + "grad_norm": 0.31672757863998413, + "learning_rate": 3e-05, + "loss": 1.8331, + "num_input_tokens_seen": 45676540, + "step": 1300 + }, + { + "epoch": 0.10567845213335854, + "grad_norm": 0.3107895255088806, + "learning_rate": 3e-05, + "loss": 1.7838, + "num_input_tokens_seen": 46049464, + "step": 1310 + }, + { + "epoch": 0.10648515787483456, + "grad_norm": 0.3014747202396393, + "learning_rate": 3e-05, + "loss": 1.7451, + "num_input_tokens_seen": 46387884, + "step": 1320 + }, + { + "epoch": 0.10729186361631059, + "grad_norm": 0.3187197148799896, + "learning_rate": 3e-05, + "loss": 1.7539, + "num_input_tokens_seen": 46736308, + "step": 1330 + }, + { + "epoch": 0.1080985693577866, + "grad_norm": 0.29054009914398193, + "learning_rate": 3e-05, + "loss": 1.7769, + "num_input_tokens_seen": 47072184, + "step": 1340 + }, + { + "epoch": 0.10890527509926262, + "grad_norm": 0.2759428322315216, + "learning_rate": 3e-05, + "loss": 1.7871, + "num_input_tokens_seen": 47436176, + "step": 1350 + }, + { + "epoch": 0.10971198084073865, + "grad_norm": 0.3081207275390625, + "learning_rate": 3e-05, + "loss": 1.7234, + "num_input_tokens_seen": 47787408, + "step": 1360 + }, + { + "epoch": 0.11051868658221466, + "grad_norm": 0.2889757454395294, + "learning_rate": 3e-05, + "loss": 1.8438, + "num_input_tokens_seen": 48142540, + "step": 1370 + }, + { + "epoch": 0.11132539232369068, + "grad_norm": 0.29038187861442566, + "learning_rate": 3e-05, + "loss": 1.7569, + "num_input_tokens_seen": 48486176, + "step": 1380 + }, + { + "epoch": 0.1121320980651667, + "grad_norm": 0.2944973409175873, + "learning_rate": 3e-05, + "loss": 1.769, + "num_input_tokens_seen": 48856256, + "step": 1390 + }, + { + "epoch": 0.11293880380664272, + "grad_norm": 0.2953120470046997, + "learning_rate": 3e-05, + "loss": 1.7863, + "num_input_tokens_seen": 49197100, + "step": 1400 + }, + { + "epoch": 0.11374550954811874, + "grad_norm": 0.2875744700431824, + "learning_rate": 3e-05, + "loss": 1.7009, + "num_input_tokens_seen": 49520196, + "step": 1410 + }, + { + "epoch": 0.11455221528959475, + "grad_norm": 0.2693103849887848, + "learning_rate": 3e-05, + "loss": 1.7705, + "num_input_tokens_seen": 49881204, + "step": 1420 + }, + { + "epoch": 0.11535892103107077, + "grad_norm": 0.2919449210166931, + "learning_rate": 3e-05, + "loss": 1.7068, + "num_input_tokens_seen": 50202060, + "step": 1430 + }, + { + "epoch": 0.1161656267725468, + "grad_norm": 0.2909579575061798, + "learning_rate": 3e-05, + "loss": 1.7693, + "num_input_tokens_seen": 50539700, + "step": 1440 + }, + { + "epoch": 0.11697233251402281, + "grad_norm": 0.29420360922813416, + "learning_rate": 3e-05, + "loss": 1.7307, + "num_input_tokens_seen": 50919492, + "step": 1450 + }, + { + "epoch": 0.11777903825549883, + "grad_norm": 0.3208655118942261, + "learning_rate": 3e-05, + "loss": 1.7425, + "num_input_tokens_seen": 51243268, + "step": 1460 + }, + { + "epoch": 0.11858574399697486, + "grad_norm": 0.2889709174633026, + "learning_rate": 3e-05, + "loss": 1.7642, + "num_input_tokens_seen": 51599620, + "step": 1470 + }, + { + "epoch": 0.11939244973845087, + "grad_norm": 0.29108598828315735, + "learning_rate": 3e-05, + "loss": 1.7404, + "num_input_tokens_seen": 51960000, + "step": 1480 + }, + { + "epoch": 0.12019915547992689, + "grad_norm": 0.3082159757614136, + "learning_rate": 3e-05, + "loss": 1.7389, + "num_input_tokens_seen": 52343160, + "step": 1490 + }, + { + "epoch": 0.12100586122140292, + "grad_norm": 0.30964505672454834, + "learning_rate": 3e-05, + "loss": 1.7052, + "num_input_tokens_seen": 52695936, + "step": 1500 + }, + { + "epoch": 0.12181256696287893, + "grad_norm": 0.2976539433002472, + "learning_rate": 3e-05, + "loss": 1.7637, + "num_input_tokens_seen": 53041128, + "step": 1510 + }, + { + "epoch": 0.12261927270435495, + "grad_norm": 0.2930919826030731, + "learning_rate": 3e-05, + "loss": 1.7254, + "num_input_tokens_seen": 53382704, + "step": 1520 + }, + { + "epoch": 0.12342597844583097, + "grad_norm": 0.31611040234565735, + "learning_rate": 3e-05, + "loss": 1.786, + "num_input_tokens_seen": 53728720, + "step": 1530 + }, + { + "epoch": 0.12423268418730699, + "grad_norm": 0.3480939269065857, + "learning_rate": 3e-05, + "loss": 1.76, + "num_input_tokens_seen": 54063164, + "step": 1540 + }, + { + "epoch": 0.125039389928783, + "grad_norm": 0.31007248163223267, + "learning_rate": 3e-05, + "loss": 1.7805, + "num_input_tokens_seen": 54386028, + "step": 1550 + }, + { + "epoch": 0.12584609567025903, + "grad_norm": 0.2958042621612549, + "learning_rate": 3e-05, + "loss": 1.7668, + "num_input_tokens_seen": 54761288, + "step": 1560 + }, + { + "epoch": 0.12665280141173504, + "grad_norm": 0.2833440899848938, + "learning_rate": 3e-05, + "loss": 1.747, + "num_input_tokens_seen": 55134672, + "step": 1570 + }, + { + "epoch": 0.12745950715321105, + "grad_norm": 0.2970580458641052, + "learning_rate": 3e-05, + "loss": 1.7097, + "num_input_tokens_seen": 55522524, + "step": 1580 + }, + { + "epoch": 0.1282662128946871, + "grad_norm": 0.3164750635623932, + "learning_rate": 3e-05, + "loss": 1.7395, + "num_input_tokens_seen": 55869072, + "step": 1590 + }, + { + "epoch": 0.1290729186361631, + "grad_norm": 0.32586508989334106, + "learning_rate": 3e-05, + "loss": 1.748, + "num_input_tokens_seen": 56239036, + "step": 1600 + }, + { + "epoch": 0.1298796243776391, + "grad_norm": 0.27935513854026794, + "learning_rate": 3e-05, + "loss": 1.7256, + "num_input_tokens_seen": 56579324, + "step": 1610 + }, + { + "epoch": 0.13068633011911515, + "grad_norm": 0.3307097256183624, + "learning_rate": 3e-05, + "loss": 1.668, + "num_input_tokens_seen": 56928744, + "step": 1620 + }, + { + "epoch": 0.13149303586059116, + "grad_norm": 0.3158148229122162, + "learning_rate": 3e-05, + "loss": 1.7193, + "num_input_tokens_seen": 57286024, + "step": 1630 + }, + { + "epoch": 0.13229974160206717, + "grad_norm": 0.29580333828926086, + "learning_rate": 3e-05, + "loss": 1.7249, + "num_input_tokens_seen": 57671300, + "step": 1640 + }, + { + "epoch": 0.1331064473435432, + "grad_norm": 0.26224178075790405, + "learning_rate": 3e-05, + "loss": 1.6971, + "num_input_tokens_seen": 58044600, + "step": 1650 + }, + { + "epoch": 0.13391315308501922, + "grad_norm": 0.2952196002006531, + "learning_rate": 3e-05, + "loss": 1.7619, + "num_input_tokens_seen": 58389844, + "step": 1660 + }, + { + "epoch": 0.13471985882649523, + "grad_norm": 0.30456557869911194, + "learning_rate": 3e-05, + "loss": 1.7429, + "num_input_tokens_seen": 58754384, + "step": 1670 + }, + { + "epoch": 0.13552656456797127, + "grad_norm": 0.2966090142726898, + "learning_rate": 3e-05, + "loss": 1.7241, + "num_input_tokens_seen": 59114496, + "step": 1680 + }, + { + "epoch": 0.13633327030944728, + "grad_norm": 0.2919583320617676, + "learning_rate": 3e-05, + "loss": 1.6988, + "num_input_tokens_seen": 59464252, + "step": 1690 + }, + { + "epoch": 0.1371399760509233, + "grad_norm": 0.2832421064376831, + "learning_rate": 3e-05, + "loss": 1.7817, + "num_input_tokens_seen": 59837848, + "step": 1700 + }, + { + "epoch": 0.13794668179239933, + "grad_norm": 0.2778345048427582, + "learning_rate": 3e-05, + "loss": 1.6825, + "num_input_tokens_seen": 60182016, + "step": 1710 + }, + { + "epoch": 0.13875338753387534, + "grad_norm": 0.3401370048522949, + "learning_rate": 3e-05, + "loss": 1.7525, + "num_input_tokens_seen": 60532724, + "step": 1720 + }, + { + "epoch": 0.13956009327535135, + "grad_norm": 0.30803683400154114, + "learning_rate": 3e-05, + "loss": 1.6473, + "num_input_tokens_seen": 60868756, + "step": 1730 + }, + { + "epoch": 0.1403667990168274, + "grad_norm": 0.2971110939979553, + "learning_rate": 3e-05, + "loss": 1.6967, + "num_input_tokens_seen": 61206004, + "step": 1740 + }, + { + "epoch": 0.1411735047583034, + "grad_norm": 0.3091312646865845, + "learning_rate": 3e-05, + "loss": 1.6649, + "num_input_tokens_seen": 61578372, + "step": 1750 + }, + { + "epoch": 0.1419802104997794, + "grad_norm": 0.25792524218559265, + "learning_rate": 3e-05, + "loss": 1.6868, + "num_input_tokens_seen": 61954252, + "step": 1760 + }, + { + "epoch": 0.14278691624125545, + "grad_norm": 0.32082629203796387, + "learning_rate": 3e-05, + "loss": 1.6844, + "num_input_tokens_seen": 62312328, + "step": 1770 + }, + { + "epoch": 0.14359362198273146, + "grad_norm": 0.2915956974029541, + "learning_rate": 3e-05, + "loss": 1.6998, + "num_input_tokens_seen": 62657528, + "step": 1780 + }, + { + "epoch": 0.14440032772420747, + "grad_norm": 0.28821295499801636, + "learning_rate": 3e-05, + "loss": 1.7053, + "num_input_tokens_seen": 63010336, + "step": 1790 + }, + { + "epoch": 0.1452070334656835, + "grad_norm": 0.2947831451892853, + "learning_rate": 3e-05, + "loss": 1.7078, + "num_input_tokens_seen": 63341864, + "step": 1800 + }, + { + "epoch": 0.14601373920715952, + "grad_norm": 0.31316396594047546, + "learning_rate": 3e-05, + "loss": 1.6593, + "num_input_tokens_seen": 63696096, + "step": 1810 + }, + { + "epoch": 0.14682044494863553, + "grad_norm": 0.3107188642024994, + "learning_rate": 3e-05, + "loss": 1.6506, + "num_input_tokens_seen": 64077708, + "step": 1820 + }, + { + "epoch": 0.14762715069011156, + "grad_norm": 0.3115972876548767, + "learning_rate": 3e-05, + "loss": 1.6887, + "num_input_tokens_seen": 64437828, + "step": 1830 + }, + { + "epoch": 0.14843385643158757, + "grad_norm": 0.34425589442253113, + "learning_rate": 3e-05, + "loss": 1.6977, + "num_input_tokens_seen": 64819332, + "step": 1840 + }, + { + "epoch": 0.14924056217306358, + "grad_norm": 0.27634525299072266, + "learning_rate": 3e-05, + "loss": 1.7464, + "num_input_tokens_seen": 65179536, + "step": 1850 + }, + { + "epoch": 0.15004726791453962, + "grad_norm": 0.31853121519088745, + "learning_rate": 3e-05, + "loss": 1.6382, + "num_input_tokens_seen": 65551676, + "step": 1860 + }, + { + "epoch": 0.15085397365601563, + "grad_norm": 0.30623626708984375, + "learning_rate": 3e-05, + "loss": 1.6798, + "num_input_tokens_seen": 65900132, + "step": 1870 + }, + { + "epoch": 0.15166067939749164, + "grad_norm": 0.28665515780448914, + "learning_rate": 3e-05, + "loss": 1.6672, + "num_input_tokens_seen": 66266544, + "step": 1880 + }, + { + "epoch": 0.15246738513896768, + "grad_norm": 0.29499661922454834, + "learning_rate": 3e-05, + "loss": 1.6661, + "num_input_tokens_seen": 66606176, + "step": 1890 + }, + { + "epoch": 0.1532740908804437, + "grad_norm": 0.3188175559043884, + "learning_rate": 3e-05, + "loss": 1.6772, + "num_input_tokens_seen": 66975608, + "step": 1900 + }, + { + "epoch": 0.1540807966219197, + "grad_norm": 0.31832584738731384, + "learning_rate": 3e-05, + "loss": 1.6704, + "num_input_tokens_seen": 67350296, + "step": 1910 + }, + { + "epoch": 0.15488750236339574, + "grad_norm": 0.329738974571228, + "learning_rate": 3e-05, + "loss": 1.7244, + "num_input_tokens_seen": 67707796, + "step": 1920 + }, + { + "epoch": 0.15569420810487175, + "grad_norm": 0.2936003804206848, + "learning_rate": 3e-05, + "loss": 1.6684, + "num_input_tokens_seen": 68036860, + "step": 1930 + }, + { + "epoch": 0.15650091384634776, + "grad_norm": 0.30164700746536255, + "learning_rate": 3e-05, + "loss": 1.7125, + "num_input_tokens_seen": 68377696, + "step": 1940 + }, + { + "epoch": 0.15730761958782377, + "grad_norm": 0.3079434931278229, + "learning_rate": 3e-05, + "loss": 1.6165, + "num_input_tokens_seen": 68748300, + "step": 1950 + }, + { + "epoch": 0.1581143253292998, + "grad_norm": 0.36346644163131714, + "learning_rate": 3e-05, + "loss": 1.6648, + "num_input_tokens_seen": 69128968, + "step": 1960 + }, + { + "epoch": 0.15892103107077582, + "grad_norm": 0.2884806990623474, + "learning_rate": 3e-05, + "loss": 1.6391, + "num_input_tokens_seen": 69471156, + "step": 1970 + }, + { + "epoch": 0.15972773681225183, + "grad_norm": 0.2658495306968689, + "learning_rate": 3e-05, + "loss": 1.6525, + "num_input_tokens_seen": 69820100, + "step": 1980 + }, + { + "epoch": 0.16053444255372787, + "grad_norm": 0.31078723073005676, + "learning_rate": 3e-05, + "loss": 1.6115, + "num_input_tokens_seen": 70181056, + "step": 1990 + }, + { + "epoch": 0.16134114829520388, + "grad_norm": 0.28954872488975525, + "learning_rate": 3e-05, + "loss": 1.6545, + "num_input_tokens_seen": 70510224, + "step": 2000 + }, + { + "epoch": 0.16134114829520388, + "eval_gen_len": 577.04, + "eval_loss": 1.5985389947891235, + "eval_rouge1": 28.9492, + "eval_rouge2": 15.3233, + "eval_rougeL": 23.871, + "eval_rougeLsum": 26.9919, + "eval_runtime": 1635.8727, + "eval_samples_per_second": 0.122, + "eval_steps_per_second": 0.031, + "num_input_tokens_seen": 70510224, + "step": 2000 + }, + { + "epoch": 0.1621478540366799, + "grad_norm": 0.2800785005092621, + "learning_rate": 3e-05, + "loss": 1.658, + "num_input_tokens_seen": 70896684, + "step": 2010 + }, + { + "epoch": 0.16295455977815593, + "grad_norm": 0.3101065754890442, + "learning_rate": 3e-05, + "loss": 1.6204, + "num_input_tokens_seen": 71245800, + "step": 2020 + }, + { + "epoch": 0.16376126551963194, + "grad_norm": 0.27418360114097595, + "learning_rate": 3e-05, + "loss": 1.6942, + "num_input_tokens_seen": 71561040, + "step": 2030 + }, + { + "epoch": 0.16456797126110795, + "grad_norm": 0.29117581248283386, + "learning_rate": 3e-05, + "loss": 1.6862, + "num_input_tokens_seen": 71917876, + "step": 2040 + }, + { + "epoch": 0.165374677002584, + "grad_norm": 0.3083847165107727, + "learning_rate": 3e-05, + "loss": 1.681, + "num_input_tokens_seen": 72278364, + "step": 2050 + }, + { + "epoch": 0.16618138274406, + "grad_norm": 0.29766711592674255, + "learning_rate": 3e-05, + "loss": 1.7143, + "num_input_tokens_seen": 72618996, + "step": 2060 + }, + { + "epoch": 0.166988088485536, + "grad_norm": 0.311576247215271, + "learning_rate": 3e-05, + "loss": 1.6782, + "num_input_tokens_seen": 72956980, + "step": 2070 + }, + { + "epoch": 0.16779479422701205, + "grad_norm": 0.5800204277038574, + "learning_rate": 3e-05, + "loss": 1.6647, + "num_input_tokens_seen": 73356960, + "step": 2080 + }, + { + "epoch": 0.16860149996848806, + "grad_norm": 37.67682647705078, + "learning_rate": 3e-05, + "loss": 1.6091, + "num_input_tokens_seen": 73675816, + "step": 2090 + }, + { + "epoch": 0.16940820570996407, + "grad_norm": 0.27842187881469727, + "learning_rate": 3e-05, + "loss": 1.6636, + "num_input_tokens_seen": 74032256, + "step": 2100 + }, + { + "epoch": 0.1702149114514401, + "grad_norm": 0.29616591334342957, + "learning_rate": 3e-05, + "loss": 1.6134, + "num_input_tokens_seen": 74398400, + "step": 2110 + }, + { + "epoch": 0.17102161719291611, + "grad_norm": 0.3454131782054901, + "learning_rate": 3e-05, + "loss": 1.6459, + "num_input_tokens_seen": 74733948, + "step": 2120 + }, + { + "epoch": 0.17182832293439212, + "grad_norm": 0.28399163484573364, + "learning_rate": 3e-05, + "loss": 1.6734, + "num_input_tokens_seen": 75108376, + "step": 2130 + }, + { + "epoch": 0.17263502867586816, + "grad_norm": 0.2860686480998993, + "learning_rate": 3e-05, + "loss": 1.6208, + "num_input_tokens_seen": 75448896, + "step": 2140 + }, + { + "epoch": 0.17344173441734417, + "grad_norm": 0.26892679929733276, + "learning_rate": 3e-05, + "loss": 1.636, + "num_input_tokens_seen": 75797696, + "step": 2150 + }, + { + "epoch": 0.17424844015882018, + "grad_norm": 0.2738756537437439, + "learning_rate": 3e-05, + "loss": 1.6832, + "num_input_tokens_seen": 76171544, + "step": 2160 + }, + { + "epoch": 0.17505514590029622, + "grad_norm": 0.29067671298980713, + "learning_rate": 3e-05, + "loss": 1.6717, + "num_input_tokens_seen": 76547816, + "step": 2170 + }, + { + "epoch": 0.17586185164177223, + "grad_norm": 0.28701356053352356, + "learning_rate": 3e-05, + "loss": 1.6343, + "num_input_tokens_seen": 76890468, + "step": 2180 + }, + { + "epoch": 0.17666855738324824, + "grad_norm": 0.28434693813323975, + "learning_rate": 3e-05, + "loss": 1.6087, + "num_input_tokens_seen": 77229360, + "step": 2190 + }, + { + "epoch": 0.17747526312472428, + "grad_norm": 0.3032514154911041, + "learning_rate": 3e-05, + "loss": 1.6132, + "num_input_tokens_seen": 77586424, + "step": 2200 + }, + { + "epoch": 0.1782819688662003, + "grad_norm": 0.3082556426525116, + "learning_rate": 3e-05, + "loss": 1.6578, + "num_input_tokens_seen": 77916864, + "step": 2210 + }, + { + "epoch": 0.1790886746076763, + "grad_norm": 0.28357553482055664, + "learning_rate": 3e-05, + "loss": 1.6743, + "num_input_tokens_seen": 78271708, + "step": 2220 + }, + { + "epoch": 0.17989538034915234, + "grad_norm": 0.30355584621429443, + "learning_rate": 3e-05, + "loss": 1.6257, + "num_input_tokens_seen": 78635684, + "step": 2230 + }, + { + "epoch": 0.18070208609062835, + "grad_norm": 0.3390004634857178, + "learning_rate": 3e-05, + "loss": 1.6041, + "num_input_tokens_seen": 78983708, + "step": 2240 + }, + { + "epoch": 0.18150879183210436, + "grad_norm": 0.30169346928596497, + "learning_rate": 3e-05, + "loss": 1.6102, + "num_input_tokens_seen": 79323848, + "step": 2250 + }, + { + "epoch": 0.1823154975735804, + "grad_norm": 0.33197805285453796, + "learning_rate": 3e-05, + "loss": 1.648, + "num_input_tokens_seen": 79691064, + "step": 2260 + }, + { + "epoch": 0.1831222033150564, + "grad_norm": 0.301727294921875, + "learning_rate": 3e-05, + "loss": 1.6121, + "num_input_tokens_seen": 80057832, + "step": 2270 + }, + { + "epoch": 0.18392890905653242, + "grad_norm": 0.44830191135406494, + "learning_rate": 3e-05, + "loss": 1.6317, + "num_input_tokens_seen": 80448712, + "step": 2280 + }, + { + "epoch": 0.18473561479800846, + "grad_norm": 0.2938157320022583, + "learning_rate": 3e-05, + "loss": 1.6598, + "num_input_tokens_seen": 80804116, + "step": 2290 + }, + { + "epoch": 0.18554232053948447, + "grad_norm": 0.2986922264099121, + "learning_rate": 3e-05, + "loss": 1.6292, + "num_input_tokens_seen": 81171136, + "step": 2300 + }, + { + "epoch": 0.18634902628096048, + "grad_norm": 0.2788652181625366, + "learning_rate": 3e-05, + "loss": 1.6548, + "num_input_tokens_seen": 81540708, + "step": 2310 + }, + { + "epoch": 0.18715573202243652, + "grad_norm": 0.312258243560791, + "learning_rate": 3e-05, + "loss": 1.6116, + "num_input_tokens_seen": 81870928, + "step": 2320 + }, + { + "epoch": 0.18796243776391253, + "grad_norm": 0.30631476640701294, + "learning_rate": 3e-05, + "loss": 1.6405, + "num_input_tokens_seen": 82223772, + "step": 2330 + }, + { + "epoch": 0.18876914350538854, + "grad_norm": 0.29788920283317566, + "learning_rate": 3e-05, + "loss": 1.6403, + "num_input_tokens_seen": 82541752, + "step": 2340 + }, + { + "epoch": 0.18957584924686458, + "grad_norm": 0.3009161949157715, + "learning_rate": 3e-05, + "loss": 1.6534, + "num_input_tokens_seen": 82897448, + "step": 2350 + }, + { + "epoch": 0.19038255498834059, + "grad_norm": 0.3116704821586609, + "learning_rate": 3e-05, + "loss": 1.6182, + "num_input_tokens_seen": 83272296, + "step": 2360 + }, + { + "epoch": 0.1911892607298166, + "grad_norm": 0.32088491320610046, + "learning_rate": 3e-05, + "loss": 1.6609, + "num_input_tokens_seen": 83617696, + "step": 2370 + }, + { + "epoch": 0.1919959664712926, + "grad_norm": 0.32367074489593506, + "learning_rate": 3e-05, + "loss": 1.6087, + "num_input_tokens_seen": 83977604, + "step": 2380 + }, + { + "epoch": 0.19280267221276864, + "grad_norm": 0.28396207094192505, + "learning_rate": 3e-05, + "loss": 1.5404, + "num_input_tokens_seen": 84339752, + "step": 2390 + }, + { + "epoch": 0.19360937795424465, + "grad_norm": 0.3000083267688751, + "learning_rate": 3e-05, + "loss": 1.599, + "num_input_tokens_seen": 84695344, + "step": 2400 + }, + { + "epoch": 0.19441608369572067, + "grad_norm": 0.296040415763855, + "learning_rate": 3e-05, + "loss": 1.5851, + "num_input_tokens_seen": 85022864, + "step": 2410 + }, + { + "epoch": 0.1952227894371967, + "grad_norm": 0.2935866415500641, + "learning_rate": 3e-05, + "loss": 1.6412, + "num_input_tokens_seen": 85362004, + "step": 2420 + }, + { + "epoch": 0.1960294951786727, + "grad_norm": 0.31561270356178284, + "learning_rate": 3e-05, + "loss": 1.6026, + "num_input_tokens_seen": 85682084, + "step": 2430 + }, + { + "epoch": 0.19683620092014872, + "grad_norm": 0.2930440306663513, + "learning_rate": 3e-05, + "loss": 1.6198, + "num_input_tokens_seen": 86053116, + "step": 2440 + }, + { + "epoch": 0.19764290666162476, + "grad_norm": 0.29515814781188965, + "learning_rate": 3e-05, + "loss": 1.6001, + "num_input_tokens_seen": 86407792, + "step": 2450 + }, + { + "epoch": 0.19844961240310077, + "grad_norm": 0.29479432106018066, + "learning_rate": 3e-05, + "loss": 1.5964, + "num_input_tokens_seen": 86747732, + "step": 2460 + }, + { + "epoch": 0.19925631814457678, + "grad_norm": 0.28793784976005554, + "learning_rate": 3e-05, + "loss": 1.5935, + "num_input_tokens_seen": 87105244, + "step": 2470 + }, + { + "epoch": 0.20006302388605282, + "grad_norm": 0.2696222960948944, + "learning_rate": 3e-05, + "loss": 1.5956, + "num_input_tokens_seen": 87446420, + "step": 2480 + }, + { + "epoch": 0.20086972962752883, + "grad_norm": 0.2994723618030548, + "learning_rate": 3e-05, + "loss": 1.5979, + "num_input_tokens_seen": 87770636, + "step": 2490 + }, + { + "epoch": 0.20167643536900484, + "grad_norm": 0.3084478974342346, + "learning_rate": 3e-05, + "loss": 1.659, + "num_input_tokens_seen": 88100828, + "step": 2500 + }, + { + "epoch": 0.20248314111048088, + "grad_norm": 0.2618251144886017, + "learning_rate": 3e-05, + "loss": 1.6145, + "num_input_tokens_seen": 88483608, + "step": 2510 + }, + { + "epoch": 0.2032898468519569, + "grad_norm": 0.28900229930877686, + "learning_rate": 3e-05, + "loss": 1.6172, + "num_input_tokens_seen": 88832372, + "step": 2520 + }, + { + "epoch": 0.2040965525934329, + "grad_norm": 0.30491867661476135, + "learning_rate": 3e-05, + "loss": 1.5989, + "num_input_tokens_seen": 89160240, + "step": 2530 + }, + { + "epoch": 0.20490325833490894, + "grad_norm": 0.3022604286670685, + "learning_rate": 3e-05, + "loss": 1.6099, + "num_input_tokens_seen": 89520280, + "step": 2540 + }, + { + "epoch": 0.20570996407638495, + "grad_norm": 0.27108603715896606, + "learning_rate": 3e-05, + "loss": 1.5853, + "num_input_tokens_seen": 89873136, + "step": 2550 + }, + { + "epoch": 0.20651666981786096, + "grad_norm": 0.3252500295639038, + "learning_rate": 3e-05, + "loss": 1.5598, + "num_input_tokens_seen": 90253000, + "step": 2560 + }, + { + "epoch": 0.207323375559337, + "grad_norm": 0.30979427695274353, + "learning_rate": 3e-05, + "loss": 1.5417, + "num_input_tokens_seen": 90554752, + "step": 2570 + }, + { + "epoch": 0.208130081300813, + "grad_norm": 0.2795146703720093, + "learning_rate": 3e-05, + "loss": 1.6095, + "num_input_tokens_seen": 90936820, + "step": 2580 + }, + { + "epoch": 0.20893678704228902, + "grad_norm": 0.28166651725769043, + "learning_rate": 3e-05, + "loss": 1.5759, + "num_input_tokens_seen": 91299076, + "step": 2590 + }, + { + "epoch": 0.20974349278376506, + "grad_norm": 0.3146922290325165, + "learning_rate": 3e-05, + "loss": 1.567, + "num_input_tokens_seen": 91648568, + "step": 2600 + }, + { + "epoch": 0.21055019852524107, + "grad_norm": 0.2938322424888611, + "learning_rate": 3e-05, + "loss": 1.5781, + "num_input_tokens_seen": 91998480, + "step": 2610 + }, + { + "epoch": 0.21135690426671708, + "grad_norm": 0.2709970772266388, + "learning_rate": 3e-05, + "loss": 1.5961, + "num_input_tokens_seen": 92379904, + "step": 2620 + }, + { + "epoch": 0.21216361000819312, + "grad_norm": 0.27745142579078674, + "learning_rate": 3e-05, + "loss": 1.5985, + "num_input_tokens_seen": 92719980, + "step": 2630 + }, + { + "epoch": 0.21297031574966913, + "grad_norm": 0.2709800899028778, + "learning_rate": 3e-05, + "loss": 1.5578, + "num_input_tokens_seen": 93052044, + "step": 2640 + }, + { + "epoch": 0.21377702149114514, + "grad_norm": 0.26459309458732605, + "learning_rate": 3e-05, + "loss": 1.5896, + "num_input_tokens_seen": 93415468, + "step": 2650 + }, + { + "epoch": 0.21458372723262117, + "grad_norm": 0.2925964891910553, + "learning_rate": 3e-05, + "loss": 1.6339, + "num_input_tokens_seen": 93782336, + "step": 2660 + }, + { + "epoch": 0.21539043297409718, + "grad_norm": 0.26069968938827515, + "learning_rate": 3e-05, + "loss": 1.5298, + "num_input_tokens_seen": 94122876, + "step": 2670 + }, + { + "epoch": 0.2161971387155732, + "grad_norm": 0.300855427980423, + "learning_rate": 3e-05, + "loss": 1.5816, + "num_input_tokens_seen": 94463076, + "step": 2680 + }, + { + "epoch": 0.21700384445704923, + "grad_norm": 0.283113956451416, + "learning_rate": 3e-05, + "loss": 1.6143, + "num_input_tokens_seen": 94822824, + "step": 2690 + }, + { + "epoch": 0.21781055019852524, + "grad_norm": 0.27436137199401855, + "learning_rate": 3e-05, + "loss": 1.5729, + "num_input_tokens_seen": 95153340, + "step": 2700 + }, + { + "epoch": 0.21861725594000125, + "grad_norm": 0.32102033495903015, + "learning_rate": 3e-05, + "loss": 1.5977, + "num_input_tokens_seen": 95507556, + "step": 2710 + }, + { + "epoch": 0.2194239616814773, + "grad_norm": 0.29213079810142517, + "learning_rate": 3e-05, + "loss": 1.5738, + "num_input_tokens_seen": 95868396, + "step": 2720 + }, + { + "epoch": 0.2202306674229533, + "grad_norm": 0.2973087728023529, + "learning_rate": 3e-05, + "loss": 1.5457, + "num_input_tokens_seen": 96220008, + "step": 2730 + }, + { + "epoch": 0.2210373731644293, + "grad_norm": 0.28580325841903687, + "learning_rate": 3e-05, + "loss": 1.5911, + "num_input_tokens_seen": 96579440, + "step": 2740 + }, + { + "epoch": 0.22184407890590535, + "grad_norm": 0.3367248773574829, + "learning_rate": 3e-05, + "loss": 1.5535, + "num_input_tokens_seen": 96938504, + "step": 2750 + }, + { + "epoch": 0.22265078464738136, + "grad_norm": 0.3134912848472595, + "learning_rate": 3e-05, + "loss": 1.5942, + "num_input_tokens_seen": 97306988, + "step": 2760 + }, + { + "epoch": 0.22345749038885737, + "grad_norm": 0.2981172800064087, + "learning_rate": 3e-05, + "loss": 1.5415, + "num_input_tokens_seen": 97653476, + "step": 2770 + }, + { + "epoch": 0.2242641961303334, + "grad_norm": 0.279850572347641, + "learning_rate": 3e-05, + "loss": 1.5997, + "num_input_tokens_seen": 98021276, + "step": 2780 + }, + { + "epoch": 0.22507090187180942, + "grad_norm": 0.28641802072525024, + "learning_rate": 3e-05, + "loss": 1.5944, + "num_input_tokens_seen": 98383012, + "step": 2790 + }, + { + "epoch": 0.22587760761328543, + "grad_norm": 0.3132043480873108, + "learning_rate": 3e-05, + "loss": 1.5811, + "num_input_tokens_seen": 98714760, + "step": 2800 + }, + { + "epoch": 0.22668431335476144, + "grad_norm": 0.316658079624176, + "learning_rate": 3e-05, + "loss": 1.5405, + "num_input_tokens_seen": 99073344, + "step": 2810 + }, + { + "epoch": 0.22749101909623748, + "grad_norm": 0.3003792464733124, + "learning_rate": 3e-05, + "loss": 1.5542, + "num_input_tokens_seen": 99405504, + "step": 2820 + }, + { + "epoch": 0.2282977248377135, + "grad_norm": 0.30942708253860474, + "learning_rate": 3e-05, + "loss": 1.603, + "num_input_tokens_seen": 99721668, + "step": 2830 + }, + { + "epoch": 0.2291044305791895, + "grad_norm": 0.3059990704059601, + "learning_rate": 3e-05, + "loss": 1.5811, + "num_input_tokens_seen": 100103932, + "step": 2840 + }, + { + "epoch": 0.22991113632066554, + "grad_norm": 0.28223365545272827, + "learning_rate": 3e-05, + "loss": 1.5837, + "num_input_tokens_seen": 100444700, + "step": 2850 + }, + { + "epoch": 0.23071784206214155, + "grad_norm": 0.3146832287311554, + "learning_rate": 3e-05, + "loss": 1.543, + "num_input_tokens_seen": 100799240, + "step": 2860 + }, + { + "epoch": 0.23152454780361756, + "grad_norm": 0.2812480628490448, + "learning_rate": 3e-05, + "loss": 1.573, + "num_input_tokens_seen": 101167952, + "step": 2870 + }, + { + "epoch": 0.2323312535450936, + "grad_norm": 0.29142189025878906, + "learning_rate": 3e-05, + "loss": 1.5945, + "num_input_tokens_seen": 101537024, + "step": 2880 + }, + { + "epoch": 0.2331379592865696, + "grad_norm": 0.2754380404949188, + "learning_rate": 3e-05, + "loss": 1.6187, + "num_input_tokens_seen": 101891288, + "step": 2890 + }, + { + "epoch": 0.23394466502804562, + "grad_norm": 0.2767621576786041, + "learning_rate": 3e-05, + "loss": 1.5483, + "num_input_tokens_seen": 102222636, + "step": 2900 + }, + { + "epoch": 0.23475137076952166, + "grad_norm": 0.3091464638710022, + "learning_rate": 3e-05, + "loss": 1.5503, + "num_input_tokens_seen": 102566644, + "step": 2910 + }, + { + "epoch": 0.23555807651099767, + "grad_norm": 0.29182493686676025, + "learning_rate": 3e-05, + "loss": 1.5685, + "num_input_tokens_seen": 102911868, + "step": 2920 + }, + { + "epoch": 0.23636478225247368, + "grad_norm": 0.31178319454193115, + "learning_rate": 3e-05, + "loss": 1.5439, + "num_input_tokens_seen": 103267188, + "step": 2930 + }, + { + "epoch": 0.23717148799394971, + "grad_norm": 0.2722642719745636, + "learning_rate": 3e-05, + "loss": 1.5385, + "num_input_tokens_seen": 103570216, + "step": 2940 + }, + { + "epoch": 0.23797819373542572, + "grad_norm": 0.29112839698791504, + "learning_rate": 3e-05, + "loss": 1.529, + "num_input_tokens_seen": 103952836, + "step": 2950 + }, + { + "epoch": 0.23878489947690174, + "grad_norm": 0.33165234327316284, + "learning_rate": 3e-05, + "loss": 1.5905, + "num_input_tokens_seen": 104312972, + "step": 2960 + }, + { + "epoch": 0.23959160521837777, + "grad_norm": 0.283861368894577, + "learning_rate": 3e-05, + "loss": 1.5791, + "num_input_tokens_seen": 104674176, + "step": 2970 + }, + { + "epoch": 0.24039831095985378, + "grad_norm": 0.29667556285858154, + "learning_rate": 3e-05, + "loss": 1.5679, + "num_input_tokens_seen": 105000536, + "step": 2980 + }, + { + "epoch": 0.2412050167013298, + "grad_norm": 0.2613981068134308, + "learning_rate": 3e-05, + "loss": 1.5683, + "num_input_tokens_seen": 105334356, + "step": 2990 + }, + { + "epoch": 0.24201172244280583, + "grad_norm": 0.32442784309387207, + "learning_rate": 3e-05, + "loss": 1.5522, + "num_input_tokens_seen": 105707144, + "step": 3000 + }, + { + "epoch": 0.24201172244280583, + "eval_gen_len": 537.77, + "eval_loss": 1.490655779838562, + "eval_rouge1": 30.4033, + "eval_rouge2": 16.1354, + "eval_rougeL": 24.7244, + "eval_rougeLsum": 28.5037, + "eval_runtime": 1529.7468, + "eval_samples_per_second": 0.131, + "eval_steps_per_second": 0.033, + "num_input_tokens_seen": 105707144, + "step": 3000 + }, + { + "epoch": 0.24281842818428184, + "grad_norm": 0.25999367237091064, + "learning_rate": 3e-05, + "loss": 1.5511, + "num_input_tokens_seen": 106039376, + "step": 3010 + }, + { + "epoch": 0.24362513392575785, + "grad_norm": 0.30608776211738586, + "learning_rate": 3e-05, + "loss": 1.5551, + "num_input_tokens_seen": 106400776, + "step": 3020 + }, + { + "epoch": 0.2444318396672339, + "grad_norm": 0.2672644257545471, + "learning_rate": 3e-05, + "loss": 1.5703, + "num_input_tokens_seen": 106753976, + "step": 3030 + }, + { + "epoch": 0.2452385454087099, + "grad_norm": 0.2924732565879822, + "learning_rate": 3e-05, + "loss": 1.5668, + "num_input_tokens_seen": 107084116, + "step": 3040 + }, + { + "epoch": 0.2460452511501859, + "grad_norm": 0.26746517419815063, + "learning_rate": 3e-05, + "loss": 1.5731, + "num_input_tokens_seen": 107445220, + "step": 3050 + }, + { + "epoch": 0.24685195689166195, + "grad_norm": 0.2895317077636719, + "learning_rate": 3e-05, + "loss": 1.5477, + "num_input_tokens_seen": 107824932, + "step": 3060 + }, + { + "epoch": 0.24765866263313796, + "grad_norm": 0.3116007447242737, + "learning_rate": 3e-05, + "loss": 1.5384, + "num_input_tokens_seen": 108169544, + "step": 3070 + }, + { + "epoch": 0.24846536837461397, + "grad_norm": 0.30636924505233765, + "learning_rate": 3e-05, + "loss": 1.548, + "num_input_tokens_seen": 108509580, + "step": 3080 + }, + { + "epoch": 0.24927207411609, + "grad_norm": 0.2778127193450928, + "learning_rate": 3e-05, + "loss": 1.5389, + "num_input_tokens_seen": 108841048, + "step": 3090 + }, + { + "epoch": 0.250078779857566, + "grad_norm": 0.33867573738098145, + "learning_rate": 3e-05, + "loss": 1.5356, + "num_input_tokens_seen": 109174340, + "step": 3100 + }, + { + "epoch": 0.25088548559904206, + "grad_norm": 0.3052271604537964, + "learning_rate": 3e-05, + "loss": 1.5869, + "num_input_tokens_seen": 109536332, + "step": 3110 + }, + { + "epoch": 0.25169219134051807, + "grad_norm": 0.3291682004928589, + "learning_rate": 3e-05, + "loss": 1.5583, + "num_input_tokens_seen": 109876576, + "step": 3120 + }, + { + "epoch": 0.2524988970819941, + "grad_norm": 0.27373817563056946, + "learning_rate": 3e-05, + "loss": 1.5523, + "num_input_tokens_seen": 110248928, + "step": 3130 + }, + { + "epoch": 0.2533056028234701, + "grad_norm": 0.2915042042732239, + "learning_rate": 3e-05, + "loss": 1.531, + "num_input_tokens_seen": 110605440, + "step": 3140 + }, + { + "epoch": 0.2541123085649461, + "grad_norm": 0.2974439561367035, + "learning_rate": 3e-05, + "loss": 1.5545, + "num_input_tokens_seen": 110951152, + "step": 3150 + }, + { + "epoch": 0.2549190143064221, + "grad_norm": 0.2974379062652588, + "learning_rate": 3e-05, + "loss": 1.5396, + "num_input_tokens_seen": 111293688, + "step": 3160 + }, + { + "epoch": 0.2557257200478982, + "grad_norm": 0.28520846366882324, + "learning_rate": 3e-05, + "loss": 1.553, + "num_input_tokens_seen": 111657012, + "step": 3170 + }, + { + "epoch": 0.2565324257893742, + "grad_norm": 0.2918589413166046, + "learning_rate": 3e-05, + "loss": 1.5384, + "num_input_tokens_seen": 112000840, + "step": 3180 + }, + { + "epoch": 0.2573391315308502, + "grad_norm": 0.2972608208656311, + "learning_rate": 3e-05, + "loss": 1.5092, + "num_input_tokens_seen": 112363632, + "step": 3190 + }, + { + "epoch": 0.2581458372723262, + "grad_norm": 0.28906238079071045, + "learning_rate": 3e-05, + "loss": 1.504, + "num_input_tokens_seen": 112755768, + "step": 3200 + }, + { + "epoch": 0.2589525430138022, + "grad_norm": 0.3328370451927185, + "learning_rate": 3e-05, + "loss": 1.4841, + "num_input_tokens_seen": 113115408, + "step": 3210 + }, + { + "epoch": 0.2597592487552782, + "grad_norm": 0.276845246553421, + "learning_rate": 3e-05, + "loss": 1.5259, + "num_input_tokens_seen": 113485700, + "step": 3220 + }, + { + "epoch": 0.2605659544967543, + "grad_norm": 0.2899667024612427, + "learning_rate": 3e-05, + "loss": 1.5442, + "num_input_tokens_seen": 113815188, + "step": 3230 + }, + { + "epoch": 0.2613726602382303, + "grad_norm": 0.2876961827278137, + "learning_rate": 3e-05, + "loss": 1.5318, + "num_input_tokens_seen": 114160588, + "step": 3240 + }, + { + "epoch": 0.2621793659797063, + "grad_norm": 0.28680142760276794, + "learning_rate": 3e-05, + "loss": 1.5557, + "num_input_tokens_seen": 114495188, + "step": 3250 + }, + { + "epoch": 0.2629860717211823, + "grad_norm": 0.3168465495109558, + "learning_rate": 3e-05, + "loss": 1.5693, + "num_input_tokens_seen": 114854536, + "step": 3260 + }, + { + "epoch": 0.26379277746265833, + "grad_norm": 0.28036338090896606, + "learning_rate": 3e-05, + "loss": 1.4784, + "num_input_tokens_seen": 115203172, + "step": 3270 + }, + { + "epoch": 0.26459948320413434, + "grad_norm": 0.3073316514492035, + "learning_rate": 3e-05, + "loss": 1.5274, + "num_input_tokens_seen": 115585256, + "step": 3280 + }, + { + "epoch": 0.2654061889456104, + "grad_norm": 0.28101012110710144, + "learning_rate": 3e-05, + "loss": 1.5496, + "num_input_tokens_seen": 115922364, + "step": 3290 + }, + { + "epoch": 0.2662128946870864, + "grad_norm": 0.2771126329898834, + "learning_rate": 3e-05, + "loss": 1.5634, + "num_input_tokens_seen": 116267708, + "step": 3300 + }, + { + "epoch": 0.26701960042856243, + "grad_norm": 0.3039109408855438, + "learning_rate": 3e-05, + "loss": 1.4806, + "num_input_tokens_seen": 116607404, + "step": 3310 + }, + { + "epoch": 0.26782630617003844, + "grad_norm": 0.2795468270778656, + "learning_rate": 3e-05, + "loss": 1.5449, + "num_input_tokens_seen": 116917480, + "step": 3320 + }, + { + "epoch": 0.26863301191151445, + "grad_norm": 0.2998358905315399, + "learning_rate": 3e-05, + "loss": 1.5067, + "num_input_tokens_seen": 117255260, + "step": 3330 + }, + { + "epoch": 0.26943971765299046, + "grad_norm": 0.3048727810382843, + "learning_rate": 3e-05, + "loss": 1.5021, + "num_input_tokens_seen": 117596360, + "step": 3340 + }, + { + "epoch": 0.27024642339446653, + "grad_norm": 0.31331056356430054, + "learning_rate": 3e-05, + "loss": 1.5621, + "num_input_tokens_seen": 117967920, + "step": 3350 + }, + { + "epoch": 0.27105312913594254, + "grad_norm": 0.3083108961582184, + "learning_rate": 3e-05, + "loss": 1.4923, + "num_input_tokens_seen": 118314268, + "step": 3360 + }, + { + "epoch": 0.27185983487741855, + "grad_norm": 0.36439692974090576, + "learning_rate": 3e-05, + "loss": 1.5368, + "num_input_tokens_seen": 118658628, + "step": 3370 + }, + { + "epoch": 0.27266654061889456, + "grad_norm": 0.2711757719516754, + "learning_rate": 3e-05, + "loss": 1.5048, + "num_input_tokens_seen": 119018540, + "step": 3380 + }, + { + "epoch": 0.27347324636037057, + "grad_norm": 0.2828957438468933, + "learning_rate": 3e-05, + "loss": 1.5502, + "num_input_tokens_seen": 119366464, + "step": 3390 + }, + { + "epoch": 0.2742799521018466, + "grad_norm": 0.3058261573314667, + "learning_rate": 3e-05, + "loss": 1.5167, + "num_input_tokens_seen": 119713824, + "step": 3400 + }, + { + "epoch": 0.27508665784332265, + "grad_norm": 0.2823350429534912, + "learning_rate": 3e-05, + "loss": 1.5371, + "num_input_tokens_seen": 120077416, + "step": 3410 + }, + { + "epoch": 0.27589336358479866, + "grad_norm": 0.2950865626335144, + "learning_rate": 3e-05, + "loss": 1.5208, + "num_input_tokens_seen": 120429560, + "step": 3420 + }, + { + "epoch": 0.27670006932627467, + "grad_norm": 0.2756860852241516, + "learning_rate": 3e-05, + "loss": 1.5479, + "num_input_tokens_seen": 120775808, + "step": 3430 + }, + { + "epoch": 0.2775067750677507, + "grad_norm": 0.32079747319221497, + "learning_rate": 3e-05, + "loss": 1.5235, + "num_input_tokens_seen": 121146688, + "step": 3440 + }, + { + "epoch": 0.2783134808092267, + "grad_norm": 0.2849906086921692, + "learning_rate": 3e-05, + "loss": 1.5281, + "num_input_tokens_seen": 121511252, + "step": 3450 + }, + { + "epoch": 0.2791201865507027, + "grad_norm": 0.3128233850002289, + "learning_rate": 3e-05, + "loss": 1.4737, + "num_input_tokens_seen": 121880504, + "step": 3460 + }, + { + "epoch": 0.2799268922921787, + "grad_norm": 0.281825989484787, + "learning_rate": 3e-05, + "loss": 1.4789, + "num_input_tokens_seen": 122207764, + "step": 3470 + }, + { + "epoch": 0.2807335980336548, + "grad_norm": 0.26039403676986694, + "learning_rate": 3e-05, + "loss": 1.519, + "num_input_tokens_seen": 122556148, + "step": 3480 + }, + { + "epoch": 0.2815403037751308, + "grad_norm": 0.34013232588768005, + "learning_rate": 3e-05, + "loss": 1.5325, + "num_input_tokens_seen": 122911404, + "step": 3490 + }, + { + "epoch": 0.2823470095166068, + "grad_norm": 0.3078472912311554, + "learning_rate": 3e-05, + "loss": 1.5493, + "num_input_tokens_seen": 123295332, + "step": 3500 + }, + { + "epoch": 0.2831537152580828, + "grad_norm": 0.3297036290168762, + "learning_rate": 3e-05, + "loss": 1.5111, + "num_input_tokens_seen": 123608268, + "step": 3510 + }, + { + "epoch": 0.2839604209995588, + "grad_norm": 0.2852914035320282, + "learning_rate": 3e-05, + "loss": 1.4844, + "num_input_tokens_seen": 123923784, + "step": 3520 + }, + { + "epoch": 0.2847671267410348, + "grad_norm": 0.2900603711605072, + "learning_rate": 3e-05, + "loss": 1.536, + "num_input_tokens_seen": 124255432, + "step": 3530 + }, + { + "epoch": 0.2855738324825109, + "grad_norm": 0.2996746003627777, + "learning_rate": 3e-05, + "loss": 1.4837, + "num_input_tokens_seen": 124611088, + "step": 3540 + }, + { + "epoch": 0.2863805382239869, + "grad_norm": 0.257682740688324, + "learning_rate": 3e-05, + "loss": 1.5242, + "num_input_tokens_seen": 124959064, + "step": 3550 + }, + { + "epoch": 0.2871872439654629, + "grad_norm": 0.3033203184604645, + "learning_rate": 3e-05, + "loss": 1.4912, + "num_input_tokens_seen": 125314100, + "step": 3560 + }, + { + "epoch": 0.2879939497069389, + "grad_norm": 0.3357955515384674, + "learning_rate": 3e-05, + "loss": 1.5057, + "num_input_tokens_seen": 125628132, + "step": 3570 + }, + { + "epoch": 0.28880065544841493, + "grad_norm": 0.30520308017730713, + "learning_rate": 3e-05, + "loss": 1.5287, + "num_input_tokens_seen": 125984032, + "step": 3580 + }, + { + "epoch": 0.28960736118989094, + "grad_norm": 0.3066059648990631, + "learning_rate": 3e-05, + "loss": 1.5461, + "num_input_tokens_seen": 126339664, + "step": 3590 + }, + { + "epoch": 0.290414066931367, + "grad_norm": 0.2903365194797516, + "learning_rate": 3e-05, + "loss": 1.5364, + "num_input_tokens_seen": 126680156, + "step": 3600 + }, + { + "epoch": 0.291220772672843, + "grad_norm": 0.30246102809906006, + "learning_rate": 3e-05, + "loss": 1.5888, + "num_input_tokens_seen": 127076916, + "step": 3610 + }, + { + "epoch": 0.29202747841431903, + "grad_norm": 0.28773432970046997, + "learning_rate": 3e-05, + "loss": 1.4945, + "num_input_tokens_seen": 127418188, + "step": 3620 + }, + { + "epoch": 0.29283418415579504, + "grad_norm": 0.36873912811279297, + "learning_rate": 3e-05, + "loss": 1.4849, + "num_input_tokens_seen": 127795860, + "step": 3630 + }, + { + "epoch": 0.29364088989727105, + "grad_norm": 0.31495216488838196, + "learning_rate": 3e-05, + "loss": 1.4918, + "num_input_tokens_seen": 128127020, + "step": 3640 + }, + { + "epoch": 0.29444759563874706, + "grad_norm": 0.9313835501670837, + "learning_rate": 3e-05, + "loss": 1.549, + "num_input_tokens_seen": 128472256, + "step": 3650 + }, + { + "epoch": 0.2952543013802231, + "grad_norm": 0.29919105768203735, + "learning_rate": 3e-05, + "loss": 1.5645, + "num_input_tokens_seen": 128831764, + "step": 3660 + }, + { + "epoch": 0.29606100712169914, + "grad_norm": 0.29914769530296326, + "learning_rate": 3e-05, + "loss": 1.4823, + "num_input_tokens_seen": 129175644, + "step": 3670 + }, + { + "epoch": 0.29686771286317515, + "grad_norm": 0.2776944041252136, + "learning_rate": 3e-05, + "loss": 1.4981, + "num_input_tokens_seen": 129534220, + "step": 3680 + }, + { + "epoch": 0.29767441860465116, + "grad_norm": 0.2623848021030426, + "learning_rate": 3e-05, + "loss": 1.5136, + "num_input_tokens_seen": 129882948, + "step": 3690 + }, + { + "epoch": 0.29848112434612717, + "grad_norm": 0.2865106165409088, + "learning_rate": 3e-05, + "loss": 1.4903, + "num_input_tokens_seen": 130238792, + "step": 3700 + }, + { + "epoch": 0.2992878300876032, + "grad_norm": 0.30147454142570496, + "learning_rate": 3e-05, + "loss": 1.4774, + "num_input_tokens_seen": 130602272, + "step": 3710 + }, + { + "epoch": 0.30009453582907925, + "grad_norm": 0.2756776809692383, + "learning_rate": 3e-05, + "loss": 1.5004, + "num_input_tokens_seen": 130953160, + "step": 3720 + }, + { + "epoch": 0.30090124157055526, + "grad_norm": 0.3233429193496704, + "learning_rate": 3e-05, + "loss": 1.5095, + "num_input_tokens_seen": 131287396, + "step": 3730 + }, + { + "epoch": 0.30170794731203127, + "grad_norm": 0.2846832871437073, + "learning_rate": 3e-05, + "loss": 1.5312, + "num_input_tokens_seen": 131634640, + "step": 3740 + }, + { + "epoch": 0.3025146530535073, + "grad_norm": 0.31799256801605225, + "learning_rate": 3e-05, + "loss": 1.539, + "num_input_tokens_seen": 131998680, + "step": 3750 + }, + { + "epoch": 0.3033213587949833, + "grad_norm": 0.2880600392818451, + "learning_rate": 3e-05, + "loss": 1.4928, + "num_input_tokens_seen": 132325324, + "step": 3760 + }, + { + "epoch": 0.3041280645364593, + "grad_norm": 0.3118450343608856, + "learning_rate": 3e-05, + "loss": 1.4899, + "num_input_tokens_seen": 132681648, + "step": 3770 + }, + { + "epoch": 0.30493477027793536, + "grad_norm": 0.2892366945743561, + "learning_rate": 3e-05, + "loss": 1.5506, + "num_input_tokens_seen": 133029972, + "step": 3780 + }, + { + "epoch": 0.3057414760194114, + "grad_norm": 0.26994529366493225, + "learning_rate": 3e-05, + "loss": 1.4862, + "num_input_tokens_seen": 133381324, + "step": 3790 + }, + { + "epoch": 0.3065481817608874, + "grad_norm": 0.30546241998672485, + "learning_rate": 3e-05, + "loss": 1.4856, + "num_input_tokens_seen": 133726364, + "step": 3800 + }, + { + "epoch": 0.3073548875023634, + "grad_norm": 0.31917914748191833, + "learning_rate": 3e-05, + "loss": 1.5729, + "num_input_tokens_seen": 134081304, + "step": 3810 + }, + { + "epoch": 0.3081615932438394, + "grad_norm": 0.28447583317756653, + "learning_rate": 3e-05, + "loss": 1.4627, + "num_input_tokens_seen": 134427992, + "step": 3820 + }, + { + "epoch": 0.3089682989853154, + "grad_norm": 0.2646794617176056, + "learning_rate": 3e-05, + "loss": 1.5402, + "num_input_tokens_seen": 134791020, + "step": 3830 + }, + { + "epoch": 0.3097750047267915, + "grad_norm": 0.33490800857543945, + "learning_rate": 3e-05, + "loss": 1.5013, + "num_input_tokens_seen": 135143312, + "step": 3840 + }, + { + "epoch": 0.3105817104682675, + "grad_norm": 0.28088971972465515, + "learning_rate": 3e-05, + "loss": 1.5179, + "num_input_tokens_seen": 135461584, + "step": 3850 + }, + { + "epoch": 0.3113884162097435, + "grad_norm": 0.31193193793296814, + "learning_rate": 3e-05, + "loss": 1.4818, + "num_input_tokens_seen": 135833744, + "step": 3860 + }, + { + "epoch": 0.3121951219512195, + "grad_norm": 0.2969256341457367, + "learning_rate": 3e-05, + "loss": 1.5094, + "num_input_tokens_seen": 136187480, + "step": 3870 + }, + { + "epoch": 0.3130018276926955, + "grad_norm": 0.2791529595851898, + "learning_rate": 3e-05, + "loss": 1.4803, + "num_input_tokens_seen": 136526612, + "step": 3880 + }, + { + "epoch": 0.31380853343417153, + "grad_norm": 0.2843697667121887, + "learning_rate": 3e-05, + "loss": 1.4858, + "num_input_tokens_seen": 136911180, + "step": 3890 + }, + { + "epoch": 0.31461523917564754, + "grad_norm": 0.289218932390213, + "learning_rate": 3e-05, + "loss": 1.4901, + "num_input_tokens_seen": 137252616, + "step": 3900 + }, + { + "epoch": 0.3154219449171236, + "grad_norm": 0.2953207790851593, + "learning_rate": 3e-05, + "loss": 1.498, + "num_input_tokens_seen": 137622500, + "step": 3910 + }, + { + "epoch": 0.3162286506585996, + "grad_norm": 0.2963256239891052, + "learning_rate": 3e-05, + "loss": 1.4842, + "num_input_tokens_seen": 137965636, + "step": 3920 + }, + { + "epoch": 0.31703535640007563, + "grad_norm": 0.26671716570854187, + "learning_rate": 3e-05, + "loss": 1.4532, + "num_input_tokens_seen": 138320552, + "step": 3930 + }, + { + "epoch": 0.31784206214155164, + "grad_norm": 0.2607724368572235, + "learning_rate": 3e-05, + "loss": 1.4774, + "num_input_tokens_seen": 138682864, + "step": 3940 + }, + { + "epoch": 0.31864876788302765, + "grad_norm": 0.25891661643981934, + "learning_rate": 3e-05, + "loss": 1.4808, + "num_input_tokens_seen": 139009880, + "step": 3950 + }, + { + "epoch": 0.31945547362450366, + "grad_norm": 0.2629043161869049, + "learning_rate": 3e-05, + "loss": 1.5152, + "num_input_tokens_seen": 139349380, + "step": 3960 + }, + { + "epoch": 0.3202621793659797, + "grad_norm": 0.2573290765285492, + "learning_rate": 3e-05, + "loss": 1.4592, + "num_input_tokens_seen": 139690036, + "step": 3970 + }, + { + "epoch": 0.32106888510745574, + "grad_norm": 0.291111022233963, + "learning_rate": 3e-05, + "loss": 1.5394, + "num_input_tokens_seen": 140029928, + "step": 3980 + }, + { + "epoch": 0.32187559084893175, + "grad_norm": 0.29500630497932434, + "learning_rate": 3e-05, + "loss": 1.505, + "num_input_tokens_seen": 140375124, + "step": 3990 + }, + { + "epoch": 0.32268229659040776, + "grad_norm": 0.27471858263015747, + "learning_rate": 3e-05, + "loss": 1.5059, + "num_input_tokens_seen": 140722844, + "step": 4000 + }, + { + "epoch": 0.32268229659040776, + "eval_gen_len": 522.495, + "eval_loss": 1.4203619956970215, + "eval_rouge1": 34.0294, + "eval_rouge2": 19.2608, + "eval_rougeL": 27.9322, + "eval_rougeLsum": 32.3166, + "eval_runtime": 1678.7718, + "eval_samples_per_second": 0.119, + "eval_steps_per_second": 0.03, + "num_input_tokens_seen": 140722844, + "step": 4000 + }, + { + "epoch": 0.32348900233188377, + "grad_norm": 0.28979143500328064, + "learning_rate": 3e-05, + "loss": 1.5586, + "num_input_tokens_seen": 141066960, + "step": 4010 + }, + { + "epoch": 0.3242957080733598, + "grad_norm": 0.2836126983165741, + "learning_rate": 3e-05, + "loss": 1.4956, + "num_input_tokens_seen": 141419352, + "step": 4020 + }, + { + "epoch": 0.32510241381483584, + "grad_norm": 0.28655633330345154, + "learning_rate": 3e-05, + "loss": 1.4839, + "num_input_tokens_seen": 141790804, + "step": 4030 + }, + { + "epoch": 0.32590911955631185, + "grad_norm": 0.28721150755882263, + "learning_rate": 3e-05, + "loss": 1.5154, + "num_input_tokens_seen": 142162756, + "step": 4040 + }, + { + "epoch": 0.32671582529778787, + "grad_norm": 0.30329418182373047, + "learning_rate": 3e-05, + "loss": 1.4852, + "num_input_tokens_seen": 142517624, + "step": 4050 + }, + { + "epoch": 0.3275225310392639, + "grad_norm": 0.2742053270339966, + "learning_rate": 3e-05, + "loss": 1.4663, + "num_input_tokens_seen": 142839740, + "step": 4060 + }, + { + "epoch": 0.3283292367807399, + "grad_norm": 0.2814532220363617, + "learning_rate": 3e-05, + "loss": 1.509, + "num_input_tokens_seen": 143173156, + "step": 4070 + }, + { + "epoch": 0.3291359425222159, + "grad_norm": 0.3034536838531494, + "learning_rate": 3e-05, + "loss": 1.4528, + "num_input_tokens_seen": 143537620, + "step": 4080 + }, + { + "epoch": 0.32994264826369196, + "grad_norm": 0.29641520977020264, + "learning_rate": 3e-05, + "loss": 1.4413, + "num_input_tokens_seen": 143874732, + "step": 4090 + }, + { + "epoch": 0.330749354005168, + "grad_norm": 0.2924509644508362, + "learning_rate": 3e-05, + "loss": 1.5089, + "num_input_tokens_seen": 144230600, + "step": 4100 + }, + { + "epoch": 0.331556059746644, + "grad_norm": 0.2810611128807068, + "learning_rate": 3e-05, + "loss": 1.4568, + "num_input_tokens_seen": 144595320, + "step": 4110 + }, + { + "epoch": 0.33236276548812, + "grad_norm": 0.2762203812599182, + "learning_rate": 3e-05, + "loss": 1.488, + "num_input_tokens_seen": 144946772, + "step": 4120 + }, + { + "epoch": 0.333169471229596, + "grad_norm": 0.3193224370479584, + "learning_rate": 3e-05, + "loss": 1.4391, + "num_input_tokens_seen": 145295928, + "step": 4130 + }, + { + "epoch": 0.333976176971072, + "grad_norm": 0.2631831467151642, + "learning_rate": 3e-05, + "loss": 1.4396, + "num_input_tokens_seen": 145653456, + "step": 4140 + }, + { + "epoch": 0.3347828827125481, + "grad_norm": 0.27242833375930786, + "learning_rate": 3e-05, + "loss": 1.4471, + "num_input_tokens_seen": 146017976, + "step": 4150 + }, + { + "epoch": 0.3355895884540241, + "grad_norm": 0.3117299973964691, + "learning_rate": 3e-05, + "loss": 1.448, + "num_input_tokens_seen": 146366548, + "step": 4160 + }, + { + "epoch": 0.3363962941955001, + "grad_norm": 0.28237223625183105, + "learning_rate": 3e-05, + "loss": 1.4627, + "num_input_tokens_seen": 146688608, + "step": 4170 + }, + { + "epoch": 0.3372029999369761, + "grad_norm": 0.33882033824920654, + "learning_rate": 3e-05, + "loss": 1.4841, + "num_input_tokens_seen": 147036924, + "step": 4180 + }, + { + "epoch": 0.3380097056784521, + "grad_norm": 0.2639561593532562, + "learning_rate": 3e-05, + "loss": 1.4653, + "num_input_tokens_seen": 147354544, + "step": 4190 + }, + { + "epoch": 0.33881641141992813, + "grad_norm": 0.2983449101448059, + "learning_rate": 3e-05, + "loss": 1.5031, + "num_input_tokens_seen": 147705132, + "step": 4200 + }, + { + "epoch": 0.3396231171614042, + "grad_norm": 0.30153656005859375, + "learning_rate": 3e-05, + "loss": 1.4866, + "num_input_tokens_seen": 148044316, + "step": 4210 + }, + { + "epoch": 0.3404298229028802, + "grad_norm": 0.2834070026874542, + "learning_rate": 3e-05, + "loss": 1.4838, + "num_input_tokens_seen": 148383308, + "step": 4220 + }, + { + "epoch": 0.3412365286443562, + "grad_norm": 0.28662896156311035, + "learning_rate": 3e-05, + "loss": 1.4963, + "num_input_tokens_seen": 148711800, + "step": 4230 + }, + { + "epoch": 0.34204323438583223, + "grad_norm": 0.26079222559928894, + "learning_rate": 3e-05, + "loss": 1.4763, + "num_input_tokens_seen": 149072140, + "step": 4240 + }, + { + "epoch": 0.34284994012730824, + "grad_norm": 0.29420602321624756, + "learning_rate": 3e-05, + "loss": 1.4634, + "num_input_tokens_seen": 149418364, + "step": 4250 + }, + { + "epoch": 0.34365664586878425, + "grad_norm": 0.2780504524707794, + "learning_rate": 3e-05, + "loss": 1.4612, + "num_input_tokens_seen": 149776088, + "step": 4260 + }, + { + "epoch": 0.3444633516102603, + "grad_norm": 0.308002769947052, + "learning_rate": 3e-05, + "loss": 1.4388, + "num_input_tokens_seen": 150144108, + "step": 4270 + }, + { + "epoch": 0.3452700573517363, + "grad_norm": 0.33135300874710083, + "learning_rate": 3e-05, + "loss": 1.4682, + "num_input_tokens_seen": 150494172, + "step": 4280 + }, + { + "epoch": 0.34607676309321234, + "grad_norm": 0.2844593822956085, + "learning_rate": 3e-05, + "loss": 1.5251, + "num_input_tokens_seen": 150828560, + "step": 4290 + }, + { + "epoch": 0.34688346883468835, + "grad_norm": 0.3216274082660675, + "learning_rate": 3e-05, + "loss": 1.5058, + "num_input_tokens_seen": 151201392, + "step": 4300 + }, + { + "epoch": 0.34769017457616436, + "grad_norm": 0.27584394812583923, + "learning_rate": 3e-05, + "loss": 1.4839, + "num_input_tokens_seen": 151566364, + "step": 4310 + }, + { + "epoch": 0.34849688031764037, + "grad_norm": 0.2775894105434418, + "learning_rate": 3e-05, + "loss": 1.4803, + "num_input_tokens_seen": 151904260, + "step": 4320 + }, + { + "epoch": 0.3493035860591164, + "grad_norm": 0.30853790044784546, + "learning_rate": 3e-05, + "loss": 1.4654, + "num_input_tokens_seen": 152247804, + "step": 4330 + }, + { + "epoch": 0.35011029180059244, + "grad_norm": 0.2662428617477417, + "learning_rate": 3e-05, + "loss": 1.4837, + "num_input_tokens_seen": 152605848, + "step": 4340 + }, + { + "epoch": 0.35091699754206845, + "grad_norm": 0.296486496925354, + "learning_rate": 3e-05, + "loss": 1.4151, + "num_input_tokens_seen": 152927008, + "step": 4350 + }, + { + "epoch": 0.35172370328354446, + "grad_norm": 0.314229279756546, + "learning_rate": 3e-05, + "loss": 1.4944, + "num_input_tokens_seen": 153271476, + "step": 4360 + }, + { + "epoch": 0.3525304090250205, + "grad_norm": 0.25222501158714294, + "learning_rate": 3e-05, + "loss": 1.5215, + "num_input_tokens_seen": 153595852, + "step": 4370 + }, + { + "epoch": 0.3533371147664965, + "grad_norm": 0.3103020489215851, + "learning_rate": 3e-05, + "loss": 1.4244, + "num_input_tokens_seen": 153933224, + "step": 4380 + }, + { + "epoch": 0.3541438205079725, + "grad_norm": 0.28948068618774414, + "learning_rate": 3e-05, + "loss": 1.4395, + "num_input_tokens_seen": 154243172, + "step": 4390 + }, + { + "epoch": 0.35495052624944856, + "grad_norm": 0.2793199419975281, + "learning_rate": 3e-05, + "loss": 1.4541, + "num_input_tokens_seen": 154589252, + "step": 4400 + }, + { + "epoch": 0.35575723199092457, + "grad_norm": 0.2927285432815552, + "learning_rate": 3e-05, + "loss": 1.4764, + "num_input_tokens_seen": 154948944, + "step": 4410 + }, + { + "epoch": 0.3565639377324006, + "grad_norm": 0.2556557059288025, + "learning_rate": 3e-05, + "loss": 1.4135, + "num_input_tokens_seen": 155298440, + "step": 4420 + }, + { + "epoch": 0.3573706434738766, + "grad_norm": 0.28829360008239746, + "learning_rate": 3e-05, + "loss": 1.4656, + "num_input_tokens_seen": 155686288, + "step": 4430 + }, + { + "epoch": 0.3581773492153526, + "grad_norm": 0.29673314094543457, + "learning_rate": 3e-05, + "loss": 1.3826, + "num_input_tokens_seen": 156031180, + "step": 4440 + }, + { + "epoch": 0.3589840549568286, + "grad_norm": 0.2608402371406555, + "learning_rate": 3e-05, + "loss": 1.4831, + "num_input_tokens_seen": 156361652, + "step": 4450 + }, + { + "epoch": 0.3597907606983047, + "grad_norm": 0.2800503075122833, + "learning_rate": 3e-05, + "loss": 1.4343, + "num_input_tokens_seen": 156701024, + "step": 4460 + }, + { + "epoch": 0.3605974664397807, + "grad_norm": 0.28234806656837463, + "learning_rate": 3e-05, + "loss": 1.4798, + "num_input_tokens_seen": 157070896, + "step": 4470 + }, + { + "epoch": 0.3614041721812567, + "grad_norm": 0.27914923429489136, + "learning_rate": 3e-05, + "loss": 1.4497, + "num_input_tokens_seen": 157420460, + "step": 4480 + }, + { + "epoch": 0.3622108779227327, + "grad_norm": 0.2710079550743103, + "learning_rate": 3e-05, + "loss": 1.4706, + "num_input_tokens_seen": 157779212, + "step": 4490 + }, + { + "epoch": 0.3630175836642087, + "grad_norm": 0.28353649377822876, + "learning_rate": 3e-05, + "loss": 1.4075, + "num_input_tokens_seen": 158084872, + "step": 4500 + }, + { + "epoch": 0.36382428940568473, + "grad_norm": 0.28383737802505493, + "learning_rate": 3e-05, + "loss": 1.4363, + "num_input_tokens_seen": 158417664, + "step": 4510 + }, + { + "epoch": 0.3646309951471608, + "grad_norm": 0.27592507004737854, + "learning_rate": 3e-05, + "loss": 1.4278, + "num_input_tokens_seen": 158733056, + "step": 4520 + }, + { + "epoch": 0.3654377008886368, + "grad_norm": 0.26034659147262573, + "learning_rate": 3e-05, + "loss": 1.4583, + "num_input_tokens_seen": 159062868, + "step": 4530 + }, + { + "epoch": 0.3662444066301128, + "grad_norm": 0.26085537672042847, + "learning_rate": 3e-05, + "loss": 1.4116, + "num_input_tokens_seen": 159421052, + "step": 4540 + }, + { + "epoch": 0.36705111237158883, + "grad_norm": 0.26964882016181946, + "learning_rate": 3e-05, + "loss": 1.4616, + "num_input_tokens_seen": 159782660, + "step": 4550 + }, + { + "epoch": 0.36785781811306484, + "grad_norm": 0.28062888979911804, + "learning_rate": 3e-05, + "loss": 1.4085, + "num_input_tokens_seen": 160124688, + "step": 4560 + }, + { + "epoch": 0.36866452385454085, + "grad_norm": 0.2562553286552429, + "learning_rate": 3e-05, + "loss": 1.4625, + "num_input_tokens_seen": 160513904, + "step": 4570 + }, + { + "epoch": 0.3694712295960169, + "grad_norm": 0.29400065541267395, + "learning_rate": 3e-05, + "loss": 1.442, + "num_input_tokens_seen": 160867220, + "step": 4580 + }, + { + "epoch": 0.3702779353374929, + "grad_norm": 0.2740069627761841, + "learning_rate": 3e-05, + "loss": 1.4217, + "num_input_tokens_seen": 161238568, + "step": 4590 + }, + { + "epoch": 0.37108464107896894, + "grad_norm": 0.28682824969291687, + "learning_rate": 3e-05, + "loss": 1.492, + "num_input_tokens_seen": 161589304, + "step": 4600 + }, + { + "epoch": 0.37189134682044495, + "grad_norm": 0.2908526360988617, + "learning_rate": 3e-05, + "loss": 1.4742, + "num_input_tokens_seen": 161970132, + "step": 4610 + }, + { + "epoch": 0.37269805256192096, + "grad_norm": 0.2921622097492218, + "learning_rate": 3e-05, + "loss": 1.4761, + "num_input_tokens_seen": 162320336, + "step": 4620 + }, + { + "epoch": 0.37350475830339697, + "grad_norm": 0.3282817304134369, + "learning_rate": 3e-05, + "loss": 1.4517, + "num_input_tokens_seen": 162665048, + "step": 4630 + }, + { + "epoch": 0.37431146404487303, + "grad_norm": 0.27311021089553833, + "learning_rate": 3e-05, + "loss": 1.4484, + "num_input_tokens_seen": 163011772, + "step": 4640 + }, + { + "epoch": 0.37511816978634904, + "grad_norm": 0.24732042849063873, + "learning_rate": 3e-05, + "loss": 1.4262, + "num_input_tokens_seen": 163366004, + "step": 4650 + }, + { + "epoch": 0.37592487552782505, + "grad_norm": 0.3375225365161896, + "learning_rate": 3e-05, + "loss": 1.4143, + "num_input_tokens_seen": 163695340, + "step": 4660 + }, + { + "epoch": 0.37673158126930106, + "grad_norm": 0.2611980140209198, + "learning_rate": 3e-05, + "loss": 1.4367, + "num_input_tokens_seen": 164050628, + "step": 4670 + }, + { + "epoch": 0.3775382870107771, + "grad_norm": 0.30901384353637695, + "learning_rate": 3e-05, + "loss": 1.458, + "num_input_tokens_seen": 164403700, + "step": 4680 + }, + { + "epoch": 0.3783449927522531, + "grad_norm": 0.29676762223243713, + "learning_rate": 3e-05, + "loss": 1.4785, + "num_input_tokens_seen": 164749396, + "step": 4690 + }, + { + "epoch": 0.37915169849372915, + "grad_norm": 0.29146572947502136, + "learning_rate": 3e-05, + "loss": 1.434, + "num_input_tokens_seen": 165076256, + "step": 4700 + }, + { + "epoch": 0.37995840423520516, + "grad_norm": 0.35839927196502686, + "learning_rate": 3e-05, + "loss": 1.4647, + "num_input_tokens_seen": 165424992, + "step": 4710 + }, + { + "epoch": 0.38076510997668117, + "grad_norm": 0.2916266620159149, + "learning_rate": 3e-05, + "loss": 1.4701, + "num_input_tokens_seen": 165764352, + "step": 4720 + }, + { + "epoch": 0.3815718157181572, + "grad_norm": 0.2933688163757324, + "learning_rate": 3e-05, + "loss": 1.4398, + "num_input_tokens_seen": 166097368, + "step": 4730 + }, + { + "epoch": 0.3823785214596332, + "grad_norm": 0.2589133679866791, + "learning_rate": 3e-05, + "loss": 1.4017, + "num_input_tokens_seen": 166468532, + "step": 4740 + }, + { + "epoch": 0.3831852272011092, + "grad_norm": 0.3302017152309418, + "learning_rate": 3e-05, + "loss": 1.4082, + "num_input_tokens_seen": 166819988, + "step": 4750 + }, + { + "epoch": 0.3839919329425852, + "grad_norm": 0.2915537655353546, + "learning_rate": 3e-05, + "loss": 1.4585, + "num_input_tokens_seen": 167157084, + "step": 4760 + }, + { + "epoch": 0.3847986386840613, + "grad_norm": 0.29807379841804504, + "learning_rate": 3e-05, + "loss": 1.4276, + "num_input_tokens_seen": 167524544, + "step": 4770 + }, + { + "epoch": 0.3856053444255373, + "grad_norm": 0.28128594160079956, + "learning_rate": 3e-05, + "loss": 1.471, + "num_input_tokens_seen": 167853064, + "step": 4780 + }, + { + "epoch": 0.3864120501670133, + "grad_norm": 0.2917296886444092, + "learning_rate": 3e-05, + "loss": 1.4871, + "num_input_tokens_seen": 168220760, + "step": 4790 + }, + { + "epoch": 0.3872187559084893, + "grad_norm": 0.2948204576969147, + "learning_rate": 3e-05, + "loss": 1.443, + "num_input_tokens_seen": 168551420, + "step": 4800 + }, + { + "epoch": 0.3880254616499653, + "grad_norm": 0.2919817268848419, + "learning_rate": 3e-05, + "loss": 1.4142, + "num_input_tokens_seen": 168903208, + "step": 4810 + }, + { + "epoch": 0.38883216739144133, + "grad_norm": 0.28495824337005615, + "learning_rate": 3e-05, + "loss": 1.4491, + "num_input_tokens_seen": 169259372, + "step": 4820 + }, + { + "epoch": 0.3896388731329174, + "grad_norm": 0.28058505058288574, + "learning_rate": 3e-05, + "loss": 1.439, + "num_input_tokens_seen": 169603980, + "step": 4830 + }, + { + "epoch": 0.3904455788743934, + "grad_norm": 0.27780622243881226, + "learning_rate": 3e-05, + "loss": 1.4333, + "num_input_tokens_seen": 169969336, + "step": 4840 + }, + { + "epoch": 0.3912522846158694, + "grad_norm": 0.28063181042671204, + "learning_rate": 3e-05, + "loss": 1.4642, + "num_input_tokens_seen": 170331728, + "step": 4850 + }, + { + "epoch": 0.3920589903573454, + "grad_norm": 0.2832536995410919, + "learning_rate": 3e-05, + "loss": 1.4097, + "num_input_tokens_seen": 170698136, + "step": 4860 + }, + { + "epoch": 0.39286569609882144, + "grad_norm": 0.31159868836402893, + "learning_rate": 3e-05, + "loss": 1.4248, + "num_input_tokens_seen": 171051356, + "step": 4870 + }, + { + "epoch": 0.39367240184029745, + "grad_norm": 0.3231009244918823, + "learning_rate": 3e-05, + "loss": 1.424, + "num_input_tokens_seen": 171411700, + "step": 4880 + }, + { + "epoch": 0.3944791075817735, + "grad_norm": 0.3507569432258606, + "learning_rate": 3e-05, + "loss": 1.4611, + "num_input_tokens_seen": 171780536, + "step": 4890 + }, + { + "epoch": 0.3952858133232495, + "grad_norm": 0.2700771391391754, + "learning_rate": 3e-05, + "loss": 1.4234, + "num_input_tokens_seen": 172139560, + "step": 4900 + }, + { + "epoch": 0.39609251906472553, + "grad_norm": 0.28461360931396484, + "learning_rate": 3e-05, + "loss": 1.4077, + "num_input_tokens_seen": 172461924, + "step": 4910 + }, + { + "epoch": 0.39689922480620154, + "grad_norm": 0.2726331353187561, + "learning_rate": 3e-05, + "loss": 1.4361, + "num_input_tokens_seen": 172822620, + "step": 4920 + }, + { + "epoch": 0.39770593054767756, + "grad_norm": 0.266812264919281, + "learning_rate": 3e-05, + "loss": 1.4222, + "num_input_tokens_seen": 173165692, + "step": 4930 + }, + { + "epoch": 0.39851263628915357, + "grad_norm": 0.31729623675346375, + "learning_rate": 3e-05, + "loss": 1.4395, + "num_input_tokens_seen": 173514872, + "step": 4940 + }, + { + "epoch": 0.39931934203062963, + "grad_norm": 0.2758219838142395, + "learning_rate": 3e-05, + "loss": 1.462, + "num_input_tokens_seen": 173870404, + "step": 4950 + }, + { + "epoch": 0.40012604777210564, + "grad_norm": 0.2920880615711212, + "learning_rate": 3e-05, + "loss": 1.4334, + "num_input_tokens_seen": 174254056, + "step": 4960 + }, + { + "epoch": 0.40093275351358165, + "grad_norm": 0.2842954397201538, + "learning_rate": 3e-05, + "loss": 1.4819, + "num_input_tokens_seen": 174603984, + "step": 4970 + }, + { + "epoch": 0.40173945925505766, + "grad_norm": 0.27924880385398865, + "learning_rate": 3e-05, + "loss": 1.4149, + "num_input_tokens_seen": 174952904, + "step": 4980 + }, + { + "epoch": 0.4025461649965337, + "grad_norm": 0.28720763325691223, + "learning_rate": 3e-05, + "loss": 1.4737, + "num_input_tokens_seen": 175315668, + "step": 4990 + }, + { + "epoch": 0.4033528707380097, + "grad_norm": 0.3302316963672638, + "learning_rate": 3e-05, + "loss": 1.4346, + "num_input_tokens_seen": 175639924, + "step": 5000 + }, + { + "epoch": 0.4033528707380097, + "eval_gen_len": 494.68, + "eval_loss": 1.363584041595459, + "eval_rouge1": 34.4104, + "eval_rouge2": 19.4149, + "eval_rougeL": 28.1022, + "eval_rougeLsum": 32.7299, + "eval_runtime": 1479.8136, + "eval_samples_per_second": 0.135, + "eval_steps_per_second": 0.034, + "num_input_tokens_seen": 175639924, + "step": 5000 + }, + { + "epoch": 0.40415957647948575, + "grad_norm": 0.264972060918808, + "learning_rate": 3e-05, + "loss": 1.3869, + "num_input_tokens_seen": 175981392, + "step": 5010 + }, + { + "epoch": 0.40496628222096176, + "grad_norm": 0.2692941129207611, + "learning_rate": 3e-05, + "loss": 1.4391, + "num_input_tokens_seen": 176323964, + "step": 5020 + }, + { + "epoch": 0.40577298796243777, + "grad_norm": 0.31324198842048645, + "learning_rate": 3e-05, + "loss": 1.3911, + "num_input_tokens_seen": 176687880, + "step": 5030 + }, + { + "epoch": 0.4065796937039138, + "grad_norm": 0.2583986222743988, + "learning_rate": 3e-05, + "loss": 1.4258, + "num_input_tokens_seen": 177024336, + "step": 5040 + }, + { + "epoch": 0.4073863994453898, + "grad_norm": 0.2632867693901062, + "learning_rate": 3e-05, + "loss": 1.4099, + "num_input_tokens_seen": 177365432, + "step": 5050 + }, + { + "epoch": 0.4081931051868658, + "grad_norm": 0.2581656277179718, + "learning_rate": 3e-05, + "loss": 1.3863, + "num_input_tokens_seen": 177721268, + "step": 5060 + }, + { + "epoch": 0.40899981092834187, + "grad_norm": 0.256698340177536, + "learning_rate": 3e-05, + "loss": 1.445, + "num_input_tokens_seen": 178061116, + "step": 5070 + }, + { + "epoch": 0.4098065166698179, + "grad_norm": 0.2994880974292755, + "learning_rate": 3e-05, + "loss": 1.4639, + "num_input_tokens_seen": 178375268, + "step": 5080 + }, + { + "epoch": 0.4106132224112939, + "grad_norm": 0.3011598587036133, + "learning_rate": 3e-05, + "loss": 1.4544, + "num_input_tokens_seen": 178693356, + "step": 5090 + }, + { + "epoch": 0.4114199281527699, + "grad_norm": 0.3107489049434662, + "learning_rate": 3e-05, + "loss": 1.4446, + "num_input_tokens_seen": 179028016, + "step": 5100 + }, + { + "epoch": 0.4122266338942459, + "grad_norm": 0.28605297207832336, + "learning_rate": 3e-05, + "loss": 1.394, + "num_input_tokens_seen": 179372364, + "step": 5110 + }, + { + "epoch": 0.4130333396357219, + "grad_norm": 0.29272472858428955, + "learning_rate": 3e-05, + "loss": 1.4559, + "num_input_tokens_seen": 179698216, + "step": 5120 + }, + { + "epoch": 0.413840045377198, + "grad_norm": 0.2901201546192169, + "learning_rate": 3e-05, + "loss": 1.4222, + "num_input_tokens_seen": 180049712, + "step": 5130 + }, + { + "epoch": 0.414646751118674, + "grad_norm": 0.3165605664253235, + "learning_rate": 3e-05, + "loss": 1.4017, + "num_input_tokens_seen": 180424368, + "step": 5140 + }, + { + "epoch": 0.41545345686015, + "grad_norm": 0.26698291301727295, + "learning_rate": 3e-05, + "loss": 1.427, + "num_input_tokens_seen": 180756776, + "step": 5150 + }, + { + "epoch": 0.416260162601626, + "grad_norm": 0.2778262197971344, + "learning_rate": 3e-05, + "loss": 1.4343, + "num_input_tokens_seen": 181094316, + "step": 5160 + }, + { + "epoch": 0.417066868343102, + "grad_norm": 0.3387869894504547, + "learning_rate": 3e-05, + "loss": 1.4165, + "num_input_tokens_seen": 181454460, + "step": 5170 + }, + { + "epoch": 0.41787357408457804, + "grad_norm": 0.2814273238182068, + "learning_rate": 3e-05, + "loss": 1.4512, + "num_input_tokens_seen": 181811144, + "step": 5180 + }, + { + "epoch": 0.41868027982605405, + "grad_norm": 0.28893864154815674, + "learning_rate": 3e-05, + "loss": 1.412, + "num_input_tokens_seen": 182202380, + "step": 5190 + }, + { + "epoch": 0.4194869855675301, + "grad_norm": 0.2955783009529114, + "learning_rate": 3e-05, + "loss": 1.4187, + "num_input_tokens_seen": 182566948, + "step": 5200 + }, + { + "epoch": 0.4202936913090061, + "grad_norm": 0.2692851722240448, + "learning_rate": 3e-05, + "loss": 1.4056, + "num_input_tokens_seen": 182920912, + "step": 5210 + }, + { + "epoch": 0.42110039705048213, + "grad_norm": 0.28022801876068115, + "learning_rate": 3e-05, + "loss": 1.3988, + "num_input_tokens_seen": 183271764, + "step": 5220 + }, + { + "epoch": 0.42190710279195814, + "grad_norm": 0.31612420082092285, + "learning_rate": 3e-05, + "loss": 1.4269, + "num_input_tokens_seen": 183617064, + "step": 5230 + }, + { + "epoch": 0.42271380853343415, + "grad_norm": 0.2966879904270172, + "learning_rate": 3e-05, + "loss": 1.3826, + "num_input_tokens_seen": 183961216, + "step": 5240 + }, + { + "epoch": 0.42352051427491016, + "grad_norm": 0.31079381704330444, + "learning_rate": 3e-05, + "loss": 1.3818, + "num_input_tokens_seen": 184308792, + "step": 5250 + }, + { + "epoch": 0.42432722001638623, + "grad_norm": 0.28356415033340454, + "learning_rate": 3e-05, + "loss": 1.4443, + "num_input_tokens_seen": 184652412, + "step": 5260 + }, + { + "epoch": 0.42513392575786224, + "grad_norm": 0.2671275734901428, + "learning_rate": 3e-05, + "loss": 1.4097, + "num_input_tokens_seen": 185005656, + "step": 5270 + }, + { + "epoch": 0.42594063149933825, + "grad_norm": 0.3049359917640686, + "learning_rate": 3e-05, + "loss": 1.3983, + "num_input_tokens_seen": 185364004, + "step": 5280 + }, + { + "epoch": 0.42674733724081426, + "grad_norm": 0.26577872037887573, + "learning_rate": 3e-05, + "loss": 1.4389, + "num_input_tokens_seen": 185721984, + "step": 5290 + }, + { + "epoch": 0.42755404298229027, + "grad_norm": 0.27239790558815, + "learning_rate": 3e-05, + "loss": 1.4502, + "num_input_tokens_seen": 186059416, + "step": 5300 + }, + { + "epoch": 0.4283607487237663, + "grad_norm": 0.30805954337120056, + "learning_rate": 3e-05, + "loss": 1.4108, + "num_input_tokens_seen": 186400908, + "step": 5310 + }, + { + "epoch": 0.42916745446524235, + "grad_norm": 0.27232635021209717, + "learning_rate": 3e-05, + "loss": 1.3694, + "num_input_tokens_seen": 186757120, + "step": 5320 + }, + { + "epoch": 0.42997416020671836, + "grad_norm": 0.30555519461631775, + "learning_rate": 3e-05, + "loss": 1.3979, + "num_input_tokens_seen": 187084720, + "step": 5330 + }, + { + "epoch": 0.43078086594819437, + "grad_norm": 0.2889952063560486, + "learning_rate": 3e-05, + "loss": 1.3979, + "num_input_tokens_seen": 187430864, + "step": 5340 + }, + { + "epoch": 0.4315875716896704, + "grad_norm": 0.28782588243484497, + "learning_rate": 3e-05, + "loss": 1.4026, + "num_input_tokens_seen": 187772016, + "step": 5350 + }, + { + "epoch": 0.4323942774311464, + "grad_norm": 0.25100380182266235, + "learning_rate": 3e-05, + "loss": 1.3516, + "num_input_tokens_seen": 188123096, + "step": 5360 + }, + { + "epoch": 0.4332009831726224, + "grad_norm": 0.2925686240196228, + "learning_rate": 3e-05, + "loss": 1.4206, + "num_input_tokens_seen": 188491824, + "step": 5370 + }, + { + "epoch": 0.43400768891409847, + "grad_norm": 0.27262914180755615, + "learning_rate": 3e-05, + "loss": 1.4259, + "num_input_tokens_seen": 188838176, + "step": 5380 + }, + { + "epoch": 0.4348143946555745, + "grad_norm": 0.2965831458568573, + "learning_rate": 3e-05, + "loss": 1.4348, + "num_input_tokens_seen": 189176428, + "step": 5390 + }, + { + "epoch": 0.4356211003970505, + "grad_norm": 0.29133981466293335, + "learning_rate": 3e-05, + "loss": 1.41, + "num_input_tokens_seen": 189532172, + "step": 5400 + }, + { + "epoch": 0.4364278061385265, + "grad_norm": 0.2646975815296173, + "learning_rate": 3e-05, + "loss": 1.4505, + "num_input_tokens_seen": 189883400, + "step": 5410 + }, + { + "epoch": 0.4372345118800025, + "grad_norm": 0.2631090581417084, + "learning_rate": 3e-05, + "loss": 1.3669, + "num_input_tokens_seen": 190248452, + "step": 5420 + }, + { + "epoch": 0.4380412176214785, + "grad_norm": 0.2600938379764557, + "learning_rate": 3e-05, + "loss": 1.3874, + "num_input_tokens_seen": 190583324, + "step": 5430 + }, + { + "epoch": 0.4388479233629546, + "grad_norm": 0.2651340663433075, + "learning_rate": 3e-05, + "loss": 1.4112, + "num_input_tokens_seen": 190932528, + "step": 5440 + }, + { + "epoch": 0.4396546291044306, + "grad_norm": 0.2757515609264374, + "learning_rate": 3e-05, + "loss": 1.4233, + "num_input_tokens_seen": 191266632, + "step": 5450 + }, + { + "epoch": 0.4404613348459066, + "grad_norm": 0.3117634057998657, + "learning_rate": 3e-05, + "loss": 1.3617, + "num_input_tokens_seen": 191594048, + "step": 5460 + }, + { + "epoch": 0.4412680405873826, + "grad_norm": 0.27428796887397766, + "learning_rate": 3e-05, + "loss": 1.3699, + "num_input_tokens_seen": 191959780, + "step": 5470 + }, + { + "epoch": 0.4420747463288586, + "grad_norm": 0.2628273367881775, + "learning_rate": 3e-05, + "loss": 1.4274, + "num_input_tokens_seen": 192275380, + "step": 5480 + }, + { + "epoch": 0.44288145207033464, + "grad_norm": 0.26145341992378235, + "learning_rate": 3e-05, + "loss": 1.4375, + "num_input_tokens_seen": 192635040, + "step": 5490 + }, + { + "epoch": 0.4436881578118107, + "grad_norm": 0.2731001675128937, + "learning_rate": 3e-05, + "loss": 1.4172, + "num_input_tokens_seen": 192991412, + "step": 5500 + }, + { + "epoch": 0.4444948635532867, + "grad_norm": 0.2722030282020569, + "learning_rate": 3e-05, + "loss": 1.3866, + "num_input_tokens_seen": 193320880, + "step": 5510 + }, + { + "epoch": 0.4453015692947627, + "grad_norm": 0.29632169008255005, + "learning_rate": 3e-05, + "loss": 1.447, + "num_input_tokens_seen": 193669744, + "step": 5520 + }, + { + "epoch": 0.44610827503623873, + "grad_norm": 0.28086063265800476, + "learning_rate": 3e-05, + "loss": 1.4636, + "num_input_tokens_seen": 194026876, + "step": 5530 + }, + { + "epoch": 0.44691498077771474, + "grad_norm": 0.29540812969207764, + "learning_rate": 3e-05, + "loss": 1.3922, + "num_input_tokens_seen": 194367412, + "step": 5540 + }, + { + "epoch": 0.44772168651919075, + "grad_norm": 0.2671002447605133, + "learning_rate": 3e-05, + "loss": 1.4226, + "num_input_tokens_seen": 194713016, + "step": 5550 + }, + { + "epoch": 0.4485283922606668, + "grad_norm": 0.2889344394207001, + "learning_rate": 3e-05, + "loss": 1.4291, + "num_input_tokens_seen": 195084592, + "step": 5560 + }, + { + "epoch": 0.44933509800214283, + "grad_norm": 0.28490033745765686, + "learning_rate": 3e-05, + "loss": 1.3612, + "num_input_tokens_seen": 195367436, + "step": 5570 + }, + { + "epoch": 0.45014180374361884, + "grad_norm": 0.25098714232444763, + "learning_rate": 3e-05, + "loss": 1.4348, + "num_input_tokens_seen": 195724988, + "step": 5580 + }, + { + "epoch": 0.45094850948509485, + "grad_norm": 0.28072845935821533, + "learning_rate": 3e-05, + "loss": 1.4031, + "num_input_tokens_seen": 196114160, + "step": 5590 + }, + { + "epoch": 0.45175521522657086, + "grad_norm": 0.26970839500427246, + "learning_rate": 3e-05, + "loss": 1.3853, + "num_input_tokens_seen": 196463832, + "step": 5600 + }, + { + "epoch": 0.45256192096804687, + "grad_norm": 0.2835977375507355, + "learning_rate": 3e-05, + "loss": 1.4153, + "num_input_tokens_seen": 196815808, + "step": 5610 + }, + { + "epoch": 0.4533686267095229, + "grad_norm": 0.3386438190937042, + "learning_rate": 3e-05, + "loss": 1.363, + "num_input_tokens_seen": 197182716, + "step": 5620 + }, + { + "epoch": 0.45417533245099895, + "grad_norm": 0.2961023449897766, + "learning_rate": 3e-05, + "loss": 1.4127, + "num_input_tokens_seen": 197526772, + "step": 5630 + }, + { + "epoch": 0.45498203819247496, + "grad_norm": 0.29476794600486755, + "learning_rate": 3e-05, + "loss": 1.4113, + "num_input_tokens_seen": 197878328, + "step": 5640 + }, + { + "epoch": 0.45578874393395097, + "grad_norm": 0.305695503950119, + "learning_rate": 3e-05, + "loss": 1.4272, + "num_input_tokens_seen": 198239360, + "step": 5650 + }, + { + "epoch": 0.456595449675427, + "grad_norm": 0.2787207365036011, + "learning_rate": 3e-05, + "loss": 1.4079, + "num_input_tokens_seen": 198581888, + "step": 5660 + }, + { + "epoch": 0.457402155416903, + "grad_norm": 0.2544805705547333, + "learning_rate": 3e-05, + "loss": 1.372, + "num_input_tokens_seen": 198930636, + "step": 5670 + }, + { + "epoch": 0.458208861158379, + "grad_norm": 0.2546211779117584, + "learning_rate": 3e-05, + "loss": 1.3958, + "num_input_tokens_seen": 199280792, + "step": 5680 + }, + { + "epoch": 0.45901556689985507, + "grad_norm": 0.2609899938106537, + "learning_rate": 3e-05, + "loss": 1.4331, + "num_input_tokens_seen": 199631024, + "step": 5690 + }, + { + "epoch": 0.4598222726413311, + "grad_norm": 0.2949337363243103, + "learning_rate": 3e-05, + "loss": 1.4179, + "num_input_tokens_seen": 199979964, + "step": 5700 + }, + { + "epoch": 0.4606289783828071, + "grad_norm": 0.2916325032711029, + "learning_rate": 3e-05, + "loss": 1.4107, + "num_input_tokens_seen": 200317260, + "step": 5710 + }, + { + "epoch": 0.4614356841242831, + "grad_norm": 0.2985553741455078, + "learning_rate": 3e-05, + "loss": 1.3435, + "num_input_tokens_seen": 200706164, + "step": 5720 + }, + { + "epoch": 0.4622423898657591, + "grad_norm": 0.29759296774864197, + "learning_rate": 3e-05, + "loss": 1.3962, + "num_input_tokens_seen": 201041936, + "step": 5730 + }, + { + "epoch": 0.4630490956072351, + "grad_norm": 0.2666504383087158, + "learning_rate": 3e-05, + "loss": 1.3736, + "num_input_tokens_seen": 201384532, + "step": 5740 + }, + { + "epoch": 0.4638558013487112, + "grad_norm": 0.2790429890155792, + "learning_rate": 3e-05, + "loss": 1.3505, + "num_input_tokens_seen": 201732544, + "step": 5750 + }, + { + "epoch": 0.4646625070901872, + "grad_norm": 0.27765849232673645, + "learning_rate": 3e-05, + "loss": 1.4432, + "num_input_tokens_seen": 202072132, + "step": 5760 + }, + { + "epoch": 0.4654692128316632, + "grad_norm": 0.27785608172416687, + "learning_rate": 3e-05, + "loss": 1.4137, + "num_input_tokens_seen": 202425292, + "step": 5770 + }, + { + "epoch": 0.4662759185731392, + "grad_norm": 0.3008098900318146, + "learning_rate": 3e-05, + "loss": 1.3844, + "num_input_tokens_seen": 202754488, + "step": 5780 + }, + { + "epoch": 0.4670826243146152, + "grad_norm": 0.2869485318660736, + "learning_rate": 3e-05, + "loss": 1.3913, + "num_input_tokens_seen": 203080408, + "step": 5790 + }, + { + "epoch": 0.46788933005609123, + "grad_norm": 0.2760210335254669, + "learning_rate": 3e-05, + "loss": 1.3775, + "num_input_tokens_seen": 203433440, + "step": 5800 + }, + { + "epoch": 0.4686960357975673, + "grad_norm": 0.29998424649238586, + "learning_rate": 3e-05, + "loss": 1.3395, + "num_input_tokens_seen": 203773520, + "step": 5810 + }, + { + "epoch": 0.4695027415390433, + "grad_norm": 0.26301300525665283, + "learning_rate": 3e-05, + "loss": 1.3362, + "num_input_tokens_seen": 204128604, + "step": 5820 + }, + { + "epoch": 0.4703094472805193, + "grad_norm": 0.2580535113811493, + "learning_rate": 3e-05, + "loss": 1.4004, + "num_input_tokens_seen": 204494312, + "step": 5830 + }, + { + "epoch": 0.47111615302199533, + "grad_norm": 0.25355467200279236, + "learning_rate": 3e-05, + "loss": 1.3836, + "num_input_tokens_seen": 204826752, + "step": 5840 + }, + { + "epoch": 0.47192285876347134, + "grad_norm": 0.2825932502746582, + "learning_rate": 3e-05, + "loss": 1.3717, + "num_input_tokens_seen": 205188700, + "step": 5850 + }, + { + "epoch": 0.47272956450494735, + "grad_norm": 0.3444035053253174, + "learning_rate": 3e-05, + "loss": 1.3581, + "num_input_tokens_seen": 205529408, + "step": 5860 + }, + { + "epoch": 0.4735362702464234, + "grad_norm": 0.25847604870796204, + "learning_rate": 3e-05, + "loss": 1.357, + "num_input_tokens_seen": 205868532, + "step": 5870 + }, + { + "epoch": 0.47434297598789943, + "grad_norm": 0.2876322269439697, + "learning_rate": 3e-05, + "loss": 1.3783, + "num_input_tokens_seen": 206205984, + "step": 5880 + }, + { + "epoch": 0.47514968172937544, + "grad_norm": 0.27320173382759094, + "learning_rate": 3e-05, + "loss": 1.4018, + "num_input_tokens_seen": 206585048, + "step": 5890 + }, + { + "epoch": 0.47595638747085145, + "grad_norm": 0.31563153862953186, + "learning_rate": 3e-05, + "loss": 1.4208, + "num_input_tokens_seen": 206935052, + "step": 5900 + }, + { + "epoch": 0.47676309321232746, + "grad_norm": 0.29032954573631287, + "learning_rate": 3e-05, + "loss": 1.404, + "num_input_tokens_seen": 207275132, + "step": 5910 + }, + { + "epoch": 0.47756979895380347, + "grad_norm": 0.27211418747901917, + "learning_rate": 3e-05, + "loss": 1.3487, + "num_input_tokens_seen": 207633312, + "step": 5920 + }, + { + "epoch": 0.47837650469527954, + "grad_norm": 0.3004505932331085, + "learning_rate": 3e-05, + "loss": 1.3679, + "num_input_tokens_seen": 207984000, + "step": 5930 + }, + { + "epoch": 0.47918321043675555, + "grad_norm": 0.25671249628067017, + "learning_rate": 3e-05, + "loss": 1.389, + "num_input_tokens_seen": 208302920, + "step": 5940 + }, + { + "epoch": 0.47998991617823156, + "grad_norm": 0.29051607847213745, + "learning_rate": 3e-05, + "loss": 1.4225, + "num_input_tokens_seen": 208680580, + "step": 5950 + }, + { + "epoch": 0.48079662191970757, + "grad_norm": 0.2684350311756134, + "learning_rate": 3e-05, + "loss": 1.4091, + "num_input_tokens_seen": 209017588, + "step": 5960 + }, + { + "epoch": 0.4816033276611836, + "grad_norm": 0.28748780488967896, + "learning_rate": 3e-05, + "loss": 1.396, + "num_input_tokens_seen": 209385672, + "step": 5970 + }, + { + "epoch": 0.4824100334026596, + "grad_norm": 0.26985928416252136, + "learning_rate": 3e-05, + "loss": 1.3779, + "num_input_tokens_seen": 209735156, + "step": 5980 + }, + { + "epoch": 0.48321673914413565, + "grad_norm": 0.26085472106933594, + "learning_rate": 3e-05, + "loss": 1.3524, + "num_input_tokens_seen": 210074668, + "step": 5990 + }, + { + "epoch": 0.48402344488561166, + "grad_norm": 0.30844658613204956, + "learning_rate": 3e-05, + "loss": 1.3912, + "num_input_tokens_seen": 210409328, + "step": 6000 + }, + { + "epoch": 0.48402344488561166, + "eval_gen_len": 469.885, + "eval_loss": 1.3159054517745972, + "eval_rouge1": 36.5059, + "eval_rouge2": 21.2447, + "eval_rougeL": 30.116, + "eval_rougeLsum": 34.7303, + "eval_runtime": 1601.8691, + "eval_samples_per_second": 0.125, + "eval_steps_per_second": 0.031, + "num_input_tokens_seen": 210409328, + "step": 6000 + }, + { + "epoch": 0.4848301506270877, + "grad_norm": 0.3409636914730072, + "learning_rate": 3e-05, + "loss": 1.3814, + "num_input_tokens_seen": 210757000, + "step": 6010 + }, + { + "epoch": 0.4856368563685637, + "grad_norm": 0.285645067691803, + "learning_rate": 3e-05, + "loss": 1.3968, + "num_input_tokens_seen": 211087396, + "step": 6020 + }, + { + "epoch": 0.4864435621100397, + "grad_norm": 0.23893733322620392, + "learning_rate": 3e-05, + "loss": 1.3671, + "num_input_tokens_seen": 211458184, + "step": 6030 + }, + { + "epoch": 0.4872502678515157, + "grad_norm": 0.2706129252910614, + "learning_rate": 3e-05, + "loss": 1.3598, + "num_input_tokens_seen": 211782916, + "step": 6040 + }, + { + "epoch": 0.4880569735929917, + "grad_norm": 0.2617262601852417, + "learning_rate": 3e-05, + "loss": 1.3718, + "num_input_tokens_seen": 212146920, + "step": 6050 + }, + { + "epoch": 0.4888636793344678, + "grad_norm": 0.2852620780467987, + "learning_rate": 3e-05, + "loss": 1.3483, + "num_input_tokens_seen": 212525416, + "step": 6060 + }, + { + "epoch": 0.4896703850759438, + "grad_norm": 0.30544915795326233, + "learning_rate": 3e-05, + "loss": 1.3977, + "num_input_tokens_seen": 212850600, + "step": 6070 + }, + { + "epoch": 0.4904770908174198, + "grad_norm": 0.2675735056400299, + "learning_rate": 3e-05, + "loss": 1.383, + "num_input_tokens_seen": 213202036, + "step": 6080 + }, + { + "epoch": 0.4912837965588958, + "grad_norm": 0.2696596086025238, + "learning_rate": 3e-05, + "loss": 1.3809, + "num_input_tokens_seen": 213537836, + "step": 6090 + }, + { + "epoch": 0.4920905023003718, + "grad_norm": 0.281474769115448, + "learning_rate": 3e-05, + "loss": 1.351, + "num_input_tokens_seen": 213910248, + "step": 6100 + }, + { + "epoch": 0.49289720804184783, + "grad_norm": 0.3014686405658722, + "learning_rate": 3e-05, + "loss": 1.381, + "num_input_tokens_seen": 214261100, + "step": 6110 + }, + { + "epoch": 0.4937039137833239, + "grad_norm": 0.27863389253616333, + "learning_rate": 3e-05, + "loss": 1.3878, + "num_input_tokens_seen": 214636128, + "step": 6120 + }, + { + "epoch": 0.4945106195247999, + "grad_norm": 0.28771695494651794, + "learning_rate": 3e-05, + "loss": 1.3721, + "num_input_tokens_seen": 215003756, + "step": 6130 + }, + { + "epoch": 0.4953173252662759, + "grad_norm": 0.23946808278560638, + "learning_rate": 3e-05, + "loss": 1.3648, + "num_input_tokens_seen": 215364600, + "step": 6140 + }, + { + "epoch": 0.49612403100775193, + "grad_norm": 0.2888747751712799, + "learning_rate": 3e-05, + "loss": 1.4156, + "num_input_tokens_seen": 215725344, + "step": 6150 + }, + { + "epoch": 0.49693073674922794, + "grad_norm": 0.2734207808971405, + "learning_rate": 3e-05, + "loss": 1.3646, + "num_input_tokens_seen": 216095268, + "step": 6160 + }, + { + "epoch": 0.49773744249070395, + "grad_norm": 0.2685578167438507, + "learning_rate": 3e-05, + "loss": 1.3459, + "num_input_tokens_seen": 216469020, + "step": 6170 + }, + { + "epoch": 0.49854414823218, + "grad_norm": 0.2771487236022949, + "learning_rate": 3e-05, + "loss": 1.3673, + "num_input_tokens_seen": 216823048, + "step": 6180 + }, + { + "epoch": 0.49935085397365603, + "grad_norm": 0.2881760895252228, + "learning_rate": 3e-05, + "loss": 1.3973, + "num_input_tokens_seen": 217211200, + "step": 6190 + }, + { + "epoch": 0.500157559715132, + "grad_norm": 0.2920476496219635, + "learning_rate": 3e-05, + "loss": 1.3809, + "num_input_tokens_seen": 217569052, + "step": 6200 + }, + { + "epoch": 0.5009642654566081, + "grad_norm": 0.28796783089637756, + "learning_rate": 3e-05, + "loss": 1.3666, + "num_input_tokens_seen": 217919988, + "step": 6210 + }, + { + "epoch": 0.5017709711980841, + "grad_norm": 0.24618837237358093, + "learning_rate": 3e-05, + "loss": 1.3739, + "num_input_tokens_seen": 218277224, + "step": 6220 + }, + { + "epoch": 0.5025776769395601, + "grad_norm": 0.2835310995578766, + "learning_rate": 3e-05, + "loss": 1.3831, + "num_input_tokens_seen": 218633308, + "step": 6230 + }, + { + "epoch": 0.5033843826810361, + "grad_norm": 0.25976064801216125, + "learning_rate": 3e-05, + "loss": 1.3267, + "num_input_tokens_seen": 218982380, + "step": 6240 + }, + { + "epoch": 0.5041910884225121, + "grad_norm": 0.24962379038333893, + "learning_rate": 3e-05, + "loss": 1.3829, + "num_input_tokens_seen": 219302800, + "step": 6250 + }, + { + "epoch": 0.5049977941639882, + "grad_norm": 0.2833407521247864, + "learning_rate": 3e-05, + "loss": 1.3575, + "num_input_tokens_seen": 219657948, + "step": 6260 + }, + { + "epoch": 0.5058044999054642, + "grad_norm": 0.2600440979003906, + "learning_rate": 3e-05, + "loss": 1.3886, + "num_input_tokens_seen": 219986420, + "step": 6270 + }, + { + "epoch": 0.5066112056469402, + "grad_norm": 0.28562673926353455, + "learning_rate": 3e-05, + "loss": 1.3903, + "num_input_tokens_seen": 220335652, + "step": 6280 + }, + { + "epoch": 0.5074179113884162, + "grad_norm": 0.30197736620903015, + "learning_rate": 3e-05, + "loss": 1.3592, + "num_input_tokens_seen": 220710520, + "step": 6290 + }, + { + "epoch": 0.5082246171298922, + "grad_norm": 0.35574081540107727, + "learning_rate": 3e-05, + "loss": 1.3804, + "num_input_tokens_seen": 221048404, + "step": 6300 + }, + { + "epoch": 0.5090313228713682, + "grad_norm": 0.26034465432167053, + "learning_rate": 3e-05, + "loss": 1.3843, + "num_input_tokens_seen": 221398644, + "step": 6310 + }, + { + "epoch": 0.5098380286128442, + "grad_norm": 0.27993252873420715, + "learning_rate": 3e-05, + "loss": 1.3452, + "num_input_tokens_seen": 221766764, + "step": 6320 + }, + { + "epoch": 0.5106447343543203, + "grad_norm": 0.278550386428833, + "learning_rate": 3e-05, + "loss": 1.3901, + "num_input_tokens_seen": 222114724, + "step": 6330 + }, + { + "epoch": 0.5114514400957964, + "grad_norm": 0.32215824723243713, + "learning_rate": 3e-05, + "loss": 1.3696, + "num_input_tokens_seen": 222494148, + "step": 6340 + }, + { + "epoch": 0.5122581458372724, + "grad_norm": 0.2745245099067688, + "learning_rate": 3e-05, + "loss": 1.3832, + "num_input_tokens_seen": 222847792, + "step": 6350 + }, + { + "epoch": 0.5130648515787484, + "grad_norm": 0.2977345585823059, + "learning_rate": 3e-05, + "loss": 1.3719, + "num_input_tokens_seen": 223203024, + "step": 6360 + }, + { + "epoch": 0.5138715573202244, + "grad_norm": 0.299365371465683, + "learning_rate": 3e-05, + "loss": 1.3602, + "num_input_tokens_seen": 223552380, + "step": 6370 + }, + { + "epoch": 0.5146782630617004, + "grad_norm": 0.2765893340110779, + "learning_rate": 3e-05, + "loss": 1.3861, + "num_input_tokens_seen": 223902684, + "step": 6380 + }, + { + "epoch": 0.5154849688031764, + "grad_norm": 0.3482683002948761, + "learning_rate": 3e-05, + "loss": 1.4014, + "num_input_tokens_seen": 224249200, + "step": 6390 + }, + { + "epoch": 0.5162916745446524, + "grad_norm": 0.2550183832645416, + "learning_rate": 3e-05, + "loss": 1.4198, + "num_input_tokens_seen": 224584080, + "step": 6400 + }, + { + "epoch": 0.5170983802861284, + "grad_norm": 0.2872161865234375, + "learning_rate": 3e-05, + "loss": 1.37, + "num_input_tokens_seen": 224949828, + "step": 6410 + }, + { + "epoch": 0.5179050860276044, + "grad_norm": 0.2459658682346344, + "learning_rate": 3e-05, + "loss": 1.3798, + "num_input_tokens_seen": 225310160, + "step": 6420 + }, + { + "epoch": 0.5187117917690804, + "grad_norm": 0.2668297588825226, + "learning_rate": 3e-05, + "loss": 1.3406, + "num_input_tokens_seen": 225654484, + "step": 6430 + }, + { + "epoch": 0.5195184975105565, + "grad_norm": 0.2736770808696747, + "learning_rate": 3e-05, + "loss": 1.341, + "num_input_tokens_seen": 226010672, + "step": 6440 + }, + { + "epoch": 0.5203252032520326, + "grad_norm": 0.27257856726646423, + "learning_rate": 3e-05, + "loss": 1.4121, + "num_input_tokens_seen": 226356336, + "step": 6450 + }, + { + "epoch": 0.5211319089935086, + "grad_norm": 0.278709352016449, + "learning_rate": 3e-05, + "loss": 1.2977, + "num_input_tokens_seen": 226715132, + "step": 6460 + }, + { + "epoch": 0.5219386147349846, + "grad_norm": 0.2663877010345459, + "learning_rate": 3e-05, + "loss": 1.3665, + "num_input_tokens_seen": 227064968, + "step": 6470 + }, + { + "epoch": 0.5227453204764606, + "grad_norm": 0.29134681820869446, + "learning_rate": 3e-05, + "loss": 1.2802, + "num_input_tokens_seen": 227397320, + "step": 6480 + }, + { + "epoch": 0.5235520262179366, + "grad_norm": 0.3129670321941376, + "learning_rate": 3e-05, + "loss": 1.3934, + "num_input_tokens_seen": 227757800, + "step": 6490 + }, + { + "epoch": 0.5243587319594126, + "grad_norm": 0.2857125997543335, + "learning_rate": 3e-05, + "loss": 1.3935, + "num_input_tokens_seen": 228107956, + "step": 6500 + }, + { + "epoch": 0.5251654377008886, + "grad_norm": 0.26699715852737427, + "learning_rate": 3e-05, + "loss": 1.4017, + "num_input_tokens_seen": 228435140, + "step": 6510 + }, + { + "epoch": 0.5259721434423646, + "grad_norm": 0.3041050136089325, + "learning_rate": 3e-05, + "loss": 1.3659, + "num_input_tokens_seen": 228778040, + "step": 6520 + }, + { + "epoch": 0.5267788491838407, + "grad_norm": 0.2667132616043091, + "learning_rate": 3e-05, + "loss": 1.3765, + "num_input_tokens_seen": 229124136, + "step": 6530 + }, + { + "epoch": 0.5275855549253167, + "grad_norm": 0.27975499629974365, + "learning_rate": 3e-05, + "loss": 1.3799, + "num_input_tokens_seen": 229453708, + "step": 6540 + }, + { + "epoch": 0.5283922606667927, + "grad_norm": 0.25194570422172546, + "learning_rate": 3e-05, + "loss": 1.3369, + "num_input_tokens_seen": 229789164, + "step": 6550 + }, + { + "epoch": 0.5291989664082687, + "grad_norm": 0.26208654046058655, + "learning_rate": 3e-05, + "loss": 1.3712, + "num_input_tokens_seen": 230141916, + "step": 6560 + }, + { + "epoch": 0.5300056721497447, + "grad_norm": 0.2651267647743225, + "learning_rate": 3e-05, + "loss": 1.3616, + "num_input_tokens_seen": 230503840, + "step": 6570 + }, + { + "epoch": 0.5308123778912208, + "grad_norm": 0.25937962532043457, + "learning_rate": 3e-05, + "loss": 1.2915, + "num_input_tokens_seen": 230819784, + "step": 6580 + }, + { + "epoch": 0.5316190836326968, + "grad_norm": 0.31449395418167114, + "learning_rate": 3e-05, + "loss": 1.3706, + "num_input_tokens_seen": 231205872, + "step": 6590 + }, + { + "epoch": 0.5324257893741728, + "grad_norm": 0.2909031808376312, + "learning_rate": 3e-05, + "loss": 1.4025, + "num_input_tokens_seen": 231547816, + "step": 6600 + }, + { + "epoch": 0.5332324951156489, + "grad_norm": 0.23776140809059143, + "learning_rate": 3e-05, + "loss": 1.3567, + "num_input_tokens_seen": 231915224, + "step": 6610 + }, + { + "epoch": 0.5340392008571249, + "grad_norm": 0.255609929561615, + "learning_rate": 3e-05, + "loss": 1.3352, + "num_input_tokens_seen": 232289088, + "step": 6620 + }, + { + "epoch": 0.5348459065986009, + "grad_norm": 0.2546085715293884, + "learning_rate": 3e-05, + "loss": 1.3868, + "num_input_tokens_seen": 232690580, + "step": 6630 + }, + { + "epoch": 0.5356526123400769, + "grad_norm": 0.2850560247898102, + "learning_rate": 3e-05, + "loss": 1.3626, + "num_input_tokens_seen": 233036468, + "step": 6640 + }, + { + "epoch": 0.5364593180815529, + "grad_norm": 0.26064831018447876, + "learning_rate": 3e-05, + "loss": 1.385, + "num_input_tokens_seen": 233382180, + "step": 6650 + }, + { + "epoch": 0.5372660238230289, + "grad_norm": 0.2727303206920624, + "learning_rate": 3e-05, + "loss": 1.3323, + "num_input_tokens_seen": 233733420, + "step": 6660 + }, + { + "epoch": 0.5380727295645049, + "grad_norm": 0.2605370283126831, + "learning_rate": 3e-05, + "loss": 1.3613, + "num_input_tokens_seen": 234088620, + "step": 6670 + }, + { + "epoch": 0.5388794353059809, + "grad_norm": 0.2989037036895752, + "learning_rate": 3e-05, + "loss": 1.3553, + "num_input_tokens_seen": 234457604, + "step": 6680 + }, + { + "epoch": 0.5396861410474569, + "grad_norm": 0.27383917570114136, + "learning_rate": 3e-05, + "loss": 1.3567, + "num_input_tokens_seen": 234790868, + "step": 6690 + }, + { + "epoch": 0.5404928467889331, + "grad_norm": 0.2771012783050537, + "learning_rate": 3e-05, + "loss": 1.3888, + "num_input_tokens_seen": 235140724, + "step": 6700 + }, + { + "epoch": 0.5412995525304091, + "grad_norm": 0.35448309779167175, + "learning_rate": 3e-05, + "loss": 1.3347, + "num_input_tokens_seen": 235476928, + "step": 6710 + }, + { + "epoch": 0.5421062582718851, + "grad_norm": 0.28626537322998047, + "learning_rate": 3e-05, + "loss": 1.3486, + "num_input_tokens_seen": 235805748, + "step": 6720 + }, + { + "epoch": 0.5429129640133611, + "grad_norm": 0.280998170375824, + "learning_rate": 3e-05, + "loss": 1.3868, + "num_input_tokens_seen": 236142400, + "step": 6730 + }, + { + "epoch": 0.5437196697548371, + "grad_norm": 0.2946176826953888, + "learning_rate": 3e-05, + "loss": 1.3424, + "num_input_tokens_seen": 236492160, + "step": 6740 + }, + { + "epoch": 0.5445263754963131, + "grad_norm": 0.2584805488586426, + "learning_rate": 3e-05, + "loss": 1.3451, + "num_input_tokens_seen": 236826392, + "step": 6750 + }, + { + "epoch": 0.5453330812377891, + "grad_norm": 0.27321335673332214, + "learning_rate": 3e-05, + "loss": 1.3567, + "num_input_tokens_seen": 237160908, + "step": 6760 + }, + { + "epoch": 0.5461397869792651, + "grad_norm": 0.2743065655231476, + "learning_rate": 3e-05, + "loss": 1.3933, + "num_input_tokens_seen": 237514248, + "step": 6770 + }, + { + "epoch": 0.5469464927207411, + "grad_norm": 0.26243406534194946, + "learning_rate": 3e-05, + "loss": 1.3716, + "num_input_tokens_seen": 237850192, + "step": 6780 + }, + { + "epoch": 0.5477531984622172, + "grad_norm": 0.2854134738445282, + "learning_rate": 3e-05, + "loss": 1.3882, + "num_input_tokens_seen": 238236160, + "step": 6790 + }, + { + "epoch": 0.5485599042036932, + "grad_norm": 0.2889584004878998, + "learning_rate": 3e-05, + "loss": 1.3232, + "num_input_tokens_seen": 238608400, + "step": 6800 + }, + { + "epoch": 0.5493666099451692, + "grad_norm": 0.2689494490623474, + "learning_rate": 3e-05, + "loss": 1.3191, + "num_input_tokens_seen": 238965484, + "step": 6810 + }, + { + "epoch": 0.5501733156866453, + "grad_norm": 0.2811024785041809, + "learning_rate": 3e-05, + "loss": 1.3883, + "num_input_tokens_seen": 239305156, + "step": 6820 + }, + { + "epoch": 0.5509800214281213, + "grad_norm": 0.29699015617370605, + "learning_rate": 3e-05, + "loss": 1.3527, + "num_input_tokens_seen": 239633044, + "step": 6830 + }, + { + "epoch": 0.5517867271695973, + "grad_norm": 0.2648441791534424, + "learning_rate": 3e-05, + "loss": 1.3464, + "num_input_tokens_seen": 239984216, + "step": 6840 + }, + { + "epoch": 0.5525934329110733, + "grad_norm": 0.2662919759750366, + "learning_rate": 3e-05, + "loss": 1.3185, + "num_input_tokens_seen": 240344632, + "step": 6850 + }, + { + "epoch": 0.5534001386525493, + "grad_norm": 0.3006437420845032, + "learning_rate": 3e-05, + "loss": 1.3587, + "num_input_tokens_seen": 240702092, + "step": 6860 + }, + { + "epoch": 0.5542068443940253, + "grad_norm": 0.27952778339385986, + "learning_rate": 3e-05, + "loss": 1.3546, + "num_input_tokens_seen": 241059608, + "step": 6870 + }, + { + "epoch": 0.5550135501355014, + "grad_norm": 0.27199041843414307, + "learning_rate": 3e-05, + "loss": 1.3444, + "num_input_tokens_seen": 241417296, + "step": 6880 + }, + { + "epoch": 0.5558202558769774, + "grad_norm": 0.2580903470516205, + "learning_rate": 3e-05, + "loss": 1.3458, + "num_input_tokens_seen": 241788856, + "step": 6890 + }, + { + "epoch": 0.5566269616184534, + "grad_norm": 0.2709527909755707, + "learning_rate": 3e-05, + "loss": 1.3785, + "num_input_tokens_seen": 242139900, + "step": 6900 + }, + { + "epoch": 0.5574336673599294, + "grad_norm": 0.30123209953308105, + "learning_rate": 3e-05, + "loss": 1.4256, + "num_input_tokens_seen": 242504148, + "step": 6910 + }, + { + "epoch": 0.5582403731014054, + "grad_norm": 0.2620568871498108, + "learning_rate": 3e-05, + "loss": 1.2906, + "num_input_tokens_seen": 242827892, + "step": 6920 + }, + { + "epoch": 0.5590470788428814, + "grad_norm": 0.2878223955631256, + "learning_rate": 3e-05, + "loss": 1.3146, + "num_input_tokens_seen": 243181464, + "step": 6930 + }, + { + "epoch": 0.5598537845843574, + "grad_norm": 0.26872310042381287, + "learning_rate": 3e-05, + "loss": 1.3387, + "num_input_tokens_seen": 243523144, + "step": 6940 + }, + { + "epoch": 0.5606604903258335, + "grad_norm": 0.30172818899154663, + "learning_rate": 3e-05, + "loss": 1.3215, + "num_input_tokens_seen": 243867668, + "step": 6950 + }, + { + "epoch": 0.5614671960673095, + "grad_norm": 0.2358444631099701, + "learning_rate": 3e-05, + "loss": 1.3118, + "num_input_tokens_seen": 244222464, + "step": 6960 + }, + { + "epoch": 0.5622739018087856, + "grad_norm": 0.2576392889022827, + "learning_rate": 3e-05, + "loss": 1.3382, + "num_input_tokens_seen": 244580240, + "step": 6970 + }, + { + "epoch": 0.5630806075502616, + "grad_norm": 0.2685336768627167, + "learning_rate": 3e-05, + "loss": 1.3596, + "num_input_tokens_seen": 244952004, + "step": 6980 + }, + { + "epoch": 0.5638873132917376, + "grad_norm": 0.3160952925682068, + "learning_rate": 3e-05, + "loss": 1.3249, + "num_input_tokens_seen": 245296600, + "step": 6990 + }, + { + "epoch": 0.5646940190332136, + "grad_norm": 0.2507816255092621, + "learning_rate": 3e-05, + "loss": 1.3148, + "num_input_tokens_seen": 245601908, + "step": 7000 + }, + { + "epoch": 0.5646940190332136, + "eval_gen_len": 458.28, + "eval_loss": 1.2807079553604126, + "eval_rouge1": 37.0123, + "eval_rouge2": 21.3666, + "eval_rougeL": 30.11, + "eval_rougeLsum": 35.0891, + "eval_runtime": 1549.9426, + "eval_samples_per_second": 0.129, + "eval_steps_per_second": 0.032, + "num_input_tokens_seen": 245601908, + "step": 7000 + }, + { + "epoch": 0.5655007247746896, + "grad_norm": 0.25211119651794434, + "learning_rate": 3e-05, + "loss": 1.3326, + "num_input_tokens_seen": 245935868, + "step": 7010 + }, + { + "epoch": 0.5663074305161656, + "grad_norm": 0.26975396275520325, + "learning_rate": 3e-05, + "loss": 1.344, + "num_input_tokens_seen": 246286460, + "step": 7020 + }, + { + "epoch": 0.5671141362576416, + "grad_norm": 0.25872379541397095, + "learning_rate": 3e-05, + "loss": 1.2972, + "num_input_tokens_seen": 246643836, + "step": 7030 + }, + { + "epoch": 0.5679208419991176, + "grad_norm": 0.25087055563926697, + "learning_rate": 3e-05, + "loss": 1.3577, + "num_input_tokens_seen": 246981328, + "step": 7040 + }, + { + "epoch": 0.5687275477405936, + "grad_norm": 0.29341551661491394, + "learning_rate": 3e-05, + "loss": 1.3222, + "num_input_tokens_seen": 247316288, + "step": 7050 + }, + { + "epoch": 0.5695342534820697, + "grad_norm": 0.2665810286998749, + "learning_rate": 3e-05, + "loss": 1.3704, + "num_input_tokens_seen": 247677216, + "step": 7060 + }, + { + "epoch": 0.5703409592235458, + "grad_norm": 0.2743811309337616, + "learning_rate": 3e-05, + "loss": 1.4113, + "num_input_tokens_seen": 248018060, + "step": 7070 + }, + { + "epoch": 0.5711476649650218, + "grad_norm": 0.2498067319393158, + "learning_rate": 3e-05, + "loss": 1.3407, + "num_input_tokens_seen": 248347064, + "step": 7080 + }, + { + "epoch": 0.5719543707064978, + "grad_norm": 0.31014665961265564, + "learning_rate": 3e-05, + "loss": 1.3535, + "num_input_tokens_seen": 248698972, + "step": 7090 + }, + { + "epoch": 0.5727610764479738, + "grad_norm": 0.304561972618103, + "learning_rate": 3e-05, + "loss": 1.3736, + "num_input_tokens_seen": 249061164, + "step": 7100 + }, + { + "epoch": 0.5735677821894498, + "grad_norm": 0.2791111469268799, + "learning_rate": 3e-05, + "loss": 1.3308, + "num_input_tokens_seen": 249394300, + "step": 7110 + }, + { + "epoch": 0.5743744879309258, + "grad_norm": 0.2657420039176941, + "learning_rate": 3e-05, + "loss": 1.3221, + "num_input_tokens_seen": 249749096, + "step": 7120 + }, + { + "epoch": 0.5751811936724018, + "grad_norm": 0.26944419741630554, + "learning_rate": 3e-05, + "loss": 1.2559, + "num_input_tokens_seen": 250099212, + "step": 7130 + }, + { + "epoch": 0.5759878994138778, + "grad_norm": 0.2616654336452484, + "learning_rate": 3e-05, + "loss": 1.3261, + "num_input_tokens_seen": 250468696, + "step": 7140 + }, + { + "epoch": 0.5767946051553539, + "grad_norm": 0.26480162143707275, + "learning_rate": 3e-05, + "loss": 1.3204, + "num_input_tokens_seen": 250820768, + "step": 7150 + }, + { + "epoch": 0.5776013108968299, + "grad_norm": 0.3008149266242981, + "learning_rate": 3e-05, + "loss": 1.3495, + "num_input_tokens_seen": 251157348, + "step": 7160 + }, + { + "epoch": 0.5784080166383059, + "grad_norm": 0.2766317129135132, + "learning_rate": 3e-05, + "loss": 1.3596, + "num_input_tokens_seen": 251534100, + "step": 7170 + }, + { + "epoch": 0.5792147223797819, + "grad_norm": 0.2724848687648773, + "learning_rate": 3e-05, + "loss": 1.364, + "num_input_tokens_seen": 251892536, + "step": 7180 + }, + { + "epoch": 0.580021428121258, + "grad_norm": 0.2869981825351715, + "learning_rate": 3e-05, + "loss": 1.3494, + "num_input_tokens_seen": 252257808, + "step": 7190 + }, + { + "epoch": 0.580828133862734, + "grad_norm": 0.2904507517814636, + "learning_rate": 3e-05, + "loss": 1.3117, + "num_input_tokens_seen": 252602564, + "step": 7200 + }, + { + "epoch": 0.58163483960421, + "grad_norm": 0.29304027557373047, + "learning_rate": 3e-05, + "loss": 1.3394, + "num_input_tokens_seen": 253000804, + "step": 7210 + }, + { + "epoch": 0.582441545345686, + "grad_norm": 0.2698158323764801, + "learning_rate": 3e-05, + "loss": 1.352, + "num_input_tokens_seen": 253349332, + "step": 7220 + }, + { + "epoch": 0.583248251087162, + "grad_norm": 0.259858638048172, + "learning_rate": 3e-05, + "loss": 1.3094, + "num_input_tokens_seen": 253711588, + "step": 7230 + }, + { + "epoch": 0.5840549568286381, + "grad_norm": 0.24673224985599518, + "learning_rate": 3e-05, + "loss": 1.3504, + "num_input_tokens_seen": 254032876, + "step": 7240 + }, + { + "epoch": 0.5848616625701141, + "grad_norm": 0.24645106494426727, + "learning_rate": 3e-05, + "loss": 1.3109, + "num_input_tokens_seen": 254388696, + "step": 7250 + }, + { + "epoch": 0.5856683683115901, + "grad_norm": 0.295168399810791, + "learning_rate": 3e-05, + "loss": 1.3275, + "num_input_tokens_seen": 254709932, + "step": 7260 + }, + { + "epoch": 0.5864750740530661, + "grad_norm": 0.28658369183540344, + "learning_rate": 3e-05, + "loss": 1.3473, + "num_input_tokens_seen": 255043668, + "step": 7270 + }, + { + "epoch": 0.5872817797945421, + "grad_norm": 0.26063695549964905, + "learning_rate": 3e-05, + "loss": 1.2875, + "num_input_tokens_seen": 255402252, + "step": 7280 + }, + { + "epoch": 0.5880884855360181, + "grad_norm": 0.2694176435470581, + "learning_rate": 3e-05, + "loss": 1.3576, + "num_input_tokens_seen": 255725676, + "step": 7290 + }, + { + "epoch": 0.5888951912774941, + "grad_norm": 0.2852911055088043, + "learning_rate": 3e-05, + "loss": 1.3456, + "num_input_tokens_seen": 256093604, + "step": 7300 + }, + { + "epoch": 0.5897018970189702, + "grad_norm": 0.2513694167137146, + "learning_rate": 3e-05, + "loss": 1.3225, + "num_input_tokens_seen": 256448048, + "step": 7310 + }, + { + "epoch": 0.5905086027604463, + "grad_norm": 0.2887936532497406, + "learning_rate": 3e-05, + "loss": 1.3593, + "num_input_tokens_seen": 256787872, + "step": 7320 + }, + { + "epoch": 0.5913153085019223, + "grad_norm": 0.2790989577770233, + "learning_rate": 3e-05, + "loss": 1.3286, + "num_input_tokens_seen": 257116080, + "step": 7330 + }, + { + "epoch": 0.5921220142433983, + "grad_norm": 0.2735912799835205, + "learning_rate": 3e-05, + "loss": 1.3624, + "num_input_tokens_seen": 257477564, + "step": 7340 + }, + { + "epoch": 0.5929287199848743, + "grad_norm": 0.252945214509964, + "learning_rate": 3e-05, + "loss": 1.3592, + "num_input_tokens_seen": 257851048, + "step": 7350 + }, + { + "epoch": 0.5937354257263503, + "grad_norm": 0.3211391866207123, + "learning_rate": 3e-05, + "loss": 1.3686, + "num_input_tokens_seen": 258218684, + "step": 7360 + }, + { + "epoch": 0.5945421314678263, + "grad_norm": 0.30502480268478394, + "learning_rate": 3e-05, + "loss": 1.3227, + "num_input_tokens_seen": 258573844, + "step": 7370 + }, + { + "epoch": 0.5953488372093023, + "grad_norm": 0.28097566962242126, + "learning_rate": 3e-05, + "loss": 1.294, + "num_input_tokens_seen": 258943588, + "step": 7380 + }, + { + "epoch": 0.5961555429507783, + "grad_norm": 0.27676570415496826, + "learning_rate": 3e-05, + "loss": 1.2757, + "num_input_tokens_seen": 259313468, + "step": 7390 + }, + { + "epoch": 0.5969622486922543, + "grad_norm": 0.25486335158348083, + "learning_rate": 3e-05, + "loss": 1.3064, + "num_input_tokens_seen": 259666520, + "step": 7400 + }, + { + "epoch": 0.5977689544337303, + "grad_norm": 0.26223358511924744, + "learning_rate": 3e-05, + "loss": 1.319, + "num_input_tokens_seen": 260013508, + "step": 7410 + }, + { + "epoch": 0.5985756601752064, + "grad_norm": 0.25428274273872375, + "learning_rate": 3e-05, + "loss": 1.3169, + "num_input_tokens_seen": 260358660, + "step": 7420 + }, + { + "epoch": 0.5993823659166824, + "grad_norm": 0.26828479766845703, + "learning_rate": 3e-05, + "loss": 1.2931, + "num_input_tokens_seen": 260698792, + "step": 7430 + }, + { + "epoch": 0.6001890716581585, + "grad_norm": 0.286696195602417, + "learning_rate": 3e-05, + "loss": 1.3534, + "num_input_tokens_seen": 261051896, + "step": 7440 + }, + { + "epoch": 0.6009957773996345, + "grad_norm": 0.2686040699481964, + "learning_rate": 3e-05, + "loss": 1.3426, + "num_input_tokens_seen": 261413924, + "step": 7450 + }, + { + "epoch": 0.6018024831411105, + "grad_norm": 0.3073362112045288, + "learning_rate": 3e-05, + "loss": 1.3457, + "num_input_tokens_seen": 261759140, + "step": 7460 + }, + { + "epoch": 0.6026091888825865, + "grad_norm": 0.3184793293476105, + "learning_rate": 3e-05, + "loss": 1.2967, + "num_input_tokens_seen": 262101808, + "step": 7470 + }, + { + "epoch": 0.6034158946240625, + "grad_norm": 0.29304739832878113, + "learning_rate": 3e-05, + "loss": 1.2587, + "num_input_tokens_seen": 262420468, + "step": 7480 + }, + { + "epoch": 0.6042226003655385, + "grad_norm": 0.2968464493751526, + "learning_rate": 3e-05, + "loss": 1.306, + "num_input_tokens_seen": 262830180, + "step": 7490 + }, + { + "epoch": 0.6050293061070146, + "grad_norm": 0.29142898321151733, + "learning_rate": 3e-05, + "loss": 1.3333, + "num_input_tokens_seen": 263164684, + "step": 7500 + }, + { + "epoch": 0.6058360118484906, + "grad_norm": 0.4284871220588684, + "learning_rate": 3e-05, + "loss": 1.3222, + "num_input_tokens_seen": 263534376, + "step": 7510 + }, + { + "epoch": 0.6066427175899666, + "grad_norm": 0.25819581747055054, + "learning_rate": 3e-05, + "loss": 1.3673, + "num_input_tokens_seen": 263888472, + "step": 7520 + }, + { + "epoch": 0.6074494233314426, + "grad_norm": 0.2647174298763275, + "learning_rate": 3e-05, + "loss": 1.3096, + "num_input_tokens_seen": 264241468, + "step": 7530 + }, + { + "epoch": 0.6082561290729186, + "grad_norm": 0.26796919107437134, + "learning_rate": 3e-05, + "loss": 1.3448, + "num_input_tokens_seen": 264578804, + "step": 7540 + }, + { + "epoch": 0.6090628348143946, + "grad_norm": 0.2776355445384979, + "learning_rate": 3e-05, + "loss": 1.3448, + "num_input_tokens_seen": 264951596, + "step": 7550 + }, + { + "epoch": 0.6098695405558707, + "grad_norm": 0.2773449420928955, + "learning_rate": 3e-05, + "loss": 1.2945, + "num_input_tokens_seen": 265319936, + "step": 7560 + }, + { + "epoch": 0.6106762462973467, + "grad_norm": 0.28379198908805847, + "learning_rate": 3e-05, + "loss": 1.3447, + "num_input_tokens_seen": 265671716, + "step": 7570 + }, + { + "epoch": 0.6114829520388227, + "grad_norm": 0.2618366777896881, + "learning_rate": 3e-05, + "loss": 1.3163, + "num_input_tokens_seen": 266015756, + "step": 7580 + }, + { + "epoch": 0.6122896577802988, + "grad_norm": 0.25278952717781067, + "learning_rate": 3e-05, + "loss": 1.3918, + "num_input_tokens_seen": 266371472, + "step": 7590 + }, + { + "epoch": 0.6130963635217748, + "grad_norm": 0.29882779717445374, + "learning_rate": 3e-05, + "loss": 1.3239, + "num_input_tokens_seen": 266726232, + "step": 7600 + }, + { + "epoch": 0.6139030692632508, + "grad_norm": 0.2473878711462021, + "learning_rate": 3e-05, + "loss": 1.3322, + "num_input_tokens_seen": 267077476, + "step": 7610 + }, + { + "epoch": 0.6147097750047268, + "grad_norm": 0.26804500818252563, + "learning_rate": 3e-05, + "loss": 1.3116, + "num_input_tokens_seen": 267412680, + "step": 7620 + }, + { + "epoch": 0.6155164807462028, + "grad_norm": 0.2616485059261322, + "learning_rate": 3e-05, + "loss": 1.3633, + "num_input_tokens_seen": 267745836, + "step": 7630 + }, + { + "epoch": 0.6163231864876788, + "grad_norm": 0.29525431990623474, + "learning_rate": 3e-05, + "loss": 1.291, + "num_input_tokens_seen": 268063588, + "step": 7640 + }, + { + "epoch": 0.6171298922291548, + "grad_norm": 0.2977356016635895, + "learning_rate": 3e-05, + "loss": 1.2814, + "num_input_tokens_seen": 268434424, + "step": 7650 + }, + { + "epoch": 0.6179365979706308, + "grad_norm": 0.23729270696640015, + "learning_rate": 3e-05, + "loss": 1.3602, + "num_input_tokens_seen": 268807668, + "step": 7660 + }, + { + "epoch": 0.6187433037121068, + "grad_norm": 0.26896366477012634, + "learning_rate": 3e-05, + "loss": 1.3211, + "num_input_tokens_seen": 269146372, + "step": 7670 + }, + { + "epoch": 0.619550009453583, + "grad_norm": 0.27436563372612, + "learning_rate": 3e-05, + "loss": 1.3661, + "num_input_tokens_seen": 269516044, + "step": 7680 + }, + { + "epoch": 0.620356715195059, + "grad_norm": 0.2542715072631836, + "learning_rate": 3e-05, + "loss": 1.3132, + "num_input_tokens_seen": 269885468, + "step": 7690 + }, + { + "epoch": 0.621163420936535, + "grad_norm": 0.24955634772777557, + "learning_rate": 3e-05, + "loss": 1.3579, + "num_input_tokens_seen": 270234860, + "step": 7700 + }, + { + "epoch": 0.621970126678011, + "grad_norm": 0.2842109799385071, + "learning_rate": 3e-05, + "loss": 1.3533, + "num_input_tokens_seen": 270588556, + "step": 7710 + }, + { + "epoch": 0.622776832419487, + "grad_norm": 0.2718474864959717, + "learning_rate": 3e-05, + "loss": 1.3144, + "num_input_tokens_seen": 270900420, + "step": 7720 + }, + { + "epoch": 0.623583538160963, + "grad_norm": 0.254463255405426, + "learning_rate": 3e-05, + "loss": 1.3094, + "num_input_tokens_seen": 271266568, + "step": 7730 + }, + { + "epoch": 0.624390243902439, + "grad_norm": 0.2510669529438019, + "learning_rate": 3e-05, + "loss": 1.3114, + "num_input_tokens_seen": 271611104, + "step": 7740 + }, + { + "epoch": 0.625196949643915, + "grad_norm": 0.25194868445396423, + "learning_rate": 3e-05, + "loss": 1.3211, + "num_input_tokens_seen": 271961836, + "step": 7750 + }, + { + "epoch": 0.626003655385391, + "grad_norm": 0.26476165652275085, + "learning_rate": 3e-05, + "loss": 1.3175, + "num_input_tokens_seen": 272308540, + "step": 7760 + }, + { + "epoch": 0.626810361126867, + "grad_norm": 0.2768157720565796, + "learning_rate": 3e-05, + "loss": 1.3405, + "num_input_tokens_seen": 272680724, + "step": 7770 + }, + { + "epoch": 0.6276170668683431, + "grad_norm": 0.25789812207221985, + "learning_rate": 3e-05, + "loss": 1.3044, + "num_input_tokens_seen": 273040688, + "step": 7780 + }, + { + "epoch": 0.6284237726098191, + "grad_norm": 0.2867225408554077, + "learning_rate": 3e-05, + "loss": 1.3424, + "num_input_tokens_seen": 273396808, + "step": 7790 + }, + { + "epoch": 0.6292304783512951, + "grad_norm": 0.27524057030677795, + "learning_rate": 3e-05, + "loss": 1.3146, + "num_input_tokens_seen": 273725436, + "step": 7800 + }, + { + "epoch": 0.6300371840927712, + "grad_norm": 0.30353033542633057, + "learning_rate": 3e-05, + "loss": 1.2903, + "num_input_tokens_seen": 274061044, + "step": 7810 + }, + { + "epoch": 0.6308438898342472, + "grad_norm": 0.2527361810207367, + "learning_rate": 3e-05, + "loss": 1.3225, + "num_input_tokens_seen": 274423428, + "step": 7820 + }, + { + "epoch": 0.6316505955757232, + "grad_norm": 0.27751225233078003, + "learning_rate": 3e-05, + "loss": 1.3112, + "num_input_tokens_seen": 274770512, + "step": 7830 + }, + { + "epoch": 0.6324573013171992, + "grad_norm": 0.29832029342651367, + "learning_rate": 3e-05, + "loss": 1.3537, + "num_input_tokens_seen": 275116652, + "step": 7840 + }, + { + "epoch": 0.6332640070586752, + "grad_norm": 0.24705222249031067, + "learning_rate": 3e-05, + "loss": 1.3255, + "num_input_tokens_seen": 275512076, + "step": 7850 + }, + { + "epoch": 0.6340707128001513, + "grad_norm": 0.2816605567932129, + "learning_rate": 3e-05, + "loss": 1.3099, + "num_input_tokens_seen": 275863084, + "step": 7860 + }, + { + "epoch": 0.6348774185416273, + "grad_norm": 0.2889770567417145, + "learning_rate": 3e-05, + "loss": 1.324, + "num_input_tokens_seen": 276235188, + "step": 7870 + }, + { + "epoch": 0.6356841242831033, + "grad_norm": 0.2934252619743347, + "learning_rate": 3e-05, + "loss": 1.31, + "num_input_tokens_seen": 276601124, + "step": 7880 + }, + { + "epoch": 0.6364908300245793, + "grad_norm": 0.2529415488243103, + "learning_rate": 3e-05, + "loss": 1.2902, + "num_input_tokens_seen": 276959408, + "step": 7890 + }, + { + "epoch": 0.6372975357660553, + "grad_norm": 0.24715226888656616, + "learning_rate": 3e-05, + "loss": 1.2716, + "num_input_tokens_seen": 277332960, + "step": 7900 + }, + { + "epoch": 0.6381042415075313, + "grad_norm": 0.28011465072631836, + "learning_rate": 3e-05, + "loss": 1.3777, + "num_input_tokens_seen": 277666204, + "step": 7910 + }, + { + "epoch": 0.6389109472490073, + "grad_norm": 0.25102949142456055, + "learning_rate": 3e-05, + "loss": 1.2916, + "num_input_tokens_seen": 278007212, + "step": 7920 + }, + { + "epoch": 0.6397176529904834, + "grad_norm": 0.2727227210998535, + "learning_rate": 3e-05, + "loss": 1.2966, + "num_input_tokens_seen": 278373292, + "step": 7930 + }, + { + "epoch": 0.6405243587319595, + "grad_norm": 0.2720615267753601, + "learning_rate": 3e-05, + "loss": 1.3667, + "num_input_tokens_seen": 278725792, + "step": 7940 + }, + { + "epoch": 0.6413310644734355, + "grad_norm": 0.2724305987358093, + "learning_rate": 3e-05, + "loss": 1.2488, + "num_input_tokens_seen": 279085604, + "step": 7950 + }, + { + "epoch": 0.6421377702149115, + "grad_norm": 0.26985448598861694, + "learning_rate": 3e-05, + "loss": 1.3151, + "num_input_tokens_seen": 279472276, + "step": 7960 + }, + { + "epoch": 0.6429444759563875, + "grad_norm": 0.2569502890110016, + "learning_rate": 3e-05, + "loss": 1.2984, + "num_input_tokens_seen": 279809104, + "step": 7970 + }, + { + "epoch": 0.6437511816978635, + "grad_norm": 0.2665258049964905, + "learning_rate": 3e-05, + "loss": 1.2771, + "num_input_tokens_seen": 280160428, + "step": 7980 + }, + { + "epoch": 0.6445578874393395, + "grad_norm": 0.25413599610328674, + "learning_rate": 3e-05, + "loss": 1.2941, + "num_input_tokens_seen": 280518244, + "step": 7990 + }, + { + "epoch": 0.6453645931808155, + "grad_norm": 0.2681139409542084, + "learning_rate": 3e-05, + "loss": 1.2859, + "num_input_tokens_seen": 280866724, + "step": 8000 + }, + { + "epoch": 0.6453645931808155, + "eval_gen_len": 452.495, + "eval_loss": 1.2491791248321533, + "eval_rouge1": 37.05, + "eval_rouge2": 21.0468, + "eval_rougeL": 29.7988, + "eval_rougeLsum": 35.1882, + "eval_runtime": 1853.3751, + "eval_samples_per_second": 0.108, + "eval_steps_per_second": 0.027, + "num_input_tokens_seen": 280866724, + "step": 8000 + }, + { + "epoch": 0.6461712989222915, + "grad_norm": 0.24775606393814087, + "learning_rate": 3e-05, + "loss": 1.3535, + "num_input_tokens_seen": 281189452, + "step": 8010 + }, + { + "epoch": 0.6469780046637675, + "grad_norm": 0.2870043218135834, + "learning_rate": 3e-05, + "loss": 1.2518, + "num_input_tokens_seen": 281504304, + "step": 8020 + }, + { + "epoch": 0.6477847104052435, + "grad_norm": 0.26712578535079956, + "learning_rate": 3e-05, + "loss": 1.3021, + "num_input_tokens_seen": 281829352, + "step": 8030 + }, + { + "epoch": 0.6485914161467196, + "grad_norm": 0.24101081490516663, + "learning_rate": 3e-05, + "loss": 1.3482, + "num_input_tokens_seen": 282176564, + "step": 8040 + }, + { + "epoch": 0.6493981218881957, + "grad_norm": 0.278340220451355, + "learning_rate": 3e-05, + "loss": 1.2904, + "num_input_tokens_seen": 282536828, + "step": 8050 + }, + { + "epoch": 0.6502048276296717, + "grad_norm": 0.238587886095047, + "learning_rate": 3e-05, + "loss": 1.3017, + "num_input_tokens_seen": 282870112, + "step": 8060 + }, + { + "epoch": 0.6510115333711477, + "grad_norm": 0.25995177030563354, + "learning_rate": 3e-05, + "loss": 1.3319, + "num_input_tokens_seen": 283211856, + "step": 8070 + }, + { + "epoch": 0.6518182391126237, + "grad_norm": 0.25454819202423096, + "learning_rate": 3e-05, + "loss": 1.2937, + "num_input_tokens_seen": 283557108, + "step": 8080 + }, + { + "epoch": 0.6526249448540997, + "grad_norm": 0.2610025405883789, + "learning_rate": 3e-05, + "loss": 1.3105, + "num_input_tokens_seen": 283939544, + "step": 8090 + }, + { + "epoch": 0.6534316505955757, + "grad_norm": 0.2735656797885895, + "learning_rate": 3e-05, + "loss": 1.3028, + "num_input_tokens_seen": 284295512, + "step": 8100 + }, + { + "epoch": 0.6542383563370517, + "grad_norm": 0.2599696218967438, + "learning_rate": 3e-05, + "loss": 1.2618, + "num_input_tokens_seen": 284660732, + "step": 8110 + }, + { + "epoch": 0.6550450620785278, + "grad_norm": 0.2804352343082428, + "learning_rate": 3e-05, + "loss": 1.3266, + "num_input_tokens_seen": 284985504, + "step": 8120 + }, + { + "epoch": 0.6558517678200038, + "grad_norm": 0.26796236634254456, + "learning_rate": 3e-05, + "loss": 1.2956, + "num_input_tokens_seen": 285352540, + "step": 8130 + }, + { + "epoch": 0.6566584735614798, + "grad_norm": 0.26151329278945923, + "learning_rate": 3e-05, + "loss": 1.295, + "num_input_tokens_seen": 285678276, + "step": 8140 + }, + { + "epoch": 0.6574651793029558, + "grad_norm": 0.2699349522590637, + "learning_rate": 3e-05, + "loss": 1.3056, + "num_input_tokens_seen": 286023024, + "step": 8150 + }, + { + "epoch": 0.6582718850444318, + "grad_norm": 0.2832753360271454, + "learning_rate": 3e-05, + "loss": 1.2952, + "num_input_tokens_seen": 286356540, + "step": 8160 + }, + { + "epoch": 0.6590785907859079, + "grad_norm": 0.26573285460472107, + "learning_rate": 3e-05, + "loss": 1.3337, + "num_input_tokens_seen": 286700992, + "step": 8170 + }, + { + "epoch": 0.6598852965273839, + "grad_norm": 0.26687324047088623, + "learning_rate": 3e-05, + "loss": 1.2572, + "num_input_tokens_seen": 287050020, + "step": 8180 + }, + { + "epoch": 0.6606920022688599, + "grad_norm": 0.26736560463905334, + "learning_rate": 3e-05, + "loss": 1.2766, + "num_input_tokens_seen": 287417060, + "step": 8190 + }, + { + "epoch": 0.661498708010336, + "grad_norm": 0.26670607924461365, + "learning_rate": 3e-05, + "loss": 1.2882, + "num_input_tokens_seen": 287741024, + "step": 8200 + }, + { + "epoch": 0.662305413751812, + "grad_norm": 0.29119133949279785, + "learning_rate": 3e-05, + "loss": 1.3212, + "num_input_tokens_seen": 288119560, + "step": 8210 + }, + { + "epoch": 0.663112119493288, + "grad_norm": 0.26133713126182556, + "learning_rate": 3e-05, + "loss": 1.2795, + "num_input_tokens_seen": 288463204, + "step": 8220 + }, + { + "epoch": 0.663918825234764, + "grad_norm": 0.2923208177089691, + "learning_rate": 3e-05, + "loss": 1.3034, + "num_input_tokens_seen": 288775820, + "step": 8230 + }, + { + "epoch": 0.66472553097624, + "grad_norm": 0.24762633442878723, + "learning_rate": 3e-05, + "loss": 1.2558, + "num_input_tokens_seen": 289102444, + "step": 8240 + }, + { + "epoch": 0.665532236717716, + "grad_norm": 0.29962268471717834, + "learning_rate": 3e-05, + "loss": 1.3248, + "num_input_tokens_seen": 289448904, + "step": 8250 + }, + { + "epoch": 0.666338942459192, + "grad_norm": 0.27402591705322266, + "learning_rate": 3e-05, + "loss": 1.3558, + "num_input_tokens_seen": 289778028, + "step": 8260 + }, + { + "epoch": 0.667145648200668, + "grad_norm": 0.28333625197410583, + "learning_rate": 3e-05, + "loss": 1.2694, + "num_input_tokens_seen": 290149804, + "step": 8270 + }, + { + "epoch": 0.667952353942144, + "grad_norm": 0.26104313135147095, + "learning_rate": 3e-05, + "loss": 1.2988, + "num_input_tokens_seen": 290506276, + "step": 8280 + }, + { + "epoch": 0.66875905968362, + "grad_norm": 0.26603755354881287, + "learning_rate": 3e-05, + "loss": 1.3397, + "num_input_tokens_seen": 290867248, + "step": 8290 + }, + { + "epoch": 0.6695657654250962, + "grad_norm": 0.2591850459575653, + "learning_rate": 3e-05, + "loss": 1.27, + "num_input_tokens_seen": 291243604, + "step": 8300 + }, + { + "epoch": 0.6703724711665722, + "grad_norm": 0.2640308141708374, + "learning_rate": 3e-05, + "loss": 1.3251, + "num_input_tokens_seen": 291586776, + "step": 8310 + }, + { + "epoch": 0.6711791769080482, + "grad_norm": 0.29766708612442017, + "learning_rate": 3e-05, + "loss": 1.2843, + "num_input_tokens_seen": 291935504, + "step": 8320 + }, + { + "epoch": 0.6719858826495242, + "grad_norm": 0.24987733364105225, + "learning_rate": 3e-05, + "loss": 1.3028, + "num_input_tokens_seen": 292267852, + "step": 8330 + }, + { + "epoch": 0.6727925883910002, + "grad_norm": 0.26682114601135254, + "learning_rate": 3e-05, + "loss": 1.3688, + "num_input_tokens_seen": 292629948, + "step": 8340 + }, + { + "epoch": 0.6735992941324762, + "grad_norm": 0.25744229555130005, + "learning_rate": 3e-05, + "loss": 1.294, + "num_input_tokens_seen": 293012512, + "step": 8350 + }, + { + "epoch": 0.6744059998739522, + "grad_norm": 0.2486562579870224, + "learning_rate": 3e-05, + "loss": 1.2671, + "num_input_tokens_seen": 293349760, + "step": 8360 + }, + { + "epoch": 0.6752127056154282, + "grad_norm": 0.27496910095214844, + "learning_rate": 3e-05, + "loss": 1.2869, + "num_input_tokens_seen": 293707052, + "step": 8370 + }, + { + "epoch": 0.6760194113569042, + "grad_norm": 0.26703888177871704, + "learning_rate": 3e-05, + "loss": 1.279, + "num_input_tokens_seen": 294091848, + "step": 8380 + }, + { + "epoch": 0.6768261170983803, + "grad_norm": 0.3000788390636444, + "learning_rate": 3e-05, + "loss": 1.2746, + "num_input_tokens_seen": 294459400, + "step": 8390 + }, + { + "epoch": 0.6776328228398563, + "grad_norm": 0.2827373147010803, + "learning_rate": 3e-05, + "loss": 1.2711, + "num_input_tokens_seen": 294815776, + "step": 8400 + }, + { + "epoch": 0.6784395285813323, + "grad_norm": 0.2718258500099182, + "learning_rate": 3e-05, + "loss": 1.269, + "num_input_tokens_seen": 295182760, + "step": 8410 + }, + { + "epoch": 0.6792462343228084, + "grad_norm": 0.2768170237541199, + "learning_rate": 3e-05, + "loss": 1.3107, + "num_input_tokens_seen": 295545640, + "step": 8420 + }, + { + "epoch": 0.6800529400642844, + "grad_norm": 0.29544582962989807, + "learning_rate": 3e-05, + "loss": 1.3062, + "num_input_tokens_seen": 295896292, + "step": 8430 + }, + { + "epoch": 0.6808596458057604, + "grad_norm": 0.2775704264640808, + "learning_rate": 3e-05, + "loss": 1.2903, + "num_input_tokens_seen": 296238076, + "step": 8440 + }, + { + "epoch": 0.6816663515472364, + "grad_norm": 0.29178759455680847, + "learning_rate": 3e-05, + "loss": 1.2591, + "num_input_tokens_seen": 296593652, + "step": 8450 + }, + { + "epoch": 0.6824730572887124, + "grad_norm": 0.2721198797225952, + "learning_rate": 3e-05, + "loss": 1.3322, + "num_input_tokens_seen": 296929900, + "step": 8460 + }, + { + "epoch": 0.6832797630301884, + "grad_norm": 0.27254942059516907, + "learning_rate": 3e-05, + "loss": 1.2871, + "num_input_tokens_seen": 297259576, + "step": 8470 + }, + { + "epoch": 0.6840864687716645, + "grad_norm": 0.25439295172691345, + "learning_rate": 3e-05, + "loss": 1.3424, + "num_input_tokens_seen": 297618016, + "step": 8480 + }, + { + "epoch": 0.6848931745131405, + "grad_norm": 0.2755286395549774, + "learning_rate": 3e-05, + "loss": 1.3161, + "num_input_tokens_seen": 297965452, + "step": 8490 + }, + { + "epoch": 0.6856998802546165, + "grad_norm": 0.2616944909095764, + "learning_rate": 3e-05, + "loss": 1.2693, + "num_input_tokens_seen": 298311824, + "step": 8500 + }, + { + "epoch": 0.6865065859960925, + "grad_norm": 0.26057368516921997, + "learning_rate": 3e-05, + "loss": 1.3227, + "num_input_tokens_seen": 298690492, + "step": 8510 + }, + { + "epoch": 0.6873132917375685, + "grad_norm": 0.28719767928123474, + "learning_rate": 3e-05, + "loss": 1.3112, + "num_input_tokens_seen": 299031280, + "step": 8520 + }, + { + "epoch": 0.6881199974790445, + "grad_norm": 0.2910424768924713, + "learning_rate": 3e-05, + "loss": 1.2747, + "num_input_tokens_seen": 299391836, + "step": 8530 + }, + { + "epoch": 0.6889267032205206, + "grad_norm": 0.24181599915027618, + "learning_rate": 3e-05, + "loss": 1.2351, + "num_input_tokens_seen": 299756312, + "step": 8540 + }, + { + "epoch": 0.6897334089619966, + "grad_norm": 0.30020081996917725, + "learning_rate": 3e-05, + "loss": 1.2665, + "num_input_tokens_seen": 300106552, + "step": 8550 + }, + { + "epoch": 0.6905401147034727, + "grad_norm": 0.24974121153354645, + "learning_rate": 3e-05, + "loss": 1.2678, + "num_input_tokens_seen": 300444076, + "step": 8560 + }, + { + "epoch": 0.6913468204449487, + "grad_norm": 0.24613253772258759, + "learning_rate": 3e-05, + "loss": 1.3276, + "num_input_tokens_seen": 300774136, + "step": 8570 + }, + { + "epoch": 0.6921535261864247, + "grad_norm": 0.24651503562927246, + "learning_rate": 3e-05, + "loss": 1.3277, + "num_input_tokens_seen": 301144656, + "step": 8580 + }, + { + "epoch": 0.6929602319279007, + "grad_norm": 0.2640286982059479, + "learning_rate": 3e-05, + "loss": 1.2923, + "num_input_tokens_seen": 301508348, + "step": 8590 + }, + { + "epoch": 0.6937669376693767, + "grad_norm": 0.2569688558578491, + "learning_rate": 3e-05, + "loss": 1.3099, + "num_input_tokens_seen": 301875020, + "step": 8600 + }, + { + "epoch": 0.6945736434108527, + "grad_norm": 0.2461465746164322, + "learning_rate": 3e-05, + "loss": 1.2561, + "num_input_tokens_seen": 302220640, + "step": 8610 + }, + { + "epoch": 0.6953803491523287, + "grad_norm": 0.28222113847732544, + "learning_rate": 3e-05, + "loss": 1.3067, + "num_input_tokens_seen": 302536036, + "step": 8620 + }, + { + "epoch": 0.6961870548938047, + "grad_norm": 0.2518487870693207, + "learning_rate": 3e-05, + "loss": 1.2807, + "num_input_tokens_seen": 302868688, + "step": 8630 + }, + { + "epoch": 0.6969937606352807, + "grad_norm": 0.2543613016605377, + "learning_rate": 3e-05, + "loss": 1.2858, + "num_input_tokens_seen": 303210396, + "step": 8640 + }, + { + "epoch": 0.6978004663767567, + "grad_norm": 0.237895667552948, + "learning_rate": 3e-05, + "loss": 1.2601, + "num_input_tokens_seen": 303569724, + "step": 8650 + }, + { + "epoch": 0.6986071721182328, + "grad_norm": 0.2580051124095917, + "learning_rate": 3e-05, + "loss": 1.2494, + "num_input_tokens_seen": 303930156, + "step": 8660 + }, + { + "epoch": 0.6994138778597089, + "grad_norm": 0.269072949886322, + "learning_rate": 3e-05, + "loss": 1.3098, + "num_input_tokens_seen": 304273948, + "step": 8670 + }, + { + "epoch": 0.7002205836011849, + "grad_norm": 0.24792876839637756, + "learning_rate": 3e-05, + "loss": 1.2807, + "num_input_tokens_seen": 304633556, + "step": 8680 + }, + { + "epoch": 0.7010272893426609, + "grad_norm": 0.30012139678001404, + "learning_rate": 3e-05, + "loss": 1.2371, + "num_input_tokens_seen": 304995848, + "step": 8690 + }, + { + "epoch": 0.7018339950841369, + "grad_norm": 0.26541563868522644, + "learning_rate": 3e-05, + "loss": 1.3035, + "num_input_tokens_seen": 305341180, + "step": 8700 + }, + { + "epoch": 0.7026407008256129, + "grad_norm": 0.2490505874156952, + "learning_rate": 3e-05, + "loss": 1.3162, + "num_input_tokens_seen": 305687480, + "step": 8710 + }, + { + "epoch": 0.7034474065670889, + "grad_norm": 0.2835010290145874, + "learning_rate": 3e-05, + "loss": 1.2839, + "num_input_tokens_seen": 306057544, + "step": 8720 + }, + { + "epoch": 0.7042541123085649, + "grad_norm": 0.28543031215667725, + "learning_rate": 3e-05, + "loss": 1.303, + "num_input_tokens_seen": 306430396, + "step": 8730 + }, + { + "epoch": 0.705060818050041, + "grad_norm": 0.26753681898117065, + "learning_rate": 3e-05, + "loss": 1.3161, + "num_input_tokens_seen": 306769880, + "step": 8740 + }, + { + "epoch": 0.705867523791517, + "grad_norm": 0.26406893134117126, + "learning_rate": 3e-05, + "loss": 1.2677, + "num_input_tokens_seen": 307115084, + "step": 8750 + }, + { + "epoch": 0.706674229532993, + "grad_norm": 0.25874289870262146, + "learning_rate": 3e-05, + "loss": 1.313, + "num_input_tokens_seen": 307459572, + "step": 8760 + }, + { + "epoch": 0.707480935274469, + "grad_norm": 0.26951470971107483, + "learning_rate": 3e-05, + "loss": 1.2776, + "num_input_tokens_seen": 307829708, + "step": 8770 + }, + { + "epoch": 0.708287641015945, + "grad_norm": 0.26149865984916687, + "learning_rate": 3e-05, + "loss": 1.2774, + "num_input_tokens_seen": 308183368, + "step": 8780 + }, + { + "epoch": 0.7090943467574211, + "grad_norm": 0.27776703238487244, + "learning_rate": 3e-05, + "loss": 1.2743, + "num_input_tokens_seen": 308541012, + "step": 8790 + }, + { + "epoch": 0.7099010524988971, + "grad_norm": 0.2505494952201843, + "learning_rate": 3e-05, + "loss": 1.3011, + "num_input_tokens_seen": 308894080, + "step": 8800 + }, + { + "epoch": 0.7107077582403731, + "grad_norm": 0.26979315280914307, + "learning_rate": 3e-05, + "loss": 1.281, + "num_input_tokens_seen": 309264200, + "step": 8810 + }, + { + "epoch": 0.7115144639818491, + "grad_norm": 0.29816481471061707, + "learning_rate": 3e-05, + "loss": 1.2514, + "num_input_tokens_seen": 309617452, + "step": 8820 + }, + { + "epoch": 0.7123211697233252, + "grad_norm": 0.2611445188522339, + "learning_rate": 3e-05, + "loss": 1.317, + "num_input_tokens_seen": 309992684, + "step": 8830 + }, + { + "epoch": 0.7131278754648012, + "grad_norm": 0.24103762209415436, + "learning_rate": 3e-05, + "loss": 1.2898, + "num_input_tokens_seen": 310335500, + "step": 8840 + }, + { + "epoch": 0.7139345812062772, + "grad_norm": 0.2735673487186432, + "learning_rate": 3e-05, + "loss": 1.3122, + "num_input_tokens_seen": 310688468, + "step": 8850 + }, + { + "epoch": 0.7147412869477532, + "grad_norm": 0.28114932775497437, + "learning_rate": 3e-05, + "loss": 1.3069, + "num_input_tokens_seen": 311050176, + "step": 8860 + }, + { + "epoch": 0.7155479926892292, + "grad_norm": 0.235976904630661, + "learning_rate": 3e-05, + "loss": 1.2693, + "num_input_tokens_seen": 311402020, + "step": 8870 + }, + { + "epoch": 0.7163546984307052, + "grad_norm": 0.26939788460731506, + "learning_rate": 3e-05, + "loss": 1.3097, + "num_input_tokens_seen": 311759596, + "step": 8880 + }, + { + "epoch": 0.7171614041721812, + "grad_norm": 0.25951477885246277, + "learning_rate": 3e-05, + "loss": 1.2904, + "num_input_tokens_seen": 312097396, + "step": 8890 + }, + { + "epoch": 0.7179681099136572, + "grad_norm": 0.2675970196723938, + "learning_rate": 3e-05, + "loss": 1.2992, + "num_input_tokens_seen": 312460216, + "step": 8900 + }, + { + "epoch": 0.7187748156551333, + "grad_norm": 0.25855639576911926, + "learning_rate": 3e-05, + "loss": 1.2781, + "num_input_tokens_seen": 312826724, + "step": 8910 + }, + { + "epoch": 0.7195815213966094, + "grad_norm": 0.2917179465293884, + "learning_rate": 3e-05, + "loss": 1.2743, + "num_input_tokens_seen": 313191192, + "step": 8920 + }, + { + "epoch": 0.7203882271380854, + "grad_norm": 0.2799781262874603, + "learning_rate": 3e-05, + "loss": 1.2837, + "num_input_tokens_seen": 313535952, + "step": 8930 + }, + { + "epoch": 0.7211949328795614, + "grad_norm": 0.27598562836647034, + "learning_rate": 3e-05, + "loss": 1.2682, + "num_input_tokens_seen": 313891512, + "step": 8940 + }, + { + "epoch": 0.7220016386210374, + "grad_norm": 0.24936188757419586, + "learning_rate": 3e-05, + "loss": 1.2888, + "num_input_tokens_seen": 314237120, + "step": 8950 + }, + { + "epoch": 0.7228083443625134, + "grad_norm": 0.28626489639282227, + "learning_rate": 3e-05, + "loss": 1.2901, + "num_input_tokens_seen": 314598912, + "step": 8960 + }, + { + "epoch": 0.7236150501039894, + "grad_norm": 0.25209441781044006, + "learning_rate": 3e-05, + "loss": 1.3165, + "num_input_tokens_seen": 314942440, + "step": 8970 + }, + { + "epoch": 0.7244217558454654, + "grad_norm": 0.27700820565223694, + "learning_rate": 3e-05, + "loss": 1.2825, + "num_input_tokens_seen": 315310504, + "step": 8980 + }, + { + "epoch": 0.7252284615869414, + "grad_norm": 0.2671830356121063, + "learning_rate": 3e-05, + "loss": 1.276, + "num_input_tokens_seen": 315682248, + "step": 8990 + }, + { + "epoch": 0.7260351673284174, + "grad_norm": 0.3061155080795288, + "learning_rate": 3e-05, + "loss": 1.298, + "num_input_tokens_seen": 316042068, + "step": 9000 + }, + { + "epoch": 0.7260351673284174, + "eval_gen_len": 464.37, + "eval_loss": 1.2210745811462402, + "eval_rouge1": 36.6966, + "eval_rouge2": 20.8189, + "eval_rougeL": 29.7115, + "eval_rougeLsum": 34.7528, + "eval_runtime": 1686.3491, + "eval_samples_per_second": 0.119, + "eval_steps_per_second": 0.03, + "num_input_tokens_seen": 316042068, + "step": 9000 + }, + { + "epoch": 0.7268418730698935, + "grad_norm": 0.2395300716161728, + "learning_rate": 3e-05, + "loss": 1.28, + "num_input_tokens_seen": 316377156, + "step": 9010 + }, + { + "epoch": 0.7276485788113695, + "grad_norm": 0.26959264278411865, + "learning_rate": 3e-05, + "loss": 1.2808, + "num_input_tokens_seen": 316739840, + "step": 9020 + }, + { + "epoch": 0.7284552845528456, + "grad_norm": 0.2841363549232483, + "learning_rate": 3e-05, + "loss": 1.2911, + "num_input_tokens_seen": 317084172, + "step": 9030 + }, + { + "epoch": 0.7292619902943216, + "grad_norm": 0.2511976361274719, + "learning_rate": 3e-05, + "loss": 1.2442, + "num_input_tokens_seen": 317440872, + "step": 9040 + }, + { + "epoch": 0.7300686960357976, + "grad_norm": 0.26146405935287476, + "learning_rate": 3e-05, + "loss": 1.3228, + "num_input_tokens_seen": 317755504, + "step": 9050 + }, + { + "epoch": 0.7308754017772736, + "grad_norm": 0.2912101745605469, + "learning_rate": 3e-05, + "loss": 1.2688, + "num_input_tokens_seen": 318148088, + "step": 9060 + }, + { + "epoch": 0.7316821075187496, + "grad_norm": 0.2883487641811371, + "learning_rate": 3e-05, + "loss": 1.313, + "num_input_tokens_seen": 318499404, + "step": 9070 + }, + { + "epoch": 0.7324888132602256, + "grad_norm": 0.2744971811771393, + "learning_rate": 3e-05, + "loss": 1.2339, + "num_input_tokens_seen": 318857524, + "step": 9080 + }, + { + "epoch": 0.7332955190017016, + "grad_norm": 0.3002362847328186, + "learning_rate": 3e-05, + "loss": 1.2448, + "num_input_tokens_seen": 319207924, + "step": 9090 + }, + { + "epoch": 0.7341022247431777, + "grad_norm": 0.28158414363861084, + "learning_rate": 3e-05, + "loss": 1.27, + "num_input_tokens_seen": 319547752, + "step": 9100 + }, + { + "epoch": 0.7349089304846537, + "grad_norm": 0.2615879774093628, + "learning_rate": 3e-05, + "loss": 1.2781, + "num_input_tokens_seen": 319898356, + "step": 9110 + }, + { + "epoch": 0.7357156362261297, + "grad_norm": 0.24552986025810242, + "learning_rate": 3e-05, + "loss": 1.2847, + "num_input_tokens_seen": 320260504, + "step": 9120 + }, + { + "epoch": 0.7365223419676057, + "grad_norm": 0.2580191195011139, + "learning_rate": 3e-05, + "loss": 1.298, + "num_input_tokens_seen": 320619676, + "step": 9130 + }, + { + "epoch": 0.7373290477090817, + "grad_norm": 0.2691594660282135, + "learning_rate": 3e-05, + "loss": 1.267, + "num_input_tokens_seen": 320975396, + "step": 9140 + }, + { + "epoch": 0.7381357534505577, + "grad_norm": 0.2579469382762909, + "learning_rate": 3e-05, + "loss": 1.313, + "num_input_tokens_seen": 321340052, + "step": 9150 + }, + { + "epoch": 0.7389424591920338, + "grad_norm": 0.2658007740974426, + "learning_rate": 3e-05, + "loss": 1.3099, + "num_input_tokens_seen": 321690400, + "step": 9160 + }, + { + "epoch": 0.7397491649335098, + "grad_norm": 0.2555302679538727, + "learning_rate": 3e-05, + "loss": 1.2961, + "num_input_tokens_seen": 322040856, + "step": 9170 + }, + { + "epoch": 0.7405558706749858, + "grad_norm": 0.24547891318798065, + "learning_rate": 3e-05, + "loss": 1.316, + "num_input_tokens_seen": 322390612, + "step": 9180 + }, + { + "epoch": 0.7413625764164619, + "grad_norm": 0.26539695262908936, + "learning_rate": 3e-05, + "loss": 1.2573, + "num_input_tokens_seen": 322751676, + "step": 9190 + }, + { + "epoch": 0.7421692821579379, + "grad_norm": 0.24796757102012634, + "learning_rate": 3e-05, + "loss": 1.2308, + "num_input_tokens_seen": 323058852, + "step": 9200 + }, + { + "epoch": 0.7429759878994139, + "grad_norm": 0.26277750730514526, + "learning_rate": 3e-05, + "loss": 1.2912, + "num_input_tokens_seen": 323378492, + "step": 9210 + }, + { + "epoch": 0.7437826936408899, + "grad_norm": 0.2662057876586914, + "learning_rate": 3e-05, + "loss": 1.2891, + "num_input_tokens_seen": 323741544, + "step": 9220 + }, + { + "epoch": 0.7445893993823659, + "grad_norm": 0.30241715908050537, + "learning_rate": 3e-05, + "loss": 1.2783, + "num_input_tokens_seen": 324084828, + "step": 9230 + }, + { + "epoch": 0.7453961051238419, + "grad_norm": 0.24552224576473236, + "learning_rate": 3e-05, + "loss": 1.2602, + "num_input_tokens_seen": 324429788, + "step": 9240 + }, + { + "epoch": 0.7462028108653179, + "grad_norm": 0.2982407212257385, + "learning_rate": 3e-05, + "loss": 1.2724, + "num_input_tokens_seen": 324762408, + "step": 9250 + }, + { + "epoch": 0.7470095166067939, + "grad_norm": 0.2681979835033417, + "learning_rate": 3e-05, + "loss": 1.2691, + "num_input_tokens_seen": 325100544, + "step": 9260 + }, + { + "epoch": 0.7478162223482699, + "grad_norm": 0.23062004148960114, + "learning_rate": 3e-05, + "loss": 1.2752, + "num_input_tokens_seen": 325430040, + "step": 9270 + }, + { + "epoch": 0.7486229280897461, + "grad_norm": 0.2845359742641449, + "learning_rate": 3e-05, + "loss": 1.2589, + "num_input_tokens_seen": 325776508, + "step": 9280 + }, + { + "epoch": 0.7494296338312221, + "grad_norm": 0.28453579545021057, + "learning_rate": 3e-05, + "loss": 1.264, + "num_input_tokens_seen": 326132120, + "step": 9290 + }, + { + "epoch": 0.7502363395726981, + "grad_norm": 0.2852461040019989, + "learning_rate": 3e-05, + "loss": 1.2583, + "num_input_tokens_seen": 326483084, + "step": 9300 + }, + { + "epoch": 0.7510430453141741, + "grad_norm": 0.25744280219078064, + "learning_rate": 3e-05, + "loss": 1.2774, + "num_input_tokens_seen": 326835932, + "step": 9310 + }, + { + "epoch": 0.7518497510556501, + "grad_norm": 0.255248486995697, + "learning_rate": 3e-05, + "loss": 1.2249, + "num_input_tokens_seen": 327198368, + "step": 9320 + }, + { + "epoch": 0.7526564567971261, + "grad_norm": 0.25559529662132263, + "learning_rate": 3e-05, + "loss": 1.2555, + "num_input_tokens_seen": 327558064, + "step": 9330 + }, + { + "epoch": 0.7534631625386021, + "grad_norm": 0.27276313304901123, + "learning_rate": 3e-05, + "loss": 1.2728, + "num_input_tokens_seen": 327905308, + "step": 9340 + }, + { + "epoch": 0.7542698682800781, + "grad_norm": 0.26818275451660156, + "learning_rate": 3e-05, + "loss": 1.2931, + "num_input_tokens_seen": 328237128, + "step": 9350 + }, + { + "epoch": 0.7550765740215541, + "grad_norm": 0.29092878103256226, + "learning_rate": 3e-05, + "loss": 1.2891, + "num_input_tokens_seen": 328586876, + "step": 9360 + }, + { + "epoch": 0.7558832797630302, + "grad_norm": 0.25079798698425293, + "learning_rate": 3e-05, + "loss": 1.2917, + "num_input_tokens_seen": 328894680, + "step": 9370 + }, + { + "epoch": 0.7566899855045062, + "grad_norm": 0.23828420042991638, + "learning_rate": 3e-05, + "loss": 1.3024, + "num_input_tokens_seen": 329232548, + "step": 9380 + }, + { + "epoch": 0.7574966912459822, + "grad_norm": 0.24749857187271118, + "learning_rate": 3e-05, + "loss": 1.245, + "num_input_tokens_seen": 329572096, + "step": 9390 + }, + { + "epoch": 0.7583033969874583, + "grad_norm": 0.24294038116931915, + "learning_rate": 3e-05, + "loss": 1.2594, + "num_input_tokens_seen": 329935620, + "step": 9400 + }, + { + "epoch": 0.7591101027289343, + "grad_norm": 0.24688206613063812, + "learning_rate": 3e-05, + "loss": 1.2966, + "num_input_tokens_seen": 330298316, + "step": 9410 + }, + { + "epoch": 0.7599168084704103, + "grad_norm": 0.26844438910484314, + "learning_rate": 3e-05, + "loss": 1.2428, + "num_input_tokens_seen": 330663576, + "step": 9420 + }, + { + "epoch": 0.7607235142118863, + "grad_norm": 0.24980930984020233, + "learning_rate": 3e-05, + "loss": 1.2041, + "num_input_tokens_seen": 330992136, + "step": 9430 + }, + { + "epoch": 0.7615302199533623, + "grad_norm": 0.26029011607170105, + "learning_rate": 3e-05, + "loss": 1.2654, + "num_input_tokens_seen": 331366748, + "step": 9440 + }, + { + "epoch": 0.7623369256948384, + "grad_norm": 0.2643781006336212, + "learning_rate": 3e-05, + "loss": 1.2701, + "num_input_tokens_seen": 331701028, + "step": 9450 + }, + { + "epoch": 0.7631436314363144, + "grad_norm": 0.2505422532558441, + "learning_rate": 3e-05, + "loss": 1.2833, + "num_input_tokens_seen": 332092676, + "step": 9460 + }, + { + "epoch": 0.7639503371777904, + "grad_norm": 0.2630390524864197, + "learning_rate": 3e-05, + "loss": 1.2207, + "num_input_tokens_seen": 332401596, + "step": 9470 + }, + { + "epoch": 0.7647570429192664, + "grad_norm": 0.27384325861930847, + "learning_rate": 3e-05, + "loss": 1.3175, + "num_input_tokens_seen": 332760052, + "step": 9480 + }, + { + "epoch": 0.7655637486607424, + "grad_norm": 0.29426440596580505, + "learning_rate": 3e-05, + "loss": 1.2375, + "num_input_tokens_seen": 333118996, + "step": 9490 + }, + { + "epoch": 0.7663704544022184, + "grad_norm": 0.2638697326183319, + "learning_rate": 3e-05, + "loss": 1.2639, + "num_input_tokens_seen": 333468912, + "step": 9500 + }, + { + "epoch": 0.7671771601436944, + "grad_norm": 0.2899869978427887, + "learning_rate": 3e-05, + "loss": 1.3265, + "num_input_tokens_seen": 333808660, + "step": 9510 + }, + { + "epoch": 0.7679838658851704, + "grad_norm": 0.2559219300746918, + "learning_rate": 3e-05, + "loss": 1.2791, + "num_input_tokens_seen": 334133356, + "step": 9520 + }, + { + "epoch": 0.7687905716266465, + "grad_norm": 0.2566789388656616, + "learning_rate": 3e-05, + "loss": 1.3236, + "num_input_tokens_seen": 334515860, + "step": 9530 + }, + { + "epoch": 0.7695972773681226, + "grad_norm": 0.2541514039039612, + "learning_rate": 3e-05, + "loss": 1.2808, + "num_input_tokens_seen": 334887600, + "step": 9540 + }, + { + "epoch": 0.7704039831095986, + "grad_norm": 0.2626420557498932, + "learning_rate": 3e-05, + "loss": 1.2902, + "num_input_tokens_seen": 335269980, + "step": 9550 + }, + { + "epoch": 0.7712106888510746, + "grad_norm": 0.28111469745635986, + "learning_rate": 3e-05, + "loss": 1.2285, + "num_input_tokens_seen": 335614044, + "step": 9560 + }, + { + "epoch": 0.7720173945925506, + "grad_norm": 0.26732560992240906, + "learning_rate": 3e-05, + "loss": 1.2802, + "num_input_tokens_seen": 335947240, + "step": 9570 + }, + { + "epoch": 0.7728241003340266, + "grad_norm": 0.2630169987678528, + "learning_rate": 3e-05, + "loss": 1.2562, + "num_input_tokens_seen": 336277872, + "step": 9580 + }, + { + "epoch": 0.7736308060755026, + "grad_norm": 0.24275615811347961, + "learning_rate": 3e-05, + "loss": 1.241, + "num_input_tokens_seen": 336616732, + "step": 9590 + }, + { + "epoch": 0.7744375118169786, + "grad_norm": 0.27467086911201477, + "learning_rate": 3e-05, + "loss": 1.2783, + "num_input_tokens_seen": 336962668, + "step": 9600 + }, + { + "epoch": 0.7752442175584546, + "grad_norm": 0.22901813685894012, + "learning_rate": 3e-05, + "loss": 1.2474, + "num_input_tokens_seen": 337289600, + "step": 9610 + }, + { + "epoch": 0.7760509232999306, + "grad_norm": 0.25075381994247437, + "learning_rate": 3e-05, + "loss": 1.2911, + "num_input_tokens_seen": 337638772, + "step": 9620 + }, + { + "epoch": 0.7768576290414066, + "grad_norm": 0.26371341943740845, + "learning_rate": 3e-05, + "loss": 1.3082, + "num_input_tokens_seen": 337956240, + "step": 9630 + }, + { + "epoch": 0.7776643347828827, + "grad_norm": 0.2652187943458557, + "learning_rate": 3e-05, + "loss": 1.2419, + "num_input_tokens_seen": 338333320, + "step": 9640 + }, + { + "epoch": 0.7784710405243588, + "grad_norm": 0.275717169046402, + "learning_rate": 3e-05, + "loss": 1.2801, + "num_input_tokens_seen": 338693484, + "step": 9650 + }, + { + "epoch": 0.7792777462658348, + "grad_norm": 0.2673225402832031, + "learning_rate": 3e-05, + "loss": 1.2968, + "num_input_tokens_seen": 339059484, + "step": 9660 + }, + { + "epoch": 0.7800844520073108, + "grad_norm": 0.24011015892028809, + "learning_rate": 3e-05, + "loss": 1.2643, + "num_input_tokens_seen": 339401908, + "step": 9670 + }, + { + "epoch": 0.7808911577487868, + "grad_norm": 0.2752505838871002, + "learning_rate": 3e-05, + "loss": 1.2839, + "num_input_tokens_seen": 339759036, + "step": 9680 + }, + { + "epoch": 0.7816978634902628, + "grad_norm": 0.2720450758934021, + "learning_rate": 3e-05, + "loss": 1.2549, + "num_input_tokens_seen": 340108708, + "step": 9690 + }, + { + "epoch": 0.7825045692317388, + "grad_norm": 0.2938039004802704, + "learning_rate": 3e-05, + "loss": 1.2702, + "num_input_tokens_seen": 340425536, + "step": 9700 + }, + { + "epoch": 0.7833112749732148, + "grad_norm": 0.2659102976322174, + "learning_rate": 3e-05, + "loss": 1.23, + "num_input_tokens_seen": 340799308, + "step": 9710 + }, + { + "epoch": 0.7841179807146909, + "grad_norm": 0.26471832394599915, + "learning_rate": 3e-05, + "loss": 1.1911, + "num_input_tokens_seen": 341141740, + "step": 9720 + }, + { + "epoch": 0.7849246864561669, + "grad_norm": 0.27800023555755615, + "learning_rate": 3e-05, + "loss": 1.2801, + "num_input_tokens_seen": 341473408, + "step": 9730 + }, + { + "epoch": 0.7857313921976429, + "grad_norm": 0.25355374813079834, + "learning_rate": 3e-05, + "loss": 1.2339, + "num_input_tokens_seen": 341810480, + "step": 9740 + }, + { + "epoch": 0.7865380979391189, + "grad_norm": 0.25053349137306213, + "learning_rate": 3e-05, + "loss": 1.2741, + "num_input_tokens_seen": 342171584, + "step": 9750 + }, + { + "epoch": 0.7873448036805949, + "grad_norm": 0.2605432868003845, + "learning_rate": 3e-05, + "loss": 1.2534, + "num_input_tokens_seen": 342516448, + "step": 9760 + }, + { + "epoch": 0.788151509422071, + "grad_norm": 0.27407005429267883, + "learning_rate": 3e-05, + "loss": 1.276, + "num_input_tokens_seen": 342861752, + "step": 9770 + }, + { + "epoch": 0.788958215163547, + "grad_norm": 0.2646719515323639, + "learning_rate": 3e-05, + "loss": 1.2308, + "num_input_tokens_seen": 343235864, + "step": 9780 + }, + { + "epoch": 0.789764920905023, + "grad_norm": 0.2499488741159439, + "learning_rate": 3e-05, + "loss": 1.2787, + "num_input_tokens_seen": 343585360, + "step": 9790 + }, + { + "epoch": 0.790571626646499, + "grad_norm": 0.25169795751571655, + "learning_rate": 3e-05, + "loss": 1.272, + "num_input_tokens_seen": 343949028, + "step": 9800 + }, + { + "epoch": 0.7913783323879751, + "grad_norm": 0.25061219930648804, + "learning_rate": 3e-05, + "loss": 1.2543, + "num_input_tokens_seen": 344341964, + "step": 9810 + }, + { + "epoch": 0.7921850381294511, + "grad_norm": 0.27238261699676514, + "learning_rate": 3e-05, + "loss": 1.2527, + "num_input_tokens_seen": 344699776, + "step": 9820 + }, + { + "epoch": 0.7929917438709271, + "grad_norm": 0.26253870129585266, + "learning_rate": 3e-05, + "loss": 1.2626, + "num_input_tokens_seen": 345029320, + "step": 9830 + }, + { + "epoch": 0.7937984496124031, + "grad_norm": 0.2650923728942871, + "learning_rate": 3e-05, + "loss": 1.2253, + "num_input_tokens_seen": 345405684, + "step": 9840 + }, + { + "epoch": 0.7946051553538791, + "grad_norm": 0.2489556223154068, + "learning_rate": 3e-05, + "loss": 1.2682, + "num_input_tokens_seen": 345764444, + "step": 9850 + }, + { + "epoch": 0.7954118610953551, + "grad_norm": 0.2614899277687073, + "learning_rate": 3e-05, + "loss": 1.2586, + "num_input_tokens_seen": 346123540, + "step": 9860 + }, + { + "epoch": 0.7962185668368311, + "grad_norm": 0.25150853395462036, + "learning_rate": 3e-05, + "loss": 1.2272, + "num_input_tokens_seen": 346484268, + "step": 9870 + }, + { + "epoch": 0.7970252725783071, + "grad_norm": 0.2592512369155884, + "learning_rate": 3e-05, + "loss": 1.2478, + "num_input_tokens_seen": 346850032, + "step": 9880 + }, + { + "epoch": 0.7978319783197833, + "grad_norm": 0.26685789227485657, + "learning_rate": 3e-05, + "loss": 1.2845, + "num_input_tokens_seen": 347208532, + "step": 9890 + }, + { + "epoch": 0.7986386840612593, + "grad_norm": 0.2619518041610718, + "learning_rate": 3e-05, + "loss": 1.2526, + "num_input_tokens_seen": 347546084, + "step": 9900 + }, + { + "epoch": 0.7994453898027353, + "grad_norm": 0.3048644959926605, + "learning_rate": 3e-05, + "loss": 1.2763, + "num_input_tokens_seen": 347891900, + "step": 9910 + }, + { + "epoch": 0.8002520955442113, + "grad_norm": 0.2470572292804718, + "learning_rate": 3e-05, + "loss": 1.2488, + "num_input_tokens_seen": 348241444, + "step": 9920 + }, + { + "epoch": 0.8010588012856873, + "grad_norm": 0.25996264815330505, + "learning_rate": 3e-05, + "loss": 1.2448, + "num_input_tokens_seen": 348600368, + "step": 9930 + }, + { + "epoch": 0.8018655070271633, + "grad_norm": 0.25079694390296936, + "learning_rate": 3e-05, + "loss": 1.2158, + "num_input_tokens_seen": 348934544, + "step": 9940 + }, + { + "epoch": 0.8026722127686393, + "grad_norm": 0.2604506015777588, + "learning_rate": 3e-05, + "loss": 1.2665, + "num_input_tokens_seen": 349266356, + "step": 9950 + }, + { + "epoch": 0.8034789185101153, + "grad_norm": 0.26775991916656494, + "learning_rate": 3e-05, + "loss": 1.2491, + "num_input_tokens_seen": 349637740, + "step": 9960 + }, + { + "epoch": 0.8042856242515913, + "grad_norm": 0.2628551125526428, + "learning_rate": 3e-05, + "loss": 1.2875, + "num_input_tokens_seen": 349978220, + "step": 9970 + }, + { + "epoch": 0.8050923299930673, + "grad_norm": 0.2629667818546295, + "learning_rate": 3e-05, + "loss": 1.21, + "num_input_tokens_seen": 350338180, + "step": 9980 + }, + { + "epoch": 0.8058990357345434, + "grad_norm": 0.26192960143089294, + "learning_rate": 3e-05, + "loss": 1.2143, + "num_input_tokens_seen": 350702660, + "step": 9990 + }, + { + "epoch": 0.8067057414760194, + "grad_norm": 0.24086323380470276, + "learning_rate": 3e-05, + "loss": 1.2834, + "num_input_tokens_seen": 351056548, + "step": 10000 + }, + { + "epoch": 0.8067057414760194, + "eval_gen_len": 446.26, + "eval_loss": 1.1978570222854614, + "eval_rouge1": 37.7181, + "eval_rouge2": 20.9926, + "eval_rougeL": 30.3857, + "eval_rougeLsum": 35.8681, + "eval_runtime": 1488.0454, + "eval_samples_per_second": 0.134, + "eval_steps_per_second": 0.034, + "num_input_tokens_seen": 351056548, + "step": 10000 + }, + { + "epoch": 0.8075124472174954, + "grad_norm": 0.27893269062042236, + "learning_rate": 3e-05, + "loss": 1.2608, + "num_input_tokens_seen": 351419124, + "step": 10010 + }, + { + "epoch": 0.8083191529589715, + "grad_norm": 0.2801869511604309, + "learning_rate": 3e-05, + "loss": 1.2362, + "num_input_tokens_seen": 351760664, + "step": 10020 + }, + { + "epoch": 0.8091258587004475, + "grad_norm": 0.2547568380832672, + "learning_rate": 3e-05, + "loss": 1.1999, + "num_input_tokens_seen": 352145232, + "step": 10030 + }, + { + "epoch": 0.8099325644419235, + "grad_norm": 0.2530830502510071, + "learning_rate": 3e-05, + "loss": 1.2576, + "num_input_tokens_seen": 352484012, + "step": 10040 + }, + { + "epoch": 0.8107392701833995, + "grad_norm": 0.25283852219581604, + "learning_rate": 3e-05, + "loss": 1.2672, + "num_input_tokens_seen": 352838284, + "step": 10050 + }, + { + "epoch": 0.8115459759248755, + "grad_norm": 0.2714962661266327, + "learning_rate": 3e-05, + "loss": 1.2241, + "num_input_tokens_seen": 353196252, + "step": 10060 + }, + { + "epoch": 0.8123526816663516, + "grad_norm": 0.2614021301269531, + "learning_rate": 3e-05, + "loss": 1.2539, + "num_input_tokens_seen": 353557696, + "step": 10070 + }, + { + "epoch": 0.8131593874078276, + "grad_norm": 0.25115180015563965, + "learning_rate": 3e-05, + "loss": 1.2152, + "num_input_tokens_seen": 353901440, + "step": 10080 + }, + { + "epoch": 0.8139660931493036, + "grad_norm": 0.23511908948421478, + "learning_rate": 3e-05, + "loss": 1.227, + "num_input_tokens_seen": 354266972, + "step": 10090 + }, + { + "epoch": 0.8147727988907796, + "grad_norm": 0.2694503366947174, + "learning_rate": 3e-05, + "loss": 1.2151, + "num_input_tokens_seen": 354620136, + "step": 10100 + }, + { + "epoch": 0.8155795046322556, + "grad_norm": 0.27539879083633423, + "learning_rate": 3e-05, + "loss": 1.2867, + "num_input_tokens_seen": 354981004, + "step": 10110 + }, + { + "epoch": 0.8163862103737316, + "grad_norm": 0.25558432936668396, + "learning_rate": 3e-05, + "loss": 1.2376, + "num_input_tokens_seen": 355343284, + "step": 10120 + }, + { + "epoch": 0.8171929161152076, + "grad_norm": 0.24992291629314423, + "learning_rate": 3e-05, + "loss": 1.2487, + "num_input_tokens_seen": 355681348, + "step": 10130 + }, + { + "epoch": 0.8179996218566837, + "grad_norm": 0.25410589575767517, + "learning_rate": 3e-05, + "loss": 1.2606, + "num_input_tokens_seen": 356040020, + "step": 10140 + }, + { + "epoch": 0.8188063275981597, + "grad_norm": 0.23031924664974213, + "learning_rate": 3e-05, + "loss": 1.2462, + "num_input_tokens_seen": 356402532, + "step": 10150 + }, + { + "epoch": 0.8196130333396358, + "grad_norm": 0.26112812757492065, + "learning_rate": 3e-05, + "loss": 1.2594, + "num_input_tokens_seen": 356761504, + "step": 10160 + }, + { + "epoch": 0.8204197390811118, + "grad_norm": 0.2500099837779999, + "learning_rate": 3e-05, + "loss": 1.2115, + "num_input_tokens_seen": 357123360, + "step": 10170 + }, + { + "epoch": 0.8212264448225878, + "grad_norm": 0.2862362861633301, + "learning_rate": 3e-05, + "loss": 1.2912, + "num_input_tokens_seen": 357475416, + "step": 10180 + }, + { + "epoch": 0.8220331505640638, + "grad_norm": 0.2600359320640564, + "learning_rate": 3e-05, + "loss": 1.226, + "num_input_tokens_seen": 357793912, + "step": 10190 + }, + { + "epoch": 0.8228398563055398, + "grad_norm": 0.25250157713890076, + "learning_rate": 3e-05, + "loss": 1.2438, + "num_input_tokens_seen": 358192864, + "step": 10200 + }, + { + "epoch": 0.8236465620470158, + "grad_norm": 0.2738971412181854, + "learning_rate": 3e-05, + "loss": 1.2416, + "num_input_tokens_seen": 358538800, + "step": 10210 + }, + { + "epoch": 0.8244532677884918, + "grad_norm": 0.23127759993076324, + "learning_rate": 3e-05, + "loss": 1.2963, + "num_input_tokens_seen": 358882896, + "step": 10220 + }, + { + "epoch": 0.8252599735299678, + "grad_norm": 0.25735771656036377, + "learning_rate": 3e-05, + "loss": 1.2289, + "num_input_tokens_seen": 359243952, + "step": 10230 + }, + { + "epoch": 0.8260666792714438, + "grad_norm": 0.2557520568370819, + "learning_rate": 3e-05, + "loss": 1.235, + "num_input_tokens_seen": 359596716, + "step": 10240 + }, + { + "epoch": 0.8268733850129198, + "grad_norm": 0.24353064596652985, + "learning_rate": 3e-05, + "loss": 1.2734, + "num_input_tokens_seen": 359931524, + "step": 10250 + }, + { + "epoch": 0.827680090754396, + "grad_norm": 0.24218714237213135, + "learning_rate": 3e-05, + "loss": 1.2855, + "num_input_tokens_seen": 360277308, + "step": 10260 + }, + { + "epoch": 0.828486796495872, + "grad_norm": 0.2978828549385071, + "learning_rate": 3e-05, + "loss": 1.3319, + "num_input_tokens_seen": 360642156, + "step": 10270 + }, + { + "epoch": 0.829293502237348, + "grad_norm": 0.2385886013507843, + "learning_rate": 3e-05, + "loss": 1.2721, + "num_input_tokens_seen": 361012064, + "step": 10280 + }, + { + "epoch": 0.830100207978824, + "grad_norm": 0.274522602558136, + "learning_rate": 3e-05, + "loss": 1.2921, + "num_input_tokens_seen": 361321708, + "step": 10290 + }, + { + "epoch": 0.8309069137203, + "grad_norm": 0.22934795916080475, + "learning_rate": 3e-05, + "loss": 1.2827, + "num_input_tokens_seen": 361669832, + "step": 10300 + }, + { + "epoch": 0.831713619461776, + "grad_norm": 0.2701473534107208, + "learning_rate": 3e-05, + "loss": 1.2205, + "num_input_tokens_seen": 362010440, + "step": 10310 + }, + { + "epoch": 0.832520325203252, + "grad_norm": 0.22467046976089478, + "learning_rate": 3e-05, + "loss": 1.2203, + "num_input_tokens_seen": 362373936, + "step": 10320 + }, + { + "epoch": 0.833327030944728, + "grad_norm": 0.24814799427986145, + "learning_rate": 3e-05, + "loss": 1.2372, + "num_input_tokens_seen": 362719892, + "step": 10330 + }, + { + "epoch": 0.834133736686204, + "grad_norm": 0.25354889035224915, + "learning_rate": 3e-05, + "loss": 1.269, + "num_input_tokens_seen": 363050116, + "step": 10340 + }, + { + "epoch": 0.8349404424276801, + "grad_norm": 0.2522750496864319, + "learning_rate": 3e-05, + "loss": 1.2287, + "num_input_tokens_seen": 363410780, + "step": 10350 + }, + { + "epoch": 0.8357471481691561, + "grad_norm": 0.2644040882587433, + "learning_rate": 3e-05, + "loss": 1.283, + "num_input_tokens_seen": 363780168, + "step": 10360 + }, + { + "epoch": 0.8365538539106321, + "grad_norm": 0.24271726608276367, + "learning_rate": 3e-05, + "loss": 1.2452, + "num_input_tokens_seen": 364140384, + "step": 10370 + }, + { + "epoch": 0.8373605596521081, + "grad_norm": 0.2616620659828186, + "learning_rate": 3e-05, + "loss": 1.2483, + "num_input_tokens_seen": 364502480, + "step": 10380 + }, + { + "epoch": 0.8381672653935842, + "grad_norm": 0.26111093163490295, + "learning_rate": 3e-05, + "loss": 1.2139, + "num_input_tokens_seen": 364861816, + "step": 10390 + }, + { + "epoch": 0.8389739711350602, + "grad_norm": 0.23570705950260162, + "learning_rate": 3e-05, + "loss": 1.2376, + "num_input_tokens_seen": 365218500, + "step": 10400 + }, + { + "epoch": 0.8397806768765362, + "grad_norm": 0.24099615216255188, + "learning_rate": 3e-05, + "loss": 1.2236, + "num_input_tokens_seen": 365576996, + "step": 10410 + }, + { + "epoch": 0.8405873826180122, + "grad_norm": 0.261840283870697, + "learning_rate": 3e-05, + "loss": 1.3198, + "num_input_tokens_seen": 365916660, + "step": 10420 + }, + { + "epoch": 0.8413940883594883, + "grad_norm": 0.2680794298648834, + "learning_rate": 3e-05, + "loss": 1.2491, + "num_input_tokens_seen": 366264708, + "step": 10430 + }, + { + "epoch": 0.8422007941009643, + "grad_norm": 0.26973119378089905, + "learning_rate": 3e-05, + "loss": 1.2461, + "num_input_tokens_seen": 366626892, + "step": 10440 + }, + { + "epoch": 0.8430074998424403, + "grad_norm": 0.2640502154827118, + "learning_rate": 3e-05, + "loss": 1.234, + "num_input_tokens_seen": 366982568, + "step": 10450 + }, + { + "epoch": 0.8438142055839163, + "grad_norm": 0.2516578435897827, + "learning_rate": 3e-05, + "loss": 1.1995, + "num_input_tokens_seen": 367303208, + "step": 10460 + }, + { + "epoch": 0.8446209113253923, + "grad_norm": 0.264775812625885, + "learning_rate": 3e-05, + "loss": 1.2123, + "num_input_tokens_seen": 367626724, + "step": 10470 + }, + { + "epoch": 0.8454276170668683, + "grad_norm": 0.252989798784256, + "learning_rate": 3e-05, + "loss": 1.2622, + "num_input_tokens_seen": 367987924, + "step": 10480 + }, + { + "epoch": 0.8462343228083443, + "grad_norm": 0.2506852447986603, + "learning_rate": 3e-05, + "loss": 1.2796, + "num_input_tokens_seen": 368365224, + "step": 10490 + }, + { + "epoch": 0.8470410285498203, + "grad_norm": 0.2525902986526489, + "learning_rate": 3e-05, + "loss": 1.2393, + "num_input_tokens_seen": 368706164, + "step": 10500 + }, + { + "epoch": 0.8478477342912965, + "grad_norm": 0.3259766399860382, + "learning_rate": 3e-05, + "loss": 1.3053, + "num_input_tokens_seen": 369069824, + "step": 10510 + }, + { + "epoch": 0.8486544400327725, + "grad_norm": 0.246359184384346, + "learning_rate": 3e-05, + "loss": 1.22, + "num_input_tokens_seen": 369420620, + "step": 10520 + }, + { + "epoch": 0.8494611457742485, + "grad_norm": 0.2465633898973465, + "learning_rate": 3e-05, + "loss": 1.26, + "num_input_tokens_seen": 369789168, + "step": 10530 + }, + { + "epoch": 0.8502678515157245, + "grad_norm": 0.2697504758834839, + "learning_rate": 3e-05, + "loss": 1.2437, + "num_input_tokens_seen": 370155836, + "step": 10540 + }, + { + "epoch": 0.8510745572572005, + "grad_norm": 0.25357383489608765, + "learning_rate": 3e-05, + "loss": 1.2639, + "num_input_tokens_seen": 370489560, + "step": 10550 + }, + { + "epoch": 0.8518812629986765, + "grad_norm": 0.27426791191101074, + "learning_rate": 3e-05, + "loss": 1.2564, + "num_input_tokens_seen": 370852008, + "step": 10560 + }, + { + "epoch": 0.8526879687401525, + "grad_norm": 0.26024049520492554, + "learning_rate": 3e-05, + "loss": 1.2902, + "num_input_tokens_seen": 371194808, + "step": 10570 + }, + { + "epoch": 0.8534946744816285, + "grad_norm": 0.28873512148857117, + "learning_rate": 3e-05, + "loss": 1.1927, + "num_input_tokens_seen": 371559520, + "step": 10580 + }, + { + "epoch": 0.8543013802231045, + "grad_norm": 0.2774757146835327, + "learning_rate": 3e-05, + "loss": 1.2304, + "num_input_tokens_seen": 371899308, + "step": 10590 + }, + { + "epoch": 0.8551080859645805, + "grad_norm": 0.27191224694252014, + "learning_rate": 3e-05, + "loss": 1.1987, + "num_input_tokens_seen": 372232400, + "step": 10600 + }, + { + "epoch": 0.8559147917060566, + "grad_norm": 0.26448413729667664, + "learning_rate": 3e-05, + "loss": 1.2394, + "num_input_tokens_seen": 372589132, + "step": 10610 + }, + { + "epoch": 0.8567214974475326, + "grad_norm": 0.26863351464271545, + "learning_rate": 3e-05, + "loss": 1.2914, + "num_input_tokens_seen": 372937492, + "step": 10620 + }, + { + "epoch": 0.8575282031890087, + "grad_norm": 0.2653568685054779, + "learning_rate": 3e-05, + "loss": 1.236, + "num_input_tokens_seen": 373293780, + "step": 10630 + }, + { + "epoch": 0.8583349089304847, + "grad_norm": 0.27198871970176697, + "learning_rate": 3e-05, + "loss": 1.2327, + "num_input_tokens_seen": 373680012, + "step": 10640 + }, + { + "epoch": 0.8591416146719607, + "grad_norm": 0.2744047939777374, + "learning_rate": 3e-05, + "loss": 1.2706, + "num_input_tokens_seen": 374041616, + "step": 10650 + }, + { + "epoch": 0.8599483204134367, + "grad_norm": 0.24338699877262115, + "learning_rate": 3e-05, + "loss": 1.27, + "num_input_tokens_seen": 374421228, + "step": 10660 + }, + { + "epoch": 0.8607550261549127, + "grad_norm": 0.2561684846878052, + "learning_rate": 3e-05, + "loss": 1.2483, + "num_input_tokens_seen": 374766780, + "step": 10670 + }, + { + "epoch": 0.8615617318963887, + "grad_norm": 0.2887466549873352, + "learning_rate": 3e-05, + "loss": 1.2688, + "num_input_tokens_seen": 375121440, + "step": 10680 + }, + { + "epoch": 0.8623684376378647, + "grad_norm": 0.2793877422809601, + "learning_rate": 3e-05, + "loss": 1.2325, + "num_input_tokens_seen": 375484872, + "step": 10690 + }, + { + "epoch": 0.8631751433793408, + "grad_norm": 0.26802805066108704, + "learning_rate": 3e-05, + "loss": 1.2388, + "num_input_tokens_seen": 375836092, + "step": 10700 + }, + { + "epoch": 0.8639818491208168, + "grad_norm": 0.2660770118236542, + "learning_rate": 3e-05, + "loss": 1.2379, + "num_input_tokens_seen": 376169816, + "step": 10710 + }, + { + "epoch": 0.8647885548622928, + "grad_norm": 0.26407331228256226, + "learning_rate": 3e-05, + "loss": 1.2386, + "num_input_tokens_seen": 376523852, + "step": 10720 + }, + { + "epoch": 0.8655952606037688, + "grad_norm": 0.23881566524505615, + "learning_rate": 3e-05, + "loss": 1.278, + "num_input_tokens_seen": 376881480, + "step": 10730 + }, + { + "epoch": 0.8664019663452448, + "grad_norm": 0.2527766823768616, + "learning_rate": 3e-05, + "loss": 1.2352, + "num_input_tokens_seen": 377225680, + "step": 10740 + }, + { + "epoch": 0.8672086720867209, + "grad_norm": 0.25618699193000793, + "learning_rate": 3e-05, + "loss": 1.2561, + "num_input_tokens_seen": 377581152, + "step": 10750 + }, + { + "epoch": 0.8680153778281969, + "grad_norm": 0.2603427767753601, + "learning_rate": 3e-05, + "loss": 1.245, + "num_input_tokens_seen": 377923004, + "step": 10760 + }, + { + "epoch": 0.8688220835696729, + "grad_norm": 0.2423306107521057, + "learning_rate": 3e-05, + "loss": 1.2524, + "num_input_tokens_seen": 378270880, + "step": 10770 + }, + { + "epoch": 0.869628789311149, + "grad_norm": 0.2624494731426239, + "learning_rate": 3e-05, + "loss": 1.2438, + "num_input_tokens_seen": 378626340, + "step": 10780 + }, + { + "epoch": 0.870435495052625, + "grad_norm": 0.26242879033088684, + "learning_rate": 3e-05, + "loss": 1.2235, + "num_input_tokens_seen": 378968780, + "step": 10790 + }, + { + "epoch": 0.871242200794101, + "grad_norm": 0.2819896340370178, + "learning_rate": 3e-05, + "loss": 1.1845, + "num_input_tokens_seen": 379331496, + "step": 10800 + }, + { + "epoch": 0.872048906535577, + "grad_norm": 0.25225383043289185, + "learning_rate": 3e-05, + "loss": 1.2268, + "num_input_tokens_seen": 379686312, + "step": 10810 + }, + { + "epoch": 0.872855612277053, + "grad_norm": 0.33487361669540405, + "learning_rate": 3e-05, + "loss": 1.2184, + "num_input_tokens_seen": 380041796, + "step": 10820 + }, + { + "epoch": 0.873662318018529, + "grad_norm": 0.25806111097335815, + "learning_rate": 3e-05, + "loss": 1.2289, + "num_input_tokens_seen": 380391316, + "step": 10830 + }, + { + "epoch": 0.874469023760005, + "grad_norm": 0.2700815796852112, + "learning_rate": 3e-05, + "loss": 1.208, + "num_input_tokens_seen": 380758180, + "step": 10840 + }, + { + "epoch": 0.875275729501481, + "grad_norm": 0.24442021548748016, + "learning_rate": 3e-05, + "loss": 1.2071, + "num_input_tokens_seen": 381140420, + "step": 10850 + }, + { + "epoch": 0.876082435242957, + "grad_norm": 0.27837643027305603, + "learning_rate": 3e-05, + "loss": 1.1758, + "num_input_tokens_seen": 381510192, + "step": 10860 + }, + { + "epoch": 0.876889140984433, + "grad_norm": 0.2531345784664154, + "learning_rate": 3e-05, + "loss": 1.201, + "num_input_tokens_seen": 381860536, + "step": 10870 + }, + { + "epoch": 0.8776958467259092, + "grad_norm": 0.25533026456832886, + "learning_rate": 3e-05, + "loss": 1.2518, + "num_input_tokens_seen": 382230860, + "step": 10880 + }, + { + "epoch": 0.8785025524673852, + "grad_norm": 0.2697776257991791, + "learning_rate": 3e-05, + "loss": 1.2115, + "num_input_tokens_seen": 382575708, + "step": 10890 + }, + { + "epoch": 0.8793092582088612, + "grad_norm": 0.275545597076416, + "learning_rate": 3e-05, + "loss": 1.2691, + "num_input_tokens_seen": 382952276, + "step": 10900 + }, + { + "epoch": 0.8801159639503372, + "grad_norm": 0.2756127715110779, + "learning_rate": 3e-05, + "loss": 1.195, + "num_input_tokens_seen": 383361024, + "step": 10910 + }, + { + "epoch": 0.8809226696918132, + "grad_norm": 0.26673251390457153, + "learning_rate": 3e-05, + "loss": 1.2974, + "num_input_tokens_seen": 383709472, + "step": 10920 + }, + { + "epoch": 0.8817293754332892, + "grad_norm": 0.27520835399627686, + "learning_rate": 3e-05, + "loss": 1.1702, + "num_input_tokens_seen": 384065488, + "step": 10930 + }, + { + "epoch": 0.8825360811747652, + "grad_norm": 0.2573419213294983, + "learning_rate": 3e-05, + "loss": 1.2493, + "num_input_tokens_seen": 384413564, + "step": 10940 + }, + { + "epoch": 0.8833427869162412, + "grad_norm": 0.3231302499771118, + "learning_rate": 3e-05, + "loss": 1.2488, + "num_input_tokens_seen": 384776660, + "step": 10950 + }, + { + "epoch": 0.8841494926577173, + "grad_norm": 0.2685335874557495, + "learning_rate": 3e-05, + "loss": 1.2101, + "num_input_tokens_seen": 385121296, + "step": 10960 + }, + { + "epoch": 0.8849561983991933, + "grad_norm": 0.26467591524124146, + "learning_rate": 3e-05, + "loss": 1.2812, + "num_input_tokens_seen": 385460624, + "step": 10970 + }, + { + "epoch": 0.8857629041406693, + "grad_norm": 0.23645007610321045, + "learning_rate": 3e-05, + "loss": 1.2405, + "num_input_tokens_seen": 385805028, + "step": 10980 + }, + { + "epoch": 0.8865696098821453, + "grad_norm": 0.2732267677783966, + "learning_rate": 3e-05, + "loss": 1.2274, + "num_input_tokens_seen": 386134288, + "step": 10990 + }, + { + "epoch": 0.8873763156236214, + "grad_norm": 0.2679040729999542, + "learning_rate": 3e-05, + "loss": 1.2577, + "num_input_tokens_seen": 386471860, + "step": 11000 + }, + { + "epoch": 0.8873763156236214, + "eval_gen_len": 424.445, + "eval_loss": 1.1752405166625977, + "eval_rouge1": 39.3539, + "eval_rouge2": 23.0123, + "eval_rougeL": 31.9005, + "eval_rougeLsum": 37.4941, + "eval_runtime": 1475.7796, + "eval_samples_per_second": 0.136, + "eval_steps_per_second": 0.034, + "num_input_tokens_seen": 386471860, + "step": 11000 + }, + { + "epoch": 0.8881830213650974, + "grad_norm": 0.24609152972698212, + "learning_rate": 3e-05, + "loss": 1.2538, + "num_input_tokens_seen": 386829236, + "step": 11010 + }, + { + "epoch": 0.8889897271065734, + "grad_norm": 0.23998071253299713, + "learning_rate": 3e-05, + "loss": 1.2311, + "num_input_tokens_seen": 387162280, + "step": 11020 + }, + { + "epoch": 0.8897964328480494, + "grad_norm": 0.2572784125804901, + "learning_rate": 3e-05, + "loss": 1.2318, + "num_input_tokens_seen": 387517828, + "step": 11030 + }, + { + "epoch": 0.8906031385895254, + "grad_norm": 0.258114755153656, + "learning_rate": 3e-05, + "loss": 1.2098, + "num_input_tokens_seen": 387867880, + "step": 11040 + }, + { + "epoch": 0.8914098443310015, + "grad_norm": 0.28761738538742065, + "learning_rate": 3e-05, + "loss": 1.245, + "num_input_tokens_seen": 388222904, + "step": 11050 + }, + { + "epoch": 0.8922165500724775, + "grad_norm": 0.26138409972190857, + "learning_rate": 3e-05, + "loss": 1.2173, + "num_input_tokens_seen": 388568168, + "step": 11060 + }, + { + "epoch": 0.8930232558139535, + "grad_norm": 0.26064634323120117, + "learning_rate": 3e-05, + "loss": 1.2018, + "num_input_tokens_seen": 388926736, + "step": 11070 + }, + { + "epoch": 0.8938299615554295, + "grad_norm": 0.28964129090309143, + "learning_rate": 3e-05, + "loss": 1.2191, + "num_input_tokens_seen": 389270524, + "step": 11080 + }, + { + "epoch": 0.8946366672969055, + "grad_norm": 0.2423638552427292, + "learning_rate": 3e-05, + "loss": 1.2145, + "num_input_tokens_seen": 389663824, + "step": 11090 + }, + { + "epoch": 0.8954433730383815, + "grad_norm": 0.27935534715652466, + "learning_rate": 3e-05, + "loss": 1.2177, + "num_input_tokens_seen": 390019620, + "step": 11100 + }, + { + "epoch": 0.8962500787798575, + "grad_norm": 0.29713118076324463, + "learning_rate": 3e-05, + "loss": 1.2533, + "num_input_tokens_seen": 390367580, + "step": 11110 + }, + { + "epoch": 0.8970567845213336, + "grad_norm": 0.2777055501937866, + "learning_rate": 3e-05, + "loss": 1.2234, + "num_input_tokens_seen": 390735060, + "step": 11120 + }, + { + "epoch": 0.8978634902628096, + "grad_norm": 0.2500898838043213, + "learning_rate": 3e-05, + "loss": 1.2104, + "num_input_tokens_seen": 391075804, + "step": 11130 + }, + { + "epoch": 0.8986701960042857, + "grad_norm": 0.26286810636520386, + "learning_rate": 3e-05, + "loss": 1.2567, + "num_input_tokens_seen": 391402956, + "step": 11140 + }, + { + "epoch": 0.8994769017457617, + "grad_norm": 0.2514180839061737, + "learning_rate": 3e-05, + "loss": 1.2498, + "num_input_tokens_seen": 391738360, + "step": 11150 + }, + { + "epoch": 0.9002836074872377, + "grad_norm": 0.27611491084098816, + "learning_rate": 3e-05, + "loss": 1.1939, + "num_input_tokens_seen": 392082044, + "step": 11160 + }, + { + "epoch": 0.9010903132287137, + "grad_norm": 0.2573927342891693, + "learning_rate": 3e-05, + "loss": 1.2355, + "num_input_tokens_seen": 392441160, + "step": 11170 + }, + { + "epoch": 0.9018970189701897, + "grad_norm": 0.2716425955295563, + "learning_rate": 3e-05, + "loss": 1.2134, + "num_input_tokens_seen": 392797140, + "step": 11180 + }, + { + "epoch": 0.9027037247116657, + "grad_norm": 0.2436821162700653, + "learning_rate": 3e-05, + "loss": 1.229, + "num_input_tokens_seen": 393158316, + "step": 11190 + }, + { + "epoch": 0.9035104304531417, + "grad_norm": 0.27646389603614807, + "learning_rate": 3e-05, + "loss": 1.2138, + "num_input_tokens_seen": 393471508, + "step": 11200 + }, + { + "epoch": 0.9043171361946177, + "grad_norm": 0.2678287625312805, + "learning_rate": 3e-05, + "loss": 1.2516, + "num_input_tokens_seen": 393806264, + "step": 11210 + }, + { + "epoch": 0.9051238419360937, + "grad_norm": 0.2638424336910248, + "learning_rate": 3e-05, + "loss": 1.2467, + "num_input_tokens_seen": 394161404, + "step": 11220 + }, + { + "epoch": 0.9059305476775698, + "grad_norm": 0.2639593183994293, + "learning_rate": 3e-05, + "loss": 1.2145, + "num_input_tokens_seen": 394526568, + "step": 11230 + }, + { + "epoch": 0.9067372534190458, + "grad_norm": 0.25803256034851074, + "learning_rate": 3e-05, + "loss": 1.3036, + "num_input_tokens_seen": 394866788, + "step": 11240 + }, + { + "epoch": 0.9075439591605219, + "grad_norm": 0.2518157362937927, + "learning_rate": 3e-05, + "loss": 1.2081, + "num_input_tokens_seen": 395190516, + "step": 11250 + }, + { + "epoch": 0.9083506649019979, + "grad_norm": 0.2544965147972107, + "learning_rate": 3e-05, + "loss": 1.2234, + "num_input_tokens_seen": 395528392, + "step": 11260 + }, + { + "epoch": 0.9091573706434739, + "grad_norm": 0.24782590568065643, + "learning_rate": 3e-05, + "loss": 1.1547, + "num_input_tokens_seen": 395880192, + "step": 11270 + }, + { + "epoch": 0.9099640763849499, + "grad_norm": 0.2636893093585968, + "learning_rate": 3e-05, + "loss": 1.2305, + "num_input_tokens_seen": 396223844, + "step": 11280 + }, + { + "epoch": 0.9107707821264259, + "grad_norm": 0.2468230426311493, + "learning_rate": 3e-05, + "loss": 1.204, + "num_input_tokens_seen": 396543560, + "step": 11290 + }, + { + "epoch": 0.9115774878679019, + "grad_norm": 0.2818716764450073, + "learning_rate": 3e-05, + "loss": 1.1927, + "num_input_tokens_seen": 396879784, + "step": 11300 + }, + { + "epoch": 0.912384193609378, + "grad_norm": 0.24603427946567535, + "learning_rate": 3e-05, + "loss": 1.2276, + "num_input_tokens_seen": 397247352, + "step": 11310 + }, + { + "epoch": 0.913190899350854, + "grad_norm": 0.24526093900203705, + "learning_rate": 3e-05, + "loss": 1.2523, + "num_input_tokens_seen": 397604360, + "step": 11320 + }, + { + "epoch": 0.91399760509233, + "grad_norm": 0.26731881499290466, + "learning_rate": 3e-05, + "loss": 1.2662, + "num_input_tokens_seen": 397928512, + "step": 11330 + }, + { + "epoch": 0.914804310833806, + "grad_norm": 0.2755918800830841, + "learning_rate": 3e-05, + "loss": 1.2677, + "num_input_tokens_seen": 398264700, + "step": 11340 + }, + { + "epoch": 0.915611016575282, + "grad_norm": 0.25634992122650146, + "learning_rate": 3e-05, + "loss": 1.1865, + "num_input_tokens_seen": 398622488, + "step": 11350 + }, + { + "epoch": 0.916417722316758, + "grad_norm": 0.27104732394218445, + "learning_rate": 3e-05, + "loss": 1.2323, + "num_input_tokens_seen": 398927144, + "step": 11360 + }, + { + "epoch": 0.9172244280582341, + "grad_norm": 0.25183597207069397, + "learning_rate": 3e-05, + "loss": 1.2618, + "num_input_tokens_seen": 399315068, + "step": 11370 + }, + { + "epoch": 0.9180311337997101, + "grad_norm": 0.23518332839012146, + "learning_rate": 3e-05, + "loss": 1.2591, + "num_input_tokens_seen": 399668488, + "step": 11380 + }, + { + "epoch": 0.9188378395411861, + "grad_norm": 0.23520028591156006, + "learning_rate": 3e-05, + "loss": 1.2336, + "num_input_tokens_seen": 400018476, + "step": 11390 + }, + { + "epoch": 0.9196445452826622, + "grad_norm": 0.27664098143577576, + "learning_rate": 3e-05, + "loss": 1.2167, + "num_input_tokens_seen": 400350296, + "step": 11400 + }, + { + "epoch": 0.9204512510241382, + "grad_norm": 0.2558439373970032, + "learning_rate": 3e-05, + "loss": 1.1831, + "num_input_tokens_seen": 400653728, + "step": 11410 + }, + { + "epoch": 0.9212579567656142, + "grad_norm": 0.24782094359397888, + "learning_rate": 3e-05, + "loss": 1.2122, + "num_input_tokens_seen": 400992668, + "step": 11420 + }, + { + "epoch": 0.9220646625070902, + "grad_norm": 0.23971796035766602, + "learning_rate": 3e-05, + "loss": 1.2251, + "num_input_tokens_seen": 401351424, + "step": 11430 + }, + { + "epoch": 0.9228713682485662, + "grad_norm": 0.24755193293094635, + "learning_rate": 3e-05, + "loss": 1.197, + "num_input_tokens_seen": 401726484, + "step": 11440 + }, + { + "epoch": 0.9236780739900422, + "grad_norm": 0.26952269673347473, + "learning_rate": 3e-05, + "loss": 1.1654, + "num_input_tokens_seen": 402097328, + "step": 11450 + }, + { + "epoch": 0.9244847797315182, + "grad_norm": 0.24309176206588745, + "learning_rate": 3e-05, + "loss": 1.2339, + "num_input_tokens_seen": 402448540, + "step": 11460 + }, + { + "epoch": 0.9252914854729942, + "grad_norm": 0.2862485349178314, + "learning_rate": 3e-05, + "loss": 1.2023, + "num_input_tokens_seen": 402817680, + "step": 11470 + }, + { + "epoch": 0.9260981912144702, + "grad_norm": 0.3049052655696869, + "learning_rate": 3e-05, + "loss": 1.2021, + "num_input_tokens_seen": 403181196, + "step": 11480 + }, + { + "epoch": 0.9269048969559464, + "grad_norm": 0.25457674264907837, + "learning_rate": 3e-05, + "loss": 1.2485, + "num_input_tokens_seen": 403550872, + "step": 11490 + }, + { + "epoch": 0.9277116026974224, + "grad_norm": 0.24556294083595276, + "learning_rate": 3e-05, + "loss": 1.2269, + "num_input_tokens_seen": 403922164, + "step": 11500 + }, + { + "epoch": 0.9285183084388984, + "grad_norm": 0.292858362197876, + "learning_rate": 3e-05, + "loss": 1.2211, + "num_input_tokens_seen": 404240116, + "step": 11510 + }, + { + "epoch": 0.9293250141803744, + "grad_norm": 0.23489707708358765, + "learning_rate": 3e-05, + "loss": 1.238, + "num_input_tokens_seen": 404625636, + "step": 11520 + }, + { + "epoch": 0.9301317199218504, + "grad_norm": 0.2959127724170685, + "learning_rate": 3e-05, + "loss": 1.2062, + "num_input_tokens_seen": 404963340, + "step": 11530 + }, + { + "epoch": 0.9309384256633264, + "grad_norm": 0.2795163094997406, + "learning_rate": 3e-05, + "loss": 1.2634, + "num_input_tokens_seen": 405324692, + "step": 11540 + }, + { + "epoch": 0.9317451314048024, + "grad_norm": 0.27414393424987793, + "learning_rate": 3e-05, + "loss": 1.2477, + "num_input_tokens_seen": 405701524, + "step": 11550 + }, + { + "epoch": 0.9325518371462784, + "grad_norm": 0.26650696992874146, + "learning_rate": 3e-05, + "loss": 1.2236, + "num_input_tokens_seen": 406080452, + "step": 11560 + }, + { + "epoch": 0.9333585428877544, + "grad_norm": 0.2659411132335663, + "learning_rate": 3e-05, + "loss": 1.2534, + "num_input_tokens_seen": 406406212, + "step": 11570 + }, + { + "epoch": 0.9341652486292304, + "grad_norm": 0.24440665543079376, + "learning_rate": 3e-05, + "loss": 1.1668, + "num_input_tokens_seen": 406717196, + "step": 11580 + }, + { + "epoch": 0.9349719543707065, + "grad_norm": 0.23124107718467712, + "learning_rate": 3e-05, + "loss": 1.2525, + "num_input_tokens_seen": 407074284, + "step": 11590 + }, + { + "epoch": 0.9357786601121825, + "grad_norm": 0.2501998841762543, + "learning_rate": 3e-05, + "loss": 1.2001, + "num_input_tokens_seen": 407435636, + "step": 11600 + }, + { + "epoch": 0.9365853658536586, + "grad_norm": 0.2701874077320099, + "learning_rate": 3e-05, + "loss": 1.269, + "num_input_tokens_seen": 407777004, + "step": 11610 + }, + { + "epoch": 0.9373920715951346, + "grad_norm": 0.22814303636550903, + "learning_rate": 3e-05, + "loss": 1.2425, + "num_input_tokens_seen": 408132316, + "step": 11620 + }, + { + "epoch": 0.9381987773366106, + "grad_norm": 0.2615501880645752, + "learning_rate": 3e-05, + "loss": 1.2342, + "num_input_tokens_seen": 408489652, + "step": 11630 + }, + { + "epoch": 0.9390054830780866, + "grad_norm": 0.25700172781944275, + "learning_rate": 3e-05, + "loss": 1.1974, + "num_input_tokens_seen": 408845668, + "step": 11640 + }, + { + "epoch": 0.9398121888195626, + "grad_norm": 0.2439606785774231, + "learning_rate": 3e-05, + "loss": 1.1591, + "num_input_tokens_seen": 409174180, + "step": 11650 + }, + { + "epoch": 0.9406188945610386, + "grad_norm": 0.24392473697662354, + "learning_rate": 3e-05, + "loss": 1.2621, + "num_input_tokens_seen": 409523744, + "step": 11660 + }, + { + "epoch": 0.9414256003025147, + "grad_norm": 0.2710927426815033, + "learning_rate": 3e-05, + "loss": 1.2391, + "num_input_tokens_seen": 409895612, + "step": 11670 + }, + { + "epoch": 0.9422323060439907, + "grad_norm": 0.24979081749916077, + "learning_rate": 3e-05, + "loss": 1.2336, + "num_input_tokens_seen": 410263056, + "step": 11680 + }, + { + "epoch": 0.9430390117854667, + "grad_norm": 0.24999581277370453, + "learning_rate": 3e-05, + "loss": 1.2399, + "num_input_tokens_seen": 410610292, + "step": 11690 + }, + { + "epoch": 0.9438457175269427, + "grad_norm": 0.2609216868877411, + "learning_rate": 3e-05, + "loss": 1.1943, + "num_input_tokens_seen": 410936848, + "step": 11700 + }, + { + "epoch": 0.9446524232684187, + "grad_norm": 0.29096490144729614, + "learning_rate": 3e-05, + "loss": 1.1999, + "num_input_tokens_seen": 411292872, + "step": 11710 + }, + { + "epoch": 0.9454591290098947, + "grad_norm": 0.23998717963695526, + "learning_rate": 3e-05, + "loss": 1.2215, + "num_input_tokens_seen": 411683604, + "step": 11720 + }, + { + "epoch": 0.9462658347513707, + "grad_norm": 0.2535877823829651, + "learning_rate": 3e-05, + "loss": 1.1827, + "num_input_tokens_seen": 412049976, + "step": 11730 + }, + { + "epoch": 0.9470725404928468, + "grad_norm": 0.23387952148914337, + "learning_rate": 3e-05, + "loss": 1.2472, + "num_input_tokens_seen": 412385212, + "step": 11740 + }, + { + "epoch": 0.9478792462343228, + "grad_norm": 0.27991074323654175, + "learning_rate": 3e-05, + "loss": 1.2081, + "num_input_tokens_seen": 412719600, + "step": 11750 + }, + { + "epoch": 0.9486859519757989, + "grad_norm": 0.28498226404190063, + "learning_rate": 3e-05, + "loss": 1.234, + "num_input_tokens_seen": 413077972, + "step": 11760 + }, + { + "epoch": 0.9494926577172749, + "grad_norm": 0.24881170690059662, + "learning_rate": 3e-05, + "loss": 1.2457, + "num_input_tokens_seen": 413428308, + "step": 11770 + }, + { + "epoch": 0.9502993634587509, + "grad_norm": 0.2739012837409973, + "learning_rate": 3e-05, + "loss": 1.2318, + "num_input_tokens_seen": 413798096, + "step": 11780 + }, + { + "epoch": 0.9511060692002269, + "grad_norm": 0.2565111815929413, + "learning_rate": 3e-05, + "loss": 1.2262, + "num_input_tokens_seen": 414145460, + "step": 11790 + }, + { + "epoch": 0.9519127749417029, + "grad_norm": 0.27090346813201904, + "learning_rate": 3e-05, + "loss": 1.2413, + "num_input_tokens_seen": 414493396, + "step": 11800 + }, + { + "epoch": 0.9527194806831789, + "grad_norm": 0.25924554467201233, + "learning_rate": 3e-05, + "loss": 1.1955, + "num_input_tokens_seen": 414854028, + "step": 11810 + }, + { + "epoch": 0.9535261864246549, + "grad_norm": 0.2571480870246887, + "learning_rate": 3e-05, + "loss": 1.1855, + "num_input_tokens_seen": 415207364, + "step": 11820 + }, + { + "epoch": 0.9543328921661309, + "grad_norm": 0.27920863032341003, + "learning_rate": 3e-05, + "loss": 1.2544, + "num_input_tokens_seen": 415600924, + "step": 11830 + }, + { + "epoch": 0.9551395979076069, + "grad_norm": 0.2675030529499054, + "learning_rate": 3e-05, + "loss": 1.2391, + "num_input_tokens_seen": 415950904, + "step": 11840 + }, + { + "epoch": 0.955946303649083, + "grad_norm": 0.2398238480091095, + "learning_rate": 3e-05, + "loss": 1.2485, + "num_input_tokens_seen": 416309076, + "step": 11850 + }, + { + "epoch": 0.9567530093905591, + "grad_norm": 0.2653293311595917, + "learning_rate": 3e-05, + "loss": 1.1741, + "num_input_tokens_seen": 416682964, + "step": 11860 + }, + { + "epoch": 0.9575597151320351, + "grad_norm": 0.2775269150733948, + "learning_rate": 3e-05, + "loss": 1.2254, + "num_input_tokens_seen": 417044976, + "step": 11870 + }, + { + "epoch": 0.9583664208735111, + "grad_norm": 0.24485714733600616, + "learning_rate": 3e-05, + "loss": 1.2325, + "num_input_tokens_seen": 417409976, + "step": 11880 + }, + { + "epoch": 0.9591731266149871, + "grad_norm": 0.254849374294281, + "learning_rate": 3e-05, + "loss": 1.2358, + "num_input_tokens_seen": 417777976, + "step": 11890 + }, + { + "epoch": 0.9599798323564631, + "grad_norm": 0.24646379053592682, + "learning_rate": 3e-05, + "loss": 1.206, + "num_input_tokens_seen": 418084944, + "step": 11900 + }, + { + "epoch": 0.9607865380979391, + "grad_norm": 0.2590767741203308, + "learning_rate": 3e-05, + "loss": 1.1951, + "num_input_tokens_seen": 418475256, + "step": 11910 + }, + { + "epoch": 0.9615932438394151, + "grad_norm": 0.2564661502838135, + "learning_rate": 3e-05, + "loss": 1.2112, + "num_input_tokens_seen": 418778704, + "step": 11920 + }, + { + "epoch": 0.9623999495808911, + "grad_norm": 0.27787408232688904, + "learning_rate": 3e-05, + "loss": 1.207, + "num_input_tokens_seen": 419132332, + "step": 11930 + }, + { + "epoch": 0.9632066553223672, + "grad_norm": 0.23015113174915314, + "learning_rate": 3e-05, + "loss": 1.2402, + "num_input_tokens_seen": 419499272, + "step": 11940 + }, + { + "epoch": 0.9640133610638432, + "grad_norm": 0.23493854701519012, + "learning_rate": 3e-05, + "loss": 1.173, + "num_input_tokens_seen": 419847688, + "step": 11950 + }, + { + "epoch": 0.9648200668053192, + "grad_norm": 0.2406766414642334, + "learning_rate": 3e-05, + "loss": 1.1955, + "num_input_tokens_seen": 420188072, + "step": 11960 + }, + { + "epoch": 0.9656267725467952, + "grad_norm": 0.27738529443740845, + "learning_rate": 3e-05, + "loss": 1.2038, + "num_input_tokens_seen": 420548188, + "step": 11970 + }, + { + "epoch": 0.9664334782882713, + "grad_norm": 0.2478122115135193, + "learning_rate": 3e-05, + "loss": 1.1975, + "num_input_tokens_seen": 420900240, + "step": 11980 + }, + { + "epoch": 0.9672401840297473, + "grad_norm": 0.26496005058288574, + "learning_rate": 3e-05, + "loss": 1.2336, + "num_input_tokens_seen": 421243660, + "step": 11990 + }, + { + "epoch": 0.9680468897712233, + "grad_norm": 0.2664368450641632, + "learning_rate": 3e-05, + "loss": 1.193, + "num_input_tokens_seen": 421585440, + "step": 12000 + }, + { + "epoch": 0.9680468897712233, + "eval_gen_len": 422.225, + "eval_loss": 1.1525993347167969, + "eval_rouge1": 40.1804, + "eval_rouge2": 23.1008, + "eval_rougeL": 32.3484, + "eval_rougeLsum": 38.2103, + "eval_runtime": 1396.0916, + "eval_samples_per_second": 0.143, + "eval_steps_per_second": 0.036, + "num_input_tokens_seen": 421585440, + "step": 12000 + }, + { + "epoch": 0.9688535955126993, + "grad_norm": 0.2768273949623108, + "learning_rate": 3e-05, + "loss": 1.2532, + "num_input_tokens_seen": 421924468, + "step": 12010 + }, + { + "epoch": 0.9696603012541753, + "grad_norm": 0.23941214382648468, + "learning_rate": 3e-05, + "loss": 1.2174, + "num_input_tokens_seen": 422267696, + "step": 12020 + }, + { + "epoch": 0.9704670069956514, + "grad_norm": 0.24917346239089966, + "learning_rate": 3e-05, + "loss": 1.2038, + "num_input_tokens_seen": 422614520, + "step": 12030 + }, + { + "epoch": 0.9712737127371274, + "grad_norm": 0.2580147683620453, + "learning_rate": 3e-05, + "loss": 1.2507, + "num_input_tokens_seen": 422973276, + "step": 12040 + }, + { + "epoch": 0.9720804184786034, + "grad_norm": 0.24353154003620148, + "learning_rate": 3e-05, + "loss": 1.2144, + "num_input_tokens_seen": 423341500, + "step": 12050 + }, + { + "epoch": 0.9728871242200794, + "grad_norm": 0.27423179149627686, + "learning_rate": 3e-05, + "loss": 1.2188, + "num_input_tokens_seen": 423679004, + "step": 12060 + }, + { + "epoch": 0.9736938299615554, + "grad_norm": 0.2490026354789734, + "learning_rate": 3e-05, + "loss": 1.2043, + "num_input_tokens_seen": 424034888, + "step": 12070 + }, + { + "epoch": 0.9745005357030314, + "grad_norm": 0.2514224648475647, + "learning_rate": 3e-05, + "loss": 1.2236, + "num_input_tokens_seen": 424394100, + "step": 12080 + }, + { + "epoch": 0.9753072414445074, + "grad_norm": 0.2942357659339905, + "learning_rate": 3e-05, + "loss": 1.1832, + "num_input_tokens_seen": 424716908, + "step": 12090 + }, + { + "epoch": 0.9761139471859834, + "grad_norm": 0.2441994845867157, + "learning_rate": 3e-05, + "loss": 1.2298, + "num_input_tokens_seen": 425087956, + "step": 12100 + }, + { + "epoch": 0.9769206529274596, + "grad_norm": 0.2718014121055603, + "learning_rate": 3e-05, + "loss": 1.2549, + "num_input_tokens_seen": 425429636, + "step": 12110 + }, + { + "epoch": 0.9777273586689356, + "grad_norm": 0.23609136044979095, + "learning_rate": 3e-05, + "loss": 1.2034, + "num_input_tokens_seen": 425762244, + "step": 12120 + }, + { + "epoch": 0.9785340644104116, + "grad_norm": 0.2554143965244293, + "learning_rate": 3e-05, + "loss": 1.2059, + "num_input_tokens_seen": 426106556, + "step": 12130 + }, + { + "epoch": 0.9793407701518876, + "grad_norm": 0.2818094789981842, + "learning_rate": 3e-05, + "loss": 1.2032, + "num_input_tokens_seen": 426470164, + "step": 12140 + }, + { + "epoch": 0.9801474758933636, + "grad_norm": 0.26025861501693726, + "learning_rate": 3e-05, + "loss": 1.2107, + "num_input_tokens_seen": 426815164, + "step": 12150 + }, + { + "epoch": 0.9809541816348396, + "grad_norm": 0.29881224036216736, + "learning_rate": 3e-05, + "loss": 1.2262, + "num_input_tokens_seen": 427184952, + "step": 12160 + }, + { + "epoch": 0.9817608873763156, + "grad_norm": 0.24537017941474915, + "learning_rate": 3e-05, + "loss": 1.2207, + "num_input_tokens_seen": 427526628, + "step": 12170 + }, + { + "epoch": 0.9825675931177916, + "grad_norm": 0.28081703186035156, + "learning_rate": 3e-05, + "loss": 1.1716, + "num_input_tokens_seen": 427872948, + "step": 12180 + }, + { + "epoch": 0.9833742988592676, + "grad_norm": 0.22894425690174103, + "learning_rate": 3e-05, + "loss": 1.2104, + "num_input_tokens_seen": 428252884, + "step": 12190 + }, + { + "epoch": 0.9841810046007436, + "grad_norm": 0.23327578604221344, + "learning_rate": 3e-05, + "loss": 1.2256, + "num_input_tokens_seen": 428610824, + "step": 12200 + }, + { + "epoch": 0.9849877103422197, + "grad_norm": 0.2497028261423111, + "learning_rate": 3e-05, + "loss": 1.2069, + "num_input_tokens_seen": 428981084, + "step": 12210 + }, + { + "epoch": 0.9857944160836957, + "grad_norm": 0.2404777854681015, + "learning_rate": 3e-05, + "loss": 1.1657, + "num_input_tokens_seen": 429323900, + "step": 12220 + }, + { + "epoch": 0.9866011218251718, + "grad_norm": 0.2447100579738617, + "learning_rate": 3e-05, + "loss": 1.214, + "num_input_tokens_seen": 429692476, + "step": 12230 + }, + { + "epoch": 0.9874078275666478, + "grad_norm": 0.2328159064054489, + "learning_rate": 3e-05, + "loss": 1.2144, + "num_input_tokens_seen": 430005920, + "step": 12240 + }, + { + "epoch": 0.9882145333081238, + "grad_norm": 0.25133198499679565, + "learning_rate": 3e-05, + "loss": 1.1864, + "num_input_tokens_seen": 430333380, + "step": 12250 + }, + { + "epoch": 0.9890212390495998, + "grad_norm": 0.2603629529476166, + "learning_rate": 3e-05, + "loss": 1.212, + "num_input_tokens_seen": 430688404, + "step": 12260 + }, + { + "epoch": 0.9898279447910758, + "grad_norm": 0.25967875123023987, + "learning_rate": 3e-05, + "loss": 1.2011, + "num_input_tokens_seen": 431025404, + "step": 12270 + }, + { + "epoch": 0.9906346505325518, + "grad_norm": 0.24072428047657013, + "learning_rate": 3e-05, + "loss": 1.1536, + "num_input_tokens_seen": 431361424, + "step": 12280 + }, + { + "epoch": 0.9914413562740279, + "grad_norm": 0.2615431547164917, + "learning_rate": 3e-05, + "loss": 1.1917, + "num_input_tokens_seen": 431733732, + "step": 12290 + }, + { + "epoch": 0.9922480620155039, + "grad_norm": 0.23490871489048004, + "learning_rate": 3e-05, + "loss": 1.1985, + "num_input_tokens_seen": 432095608, + "step": 12300 + }, + { + "epoch": 0.9930547677569799, + "grad_norm": 0.2793809175491333, + "learning_rate": 3e-05, + "loss": 1.2161, + "num_input_tokens_seen": 432470776, + "step": 12310 + }, + { + "epoch": 0.9938614734984559, + "grad_norm": 0.26310858130455017, + "learning_rate": 3e-05, + "loss": 1.2369, + "num_input_tokens_seen": 432840156, + "step": 12320 + }, + { + "epoch": 0.9946681792399319, + "grad_norm": 0.2650851905345917, + "learning_rate": 3e-05, + "loss": 1.1874, + "num_input_tokens_seen": 433200564, + "step": 12330 + }, + { + "epoch": 0.9954748849814079, + "grad_norm": 0.24045298993587494, + "learning_rate": 3e-05, + "loss": 1.2043, + "num_input_tokens_seen": 433560896, + "step": 12340 + }, + { + "epoch": 0.996281590722884, + "grad_norm": 0.2662796080112457, + "learning_rate": 3e-05, + "loss": 1.228, + "num_input_tokens_seen": 433916796, + "step": 12350 + }, + { + "epoch": 0.99708829646436, + "grad_norm": 0.27926427125930786, + "learning_rate": 3e-05, + "loss": 1.1796, + "num_input_tokens_seen": 434267164, + "step": 12360 + }, + { + "epoch": 0.997895002205836, + "grad_norm": 0.29105281829833984, + "learning_rate": 3e-05, + "loss": 1.2221, + "num_input_tokens_seen": 434634960, + "step": 12370 + }, + { + "epoch": 0.9987017079473121, + "grad_norm": 0.25824907422065735, + "learning_rate": 3e-05, + "loss": 1.2037, + "num_input_tokens_seen": 434976852, + "step": 12380 + }, + { + "epoch": 0.9995084136887881, + "grad_norm": 0.2631925940513611, + "learning_rate": 3e-05, + "loss": 1.2453, + "num_input_tokens_seen": 435337884, + "step": 12390 + }, + { + "epoch": 0.9999924371336737, + "num_input_tokens_seen": 435513684, + "step": 12396, + "total_flos": 2.1022605922963784e+18, + "train_loss": 1.4536694422503678, + "train_runtime": 158351.1039, + "train_samples_per_second": 10.02, + "train_steps_per_second": 0.078 + } + ], + "logging_steps": 10, + "max_steps": 12396, + "num_input_tokens_seen": 435513684, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.1022605922963784e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}