{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9999924371336737, "eval_steps": 1000, "global_step": 12396, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0008067057414760194, "grad_norm": 0.6473819017410278, "learning_rate": 4.838709677419355e-07, "loss": 2.5238, "num_input_tokens_seen": 350316, "step": 10 }, { "epoch": 0.0016134114829520387, "grad_norm": 0.6671866774559021, "learning_rate": 9.67741935483871e-07, "loss": 2.5305, "num_input_tokens_seen": 706016, "step": 20 }, { "epoch": 0.0024201172244280584, "grad_norm": 0.5423910021781921, "learning_rate": 1.4516129032258064e-06, "loss": 2.5469, "num_input_tokens_seen": 1029048, "step": 30 }, { "epoch": 0.0032268229659040775, "grad_norm": 0.5091773867607117, "learning_rate": 1.935483870967742e-06, "loss": 2.4654, "num_input_tokens_seen": 1335348, "step": 40 }, { "epoch": 0.004033528707380097, "grad_norm": 0.46149584650993347, "learning_rate": 2.4193548387096776e-06, "loss": 2.4348, "num_input_tokens_seen": 1704148, "step": 50 }, { "epoch": 0.004840234448856117, "grad_norm": 0.4362267255783081, "learning_rate": 2.9032258064516128e-06, "loss": 2.4508, "num_input_tokens_seen": 2054768, "step": 60 }, { "epoch": 0.005646940190332136, "grad_norm": 0.38532692193984985, "learning_rate": 3.3870967741935484e-06, "loss": 2.4029, "num_input_tokens_seen": 2406312, "step": 70 }, { "epoch": 0.006453645931808155, "grad_norm": 0.3635064959526062, "learning_rate": 3.870967741935484e-06, "loss": 2.4462, "num_input_tokens_seen": 2760872, "step": 80 }, { "epoch": 0.007260351673284175, "grad_norm": 0.39128369092941284, "learning_rate": 4.35483870967742e-06, "loss": 2.3645, "num_input_tokens_seen": 3107736, "step": 90 }, { "epoch": 0.008067057414760194, "grad_norm": 0.4147037863731384, "learning_rate": 4.838709677419355e-06, "loss": 2.3476, "num_input_tokens_seen": 3475576, "step": 100 }, { "epoch": 0.008873763156236213, "grad_norm": 0.3666001260280609, "learning_rate": 5.322580645161291e-06, "loss": 2.4149, "num_input_tokens_seen": 3855216, "step": 110 }, { "epoch": 0.009680468897712234, "grad_norm": 0.35516056418418884, "learning_rate": 5.8064516129032256e-06, "loss": 2.3853, "num_input_tokens_seen": 4199752, "step": 120 }, { "epoch": 0.010487174639188253, "grad_norm": 0.3447749614715576, "learning_rate": 6.290322580645162e-06, "loss": 2.3515, "num_input_tokens_seen": 4539728, "step": 130 }, { "epoch": 0.011293880380664272, "grad_norm": 0.36747947335243225, "learning_rate": 6.774193548387097e-06, "loss": 2.3733, "num_input_tokens_seen": 4883460, "step": 140 }, { "epoch": 0.012100586122140291, "grad_norm": 0.33421263098716736, "learning_rate": 7.258064516129032e-06, "loss": 2.3517, "num_input_tokens_seen": 5208404, "step": 150 }, { "epoch": 0.01290729186361631, "grad_norm": 0.3581472337245941, "learning_rate": 7.741935483870968e-06, "loss": 2.3197, "num_input_tokens_seen": 5565648, "step": 160 }, { "epoch": 0.01371399760509233, "grad_norm": 0.3424622118473053, "learning_rate": 8.225806451612904e-06, "loss": 2.2875, "num_input_tokens_seen": 5932124, "step": 170 }, { "epoch": 0.01452070334656835, "grad_norm": 0.35327455401420593, "learning_rate": 8.70967741935484e-06, "loss": 2.3161, "num_input_tokens_seen": 6294120, "step": 180 }, { "epoch": 0.015327409088044369, "grad_norm": 0.35110658407211304, "learning_rate": 9.193548387096775e-06, "loss": 2.3089, "num_input_tokens_seen": 6655836, "step": 190 }, { "epoch": 0.016134114829520388, "grad_norm": 0.3083972632884979, "learning_rate": 9.67741935483871e-06, "loss": 2.2615, "num_input_tokens_seen": 7006032, "step": 200 }, { "epoch": 0.01694082057099641, "grad_norm": 0.33634695410728455, "learning_rate": 1.0161290322580644e-05, "loss": 2.3292, "num_input_tokens_seen": 7346560, "step": 210 }, { "epoch": 0.017747526312472426, "grad_norm": 0.33328714966773987, "learning_rate": 1.0645161290322582e-05, "loss": 2.2443, "num_input_tokens_seen": 7695720, "step": 220 }, { "epoch": 0.018554232053948447, "grad_norm": 0.3662545680999756, "learning_rate": 1.1129032258064517e-05, "loss": 2.2721, "num_input_tokens_seen": 8057116, "step": 230 }, { "epoch": 0.019360937795424468, "grad_norm": 0.35266175866127014, "learning_rate": 1.1612903225806451e-05, "loss": 2.2645, "num_input_tokens_seen": 8392512, "step": 240 }, { "epoch": 0.020167643536900485, "grad_norm": 0.3189757168292999, "learning_rate": 1.2096774193548387e-05, "loss": 2.2282, "num_input_tokens_seen": 8739404, "step": 250 }, { "epoch": 0.020974349278376506, "grad_norm": 0.3413700461387634, "learning_rate": 1.2580645161290324e-05, "loss": 2.265, "num_input_tokens_seen": 9092044, "step": 260 }, { "epoch": 0.021781055019852523, "grad_norm": 0.33349013328552246, "learning_rate": 1.3064516129032258e-05, "loss": 2.201, "num_input_tokens_seen": 9411188, "step": 270 }, { "epoch": 0.022587760761328544, "grad_norm": 0.3217449188232422, "learning_rate": 1.3548387096774194e-05, "loss": 2.2008, "num_input_tokens_seen": 9770648, "step": 280 }, { "epoch": 0.023394466502804565, "grad_norm": 0.33256444334983826, "learning_rate": 1.403225806451613e-05, "loss": 2.1969, "num_input_tokens_seen": 10122184, "step": 290 }, { "epoch": 0.024201172244280582, "grad_norm": 0.32968392968177795, "learning_rate": 1.4516129032258065e-05, "loss": 2.2431, "num_input_tokens_seen": 10469264, "step": 300 }, { "epoch": 0.025007877985756603, "grad_norm": 0.29683488607406616, "learning_rate": 1.5e-05, "loss": 2.2201, "num_input_tokens_seen": 10825668, "step": 310 }, { "epoch": 0.02581458372723262, "grad_norm": 0.3019179403781891, "learning_rate": 1.5483870967741936e-05, "loss": 2.1465, "num_input_tokens_seen": 11224912, "step": 320 }, { "epoch": 0.02662128946870864, "grad_norm": 0.35323160886764526, "learning_rate": 1.596774193548387e-05, "loss": 2.1791, "num_input_tokens_seen": 11570236, "step": 330 }, { "epoch": 0.02742799521018466, "grad_norm": 0.3326450288295746, "learning_rate": 1.6451612903225807e-05, "loss": 2.0943, "num_input_tokens_seen": 11909600, "step": 340 }, { "epoch": 0.02823470095166068, "grad_norm": 0.33823785185813904, "learning_rate": 1.6935483870967744e-05, "loss": 2.1687, "num_input_tokens_seen": 12268976, "step": 350 }, { "epoch": 0.0290414066931367, "grad_norm": 0.31576064229011536, "learning_rate": 1.741935483870968e-05, "loss": 2.0925, "num_input_tokens_seen": 12630560, "step": 360 }, { "epoch": 0.029848112434612717, "grad_norm": 0.33903634548187256, "learning_rate": 1.7903225806451616e-05, "loss": 2.1164, "num_input_tokens_seen": 12968760, "step": 370 }, { "epoch": 0.030654818176088738, "grad_norm": 0.33790096640586853, "learning_rate": 1.838709677419355e-05, "loss": 2.0834, "num_input_tokens_seen": 13320408, "step": 380 }, { "epoch": 0.03146152391756476, "grad_norm": 0.32104986906051636, "learning_rate": 1.8870967741935484e-05, "loss": 2.1008, "num_input_tokens_seen": 13678172, "step": 390 }, { "epoch": 0.032268229659040776, "grad_norm": 0.3284147083759308, "learning_rate": 1.935483870967742e-05, "loss": 2.1674, "num_input_tokens_seen": 14048132, "step": 400 }, { "epoch": 0.03307493540051679, "grad_norm": 0.31699198484420776, "learning_rate": 1.9838709677419355e-05, "loss": 2.1186, "num_input_tokens_seen": 14376172, "step": 410 }, { "epoch": 0.03388164114199282, "grad_norm": 0.3431970775127411, "learning_rate": 2.032258064516129e-05, "loss": 2.0956, "num_input_tokens_seen": 14729936, "step": 420 }, { "epoch": 0.034688346883468835, "grad_norm": 0.35230588912963867, "learning_rate": 2.080645161290323e-05, "loss": 2.0643, "num_input_tokens_seen": 15082004, "step": 430 }, { "epoch": 0.03549505262494485, "grad_norm": 0.3975660800933838, "learning_rate": 2.1290322580645163e-05, "loss": 2.1215, "num_input_tokens_seen": 15450700, "step": 440 }, { "epoch": 0.036301758366420876, "grad_norm": 0.31235337257385254, "learning_rate": 2.1774193548387097e-05, "loss": 1.9926, "num_input_tokens_seen": 15786772, "step": 450 }, { "epoch": 0.037108464107896894, "grad_norm": 0.33642396330833435, "learning_rate": 2.2258064516129034e-05, "loss": 2.0398, "num_input_tokens_seen": 16132004, "step": 460 }, { "epoch": 0.03791516984937291, "grad_norm": 0.3371775448322296, "learning_rate": 2.274193548387097e-05, "loss": 2.124, "num_input_tokens_seen": 16524448, "step": 470 }, { "epoch": 0.038721875590848935, "grad_norm": 0.31219518184661865, "learning_rate": 2.3225806451612902e-05, "loss": 2.0735, "num_input_tokens_seen": 16888128, "step": 480 }, { "epoch": 0.03952858133232495, "grad_norm": 0.2977409362792969, "learning_rate": 2.370967741935484e-05, "loss": 2.0295, "num_input_tokens_seen": 17227520, "step": 490 }, { "epoch": 0.04033528707380097, "grad_norm": 0.28311657905578613, "learning_rate": 2.4193548387096773e-05, "loss": 2.0672, "num_input_tokens_seen": 17584540, "step": 500 }, { "epoch": 0.04114199281527699, "grad_norm": 0.28878408670425415, "learning_rate": 2.467741935483871e-05, "loss": 1.9962, "num_input_tokens_seen": 17945160, "step": 510 }, { "epoch": 0.04194869855675301, "grad_norm": 0.3487817049026489, "learning_rate": 2.5161290322580648e-05, "loss": 2.062, "num_input_tokens_seen": 18314364, "step": 520 }, { "epoch": 0.04275540429822903, "grad_norm": 0.29766845703125, "learning_rate": 2.5645161290322582e-05, "loss": 2.0474, "num_input_tokens_seen": 18629644, "step": 530 }, { "epoch": 0.043562110039705046, "grad_norm": 1.0644810199737549, "learning_rate": 2.6129032258064516e-05, "loss": 2.0612, "num_input_tokens_seen": 18969332, "step": 540 }, { "epoch": 0.04436881578118107, "grad_norm": 0.3535769581794739, "learning_rate": 2.6612903225806453e-05, "loss": 1.9915, "num_input_tokens_seen": 19331132, "step": 550 }, { "epoch": 0.04517552152265709, "grad_norm": 0.3135438561439514, "learning_rate": 2.7096774193548387e-05, "loss": 1.9903, "num_input_tokens_seen": 19675048, "step": 560 }, { "epoch": 0.045982227264133105, "grad_norm": 0.3067088723182678, "learning_rate": 2.758064516129032e-05, "loss": 2.0101, "num_input_tokens_seen": 20024728, "step": 570 }, { "epoch": 0.04678893300560913, "grad_norm": 0.39044031500816345, "learning_rate": 2.806451612903226e-05, "loss": 2.0239, "num_input_tokens_seen": 20366444, "step": 580 }, { "epoch": 0.047595638747085146, "grad_norm": 0.28235116600990295, "learning_rate": 2.8548387096774196e-05, "loss": 1.984, "num_input_tokens_seen": 20728484, "step": 590 }, { "epoch": 0.048402344488561164, "grad_norm": 0.3268643021583557, "learning_rate": 2.903225806451613e-05, "loss": 2.0046, "num_input_tokens_seen": 21053188, "step": 600 }, { "epoch": 0.04920905023003718, "grad_norm": 0.3217363953590393, "learning_rate": 2.9516129032258067e-05, "loss": 2.0211, "num_input_tokens_seen": 21401976, "step": 610 }, { "epoch": 0.050015755971513205, "grad_norm": 0.3585478961467743, "learning_rate": 3e-05, "loss": 2.003, "num_input_tokens_seen": 21721508, "step": 620 }, { "epoch": 0.05082246171298922, "grad_norm": 0.31601133942604065, "learning_rate": 3e-05, "loss": 1.9809, "num_input_tokens_seen": 22052500, "step": 630 }, { "epoch": 0.05162916745446524, "grad_norm": 0.33022111654281616, "learning_rate": 3e-05, "loss": 2.0097, "num_input_tokens_seen": 22410848, "step": 640 }, { "epoch": 0.052435873195941264, "grad_norm": 0.30348455905914307, "learning_rate": 3e-05, "loss": 1.9627, "num_input_tokens_seen": 22769696, "step": 650 }, { "epoch": 0.05324257893741728, "grad_norm": 0.31722530722618103, "learning_rate": 3e-05, "loss": 1.9734, "num_input_tokens_seen": 23141500, "step": 660 }, { "epoch": 0.0540492846788933, "grad_norm": 0.3014906644821167, "learning_rate": 3e-05, "loss": 1.9408, "num_input_tokens_seen": 23484440, "step": 670 }, { "epoch": 0.05485599042036932, "grad_norm": 0.34274202585220337, "learning_rate": 3e-05, "loss": 1.9022, "num_input_tokens_seen": 23826764, "step": 680 }, { "epoch": 0.05566269616184534, "grad_norm": 0.31761202216148376, "learning_rate": 3e-05, "loss": 1.98, "num_input_tokens_seen": 24161792, "step": 690 }, { "epoch": 0.05646940190332136, "grad_norm": 0.37271755933761597, "learning_rate": 3e-05, "loss": 1.9989, "num_input_tokens_seen": 24503600, "step": 700 }, { "epoch": 0.057276107644797375, "grad_norm": 0.3486018180847168, "learning_rate": 3e-05, "loss": 1.9389, "num_input_tokens_seen": 24820152, "step": 710 }, { "epoch": 0.0580828133862734, "grad_norm": 0.3391967713832855, "learning_rate": 3e-05, "loss": 1.9054, "num_input_tokens_seen": 25190404, "step": 720 }, { "epoch": 0.058889519127749416, "grad_norm": 0.3023532032966614, "learning_rate": 3e-05, "loss": 1.9632, "num_input_tokens_seen": 25536472, "step": 730 }, { "epoch": 0.059696224869225434, "grad_norm": 3234.732666015625, "learning_rate": 3e-05, "loss": 2.018, "num_input_tokens_seen": 25890504, "step": 740 }, { "epoch": 0.06050293061070146, "grad_norm": 2243.118408203125, "learning_rate": 3e-05, "loss": 1.9958, "num_input_tokens_seen": 26256376, "step": 750 }, { "epoch": 0.061309636352177475, "grad_norm": 0.356982946395874, "learning_rate": 3e-05, "loss": 1.9637, "num_input_tokens_seen": 26638344, "step": 760 }, { "epoch": 0.06211634209365349, "grad_norm": 0.296735942363739, "learning_rate": 3e-05, "loss": 1.9099, "num_input_tokens_seen": 26962544, "step": 770 }, { "epoch": 0.06292304783512952, "grad_norm": 0.29231005907058716, "learning_rate": 3e-05, "loss": 1.8578, "num_input_tokens_seen": 27308112, "step": 780 }, { "epoch": 0.06372975357660553, "grad_norm": 0.31055736541748047, "learning_rate": 3e-05, "loss": 1.8843, "num_input_tokens_seen": 27676552, "step": 790 }, { "epoch": 0.06453645931808155, "grad_norm": 0.3155435025691986, "learning_rate": 3e-05, "loss": 1.9495, "num_input_tokens_seen": 28056316, "step": 800 }, { "epoch": 0.06534316505955758, "grad_norm": 0.3073548972606659, "learning_rate": 3e-05, "loss": 1.9373, "num_input_tokens_seen": 28404224, "step": 810 }, { "epoch": 0.06614987080103359, "grad_norm": 0.30792343616485596, "learning_rate": 3e-05, "loss": 1.8845, "num_input_tokens_seen": 28749544, "step": 820 }, { "epoch": 0.06695657654250961, "grad_norm": 0.27868854999542236, "learning_rate": 3e-05, "loss": 1.8979, "num_input_tokens_seen": 29086400, "step": 830 }, { "epoch": 0.06776328228398563, "grad_norm": 0.2984897792339325, "learning_rate": 3e-05, "loss": 1.8669, "num_input_tokens_seen": 29427836, "step": 840 }, { "epoch": 0.06856998802546165, "grad_norm": 0.31874415278434753, "learning_rate": 3e-05, "loss": 1.8357, "num_input_tokens_seen": 29777592, "step": 850 }, { "epoch": 0.06937669376693767, "grad_norm": 0.32049503922462463, "learning_rate": 3e-05, "loss": 1.8856, "num_input_tokens_seen": 30132364, "step": 860 }, { "epoch": 0.0701833995084137, "grad_norm": 0.6638230085372925, "learning_rate": 3e-05, "loss": 1.8624, "num_input_tokens_seen": 30474616, "step": 870 }, { "epoch": 0.0709901052498897, "grad_norm": 0.293955534696579, "learning_rate": 3e-05, "loss": 1.8969, "num_input_tokens_seen": 30836880, "step": 880 }, { "epoch": 0.07179681099136573, "grad_norm": 0.34990906715393066, "learning_rate": 3e-05, "loss": 1.8845, "num_input_tokens_seen": 31210872, "step": 890 }, { "epoch": 0.07260351673284175, "grad_norm": 0.31150537729263306, "learning_rate": 3e-05, "loss": 1.913, "num_input_tokens_seen": 31593820, "step": 900 }, { "epoch": 0.07341022247431776, "grad_norm": 0.3393719792366028, "learning_rate": 3e-05, "loss": 1.8884, "num_input_tokens_seen": 31948328, "step": 910 }, { "epoch": 0.07421692821579379, "grad_norm": 0.2771390378475189, "learning_rate": 3e-05, "loss": 1.8384, "num_input_tokens_seen": 32279500, "step": 920 }, { "epoch": 0.07502363395726981, "grad_norm": 0.29383623600006104, "learning_rate": 3e-05, "loss": 1.7929, "num_input_tokens_seen": 32639840, "step": 930 }, { "epoch": 0.07583033969874582, "grad_norm": 0.2876071333885193, "learning_rate": 3e-05, "loss": 1.8682, "num_input_tokens_seen": 32976672, "step": 940 }, { "epoch": 0.07663704544022185, "grad_norm": 0.2755143940448761, "learning_rate": 3e-05, "loss": 1.8409, "num_input_tokens_seen": 33315328, "step": 950 }, { "epoch": 0.07744375118169787, "grad_norm": 0.31250065565109253, "learning_rate": 3e-05, "loss": 1.8467, "num_input_tokens_seen": 33691124, "step": 960 }, { "epoch": 0.07825045692317388, "grad_norm": 0.3030893802642822, "learning_rate": 3e-05, "loss": 1.8342, "num_input_tokens_seen": 34049184, "step": 970 }, { "epoch": 0.0790571626646499, "grad_norm": 0.2992667555809021, "learning_rate": 3e-05, "loss": 1.8645, "num_input_tokens_seen": 34402068, "step": 980 }, { "epoch": 0.07986386840612592, "grad_norm": 0.2903348505496979, "learning_rate": 3e-05, "loss": 1.8328, "num_input_tokens_seen": 34749788, "step": 990 }, { "epoch": 0.08067057414760194, "grad_norm": 0.30363109707832336, "learning_rate": 3e-05, "loss": 1.8808, "num_input_tokens_seen": 35147692, "step": 1000 }, { "epoch": 0.08067057414760194, "eval_gen_len": 636.465, "eval_loss": 1.788309931755066, "eval_rouge1": 24.1946, "eval_rouge2": 12.2099, "eval_rougeL": 20.4185, "eval_rougeLsum": 22.251, "eval_runtime": 1680.7996, "eval_samples_per_second": 0.119, "eval_steps_per_second": 0.03, "num_input_tokens_seen": 35147692, "step": 1000 }, { "epoch": 0.08147727988907796, "grad_norm": 0.31382158398628235, "learning_rate": 3e-05, "loss": 1.8377, "num_input_tokens_seen": 35457728, "step": 1010 }, { "epoch": 0.08228398563055397, "grad_norm": 0.3211570680141449, "learning_rate": 3e-05, "loss": 1.791, "num_input_tokens_seen": 35838912, "step": 1020 }, { "epoch": 0.08309069137203, "grad_norm": 0.3069629669189453, "learning_rate": 3e-05, "loss": 1.8453, "num_input_tokens_seen": 36194004, "step": 1030 }, { "epoch": 0.08389739711350602, "grad_norm": 0.2732415497303009, "learning_rate": 3e-05, "loss": 1.7939, "num_input_tokens_seen": 36530872, "step": 1040 }, { "epoch": 0.08470410285498203, "grad_norm": 0.31079530715942383, "learning_rate": 3e-05, "loss": 1.7718, "num_input_tokens_seen": 36900376, "step": 1050 }, { "epoch": 0.08551080859645806, "grad_norm": 0.28770914673805237, "learning_rate": 3e-05, "loss": 1.8129, "num_input_tokens_seen": 37248588, "step": 1060 }, { "epoch": 0.08631751433793408, "grad_norm": 0.31988024711608887, "learning_rate": 3e-05, "loss": 1.8535, "num_input_tokens_seen": 37626604, "step": 1070 }, { "epoch": 0.08712422007941009, "grad_norm": 0.2785434126853943, "learning_rate": 3e-05, "loss": 1.8293, "num_input_tokens_seen": 37968388, "step": 1080 }, { "epoch": 0.08793092582088612, "grad_norm": 0.3427545726299286, "learning_rate": 3e-05, "loss": 1.7788, "num_input_tokens_seen": 38308276, "step": 1090 }, { "epoch": 0.08873763156236214, "grad_norm": 0.3006548583507538, "learning_rate": 3e-05, "loss": 1.7762, "num_input_tokens_seen": 38669908, "step": 1100 }, { "epoch": 0.08954433730383815, "grad_norm": 0.32136908173561096, "learning_rate": 3e-05, "loss": 1.845, "num_input_tokens_seen": 39013520, "step": 1110 }, { "epoch": 0.09035104304531417, "grad_norm": 0.34362053871154785, "learning_rate": 3e-05, "loss": 1.8068, "num_input_tokens_seen": 39354192, "step": 1120 }, { "epoch": 0.0911577487867902, "grad_norm": 0.3446958661079407, "learning_rate": 3e-05, "loss": 1.8646, "num_input_tokens_seen": 39704092, "step": 1130 }, { "epoch": 0.09196445452826621, "grad_norm": 0.3206467032432556, "learning_rate": 3e-05, "loss": 1.8266, "num_input_tokens_seen": 40064272, "step": 1140 }, { "epoch": 0.09277116026974223, "grad_norm": 0.2903178036212921, "learning_rate": 3e-05, "loss": 1.8034, "num_input_tokens_seen": 40447744, "step": 1150 }, { "epoch": 0.09357786601121826, "grad_norm": 0.29461219906806946, "learning_rate": 3e-05, "loss": 1.8363, "num_input_tokens_seen": 40784056, "step": 1160 }, { "epoch": 0.09438457175269427, "grad_norm": 0.32987499237060547, "learning_rate": 3e-05, "loss": 1.8514, "num_input_tokens_seen": 41142580, "step": 1170 }, { "epoch": 0.09519127749417029, "grad_norm": 0.31194567680358887, "learning_rate": 3e-05, "loss": 1.8027, "num_input_tokens_seen": 41471144, "step": 1180 }, { "epoch": 0.0959979832356463, "grad_norm": 0.2921917736530304, "learning_rate": 3e-05, "loss": 1.8098, "num_input_tokens_seen": 41810900, "step": 1190 }, { "epoch": 0.09680468897712233, "grad_norm": 0.2785918116569519, "learning_rate": 3e-05, "loss": 1.8202, "num_input_tokens_seen": 42140460, "step": 1200 }, { "epoch": 0.09761139471859835, "grad_norm": 0.3230614960193634, "learning_rate": 3e-05, "loss": 1.7923, "num_input_tokens_seen": 42482488, "step": 1210 }, { "epoch": 0.09841810046007436, "grad_norm": 0.2865009009838104, "learning_rate": 3e-05, "loss": 1.7968, "num_input_tokens_seen": 42810344, "step": 1220 }, { "epoch": 0.09922480620155039, "grad_norm": 0.32666832208633423, "learning_rate": 3e-05, "loss": 1.7991, "num_input_tokens_seen": 43154724, "step": 1230 }, { "epoch": 0.10003151194302641, "grad_norm": 0.28828418254852295, "learning_rate": 3e-05, "loss": 1.7948, "num_input_tokens_seen": 43514588, "step": 1240 }, { "epoch": 0.10083821768450242, "grad_norm": 0.2931421101093292, "learning_rate": 3e-05, "loss": 1.7972, "num_input_tokens_seen": 43860916, "step": 1250 }, { "epoch": 0.10164492342597845, "grad_norm": 0.3084103465080261, "learning_rate": 3e-05, "loss": 1.7792, "num_input_tokens_seen": 44227052, "step": 1260 }, { "epoch": 0.10245162916745447, "grad_norm": 0.27955740690231323, "learning_rate": 3e-05, "loss": 1.761, "num_input_tokens_seen": 44614048, "step": 1270 }, { "epoch": 0.10325833490893048, "grad_norm": 0.2971053421497345, "learning_rate": 3e-05, "loss": 1.8307, "num_input_tokens_seen": 44971956, "step": 1280 }, { "epoch": 0.1040650406504065, "grad_norm": 0.3030679225921631, "learning_rate": 3e-05, "loss": 1.8344, "num_input_tokens_seen": 45324808, "step": 1290 }, { "epoch": 0.10487174639188253, "grad_norm": 0.31672757863998413, "learning_rate": 3e-05, "loss": 1.8331, "num_input_tokens_seen": 45676540, "step": 1300 }, { "epoch": 0.10567845213335854, "grad_norm": 0.3107895255088806, "learning_rate": 3e-05, "loss": 1.7838, "num_input_tokens_seen": 46049464, "step": 1310 }, { "epoch": 0.10648515787483456, "grad_norm": 0.3014747202396393, "learning_rate": 3e-05, "loss": 1.7451, "num_input_tokens_seen": 46387884, "step": 1320 }, { "epoch": 0.10729186361631059, "grad_norm": 0.3187197148799896, "learning_rate": 3e-05, "loss": 1.7539, "num_input_tokens_seen": 46736308, "step": 1330 }, { "epoch": 0.1080985693577866, "grad_norm": 0.29054009914398193, "learning_rate": 3e-05, "loss": 1.7769, "num_input_tokens_seen": 47072184, "step": 1340 }, { "epoch": 0.10890527509926262, "grad_norm": 0.2759428322315216, "learning_rate": 3e-05, "loss": 1.7871, "num_input_tokens_seen": 47436176, "step": 1350 }, { "epoch": 0.10971198084073865, "grad_norm": 0.3081207275390625, "learning_rate": 3e-05, "loss": 1.7234, "num_input_tokens_seen": 47787408, "step": 1360 }, { "epoch": 0.11051868658221466, "grad_norm": 0.2889757454395294, "learning_rate": 3e-05, "loss": 1.8438, "num_input_tokens_seen": 48142540, "step": 1370 }, { "epoch": 0.11132539232369068, "grad_norm": 0.29038187861442566, "learning_rate": 3e-05, "loss": 1.7569, "num_input_tokens_seen": 48486176, "step": 1380 }, { "epoch": 0.1121320980651667, "grad_norm": 0.2944973409175873, "learning_rate": 3e-05, "loss": 1.769, "num_input_tokens_seen": 48856256, "step": 1390 }, { "epoch": 0.11293880380664272, "grad_norm": 0.2953120470046997, "learning_rate": 3e-05, "loss": 1.7863, "num_input_tokens_seen": 49197100, "step": 1400 }, { "epoch": 0.11374550954811874, "grad_norm": 0.2875744700431824, "learning_rate": 3e-05, "loss": 1.7009, "num_input_tokens_seen": 49520196, "step": 1410 }, { "epoch": 0.11455221528959475, "grad_norm": 0.2693103849887848, "learning_rate": 3e-05, "loss": 1.7705, "num_input_tokens_seen": 49881204, "step": 1420 }, { "epoch": 0.11535892103107077, "grad_norm": 0.2919449210166931, "learning_rate": 3e-05, "loss": 1.7068, "num_input_tokens_seen": 50202060, "step": 1430 }, { "epoch": 0.1161656267725468, "grad_norm": 0.2909579575061798, "learning_rate": 3e-05, "loss": 1.7693, "num_input_tokens_seen": 50539700, "step": 1440 }, { "epoch": 0.11697233251402281, "grad_norm": 0.29420360922813416, "learning_rate": 3e-05, "loss": 1.7307, "num_input_tokens_seen": 50919492, "step": 1450 }, { "epoch": 0.11777903825549883, "grad_norm": 0.3208655118942261, "learning_rate": 3e-05, "loss": 1.7425, "num_input_tokens_seen": 51243268, "step": 1460 }, { "epoch": 0.11858574399697486, "grad_norm": 0.2889709174633026, "learning_rate": 3e-05, "loss": 1.7642, "num_input_tokens_seen": 51599620, "step": 1470 }, { "epoch": 0.11939244973845087, "grad_norm": 0.29108598828315735, "learning_rate": 3e-05, "loss": 1.7404, "num_input_tokens_seen": 51960000, "step": 1480 }, { "epoch": 0.12019915547992689, "grad_norm": 0.3082159757614136, "learning_rate": 3e-05, "loss": 1.7389, "num_input_tokens_seen": 52343160, "step": 1490 }, { "epoch": 0.12100586122140292, "grad_norm": 0.30964505672454834, "learning_rate": 3e-05, "loss": 1.7052, "num_input_tokens_seen": 52695936, "step": 1500 }, { "epoch": 0.12181256696287893, "grad_norm": 0.2976539433002472, "learning_rate": 3e-05, "loss": 1.7637, "num_input_tokens_seen": 53041128, "step": 1510 }, { "epoch": 0.12261927270435495, "grad_norm": 0.2930919826030731, "learning_rate": 3e-05, "loss": 1.7254, "num_input_tokens_seen": 53382704, "step": 1520 }, { "epoch": 0.12342597844583097, "grad_norm": 0.31611040234565735, "learning_rate": 3e-05, "loss": 1.786, "num_input_tokens_seen": 53728720, "step": 1530 }, { "epoch": 0.12423268418730699, "grad_norm": 0.3480939269065857, "learning_rate": 3e-05, "loss": 1.76, "num_input_tokens_seen": 54063164, "step": 1540 }, { "epoch": 0.125039389928783, "grad_norm": 0.31007248163223267, "learning_rate": 3e-05, "loss": 1.7805, "num_input_tokens_seen": 54386028, "step": 1550 }, { "epoch": 0.12584609567025903, "grad_norm": 0.2958042621612549, "learning_rate": 3e-05, "loss": 1.7668, "num_input_tokens_seen": 54761288, "step": 1560 }, { "epoch": 0.12665280141173504, "grad_norm": 0.2833440899848938, "learning_rate": 3e-05, "loss": 1.747, "num_input_tokens_seen": 55134672, "step": 1570 }, { "epoch": 0.12745950715321105, "grad_norm": 0.2970580458641052, "learning_rate": 3e-05, "loss": 1.7097, "num_input_tokens_seen": 55522524, "step": 1580 }, { "epoch": 0.1282662128946871, "grad_norm": 0.3164750635623932, "learning_rate": 3e-05, "loss": 1.7395, "num_input_tokens_seen": 55869072, "step": 1590 }, { "epoch": 0.1290729186361631, "grad_norm": 0.32586508989334106, "learning_rate": 3e-05, "loss": 1.748, "num_input_tokens_seen": 56239036, "step": 1600 }, { "epoch": 0.1298796243776391, "grad_norm": 0.27935513854026794, "learning_rate": 3e-05, "loss": 1.7256, "num_input_tokens_seen": 56579324, "step": 1610 }, { "epoch": 0.13068633011911515, "grad_norm": 0.3307097256183624, "learning_rate": 3e-05, "loss": 1.668, "num_input_tokens_seen": 56928744, "step": 1620 }, { "epoch": 0.13149303586059116, "grad_norm": 0.3158148229122162, "learning_rate": 3e-05, "loss": 1.7193, "num_input_tokens_seen": 57286024, "step": 1630 }, { "epoch": 0.13229974160206717, "grad_norm": 0.29580333828926086, "learning_rate": 3e-05, "loss": 1.7249, "num_input_tokens_seen": 57671300, "step": 1640 }, { "epoch": 0.1331064473435432, "grad_norm": 0.26224178075790405, "learning_rate": 3e-05, "loss": 1.6971, "num_input_tokens_seen": 58044600, "step": 1650 }, { "epoch": 0.13391315308501922, "grad_norm": 0.2952196002006531, "learning_rate": 3e-05, "loss": 1.7619, "num_input_tokens_seen": 58389844, "step": 1660 }, { "epoch": 0.13471985882649523, "grad_norm": 0.30456557869911194, "learning_rate": 3e-05, "loss": 1.7429, "num_input_tokens_seen": 58754384, "step": 1670 }, { "epoch": 0.13552656456797127, "grad_norm": 0.2966090142726898, "learning_rate": 3e-05, "loss": 1.7241, "num_input_tokens_seen": 59114496, "step": 1680 }, { "epoch": 0.13633327030944728, "grad_norm": 0.2919583320617676, "learning_rate": 3e-05, "loss": 1.6988, "num_input_tokens_seen": 59464252, "step": 1690 }, { "epoch": 0.1371399760509233, "grad_norm": 0.2832421064376831, "learning_rate": 3e-05, "loss": 1.7817, "num_input_tokens_seen": 59837848, "step": 1700 }, { "epoch": 0.13794668179239933, "grad_norm": 0.2778345048427582, "learning_rate": 3e-05, "loss": 1.6825, "num_input_tokens_seen": 60182016, "step": 1710 }, { "epoch": 0.13875338753387534, "grad_norm": 0.3401370048522949, "learning_rate": 3e-05, "loss": 1.7525, "num_input_tokens_seen": 60532724, "step": 1720 }, { "epoch": 0.13956009327535135, "grad_norm": 0.30803683400154114, "learning_rate": 3e-05, "loss": 1.6473, "num_input_tokens_seen": 60868756, "step": 1730 }, { "epoch": 0.1403667990168274, "grad_norm": 0.2971110939979553, "learning_rate": 3e-05, "loss": 1.6967, "num_input_tokens_seen": 61206004, "step": 1740 }, { "epoch": 0.1411735047583034, "grad_norm": 0.3091312646865845, "learning_rate": 3e-05, "loss": 1.6649, "num_input_tokens_seen": 61578372, "step": 1750 }, { "epoch": 0.1419802104997794, "grad_norm": 0.25792524218559265, "learning_rate": 3e-05, "loss": 1.6868, "num_input_tokens_seen": 61954252, "step": 1760 }, { "epoch": 0.14278691624125545, "grad_norm": 0.32082629203796387, "learning_rate": 3e-05, "loss": 1.6844, "num_input_tokens_seen": 62312328, "step": 1770 }, { "epoch": 0.14359362198273146, "grad_norm": 0.2915956974029541, "learning_rate": 3e-05, "loss": 1.6998, "num_input_tokens_seen": 62657528, "step": 1780 }, { "epoch": 0.14440032772420747, "grad_norm": 0.28821295499801636, "learning_rate": 3e-05, "loss": 1.7053, "num_input_tokens_seen": 63010336, "step": 1790 }, { "epoch": 0.1452070334656835, "grad_norm": 0.2947831451892853, "learning_rate": 3e-05, "loss": 1.7078, "num_input_tokens_seen": 63341864, "step": 1800 }, { "epoch": 0.14601373920715952, "grad_norm": 0.31316396594047546, "learning_rate": 3e-05, "loss": 1.6593, "num_input_tokens_seen": 63696096, "step": 1810 }, { "epoch": 0.14682044494863553, "grad_norm": 0.3107188642024994, "learning_rate": 3e-05, "loss": 1.6506, "num_input_tokens_seen": 64077708, "step": 1820 }, { "epoch": 0.14762715069011156, "grad_norm": 0.3115972876548767, "learning_rate": 3e-05, "loss": 1.6887, "num_input_tokens_seen": 64437828, "step": 1830 }, { "epoch": 0.14843385643158757, "grad_norm": 0.34425589442253113, "learning_rate": 3e-05, "loss": 1.6977, "num_input_tokens_seen": 64819332, "step": 1840 }, { "epoch": 0.14924056217306358, "grad_norm": 0.27634525299072266, "learning_rate": 3e-05, "loss": 1.7464, "num_input_tokens_seen": 65179536, "step": 1850 }, { "epoch": 0.15004726791453962, "grad_norm": 0.31853121519088745, "learning_rate": 3e-05, "loss": 1.6382, "num_input_tokens_seen": 65551676, "step": 1860 }, { "epoch": 0.15085397365601563, "grad_norm": 0.30623626708984375, "learning_rate": 3e-05, "loss": 1.6798, "num_input_tokens_seen": 65900132, "step": 1870 }, { "epoch": 0.15166067939749164, "grad_norm": 0.28665515780448914, "learning_rate": 3e-05, "loss": 1.6672, "num_input_tokens_seen": 66266544, "step": 1880 }, { "epoch": 0.15246738513896768, "grad_norm": 0.29499661922454834, "learning_rate": 3e-05, "loss": 1.6661, "num_input_tokens_seen": 66606176, "step": 1890 }, { "epoch": 0.1532740908804437, "grad_norm": 0.3188175559043884, "learning_rate": 3e-05, "loss": 1.6772, "num_input_tokens_seen": 66975608, "step": 1900 }, { "epoch": 0.1540807966219197, "grad_norm": 0.31832584738731384, "learning_rate": 3e-05, "loss": 1.6704, "num_input_tokens_seen": 67350296, "step": 1910 }, { "epoch": 0.15488750236339574, "grad_norm": 0.329738974571228, "learning_rate": 3e-05, "loss": 1.7244, "num_input_tokens_seen": 67707796, "step": 1920 }, { "epoch": 0.15569420810487175, "grad_norm": 0.2936003804206848, "learning_rate": 3e-05, "loss": 1.6684, "num_input_tokens_seen": 68036860, "step": 1930 }, { "epoch": 0.15650091384634776, "grad_norm": 0.30164700746536255, "learning_rate": 3e-05, "loss": 1.7125, "num_input_tokens_seen": 68377696, "step": 1940 }, { "epoch": 0.15730761958782377, "grad_norm": 0.3079434931278229, "learning_rate": 3e-05, "loss": 1.6165, "num_input_tokens_seen": 68748300, "step": 1950 }, { "epoch": 0.1581143253292998, "grad_norm": 0.36346644163131714, "learning_rate": 3e-05, "loss": 1.6648, "num_input_tokens_seen": 69128968, "step": 1960 }, { "epoch": 0.15892103107077582, "grad_norm": 0.2884806990623474, "learning_rate": 3e-05, "loss": 1.6391, "num_input_tokens_seen": 69471156, "step": 1970 }, { "epoch": 0.15972773681225183, "grad_norm": 0.2658495306968689, "learning_rate": 3e-05, "loss": 1.6525, "num_input_tokens_seen": 69820100, "step": 1980 }, { "epoch": 0.16053444255372787, "grad_norm": 0.31078723073005676, "learning_rate": 3e-05, "loss": 1.6115, "num_input_tokens_seen": 70181056, "step": 1990 }, { "epoch": 0.16134114829520388, "grad_norm": 0.28954872488975525, "learning_rate": 3e-05, "loss": 1.6545, "num_input_tokens_seen": 70510224, "step": 2000 }, { "epoch": 0.16134114829520388, "eval_gen_len": 577.04, "eval_loss": 1.5985389947891235, "eval_rouge1": 28.9492, "eval_rouge2": 15.3233, "eval_rougeL": 23.871, "eval_rougeLsum": 26.9919, "eval_runtime": 1635.8727, "eval_samples_per_second": 0.122, "eval_steps_per_second": 0.031, "num_input_tokens_seen": 70510224, "step": 2000 }, { "epoch": 0.1621478540366799, "grad_norm": 0.2800785005092621, "learning_rate": 3e-05, "loss": 1.658, "num_input_tokens_seen": 70896684, "step": 2010 }, { "epoch": 0.16295455977815593, "grad_norm": 0.3101065754890442, "learning_rate": 3e-05, "loss": 1.6204, "num_input_tokens_seen": 71245800, "step": 2020 }, { "epoch": 0.16376126551963194, "grad_norm": 0.27418360114097595, "learning_rate": 3e-05, "loss": 1.6942, "num_input_tokens_seen": 71561040, "step": 2030 }, { "epoch": 0.16456797126110795, "grad_norm": 0.29117581248283386, "learning_rate": 3e-05, "loss": 1.6862, "num_input_tokens_seen": 71917876, "step": 2040 }, { "epoch": 0.165374677002584, "grad_norm": 0.3083847165107727, "learning_rate": 3e-05, "loss": 1.681, "num_input_tokens_seen": 72278364, "step": 2050 }, { "epoch": 0.16618138274406, "grad_norm": 0.29766711592674255, "learning_rate": 3e-05, "loss": 1.7143, "num_input_tokens_seen": 72618996, "step": 2060 }, { "epoch": 0.166988088485536, "grad_norm": 0.311576247215271, "learning_rate": 3e-05, "loss": 1.6782, "num_input_tokens_seen": 72956980, "step": 2070 }, { "epoch": 0.16779479422701205, "grad_norm": 0.5800204277038574, "learning_rate": 3e-05, "loss": 1.6647, "num_input_tokens_seen": 73356960, "step": 2080 }, { "epoch": 0.16860149996848806, "grad_norm": 37.67682647705078, "learning_rate": 3e-05, "loss": 1.6091, "num_input_tokens_seen": 73675816, "step": 2090 }, { "epoch": 0.16940820570996407, "grad_norm": 0.27842187881469727, "learning_rate": 3e-05, "loss": 1.6636, "num_input_tokens_seen": 74032256, "step": 2100 }, { "epoch": 0.1702149114514401, "grad_norm": 0.29616591334342957, "learning_rate": 3e-05, "loss": 1.6134, "num_input_tokens_seen": 74398400, "step": 2110 }, { "epoch": 0.17102161719291611, "grad_norm": 0.3454131782054901, "learning_rate": 3e-05, "loss": 1.6459, "num_input_tokens_seen": 74733948, "step": 2120 }, { "epoch": 0.17182832293439212, "grad_norm": 0.28399163484573364, "learning_rate": 3e-05, "loss": 1.6734, "num_input_tokens_seen": 75108376, "step": 2130 }, { "epoch": 0.17263502867586816, "grad_norm": 0.2860686480998993, "learning_rate": 3e-05, "loss": 1.6208, "num_input_tokens_seen": 75448896, "step": 2140 }, { "epoch": 0.17344173441734417, "grad_norm": 0.26892679929733276, "learning_rate": 3e-05, "loss": 1.636, "num_input_tokens_seen": 75797696, "step": 2150 }, { "epoch": 0.17424844015882018, "grad_norm": 0.2738756537437439, "learning_rate": 3e-05, "loss": 1.6832, "num_input_tokens_seen": 76171544, "step": 2160 }, { "epoch": 0.17505514590029622, "grad_norm": 0.29067671298980713, "learning_rate": 3e-05, "loss": 1.6717, "num_input_tokens_seen": 76547816, "step": 2170 }, { "epoch": 0.17586185164177223, "grad_norm": 0.28701356053352356, "learning_rate": 3e-05, "loss": 1.6343, "num_input_tokens_seen": 76890468, "step": 2180 }, { "epoch": 0.17666855738324824, "grad_norm": 0.28434693813323975, "learning_rate": 3e-05, "loss": 1.6087, "num_input_tokens_seen": 77229360, "step": 2190 }, { "epoch": 0.17747526312472428, "grad_norm": 0.3032514154911041, "learning_rate": 3e-05, "loss": 1.6132, "num_input_tokens_seen": 77586424, "step": 2200 }, { "epoch": 0.1782819688662003, "grad_norm": 0.3082556426525116, "learning_rate": 3e-05, "loss": 1.6578, "num_input_tokens_seen": 77916864, "step": 2210 }, { "epoch": 0.1790886746076763, "grad_norm": 0.28357553482055664, "learning_rate": 3e-05, "loss": 1.6743, "num_input_tokens_seen": 78271708, "step": 2220 }, { "epoch": 0.17989538034915234, "grad_norm": 0.30355584621429443, "learning_rate": 3e-05, "loss": 1.6257, "num_input_tokens_seen": 78635684, "step": 2230 }, { "epoch": 0.18070208609062835, "grad_norm": 0.3390004634857178, "learning_rate": 3e-05, "loss": 1.6041, "num_input_tokens_seen": 78983708, "step": 2240 }, { "epoch": 0.18150879183210436, "grad_norm": 0.30169346928596497, "learning_rate": 3e-05, "loss": 1.6102, "num_input_tokens_seen": 79323848, "step": 2250 }, { "epoch": 0.1823154975735804, "grad_norm": 0.33197805285453796, "learning_rate": 3e-05, "loss": 1.648, "num_input_tokens_seen": 79691064, "step": 2260 }, { "epoch": 0.1831222033150564, "grad_norm": 0.301727294921875, "learning_rate": 3e-05, "loss": 1.6121, "num_input_tokens_seen": 80057832, "step": 2270 }, { "epoch": 0.18392890905653242, "grad_norm": 0.44830191135406494, "learning_rate": 3e-05, "loss": 1.6317, "num_input_tokens_seen": 80448712, "step": 2280 }, { "epoch": 0.18473561479800846, "grad_norm": 0.2938157320022583, "learning_rate": 3e-05, "loss": 1.6598, "num_input_tokens_seen": 80804116, "step": 2290 }, { "epoch": 0.18554232053948447, "grad_norm": 0.2986922264099121, "learning_rate": 3e-05, "loss": 1.6292, "num_input_tokens_seen": 81171136, "step": 2300 }, { "epoch": 0.18634902628096048, "grad_norm": 0.2788652181625366, "learning_rate": 3e-05, "loss": 1.6548, "num_input_tokens_seen": 81540708, "step": 2310 }, { "epoch": 0.18715573202243652, "grad_norm": 0.312258243560791, "learning_rate": 3e-05, "loss": 1.6116, "num_input_tokens_seen": 81870928, "step": 2320 }, { "epoch": 0.18796243776391253, "grad_norm": 0.30631476640701294, "learning_rate": 3e-05, "loss": 1.6405, "num_input_tokens_seen": 82223772, "step": 2330 }, { "epoch": 0.18876914350538854, "grad_norm": 0.29788920283317566, "learning_rate": 3e-05, "loss": 1.6403, "num_input_tokens_seen": 82541752, "step": 2340 }, { "epoch": 0.18957584924686458, "grad_norm": 0.3009161949157715, "learning_rate": 3e-05, "loss": 1.6534, "num_input_tokens_seen": 82897448, "step": 2350 }, { "epoch": 0.19038255498834059, "grad_norm": 0.3116704821586609, "learning_rate": 3e-05, "loss": 1.6182, "num_input_tokens_seen": 83272296, "step": 2360 }, { "epoch": 0.1911892607298166, "grad_norm": 0.32088491320610046, "learning_rate": 3e-05, "loss": 1.6609, "num_input_tokens_seen": 83617696, "step": 2370 }, { "epoch": 0.1919959664712926, "grad_norm": 0.32367074489593506, "learning_rate": 3e-05, "loss": 1.6087, "num_input_tokens_seen": 83977604, "step": 2380 }, { "epoch": 0.19280267221276864, "grad_norm": 0.28396207094192505, "learning_rate": 3e-05, "loss": 1.5404, "num_input_tokens_seen": 84339752, "step": 2390 }, { "epoch": 0.19360937795424465, "grad_norm": 0.3000083267688751, "learning_rate": 3e-05, "loss": 1.599, "num_input_tokens_seen": 84695344, "step": 2400 }, { "epoch": 0.19441608369572067, "grad_norm": 0.296040415763855, "learning_rate": 3e-05, "loss": 1.5851, "num_input_tokens_seen": 85022864, "step": 2410 }, { "epoch": 0.1952227894371967, "grad_norm": 0.2935866415500641, "learning_rate": 3e-05, "loss": 1.6412, "num_input_tokens_seen": 85362004, "step": 2420 }, { "epoch": 0.1960294951786727, "grad_norm": 0.31561270356178284, "learning_rate": 3e-05, "loss": 1.6026, "num_input_tokens_seen": 85682084, "step": 2430 }, { "epoch": 0.19683620092014872, "grad_norm": 0.2930440306663513, "learning_rate": 3e-05, "loss": 1.6198, "num_input_tokens_seen": 86053116, "step": 2440 }, { "epoch": 0.19764290666162476, "grad_norm": 0.29515814781188965, "learning_rate": 3e-05, "loss": 1.6001, "num_input_tokens_seen": 86407792, "step": 2450 }, { "epoch": 0.19844961240310077, "grad_norm": 0.29479432106018066, "learning_rate": 3e-05, "loss": 1.5964, "num_input_tokens_seen": 86747732, "step": 2460 }, { "epoch": 0.19925631814457678, "grad_norm": 0.28793784976005554, "learning_rate": 3e-05, "loss": 1.5935, "num_input_tokens_seen": 87105244, "step": 2470 }, { "epoch": 0.20006302388605282, "grad_norm": 0.2696222960948944, "learning_rate": 3e-05, "loss": 1.5956, "num_input_tokens_seen": 87446420, "step": 2480 }, { "epoch": 0.20086972962752883, "grad_norm": 0.2994723618030548, "learning_rate": 3e-05, "loss": 1.5979, "num_input_tokens_seen": 87770636, "step": 2490 }, { "epoch": 0.20167643536900484, "grad_norm": 0.3084478974342346, "learning_rate": 3e-05, "loss": 1.659, "num_input_tokens_seen": 88100828, "step": 2500 }, { "epoch": 0.20248314111048088, "grad_norm": 0.2618251144886017, "learning_rate": 3e-05, "loss": 1.6145, "num_input_tokens_seen": 88483608, "step": 2510 }, { "epoch": 0.2032898468519569, "grad_norm": 0.28900229930877686, "learning_rate": 3e-05, "loss": 1.6172, "num_input_tokens_seen": 88832372, "step": 2520 }, { "epoch": 0.2040965525934329, "grad_norm": 0.30491867661476135, "learning_rate": 3e-05, "loss": 1.5989, "num_input_tokens_seen": 89160240, "step": 2530 }, { "epoch": 0.20490325833490894, "grad_norm": 0.3022604286670685, "learning_rate": 3e-05, "loss": 1.6099, "num_input_tokens_seen": 89520280, "step": 2540 }, { "epoch": 0.20570996407638495, "grad_norm": 0.27108603715896606, "learning_rate": 3e-05, "loss": 1.5853, "num_input_tokens_seen": 89873136, "step": 2550 }, { "epoch": 0.20651666981786096, "grad_norm": 0.3252500295639038, "learning_rate": 3e-05, "loss": 1.5598, "num_input_tokens_seen": 90253000, "step": 2560 }, { "epoch": 0.207323375559337, "grad_norm": 0.30979427695274353, "learning_rate": 3e-05, "loss": 1.5417, "num_input_tokens_seen": 90554752, "step": 2570 }, { "epoch": 0.208130081300813, "grad_norm": 0.2795146703720093, "learning_rate": 3e-05, "loss": 1.6095, "num_input_tokens_seen": 90936820, "step": 2580 }, { "epoch": 0.20893678704228902, "grad_norm": 0.28166651725769043, "learning_rate": 3e-05, "loss": 1.5759, "num_input_tokens_seen": 91299076, "step": 2590 }, { "epoch": 0.20974349278376506, "grad_norm": 0.3146922290325165, "learning_rate": 3e-05, "loss": 1.567, "num_input_tokens_seen": 91648568, "step": 2600 }, { "epoch": 0.21055019852524107, "grad_norm": 0.2938322424888611, "learning_rate": 3e-05, "loss": 1.5781, "num_input_tokens_seen": 91998480, "step": 2610 }, { "epoch": 0.21135690426671708, "grad_norm": 0.2709970772266388, "learning_rate": 3e-05, "loss": 1.5961, "num_input_tokens_seen": 92379904, "step": 2620 }, { "epoch": 0.21216361000819312, "grad_norm": 0.27745142579078674, "learning_rate": 3e-05, "loss": 1.5985, "num_input_tokens_seen": 92719980, "step": 2630 }, { "epoch": 0.21297031574966913, "grad_norm": 0.2709800899028778, "learning_rate": 3e-05, "loss": 1.5578, "num_input_tokens_seen": 93052044, "step": 2640 }, { "epoch": 0.21377702149114514, "grad_norm": 0.26459309458732605, "learning_rate": 3e-05, "loss": 1.5896, "num_input_tokens_seen": 93415468, "step": 2650 }, { "epoch": 0.21458372723262117, "grad_norm": 0.2925964891910553, "learning_rate": 3e-05, "loss": 1.6339, "num_input_tokens_seen": 93782336, "step": 2660 }, { "epoch": 0.21539043297409718, "grad_norm": 0.26069968938827515, "learning_rate": 3e-05, "loss": 1.5298, "num_input_tokens_seen": 94122876, "step": 2670 }, { "epoch": 0.2161971387155732, "grad_norm": 0.300855427980423, "learning_rate": 3e-05, "loss": 1.5816, "num_input_tokens_seen": 94463076, "step": 2680 }, { "epoch": 0.21700384445704923, "grad_norm": 0.283113956451416, "learning_rate": 3e-05, "loss": 1.6143, "num_input_tokens_seen": 94822824, "step": 2690 }, { "epoch": 0.21781055019852524, "grad_norm": 0.27436137199401855, "learning_rate": 3e-05, "loss": 1.5729, "num_input_tokens_seen": 95153340, "step": 2700 }, { "epoch": 0.21861725594000125, "grad_norm": 0.32102033495903015, "learning_rate": 3e-05, "loss": 1.5977, "num_input_tokens_seen": 95507556, "step": 2710 }, { "epoch": 0.2194239616814773, "grad_norm": 0.29213079810142517, "learning_rate": 3e-05, "loss": 1.5738, "num_input_tokens_seen": 95868396, "step": 2720 }, { "epoch": 0.2202306674229533, "grad_norm": 0.2973087728023529, "learning_rate": 3e-05, "loss": 1.5457, "num_input_tokens_seen": 96220008, "step": 2730 }, { "epoch": 0.2210373731644293, "grad_norm": 0.28580325841903687, "learning_rate": 3e-05, "loss": 1.5911, "num_input_tokens_seen": 96579440, "step": 2740 }, { "epoch": 0.22184407890590535, "grad_norm": 0.3367248773574829, "learning_rate": 3e-05, "loss": 1.5535, "num_input_tokens_seen": 96938504, "step": 2750 }, { "epoch": 0.22265078464738136, "grad_norm": 0.3134912848472595, "learning_rate": 3e-05, "loss": 1.5942, "num_input_tokens_seen": 97306988, "step": 2760 }, { "epoch": 0.22345749038885737, "grad_norm": 0.2981172800064087, "learning_rate": 3e-05, "loss": 1.5415, "num_input_tokens_seen": 97653476, "step": 2770 }, { "epoch": 0.2242641961303334, "grad_norm": 0.279850572347641, "learning_rate": 3e-05, "loss": 1.5997, "num_input_tokens_seen": 98021276, "step": 2780 }, { "epoch": 0.22507090187180942, "grad_norm": 0.28641802072525024, "learning_rate": 3e-05, "loss": 1.5944, "num_input_tokens_seen": 98383012, "step": 2790 }, { "epoch": 0.22587760761328543, "grad_norm": 0.3132043480873108, "learning_rate": 3e-05, "loss": 1.5811, "num_input_tokens_seen": 98714760, "step": 2800 }, { "epoch": 0.22668431335476144, "grad_norm": 0.316658079624176, "learning_rate": 3e-05, "loss": 1.5405, "num_input_tokens_seen": 99073344, "step": 2810 }, { "epoch": 0.22749101909623748, "grad_norm": 0.3003792464733124, "learning_rate": 3e-05, "loss": 1.5542, "num_input_tokens_seen": 99405504, "step": 2820 }, { "epoch": 0.2282977248377135, "grad_norm": 0.30942708253860474, "learning_rate": 3e-05, "loss": 1.603, "num_input_tokens_seen": 99721668, "step": 2830 }, { "epoch": 0.2291044305791895, "grad_norm": 0.3059990704059601, "learning_rate": 3e-05, "loss": 1.5811, "num_input_tokens_seen": 100103932, "step": 2840 }, { "epoch": 0.22991113632066554, "grad_norm": 0.28223365545272827, "learning_rate": 3e-05, "loss": 1.5837, "num_input_tokens_seen": 100444700, "step": 2850 }, { "epoch": 0.23071784206214155, "grad_norm": 0.3146832287311554, "learning_rate": 3e-05, "loss": 1.543, "num_input_tokens_seen": 100799240, "step": 2860 }, { "epoch": 0.23152454780361756, "grad_norm": 0.2812480628490448, "learning_rate": 3e-05, "loss": 1.573, "num_input_tokens_seen": 101167952, "step": 2870 }, { "epoch": 0.2323312535450936, "grad_norm": 0.29142189025878906, "learning_rate": 3e-05, "loss": 1.5945, "num_input_tokens_seen": 101537024, "step": 2880 }, { "epoch": 0.2331379592865696, "grad_norm": 0.2754380404949188, "learning_rate": 3e-05, "loss": 1.6187, "num_input_tokens_seen": 101891288, "step": 2890 }, { "epoch": 0.23394466502804562, "grad_norm": 0.2767621576786041, "learning_rate": 3e-05, "loss": 1.5483, "num_input_tokens_seen": 102222636, "step": 2900 }, { "epoch": 0.23475137076952166, "grad_norm": 0.3091464638710022, "learning_rate": 3e-05, "loss": 1.5503, "num_input_tokens_seen": 102566644, "step": 2910 }, { "epoch": 0.23555807651099767, "grad_norm": 0.29182493686676025, "learning_rate": 3e-05, "loss": 1.5685, "num_input_tokens_seen": 102911868, "step": 2920 }, { "epoch": 0.23636478225247368, "grad_norm": 0.31178319454193115, "learning_rate": 3e-05, "loss": 1.5439, "num_input_tokens_seen": 103267188, "step": 2930 }, { "epoch": 0.23717148799394971, "grad_norm": 0.2722642719745636, "learning_rate": 3e-05, "loss": 1.5385, "num_input_tokens_seen": 103570216, "step": 2940 }, { "epoch": 0.23797819373542572, "grad_norm": 0.29112839698791504, "learning_rate": 3e-05, "loss": 1.529, "num_input_tokens_seen": 103952836, "step": 2950 }, { "epoch": 0.23878489947690174, "grad_norm": 0.33165234327316284, "learning_rate": 3e-05, "loss": 1.5905, "num_input_tokens_seen": 104312972, "step": 2960 }, { "epoch": 0.23959160521837777, "grad_norm": 0.283861368894577, "learning_rate": 3e-05, "loss": 1.5791, "num_input_tokens_seen": 104674176, "step": 2970 }, { "epoch": 0.24039831095985378, "grad_norm": 0.29667556285858154, "learning_rate": 3e-05, "loss": 1.5679, "num_input_tokens_seen": 105000536, "step": 2980 }, { "epoch": 0.2412050167013298, "grad_norm": 0.2613981068134308, "learning_rate": 3e-05, "loss": 1.5683, "num_input_tokens_seen": 105334356, "step": 2990 }, { "epoch": 0.24201172244280583, "grad_norm": 0.32442784309387207, "learning_rate": 3e-05, "loss": 1.5522, "num_input_tokens_seen": 105707144, "step": 3000 }, { "epoch": 0.24201172244280583, "eval_gen_len": 537.77, "eval_loss": 1.490655779838562, "eval_rouge1": 30.4033, "eval_rouge2": 16.1354, "eval_rougeL": 24.7244, "eval_rougeLsum": 28.5037, "eval_runtime": 1529.7468, "eval_samples_per_second": 0.131, "eval_steps_per_second": 0.033, "num_input_tokens_seen": 105707144, "step": 3000 }, { "epoch": 0.24281842818428184, "grad_norm": 0.25999367237091064, "learning_rate": 3e-05, "loss": 1.5511, "num_input_tokens_seen": 106039376, "step": 3010 }, { "epoch": 0.24362513392575785, "grad_norm": 0.30608776211738586, "learning_rate": 3e-05, "loss": 1.5551, "num_input_tokens_seen": 106400776, "step": 3020 }, { "epoch": 0.2444318396672339, "grad_norm": 0.2672644257545471, "learning_rate": 3e-05, "loss": 1.5703, "num_input_tokens_seen": 106753976, "step": 3030 }, { "epoch": 0.2452385454087099, "grad_norm": 0.2924732565879822, "learning_rate": 3e-05, "loss": 1.5668, "num_input_tokens_seen": 107084116, "step": 3040 }, { "epoch": 0.2460452511501859, "grad_norm": 0.26746517419815063, "learning_rate": 3e-05, "loss": 1.5731, "num_input_tokens_seen": 107445220, "step": 3050 }, { "epoch": 0.24685195689166195, "grad_norm": 0.2895317077636719, "learning_rate": 3e-05, "loss": 1.5477, "num_input_tokens_seen": 107824932, "step": 3060 }, { "epoch": 0.24765866263313796, "grad_norm": 0.3116007447242737, "learning_rate": 3e-05, "loss": 1.5384, "num_input_tokens_seen": 108169544, "step": 3070 }, { "epoch": 0.24846536837461397, "grad_norm": 0.30636924505233765, "learning_rate": 3e-05, "loss": 1.548, "num_input_tokens_seen": 108509580, "step": 3080 }, { "epoch": 0.24927207411609, "grad_norm": 0.2778127193450928, "learning_rate": 3e-05, "loss": 1.5389, "num_input_tokens_seen": 108841048, "step": 3090 }, { "epoch": 0.250078779857566, "grad_norm": 0.33867573738098145, "learning_rate": 3e-05, "loss": 1.5356, "num_input_tokens_seen": 109174340, "step": 3100 }, { "epoch": 0.25088548559904206, "grad_norm": 0.3052271604537964, "learning_rate": 3e-05, "loss": 1.5869, "num_input_tokens_seen": 109536332, "step": 3110 }, { "epoch": 0.25169219134051807, "grad_norm": 0.3291682004928589, "learning_rate": 3e-05, "loss": 1.5583, "num_input_tokens_seen": 109876576, "step": 3120 }, { "epoch": 0.2524988970819941, "grad_norm": 0.27373817563056946, "learning_rate": 3e-05, "loss": 1.5523, "num_input_tokens_seen": 110248928, "step": 3130 }, { "epoch": 0.2533056028234701, "grad_norm": 0.2915042042732239, "learning_rate": 3e-05, "loss": 1.531, "num_input_tokens_seen": 110605440, "step": 3140 }, { "epoch": 0.2541123085649461, "grad_norm": 0.2974439561367035, "learning_rate": 3e-05, "loss": 1.5545, "num_input_tokens_seen": 110951152, "step": 3150 }, { "epoch": 0.2549190143064221, "grad_norm": 0.2974379062652588, "learning_rate": 3e-05, "loss": 1.5396, "num_input_tokens_seen": 111293688, "step": 3160 }, { "epoch": 0.2557257200478982, "grad_norm": 0.28520846366882324, "learning_rate": 3e-05, "loss": 1.553, "num_input_tokens_seen": 111657012, "step": 3170 }, { "epoch": 0.2565324257893742, "grad_norm": 0.2918589413166046, "learning_rate": 3e-05, "loss": 1.5384, "num_input_tokens_seen": 112000840, "step": 3180 }, { "epoch": 0.2573391315308502, "grad_norm": 0.2972608208656311, "learning_rate": 3e-05, "loss": 1.5092, "num_input_tokens_seen": 112363632, "step": 3190 }, { "epoch": 0.2581458372723262, "grad_norm": 0.28906238079071045, "learning_rate": 3e-05, "loss": 1.504, "num_input_tokens_seen": 112755768, "step": 3200 }, { "epoch": 0.2589525430138022, "grad_norm": 0.3328370451927185, "learning_rate": 3e-05, "loss": 1.4841, "num_input_tokens_seen": 113115408, "step": 3210 }, { "epoch": 0.2597592487552782, "grad_norm": 0.276845246553421, "learning_rate": 3e-05, "loss": 1.5259, "num_input_tokens_seen": 113485700, "step": 3220 }, { "epoch": 0.2605659544967543, "grad_norm": 0.2899667024612427, "learning_rate": 3e-05, "loss": 1.5442, "num_input_tokens_seen": 113815188, "step": 3230 }, { "epoch": 0.2613726602382303, "grad_norm": 0.2876961827278137, "learning_rate": 3e-05, "loss": 1.5318, "num_input_tokens_seen": 114160588, "step": 3240 }, { "epoch": 0.2621793659797063, "grad_norm": 0.28680142760276794, "learning_rate": 3e-05, "loss": 1.5557, "num_input_tokens_seen": 114495188, "step": 3250 }, { "epoch": 0.2629860717211823, "grad_norm": 0.3168465495109558, "learning_rate": 3e-05, "loss": 1.5693, "num_input_tokens_seen": 114854536, "step": 3260 }, { "epoch": 0.26379277746265833, "grad_norm": 0.28036338090896606, "learning_rate": 3e-05, "loss": 1.4784, "num_input_tokens_seen": 115203172, "step": 3270 }, { "epoch": 0.26459948320413434, "grad_norm": 0.3073316514492035, "learning_rate": 3e-05, "loss": 1.5274, "num_input_tokens_seen": 115585256, "step": 3280 }, { "epoch": 0.2654061889456104, "grad_norm": 0.28101012110710144, "learning_rate": 3e-05, "loss": 1.5496, "num_input_tokens_seen": 115922364, "step": 3290 }, { "epoch": 0.2662128946870864, "grad_norm": 0.2771126329898834, "learning_rate": 3e-05, "loss": 1.5634, "num_input_tokens_seen": 116267708, "step": 3300 }, { "epoch": 0.26701960042856243, "grad_norm": 0.3039109408855438, "learning_rate": 3e-05, "loss": 1.4806, "num_input_tokens_seen": 116607404, "step": 3310 }, { "epoch": 0.26782630617003844, "grad_norm": 0.2795468270778656, "learning_rate": 3e-05, "loss": 1.5449, "num_input_tokens_seen": 116917480, "step": 3320 }, { "epoch": 0.26863301191151445, "grad_norm": 0.2998358905315399, "learning_rate": 3e-05, "loss": 1.5067, "num_input_tokens_seen": 117255260, "step": 3330 }, { "epoch": 0.26943971765299046, "grad_norm": 0.3048727810382843, "learning_rate": 3e-05, "loss": 1.5021, "num_input_tokens_seen": 117596360, "step": 3340 }, { "epoch": 0.27024642339446653, "grad_norm": 0.31331056356430054, "learning_rate": 3e-05, "loss": 1.5621, "num_input_tokens_seen": 117967920, "step": 3350 }, { "epoch": 0.27105312913594254, "grad_norm": 0.3083108961582184, "learning_rate": 3e-05, "loss": 1.4923, "num_input_tokens_seen": 118314268, "step": 3360 }, { "epoch": 0.27185983487741855, "grad_norm": 0.36439692974090576, "learning_rate": 3e-05, "loss": 1.5368, "num_input_tokens_seen": 118658628, "step": 3370 }, { "epoch": 0.27266654061889456, "grad_norm": 0.2711757719516754, "learning_rate": 3e-05, "loss": 1.5048, "num_input_tokens_seen": 119018540, "step": 3380 }, { "epoch": 0.27347324636037057, "grad_norm": 0.2828957438468933, "learning_rate": 3e-05, "loss": 1.5502, "num_input_tokens_seen": 119366464, "step": 3390 }, { "epoch": 0.2742799521018466, "grad_norm": 0.3058261573314667, "learning_rate": 3e-05, "loss": 1.5167, "num_input_tokens_seen": 119713824, "step": 3400 }, { "epoch": 0.27508665784332265, "grad_norm": 0.2823350429534912, "learning_rate": 3e-05, "loss": 1.5371, "num_input_tokens_seen": 120077416, "step": 3410 }, { "epoch": 0.27589336358479866, "grad_norm": 0.2950865626335144, "learning_rate": 3e-05, "loss": 1.5208, "num_input_tokens_seen": 120429560, "step": 3420 }, { "epoch": 0.27670006932627467, "grad_norm": 0.2756860852241516, "learning_rate": 3e-05, "loss": 1.5479, "num_input_tokens_seen": 120775808, "step": 3430 }, { "epoch": 0.2775067750677507, "grad_norm": 0.32079747319221497, "learning_rate": 3e-05, "loss": 1.5235, "num_input_tokens_seen": 121146688, "step": 3440 }, { "epoch": 0.2783134808092267, "grad_norm": 0.2849906086921692, "learning_rate": 3e-05, "loss": 1.5281, "num_input_tokens_seen": 121511252, "step": 3450 }, { "epoch": 0.2791201865507027, "grad_norm": 0.3128233850002289, "learning_rate": 3e-05, "loss": 1.4737, "num_input_tokens_seen": 121880504, "step": 3460 }, { "epoch": 0.2799268922921787, "grad_norm": 0.281825989484787, "learning_rate": 3e-05, "loss": 1.4789, "num_input_tokens_seen": 122207764, "step": 3470 }, { "epoch": 0.2807335980336548, "grad_norm": 0.26039403676986694, "learning_rate": 3e-05, "loss": 1.519, "num_input_tokens_seen": 122556148, "step": 3480 }, { "epoch": 0.2815403037751308, "grad_norm": 0.34013232588768005, "learning_rate": 3e-05, "loss": 1.5325, "num_input_tokens_seen": 122911404, "step": 3490 }, { "epoch": 0.2823470095166068, "grad_norm": 0.3078472912311554, "learning_rate": 3e-05, "loss": 1.5493, "num_input_tokens_seen": 123295332, "step": 3500 }, { "epoch": 0.2831537152580828, "grad_norm": 0.3297036290168762, "learning_rate": 3e-05, "loss": 1.5111, "num_input_tokens_seen": 123608268, "step": 3510 }, { "epoch": 0.2839604209995588, "grad_norm": 0.2852914035320282, "learning_rate": 3e-05, "loss": 1.4844, "num_input_tokens_seen": 123923784, "step": 3520 }, { "epoch": 0.2847671267410348, "grad_norm": 0.2900603711605072, "learning_rate": 3e-05, "loss": 1.536, "num_input_tokens_seen": 124255432, "step": 3530 }, { "epoch": 0.2855738324825109, "grad_norm": 0.2996746003627777, "learning_rate": 3e-05, "loss": 1.4837, "num_input_tokens_seen": 124611088, "step": 3540 }, { "epoch": 0.2863805382239869, "grad_norm": 0.257682740688324, "learning_rate": 3e-05, "loss": 1.5242, "num_input_tokens_seen": 124959064, "step": 3550 }, { "epoch": 0.2871872439654629, "grad_norm": 0.3033203184604645, "learning_rate": 3e-05, "loss": 1.4912, "num_input_tokens_seen": 125314100, "step": 3560 }, { "epoch": 0.2879939497069389, "grad_norm": 0.3357955515384674, "learning_rate": 3e-05, "loss": 1.5057, "num_input_tokens_seen": 125628132, "step": 3570 }, { "epoch": 0.28880065544841493, "grad_norm": 0.30520308017730713, "learning_rate": 3e-05, "loss": 1.5287, "num_input_tokens_seen": 125984032, "step": 3580 }, { "epoch": 0.28960736118989094, "grad_norm": 0.3066059648990631, "learning_rate": 3e-05, "loss": 1.5461, "num_input_tokens_seen": 126339664, "step": 3590 }, { "epoch": 0.290414066931367, "grad_norm": 0.2903365194797516, "learning_rate": 3e-05, "loss": 1.5364, "num_input_tokens_seen": 126680156, "step": 3600 }, { "epoch": 0.291220772672843, "grad_norm": 0.30246102809906006, "learning_rate": 3e-05, "loss": 1.5888, "num_input_tokens_seen": 127076916, "step": 3610 }, { "epoch": 0.29202747841431903, "grad_norm": 0.28773432970046997, "learning_rate": 3e-05, "loss": 1.4945, "num_input_tokens_seen": 127418188, "step": 3620 }, { "epoch": 0.29283418415579504, "grad_norm": 0.36873912811279297, "learning_rate": 3e-05, "loss": 1.4849, "num_input_tokens_seen": 127795860, "step": 3630 }, { "epoch": 0.29364088989727105, "grad_norm": 0.31495216488838196, "learning_rate": 3e-05, "loss": 1.4918, "num_input_tokens_seen": 128127020, "step": 3640 }, { "epoch": 0.29444759563874706, "grad_norm": 0.9313835501670837, "learning_rate": 3e-05, "loss": 1.549, "num_input_tokens_seen": 128472256, "step": 3650 }, { "epoch": 0.2952543013802231, "grad_norm": 0.29919105768203735, "learning_rate": 3e-05, "loss": 1.5645, "num_input_tokens_seen": 128831764, "step": 3660 }, { "epoch": 0.29606100712169914, "grad_norm": 0.29914769530296326, "learning_rate": 3e-05, "loss": 1.4823, "num_input_tokens_seen": 129175644, "step": 3670 }, { "epoch": 0.29686771286317515, "grad_norm": 0.2776944041252136, "learning_rate": 3e-05, "loss": 1.4981, "num_input_tokens_seen": 129534220, "step": 3680 }, { "epoch": 0.29767441860465116, "grad_norm": 0.2623848021030426, "learning_rate": 3e-05, "loss": 1.5136, "num_input_tokens_seen": 129882948, "step": 3690 }, { "epoch": 0.29848112434612717, "grad_norm": 0.2865106165409088, "learning_rate": 3e-05, "loss": 1.4903, "num_input_tokens_seen": 130238792, "step": 3700 }, { "epoch": 0.2992878300876032, "grad_norm": 0.30147454142570496, "learning_rate": 3e-05, "loss": 1.4774, "num_input_tokens_seen": 130602272, "step": 3710 }, { "epoch": 0.30009453582907925, "grad_norm": 0.2756776809692383, "learning_rate": 3e-05, "loss": 1.5004, "num_input_tokens_seen": 130953160, "step": 3720 }, { "epoch": 0.30090124157055526, "grad_norm": 0.3233429193496704, "learning_rate": 3e-05, "loss": 1.5095, "num_input_tokens_seen": 131287396, "step": 3730 }, { "epoch": 0.30170794731203127, "grad_norm": 0.2846832871437073, "learning_rate": 3e-05, "loss": 1.5312, "num_input_tokens_seen": 131634640, "step": 3740 }, { "epoch": 0.3025146530535073, "grad_norm": 0.31799256801605225, "learning_rate": 3e-05, "loss": 1.539, "num_input_tokens_seen": 131998680, "step": 3750 }, { "epoch": 0.3033213587949833, "grad_norm": 0.2880600392818451, "learning_rate": 3e-05, "loss": 1.4928, "num_input_tokens_seen": 132325324, "step": 3760 }, { "epoch": 0.3041280645364593, "grad_norm": 0.3118450343608856, "learning_rate": 3e-05, "loss": 1.4899, "num_input_tokens_seen": 132681648, "step": 3770 }, { "epoch": 0.30493477027793536, "grad_norm": 0.2892366945743561, "learning_rate": 3e-05, "loss": 1.5506, "num_input_tokens_seen": 133029972, "step": 3780 }, { "epoch": 0.3057414760194114, "grad_norm": 0.26994529366493225, "learning_rate": 3e-05, "loss": 1.4862, "num_input_tokens_seen": 133381324, "step": 3790 }, { "epoch": 0.3065481817608874, "grad_norm": 0.30546241998672485, "learning_rate": 3e-05, "loss": 1.4856, "num_input_tokens_seen": 133726364, "step": 3800 }, { "epoch": 0.3073548875023634, "grad_norm": 0.31917914748191833, "learning_rate": 3e-05, "loss": 1.5729, "num_input_tokens_seen": 134081304, "step": 3810 }, { "epoch": 0.3081615932438394, "grad_norm": 0.28447583317756653, "learning_rate": 3e-05, "loss": 1.4627, "num_input_tokens_seen": 134427992, "step": 3820 }, { "epoch": 0.3089682989853154, "grad_norm": 0.2646794617176056, "learning_rate": 3e-05, "loss": 1.5402, "num_input_tokens_seen": 134791020, "step": 3830 }, { "epoch": 0.3097750047267915, "grad_norm": 0.33490800857543945, "learning_rate": 3e-05, "loss": 1.5013, "num_input_tokens_seen": 135143312, "step": 3840 }, { "epoch": 0.3105817104682675, "grad_norm": 0.28088971972465515, "learning_rate": 3e-05, "loss": 1.5179, "num_input_tokens_seen": 135461584, "step": 3850 }, { "epoch": 0.3113884162097435, "grad_norm": 0.31193193793296814, "learning_rate": 3e-05, "loss": 1.4818, "num_input_tokens_seen": 135833744, "step": 3860 }, { "epoch": 0.3121951219512195, "grad_norm": 0.2969256341457367, "learning_rate": 3e-05, "loss": 1.5094, "num_input_tokens_seen": 136187480, "step": 3870 }, { "epoch": 0.3130018276926955, "grad_norm": 0.2791529595851898, "learning_rate": 3e-05, "loss": 1.4803, "num_input_tokens_seen": 136526612, "step": 3880 }, { "epoch": 0.31380853343417153, "grad_norm": 0.2843697667121887, "learning_rate": 3e-05, "loss": 1.4858, "num_input_tokens_seen": 136911180, "step": 3890 }, { "epoch": 0.31461523917564754, "grad_norm": 0.289218932390213, "learning_rate": 3e-05, "loss": 1.4901, "num_input_tokens_seen": 137252616, "step": 3900 }, { "epoch": 0.3154219449171236, "grad_norm": 0.2953207790851593, "learning_rate": 3e-05, "loss": 1.498, "num_input_tokens_seen": 137622500, "step": 3910 }, { "epoch": 0.3162286506585996, "grad_norm": 0.2963256239891052, "learning_rate": 3e-05, "loss": 1.4842, "num_input_tokens_seen": 137965636, "step": 3920 }, { "epoch": 0.31703535640007563, "grad_norm": 0.26671716570854187, "learning_rate": 3e-05, "loss": 1.4532, "num_input_tokens_seen": 138320552, "step": 3930 }, { "epoch": 0.31784206214155164, "grad_norm": 0.2607724368572235, "learning_rate": 3e-05, "loss": 1.4774, "num_input_tokens_seen": 138682864, "step": 3940 }, { "epoch": 0.31864876788302765, "grad_norm": 0.25891661643981934, "learning_rate": 3e-05, "loss": 1.4808, "num_input_tokens_seen": 139009880, "step": 3950 }, { "epoch": 0.31945547362450366, "grad_norm": 0.2629043161869049, "learning_rate": 3e-05, "loss": 1.5152, "num_input_tokens_seen": 139349380, "step": 3960 }, { "epoch": 0.3202621793659797, "grad_norm": 0.2573290765285492, "learning_rate": 3e-05, "loss": 1.4592, "num_input_tokens_seen": 139690036, "step": 3970 }, { "epoch": 0.32106888510745574, "grad_norm": 0.291111022233963, "learning_rate": 3e-05, "loss": 1.5394, "num_input_tokens_seen": 140029928, "step": 3980 }, { "epoch": 0.32187559084893175, "grad_norm": 0.29500630497932434, "learning_rate": 3e-05, "loss": 1.505, "num_input_tokens_seen": 140375124, "step": 3990 }, { "epoch": 0.32268229659040776, "grad_norm": 0.27471858263015747, "learning_rate": 3e-05, "loss": 1.5059, "num_input_tokens_seen": 140722844, "step": 4000 }, { "epoch": 0.32268229659040776, "eval_gen_len": 522.495, "eval_loss": 1.4203619956970215, "eval_rouge1": 34.0294, "eval_rouge2": 19.2608, "eval_rougeL": 27.9322, "eval_rougeLsum": 32.3166, "eval_runtime": 1678.7718, "eval_samples_per_second": 0.119, "eval_steps_per_second": 0.03, "num_input_tokens_seen": 140722844, "step": 4000 }, { "epoch": 0.32348900233188377, "grad_norm": 0.28979143500328064, "learning_rate": 3e-05, "loss": 1.5586, "num_input_tokens_seen": 141066960, "step": 4010 }, { "epoch": 0.3242957080733598, "grad_norm": 0.2836126983165741, "learning_rate": 3e-05, "loss": 1.4956, "num_input_tokens_seen": 141419352, "step": 4020 }, { "epoch": 0.32510241381483584, "grad_norm": 0.28655633330345154, "learning_rate": 3e-05, "loss": 1.4839, "num_input_tokens_seen": 141790804, "step": 4030 }, { "epoch": 0.32590911955631185, "grad_norm": 0.28721150755882263, "learning_rate": 3e-05, "loss": 1.5154, "num_input_tokens_seen": 142162756, "step": 4040 }, { "epoch": 0.32671582529778787, "grad_norm": 0.30329418182373047, "learning_rate": 3e-05, "loss": 1.4852, "num_input_tokens_seen": 142517624, "step": 4050 }, { "epoch": 0.3275225310392639, "grad_norm": 0.2742053270339966, "learning_rate": 3e-05, "loss": 1.4663, "num_input_tokens_seen": 142839740, "step": 4060 }, { "epoch": 0.3283292367807399, "grad_norm": 0.2814532220363617, "learning_rate": 3e-05, "loss": 1.509, "num_input_tokens_seen": 143173156, "step": 4070 }, { "epoch": 0.3291359425222159, "grad_norm": 0.3034536838531494, "learning_rate": 3e-05, "loss": 1.4528, "num_input_tokens_seen": 143537620, "step": 4080 }, { "epoch": 0.32994264826369196, "grad_norm": 0.29641520977020264, "learning_rate": 3e-05, "loss": 1.4413, "num_input_tokens_seen": 143874732, "step": 4090 }, { "epoch": 0.330749354005168, "grad_norm": 0.2924509644508362, "learning_rate": 3e-05, "loss": 1.5089, "num_input_tokens_seen": 144230600, "step": 4100 }, { "epoch": 0.331556059746644, "grad_norm": 0.2810611128807068, "learning_rate": 3e-05, "loss": 1.4568, "num_input_tokens_seen": 144595320, "step": 4110 }, { "epoch": 0.33236276548812, "grad_norm": 0.2762203812599182, "learning_rate": 3e-05, "loss": 1.488, "num_input_tokens_seen": 144946772, "step": 4120 }, { "epoch": 0.333169471229596, "grad_norm": 0.3193224370479584, "learning_rate": 3e-05, "loss": 1.4391, "num_input_tokens_seen": 145295928, "step": 4130 }, { "epoch": 0.333976176971072, "grad_norm": 0.2631831467151642, "learning_rate": 3e-05, "loss": 1.4396, "num_input_tokens_seen": 145653456, "step": 4140 }, { "epoch": 0.3347828827125481, "grad_norm": 0.27242833375930786, "learning_rate": 3e-05, "loss": 1.4471, "num_input_tokens_seen": 146017976, "step": 4150 }, { "epoch": 0.3355895884540241, "grad_norm": 0.3117299973964691, "learning_rate": 3e-05, "loss": 1.448, "num_input_tokens_seen": 146366548, "step": 4160 }, { "epoch": 0.3363962941955001, "grad_norm": 0.28237223625183105, "learning_rate": 3e-05, "loss": 1.4627, "num_input_tokens_seen": 146688608, "step": 4170 }, { "epoch": 0.3372029999369761, "grad_norm": 0.33882033824920654, "learning_rate": 3e-05, "loss": 1.4841, "num_input_tokens_seen": 147036924, "step": 4180 }, { "epoch": 0.3380097056784521, "grad_norm": 0.2639561593532562, "learning_rate": 3e-05, "loss": 1.4653, "num_input_tokens_seen": 147354544, "step": 4190 }, { "epoch": 0.33881641141992813, "grad_norm": 0.2983449101448059, "learning_rate": 3e-05, "loss": 1.5031, "num_input_tokens_seen": 147705132, "step": 4200 }, { "epoch": 0.3396231171614042, "grad_norm": 0.30153656005859375, "learning_rate": 3e-05, "loss": 1.4866, "num_input_tokens_seen": 148044316, "step": 4210 }, { "epoch": 0.3404298229028802, "grad_norm": 0.2834070026874542, "learning_rate": 3e-05, "loss": 1.4838, "num_input_tokens_seen": 148383308, "step": 4220 }, { "epoch": 0.3412365286443562, "grad_norm": 0.28662896156311035, "learning_rate": 3e-05, "loss": 1.4963, "num_input_tokens_seen": 148711800, "step": 4230 }, { "epoch": 0.34204323438583223, "grad_norm": 0.26079222559928894, "learning_rate": 3e-05, "loss": 1.4763, "num_input_tokens_seen": 149072140, "step": 4240 }, { "epoch": 0.34284994012730824, "grad_norm": 0.29420602321624756, "learning_rate": 3e-05, "loss": 1.4634, "num_input_tokens_seen": 149418364, "step": 4250 }, { "epoch": 0.34365664586878425, "grad_norm": 0.2780504524707794, "learning_rate": 3e-05, "loss": 1.4612, "num_input_tokens_seen": 149776088, "step": 4260 }, { "epoch": 0.3444633516102603, "grad_norm": 0.308002769947052, "learning_rate": 3e-05, "loss": 1.4388, "num_input_tokens_seen": 150144108, "step": 4270 }, { "epoch": 0.3452700573517363, "grad_norm": 0.33135300874710083, "learning_rate": 3e-05, "loss": 1.4682, "num_input_tokens_seen": 150494172, "step": 4280 }, { "epoch": 0.34607676309321234, "grad_norm": 0.2844593822956085, "learning_rate": 3e-05, "loss": 1.5251, "num_input_tokens_seen": 150828560, "step": 4290 }, { "epoch": 0.34688346883468835, "grad_norm": 0.3216274082660675, "learning_rate": 3e-05, "loss": 1.5058, "num_input_tokens_seen": 151201392, "step": 4300 }, { "epoch": 0.34769017457616436, "grad_norm": 0.27584394812583923, "learning_rate": 3e-05, "loss": 1.4839, "num_input_tokens_seen": 151566364, "step": 4310 }, { "epoch": 0.34849688031764037, "grad_norm": 0.2775894105434418, "learning_rate": 3e-05, "loss": 1.4803, "num_input_tokens_seen": 151904260, "step": 4320 }, { "epoch": 0.3493035860591164, "grad_norm": 0.30853790044784546, "learning_rate": 3e-05, "loss": 1.4654, "num_input_tokens_seen": 152247804, "step": 4330 }, { "epoch": 0.35011029180059244, "grad_norm": 0.2662428617477417, "learning_rate": 3e-05, "loss": 1.4837, "num_input_tokens_seen": 152605848, "step": 4340 }, { "epoch": 0.35091699754206845, "grad_norm": 0.296486496925354, "learning_rate": 3e-05, "loss": 1.4151, "num_input_tokens_seen": 152927008, "step": 4350 }, { "epoch": 0.35172370328354446, "grad_norm": 0.314229279756546, "learning_rate": 3e-05, "loss": 1.4944, "num_input_tokens_seen": 153271476, "step": 4360 }, { "epoch": 0.3525304090250205, "grad_norm": 0.25222501158714294, "learning_rate": 3e-05, "loss": 1.5215, "num_input_tokens_seen": 153595852, "step": 4370 }, { "epoch": 0.3533371147664965, "grad_norm": 0.3103020489215851, "learning_rate": 3e-05, "loss": 1.4244, "num_input_tokens_seen": 153933224, "step": 4380 }, { "epoch": 0.3541438205079725, "grad_norm": 0.28948068618774414, "learning_rate": 3e-05, "loss": 1.4395, "num_input_tokens_seen": 154243172, "step": 4390 }, { "epoch": 0.35495052624944856, "grad_norm": 0.2793199419975281, "learning_rate": 3e-05, "loss": 1.4541, "num_input_tokens_seen": 154589252, "step": 4400 }, { "epoch": 0.35575723199092457, "grad_norm": 0.2927285432815552, "learning_rate": 3e-05, "loss": 1.4764, "num_input_tokens_seen": 154948944, "step": 4410 }, { "epoch": 0.3565639377324006, "grad_norm": 0.2556557059288025, "learning_rate": 3e-05, "loss": 1.4135, "num_input_tokens_seen": 155298440, "step": 4420 }, { "epoch": 0.3573706434738766, "grad_norm": 0.28829360008239746, "learning_rate": 3e-05, "loss": 1.4656, "num_input_tokens_seen": 155686288, "step": 4430 }, { "epoch": 0.3581773492153526, "grad_norm": 0.29673314094543457, "learning_rate": 3e-05, "loss": 1.3826, "num_input_tokens_seen": 156031180, "step": 4440 }, { "epoch": 0.3589840549568286, "grad_norm": 0.2608402371406555, "learning_rate": 3e-05, "loss": 1.4831, "num_input_tokens_seen": 156361652, "step": 4450 }, { "epoch": 0.3597907606983047, "grad_norm": 0.2800503075122833, "learning_rate": 3e-05, "loss": 1.4343, "num_input_tokens_seen": 156701024, "step": 4460 }, { "epoch": 0.3605974664397807, "grad_norm": 0.28234806656837463, "learning_rate": 3e-05, "loss": 1.4798, "num_input_tokens_seen": 157070896, "step": 4470 }, { "epoch": 0.3614041721812567, "grad_norm": 0.27914923429489136, "learning_rate": 3e-05, "loss": 1.4497, "num_input_tokens_seen": 157420460, "step": 4480 }, { "epoch": 0.3622108779227327, "grad_norm": 0.2710079550743103, "learning_rate": 3e-05, "loss": 1.4706, "num_input_tokens_seen": 157779212, "step": 4490 }, { "epoch": 0.3630175836642087, "grad_norm": 0.28353649377822876, "learning_rate": 3e-05, "loss": 1.4075, "num_input_tokens_seen": 158084872, "step": 4500 }, { "epoch": 0.36382428940568473, "grad_norm": 0.28383737802505493, "learning_rate": 3e-05, "loss": 1.4363, "num_input_tokens_seen": 158417664, "step": 4510 }, { "epoch": 0.3646309951471608, "grad_norm": 0.27592507004737854, "learning_rate": 3e-05, "loss": 1.4278, "num_input_tokens_seen": 158733056, "step": 4520 }, { "epoch": 0.3654377008886368, "grad_norm": 0.26034659147262573, "learning_rate": 3e-05, "loss": 1.4583, "num_input_tokens_seen": 159062868, "step": 4530 }, { "epoch": 0.3662444066301128, "grad_norm": 0.26085537672042847, "learning_rate": 3e-05, "loss": 1.4116, "num_input_tokens_seen": 159421052, "step": 4540 }, { "epoch": 0.36705111237158883, "grad_norm": 0.26964882016181946, "learning_rate": 3e-05, "loss": 1.4616, "num_input_tokens_seen": 159782660, "step": 4550 }, { "epoch": 0.36785781811306484, "grad_norm": 0.28062888979911804, "learning_rate": 3e-05, "loss": 1.4085, "num_input_tokens_seen": 160124688, "step": 4560 }, { "epoch": 0.36866452385454085, "grad_norm": 0.2562553286552429, "learning_rate": 3e-05, "loss": 1.4625, "num_input_tokens_seen": 160513904, "step": 4570 }, { "epoch": 0.3694712295960169, "grad_norm": 0.29400065541267395, "learning_rate": 3e-05, "loss": 1.442, "num_input_tokens_seen": 160867220, "step": 4580 }, { "epoch": 0.3702779353374929, "grad_norm": 0.2740069627761841, "learning_rate": 3e-05, "loss": 1.4217, "num_input_tokens_seen": 161238568, "step": 4590 }, { "epoch": 0.37108464107896894, "grad_norm": 0.28682824969291687, "learning_rate": 3e-05, "loss": 1.492, "num_input_tokens_seen": 161589304, "step": 4600 }, { "epoch": 0.37189134682044495, "grad_norm": 0.2908526360988617, "learning_rate": 3e-05, "loss": 1.4742, "num_input_tokens_seen": 161970132, "step": 4610 }, { "epoch": 0.37269805256192096, "grad_norm": 0.2921622097492218, "learning_rate": 3e-05, "loss": 1.4761, "num_input_tokens_seen": 162320336, "step": 4620 }, { "epoch": 0.37350475830339697, "grad_norm": 0.3282817304134369, "learning_rate": 3e-05, "loss": 1.4517, "num_input_tokens_seen": 162665048, "step": 4630 }, { "epoch": 0.37431146404487303, "grad_norm": 0.27311021089553833, "learning_rate": 3e-05, "loss": 1.4484, "num_input_tokens_seen": 163011772, "step": 4640 }, { "epoch": 0.37511816978634904, "grad_norm": 0.24732042849063873, "learning_rate": 3e-05, "loss": 1.4262, "num_input_tokens_seen": 163366004, "step": 4650 }, { "epoch": 0.37592487552782505, "grad_norm": 0.3375225365161896, "learning_rate": 3e-05, "loss": 1.4143, "num_input_tokens_seen": 163695340, "step": 4660 }, { "epoch": 0.37673158126930106, "grad_norm": 0.2611980140209198, "learning_rate": 3e-05, "loss": 1.4367, "num_input_tokens_seen": 164050628, "step": 4670 }, { "epoch": 0.3775382870107771, "grad_norm": 0.30901384353637695, "learning_rate": 3e-05, "loss": 1.458, "num_input_tokens_seen": 164403700, "step": 4680 }, { "epoch": 0.3783449927522531, "grad_norm": 0.29676762223243713, "learning_rate": 3e-05, "loss": 1.4785, "num_input_tokens_seen": 164749396, "step": 4690 }, { "epoch": 0.37915169849372915, "grad_norm": 0.29146572947502136, "learning_rate": 3e-05, "loss": 1.434, "num_input_tokens_seen": 165076256, "step": 4700 }, { "epoch": 0.37995840423520516, "grad_norm": 0.35839927196502686, "learning_rate": 3e-05, "loss": 1.4647, "num_input_tokens_seen": 165424992, "step": 4710 }, { "epoch": 0.38076510997668117, "grad_norm": 0.2916266620159149, "learning_rate": 3e-05, "loss": 1.4701, "num_input_tokens_seen": 165764352, "step": 4720 }, { "epoch": 0.3815718157181572, "grad_norm": 0.2933688163757324, "learning_rate": 3e-05, "loss": 1.4398, "num_input_tokens_seen": 166097368, "step": 4730 }, { "epoch": 0.3823785214596332, "grad_norm": 0.2589133679866791, "learning_rate": 3e-05, "loss": 1.4017, "num_input_tokens_seen": 166468532, "step": 4740 }, { "epoch": 0.3831852272011092, "grad_norm": 0.3302017152309418, "learning_rate": 3e-05, "loss": 1.4082, "num_input_tokens_seen": 166819988, "step": 4750 }, { "epoch": 0.3839919329425852, "grad_norm": 0.2915537655353546, "learning_rate": 3e-05, "loss": 1.4585, "num_input_tokens_seen": 167157084, "step": 4760 }, { "epoch": 0.3847986386840613, "grad_norm": 0.29807379841804504, "learning_rate": 3e-05, "loss": 1.4276, "num_input_tokens_seen": 167524544, "step": 4770 }, { "epoch": 0.3856053444255373, "grad_norm": 0.28128594160079956, "learning_rate": 3e-05, "loss": 1.471, "num_input_tokens_seen": 167853064, "step": 4780 }, { "epoch": 0.3864120501670133, "grad_norm": 0.2917296886444092, "learning_rate": 3e-05, "loss": 1.4871, "num_input_tokens_seen": 168220760, "step": 4790 }, { "epoch": 0.3872187559084893, "grad_norm": 0.2948204576969147, "learning_rate": 3e-05, "loss": 1.443, "num_input_tokens_seen": 168551420, "step": 4800 }, { "epoch": 0.3880254616499653, "grad_norm": 0.2919817268848419, "learning_rate": 3e-05, "loss": 1.4142, "num_input_tokens_seen": 168903208, "step": 4810 }, { "epoch": 0.38883216739144133, "grad_norm": 0.28495824337005615, "learning_rate": 3e-05, "loss": 1.4491, "num_input_tokens_seen": 169259372, "step": 4820 }, { "epoch": 0.3896388731329174, "grad_norm": 0.28058505058288574, "learning_rate": 3e-05, "loss": 1.439, "num_input_tokens_seen": 169603980, "step": 4830 }, { "epoch": 0.3904455788743934, "grad_norm": 0.27780622243881226, "learning_rate": 3e-05, "loss": 1.4333, "num_input_tokens_seen": 169969336, "step": 4840 }, { "epoch": 0.3912522846158694, "grad_norm": 0.28063181042671204, "learning_rate": 3e-05, "loss": 1.4642, "num_input_tokens_seen": 170331728, "step": 4850 }, { "epoch": 0.3920589903573454, "grad_norm": 0.2832536995410919, "learning_rate": 3e-05, "loss": 1.4097, "num_input_tokens_seen": 170698136, "step": 4860 }, { "epoch": 0.39286569609882144, "grad_norm": 0.31159868836402893, "learning_rate": 3e-05, "loss": 1.4248, "num_input_tokens_seen": 171051356, "step": 4870 }, { "epoch": 0.39367240184029745, "grad_norm": 0.3231009244918823, "learning_rate": 3e-05, "loss": 1.424, "num_input_tokens_seen": 171411700, "step": 4880 }, { "epoch": 0.3944791075817735, "grad_norm": 0.3507569432258606, "learning_rate": 3e-05, "loss": 1.4611, "num_input_tokens_seen": 171780536, "step": 4890 }, { "epoch": 0.3952858133232495, "grad_norm": 0.2700771391391754, "learning_rate": 3e-05, "loss": 1.4234, "num_input_tokens_seen": 172139560, "step": 4900 }, { "epoch": 0.39609251906472553, "grad_norm": 0.28461360931396484, "learning_rate": 3e-05, "loss": 1.4077, "num_input_tokens_seen": 172461924, "step": 4910 }, { "epoch": 0.39689922480620154, "grad_norm": 0.2726331353187561, "learning_rate": 3e-05, "loss": 1.4361, "num_input_tokens_seen": 172822620, "step": 4920 }, { "epoch": 0.39770593054767756, "grad_norm": 0.266812264919281, "learning_rate": 3e-05, "loss": 1.4222, "num_input_tokens_seen": 173165692, "step": 4930 }, { "epoch": 0.39851263628915357, "grad_norm": 0.31729623675346375, "learning_rate": 3e-05, "loss": 1.4395, "num_input_tokens_seen": 173514872, "step": 4940 }, { "epoch": 0.39931934203062963, "grad_norm": 0.2758219838142395, "learning_rate": 3e-05, "loss": 1.462, "num_input_tokens_seen": 173870404, "step": 4950 }, { "epoch": 0.40012604777210564, "grad_norm": 0.2920880615711212, "learning_rate": 3e-05, "loss": 1.4334, "num_input_tokens_seen": 174254056, "step": 4960 }, { "epoch": 0.40093275351358165, "grad_norm": 0.2842954397201538, "learning_rate": 3e-05, "loss": 1.4819, "num_input_tokens_seen": 174603984, "step": 4970 }, { "epoch": 0.40173945925505766, "grad_norm": 0.27924880385398865, "learning_rate": 3e-05, "loss": 1.4149, "num_input_tokens_seen": 174952904, "step": 4980 }, { "epoch": 0.4025461649965337, "grad_norm": 0.28720763325691223, "learning_rate": 3e-05, "loss": 1.4737, "num_input_tokens_seen": 175315668, "step": 4990 }, { "epoch": 0.4033528707380097, "grad_norm": 0.3302316963672638, "learning_rate": 3e-05, "loss": 1.4346, "num_input_tokens_seen": 175639924, "step": 5000 }, { "epoch": 0.4033528707380097, "eval_gen_len": 494.68, "eval_loss": 1.363584041595459, "eval_rouge1": 34.4104, "eval_rouge2": 19.4149, "eval_rougeL": 28.1022, "eval_rougeLsum": 32.7299, "eval_runtime": 1479.8136, "eval_samples_per_second": 0.135, "eval_steps_per_second": 0.034, "num_input_tokens_seen": 175639924, "step": 5000 }, { "epoch": 0.40415957647948575, "grad_norm": 0.264972060918808, "learning_rate": 3e-05, "loss": 1.3869, "num_input_tokens_seen": 175981392, "step": 5010 }, { "epoch": 0.40496628222096176, "grad_norm": 0.2692941129207611, "learning_rate": 3e-05, "loss": 1.4391, "num_input_tokens_seen": 176323964, "step": 5020 }, { "epoch": 0.40577298796243777, "grad_norm": 0.31324198842048645, "learning_rate": 3e-05, "loss": 1.3911, "num_input_tokens_seen": 176687880, "step": 5030 }, { "epoch": 0.4065796937039138, "grad_norm": 0.2583986222743988, "learning_rate": 3e-05, "loss": 1.4258, "num_input_tokens_seen": 177024336, "step": 5040 }, { "epoch": 0.4073863994453898, "grad_norm": 0.2632867693901062, "learning_rate": 3e-05, "loss": 1.4099, "num_input_tokens_seen": 177365432, "step": 5050 }, { "epoch": 0.4081931051868658, "grad_norm": 0.2581656277179718, "learning_rate": 3e-05, "loss": 1.3863, "num_input_tokens_seen": 177721268, "step": 5060 }, { "epoch": 0.40899981092834187, "grad_norm": 0.256698340177536, "learning_rate": 3e-05, "loss": 1.445, "num_input_tokens_seen": 178061116, "step": 5070 }, { "epoch": 0.4098065166698179, "grad_norm": 0.2994880974292755, "learning_rate": 3e-05, "loss": 1.4639, "num_input_tokens_seen": 178375268, "step": 5080 }, { "epoch": 0.4106132224112939, "grad_norm": 0.3011598587036133, "learning_rate": 3e-05, "loss": 1.4544, "num_input_tokens_seen": 178693356, "step": 5090 }, { "epoch": 0.4114199281527699, "grad_norm": 0.3107489049434662, "learning_rate": 3e-05, "loss": 1.4446, "num_input_tokens_seen": 179028016, "step": 5100 }, { "epoch": 0.4122266338942459, "grad_norm": 0.28605297207832336, "learning_rate": 3e-05, "loss": 1.394, "num_input_tokens_seen": 179372364, "step": 5110 }, { "epoch": 0.4130333396357219, "grad_norm": 0.29272472858428955, "learning_rate": 3e-05, "loss": 1.4559, "num_input_tokens_seen": 179698216, "step": 5120 }, { "epoch": 0.413840045377198, "grad_norm": 0.2901201546192169, "learning_rate": 3e-05, "loss": 1.4222, "num_input_tokens_seen": 180049712, "step": 5130 }, { "epoch": 0.414646751118674, "grad_norm": 0.3165605664253235, "learning_rate": 3e-05, "loss": 1.4017, "num_input_tokens_seen": 180424368, "step": 5140 }, { "epoch": 0.41545345686015, "grad_norm": 0.26698291301727295, "learning_rate": 3e-05, "loss": 1.427, "num_input_tokens_seen": 180756776, "step": 5150 }, { "epoch": 0.416260162601626, "grad_norm": 0.2778262197971344, "learning_rate": 3e-05, "loss": 1.4343, "num_input_tokens_seen": 181094316, "step": 5160 }, { "epoch": 0.417066868343102, "grad_norm": 0.3387869894504547, "learning_rate": 3e-05, "loss": 1.4165, "num_input_tokens_seen": 181454460, "step": 5170 }, { "epoch": 0.41787357408457804, "grad_norm": 0.2814273238182068, "learning_rate": 3e-05, "loss": 1.4512, "num_input_tokens_seen": 181811144, "step": 5180 }, { "epoch": 0.41868027982605405, "grad_norm": 0.28893864154815674, "learning_rate": 3e-05, "loss": 1.412, "num_input_tokens_seen": 182202380, "step": 5190 }, { "epoch": 0.4194869855675301, "grad_norm": 0.2955783009529114, "learning_rate": 3e-05, "loss": 1.4187, "num_input_tokens_seen": 182566948, "step": 5200 }, { "epoch": 0.4202936913090061, "grad_norm": 0.2692851722240448, "learning_rate": 3e-05, "loss": 1.4056, "num_input_tokens_seen": 182920912, "step": 5210 }, { "epoch": 0.42110039705048213, "grad_norm": 0.28022801876068115, "learning_rate": 3e-05, "loss": 1.3988, "num_input_tokens_seen": 183271764, "step": 5220 }, { "epoch": 0.42190710279195814, "grad_norm": 0.31612420082092285, "learning_rate": 3e-05, "loss": 1.4269, "num_input_tokens_seen": 183617064, "step": 5230 }, { "epoch": 0.42271380853343415, "grad_norm": 0.2966879904270172, "learning_rate": 3e-05, "loss": 1.3826, "num_input_tokens_seen": 183961216, "step": 5240 }, { "epoch": 0.42352051427491016, "grad_norm": 0.31079381704330444, "learning_rate": 3e-05, "loss": 1.3818, "num_input_tokens_seen": 184308792, "step": 5250 }, { "epoch": 0.42432722001638623, "grad_norm": 0.28356415033340454, "learning_rate": 3e-05, "loss": 1.4443, "num_input_tokens_seen": 184652412, "step": 5260 }, { "epoch": 0.42513392575786224, "grad_norm": 0.2671275734901428, "learning_rate": 3e-05, "loss": 1.4097, "num_input_tokens_seen": 185005656, "step": 5270 }, { "epoch": 0.42594063149933825, "grad_norm": 0.3049359917640686, "learning_rate": 3e-05, "loss": 1.3983, "num_input_tokens_seen": 185364004, "step": 5280 }, { "epoch": 0.42674733724081426, "grad_norm": 0.26577872037887573, "learning_rate": 3e-05, "loss": 1.4389, "num_input_tokens_seen": 185721984, "step": 5290 }, { "epoch": 0.42755404298229027, "grad_norm": 0.27239790558815, "learning_rate": 3e-05, "loss": 1.4502, "num_input_tokens_seen": 186059416, "step": 5300 }, { "epoch": 0.4283607487237663, "grad_norm": 0.30805954337120056, "learning_rate": 3e-05, "loss": 1.4108, "num_input_tokens_seen": 186400908, "step": 5310 }, { "epoch": 0.42916745446524235, "grad_norm": 0.27232635021209717, "learning_rate": 3e-05, "loss": 1.3694, "num_input_tokens_seen": 186757120, "step": 5320 }, { "epoch": 0.42997416020671836, "grad_norm": 0.30555519461631775, "learning_rate": 3e-05, "loss": 1.3979, "num_input_tokens_seen": 187084720, "step": 5330 }, { "epoch": 0.43078086594819437, "grad_norm": 0.2889952063560486, "learning_rate": 3e-05, "loss": 1.3979, "num_input_tokens_seen": 187430864, "step": 5340 }, { "epoch": 0.4315875716896704, "grad_norm": 0.28782588243484497, "learning_rate": 3e-05, "loss": 1.4026, "num_input_tokens_seen": 187772016, "step": 5350 }, { "epoch": 0.4323942774311464, "grad_norm": 0.25100380182266235, "learning_rate": 3e-05, "loss": 1.3516, "num_input_tokens_seen": 188123096, "step": 5360 }, { "epoch": 0.4332009831726224, "grad_norm": 0.2925686240196228, "learning_rate": 3e-05, "loss": 1.4206, "num_input_tokens_seen": 188491824, "step": 5370 }, { "epoch": 0.43400768891409847, "grad_norm": 0.27262914180755615, "learning_rate": 3e-05, "loss": 1.4259, "num_input_tokens_seen": 188838176, "step": 5380 }, { "epoch": 0.4348143946555745, "grad_norm": 0.2965831458568573, "learning_rate": 3e-05, "loss": 1.4348, "num_input_tokens_seen": 189176428, "step": 5390 }, { "epoch": 0.4356211003970505, "grad_norm": 0.29133981466293335, "learning_rate": 3e-05, "loss": 1.41, "num_input_tokens_seen": 189532172, "step": 5400 }, { "epoch": 0.4364278061385265, "grad_norm": 0.2646975815296173, "learning_rate": 3e-05, "loss": 1.4505, "num_input_tokens_seen": 189883400, "step": 5410 }, { "epoch": 0.4372345118800025, "grad_norm": 0.2631090581417084, "learning_rate": 3e-05, "loss": 1.3669, "num_input_tokens_seen": 190248452, "step": 5420 }, { "epoch": 0.4380412176214785, "grad_norm": 0.2600938379764557, "learning_rate": 3e-05, "loss": 1.3874, "num_input_tokens_seen": 190583324, "step": 5430 }, { "epoch": 0.4388479233629546, "grad_norm": 0.2651340663433075, "learning_rate": 3e-05, "loss": 1.4112, "num_input_tokens_seen": 190932528, "step": 5440 }, { "epoch": 0.4396546291044306, "grad_norm": 0.2757515609264374, "learning_rate": 3e-05, "loss": 1.4233, "num_input_tokens_seen": 191266632, "step": 5450 }, { "epoch": 0.4404613348459066, "grad_norm": 0.3117634057998657, "learning_rate": 3e-05, "loss": 1.3617, "num_input_tokens_seen": 191594048, "step": 5460 }, { "epoch": 0.4412680405873826, "grad_norm": 0.27428796887397766, "learning_rate": 3e-05, "loss": 1.3699, "num_input_tokens_seen": 191959780, "step": 5470 }, { "epoch": 0.4420747463288586, "grad_norm": 0.2628273367881775, "learning_rate": 3e-05, "loss": 1.4274, "num_input_tokens_seen": 192275380, "step": 5480 }, { "epoch": 0.44288145207033464, "grad_norm": 0.26145341992378235, "learning_rate": 3e-05, "loss": 1.4375, "num_input_tokens_seen": 192635040, "step": 5490 }, { "epoch": 0.4436881578118107, "grad_norm": 0.2731001675128937, "learning_rate": 3e-05, "loss": 1.4172, "num_input_tokens_seen": 192991412, "step": 5500 }, { "epoch": 0.4444948635532867, "grad_norm": 0.2722030282020569, "learning_rate": 3e-05, "loss": 1.3866, "num_input_tokens_seen": 193320880, "step": 5510 }, { "epoch": 0.4453015692947627, "grad_norm": 0.29632169008255005, "learning_rate": 3e-05, "loss": 1.447, "num_input_tokens_seen": 193669744, "step": 5520 }, { "epoch": 0.44610827503623873, "grad_norm": 0.28086063265800476, "learning_rate": 3e-05, "loss": 1.4636, "num_input_tokens_seen": 194026876, "step": 5530 }, { "epoch": 0.44691498077771474, "grad_norm": 0.29540812969207764, "learning_rate": 3e-05, "loss": 1.3922, "num_input_tokens_seen": 194367412, "step": 5540 }, { "epoch": 0.44772168651919075, "grad_norm": 0.2671002447605133, "learning_rate": 3e-05, "loss": 1.4226, "num_input_tokens_seen": 194713016, "step": 5550 }, { "epoch": 0.4485283922606668, "grad_norm": 0.2889344394207001, "learning_rate": 3e-05, "loss": 1.4291, "num_input_tokens_seen": 195084592, "step": 5560 }, { "epoch": 0.44933509800214283, "grad_norm": 0.28490033745765686, "learning_rate": 3e-05, "loss": 1.3612, "num_input_tokens_seen": 195367436, "step": 5570 }, { "epoch": 0.45014180374361884, "grad_norm": 0.25098714232444763, "learning_rate": 3e-05, "loss": 1.4348, "num_input_tokens_seen": 195724988, "step": 5580 }, { "epoch": 0.45094850948509485, "grad_norm": 0.28072845935821533, "learning_rate": 3e-05, "loss": 1.4031, "num_input_tokens_seen": 196114160, "step": 5590 }, { "epoch": 0.45175521522657086, "grad_norm": 0.26970839500427246, "learning_rate": 3e-05, "loss": 1.3853, "num_input_tokens_seen": 196463832, "step": 5600 }, { "epoch": 0.45256192096804687, "grad_norm": 0.2835977375507355, "learning_rate": 3e-05, "loss": 1.4153, "num_input_tokens_seen": 196815808, "step": 5610 }, { "epoch": 0.4533686267095229, "grad_norm": 0.3386438190937042, "learning_rate": 3e-05, "loss": 1.363, "num_input_tokens_seen": 197182716, "step": 5620 }, { "epoch": 0.45417533245099895, "grad_norm": 0.2961023449897766, "learning_rate": 3e-05, "loss": 1.4127, "num_input_tokens_seen": 197526772, "step": 5630 }, { "epoch": 0.45498203819247496, "grad_norm": 0.29476794600486755, "learning_rate": 3e-05, "loss": 1.4113, "num_input_tokens_seen": 197878328, "step": 5640 }, { "epoch": 0.45578874393395097, "grad_norm": 0.305695503950119, "learning_rate": 3e-05, "loss": 1.4272, "num_input_tokens_seen": 198239360, "step": 5650 }, { "epoch": 0.456595449675427, "grad_norm": 0.2787207365036011, "learning_rate": 3e-05, "loss": 1.4079, "num_input_tokens_seen": 198581888, "step": 5660 }, { "epoch": 0.457402155416903, "grad_norm": 0.2544805705547333, "learning_rate": 3e-05, "loss": 1.372, "num_input_tokens_seen": 198930636, "step": 5670 }, { "epoch": 0.458208861158379, "grad_norm": 0.2546211779117584, "learning_rate": 3e-05, "loss": 1.3958, "num_input_tokens_seen": 199280792, "step": 5680 }, { "epoch": 0.45901556689985507, "grad_norm": 0.2609899938106537, "learning_rate": 3e-05, "loss": 1.4331, "num_input_tokens_seen": 199631024, "step": 5690 }, { "epoch": 0.4598222726413311, "grad_norm": 0.2949337363243103, "learning_rate": 3e-05, "loss": 1.4179, "num_input_tokens_seen": 199979964, "step": 5700 }, { "epoch": 0.4606289783828071, "grad_norm": 0.2916325032711029, "learning_rate": 3e-05, "loss": 1.4107, "num_input_tokens_seen": 200317260, "step": 5710 }, { "epoch": 0.4614356841242831, "grad_norm": 0.2985553741455078, "learning_rate": 3e-05, "loss": 1.3435, "num_input_tokens_seen": 200706164, "step": 5720 }, { "epoch": 0.4622423898657591, "grad_norm": 0.29759296774864197, "learning_rate": 3e-05, "loss": 1.3962, "num_input_tokens_seen": 201041936, "step": 5730 }, { "epoch": 0.4630490956072351, "grad_norm": 0.2666504383087158, "learning_rate": 3e-05, "loss": 1.3736, "num_input_tokens_seen": 201384532, "step": 5740 }, { "epoch": 0.4638558013487112, "grad_norm": 0.2790429890155792, "learning_rate": 3e-05, "loss": 1.3505, "num_input_tokens_seen": 201732544, "step": 5750 }, { "epoch": 0.4646625070901872, "grad_norm": 0.27765849232673645, "learning_rate": 3e-05, "loss": 1.4432, "num_input_tokens_seen": 202072132, "step": 5760 }, { "epoch": 0.4654692128316632, "grad_norm": 0.27785608172416687, "learning_rate": 3e-05, "loss": 1.4137, "num_input_tokens_seen": 202425292, "step": 5770 }, { "epoch": 0.4662759185731392, "grad_norm": 0.3008098900318146, "learning_rate": 3e-05, "loss": 1.3844, "num_input_tokens_seen": 202754488, "step": 5780 }, { "epoch": 0.4670826243146152, "grad_norm": 0.2869485318660736, "learning_rate": 3e-05, "loss": 1.3913, "num_input_tokens_seen": 203080408, "step": 5790 }, { "epoch": 0.46788933005609123, "grad_norm": 0.2760210335254669, "learning_rate": 3e-05, "loss": 1.3775, "num_input_tokens_seen": 203433440, "step": 5800 }, { "epoch": 0.4686960357975673, "grad_norm": 0.29998424649238586, "learning_rate": 3e-05, "loss": 1.3395, "num_input_tokens_seen": 203773520, "step": 5810 }, { "epoch": 0.4695027415390433, "grad_norm": 0.26301300525665283, "learning_rate": 3e-05, "loss": 1.3362, "num_input_tokens_seen": 204128604, "step": 5820 }, { "epoch": 0.4703094472805193, "grad_norm": 0.2580535113811493, "learning_rate": 3e-05, "loss": 1.4004, "num_input_tokens_seen": 204494312, "step": 5830 }, { "epoch": 0.47111615302199533, "grad_norm": 0.25355467200279236, "learning_rate": 3e-05, "loss": 1.3836, "num_input_tokens_seen": 204826752, "step": 5840 }, { "epoch": 0.47192285876347134, "grad_norm": 0.2825932502746582, "learning_rate": 3e-05, "loss": 1.3717, "num_input_tokens_seen": 205188700, "step": 5850 }, { "epoch": 0.47272956450494735, "grad_norm": 0.3444035053253174, "learning_rate": 3e-05, "loss": 1.3581, "num_input_tokens_seen": 205529408, "step": 5860 }, { "epoch": 0.4735362702464234, "grad_norm": 0.25847604870796204, "learning_rate": 3e-05, "loss": 1.357, "num_input_tokens_seen": 205868532, "step": 5870 }, { "epoch": 0.47434297598789943, "grad_norm": 0.2876322269439697, "learning_rate": 3e-05, "loss": 1.3783, "num_input_tokens_seen": 206205984, "step": 5880 }, { "epoch": 0.47514968172937544, "grad_norm": 0.27320173382759094, "learning_rate": 3e-05, "loss": 1.4018, "num_input_tokens_seen": 206585048, "step": 5890 }, { "epoch": 0.47595638747085145, "grad_norm": 0.31563153862953186, "learning_rate": 3e-05, "loss": 1.4208, "num_input_tokens_seen": 206935052, "step": 5900 }, { "epoch": 0.47676309321232746, "grad_norm": 0.29032954573631287, "learning_rate": 3e-05, "loss": 1.404, "num_input_tokens_seen": 207275132, "step": 5910 }, { "epoch": 0.47756979895380347, "grad_norm": 0.27211418747901917, "learning_rate": 3e-05, "loss": 1.3487, "num_input_tokens_seen": 207633312, "step": 5920 }, { "epoch": 0.47837650469527954, "grad_norm": 0.3004505932331085, "learning_rate": 3e-05, "loss": 1.3679, "num_input_tokens_seen": 207984000, "step": 5930 }, { "epoch": 0.47918321043675555, "grad_norm": 0.25671249628067017, "learning_rate": 3e-05, "loss": 1.389, "num_input_tokens_seen": 208302920, "step": 5940 }, { "epoch": 0.47998991617823156, "grad_norm": 0.29051607847213745, "learning_rate": 3e-05, "loss": 1.4225, "num_input_tokens_seen": 208680580, "step": 5950 }, { "epoch": 0.48079662191970757, "grad_norm": 0.2684350311756134, "learning_rate": 3e-05, "loss": 1.4091, "num_input_tokens_seen": 209017588, "step": 5960 }, { "epoch": 0.4816033276611836, "grad_norm": 0.28748780488967896, "learning_rate": 3e-05, "loss": 1.396, "num_input_tokens_seen": 209385672, "step": 5970 }, { "epoch": 0.4824100334026596, "grad_norm": 0.26985928416252136, "learning_rate": 3e-05, "loss": 1.3779, "num_input_tokens_seen": 209735156, "step": 5980 }, { "epoch": 0.48321673914413565, "grad_norm": 0.26085472106933594, "learning_rate": 3e-05, "loss": 1.3524, "num_input_tokens_seen": 210074668, "step": 5990 }, { "epoch": 0.48402344488561166, "grad_norm": 0.30844658613204956, "learning_rate": 3e-05, "loss": 1.3912, "num_input_tokens_seen": 210409328, "step": 6000 }, { "epoch": 0.48402344488561166, "eval_gen_len": 469.885, "eval_loss": 1.3159054517745972, "eval_rouge1": 36.5059, "eval_rouge2": 21.2447, "eval_rougeL": 30.116, "eval_rougeLsum": 34.7303, "eval_runtime": 1601.8691, "eval_samples_per_second": 0.125, "eval_steps_per_second": 0.031, "num_input_tokens_seen": 210409328, "step": 6000 }, { "epoch": 0.4848301506270877, "grad_norm": 0.3409636914730072, "learning_rate": 3e-05, "loss": 1.3814, "num_input_tokens_seen": 210757000, "step": 6010 }, { "epoch": 0.4856368563685637, "grad_norm": 0.285645067691803, "learning_rate": 3e-05, "loss": 1.3968, "num_input_tokens_seen": 211087396, "step": 6020 }, { "epoch": 0.4864435621100397, "grad_norm": 0.23893733322620392, "learning_rate": 3e-05, "loss": 1.3671, "num_input_tokens_seen": 211458184, "step": 6030 }, { "epoch": 0.4872502678515157, "grad_norm": 0.2706129252910614, "learning_rate": 3e-05, "loss": 1.3598, "num_input_tokens_seen": 211782916, "step": 6040 }, { "epoch": 0.4880569735929917, "grad_norm": 0.2617262601852417, "learning_rate": 3e-05, "loss": 1.3718, "num_input_tokens_seen": 212146920, "step": 6050 }, { "epoch": 0.4888636793344678, "grad_norm": 0.2852620780467987, "learning_rate": 3e-05, "loss": 1.3483, "num_input_tokens_seen": 212525416, "step": 6060 }, { "epoch": 0.4896703850759438, "grad_norm": 0.30544915795326233, "learning_rate": 3e-05, "loss": 1.3977, "num_input_tokens_seen": 212850600, "step": 6070 }, { "epoch": 0.4904770908174198, "grad_norm": 0.2675735056400299, "learning_rate": 3e-05, "loss": 1.383, "num_input_tokens_seen": 213202036, "step": 6080 }, { "epoch": 0.4912837965588958, "grad_norm": 0.2696596086025238, "learning_rate": 3e-05, "loss": 1.3809, "num_input_tokens_seen": 213537836, "step": 6090 }, { "epoch": 0.4920905023003718, "grad_norm": 0.281474769115448, "learning_rate": 3e-05, "loss": 1.351, "num_input_tokens_seen": 213910248, "step": 6100 }, { "epoch": 0.49289720804184783, "grad_norm": 0.3014686405658722, "learning_rate": 3e-05, "loss": 1.381, "num_input_tokens_seen": 214261100, "step": 6110 }, { "epoch": 0.4937039137833239, "grad_norm": 0.27863389253616333, "learning_rate": 3e-05, "loss": 1.3878, "num_input_tokens_seen": 214636128, "step": 6120 }, { "epoch": 0.4945106195247999, "grad_norm": 0.28771695494651794, "learning_rate": 3e-05, "loss": 1.3721, "num_input_tokens_seen": 215003756, "step": 6130 }, { "epoch": 0.4953173252662759, "grad_norm": 0.23946808278560638, "learning_rate": 3e-05, "loss": 1.3648, "num_input_tokens_seen": 215364600, "step": 6140 }, { "epoch": 0.49612403100775193, "grad_norm": 0.2888747751712799, "learning_rate": 3e-05, "loss": 1.4156, "num_input_tokens_seen": 215725344, "step": 6150 }, { "epoch": 0.49693073674922794, "grad_norm": 0.2734207808971405, "learning_rate": 3e-05, "loss": 1.3646, "num_input_tokens_seen": 216095268, "step": 6160 }, { "epoch": 0.49773744249070395, "grad_norm": 0.2685578167438507, "learning_rate": 3e-05, "loss": 1.3459, "num_input_tokens_seen": 216469020, "step": 6170 }, { "epoch": 0.49854414823218, "grad_norm": 0.2771487236022949, "learning_rate": 3e-05, "loss": 1.3673, "num_input_tokens_seen": 216823048, "step": 6180 }, { "epoch": 0.49935085397365603, "grad_norm": 0.2881760895252228, "learning_rate": 3e-05, "loss": 1.3973, "num_input_tokens_seen": 217211200, "step": 6190 }, { "epoch": 0.500157559715132, "grad_norm": 0.2920476496219635, "learning_rate": 3e-05, "loss": 1.3809, "num_input_tokens_seen": 217569052, "step": 6200 }, { "epoch": 0.5009642654566081, "grad_norm": 0.28796783089637756, "learning_rate": 3e-05, "loss": 1.3666, "num_input_tokens_seen": 217919988, "step": 6210 }, { "epoch": 0.5017709711980841, "grad_norm": 0.24618837237358093, "learning_rate": 3e-05, "loss": 1.3739, "num_input_tokens_seen": 218277224, "step": 6220 }, { "epoch": 0.5025776769395601, "grad_norm": 0.2835310995578766, "learning_rate": 3e-05, "loss": 1.3831, "num_input_tokens_seen": 218633308, "step": 6230 }, { "epoch": 0.5033843826810361, "grad_norm": 0.25976064801216125, "learning_rate": 3e-05, "loss": 1.3267, "num_input_tokens_seen": 218982380, "step": 6240 }, { "epoch": 0.5041910884225121, "grad_norm": 0.24962379038333893, "learning_rate": 3e-05, "loss": 1.3829, "num_input_tokens_seen": 219302800, "step": 6250 }, { "epoch": 0.5049977941639882, "grad_norm": 0.2833407521247864, "learning_rate": 3e-05, "loss": 1.3575, "num_input_tokens_seen": 219657948, "step": 6260 }, { "epoch": 0.5058044999054642, "grad_norm": 0.2600440979003906, "learning_rate": 3e-05, "loss": 1.3886, "num_input_tokens_seen": 219986420, "step": 6270 }, { "epoch": 0.5066112056469402, "grad_norm": 0.28562673926353455, "learning_rate": 3e-05, "loss": 1.3903, "num_input_tokens_seen": 220335652, "step": 6280 }, { "epoch": 0.5074179113884162, "grad_norm": 0.30197736620903015, "learning_rate": 3e-05, "loss": 1.3592, "num_input_tokens_seen": 220710520, "step": 6290 }, { "epoch": 0.5082246171298922, "grad_norm": 0.35574081540107727, "learning_rate": 3e-05, "loss": 1.3804, "num_input_tokens_seen": 221048404, "step": 6300 }, { "epoch": 0.5090313228713682, "grad_norm": 0.26034465432167053, "learning_rate": 3e-05, "loss": 1.3843, "num_input_tokens_seen": 221398644, "step": 6310 }, { "epoch": 0.5098380286128442, "grad_norm": 0.27993252873420715, "learning_rate": 3e-05, "loss": 1.3452, "num_input_tokens_seen": 221766764, "step": 6320 }, { "epoch": 0.5106447343543203, "grad_norm": 0.278550386428833, "learning_rate": 3e-05, "loss": 1.3901, "num_input_tokens_seen": 222114724, "step": 6330 }, { "epoch": 0.5114514400957964, "grad_norm": 0.32215824723243713, "learning_rate": 3e-05, "loss": 1.3696, "num_input_tokens_seen": 222494148, "step": 6340 }, { "epoch": 0.5122581458372724, "grad_norm": 0.2745245099067688, "learning_rate": 3e-05, "loss": 1.3832, "num_input_tokens_seen": 222847792, "step": 6350 }, { "epoch": 0.5130648515787484, "grad_norm": 0.2977345585823059, "learning_rate": 3e-05, "loss": 1.3719, "num_input_tokens_seen": 223203024, "step": 6360 }, { "epoch": 0.5138715573202244, "grad_norm": 0.299365371465683, "learning_rate": 3e-05, "loss": 1.3602, "num_input_tokens_seen": 223552380, "step": 6370 }, { "epoch": 0.5146782630617004, "grad_norm": 0.2765893340110779, "learning_rate": 3e-05, "loss": 1.3861, "num_input_tokens_seen": 223902684, "step": 6380 }, { "epoch": 0.5154849688031764, "grad_norm": 0.3482683002948761, "learning_rate": 3e-05, "loss": 1.4014, "num_input_tokens_seen": 224249200, "step": 6390 }, { "epoch": 0.5162916745446524, "grad_norm": 0.2550183832645416, "learning_rate": 3e-05, "loss": 1.4198, "num_input_tokens_seen": 224584080, "step": 6400 }, { "epoch": 0.5170983802861284, "grad_norm": 0.2872161865234375, "learning_rate": 3e-05, "loss": 1.37, "num_input_tokens_seen": 224949828, "step": 6410 }, { "epoch": 0.5179050860276044, "grad_norm": 0.2459658682346344, "learning_rate": 3e-05, "loss": 1.3798, "num_input_tokens_seen": 225310160, "step": 6420 }, { "epoch": 0.5187117917690804, "grad_norm": 0.2668297588825226, "learning_rate": 3e-05, "loss": 1.3406, "num_input_tokens_seen": 225654484, "step": 6430 }, { "epoch": 0.5195184975105565, "grad_norm": 0.2736770808696747, "learning_rate": 3e-05, "loss": 1.341, "num_input_tokens_seen": 226010672, "step": 6440 }, { "epoch": 0.5203252032520326, "grad_norm": 0.27257856726646423, "learning_rate": 3e-05, "loss": 1.4121, "num_input_tokens_seen": 226356336, "step": 6450 }, { "epoch": 0.5211319089935086, "grad_norm": 0.278709352016449, "learning_rate": 3e-05, "loss": 1.2977, "num_input_tokens_seen": 226715132, "step": 6460 }, { "epoch": 0.5219386147349846, "grad_norm": 0.2663877010345459, "learning_rate": 3e-05, "loss": 1.3665, "num_input_tokens_seen": 227064968, "step": 6470 }, { "epoch": 0.5227453204764606, "grad_norm": 0.29134681820869446, "learning_rate": 3e-05, "loss": 1.2802, "num_input_tokens_seen": 227397320, "step": 6480 }, { "epoch": 0.5235520262179366, "grad_norm": 0.3129670321941376, "learning_rate": 3e-05, "loss": 1.3934, "num_input_tokens_seen": 227757800, "step": 6490 }, { "epoch": 0.5243587319594126, "grad_norm": 0.2857125997543335, "learning_rate": 3e-05, "loss": 1.3935, "num_input_tokens_seen": 228107956, "step": 6500 }, { "epoch": 0.5251654377008886, "grad_norm": 0.26699715852737427, "learning_rate": 3e-05, "loss": 1.4017, "num_input_tokens_seen": 228435140, "step": 6510 }, { "epoch": 0.5259721434423646, "grad_norm": 0.3041050136089325, "learning_rate": 3e-05, "loss": 1.3659, "num_input_tokens_seen": 228778040, "step": 6520 }, { "epoch": 0.5267788491838407, "grad_norm": 0.2667132616043091, "learning_rate": 3e-05, "loss": 1.3765, "num_input_tokens_seen": 229124136, "step": 6530 }, { "epoch": 0.5275855549253167, "grad_norm": 0.27975499629974365, "learning_rate": 3e-05, "loss": 1.3799, "num_input_tokens_seen": 229453708, "step": 6540 }, { "epoch": 0.5283922606667927, "grad_norm": 0.25194570422172546, "learning_rate": 3e-05, "loss": 1.3369, "num_input_tokens_seen": 229789164, "step": 6550 }, { "epoch": 0.5291989664082687, "grad_norm": 0.26208654046058655, "learning_rate": 3e-05, "loss": 1.3712, "num_input_tokens_seen": 230141916, "step": 6560 }, { "epoch": 0.5300056721497447, "grad_norm": 0.2651267647743225, "learning_rate": 3e-05, "loss": 1.3616, "num_input_tokens_seen": 230503840, "step": 6570 }, { "epoch": 0.5308123778912208, "grad_norm": 0.25937962532043457, "learning_rate": 3e-05, "loss": 1.2915, "num_input_tokens_seen": 230819784, "step": 6580 }, { "epoch": 0.5316190836326968, "grad_norm": 0.31449395418167114, "learning_rate": 3e-05, "loss": 1.3706, "num_input_tokens_seen": 231205872, "step": 6590 }, { "epoch": 0.5324257893741728, "grad_norm": 0.2909031808376312, "learning_rate": 3e-05, "loss": 1.4025, "num_input_tokens_seen": 231547816, "step": 6600 }, { "epoch": 0.5332324951156489, "grad_norm": 0.23776140809059143, "learning_rate": 3e-05, "loss": 1.3567, "num_input_tokens_seen": 231915224, "step": 6610 }, { "epoch": 0.5340392008571249, "grad_norm": 0.255609929561615, "learning_rate": 3e-05, "loss": 1.3352, "num_input_tokens_seen": 232289088, "step": 6620 }, { "epoch": 0.5348459065986009, "grad_norm": 0.2546085715293884, "learning_rate": 3e-05, "loss": 1.3868, "num_input_tokens_seen": 232690580, "step": 6630 }, { "epoch": 0.5356526123400769, "grad_norm": 0.2850560247898102, "learning_rate": 3e-05, "loss": 1.3626, "num_input_tokens_seen": 233036468, "step": 6640 }, { "epoch": 0.5364593180815529, "grad_norm": 0.26064831018447876, "learning_rate": 3e-05, "loss": 1.385, "num_input_tokens_seen": 233382180, "step": 6650 }, { "epoch": 0.5372660238230289, "grad_norm": 0.2727303206920624, "learning_rate": 3e-05, "loss": 1.3323, "num_input_tokens_seen": 233733420, "step": 6660 }, { "epoch": 0.5380727295645049, "grad_norm": 0.2605370283126831, "learning_rate": 3e-05, "loss": 1.3613, "num_input_tokens_seen": 234088620, "step": 6670 }, { "epoch": 0.5388794353059809, "grad_norm": 0.2989037036895752, "learning_rate": 3e-05, "loss": 1.3553, "num_input_tokens_seen": 234457604, "step": 6680 }, { "epoch": 0.5396861410474569, "grad_norm": 0.27383917570114136, "learning_rate": 3e-05, "loss": 1.3567, "num_input_tokens_seen": 234790868, "step": 6690 }, { "epoch": 0.5404928467889331, "grad_norm": 0.2771012783050537, "learning_rate": 3e-05, "loss": 1.3888, "num_input_tokens_seen": 235140724, "step": 6700 }, { "epoch": 0.5412995525304091, "grad_norm": 0.35448309779167175, "learning_rate": 3e-05, "loss": 1.3347, "num_input_tokens_seen": 235476928, "step": 6710 }, { "epoch": 0.5421062582718851, "grad_norm": 0.28626537322998047, "learning_rate": 3e-05, "loss": 1.3486, "num_input_tokens_seen": 235805748, "step": 6720 }, { "epoch": 0.5429129640133611, "grad_norm": 0.280998170375824, "learning_rate": 3e-05, "loss": 1.3868, "num_input_tokens_seen": 236142400, "step": 6730 }, { "epoch": 0.5437196697548371, "grad_norm": 0.2946176826953888, "learning_rate": 3e-05, "loss": 1.3424, "num_input_tokens_seen": 236492160, "step": 6740 }, { "epoch": 0.5445263754963131, "grad_norm": 0.2584805488586426, "learning_rate": 3e-05, "loss": 1.3451, "num_input_tokens_seen": 236826392, "step": 6750 }, { "epoch": 0.5453330812377891, "grad_norm": 0.27321335673332214, "learning_rate": 3e-05, "loss": 1.3567, "num_input_tokens_seen": 237160908, "step": 6760 }, { "epoch": 0.5461397869792651, "grad_norm": 0.2743065655231476, "learning_rate": 3e-05, "loss": 1.3933, "num_input_tokens_seen": 237514248, "step": 6770 }, { "epoch": 0.5469464927207411, "grad_norm": 0.26243406534194946, "learning_rate": 3e-05, "loss": 1.3716, "num_input_tokens_seen": 237850192, "step": 6780 }, { "epoch": 0.5477531984622172, "grad_norm": 0.2854134738445282, "learning_rate": 3e-05, "loss": 1.3882, "num_input_tokens_seen": 238236160, "step": 6790 }, { "epoch": 0.5485599042036932, "grad_norm": 0.2889584004878998, "learning_rate": 3e-05, "loss": 1.3232, "num_input_tokens_seen": 238608400, "step": 6800 }, { "epoch": 0.5493666099451692, "grad_norm": 0.2689494490623474, "learning_rate": 3e-05, "loss": 1.3191, "num_input_tokens_seen": 238965484, "step": 6810 }, { "epoch": 0.5501733156866453, "grad_norm": 0.2811024785041809, "learning_rate": 3e-05, "loss": 1.3883, "num_input_tokens_seen": 239305156, "step": 6820 }, { "epoch": 0.5509800214281213, "grad_norm": 0.29699015617370605, "learning_rate": 3e-05, "loss": 1.3527, "num_input_tokens_seen": 239633044, "step": 6830 }, { "epoch": 0.5517867271695973, "grad_norm": 0.2648441791534424, "learning_rate": 3e-05, "loss": 1.3464, "num_input_tokens_seen": 239984216, "step": 6840 }, { "epoch": 0.5525934329110733, "grad_norm": 0.2662919759750366, "learning_rate": 3e-05, "loss": 1.3185, "num_input_tokens_seen": 240344632, "step": 6850 }, { "epoch": 0.5534001386525493, "grad_norm": 0.3006437420845032, "learning_rate": 3e-05, "loss": 1.3587, "num_input_tokens_seen": 240702092, "step": 6860 }, { "epoch": 0.5542068443940253, "grad_norm": 0.27952778339385986, "learning_rate": 3e-05, "loss": 1.3546, "num_input_tokens_seen": 241059608, "step": 6870 }, { "epoch": 0.5550135501355014, "grad_norm": 0.27199041843414307, "learning_rate": 3e-05, "loss": 1.3444, "num_input_tokens_seen": 241417296, "step": 6880 }, { "epoch": 0.5558202558769774, "grad_norm": 0.2580903470516205, "learning_rate": 3e-05, "loss": 1.3458, "num_input_tokens_seen": 241788856, "step": 6890 }, { "epoch": 0.5566269616184534, "grad_norm": 0.2709527909755707, "learning_rate": 3e-05, "loss": 1.3785, "num_input_tokens_seen": 242139900, "step": 6900 }, { "epoch": 0.5574336673599294, "grad_norm": 0.30123209953308105, "learning_rate": 3e-05, "loss": 1.4256, "num_input_tokens_seen": 242504148, "step": 6910 }, { "epoch": 0.5582403731014054, "grad_norm": 0.2620568871498108, "learning_rate": 3e-05, "loss": 1.2906, "num_input_tokens_seen": 242827892, "step": 6920 }, { "epoch": 0.5590470788428814, "grad_norm": 0.2878223955631256, "learning_rate": 3e-05, "loss": 1.3146, "num_input_tokens_seen": 243181464, "step": 6930 }, { "epoch": 0.5598537845843574, "grad_norm": 0.26872310042381287, "learning_rate": 3e-05, "loss": 1.3387, "num_input_tokens_seen": 243523144, "step": 6940 }, { "epoch": 0.5606604903258335, "grad_norm": 0.30172818899154663, "learning_rate": 3e-05, "loss": 1.3215, "num_input_tokens_seen": 243867668, "step": 6950 }, { "epoch": 0.5614671960673095, "grad_norm": 0.2358444631099701, "learning_rate": 3e-05, "loss": 1.3118, "num_input_tokens_seen": 244222464, "step": 6960 }, { "epoch": 0.5622739018087856, "grad_norm": 0.2576392889022827, "learning_rate": 3e-05, "loss": 1.3382, "num_input_tokens_seen": 244580240, "step": 6970 }, { "epoch": 0.5630806075502616, "grad_norm": 0.2685336768627167, "learning_rate": 3e-05, "loss": 1.3596, "num_input_tokens_seen": 244952004, "step": 6980 }, { "epoch": 0.5638873132917376, "grad_norm": 0.3160952925682068, "learning_rate": 3e-05, "loss": 1.3249, "num_input_tokens_seen": 245296600, "step": 6990 }, { "epoch": 0.5646940190332136, "grad_norm": 0.2507816255092621, "learning_rate": 3e-05, "loss": 1.3148, "num_input_tokens_seen": 245601908, "step": 7000 }, { "epoch": 0.5646940190332136, "eval_gen_len": 458.28, "eval_loss": 1.2807079553604126, "eval_rouge1": 37.0123, "eval_rouge2": 21.3666, "eval_rougeL": 30.11, "eval_rougeLsum": 35.0891, "eval_runtime": 1549.9426, "eval_samples_per_second": 0.129, "eval_steps_per_second": 0.032, "num_input_tokens_seen": 245601908, "step": 7000 }, { "epoch": 0.5655007247746896, "grad_norm": 0.25211119651794434, "learning_rate": 3e-05, "loss": 1.3326, "num_input_tokens_seen": 245935868, "step": 7010 }, { "epoch": 0.5663074305161656, "grad_norm": 0.26975396275520325, "learning_rate": 3e-05, "loss": 1.344, "num_input_tokens_seen": 246286460, "step": 7020 }, { "epoch": 0.5671141362576416, "grad_norm": 0.25872379541397095, "learning_rate": 3e-05, "loss": 1.2972, "num_input_tokens_seen": 246643836, "step": 7030 }, { "epoch": 0.5679208419991176, "grad_norm": 0.25087055563926697, "learning_rate": 3e-05, "loss": 1.3577, "num_input_tokens_seen": 246981328, "step": 7040 }, { "epoch": 0.5687275477405936, "grad_norm": 0.29341551661491394, "learning_rate": 3e-05, "loss": 1.3222, "num_input_tokens_seen": 247316288, "step": 7050 }, { "epoch": 0.5695342534820697, "grad_norm": 0.2665810286998749, "learning_rate": 3e-05, "loss": 1.3704, "num_input_tokens_seen": 247677216, "step": 7060 }, { "epoch": 0.5703409592235458, "grad_norm": 0.2743811309337616, "learning_rate": 3e-05, "loss": 1.4113, "num_input_tokens_seen": 248018060, "step": 7070 }, { "epoch": 0.5711476649650218, "grad_norm": 0.2498067319393158, "learning_rate": 3e-05, "loss": 1.3407, "num_input_tokens_seen": 248347064, "step": 7080 }, { "epoch": 0.5719543707064978, "grad_norm": 0.31014665961265564, "learning_rate": 3e-05, "loss": 1.3535, "num_input_tokens_seen": 248698972, "step": 7090 }, { "epoch": 0.5727610764479738, "grad_norm": 0.304561972618103, "learning_rate": 3e-05, "loss": 1.3736, "num_input_tokens_seen": 249061164, "step": 7100 }, { "epoch": 0.5735677821894498, "grad_norm": 0.2791111469268799, "learning_rate": 3e-05, "loss": 1.3308, "num_input_tokens_seen": 249394300, "step": 7110 }, { "epoch": 0.5743744879309258, "grad_norm": 0.2657420039176941, "learning_rate": 3e-05, "loss": 1.3221, "num_input_tokens_seen": 249749096, "step": 7120 }, { "epoch": 0.5751811936724018, "grad_norm": 0.26944419741630554, "learning_rate": 3e-05, "loss": 1.2559, "num_input_tokens_seen": 250099212, "step": 7130 }, { "epoch": 0.5759878994138778, "grad_norm": 0.2616654336452484, "learning_rate": 3e-05, "loss": 1.3261, "num_input_tokens_seen": 250468696, "step": 7140 }, { "epoch": 0.5767946051553539, "grad_norm": 0.26480162143707275, "learning_rate": 3e-05, "loss": 1.3204, "num_input_tokens_seen": 250820768, "step": 7150 }, { "epoch": 0.5776013108968299, "grad_norm": 0.3008149266242981, "learning_rate": 3e-05, "loss": 1.3495, "num_input_tokens_seen": 251157348, "step": 7160 }, { "epoch": 0.5784080166383059, "grad_norm": 0.2766317129135132, "learning_rate": 3e-05, "loss": 1.3596, "num_input_tokens_seen": 251534100, "step": 7170 }, { "epoch": 0.5792147223797819, "grad_norm": 0.2724848687648773, "learning_rate": 3e-05, "loss": 1.364, "num_input_tokens_seen": 251892536, "step": 7180 }, { "epoch": 0.580021428121258, "grad_norm": 0.2869981825351715, "learning_rate": 3e-05, "loss": 1.3494, "num_input_tokens_seen": 252257808, "step": 7190 }, { "epoch": 0.580828133862734, "grad_norm": 0.2904507517814636, "learning_rate": 3e-05, "loss": 1.3117, "num_input_tokens_seen": 252602564, "step": 7200 }, { "epoch": 0.58163483960421, "grad_norm": 0.29304027557373047, "learning_rate": 3e-05, "loss": 1.3394, "num_input_tokens_seen": 253000804, "step": 7210 }, { "epoch": 0.582441545345686, "grad_norm": 0.2698158323764801, "learning_rate": 3e-05, "loss": 1.352, "num_input_tokens_seen": 253349332, "step": 7220 }, { "epoch": 0.583248251087162, "grad_norm": 0.259858638048172, "learning_rate": 3e-05, "loss": 1.3094, "num_input_tokens_seen": 253711588, "step": 7230 }, { "epoch": 0.5840549568286381, "grad_norm": 0.24673224985599518, "learning_rate": 3e-05, "loss": 1.3504, "num_input_tokens_seen": 254032876, "step": 7240 }, { "epoch": 0.5848616625701141, "grad_norm": 0.24645106494426727, "learning_rate": 3e-05, "loss": 1.3109, "num_input_tokens_seen": 254388696, "step": 7250 }, { "epoch": 0.5856683683115901, "grad_norm": 0.295168399810791, "learning_rate": 3e-05, "loss": 1.3275, "num_input_tokens_seen": 254709932, "step": 7260 }, { "epoch": 0.5864750740530661, "grad_norm": 0.28658369183540344, "learning_rate": 3e-05, "loss": 1.3473, "num_input_tokens_seen": 255043668, "step": 7270 }, { "epoch": 0.5872817797945421, "grad_norm": 0.26063695549964905, "learning_rate": 3e-05, "loss": 1.2875, "num_input_tokens_seen": 255402252, "step": 7280 }, { "epoch": 0.5880884855360181, "grad_norm": 0.2694176435470581, "learning_rate": 3e-05, "loss": 1.3576, "num_input_tokens_seen": 255725676, "step": 7290 }, { "epoch": 0.5888951912774941, "grad_norm": 0.2852911055088043, "learning_rate": 3e-05, "loss": 1.3456, "num_input_tokens_seen": 256093604, "step": 7300 }, { "epoch": 0.5897018970189702, "grad_norm": 0.2513694167137146, "learning_rate": 3e-05, "loss": 1.3225, "num_input_tokens_seen": 256448048, "step": 7310 }, { "epoch": 0.5905086027604463, "grad_norm": 0.2887936532497406, "learning_rate": 3e-05, "loss": 1.3593, "num_input_tokens_seen": 256787872, "step": 7320 }, { "epoch": 0.5913153085019223, "grad_norm": 0.2790989577770233, "learning_rate": 3e-05, "loss": 1.3286, "num_input_tokens_seen": 257116080, "step": 7330 }, { "epoch": 0.5921220142433983, "grad_norm": 0.2735912799835205, "learning_rate": 3e-05, "loss": 1.3624, "num_input_tokens_seen": 257477564, "step": 7340 }, { "epoch": 0.5929287199848743, "grad_norm": 0.252945214509964, "learning_rate": 3e-05, "loss": 1.3592, "num_input_tokens_seen": 257851048, "step": 7350 }, { "epoch": 0.5937354257263503, "grad_norm": 0.3211391866207123, "learning_rate": 3e-05, "loss": 1.3686, "num_input_tokens_seen": 258218684, "step": 7360 }, { "epoch": 0.5945421314678263, "grad_norm": 0.30502480268478394, "learning_rate": 3e-05, "loss": 1.3227, "num_input_tokens_seen": 258573844, "step": 7370 }, { "epoch": 0.5953488372093023, "grad_norm": 0.28097566962242126, "learning_rate": 3e-05, "loss": 1.294, "num_input_tokens_seen": 258943588, "step": 7380 }, { "epoch": 0.5961555429507783, "grad_norm": 0.27676570415496826, "learning_rate": 3e-05, "loss": 1.2757, "num_input_tokens_seen": 259313468, "step": 7390 }, { "epoch": 0.5969622486922543, "grad_norm": 0.25486335158348083, "learning_rate": 3e-05, "loss": 1.3064, "num_input_tokens_seen": 259666520, "step": 7400 }, { "epoch": 0.5977689544337303, "grad_norm": 0.26223358511924744, "learning_rate": 3e-05, "loss": 1.319, "num_input_tokens_seen": 260013508, "step": 7410 }, { "epoch": 0.5985756601752064, "grad_norm": 0.25428274273872375, "learning_rate": 3e-05, "loss": 1.3169, "num_input_tokens_seen": 260358660, "step": 7420 }, { "epoch": 0.5993823659166824, "grad_norm": 0.26828479766845703, "learning_rate": 3e-05, "loss": 1.2931, "num_input_tokens_seen": 260698792, "step": 7430 }, { "epoch": 0.6001890716581585, "grad_norm": 0.286696195602417, "learning_rate": 3e-05, "loss": 1.3534, "num_input_tokens_seen": 261051896, "step": 7440 }, { "epoch": 0.6009957773996345, "grad_norm": 0.2686040699481964, "learning_rate": 3e-05, "loss": 1.3426, "num_input_tokens_seen": 261413924, "step": 7450 }, { "epoch": 0.6018024831411105, "grad_norm": 0.3073362112045288, "learning_rate": 3e-05, "loss": 1.3457, "num_input_tokens_seen": 261759140, "step": 7460 }, { "epoch": 0.6026091888825865, "grad_norm": 0.3184793293476105, "learning_rate": 3e-05, "loss": 1.2967, "num_input_tokens_seen": 262101808, "step": 7470 }, { "epoch": 0.6034158946240625, "grad_norm": 0.29304739832878113, "learning_rate": 3e-05, "loss": 1.2587, "num_input_tokens_seen": 262420468, "step": 7480 }, { "epoch": 0.6042226003655385, "grad_norm": 0.2968464493751526, "learning_rate": 3e-05, "loss": 1.306, "num_input_tokens_seen": 262830180, "step": 7490 }, { "epoch": 0.6050293061070146, "grad_norm": 0.29142898321151733, "learning_rate": 3e-05, "loss": 1.3333, "num_input_tokens_seen": 263164684, "step": 7500 }, { "epoch": 0.6058360118484906, "grad_norm": 0.4284871220588684, "learning_rate": 3e-05, "loss": 1.3222, "num_input_tokens_seen": 263534376, "step": 7510 }, { "epoch": 0.6066427175899666, "grad_norm": 0.25819581747055054, "learning_rate": 3e-05, "loss": 1.3673, "num_input_tokens_seen": 263888472, "step": 7520 }, { "epoch": 0.6074494233314426, "grad_norm": 0.2647174298763275, "learning_rate": 3e-05, "loss": 1.3096, "num_input_tokens_seen": 264241468, "step": 7530 }, { "epoch": 0.6082561290729186, "grad_norm": 0.26796919107437134, "learning_rate": 3e-05, "loss": 1.3448, "num_input_tokens_seen": 264578804, "step": 7540 }, { "epoch": 0.6090628348143946, "grad_norm": 0.2776355445384979, "learning_rate": 3e-05, "loss": 1.3448, "num_input_tokens_seen": 264951596, "step": 7550 }, { "epoch": 0.6098695405558707, "grad_norm": 0.2773449420928955, "learning_rate": 3e-05, "loss": 1.2945, "num_input_tokens_seen": 265319936, "step": 7560 }, { "epoch": 0.6106762462973467, "grad_norm": 0.28379198908805847, "learning_rate": 3e-05, "loss": 1.3447, "num_input_tokens_seen": 265671716, "step": 7570 }, { "epoch": 0.6114829520388227, "grad_norm": 0.2618366777896881, "learning_rate": 3e-05, "loss": 1.3163, "num_input_tokens_seen": 266015756, "step": 7580 }, { "epoch": 0.6122896577802988, "grad_norm": 0.25278952717781067, "learning_rate": 3e-05, "loss": 1.3918, "num_input_tokens_seen": 266371472, "step": 7590 }, { "epoch": 0.6130963635217748, "grad_norm": 0.29882779717445374, "learning_rate": 3e-05, "loss": 1.3239, "num_input_tokens_seen": 266726232, "step": 7600 }, { "epoch": 0.6139030692632508, "grad_norm": 0.2473878711462021, "learning_rate": 3e-05, "loss": 1.3322, "num_input_tokens_seen": 267077476, "step": 7610 }, { "epoch": 0.6147097750047268, "grad_norm": 0.26804500818252563, "learning_rate": 3e-05, "loss": 1.3116, "num_input_tokens_seen": 267412680, "step": 7620 }, { "epoch": 0.6155164807462028, "grad_norm": 0.2616485059261322, "learning_rate": 3e-05, "loss": 1.3633, "num_input_tokens_seen": 267745836, "step": 7630 }, { "epoch": 0.6163231864876788, "grad_norm": 0.29525431990623474, "learning_rate": 3e-05, "loss": 1.291, "num_input_tokens_seen": 268063588, "step": 7640 }, { "epoch": 0.6171298922291548, "grad_norm": 0.2977356016635895, "learning_rate": 3e-05, "loss": 1.2814, "num_input_tokens_seen": 268434424, "step": 7650 }, { "epoch": 0.6179365979706308, "grad_norm": 0.23729270696640015, "learning_rate": 3e-05, "loss": 1.3602, "num_input_tokens_seen": 268807668, "step": 7660 }, { "epoch": 0.6187433037121068, "grad_norm": 0.26896366477012634, "learning_rate": 3e-05, "loss": 1.3211, "num_input_tokens_seen": 269146372, "step": 7670 }, { "epoch": 0.619550009453583, "grad_norm": 0.27436563372612, "learning_rate": 3e-05, "loss": 1.3661, "num_input_tokens_seen": 269516044, "step": 7680 }, { "epoch": 0.620356715195059, "grad_norm": 0.2542715072631836, "learning_rate": 3e-05, "loss": 1.3132, "num_input_tokens_seen": 269885468, "step": 7690 }, { "epoch": 0.621163420936535, "grad_norm": 0.24955634772777557, "learning_rate": 3e-05, "loss": 1.3579, "num_input_tokens_seen": 270234860, "step": 7700 }, { "epoch": 0.621970126678011, "grad_norm": 0.2842109799385071, "learning_rate": 3e-05, "loss": 1.3533, "num_input_tokens_seen": 270588556, "step": 7710 }, { "epoch": 0.622776832419487, "grad_norm": 0.2718474864959717, "learning_rate": 3e-05, "loss": 1.3144, "num_input_tokens_seen": 270900420, "step": 7720 }, { "epoch": 0.623583538160963, "grad_norm": 0.254463255405426, "learning_rate": 3e-05, "loss": 1.3094, "num_input_tokens_seen": 271266568, "step": 7730 }, { "epoch": 0.624390243902439, "grad_norm": 0.2510669529438019, "learning_rate": 3e-05, "loss": 1.3114, "num_input_tokens_seen": 271611104, "step": 7740 }, { "epoch": 0.625196949643915, "grad_norm": 0.25194868445396423, "learning_rate": 3e-05, "loss": 1.3211, "num_input_tokens_seen": 271961836, "step": 7750 }, { "epoch": 0.626003655385391, "grad_norm": 0.26476165652275085, "learning_rate": 3e-05, "loss": 1.3175, "num_input_tokens_seen": 272308540, "step": 7760 }, { "epoch": 0.626810361126867, "grad_norm": 0.2768157720565796, "learning_rate": 3e-05, "loss": 1.3405, "num_input_tokens_seen": 272680724, "step": 7770 }, { "epoch": 0.6276170668683431, "grad_norm": 0.25789812207221985, "learning_rate": 3e-05, "loss": 1.3044, "num_input_tokens_seen": 273040688, "step": 7780 }, { "epoch": 0.6284237726098191, "grad_norm": 0.2867225408554077, "learning_rate": 3e-05, "loss": 1.3424, "num_input_tokens_seen": 273396808, "step": 7790 }, { "epoch": 0.6292304783512951, "grad_norm": 0.27524057030677795, "learning_rate": 3e-05, "loss": 1.3146, "num_input_tokens_seen": 273725436, "step": 7800 }, { "epoch": 0.6300371840927712, "grad_norm": 0.30353033542633057, "learning_rate": 3e-05, "loss": 1.2903, "num_input_tokens_seen": 274061044, "step": 7810 }, { "epoch": 0.6308438898342472, "grad_norm": 0.2527361810207367, "learning_rate": 3e-05, "loss": 1.3225, "num_input_tokens_seen": 274423428, "step": 7820 }, { "epoch": 0.6316505955757232, "grad_norm": 0.27751225233078003, "learning_rate": 3e-05, "loss": 1.3112, "num_input_tokens_seen": 274770512, "step": 7830 }, { "epoch": 0.6324573013171992, "grad_norm": 0.29832029342651367, "learning_rate": 3e-05, "loss": 1.3537, "num_input_tokens_seen": 275116652, "step": 7840 }, { "epoch": 0.6332640070586752, "grad_norm": 0.24705222249031067, "learning_rate": 3e-05, "loss": 1.3255, "num_input_tokens_seen": 275512076, "step": 7850 }, { "epoch": 0.6340707128001513, "grad_norm": 0.2816605567932129, "learning_rate": 3e-05, "loss": 1.3099, "num_input_tokens_seen": 275863084, "step": 7860 }, { "epoch": 0.6348774185416273, "grad_norm": 0.2889770567417145, "learning_rate": 3e-05, "loss": 1.324, "num_input_tokens_seen": 276235188, "step": 7870 }, { "epoch": 0.6356841242831033, "grad_norm": 0.2934252619743347, "learning_rate": 3e-05, "loss": 1.31, "num_input_tokens_seen": 276601124, "step": 7880 }, { "epoch": 0.6364908300245793, "grad_norm": 0.2529415488243103, "learning_rate": 3e-05, "loss": 1.2902, "num_input_tokens_seen": 276959408, "step": 7890 }, { "epoch": 0.6372975357660553, "grad_norm": 0.24715226888656616, "learning_rate": 3e-05, "loss": 1.2716, "num_input_tokens_seen": 277332960, "step": 7900 }, { "epoch": 0.6381042415075313, "grad_norm": 0.28011465072631836, "learning_rate": 3e-05, "loss": 1.3777, "num_input_tokens_seen": 277666204, "step": 7910 }, { "epoch": 0.6389109472490073, "grad_norm": 0.25102949142456055, "learning_rate": 3e-05, "loss": 1.2916, "num_input_tokens_seen": 278007212, "step": 7920 }, { "epoch": 0.6397176529904834, "grad_norm": 0.2727227210998535, "learning_rate": 3e-05, "loss": 1.2966, "num_input_tokens_seen": 278373292, "step": 7930 }, { "epoch": 0.6405243587319595, "grad_norm": 0.2720615267753601, "learning_rate": 3e-05, "loss": 1.3667, "num_input_tokens_seen": 278725792, "step": 7940 }, { "epoch": 0.6413310644734355, "grad_norm": 0.2724305987358093, "learning_rate": 3e-05, "loss": 1.2488, "num_input_tokens_seen": 279085604, "step": 7950 }, { "epoch": 0.6421377702149115, "grad_norm": 0.26985448598861694, "learning_rate": 3e-05, "loss": 1.3151, "num_input_tokens_seen": 279472276, "step": 7960 }, { "epoch": 0.6429444759563875, "grad_norm": 0.2569502890110016, "learning_rate": 3e-05, "loss": 1.2984, "num_input_tokens_seen": 279809104, "step": 7970 }, { "epoch": 0.6437511816978635, "grad_norm": 0.2665258049964905, "learning_rate": 3e-05, "loss": 1.2771, "num_input_tokens_seen": 280160428, "step": 7980 }, { "epoch": 0.6445578874393395, "grad_norm": 0.25413599610328674, "learning_rate": 3e-05, "loss": 1.2941, "num_input_tokens_seen": 280518244, "step": 7990 }, { "epoch": 0.6453645931808155, "grad_norm": 0.2681139409542084, "learning_rate": 3e-05, "loss": 1.2859, "num_input_tokens_seen": 280866724, "step": 8000 }, { "epoch": 0.6453645931808155, "eval_gen_len": 452.495, "eval_loss": 1.2491791248321533, "eval_rouge1": 37.05, "eval_rouge2": 21.0468, "eval_rougeL": 29.7988, "eval_rougeLsum": 35.1882, "eval_runtime": 1853.3751, "eval_samples_per_second": 0.108, "eval_steps_per_second": 0.027, "num_input_tokens_seen": 280866724, "step": 8000 }, { "epoch": 0.6461712989222915, "grad_norm": 0.24775606393814087, "learning_rate": 3e-05, "loss": 1.3535, "num_input_tokens_seen": 281189452, "step": 8010 }, { "epoch": 0.6469780046637675, "grad_norm": 0.2870043218135834, "learning_rate": 3e-05, "loss": 1.2518, "num_input_tokens_seen": 281504304, "step": 8020 }, { "epoch": 0.6477847104052435, "grad_norm": 0.26712578535079956, "learning_rate": 3e-05, "loss": 1.3021, "num_input_tokens_seen": 281829352, "step": 8030 }, { "epoch": 0.6485914161467196, "grad_norm": 0.24101081490516663, "learning_rate": 3e-05, "loss": 1.3482, "num_input_tokens_seen": 282176564, "step": 8040 }, { "epoch": 0.6493981218881957, "grad_norm": 0.278340220451355, "learning_rate": 3e-05, "loss": 1.2904, "num_input_tokens_seen": 282536828, "step": 8050 }, { "epoch": 0.6502048276296717, "grad_norm": 0.238587886095047, "learning_rate": 3e-05, "loss": 1.3017, "num_input_tokens_seen": 282870112, "step": 8060 }, { "epoch": 0.6510115333711477, "grad_norm": 0.25995177030563354, "learning_rate": 3e-05, "loss": 1.3319, "num_input_tokens_seen": 283211856, "step": 8070 }, { "epoch": 0.6518182391126237, "grad_norm": 0.25454819202423096, "learning_rate": 3e-05, "loss": 1.2937, "num_input_tokens_seen": 283557108, "step": 8080 }, { "epoch": 0.6526249448540997, "grad_norm": 0.2610025405883789, "learning_rate": 3e-05, "loss": 1.3105, "num_input_tokens_seen": 283939544, "step": 8090 }, { "epoch": 0.6534316505955757, "grad_norm": 0.2735656797885895, "learning_rate": 3e-05, "loss": 1.3028, "num_input_tokens_seen": 284295512, "step": 8100 }, { "epoch": 0.6542383563370517, "grad_norm": 0.2599696218967438, "learning_rate": 3e-05, "loss": 1.2618, "num_input_tokens_seen": 284660732, "step": 8110 }, { "epoch": 0.6550450620785278, "grad_norm": 0.2804352343082428, "learning_rate": 3e-05, "loss": 1.3266, "num_input_tokens_seen": 284985504, "step": 8120 }, { "epoch": 0.6558517678200038, "grad_norm": 0.26796236634254456, "learning_rate": 3e-05, "loss": 1.2956, "num_input_tokens_seen": 285352540, "step": 8130 }, { "epoch": 0.6566584735614798, "grad_norm": 0.26151329278945923, "learning_rate": 3e-05, "loss": 1.295, "num_input_tokens_seen": 285678276, "step": 8140 }, { "epoch": 0.6574651793029558, "grad_norm": 0.2699349522590637, "learning_rate": 3e-05, "loss": 1.3056, "num_input_tokens_seen": 286023024, "step": 8150 }, { "epoch": 0.6582718850444318, "grad_norm": 0.2832753360271454, "learning_rate": 3e-05, "loss": 1.2952, "num_input_tokens_seen": 286356540, "step": 8160 }, { "epoch": 0.6590785907859079, "grad_norm": 0.26573285460472107, "learning_rate": 3e-05, "loss": 1.3337, "num_input_tokens_seen": 286700992, "step": 8170 }, { "epoch": 0.6598852965273839, "grad_norm": 0.26687324047088623, "learning_rate": 3e-05, "loss": 1.2572, "num_input_tokens_seen": 287050020, "step": 8180 }, { "epoch": 0.6606920022688599, "grad_norm": 0.26736560463905334, "learning_rate": 3e-05, "loss": 1.2766, "num_input_tokens_seen": 287417060, "step": 8190 }, { "epoch": 0.661498708010336, "grad_norm": 0.26670607924461365, "learning_rate": 3e-05, "loss": 1.2882, "num_input_tokens_seen": 287741024, "step": 8200 }, { "epoch": 0.662305413751812, "grad_norm": 0.29119133949279785, "learning_rate": 3e-05, "loss": 1.3212, "num_input_tokens_seen": 288119560, "step": 8210 }, { "epoch": 0.663112119493288, "grad_norm": 0.26133713126182556, "learning_rate": 3e-05, "loss": 1.2795, "num_input_tokens_seen": 288463204, "step": 8220 }, { "epoch": 0.663918825234764, "grad_norm": 0.2923208177089691, "learning_rate": 3e-05, "loss": 1.3034, "num_input_tokens_seen": 288775820, "step": 8230 }, { "epoch": 0.66472553097624, "grad_norm": 0.24762633442878723, "learning_rate": 3e-05, "loss": 1.2558, "num_input_tokens_seen": 289102444, "step": 8240 }, { "epoch": 0.665532236717716, "grad_norm": 0.29962268471717834, "learning_rate": 3e-05, "loss": 1.3248, "num_input_tokens_seen": 289448904, "step": 8250 }, { "epoch": 0.666338942459192, "grad_norm": 0.27402591705322266, "learning_rate": 3e-05, "loss": 1.3558, "num_input_tokens_seen": 289778028, "step": 8260 }, { "epoch": 0.667145648200668, "grad_norm": 0.28333625197410583, "learning_rate": 3e-05, "loss": 1.2694, "num_input_tokens_seen": 290149804, "step": 8270 }, { "epoch": 0.667952353942144, "grad_norm": 0.26104313135147095, "learning_rate": 3e-05, "loss": 1.2988, "num_input_tokens_seen": 290506276, "step": 8280 }, { "epoch": 0.66875905968362, "grad_norm": 0.26603755354881287, "learning_rate": 3e-05, "loss": 1.3397, "num_input_tokens_seen": 290867248, "step": 8290 }, { "epoch": 0.6695657654250962, "grad_norm": 0.2591850459575653, "learning_rate": 3e-05, "loss": 1.27, "num_input_tokens_seen": 291243604, "step": 8300 }, { "epoch": 0.6703724711665722, "grad_norm": 0.2640308141708374, "learning_rate": 3e-05, "loss": 1.3251, "num_input_tokens_seen": 291586776, "step": 8310 }, { "epoch": 0.6711791769080482, "grad_norm": 0.29766708612442017, "learning_rate": 3e-05, "loss": 1.2843, "num_input_tokens_seen": 291935504, "step": 8320 }, { "epoch": 0.6719858826495242, "grad_norm": 0.24987733364105225, "learning_rate": 3e-05, "loss": 1.3028, "num_input_tokens_seen": 292267852, "step": 8330 }, { "epoch": 0.6727925883910002, "grad_norm": 0.26682114601135254, "learning_rate": 3e-05, "loss": 1.3688, "num_input_tokens_seen": 292629948, "step": 8340 }, { "epoch": 0.6735992941324762, "grad_norm": 0.25744229555130005, "learning_rate": 3e-05, "loss": 1.294, "num_input_tokens_seen": 293012512, "step": 8350 }, { "epoch": 0.6744059998739522, "grad_norm": 0.2486562579870224, "learning_rate": 3e-05, "loss": 1.2671, "num_input_tokens_seen": 293349760, "step": 8360 }, { "epoch": 0.6752127056154282, "grad_norm": 0.27496910095214844, "learning_rate": 3e-05, "loss": 1.2869, "num_input_tokens_seen": 293707052, "step": 8370 }, { "epoch": 0.6760194113569042, "grad_norm": 0.26703888177871704, "learning_rate": 3e-05, "loss": 1.279, "num_input_tokens_seen": 294091848, "step": 8380 }, { "epoch": 0.6768261170983803, "grad_norm": 0.3000788390636444, "learning_rate": 3e-05, "loss": 1.2746, "num_input_tokens_seen": 294459400, "step": 8390 }, { "epoch": 0.6776328228398563, "grad_norm": 0.2827373147010803, "learning_rate": 3e-05, "loss": 1.2711, "num_input_tokens_seen": 294815776, "step": 8400 }, { "epoch": 0.6784395285813323, "grad_norm": 0.2718258500099182, "learning_rate": 3e-05, "loss": 1.269, "num_input_tokens_seen": 295182760, "step": 8410 }, { "epoch": 0.6792462343228084, "grad_norm": 0.2768170237541199, "learning_rate": 3e-05, "loss": 1.3107, "num_input_tokens_seen": 295545640, "step": 8420 }, { "epoch": 0.6800529400642844, "grad_norm": 0.29544582962989807, "learning_rate": 3e-05, "loss": 1.3062, "num_input_tokens_seen": 295896292, "step": 8430 }, { "epoch": 0.6808596458057604, "grad_norm": 0.2775704264640808, "learning_rate": 3e-05, "loss": 1.2903, "num_input_tokens_seen": 296238076, "step": 8440 }, { "epoch": 0.6816663515472364, "grad_norm": 0.29178759455680847, "learning_rate": 3e-05, "loss": 1.2591, "num_input_tokens_seen": 296593652, "step": 8450 }, { "epoch": 0.6824730572887124, "grad_norm": 0.2721198797225952, "learning_rate": 3e-05, "loss": 1.3322, "num_input_tokens_seen": 296929900, "step": 8460 }, { "epoch": 0.6832797630301884, "grad_norm": 0.27254942059516907, "learning_rate": 3e-05, "loss": 1.2871, "num_input_tokens_seen": 297259576, "step": 8470 }, { "epoch": 0.6840864687716645, "grad_norm": 0.25439295172691345, "learning_rate": 3e-05, "loss": 1.3424, "num_input_tokens_seen": 297618016, "step": 8480 }, { "epoch": 0.6848931745131405, "grad_norm": 0.2755286395549774, "learning_rate": 3e-05, "loss": 1.3161, "num_input_tokens_seen": 297965452, "step": 8490 }, { "epoch": 0.6856998802546165, "grad_norm": 0.2616944909095764, "learning_rate": 3e-05, "loss": 1.2693, "num_input_tokens_seen": 298311824, "step": 8500 }, { "epoch": 0.6865065859960925, "grad_norm": 0.26057368516921997, "learning_rate": 3e-05, "loss": 1.3227, "num_input_tokens_seen": 298690492, "step": 8510 }, { "epoch": 0.6873132917375685, "grad_norm": 0.28719767928123474, "learning_rate": 3e-05, "loss": 1.3112, "num_input_tokens_seen": 299031280, "step": 8520 }, { "epoch": 0.6881199974790445, "grad_norm": 0.2910424768924713, "learning_rate": 3e-05, "loss": 1.2747, "num_input_tokens_seen": 299391836, "step": 8530 }, { "epoch": 0.6889267032205206, "grad_norm": 0.24181599915027618, "learning_rate": 3e-05, "loss": 1.2351, "num_input_tokens_seen": 299756312, "step": 8540 }, { "epoch": 0.6897334089619966, "grad_norm": 0.30020081996917725, "learning_rate": 3e-05, "loss": 1.2665, "num_input_tokens_seen": 300106552, "step": 8550 }, { "epoch": 0.6905401147034727, "grad_norm": 0.24974121153354645, "learning_rate": 3e-05, "loss": 1.2678, "num_input_tokens_seen": 300444076, "step": 8560 }, { "epoch": 0.6913468204449487, "grad_norm": 0.24613253772258759, "learning_rate": 3e-05, "loss": 1.3276, "num_input_tokens_seen": 300774136, "step": 8570 }, { "epoch": 0.6921535261864247, "grad_norm": 0.24651503562927246, "learning_rate": 3e-05, "loss": 1.3277, "num_input_tokens_seen": 301144656, "step": 8580 }, { "epoch": 0.6929602319279007, "grad_norm": 0.2640286982059479, "learning_rate": 3e-05, "loss": 1.2923, "num_input_tokens_seen": 301508348, "step": 8590 }, { "epoch": 0.6937669376693767, "grad_norm": 0.2569688558578491, "learning_rate": 3e-05, "loss": 1.3099, "num_input_tokens_seen": 301875020, "step": 8600 }, { "epoch": 0.6945736434108527, "grad_norm": 0.2461465746164322, "learning_rate": 3e-05, "loss": 1.2561, "num_input_tokens_seen": 302220640, "step": 8610 }, { "epoch": 0.6953803491523287, "grad_norm": 0.28222113847732544, "learning_rate": 3e-05, "loss": 1.3067, "num_input_tokens_seen": 302536036, "step": 8620 }, { "epoch": 0.6961870548938047, "grad_norm": 0.2518487870693207, "learning_rate": 3e-05, "loss": 1.2807, "num_input_tokens_seen": 302868688, "step": 8630 }, { "epoch": 0.6969937606352807, "grad_norm": 0.2543613016605377, "learning_rate": 3e-05, "loss": 1.2858, "num_input_tokens_seen": 303210396, "step": 8640 }, { "epoch": 0.6978004663767567, "grad_norm": 0.237895667552948, "learning_rate": 3e-05, "loss": 1.2601, "num_input_tokens_seen": 303569724, "step": 8650 }, { "epoch": 0.6986071721182328, "grad_norm": 0.2580051124095917, "learning_rate": 3e-05, "loss": 1.2494, "num_input_tokens_seen": 303930156, "step": 8660 }, { "epoch": 0.6994138778597089, "grad_norm": 0.269072949886322, "learning_rate": 3e-05, "loss": 1.3098, "num_input_tokens_seen": 304273948, "step": 8670 }, { "epoch": 0.7002205836011849, "grad_norm": 0.24792876839637756, "learning_rate": 3e-05, "loss": 1.2807, "num_input_tokens_seen": 304633556, "step": 8680 }, { "epoch": 0.7010272893426609, "grad_norm": 0.30012139678001404, "learning_rate": 3e-05, "loss": 1.2371, "num_input_tokens_seen": 304995848, "step": 8690 }, { "epoch": 0.7018339950841369, "grad_norm": 0.26541563868522644, "learning_rate": 3e-05, "loss": 1.3035, "num_input_tokens_seen": 305341180, "step": 8700 }, { "epoch": 0.7026407008256129, "grad_norm": 0.2490505874156952, "learning_rate": 3e-05, "loss": 1.3162, "num_input_tokens_seen": 305687480, "step": 8710 }, { "epoch": 0.7034474065670889, "grad_norm": 0.2835010290145874, "learning_rate": 3e-05, "loss": 1.2839, "num_input_tokens_seen": 306057544, "step": 8720 }, { "epoch": 0.7042541123085649, "grad_norm": 0.28543031215667725, "learning_rate": 3e-05, "loss": 1.303, "num_input_tokens_seen": 306430396, "step": 8730 }, { "epoch": 0.705060818050041, "grad_norm": 0.26753681898117065, "learning_rate": 3e-05, "loss": 1.3161, "num_input_tokens_seen": 306769880, "step": 8740 }, { "epoch": 0.705867523791517, "grad_norm": 0.26406893134117126, "learning_rate": 3e-05, "loss": 1.2677, "num_input_tokens_seen": 307115084, "step": 8750 }, { "epoch": 0.706674229532993, "grad_norm": 0.25874289870262146, "learning_rate": 3e-05, "loss": 1.313, "num_input_tokens_seen": 307459572, "step": 8760 }, { "epoch": 0.707480935274469, "grad_norm": 0.26951470971107483, "learning_rate": 3e-05, "loss": 1.2776, "num_input_tokens_seen": 307829708, "step": 8770 }, { "epoch": 0.708287641015945, "grad_norm": 0.26149865984916687, "learning_rate": 3e-05, "loss": 1.2774, "num_input_tokens_seen": 308183368, "step": 8780 }, { "epoch": 0.7090943467574211, "grad_norm": 0.27776703238487244, "learning_rate": 3e-05, "loss": 1.2743, "num_input_tokens_seen": 308541012, "step": 8790 }, { "epoch": 0.7099010524988971, "grad_norm": 0.2505494952201843, "learning_rate": 3e-05, "loss": 1.3011, "num_input_tokens_seen": 308894080, "step": 8800 }, { "epoch": 0.7107077582403731, "grad_norm": 0.26979315280914307, "learning_rate": 3e-05, "loss": 1.281, "num_input_tokens_seen": 309264200, "step": 8810 }, { "epoch": 0.7115144639818491, "grad_norm": 0.29816481471061707, "learning_rate": 3e-05, "loss": 1.2514, "num_input_tokens_seen": 309617452, "step": 8820 }, { "epoch": 0.7123211697233252, "grad_norm": 0.2611445188522339, "learning_rate": 3e-05, "loss": 1.317, "num_input_tokens_seen": 309992684, "step": 8830 }, { "epoch": 0.7131278754648012, "grad_norm": 0.24103762209415436, "learning_rate": 3e-05, "loss": 1.2898, "num_input_tokens_seen": 310335500, "step": 8840 }, { "epoch": 0.7139345812062772, "grad_norm": 0.2735673487186432, "learning_rate": 3e-05, "loss": 1.3122, "num_input_tokens_seen": 310688468, "step": 8850 }, { "epoch": 0.7147412869477532, "grad_norm": 0.28114932775497437, "learning_rate": 3e-05, "loss": 1.3069, "num_input_tokens_seen": 311050176, "step": 8860 }, { "epoch": 0.7155479926892292, "grad_norm": 0.235976904630661, "learning_rate": 3e-05, "loss": 1.2693, "num_input_tokens_seen": 311402020, "step": 8870 }, { "epoch": 0.7163546984307052, "grad_norm": 0.26939788460731506, "learning_rate": 3e-05, "loss": 1.3097, "num_input_tokens_seen": 311759596, "step": 8880 }, { "epoch": 0.7171614041721812, "grad_norm": 0.25951477885246277, "learning_rate": 3e-05, "loss": 1.2904, "num_input_tokens_seen": 312097396, "step": 8890 }, { "epoch": 0.7179681099136572, "grad_norm": 0.2675970196723938, "learning_rate": 3e-05, "loss": 1.2992, "num_input_tokens_seen": 312460216, "step": 8900 }, { "epoch": 0.7187748156551333, "grad_norm": 0.25855639576911926, "learning_rate": 3e-05, "loss": 1.2781, "num_input_tokens_seen": 312826724, "step": 8910 }, { "epoch": 0.7195815213966094, "grad_norm": 0.2917179465293884, "learning_rate": 3e-05, "loss": 1.2743, "num_input_tokens_seen": 313191192, "step": 8920 }, { "epoch": 0.7203882271380854, "grad_norm": 0.2799781262874603, "learning_rate": 3e-05, "loss": 1.2837, "num_input_tokens_seen": 313535952, "step": 8930 }, { "epoch": 0.7211949328795614, "grad_norm": 0.27598562836647034, "learning_rate": 3e-05, "loss": 1.2682, "num_input_tokens_seen": 313891512, "step": 8940 }, { "epoch": 0.7220016386210374, "grad_norm": 0.24936188757419586, "learning_rate": 3e-05, "loss": 1.2888, "num_input_tokens_seen": 314237120, "step": 8950 }, { "epoch": 0.7228083443625134, "grad_norm": 0.28626489639282227, "learning_rate": 3e-05, "loss": 1.2901, "num_input_tokens_seen": 314598912, "step": 8960 }, { "epoch": 0.7236150501039894, "grad_norm": 0.25209441781044006, "learning_rate": 3e-05, "loss": 1.3165, "num_input_tokens_seen": 314942440, "step": 8970 }, { "epoch": 0.7244217558454654, "grad_norm": 0.27700820565223694, "learning_rate": 3e-05, "loss": 1.2825, "num_input_tokens_seen": 315310504, "step": 8980 }, { "epoch": 0.7252284615869414, "grad_norm": 0.2671830356121063, "learning_rate": 3e-05, "loss": 1.276, "num_input_tokens_seen": 315682248, "step": 8990 }, { "epoch": 0.7260351673284174, "grad_norm": 0.3061155080795288, "learning_rate": 3e-05, "loss": 1.298, "num_input_tokens_seen": 316042068, "step": 9000 }, { "epoch": 0.7260351673284174, "eval_gen_len": 464.37, "eval_loss": 1.2210745811462402, "eval_rouge1": 36.6966, "eval_rouge2": 20.8189, "eval_rougeL": 29.7115, "eval_rougeLsum": 34.7528, "eval_runtime": 1686.3491, "eval_samples_per_second": 0.119, "eval_steps_per_second": 0.03, "num_input_tokens_seen": 316042068, "step": 9000 }, { "epoch": 0.7268418730698935, "grad_norm": 0.2395300716161728, "learning_rate": 3e-05, "loss": 1.28, "num_input_tokens_seen": 316377156, "step": 9010 }, { "epoch": 0.7276485788113695, "grad_norm": 0.26959264278411865, "learning_rate": 3e-05, "loss": 1.2808, "num_input_tokens_seen": 316739840, "step": 9020 }, { "epoch": 0.7284552845528456, "grad_norm": 0.2841363549232483, "learning_rate": 3e-05, "loss": 1.2911, "num_input_tokens_seen": 317084172, "step": 9030 }, { "epoch": 0.7292619902943216, "grad_norm": 0.2511976361274719, "learning_rate": 3e-05, "loss": 1.2442, "num_input_tokens_seen": 317440872, "step": 9040 }, { "epoch": 0.7300686960357976, "grad_norm": 0.26146405935287476, "learning_rate": 3e-05, "loss": 1.3228, "num_input_tokens_seen": 317755504, "step": 9050 }, { "epoch": 0.7308754017772736, "grad_norm": 0.2912101745605469, "learning_rate": 3e-05, "loss": 1.2688, "num_input_tokens_seen": 318148088, "step": 9060 }, { "epoch": 0.7316821075187496, "grad_norm": 0.2883487641811371, "learning_rate": 3e-05, "loss": 1.313, "num_input_tokens_seen": 318499404, "step": 9070 }, { "epoch": 0.7324888132602256, "grad_norm": 0.2744971811771393, "learning_rate": 3e-05, "loss": 1.2339, "num_input_tokens_seen": 318857524, "step": 9080 }, { "epoch": 0.7332955190017016, "grad_norm": 0.3002362847328186, "learning_rate": 3e-05, "loss": 1.2448, "num_input_tokens_seen": 319207924, "step": 9090 }, { "epoch": 0.7341022247431777, "grad_norm": 0.28158414363861084, "learning_rate": 3e-05, "loss": 1.27, "num_input_tokens_seen": 319547752, "step": 9100 }, { "epoch": 0.7349089304846537, "grad_norm": 0.2615879774093628, "learning_rate": 3e-05, "loss": 1.2781, "num_input_tokens_seen": 319898356, "step": 9110 }, { "epoch": 0.7357156362261297, "grad_norm": 0.24552986025810242, "learning_rate": 3e-05, "loss": 1.2847, "num_input_tokens_seen": 320260504, "step": 9120 }, { "epoch": 0.7365223419676057, "grad_norm": 0.2580191195011139, "learning_rate": 3e-05, "loss": 1.298, "num_input_tokens_seen": 320619676, "step": 9130 }, { "epoch": 0.7373290477090817, "grad_norm": 0.2691594660282135, "learning_rate": 3e-05, "loss": 1.267, "num_input_tokens_seen": 320975396, "step": 9140 }, { "epoch": 0.7381357534505577, "grad_norm": 0.2579469382762909, "learning_rate": 3e-05, "loss": 1.313, "num_input_tokens_seen": 321340052, "step": 9150 }, { "epoch": 0.7389424591920338, "grad_norm": 0.2658007740974426, "learning_rate": 3e-05, "loss": 1.3099, "num_input_tokens_seen": 321690400, "step": 9160 }, { "epoch": 0.7397491649335098, "grad_norm": 0.2555302679538727, "learning_rate": 3e-05, "loss": 1.2961, "num_input_tokens_seen": 322040856, "step": 9170 }, { "epoch": 0.7405558706749858, "grad_norm": 0.24547891318798065, "learning_rate": 3e-05, "loss": 1.316, "num_input_tokens_seen": 322390612, "step": 9180 }, { "epoch": 0.7413625764164619, "grad_norm": 0.26539695262908936, "learning_rate": 3e-05, "loss": 1.2573, "num_input_tokens_seen": 322751676, "step": 9190 }, { "epoch": 0.7421692821579379, "grad_norm": 0.24796757102012634, "learning_rate": 3e-05, "loss": 1.2308, "num_input_tokens_seen": 323058852, "step": 9200 }, { "epoch": 0.7429759878994139, "grad_norm": 0.26277750730514526, "learning_rate": 3e-05, "loss": 1.2912, "num_input_tokens_seen": 323378492, "step": 9210 }, { "epoch": 0.7437826936408899, "grad_norm": 0.2662057876586914, "learning_rate": 3e-05, "loss": 1.2891, "num_input_tokens_seen": 323741544, "step": 9220 }, { "epoch": 0.7445893993823659, "grad_norm": 0.30241715908050537, "learning_rate": 3e-05, "loss": 1.2783, "num_input_tokens_seen": 324084828, "step": 9230 }, { "epoch": 0.7453961051238419, "grad_norm": 0.24552224576473236, "learning_rate": 3e-05, "loss": 1.2602, "num_input_tokens_seen": 324429788, "step": 9240 }, { "epoch": 0.7462028108653179, "grad_norm": 0.2982407212257385, "learning_rate": 3e-05, "loss": 1.2724, "num_input_tokens_seen": 324762408, "step": 9250 }, { "epoch": 0.7470095166067939, "grad_norm": 0.2681979835033417, "learning_rate": 3e-05, "loss": 1.2691, "num_input_tokens_seen": 325100544, "step": 9260 }, { "epoch": 0.7478162223482699, "grad_norm": 0.23062004148960114, "learning_rate": 3e-05, "loss": 1.2752, "num_input_tokens_seen": 325430040, "step": 9270 }, { "epoch": 0.7486229280897461, "grad_norm": 0.2845359742641449, "learning_rate": 3e-05, "loss": 1.2589, "num_input_tokens_seen": 325776508, "step": 9280 }, { "epoch": 0.7494296338312221, "grad_norm": 0.28453579545021057, "learning_rate": 3e-05, "loss": 1.264, "num_input_tokens_seen": 326132120, "step": 9290 }, { "epoch": 0.7502363395726981, "grad_norm": 0.2852461040019989, "learning_rate": 3e-05, "loss": 1.2583, "num_input_tokens_seen": 326483084, "step": 9300 }, { "epoch": 0.7510430453141741, "grad_norm": 0.25744280219078064, "learning_rate": 3e-05, "loss": 1.2774, "num_input_tokens_seen": 326835932, "step": 9310 }, { "epoch": 0.7518497510556501, "grad_norm": 0.255248486995697, "learning_rate": 3e-05, "loss": 1.2249, "num_input_tokens_seen": 327198368, "step": 9320 }, { "epoch": 0.7526564567971261, "grad_norm": 0.25559529662132263, "learning_rate": 3e-05, "loss": 1.2555, "num_input_tokens_seen": 327558064, "step": 9330 }, { "epoch": 0.7534631625386021, "grad_norm": 0.27276313304901123, "learning_rate": 3e-05, "loss": 1.2728, "num_input_tokens_seen": 327905308, "step": 9340 }, { "epoch": 0.7542698682800781, "grad_norm": 0.26818275451660156, "learning_rate": 3e-05, "loss": 1.2931, "num_input_tokens_seen": 328237128, "step": 9350 }, { "epoch": 0.7550765740215541, "grad_norm": 0.29092878103256226, "learning_rate": 3e-05, "loss": 1.2891, "num_input_tokens_seen": 328586876, "step": 9360 }, { "epoch": 0.7558832797630302, "grad_norm": 0.25079798698425293, "learning_rate": 3e-05, "loss": 1.2917, "num_input_tokens_seen": 328894680, "step": 9370 }, { "epoch": 0.7566899855045062, "grad_norm": 0.23828420042991638, "learning_rate": 3e-05, "loss": 1.3024, "num_input_tokens_seen": 329232548, "step": 9380 }, { "epoch": 0.7574966912459822, "grad_norm": 0.24749857187271118, "learning_rate": 3e-05, "loss": 1.245, "num_input_tokens_seen": 329572096, "step": 9390 }, { "epoch": 0.7583033969874583, "grad_norm": 0.24294038116931915, "learning_rate": 3e-05, "loss": 1.2594, "num_input_tokens_seen": 329935620, "step": 9400 }, { "epoch": 0.7591101027289343, "grad_norm": 0.24688206613063812, "learning_rate": 3e-05, "loss": 1.2966, "num_input_tokens_seen": 330298316, "step": 9410 }, { "epoch": 0.7599168084704103, "grad_norm": 0.26844438910484314, "learning_rate": 3e-05, "loss": 1.2428, "num_input_tokens_seen": 330663576, "step": 9420 }, { "epoch": 0.7607235142118863, "grad_norm": 0.24980930984020233, "learning_rate": 3e-05, "loss": 1.2041, "num_input_tokens_seen": 330992136, "step": 9430 }, { "epoch": 0.7615302199533623, "grad_norm": 0.26029011607170105, "learning_rate": 3e-05, "loss": 1.2654, "num_input_tokens_seen": 331366748, "step": 9440 }, { "epoch": 0.7623369256948384, "grad_norm": 0.2643781006336212, "learning_rate": 3e-05, "loss": 1.2701, "num_input_tokens_seen": 331701028, "step": 9450 }, { "epoch": 0.7631436314363144, "grad_norm": 0.2505422532558441, "learning_rate": 3e-05, "loss": 1.2833, "num_input_tokens_seen": 332092676, "step": 9460 }, { "epoch": 0.7639503371777904, "grad_norm": 0.2630390524864197, "learning_rate": 3e-05, "loss": 1.2207, "num_input_tokens_seen": 332401596, "step": 9470 }, { "epoch": 0.7647570429192664, "grad_norm": 0.27384325861930847, "learning_rate": 3e-05, "loss": 1.3175, "num_input_tokens_seen": 332760052, "step": 9480 }, { "epoch": 0.7655637486607424, "grad_norm": 0.29426440596580505, "learning_rate": 3e-05, "loss": 1.2375, "num_input_tokens_seen": 333118996, "step": 9490 }, { "epoch": 0.7663704544022184, "grad_norm": 0.2638697326183319, "learning_rate": 3e-05, "loss": 1.2639, "num_input_tokens_seen": 333468912, "step": 9500 }, { "epoch": 0.7671771601436944, "grad_norm": 0.2899869978427887, "learning_rate": 3e-05, "loss": 1.3265, "num_input_tokens_seen": 333808660, "step": 9510 }, { "epoch": 0.7679838658851704, "grad_norm": 0.2559219300746918, "learning_rate": 3e-05, "loss": 1.2791, "num_input_tokens_seen": 334133356, "step": 9520 }, { "epoch": 0.7687905716266465, "grad_norm": 0.2566789388656616, "learning_rate": 3e-05, "loss": 1.3236, "num_input_tokens_seen": 334515860, "step": 9530 }, { "epoch": 0.7695972773681226, "grad_norm": 0.2541514039039612, "learning_rate": 3e-05, "loss": 1.2808, "num_input_tokens_seen": 334887600, "step": 9540 }, { "epoch": 0.7704039831095986, "grad_norm": 0.2626420557498932, "learning_rate": 3e-05, "loss": 1.2902, "num_input_tokens_seen": 335269980, "step": 9550 }, { "epoch": 0.7712106888510746, "grad_norm": 0.28111469745635986, "learning_rate": 3e-05, "loss": 1.2285, "num_input_tokens_seen": 335614044, "step": 9560 }, { "epoch": 0.7720173945925506, "grad_norm": 0.26732560992240906, "learning_rate": 3e-05, "loss": 1.2802, "num_input_tokens_seen": 335947240, "step": 9570 }, { "epoch": 0.7728241003340266, "grad_norm": 0.2630169987678528, "learning_rate": 3e-05, "loss": 1.2562, "num_input_tokens_seen": 336277872, "step": 9580 }, { "epoch": 0.7736308060755026, "grad_norm": 0.24275615811347961, "learning_rate": 3e-05, "loss": 1.241, "num_input_tokens_seen": 336616732, "step": 9590 }, { "epoch": 0.7744375118169786, "grad_norm": 0.27467086911201477, "learning_rate": 3e-05, "loss": 1.2783, "num_input_tokens_seen": 336962668, "step": 9600 }, { "epoch": 0.7752442175584546, "grad_norm": 0.22901813685894012, "learning_rate": 3e-05, "loss": 1.2474, "num_input_tokens_seen": 337289600, "step": 9610 }, { "epoch": 0.7760509232999306, "grad_norm": 0.25075381994247437, "learning_rate": 3e-05, "loss": 1.2911, "num_input_tokens_seen": 337638772, "step": 9620 }, { "epoch": 0.7768576290414066, "grad_norm": 0.26371341943740845, "learning_rate": 3e-05, "loss": 1.3082, "num_input_tokens_seen": 337956240, "step": 9630 }, { "epoch": 0.7776643347828827, "grad_norm": 0.2652187943458557, "learning_rate": 3e-05, "loss": 1.2419, "num_input_tokens_seen": 338333320, "step": 9640 }, { "epoch": 0.7784710405243588, "grad_norm": 0.275717169046402, "learning_rate": 3e-05, "loss": 1.2801, "num_input_tokens_seen": 338693484, "step": 9650 }, { "epoch": 0.7792777462658348, "grad_norm": 0.2673225402832031, "learning_rate": 3e-05, "loss": 1.2968, "num_input_tokens_seen": 339059484, "step": 9660 }, { "epoch": 0.7800844520073108, "grad_norm": 0.24011015892028809, "learning_rate": 3e-05, "loss": 1.2643, "num_input_tokens_seen": 339401908, "step": 9670 }, { "epoch": 0.7808911577487868, "grad_norm": 0.2752505838871002, "learning_rate": 3e-05, "loss": 1.2839, "num_input_tokens_seen": 339759036, "step": 9680 }, { "epoch": 0.7816978634902628, "grad_norm": 0.2720450758934021, "learning_rate": 3e-05, "loss": 1.2549, "num_input_tokens_seen": 340108708, "step": 9690 }, { "epoch": 0.7825045692317388, "grad_norm": 0.2938039004802704, "learning_rate": 3e-05, "loss": 1.2702, "num_input_tokens_seen": 340425536, "step": 9700 }, { "epoch": 0.7833112749732148, "grad_norm": 0.2659102976322174, "learning_rate": 3e-05, "loss": 1.23, "num_input_tokens_seen": 340799308, "step": 9710 }, { "epoch": 0.7841179807146909, "grad_norm": 0.26471832394599915, "learning_rate": 3e-05, "loss": 1.1911, "num_input_tokens_seen": 341141740, "step": 9720 }, { "epoch": 0.7849246864561669, "grad_norm": 0.27800023555755615, "learning_rate": 3e-05, "loss": 1.2801, "num_input_tokens_seen": 341473408, "step": 9730 }, { "epoch": 0.7857313921976429, "grad_norm": 0.25355374813079834, "learning_rate": 3e-05, "loss": 1.2339, "num_input_tokens_seen": 341810480, "step": 9740 }, { "epoch": 0.7865380979391189, "grad_norm": 0.25053349137306213, "learning_rate": 3e-05, "loss": 1.2741, "num_input_tokens_seen": 342171584, "step": 9750 }, { "epoch": 0.7873448036805949, "grad_norm": 0.2605432868003845, "learning_rate": 3e-05, "loss": 1.2534, "num_input_tokens_seen": 342516448, "step": 9760 }, { "epoch": 0.788151509422071, "grad_norm": 0.27407005429267883, "learning_rate": 3e-05, "loss": 1.276, "num_input_tokens_seen": 342861752, "step": 9770 }, { "epoch": 0.788958215163547, "grad_norm": 0.2646719515323639, "learning_rate": 3e-05, "loss": 1.2308, "num_input_tokens_seen": 343235864, "step": 9780 }, { "epoch": 0.789764920905023, "grad_norm": 0.2499488741159439, "learning_rate": 3e-05, "loss": 1.2787, "num_input_tokens_seen": 343585360, "step": 9790 }, { "epoch": 0.790571626646499, "grad_norm": 0.25169795751571655, "learning_rate": 3e-05, "loss": 1.272, "num_input_tokens_seen": 343949028, "step": 9800 }, { "epoch": 0.7913783323879751, "grad_norm": 0.25061219930648804, "learning_rate": 3e-05, "loss": 1.2543, "num_input_tokens_seen": 344341964, "step": 9810 }, { "epoch": 0.7921850381294511, "grad_norm": 0.27238261699676514, "learning_rate": 3e-05, "loss": 1.2527, "num_input_tokens_seen": 344699776, "step": 9820 }, { "epoch": 0.7929917438709271, "grad_norm": 0.26253870129585266, "learning_rate": 3e-05, "loss": 1.2626, "num_input_tokens_seen": 345029320, "step": 9830 }, { "epoch": 0.7937984496124031, "grad_norm": 0.2650923728942871, "learning_rate": 3e-05, "loss": 1.2253, "num_input_tokens_seen": 345405684, "step": 9840 }, { "epoch": 0.7946051553538791, "grad_norm": 0.2489556223154068, "learning_rate": 3e-05, "loss": 1.2682, "num_input_tokens_seen": 345764444, "step": 9850 }, { "epoch": 0.7954118610953551, "grad_norm": 0.2614899277687073, "learning_rate": 3e-05, "loss": 1.2586, "num_input_tokens_seen": 346123540, "step": 9860 }, { "epoch": 0.7962185668368311, "grad_norm": 0.25150853395462036, "learning_rate": 3e-05, "loss": 1.2272, "num_input_tokens_seen": 346484268, "step": 9870 }, { "epoch": 0.7970252725783071, "grad_norm": 0.2592512369155884, "learning_rate": 3e-05, "loss": 1.2478, "num_input_tokens_seen": 346850032, "step": 9880 }, { "epoch": 0.7978319783197833, "grad_norm": 0.26685789227485657, "learning_rate": 3e-05, "loss": 1.2845, "num_input_tokens_seen": 347208532, "step": 9890 }, { "epoch": 0.7986386840612593, "grad_norm": 0.2619518041610718, "learning_rate": 3e-05, "loss": 1.2526, "num_input_tokens_seen": 347546084, "step": 9900 }, { "epoch": 0.7994453898027353, "grad_norm": 0.3048644959926605, "learning_rate": 3e-05, "loss": 1.2763, "num_input_tokens_seen": 347891900, "step": 9910 }, { "epoch": 0.8002520955442113, "grad_norm": 0.2470572292804718, "learning_rate": 3e-05, "loss": 1.2488, "num_input_tokens_seen": 348241444, "step": 9920 }, { "epoch": 0.8010588012856873, "grad_norm": 0.25996264815330505, "learning_rate": 3e-05, "loss": 1.2448, "num_input_tokens_seen": 348600368, "step": 9930 }, { "epoch": 0.8018655070271633, "grad_norm": 0.25079694390296936, "learning_rate": 3e-05, "loss": 1.2158, "num_input_tokens_seen": 348934544, "step": 9940 }, { "epoch": 0.8026722127686393, "grad_norm": 0.2604506015777588, "learning_rate": 3e-05, "loss": 1.2665, "num_input_tokens_seen": 349266356, "step": 9950 }, { "epoch": 0.8034789185101153, "grad_norm": 0.26775991916656494, "learning_rate": 3e-05, "loss": 1.2491, "num_input_tokens_seen": 349637740, "step": 9960 }, { "epoch": 0.8042856242515913, "grad_norm": 0.2628551125526428, "learning_rate": 3e-05, "loss": 1.2875, "num_input_tokens_seen": 349978220, "step": 9970 }, { "epoch": 0.8050923299930673, "grad_norm": 0.2629667818546295, "learning_rate": 3e-05, "loss": 1.21, "num_input_tokens_seen": 350338180, "step": 9980 }, { "epoch": 0.8058990357345434, "grad_norm": 0.26192960143089294, "learning_rate": 3e-05, "loss": 1.2143, "num_input_tokens_seen": 350702660, "step": 9990 }, { "epoch": 0.8067057414760194, "grad_norm": 0.24086323380470276, "learning_rate": 3e-05, "loss": 1.2834, "num_input_tokens_seen": 351056548, "step": 10000 }, { "epoch": 0.8067057414760194, "eval_gen_len": 446.26, "eval_loss": 1.1978570222854614, "eval_rouge1": 37.7181, "eval_rouge2": 20.9926, "eval_rougeL": 30.3857, "eval_rougeLsum": 35.8681, "eval_runtime": 1488.0454, "eval_samples_per_second": 0.134, "eval_steps_per_second": 0.034, "num_input_tokens_seen": 351056548, "step": 10000 }, { "epoch": 0.8075124472174954, "grad_norm": 0.27893269062042236, "learning_rate": 3e-05, "loss": 1.2608, "num_input_tokens_seen": 351419124, "step": 10010 }, { "epoch": 0.8083191529589715, "grad_norm": 0.2801869511604309, "learning_rate": 3e-05, "loss": 1.2362, "num_input_tokens_seen": 351760664, "step": 10020 }, { "epoch": 0.8091258587004475, "grad_norm": 0.2547568380832672, "learning_rate": 3e-05, "loss": 1.1999, "num_input_tokens_seen": 352145232, "step": 10030 }, { "epoch": 0.8099325644419235, "grad_norm": 0.2530830502510071, "learning_rate": 3e-05, "loss": 1.2576, "num_input_tokens_seen": 352484012, "step": 10040 }, { "epoch": 0.8107392701833995, "grad_norm": 0.25283852219581604, "learning_rate": 3e-05, "loss": 1.2672, "num_input_tokens_seen": 352838284, "step": 10050 }, { "epoch": 0.8115459759248755, "grad_norm": 0.2714962661266327, "learning_rate": 3e-05, "loss": 1.2241, "num_input_tokens_seen": 353196252, "step": 10060 }, { "epoch": 0.8123526816663516, "grad_norm": 0.2614021301269531, "learning_rate": 3e-05, "loss": 1.2539, "num_input_tokens_seen": 353557696, "step": 10070 }, { "epoch": 0.8131593874078276, "grad_norm": 0.25115180015563965, "learning_rate": 3e-05, "loss": 1.2152, "num_input_tokens_seen": 353901440, "step": 10080 }, { "epoch": 0.8139660931493036, "grad_norm": 0.23511908948421478, "learning_rate": 3e-05, "loss": 1.227, "num_input_tokens_seen": 354266972, "step": 10090 }, { "epoch": 0.8147727988907796, "grad_norm": 0.2694503366947174, "learning_rate": 3e-05, "loss": 1.2151, "num_input_tokens_seen": 354620136, "step": 10100 }, { "epoch": 0.8155795046322556, "grad_norm": 0.27539879083633423, "learning_rate": 3e-05, "loss": 1.2867, "num_input_tokens_seen": 354981004, "step": 10110 }, { "epoch": 0.8163862103737316, "grad_norm": 0.25558432936668396, "learning_rate": 3e-05, "loss": 1.2376, "num_input_tokens_seen": 355343284, "step": 10120 }, { "epoch": 0.8171929161152076, "grad_norm": 0.24992291629314423, "learning_rate": 3e-05, "loss": 1.2487, "num_input_tokens_seen": 355681348, "step": 10130 }, { "epoch": 0.8179996218566837, "grad_norm": 0.25410589575767517, "learning_rate": 3e-05, "loss": 1.2606, "num_input_tokens_seen": 356040020, "step": 10140 }, { "epoch": 0.8188063275981597, "grad_norm": 0.23031924664974213, "learning_rate": 3e-05, "loss": 1.2462, "num_input_tokens_seen": 356402532, "step": 10150 }, { "epoch": 0.8196130333396358, "grad_norm": 0.26112812757492065, "learning_rate": 3e-05, "loss": 1.2594, "num_input_tokens_seen": 356761504, "step": 10160 }, { "epoch": 0.8204197390811118, "grad_norm": 0.2500099837779999, "learning_rate": 3e-05, "loss": 1.2115, "num_input_tokens_seen": 357123360, "step": 10170 }, { "epoch": 0.8212264448225878, "grad_norm": 0.2862362861633301, "learning_rate": 3e-05, "loss": 1.2912, "num_input_tokens_seen": 357475416, "step": 10180 }, { "epoch": 0.8220331505640638, "grad_norm": 0.2600359320640564, "learning_rate": 3e-05, "loss": 1.226, "num_input_tokens_seen": 357793912, "step": 10190 }, { "epoch": 0.8228398563055398, "grad_norm": 0.25250157713890076, "learning_rate": 3e-05, "loss": 1.2438, "num_input_tokens_seen": 358192864, "step": 10200 }, { "epoch": 0.8236465620470158, "grad_norm": 0.2738971412181854, "learning_rate": 3e-05, "loss": 1.2416, "num_input_tokens_seen": 358538800, "step": 10210 }, { "epoch": 0.8244532677884918, "grad_norm": 0.23127759993076324, "learning_rate": 3e-05, "loss": 1.2963, "num_input_tokens_seen": 358882896, "step": 10220 }, { "epoch": 0.8252599735299678, "grad_norm": 0.25735771656036377, "learning_rate": 3e-05, "loss": 1.2289, "num_input_tokens_seen": 359243952, "step": 10230 }, { "epoch": 0.8260666792714438, "grad_norm": 0.2557520568370819, "learning_rate": 3e-05, "loss": 1.235, "num_input_tokens_seen": 359596716, "step": 10240 }, { "epoch": 0.8268733850129198, "grad_norm": 0.24353064596652985, "learning_rate": 3e-05, "loss": 1.2734, "num_input_tokens_seen": 359931524, "step": 10250 }, { "epoch": 0.827680090754396, "grad_norm": 0.24218714237213135, "learning_rate": 3e-05, "loss": 1.2855, "num_input_tokens_seen": 360277308, "step": 10260 }, { "epoch": 0.828486796495872, "grad_norm": 0.2978828549385071, "learning_rate": 3e-05, "loss": 1.3319, "num_input_tokens_seen": 360642156, "step": 10270 }, { "epoch": 0.829293502237348, "grad_norm": 0.2385886013507843, "learning_rate": 3e-05, "loss": 1.2721, "num_input_tokens_seen": 361012064, "step": 10280 }, { "epoch": 0.830100207978824, "grad_norm": 0.274522602558136, "learning_rate": 3e-05, "loss": 1.2921, "num_input_tokens_seen": 361321708, "step": 10290 }, { "epoch": 0.8309069137203, "grad_norm": 0.22934795916080475, "learning_rate": 3e-05, "loss": 1.2827, "num_input_tokens_seen": 361669832, "step": 10300 }, { "epoch": 0.831713619461776, "grad_norm": 0.2701473534107208, "learning_rate": 3e-05, "loss": 1.2205, "num_input_tokens_seen": 362010440, "step": 10310 }, { "epoch": 0.832520325203252, "grad_norm": 0.22467046976089478, "learning_rate": 3e-05, "loss": 1.2203, "num_input_tokens_seen": 362373936, "step": 10320 }, { "epoch": 0.833327030944728, "grad_norm": 0.24814799427986145, "learning_rate": 3e-05, "loss": 1.2372, "num_input_tokens_seen": 362719892, "step": 10330 }, { "epoch": 0.834133736686204, "grad_norm": 0.25354889035224915, "learning_rate": 3e-05, "loss": 1.269, "num_input_tokens_seen": 363050116, "step": 10340 }, { "epoch": 0.8349404424276801, "grad_norm": 0.2522750496864319, "learning_rate": 3e-05, "loss": 1.2287, "num_input_tokens_seen": 363410780, "step": 10350 }, { "epoch": 0.8357471481691561, "grad_norm": 0.2644040882587433, "learning_rate": 3e-05, "loss": 1.283, "num_input_tokens_seen": 363780168, "step": 10360 }, { "epoch": 0.8365538539106321, "grad_norm": 0.24271726608276367, "learning_rate": 3e-05, "loss": 1.2452, "num_input_tokens_seen": 364140384, "step": 10370 }, { "epoch": 0.8373605596521081, "grad_norm": 0.2616620659828186, "learning_rate": 3e-05, "loss": 1.2483, "num_input_tokens_seen": 364502480, "step": 10380 }, { "epoch": 0.8381672653935842, "grad_norm": 0.26111093163490295, "learning_rate": 3e-05, "loss": 1.2139, "num_input_tokens_seen": 364861816, "step": 10390 }, { "epoch": 0.8389739711350602, "grad_norm": 0.23570705950260162, "learning_rate": 3e-05, "loss": 1.2376, "num_input_tokens_seen": 365218500, "step": 10400 }, { "epoch": 0.8397806768765362, "grad_norm": 0.24099615216255188, "learning_rate": 3e-05, "loss": 1.2236, "num_input_tokens_seen": 365576996, "step": 10410 }, { "epoch": 0.8405873826180122, "grad_norm": 0.261840283870697, "learning_rate": 3e-05, "loss": 1.3198, "num_input_tokens_seen": 365916660, "step": 10420 }, { "epoch": 0.8413940883594883, "grad_norm": 0.2680794298648834, "learning_rate": 3e-05, "loss": 1.2491, "num_input_tokens_seen": 366264708, "step": 10430 }, { "epoch": 0.8422007941009643, "grad_norm": 0.26973119378089905, "learning_rate": 3e-05, "loss": 1.2461, "num_input_tokens_seen": 366626892, "step": 10440 }, { "epoch": 0.8430074998424403, "grad_norm": 0.2640502154827118, "learning_rate": 3e-05, "loss": 1.234, "num_input_tokens_seen": 366982568, "step": 10450 }, { "epoch": 0.8438142055839163, "grad_norm": 0.2516578435897827, "learning_rate": 3e-05, "loss": 1.1995, "num_input_tokens_seen": 367303208, "step": 10460 }, { "epoch": 0.8446209113253923, "grad_norm": 0.264775812625885, "learning_rate": 3e-05, "loss": 1.2123, "num_input_tokens_seen": 367626724, "step": 10470 }, { "epoch": 0.8454276170668683, "grad_norm": 0.252989798784256, "learning_rate": 3e-05, "loss": 1.2622, "num_input_tokens_seen": 367987924, "step": 10480 }, { "epoch": 0.8462343228083443, "grad_norm": 0.2506852447986603, "learning_rate": 3e-05, "loss": 1.2796, "num_input_tokens_seen": 368365224, "step": 10490 }, { "epoch": 0.8470410285498203, "grad_norm": 0.2525902986526489, "learning_rate": 3e-05, "loss": 1.2393, "num_input_tokens_seen": 368706164, "step": 10500 }, { "epoch": 0.8478477342912965, "grad_norm": 0.3259766399860382, "learning_rate": 3e-05, "loss": 1.3053, "num_input_tokens_seen": 369069824, "step": 10510 }, { "epoch": 0.8486544400327725, "grad_norm": 0.246359184384346, "learning_rate": 3e-05, "loss": 1.22, "num_input_tokens_seen": 369420620, "step": 10520 }, { "epoch": 0.8494611457742485, "grad_norm": 0.2465633898973465, "learning_rate": 3e-05, "loss": 1.26, "num_input_tokens_seen": 369789168, "step": 10530 }, { "epoch": 0.8502678515157245, "grad_norm": 0.2697504758834839, "learning_rate": 3e-05, "loss": 1.2437, "num_input_tokens_seen": 370155836, "step": 10540 }, { "epoch": 0.8510745572572005, "grad_norm": 0.25357383489608765, "learning_rate": 3e-05, "loss": 1.2639, "num_input_tokens_seen": 370489560, "step": 10550 }, { "epoch": 0.8518812629986765, "grad_norm": 0.27426791191101074, "learning_rate": 3e-05, "loss": 1.2564, "num_input_tokens_seen": 370852008, "step": 10560 }, { "epoch": 0.8526879687401525, "grad_norm": 0.26024049520492554, "learning_rate": 3e-05, "loss": 1.2902, "num_input_tokens_seen": 371194808, "step": 10570 }, { "epoch": 0.8534946744816285, "grad_norm": 0.28873512148857117, "learning_rate": 3e-05, "loss": 1.1927, "num_input_tokens_seen": 371559520, "step": 10580 }, { "epoch": 0.8543013802231045, "grad_norm": 0.2774757146835327, "learning_rate": 3e-05, "loss": 1.2304, "num_input_tokens_seen": 371899308, "step": 10590 }, { "epoch": 0.8551080859645805, "grad_norm": 0.27191224694252014, "learning_rate": 3e-05, "loss": 1.1987, "num_input_tokens_seen": 372232400, "step": 10600 }, { "epoch": 0.8559147917060566, "grad_norm": 0.26448413729667664, "learning_rate": 3e-05, "loss": 1.2394, "num_input_tokens_seen": 372589132, "step": 10610 }, { "epoch": 0.8567214974475326, "grad_norm": 0.26863351464271545, "learning_rate": 3e-05, "loss": 1.2914, "num_input_tokens_seen": 372937492, "step": 10620 }, { "epoch": 0.8575282031890087, "grad_norm": 0.2653568685054779, "learning_rate": 3e-05, "loss": 1.236, "num_input_tokens_seen": 373293780, "step": 10630 }, { "epoch": 0.8583349089304847, "grad_norm": 0.27198871970176697, "learning_rate": 3e-05, "loss": 1.2327, "num_input_tokens_seen": 373680012, "step": 10640 }, { "epoch": 0.8591416146719607, "grad_norm": 0.2744047939777374, "learning_rate": 3e-05, "loss": 1.2706, "num_input_tokens_seen": 374041616, "step": 10650 }, { "epoch": 0.8599483204134367, "grad_norm": 0.24338699877262115, "learning_rate": 3e-05, "loss": 1.27, "num_input_tokens_seen": 374421228, "step": 10660 }, { "epoch": 0.8607550261549127, "grad_norm": 0.2561684846878052, "learning_rate": 3e-05, "loss": 1.2483, "num_input_tokens_seen": 374766780, "step": 10670 }, { "epoch": 0.8615617318963887, "grad_norm": 0.2887466549873352, "learning_rate": 3e-05, "loss": 1.2688, "num_input_tokens_seen": 375121440, "step": 10680 }, { "epoch": 0.8623684376378647, "grad_norm": 0.2793877422809601, "learning_rate": 3e-05, "loss": 1.2325, "num_input_tokens_seen": 375484872, "step": 10690 }, { "epoch": 0.8631751433793408, "grad_norm": 0.26802805066108704, "learning_rate": 3e-05, "loss": 1.2388, "num_input_tokens_seen": 375836092, "step": 10700 }, { "epoch": 0.8639818491208168, "grad_norm": 0.2660770118236542, "learning_rate": 3e-05, "loss": 1.2379, "num_input_tokens_seen": 376169816, "step": 10710 }, { "epoch": 0.8647885548622928, "grad_norm": 0.26407331228256226, "learning_rate": 3e-05, "loss": 1.2386, "num_input_tokens_seen": 376523852, "step": 10720 }, { "epoch": 0.8655952606037688, "grad_norm": 0.23881566524505615, "learning_rate": 3e-05, "loss": 1.278, "num_input_tokens_seen": 376881480, "step": 10730 }, { "epoch": 0.8664019663452448, "grad_norm": 0.2527766823768616, "learning_rate": 3e-05, "loss": 1.2352, "num_input_tokens_seen": 377225680, "step": 10740 }, { "epoch": 0.8672086720867209, "grad_norm": 0.25618699193000793, "learning_rate": 3e-05, "loss": 1.2561, "num_input_tokens_seen": 377581152, "step": 10750 }, { "epoch": 0.8680153778281969, "grad_norm": 0.2603427767753601, "learning_rate": 3e-05, "loss": 1.245, "num_input_tokens_seen": 377923004, "step": 10760 }, { "epoch": 0.8688220835696729, "grad_norm": 0.2423306107521057, "learning_rate": 3e-05, "loss": 1.2524, "num_input_tokens_seen": 378270880, "step": 10770 }, { "epoch": 0.869628789311149, "grad_norm": 0.2624494731426239, "learning_rate": 3e-05, "loss": 1.2438, "num_input_tokens_seen": 378626340, "step": 10780 }, { "epoch": 0.870435495052625, "grad_norm": 0.26242879033088684, "learning_rate": 3e-05, "loss": 1.2235, "num_input_tokens_seen": 378968780, "step": 10790 }, { "epoch": 0.871242200794101, "grad_norm": 0.2819896340370178, "learning_rate": 3e-05, "loss": 1.1845, "num_input_tokens_seen": 379331496, "step": 10800 }, { "epoch": 0.872048906535577, "grad_norm": 0.25225383043289185, "learning_rate": 3e-05, "loss": 1.2268, "num_input_tokens_seen": 379686312, "step": 10810 }, { "epoch": 0.872855612277053, "grad_norm": 0.33487361669540405, "learning_rate": 3e-05, "loss": 1.2184, "num_input_tokens_seen": 380041796, "step": 10820 }, { "epoch": 0.873662318018529, "grad_norm": 0.25806111097335815, "learning_rate": 3e-05, "loss": 1.2289, "num_input_tokens_seen": 380391316, "step": 10830 }, { "epoch": 0.874469023760005, "grad_norm": 0.2700815796852112, "learning_rate": 3e-05, "loss": 1.208, "num_input_tokens_seen": 380758180, "step": 10840 }, { "epoch": 0.875275729501481, "grad_norm": 0.24442021548748016, "learning_rate": 3e-05, "loss": 1.2071, "num_input_tokens_seen": 381140420, "step": 10850 }, { "epoch": 0.876082435242957, "grad_norm": 0.27837643027305603, "learning_rate": 3e-05, "loss": 1.1758, "num_input_tokens_seen": 381510192, "step": 10860 }, { "epoch": 0.876889140984433, "grad_norm": 0.2531345784664154, "learning_rate": 3e-05, "loss": 1.201, "num_input_tokens_seen": 381860536, "step": 10870 }, { "epoch": 0.8776958467259092, "grad_norm": 0.25533026456832886, "learning_rate": 3e-05, "loss": 1.2518, "num_input_tokens_seen": 382230860, "step": 10880 }, { "epoch": 0.8785025524673852, "grad_norm": 0.2697776257991791, "learning_rate": 3e-05, "loss": 1.2115, "num_input_tokens_seen": 382575708, "step": 10890 }, { "epoch": 0.8793092582088612, "grad_norm": 0.275545597076416, "learning_rate": 3e-05, "loss": 1.2691, "num_input_tokens_seen": 382952276, "step": 10900 }, { "epoch": 0.8801159639503372, "grad_norm": 0.2756127715110779, "learning_rate": 3e-05, "loss": 1.195, "num_input_tokens_seen": 383361024, "step": 10910 }, { "epoch": 0.8809226696918132, "grad_norm": 0.26673251390457153, "learning_rate": 3e-05, "loss": 1.2974, "num_input_tokens_seen": 383709472, "step": 10920 }, { "epoch": 0.8817293754332892, "grad_norm": 0.27520835399627686, "learning_rate": 3e-05, "loss": 1.1702, "num_input_tokens_seen": 384065488, "step": 10930 }, { "epoch": 0.8825360811747652, "grad_norm": 0.2573419213294983, "learning_rate": 3e-05, "loss": 1.2493, "num_input_tokens_seen": 384413564, "step": 10940 }, { "epoch": 0.8833427869162412, "grad_norm": 0.3231302499771118, "learning_rate": 3e-05, "loss": 1.2488, "num_input_tokens_seen": 384776660, "step": 10950 }, { "epoch": 0.8841494926577173, "grad_norm": 0.2685335874557495, "learning_rate": 3e-05, "loss": 1.2101, "num_input_tokens_seen": 385121296, "step": 10960 }, { "epoch": 0.8849561983991933, "grad_norm": 0.26467591524124146, "learning_rate": 3e-05, "loss": 1.2812, "num_input_tokens_seen": 385460624, "step": 10970 }, { "epoch": 0.8857629041406693, "grad_norm": 0.23645007610321045, "learning_rate": 3e-05, "loss": 1.2405, "num_input_tokens_seen": 385805028, "step": 10980 }, { "epoch": 0.8865696098821453, "grad_norm": 0.2732267677783966, "learning_rate": 3e-05, "loss": 1.2274, "num_input_tokens_seen": 386134288, "step": 10990 }, { "epoch": 0.8873763156236214, "grad_norm": 0.2679040729999542, "learning_rate": 3e-05, "loss": 1.2577, "num_input_tokens_seen": 386471860, "step": 11000 }, { "epoch": 0.8873763156236214, "eval_gen_len": 424.445, "eval_loss": 1.1752405166625977, "eval_rouge1": 39.3539, "eval_rouge2": 23.0123, "eval_rougeL": 31.9005, "eval_rougeLsum": 37.4941, "eval_runtime": 1475.7796, "eval_samples_per_second": 0.136, "eval_steps_per_second": 0.034, "num_input_tokens_seen": 386471860, "step": 11000 }, { "epoch": 0.8881830213650974, "grad_norm": 0.24609152972698212, "learning_rate": 3e-05, "loss": 1.2538, "num_input_tokens_seen": 386829236, "step": 11010 }, { "epoch": 0.8889897271065734, "grad_norm": 0.23998071253299713, "learning_rate": 3e-05, "loss": 1.2311, "num_input_tokens_seen": 387162280, "step": 11020 }, { "epoch": 0.8897964328480494, "grad_norm": 0.2572784125804901, "learning_rate": 3e-05, "loss": 1.2318, "num_input_tokens_seen": 387517828, "step": 11030 }, { "epoch": 0.8906031385895254, "grad_norm": 0.258114755153656, "learning_rate": 3e-05, "loss": 1.2098, "num_input_tokens_seen": 387867880, "step": 11040 }, { "epoch": 0.8914098443310015, "grad_norm": 0.28761738538742065, "learning_rate": 3e-05, "loss": 1.245, "num_input_tokens_seen": 388222904, "step": 11050 }, { "epoch": 0.8922165500724775, "grad_norm": 0.26138409972190857, "learning_rate": 3e-05, "loss": 1.2173, "num_input_tokens_seen": 388568168, "step": 11060 }, { "epoch": 0.8930232558139535, "grad_norm": 0.26064634323120117, "learning_rate": 3e-05, "loss": 1.2018, "num_input_tokens_seen": 388926736, "step": 11070 }, { "epoch": 0.8938299615554295, "grad_norm": 0.28964129090309143, "learning_rate": 3e-05, "loss": 1.2191, "num_input_tokens_seen": 389270524, "step": 11080 }, { "epoch": 0.8946366672969055, "grad_norm": 0.2423638552427292, "learning_rate": 3e-05, "loss": 1.2145, "num_input_tokens_seen": 389663824, "step": 11090 }, { "epoch": 0.8954433730383815, "grad_norm": 0.27935534715652466, "learning_rate": 3e-05, "loss": 1.2177, "num_input_tokens_seen": 390019620, "step": 11100 }, { "epoch": 0.8962500787798575, "grad_norm": 0.29713118076324463, "learning_rate": 3e-05, "loss": 1.2533, "num_input_tokens_seen": 390367580, "step": 11110 }, { "epoch": 0.8970567845213336, "grad_norm": 0.2777055501937866, "learning_rate": 3e-05, "loss": 1.2234, "num_input_tokens_seen": 390735060, "step": 11120 }, { "epoch": 0.8978634902628096, "grad_norm": 0.2500898838043213, "learning_rate": 3e-05, "loss": 1.2104, "num_input_tokens_seen": 391075804, "step": 11130 }, { "epoch": 0.8986701960042857, "grad_norm": 0.26286810636520386, "learning_rate": 3e-05, "loss": 1.2567, "num_input_tokens_seen": 391402956, "step": 11140 }, { "epoch": 0.8994769017457617, "grad_norm": 0.2514180839061737, "learning_rate": 3e-05, "loss": 1.2498, "num_input_tokens_seen": 391738360, "step": 11150 }, { "epoch": 0.9002836074872377, "grad_norm": 0.27611491084098816, "learning_rate": 3e-05, "loss": 1.1939, "num_input_tokens_seen": 392082044, "step": 11160 }, { "epoch": 0.9010903132287137, "grad_norm": 0.2573927342891693, "learning_rate": 3e-05, "loss": 1.2355, "num_input_tokens_seen": 392441160, "step": 11170 }, { "epoch": 0.9018970189701897, "grad_norm": 0.2716425955295563, "learning_rate": 3e-05, "loss": 1.2134, "num_input_tokens_seen": 392797140, "step": 11180 }, { "epoch": 0.9027037247116657, "grad_norm": 0.2436821162700653, "learning_rate": 3e-05, "loss": 1.229, "num_input_tokens_seen": 393158316, "step": 11190 }, { "epoch": 0.9035104304531417, "grad_norm": 0.27646389603614807, "learning_rate": 3e-05, "loss": 1.2138, "num_input_tokens_seen": 393471508, "step": 11200 }, { "epoch": 0.9043171361946177, "grad_norm": 0.2678287625312805, "learning_rate": 3e-05, "loss": 1.2516, "num_input_tokens_seen": 393806264, "step": 11210 }, { "epoch": 0.9051238419360937, "grad_norm": 0.2638424336910248, "learning_rate": 3e-05, "loss": 1.2467, "num_input_tokens_seen": 394161404, "step": 11220 }, { "epoch": 0.9059305476775698, "grad_norm": 0.2639593183994293, "learning_rate": 3e-05, "loss": 1.2145, "num_input_tokens_seen": 394526568, "step": 11230 }, { "epoch": 0.9067372534190458, "grad_norm": 0.25803256034851074, "learning_rate": 3e-05, "loss": 1.3036, "num_input_tokens_seen": 394866788, "step": 11240 }, { "epoch": 0.9075439591605219, "grad_norm": 0.2518157362937927, "learning_rate": 3e-05, "loss": 1.2081, "num_input_tokens_seen": 395190516, "step": 11250 }, { "epoch": 0.9083506649019979, "grad_norm": 0.2544965147972107, "learning_rate": 3e-05, "loss": 1.2234, "num_input_tokens_seen": 395528392, "step": 11260 }, { "epoch": 0.9091573706434739, "grad_norm": 0.24782590568065643, "learning_rate": 3e-05, "loss": 1.1547, "num_input_tokens_seen": 395880192, "step": 11270 }, { "epoch": 0.9099640763849499, "grad_norm": 0.2636893093585968, "learning_rate": 3e-05, "loss": 1.2305, "num_input_tokens_seen": 396223844, "step": 11280 }, { "epoch": 0.9107707821264259, "grad_norm": 0.2468230426311493, "learning_rate": 3e-05, "loss": 1.204, "num_input_tokens_seen": 396543560, "step": 11290 }, { "epoch": 0.9115774878679019, "grad_norm": 0.2818716764450073, "learning_rate": 3e-05, "loss": 1.1927, "num_input_tokens_seen": 396879784, "step": 11300 }, { "epoch": 0.912384193609378, "grad_norm": 0.24603427946567535, "learning_rate": 3e-05, "loss": 1.2276, "num_input_tokens_seen": 397247352, "step": 11310 }, { "epoch": 0.913190899350854, "grad_norm": 0.24526093900203705, "learning_rate": 3e-05, "loss": 1.2523, "num_input_tokens_seen": 397604360, "step": 11320 }, { "epoch": 0.91399760509233, "grad_norm": 0.26731881499290466, "learning_rate": 3e-05, "loss": 1.2662, "num_input_tokens_seen": 397928512, "step": 11330 }, { "epoch": 0.914804310833806, "grad_norm": 0.2755918800830841, "learning_rate": 3e-05, "loss": 1.2677, "num_input_tokens_seen": 398264700, "step": 11340 }, { "epoch": 0.915611016575282, "grad_norm": 0.25634992122650146, "learning_rate": 3e-05, "loss": 1.1865, "num_input_tokens_seen": 398622488, "step": 11350 }, { "epoch": 0.916417722316758, "grad_norm": 0.27104732394218445, "learning_rate": 3e-05, "loss": 1.2323, "num_input_tokens_seen": 398927144, "step": 11360 }, { "epoch": 0.9172244280582341, "grad_norm": 0.25183597207069397, "learning_rate": 3e-05, "loss": 1.2618, "num_input_tokens_seen": 399315068, "step": 11370 }, { "epoch": 0.9180311337997101, "grad_norm": 0.23518332839012146, "learning_rate": 3e-05, "loss": 1.2591, "num_input_tokens_seen": 399668488, "step": 11380 }, { "epoch": 0.9188378395411861, "grad_norm": 0.23520028591156006, "learning_rate": 3e-05, "loss": 1.2336, "num_input_tokens_seen": 400018476, "step": 11390 }, { "epoch": 0.9196445452826622, "grad_norm": 0.27664098143577576, "learning_rate": 3e-05, "loss": 1.2167, "num_input_tokens_seen": 400350296, "step": 11400 }, { "epoch": 0.9204512510241382, "grad_norm": 0.2558439373970032, "learning_rate": 3e-05, "loss": 1.1831, "num_input_tokens_seen": 400653728, "step": 11410 }, { "epoch": 0.9212579567656142, "grad_norm": 0.24782094359397888, "learning_rate": 3e-05, "loss": 1.2122, "num_input_tokens_seen": 400992668, "step": 11420 }, { "epoch": 0.9220646625070902, "grad_norm": 0.23971796035766602, "learning_rate": 3e-05, "loss": 1.2251, "num_input_tokens_seen": 401351424, "step": 11430 }, { "epoch": 0.9228713682485662, "grad_norm": 0.24755193293094635, "learning_rate": 3e-05, "loss": 1.197, "num_input_tokens_seen": 401726484, "step": 11440 }, { "epoch": 0.9236780739900422, "grad_norm": 0.26952269673347473, "learning_rate": 3e-05, "loss": 1.1654, "num_input_tokens_seen": 402097328, "step": 11450 }, { "epoch": 0.9244847797315182, "grad_norm": 0.24309176206588745, "learning_rate": 3e-05, "loss": 1.2339, "num_input_tokens_seen": 402448540, "step": 11460 }, { "epoch": 0.9252914854729942, "grad_norm": 0.2862485349178314, "learning_rate": 3e-05, "loss": 1.2023, "num_input_tokens_seen": 402817680, "step": 11470 }, { "epoch": 0.9260981912144702, "grad_norm": 0.3049052655696869, "learning_rate": 3e-05, "loss": 1.2021, "num_input_tokens_seen": 403181196, "step": 11480 }, { "epoch": 0.9269048969559464, "grad_norm": 0.25457674264907837, "learning_rate": 3e-05, "loss": 1.2485, "num_input_tokens_seen": 403550872, "step": 11490 }, { "epoch": 0.9277116026974224, "grad_norm": 0.24556294083595276, "learning_rate": 3e-05, "loss": 1.2269, "num_input_tokens_seen": 403922164, "step": 11500 }, { "epoch": 0.9285183084388984, "grad_norm": 0.292858362197876, "learning_rate": 3e-05, "loss": 1.2211, "num_input_tokens_seen": 404240116, "step": 11510 }, { "epoch": 0.9293250141803744, "grad_norm": 0.23489707708358765, "learning_rate": 3e-05, "loss": 1.238, "num_input_tokens_seen": 404625636, "step": 11520 }, { "epoch": 0.9301317199218504, "grad_norm": 0.2959127724170685, "learning_rate": 3e-05, "loss": 1.2062, "num_input_tokens_seen": 404963340, "step": 11530 }, { "epoch": 0.9309384256633264, "grad_norm": 0.2795163094997406, "learning_rate": 3e-05, "loss": 1.2634, "num_input_tokens_seen": 405324692, "step": 11540 }, { "epoch": 0.9317451314048024, "grad_norm": 0.27414393424987793, "learning_rate": 3e-05, "loss": 1.2477, "num_input_tokens_seen": 405701524, "step": 11550 }, { "epoch": 0.9325518371462784, "grad_norm": 0.26650696992874146, "learning_rate": 3e-05, "loss": 1.2236, "num_input_tokens_seen": 406080452, "step": 11560 }, { "epoch": 0.9333585428877544, "grad_norm": 0.2659411132335663, "learning_rate": 3e-05, "loss": 1.2534, "num_input_tokens_seen": 406406212, "step": 11570 }, { "epoch": 0.9341652486292304, "grad_norm": 0.24440665543079376, "learning_rate": 3e-05, "loss": 1.1668, "num_input_tokens_seen": 406717196, "step": 11580 }, { "epoch": 0.9349719543707065, "grad_norm": 0.23124107718467712, "learning_rate": 3e-05, "loss": 1.2525, "num_input_tokens_seen": 407074284, "step": 11590 }, { "epoch": 0.9357786601121825, "grad_norm": 0.2501998841762543, "learning_rate": 3e-05, "loss": 1.2001, "num_input_tokens_seen": 407435636, "step": 11600 }, { "epoch": 0.9365853658536586, "grad_norm": 0.2701874077320099, "learning_rate": 3e-05, "loss": 1.269, "num_input_tokens_seen": 407777004, "step": 11610 }, { "epoch": 0.9373920715951346, "grad_norm": 0.22814303636550903, "learning_rate": 3e-05, "loss": 1.2425, "num_input_tokens_seen": 408132316, "step": 11620 }, { "epoch": 0.9381987773366106, "grad_norm": 0.2615501880645752, "learning_rate": 3e-05, "loss": 1.2342, "num_input_tokens_seen": 408489652, "step": 11630 }, { "epoch": 0.9390054830780866, "grad_norm": 0.25700172781944275, "learning_rate": 3e-05, "loss": 1.1974, "num_input_tokens_seen": 408845668, "step": 11640 }, { "epoch": 0.9398121888195626, "grad_norm": 0.2439606785774231, "learning_rate": 3e-05, "loss": 1.1591, "num_input_tokens_seen": 409174180, "step": 11650 }, { "epoch": 0.9406188945610386, "grad_norm": 0.24392473697662354, "learning_rate": 3e-05, "loss": 1.2621, "num_input_tokens_seen": 409523744, "step": 11660 }, { "epoch": 0.9414256003025147, "grad_norm": 0.2710927426815033, "learning_rate": 3e-05, "loss": 1.2391, "num_input_tokens_seen": 409895612, "step": 11670 }, { "epoch": 0.9422323060439907, "grad_norm": 0.24979081749916077, "learning_rate": 3e-05, "loss": 1.2336, "num_input_tokens_seen": 410263056, "step": 11680 }, { "epoch": 0.9430390117854667, "grad_norm": 0.24999581277370453, "learning_rate": 3e-05, "loss": 1.2399, "num_input_tokens_seen": 410610292, "step": 11690 }, { "epoch": 0.9438457175269427, "grad_norm": 0.2609216868877411, "learning_rate": 3e-05, "loss": 1.1943, "num_input_tokens_seen": 410936848, "step": 11700 }, { "epoch": 0.9446524232684187, "grad_norm": 0.29096490144729614, "learning_rate": 3e-05, "loss": 1.1999, "num_input_tokens_seen": 411292872, "step": 11710 }, { "epoch": 0.9454591290098947, "grad_norm": 0.23998717963695526, "learning_rate": 3e-05, "loss": 1.2215, "num_input_tokens_seen": 411683604, "step": 11720 }, { "epoch": 0.9462658347513707, "grad_norm": 0.2535877823829651, "learning_rate": 3e-05, "loss": 1.1827, "num_input_tokens_seen": 412049976, "step": 11730 }, { "epoch": 0.9470725404928468, "grad_norm": 0.23387952148914337, "learning_rate": 3e-05, "loss": 1.2472, "num_input_tokens_seen": 412385212, "step": 11740 }, { "epoch": 0.9478792462343228, "grad_norm": 0.27991074323654175, "learning_rate": 3e-05, "loss": 1.2081, "num_input_tokens_seen": 412719600, "step": 11750 }, { "epoch": 0.9486859519757989, "grad_norm": 0.28498226404190063, "learning_rate": 3e-05, "loss": 1.234, "num_input_tokens_seen": 413077972, "step": 11760 }, { "epoch": 0.9494926577172749, "grad_norm": 0.24881170690059662, "learning_rate": 3e-05, "loss": 1.2457, "num_input_tokens_seen": 413428308, "step": 11770 }, { "epoch": 0.9502993634587509, "grad_norm": 0.2739012837409973, "learning_rate": 3e-05, "loss": 1.2318, "num_input_tokens_seen": 413798096, "step": 11780 }, { "epoch": 0.9511060692002269, "grad_norm": 0.2565111815929413, "learning_rate": 3e-05, "loss": 1.2262, "num_input_tokens_seen": 414145460, "step": 11790 }, { "epoch": 0.9519127749417029, "grad_norm": 0.27090346813201904, "learning_rate": 3e-05, "loss": 1.2413, "num_input_tokens_seen": 414493396, "step": 11800 }, { "epoch": 0.9527194806831789, "grad_norm": 0.25924554467201233, "learning_rate": 3e-05, "loss": 1.1955, "num_input_tokens_seen": 414854028, "step": 11810 }, { "epoch": 0.9535261864246549, "grad_norm": 0.2571480870246887, "learning_rate": 3e-05, "loss": 1.1855, "num_input_tokens_seen": 415207364, "step": 11820 }, { "epoch": 0.9543328921661309, "grad_norm": 0.27920863032341003, "learning_rate": 3e-05, "loss": 1.2544, "num_input_tokens_seen": 415600924, "step": 11830 }, { "epoch": 0.9551395979076069, "grad_norm": 0.2675030529499054, "learning_rate": 3e-05, "loss": 1.2391, "num_input_tokens_seen": 415950904, "step": 11840 }, { "epoch": 0.955946303649083, "grad_norm": 0.2398238480091095, "learning_rate": 3e-05, "loss": 1.2485, "num_input_tokens_seen": 416309076, "step": 11850 }, { "epoch": 0.9567530093905591, "grad_norm": 0.2653293311595917, "learning_rate": 3e-05, "loss": 1.1741, "num_input_tokens_seen": 416682964, "step": 11860 }, { "epoch": 0.9575597151320351, "grad_norm": 0.2775269150733948, "learning_rate": 3e-05, "loss": 1.2254, "num_input_tokens_seen": 417044976, "step": 11870 }, { "epoch": 0.9583664208735111, "grad_norm": 0.24485714733600616, "learning_rate": 3e-05, "loss": 1.2325, "num_input_tokens_seen": 417409976, "step": 11880 }, { "epoch": 0.9591731266149871, "grad_norm": 0.254849374294281, "learning_rate": 3e-05, "loss": 1.2358, "num_input_tokens_seen": 417777976, "step": 11890 }, { "epoch": 0.9599798323564631, "grad_norm": 0.24646379053592682, "learning_rate": 3e-05, "loss": 1.206, "num_input_tokens_seen": 418084944, "step": 11900 }, { "epoch": 0.9607865380979391, "grad_norm": 0.2590767741203308, "learning_rate": 3e-05, "loss": 1.1951, "num_input_tokens_seen": 418475256, "step": 11910 }, { "epoch": 0.9615932438394151, "grad_norm": 0.2564661502838135, "learning_rate": 3e-05, "loss": 1.2112, "num_input_tokens_seen": 418778704, "step": 11920 }, { "epoch": 0.9623999495808911, "grad_norm": 0.27787408232688904, "learning_rate": 3e-05, "loss": 1.207, "num_input_tokens_seen": 419132332, "step": 11930 }, { "epoch": 0.9632066553223672, "grad_norm": 0.23015113174915314, "learning_rate": 3e-05, "loss": 1.2402, "num_input_tokens_seen": 419499272, "step": 11940 }, { "epoch": 0.9640133610638432, "grad_norm": 0.23493854701519012, "learning_rate": 3e-05, "loss": 1.173, "num_input_tokens_seen": 419847688, "step": 11950 }, { "epoch": 0.9648200668053192, "grad_norm": 0.2406766414642334, "learning_rate": 3e-05, "loss": 1.1955, "num_input_tokens_seen": 420188072, "step": 11960 }, { "epoch": 0.9656267725467952, "grad_norm": 0.27738529443740845, "learning_rate": 3e-05, "loss": 1.2038, "num_input_tokens_seen": 420548188, "step": 11970 }, { "epoch": 0.9664334782882713, "grad_norm": 0.2478122115135193, "learning_rate": 3e-05, "loss": 1.1975, "num_input_tokens_seen": 420900240, "step": 11980 }, { "epoch": 0.9672401840297473, "grad_norm": 0.26496005058288574, "learning_rate": 3e-05, "loss": 1.2336, "num_input_tokens_seen": 421243660, "step": 11990 }, { "epoch": 0.9680468897712233, "grad_norm": 0.2664368450641632, "learning_rate": 3e-05, "loss": 1.193, "num_input_tokens_seen": 421585440, "step": 12000 }, { "epoch": 0.9680468897712233, "eval_gen_len": 422.225, "eval_loss": 1.1525993347167969, "eval_rouge1": 40.1804, "eval_rouge2": 23.1008, "eval_rougeL": 32.3484, "eval_rougeLsum": 38.2103, "eval_runtime": 1396.0916, "eval_samples_per_second": 0.143, "eval_steps_per_second": 0.036, "num_input_tokens_seen": 421585440, "step": 12000 }, { "epoch": 0.9688535955126993, "grad_norm": 0.2768273949623108, "learning_rate": 3e-05, "loss": 1.2532, "num_input_tokens_seen": 421924468, "step": 12010 }, { "epoch": 0.9696603012541753, "grad_norm": 0.23941214382648468, "learning_rate": 3e-05, "loss": 1.2174, "num_input_tokens_seen": 422267696, "step": 12020 }, { "epoch": 0.9704670069956514, "grad_norm": 0.24917346239089966, "learning_rate": 3e-05, "loss": 1.2038, "num_input_tokens_seen": 422614520, "step": 12030 }, { "epoch": 0.9712737127371274, "grad_norm": 0.2580147683620453, "learning_rate": 3e-05, "loss": 1.2507, "num_input_tokens_seen": 422973276, "step": 12040 }, { "epoch": 0.9720804184786034, "grad_norm": 0.24353154003620148, "learning_rate": 3e-05, "loss": 1.2144, "num_input_tokens_seen": 423341500, "step": 12050 }, { "epoch": 0.9728871242200794, "grad_norm": 0.27423179149627686, "learning_rate": 3e-05, "loss": 1.2188, "num_input_tokens_seen": 423679004, "step": 12060 }, { "epoch": 0.9736938299615554, "grad_norm": 0.2490026354789734, "learning_rate": 3e-05, "loss": 1.2043, "num_input_tokens_seen": 424034888, "step": 12070 }, { "epoch": 0.9745005357030314, "grad_norm": 0.2514224648475647, "learning_rate": 3e-05, "loss": 1.2236, "num_input_tokens_seen": 424394100, "step": 12080 }, { "epoch": 0.9753072414445074, "grad_norm": 0.2942357659339905, "learning_rate": 3e-05, "loss": 1.1832, "num_input_tokens_seen": 424716908, "step": 12090 }, { "epoch": 0.9761139471859834, "grad_norm": 0.2441994845867157, "learning_rate": 3e-05, "loss": 1.2298, "num_input_tokens_seen": 425087956, "step": 12100 }, { "epoch": 0.9769206529274596, "grad_norm": 0.2718014121055603, "learning_rate": 3e-05, "loss": 1.2549, "num_input_tokens_seen": 425429636, "step": 12110 }, { "epoch": 0.9777273586689356, "grad_norm": 0.23609136044979095, "learning_rate": 3e-05, "loss": 1.2034, "num_input_tokens_seen": 425762244, "step": 12120 }, { "epoch": 0.9785340644104116, "grad_norm": 0.2554143965244293, "learning_rate": 3e-05, "loss": 1.2059, "num_input_tokens_seen": 426106556, "step": 12130 }, { "epoch": 0.9793407701518876, "grad_norm": 0.2818094789981842, "learning_rate": 3e-05, "loss": 1.2032, "num_input_tokens_seen": 426470164, "step": 12140 }, { "epoch": 0.9801474758933636, "grad_norm": 0.26025861501693726, "learning_rate": 3e-05, "loss": 1.2107, "num_input_tokens_seen": 426815164, "step": 12150 }, { "epoch": 0.9809541816348396, "grad_norm": 0.29881224036216736, "learning_rate": 3e-05, "loss": 1.2262, "num_input_tokens_seen": 427184952, "step": 12160 }, { "epoch": 0.9817608873763156, "grad_norm": 0.24537017941474915, "learning_rate": 3e-05, "loss": 1.2207, "num_input_tokens_seen": 427526628, "step": 12170 }, { "epoch": 0.9825675931177916, "grad_norm": 0.28081703186035156, "learning_rate": 3e-05, "loss": 1.1716, "num_input_tokens_seen": 427872948, "step": 12180 }, { "epoch": 0.9833742988592676, "grad_norm": 0.22894425690174103, "learning_rate": 3e-05, "loss": 1.2104, "num_input_tokens_seen": 428252884, "step": 12190 }, { "epoch": 0.9841810046007436, "grad_norm": 0.23327578604221344, "learning_rate": 3e-05, "loss": 1.2256, "num_input_tokens_seen": 428610824, "step": 12200 }, { "epoch": 0.9849877103422197, "grad_norm": 0.2497028261423111, "learning_rate": 3e-05, "loss": 1.2069, "num_input_tokens_seen": 428981084, "step": 12210 }, { "epoch": 0.9857944160836957, "grad_norm": 0.2404777854681015, "learning_rate": 3e-05, "loss": 1.1657, "num_input_tokens_seen": 429323900, "step": 12220 }, { "epoch": 0.9866011218251718, "grad_norm": 0.2447100579738617, "learning_rate": 3e-05, "loss": 1.214, "num_input_tokens_seen": 429692476, "step": 12230 }, { "epoch": 0.9874078275666478, "grad_norm": 0.2328159064054489, "learning_rate": 3e-05, "loss": 1.2144, "num_input_tokens_seen": 430005920, "step": 12240 }, { "epoch": 0.9882145333081238, "grad_norm": 0.25133198499679565, "learning_rate": 3e-05, "loss": 1.1864, "num_input_tokens_seen": 430333380, "step": 12250 }, { "epoch": 0.9890212390495998, "grad_norm": 0.2603629529476166, "learning_rate": 3e-05, "loss": 1.212, "num_input_tokens_seen": 430688404, "step": 12260 }, { "epoch": 0.9898279447910758, "grad_norm": 0.25967875123023987, "learning_rate": 3e-05, "loss": 1.2011, "num_input_tokens_seen": 431025404, "step": 12270 }, { "epoch": 0.9906346505325518, "grad_norm": 0.24072428047657013, "learning_rate": 3e-05, "loss": 1.1536, "num_input_tokens_seen": 431361424, "step": 12280 }, { "epoch": 0.9914413562740279, "grad_norm": 0.2615431547164917, "learning_rate": 3e-05, "loss": 1.1917, "num_input_tokens_seen": 431733732, "step": 12290 }, { "epoch": 0.9922480620155039, "grad_norm": 0.23490871489048004, "learning_rate": 3e-05, "loss": 1.1985, "num_input_tokens_seen": 432095608, "step": 12300 }, { "epoch": 0.9930547677569799, "grad_norm": 0.2793809175491333, "learning_rate": 3e-05, "loss": 1.2161, "num_input_tokens_seen": 432470776, "step": 12310 }, { "epoch": 0.9938614734984559, "grad_norm": 0.26310858130455017, "learning_rate": 3e-05, "loss": 1.2369, "num_input_tokens_seen": 432840156, "step": 12320 }, { "epoch": 0.9946681792399319, "grad_norm": 0.2650851905345917, "learning_rate": 3e-05, "loss": 1.1874, "num_input_tokens_seen": 433200564, "step": 12330 }, { "epoch": 0.9954748849814079, "grad_norm": 0.24045298993587494, "learning_rate": 3e-05, "loss": 1.2043, "num_input_tokens_seen": 433560896, "step": 12340 }, { "epoch": 0.996281590722884, "grad_norm": 0.2662796080112457, "learning_rate": 3e-05, "loss": 1.228, "num_input_tokens_seen": 433916796, "step": 12350 }, { "epoch": 0.99708829646436, "grad_norm": 0.27926427125930786, "learning_rate": 3e-05, "loss": 1.1796, "num_input_tokens_seen": 434267164, "step": 12360 }, { "epoch": 0.997895002205836, "grad_norm": 0.29105281829833984, "learning_rate": 3e-05, "loss": 1.2221, "num_input_tokens_seen": 434634960, "step": 12370 }, { "epoch": 0.9987017079473121, "grad_norm": 0.25824907422065735, "learning_rate": 3e-05, "loss": 1.2037, "num_input_tokens_seen": 434976852, "step": 12380 }, { "epoch": 0.9995084136887881, "grad_norm": 0.2631925940513611, "learning_rate": 3e-05, "loss": 1.2453, "num_input_tokens_seen": 435337884, "step": 12390 }, { "epoch": 0.9999924371336737, "num_input_tokens_seen": 435513684, "step": 12396, "total_flos": 2.1022605922963784e+18, "train_loss": 1.4536694422503678, "train_runtime": 158351.1039, "train_samples_per_second": 10.02, "train_steps_per_second": 0.078 } ], "logging_steps": 10, "max_steps": 12396, "num_input_tokens_seen": 435513684, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.1022605922963784e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }