{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 1400, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.007142857142857143, "grad_norm": 2.519733190536499, "learning_rate": 9.976190476190477e-05, "loss": 1.2999, "step": 10 }, { "epoch": 0.014285714285714285, "grad_norm": 2.4232161045074463, "learning_rate": 9.952380952380953e-05, "loss": 1.1228, "step": 20 }, { "epoch": 0.02142857142857143, "grad_norm": 2.7126998901367188, "learning_rate": 9.92857142857143e-05, "loss": 0.9913, "step": 30 }, { "epoch": 0.02857142857142857, "grad_norm": 7.3811235427856445, "learning_rate": 9.904761904761905e-05, "loss": 0.9306, "step": 40 }, { "epoch": 0.03571428571428571, "grad_norm": 6.5697197914123535, "learning_rate": 9.880952380952381e-05, "loss": 0.7457, "step": 50 }, { "epoch": 0.04285714285714286, "grad_norm": 1.2972687482833862, "learning_rate": 9.857142857142858e-05, "loss": 0.6408, "step": 60 }, { "epoch": 0.05, "grad_norm": 3.600625991821289, "learning_rate": 9.833333333333333e-05, "loss": 0.6541, "step": 70 }, { "epoch": 0.05714285714285714, "grad_norm": 1.5594621896743774, "learning_rate": 9.80952380952381e-05, "loss": 0.538, "step": 80 }, { "epoch": 0.06428571428571428, "grad_norm": 4.595429420471191, "learning_rate": 9.785714285714286e-05, "loss": 0.556, "step": 90 }, { "epoch": 0.07142857142857142, "grad_norm": 2.620643138885498, "learning_rate": 9.761904761904762e-05, "loss": 0.5521, "step": 100 }, { "epoch": 0.07857142857142857, "grad_norm": 3.503641366958618, "learning_rate": 9.738095238095239e-05, "loss": 0.5912, "step": 110 }, { "epoch": 0.08571428571428572, "grad_norm": 1.5058848857879639, "learning_rate": 9.714285714285715e-05, "loss": 0.5421, "step": 120 }, { "epoch": 0.09285714285714286, "grad_norm": 3.7034990787506104, "learning_rate": 9.69047619047619e-05, "loss": 0.5568, "step": 130 }, { "epoch": 0.1, "grad_norm": 3.714170455932617, "learning_rate": 9.666666666666667e-05, "loss": 0.5286, "step": 140 }, { "epoch": 0.10714285714285714, "grad_norm": 2.069328546524048, "learning_rate": 9.642857142857143e-05, "loss": 0.4755, "step": 150 }, { "epoch": 0.11428571428571428, "grad_norm": 1.2878583669662476, "learning_rate": 9.61904761904762e-05, "loss": 0.4817, "step": 160 }, { "epoch": 0.12142857142857143, "grad_norm": 3.1945483684539795, "learning_rate": 9.595238095238096e-05, "loss": 0.5035, "step": 170 }, { "epoch": 0.12857142857142856, "grad_norm": 1.9643166065216064, "learning_rate": 9.571428571428573e-05, "loss": 0.4932, "step": 180 }, { "epoch": 0.1357142857142857, "grad_norm": 3.5002968311309814, "learning_rate": 9.547619047619049e-05, "loss": 0.5531, "step": 190 }, { "epoch": 0.14285714285714285, "grad_norm": 4.7095794677734375, "learning_rate": 9.523809523809524e-05, "loss": 0.4601, "step": 200 }, { "epoch": 0.15, "grad_norm": 1.4850050210952759, "learning_rate": 9.5e-05, "loss": 0.432, "step": 210 }, { "epoch": 0.15714285714285714, "grad_norm": 2.860565423965454, "learning_rate": 9.476190476190476e-05, "loss": 0.4699, "step": 220 }, { "epoch": 0.16428571428571428, "grad_norm": 1.8467603921890259, "learning_rate": 9.452380952380952e-05, "loss": 0.42, "step": 230 }, { "epoch": 0.17142857142857143, "grad_norm": 1.6448352336883545, "learning_rate": 9.428571428571429e-05, "loss": 0.431, "step": 240 }, { "epoch": 0.17857142857142858, "grad_norm": 1.8279732465744019, "learning_rate": 9.404761904761905e-05, "loss": 0.4413, "step": 250 }, { "epoch": 0.18571428571428572, "grad_norm": 3.0112857818603516, "learning_rate": 9.380952380952381e-05, "loss": 0.4263, "step": 260 }, { "epoch": 0.19285714285714287, "grad_norm": 1.7717293500900269, "learning_rate": 9.357142857142858e-05, "loss": 0.4829, "step": 270 }, { "epoch": 0.2, "grad_norm": 4.515851974487305, "learning_rate": 9.333333333333334e-05, "loss": 0.4194, "step": 280 }, { "epoch": 0.20714285714285716, "grad_norm": 3.998619556427002, "learning_rate": 9.309523809523811e-05, "loss": 0.4422, "step": 290 }, { "epoch": 0.21428571428571427, "grad_norm": 3.5681912899017334, "learning_rate": 9.285714285714286e-05, "loss": 0.4677, "step": 300 }, { "epoch": 0.22142857142857142, "grad_norm": 3.286348342895508, "learning_rate": 9.261904761904762e-05, "loss": 0.4152, "step": 310 }, { "epoch": 0.22857142857142856, "grad_norm": 2.018721580505371, "learning_rate": 9.238095238095239e-05, "loss": 0.5065, "step": 320 }, { "epoch": 0.2357142857142857, "grad_norm": 1.8489034175872803, "learning_rate": 9.214285714285714e-05, "loss": 0.392, "step": 330 }, { "epoch": 0.24285714285714285, "grad_norm": 1.3958369493484497, "learning_rate": 9.19047619047619e-05, "loss": 0.3988, "step": 340 }, { "epoch": 0.25, "grad_norm": 2.9772961139678955, "learning_rate": 9.166666666666667e-05, "loss": 0.3955, "step": 350 }, { "epoch": 0.2571428571428571, "grad_norm": 3.2752914428710938, "learning_rate": 9.142857142857143e-05, "loss": 0.3998, "step": 360 }, { "epoch": 0.2642857142857143, "grad_norm": 1.8811057806015015, "learning_rate": 9.11904761904762e-05, "loss": 0.4775, "step": 370 }, { "epoch": 0.2714285714285714, "grad_norm": 2.3310513496398926, "learning_rate": 9.095238095238096e-05, "loss": 0.3703, "step": 380 }, { "epoch": 0.2785714285714286, "grad_norm": 2.967597484588623, "learning_rate": 9.071428571428571e-05, "loss": 0.4177, "step": 390 }, { "epoch": 0.2857142857142857, "grad_norm": 2.671673536300659, "learning_rate": 9.047619047619048e-05, "loss": 0.3823, "step": 400 }, { "epoch": 0.29285714285714287, "grad_norm": 4.647867679595947, "learning_rate": 9.023809523809524e-05, "loss": 0.3888, "step": 410 }, { "epoch": 0.3, "grad_norm": 3.4788432121276855, "learning_rate": 9e-05, "loss": 0.3856, "step": 420 }, { "epoch": 0.30714285714285716, "grad_norm": 3.308420181274414, "learning_rate": 8.976190476190477e-05, "loss": 0.4143, "step": 430 }, { "epoch": 0.3142857142857143, "grad_norm": 3.499880075454712, "learning_rate": 8.952380952380953e-05, "loss": 0.4857, "step": 440 }, { "epoch": 0.32142857142857145, "grad_norm": 1.7907614707946777, "learning_rate": 8.92857142857143e-05, "loss": 0.356, "step": 450 }, { "epoch": 0.32857142857142857, "grad_norm": 3.0111501216888428, "learning_rate": 8.904761904761905e-05, "loss": 0.358, "step": 460 }, { "epoch": 0.3357142857142857, "grad_norm": 2.1389689445495605, "learning_rate": 8.880952380952381e-05, "loss": 0.3331, "step": 470 }, { "epoch": 0.34285714285714286, "grad_norm": 1.8007326126098633, "learning_rate": 8.857142857142857e-05, "loss": 0.3148, "step": 480 }, { "epoch": 0.35, "grad_norm": 3.603548288345337, "learning_rate": 8.833333333333333e-05, "loss": 0.4513, "step": 490 }, { "epoch": 0.35714285714285715, "grad_norm": 2.270984411239624, "learning_rate": 8.80952380952381e-05, "loss": 0.3392, "step": 500 }, { "epoch": 0.36428571428571427, "grad_norm": 1.5367858409881592, "learning_rate": 8.785714285714286e-05, "loss": 0.3665, "step": 510 }, { "epoch": 0.37142857142857144, "grad_norm": 1.5397262573242188, "learning_rate": 8.761904761904762e-05, "loss": 0.3875, "step": 520 }, { "epoch": 0.37857142857142856, "grad_norm": 3.6661577224731445, "learning_rate": 8.738095238095239e-05, "loss": 0.3617, "step": 530 }, { "epoch": 0.38571428571428573, "grad_norm": 2.5946245193481445, "learning_rate": 8.714285714285715e-05, "loss": 0.3318, "step": 540 }, { "epoch": 0.39285714285714285, "grad_norm": 2.508056640625, "learning_rate": 8.690476190476192e-05, "loss": 0.3681, "step": 550 }, { "epoch": 0.4, "grad_norm": 5.406072616577148, "learning_rate": 8.666666666666667e-05, "loss": 0.3589, "step": 560 }, { "epoch": 0.40714285714285714, "grad_norm": 2.0363543033599854, "learning_rate": 8.642857142857143e-05, "loss": 0.4328, "step": 570 }, { "epoch": 0.4142857142857143, "grad_norm": 1.796236515045166, "learning_rate": 8.61904761904762e-05, "loss": 0.4005, "step": 580 }, { "epoch": 0.42142857142857143, "grad_norm": 5.6172943115234375, "learning_rate": 8.595238095238096e-05, "loss": 0.3414, "step": 590 }, { "epoch": 0.42857142857142855, "grad_norm": 2.953678846359253, "learning_rate": 8.571428571428571e-05, "loss": 0.3917, "step": 600 }, { "epoch": 0.4357142857142857, "grad_norm": 1.8276687860488892, "learning_rate": 8.547619047619048e-05, "loss": 0.4057, "step": 610 }, { "epoch": 0.44285714285714284, "grad_norm": 1.8668811321258545, "learning_rate": 8.523809523809524e-05, "loss": 0.4153, "step": 620 }, { "epoch": 0.45, "grad_norm": 2.87809419631958, "learning_rate": 8.5e-05, "loss": 0.3911, "step": 630 }, { "epoch": 0.45714285714285713, "grad_norm": 4.863674640655518, "learning_rate": 8.476190476190477e-05, "loss": 0.4458, "step": 640 }, { "epoch": 0.4642857142857143, "grad_norm": 1.7150845527648926, "learning_rate": 8.452380952380952e-05, "loss": 0.3951, "step": 650 }, { "epoch": 0.4714285714285714, "grad_norm": 2.0149145126342773, "learning_rate": 8.428571428571429e-05, "loss": 0.3238, "step": 660 }, { "epoch": 0.4785714285714286, "grad_norm": 2.3146402835845947, "learning_rate": 8.404761904761905e-05, "loss": 0.3129, "step": 670 }, { "epoch": 0.4857142857142857, "grad_norm": 1.8723537921905518, "learning_rate": 8.380952380952382e-05, "loss": 0.2969, "step": 680 }, { "epoch": 0.4928571428571429, "grad_norm": 1.6206679344177246, "learning_rate": 8.357142857142858e-05, "loss": 0.3673, "step": 690 }, { "epoch": 0.5, "grad_norm": 2.490722179412842, "learning_rate": 8.333333333333334e-05, "loss": 0.3493, "step": 700 }, { "epoch": 0.5071428571428571, "grad_norm": 1.7868791818618774, "learning_rate": 8.309523809523811e-05, "loss": 0.423, "step": 710 }, { "epoch": 0.5142857142857142, "grad_norm": 2.3152015209198, "learning_rate": 8.285714285714287e-05, "loss": 0.3448, "step": 720 }, { "epoch": 0.5214285714285715, "grad_norm": 1.934523105621338, "learning_rate": 8.261904761904762e-05, "loss": 0.372, "step": 730 }, { "epoch": 0.5285714285714286, "grad_norm": 2.421624183654785, "learning_rate": 8.238095238095238e-05, "loss": 0.3602, "step": 740 }, { "epoch": 0.5357142857142857, "grad_norm": 2.0156285762786865, "learning_rate": 8.214285714285714e-05, "loss": 0.334, "step": 750 }, { "epoch": 0.5428571428571428, "grad_norm": 1.6513855457305908, "learning_rate": 8.19047619047619e-05, "loss": 0.3553, "step": 760 }, { "epoch": 0.55, "grad_norm": 1.950451374053955, "learning_rate": 8.166666666666667e-05, "loss": 0.3696, "step": 770 }, { "epoch": 0.5571428571428572, "grad_norm": 2.4662423133850098, "learning_rate": 8.142857142857143e-05, "loss": 0.3232, "step": 780 }, { "epoch": 0.5642857142857143, "grad_norm": 2.060654401779175, "learning_rate": 8.11904761904762e-05, "loss": 0.3294, "step": 790 }, { "epoch": 0.5714285714285714, "grad_norm": 3.3581550121307373, "learning_rate": 8.095238095238096e-05, "loss": 0.3022, "step": 800 }, { "epoch": 0.5785714285714286, "grad_norm": 1.6778148412704468, "learning_rate": 8.071428571428573e-05, "loss": 0.2808, "step": 810 }, { "epoch": 0.5857142857142857, "grad_norm": 5.509958267211914, "learning_rate": 8.047619047619048e-05, "loss": 0.3584, "step": 820 }, { "epoch": 0.5928571428571429, "grad_norm": 2.3136510848999023, "learning_rate": 8.023809523809524e-05, "loss": 0.2967, "step": 830 }, { "epoch": 0.6, "grad_norm": 2.757749557495117, "learning_rate": 8e-05, "loss": 0.3091, "step": 840 }, { "epoch": 0.6071428571428571, "grad_norm": 1.6453288793563843, "learning_rate": 7.976190476190477e-05, "loss": 0.2882, "step": 850 }, { "epoch": 0.6142857142857143, "grad_norm": 3.599712371826172, "learning_rate": 7.952380952380952e-05, "loss": 0.35, "step": 860 }, { "epoch": 0.6214285714285714, "grad_norm": 1.203050136566162, "learning_rate": 7.928571428571429e-05, "loss": 0.3147, "step": 870 }, { "epoch": 0.6285714285714286, "grad_norm": 3.0582826137542725, "learning_rate": 7.904761904761905e-05, "loss": 0.369, "step": 880 }, { "epoch": 0.6357142857142857, "grad_norm": 1.7133578062057495, "learning_rate": 7.880952380952382e-05, "loss": 0.4404, "step": 890 }, { "epoch": 0.6428571428571429, "grad_norm": 2.1532819271087646, "learning_rate": 7.857142857142858e-05, "loss": 0.3249, "step": 900 }, { "epoch": 0.65, "grad_norm": 3.529282808303833, "learning_rate": 7.833333333333333e-05, "loss": 0.3449, "step": 910 }, { "epoch": 0.6571428571428571, "grad_norm": 3.412729501724243, "learning_rate": 7.80952380952381e-05, "loss": 0.3678, "step": 920 }, { "epoch": 0.6642857142857143, "grad_norm": 1.8676691055297852, "learning_rate": 7.785714285714286e-05, "loss": 0.3522, "step": 930 }, { "epoch": 0.6714285714285714, "grad_norm": 1.8748834133148193, "learning_rate": 7.761904761904762e-05, "loss": 0.2561, "step": 940 }, { "epoch": 0.6785714285714286, "grad_norm": 3.1622543334960938, "learning_rate": 7.738095238095239e-05, "loss": 0.3245, "step": 950 }, { "epoch": 0.6857142857142857, "grad_norm": 2.17108416557312, "learning_rate": 7.714285714285715e-05, "loss": 0.335, "step": 960 }, { "epoch": 0.6928571428571428, "grad_norm": 3.686673402786255, "learning_rate": 7.690476190476192e-05, "loss": 0.319, "step": 970 }, { "epoch": 0.7, "grad_norm": 7.108774662017822, "learning_rate": 7.666666666666667e-05, "loss": 0.3337, "step": 980 }, { "epoch": 0.7071428571428572, "grad_norm": 2.532336711883545, "learning_rate": 7.642857142857143e-05, "loss": 0.377, "step": 990 }, { "epoch": 0.7142857142857143, "grad_norm": 1.8768038749694824, "learning_rate": 7.619047619047618e-05, "loss": 0.3745, "step": 1000 }, { "epoch": 0.7214285714285714, "grad_norm": 1.8926101922988892, "learning_rate": 7.595238095238095e-05, "loss": 0.3503, "step": 1010 }, { "epoch": 0.7285714285714285, "grad_norm": 2.2763147354125977, "learning_rate": 7.571428571428571e-05, "loss": 0.307, "step": 1020 }, { "epoch": 0.7357142857142858, "grad_norm": 3.663257360458374, "learning_rate": 7.547619047619048e-05, "loss": 0.2876, "step": 1030 }, { "epoch": 0.7428571428571429, "grad_norm": 1.844648003578186, "learning_rate": 7.523809523809524e-05, "loss": 0.3512, "step": 1040 }, { "epoch": 0.75, "grad_norm": 2.5800540447235107, "learning_rate": 7.500000000000001e-05, "loss": 0.301, "step": 1050 }, { "epoch": 0.7571428571428571, "grad_norm": 2.4674232006073, "learning_rate": 7.476190476190477e-05, "loss": 0.3635, "step": 1060 }, { "epoch": 0.7642857142857142, "grad_norm": 3.5119080543518066, "learning_rate": 7.452380952380952e-05, "loss": 0.3688, "step": 1070 }, { "epoch": 0.7714285714285715, "grad_norm": 2.5902345180511475, "learning_rate": 7.428571428571429e-05, "loss": 0.3136, "step": 1080 }, { "epoch": 0.7785714285714286, "grad_norm": 1.734082818031311, "learning_rate": 7.404761904761905e-05, "loss": 0.3366, "step": 1090 }, { "epoch": 0.7857142857142857, "grad_norm": 2.0863406658172607, "learning_rate": 7.380952380952382e-05, "loss": 0.331, "step": 1100 }, { "epoch": 0.7928571428571428, "grad_norm": 2.9227652549743652, "learning_rate": 7.357142857142858e-05, "loss": 0.3131, "step": 1110 }, { "epoch": 0.8, "grad_norm": 5.268605709075928, "learning_rate": 7.333333333333333e-05, "loss": 0.3212, "step": 1120 }, { "epoch": 0.8071428571428572, "grad_norm": 2.633431911468506, "learning_rate": 7.30952380952381e-05, "loss": 0.2707, "step": 1130 }, { "epoch": 0.8142857142857143, "grad_norm": 1.652354121208191, "learning_rate": 7.285714285714286e-05, "loss": 0.3037, "step": 1140 }, { "epoch": 0.8214285714285714, "grad_norm": 3.1987786293029785, "learning_rate": 7.261904761904762e-05, "loss": 0.3603, "step": 1150 }, { "epoch": 0.8285714285714286, "grad_norm": 1.7624555826187134, "learning_rate": 7.238095238095238e-05, "loss": 0.308, "step": 1160 }, { "epoch": 0.8357142857142857, "grad_norm": 2.6379318237304688, "learning_rate": 7.214285714285714e-05, "loss": 0.3068, "step": 1170 }, { "epoch": 0.8428571428571429, "grad_norm": 2.6688101291656494, "learning_rate": 7.19047619047619e-05, "loss": 0.3134, "step": 1180 }, { "epoch": 0.85, "grad_norm": 3.698643207550049, "learning_rate": 7.166666666666667e-05, "loss": 0.2973, "step": 1190 }, { "epoch": 0.8571428571428571, "grad_norm": 2.9004178047180176, "learning_rate": 7.142857142857143e-05, "loss": 0.3116, "step": 1200 }, { "epoch": 0.8642857142857143, "grad_norm": 2.3414883613586426, "learning_rate": 7.11904761904762e-05, "loss": 0.334, "step": 1210 }, { "epoch": 0.8714285714285714, "grad_norm": 3.6203651428222656, "learning_rate": 7.095238095238096e-05, "loss": 0.3241, "step": 1220 }, { "epoch": 0.8785714285714286, "grad_norm": 2.7472829818725586, "learning_rate": 7.071428571428573e-05, "loss": 0.2727, "step": 1230 }, { "epoch": 0.8857142857142857, "grad_norm": 3.2332115173339844, "learning_rate": 7.047619047619048e-05, "loss": 0.3329, "step": 1240 }, { "epoch": 0.8928571428571429, "grad_norm": 1.968595027923584, "learning_rate": 7.023809523809524e-05, "loss": 0.3478, "step": 1250 }, { "epoch": 0.9, "grad_norm": 2.3712966442108154, "learning_rate": 7e-05, "loss": 0.3964, "step": 1260 }, { "epoch": 0.9071428571428571, "grad_norm": 1.7704048156738281, "learning_rate": 6.976190476190476e-05, "loss": 0.3983, "step": 1270 }, { "epoch": 0.9142857142857143, "grad_norm": 1.8879201412200928, "learning_rate": 6.952380952380952e-05, "loss": 0.3296, "step": 1280 }, { "epoch": 0.9214285714285714, "grad_norm": 4.0522780418396, "learning_rate": 6.928571428571429e-05, "loss": 0.3722, "step": 1290 }, { "epoch": 0.9285714285714286, "grad_norm": 1.7849159240722656, "learning_rate": 6.904761904761905e-05, "loss": 0.3022, "step": 1300 }, { "epoch": 0.9357142857142857, "grad_norm": 3.376678705215454, "learning_rate": 6.880952380952382e-05, "loss": 0.3429, "step": 1310 }, { "epoch": 0.9428571428571428, "grad_norm": 4.100617408752441, "learning_rate": 6.857142857142858e-05, "loss": 0.2793, "step": 1320 }, { "epoch": 0.95, "grad_norm": 3.8000288009643555, "learning_rate": 6.833333333333333e-05, "loss": 0.3114, "step": 1330 }, { "epoch": 0.9571428571428572, "grad_norm": 4.846517562866211, "learning_rate": 6.80952380952381e-05, "loss": 0.3377, "step": 1340 }, { "epoch": 0.9642857142857143, "grad_norm": 2.0567409992218018, "learning_rate": 6.785714285714286e-05, "loss": 0.3198, "step": 1350 }, { "epoch": 0.9714285714285714, "grad_norm": 2.3197948932647705, "learning_rate": 6.761904761904763e-05, "loss": 0.2817, "step": 1360 }, { "epoch": 0.9785714285714285, "grad_norm": 2.7312827110290527, "learning_rate": 6.738095238095239e-05, "loss": 0.3378, "step": 1370 }, { "epoch": 0.9857142857142858, "grad_norm": 3.115516185760498, "learning_rate": 6.714285714285714e-05, "loss": 0.3638, "step": 1380 }, { "epoch": 0.9928571428571429, "grad_norm": 1.0742294788360596, "learning_rate": 6.69047619047619e-05, "loss": 0.3176, "step": 1390 }, { "epoch": 1.0, "grad_norm": 2.3872761726379395, "learning_rate": 6.666666666666667e-05, "loss": 0.2921, "step": 1400 }, { "epoch": 1.0, "eval_loss": 0.2929375469684601, "eval_runtime": 68.5646, "eval_samples_per_second": 2.917, "eval_steps_per_second": 0.365, "step": 1400 } ], "logging_steps": 10, "max_steps": 4200, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 8377941884928000.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }