{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.96, "eval_steps": 50, "global_step": 3000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0032, "grad_norm": 0.2421875, "learning_rate": 5e-06, "loss": 0.7, "step": 10 }, { "epoch": 0.0064, "grad_norm": 0.30859375, "learning_rate": 1e-05, "loss": 0.6988, "step": 20 }, { "epoch": 0.0096, "grad_norm": 0.28515625, "learning_rate": 1.5e-05, "loss": 0.6977, "step": 30 }, { "epoch": 0.0128, "grad_norm": 0.244140625, "learning_rate": 2e-05, "loss": 0.6969, "step": 40 }, { "epoch": 0.016, "grad_norm": 0.259765625, "learning_rate": 2.5e-05, "loss": 0.6941, "step": 50 }, { "epoch": 0.016, "eval_loss": 0.6939687728881836, "eval_runtime": 192.7405, "eval_samples_per_second": 2.594, "eval_steps_per_second": 0.083, "step": 50 }, { "epoch": 0.0192, "grad_norm": 0.232421875, "learning_rate": 3e-05, "loss": 0.6871, "step": 60 }, { "epoch": 0.0224, "grad_norm": 0.302734375, "learning_rate": 3.5e-05, "loss": 0.6828, "step": 70 }, { "epoch": 0.0256, "grad_norm": 0.8203125, "learning_rate": 4e-05, "loss": 0.6687, "step": 80 }, { "epoch": 0.0288, "grad_norm": 1.2421875, "learning_rate": 4.5e-05, "loss": 0.6309, "step": 90 }, { "epoch": 0.032, "grad_norm": 2.03125, "learning_rate": 5e-05, "loss": 0.5781, "step": 100 }, { "epoch": 0.032, "eval_loss": 0.5389062762260437, "eval_runtime": 108.3608, "eval_samples_per_second": 4.614, "eval_steps_per_second": 0.148, "step": 100 }, { "epoch": 0.0352, "grad_norm": 2.328125, "learning_rate": 4.9834710743801654e-05, "loss": 0.4818, "step": 110 }, { "epoch": 0.0384, "grad_norm": 2.265625, "learning_rate": 4.9669421487603305e-05, "loss": 0.4291, "step": 120 }, { "epoch": 0.0416, "grad_norm": 2.203125, "learning_rate": 4.9504132231404956e-05, "loss": 0.3615, "step": 130 }, { "epoch": 0.0448, "grad_norm": 2.515625, "learning_rate": 4.9338842975206614e-05, "loss": 0.3408, "step": 140 }, { "epoch": 0.048, "grad_norm": 1.515625, "learning_rate": 4.917355371900827e-05, "loss": 0.306, "step": 150 }, { "epoch": 0.048, "eval_loss": 0.2862031161785126, "eval_runtime": 110.0641, "eval_samples_per_second": 4.543, "eval_steps_per_second": 0.145, "step": 150 }, { "epoch": 0.0512, "grad_norm": 1.7265625, "learning_rate": 4.900826446280992e-05, "loss": 0.2904, "step": 160 }, { "epoch": 0.0544, "grad_norm": 1.5703125, "learning_rate": 4.8842975206611575e-05, "loss": 0.2648, "step": 170 }, { "epoch": 0.0576, "grad_norm": 1.6875, "learning_rate": 4.8677685950413226e-05, "loss": 0.2373, "step": 180 }, { "epoch": 0.0608, "grad_norm": 1.0859375, "learning_rate": 4.851239669421488e-05, "loss": 0.1749, "step": 190 }, { "epoch": 0.064, "grad_norm": 0.8671875, "learning_rate": 4.834710743801653e-05, "loss": 0.1556, "step": 200 }, { "epoch": 0.064, "eval_loss": 0.16703906655311584, "eval_runtime": 109.9961, "eval_samples_per_second": 4.546, "eval_steps_per_second": 0.145, "step": 200 }, { "epoch": 0.0672, "grad_norm": 2.5625, "learning_rate": 4.8181818181818186e-05, "loss": 0.1382, "step": 210 }, { "epoch": 0.0704, "grad_norm": 1.859375, "learning_rate": 4.801652892561984e-05, "loss": 0.125, "step": 220 }, { "epoch": 0.0736, "grad_norm": 1.3515625, "learning_rate": 4.785123966942149e-05, "loss": 0.1729, "step": 230 }, { "epoch": 0.0768, "grad_norm": 1.1953125, "learning_rate": 4.768595041322314e-05, "loss": 0.1252, "step": 240 }, { "epoch": 0.08, "grad_norm": 1.0078125, "learning_rate": 4.75206611570248e-05, "loss": 0.1108, "step": 250 }, { "epoch": 0.08, "eval_loss": 0.13416016101837158, "eval_runtime": 110.9679, "eval_samples_per_second": 4.506, "eval_steps_per_second": 0.144, "step": 250 }, { "epoch": 0.0832, "grad_norm": 1.7890625, "learning_rate": 4.735537190082645e-05, "loss": 0.1502, "step": 260 }, { "epoch": 0.0864, "grad_norm": 0.66796875, "learning_rate": 4.71900826446281e-05, "loss": 0.1291, "step": 270 }, { "epoch": 0.0896, "grad_norm": 2.546875, "learning_rate": 4.702479338842976e-05, "loss": 0.1326, "step": 280 }, { "epoch": 0.0928, "grad_norm": 1.7265625, "learning_rate": 4.685950413223141e-05, "loss": 0.1374, "step": 290 }, { "epoch": 0.096, "grad_norm": 1.15625, "learning_rate": 4.669421487603306e-05, "loss": 0.1081, "step": 300 }, { "epoch": 0.096, "eval_loss": 0.12271875143051147, "eval_runtime": 110.6168, "eval_samples_per_second": 4.52, "eval_steps_per_second": 0.145, "step": 300 }, { "epoch": 0.0992, "grad_norm": 1.140625, "learning_rate": 4.652892561983471e-05, "loss": 0.1266, "step": 310 }, { "epoch": 0.1024, "grad_norm": 1.2265625, "learning_rate": 4.636363636363636e-05, "loss": 0.1368, "step": 320 }, { "epoch": 0.1056, "grad_norm": 0.7109375, "learning_rate": 4.6198347107438014e-05, "loss": 0.1064, "step": 330 }, { "epoch": 0.1088, "grad_norm": 1.4375, "learning_rate": 4.603305785123967e-05, "loss": 0.1233, "step": 340 }, { "epoch": 0.112, "grad_norm": 0.6328125, "learning_rate": 4.586776859504133e-05, "loss": 0.0881, "step": 350 }, { "epoch": 0.112, "eval_loss": 0.1180742159485817, "eval_runtime": 109.5127, "eval_samples_per_second": 4.566, "eval_steps_per_second": 0.146, "step": 350 }, { "epoch": 0.1152, "grad_norm": 1.109375, "learning_rate": 4.570247933884298e-05, "loss": 0.1136, "step": 360 }, { "epoch": 0.1184, "grad_norm": 0.89453125, "learning_rate": 4.553719008264463e-05, "loss": 0.1338, "step": 370 }, { "epoch": 0.1216, "grad_norm": 2.65625, "learning_rate": 4.5371900826446284e-05, "loss": 0.1636, "step": 380 }, { "epoch": 0.1248, "grad_norm": 2.375, "learning_rate": 4.5206611570247935e-05, "loss": 0.1067, "step": 390 }, { "epoch": 0.128, "grad_norm": 1.078125, "learning_rate": 4.504132231404959e-05, "loss": 0.1287, "step": 400 }, { "epoch": 0.128, "eval_loss": 0.1135859340429306, "eval_runtime": 115.1853, "eval_samples_per_second": 4.341, "eval_steps_per_second": 0.139, "step": 400 }, { "epoch": 0.1312, "grad_norm": 1.5859375, "learning_rate": 4.487603305785124e-05, "loss": 0.122, "step": 410 }, { "epoch": 0.1344, "grad_norm": 1.6796875, "learning_rate": 4.4710743801652896e-05, "loss": 0.1232, "step": 420 }, { "epoch": 0.1376, "grad_norm": 0.99609375, "learning_rate": 4.454545454545455e-05, "loss": 0.11, "step": 430 }, { "epoch": 0.1408, "grad_norm": 1.6328125, "learning_rate": 4.43801652892562e-05, "loss": 0.0937, "step": 440 }, { "epoch": 0.144, "grad_norm": 1.8671875, "learning_rate": 4.4214876033057856e-05, "loss": 0.1162, "step": 450 }, { "epoch": 0.144, "eval_loss": 0.10819531232118607, "eval_runtime": 110.4172, "eval_samples_per_second": 4.528, "eval_steps_per_second": 0.145, "step": 450 }, { "epoch": 0.1472, "grad_norm": 3.78125, "learning_rate": 4.404958677685951e-05, "loss": 0.1171, "step": 460 }, { "epoch": 0.1504, "grad_norm": 1.71875, "learning_rate": 4.388429752066116e-05, "loss": 0.1032, "step": 470 }, { "epoch": 0.1536, "grad_norm": 2.796875, "learning_rate": 4.371900826446281e-05, "loss": 0.1217, "step": 480 }, { "epoch": 0.1568, "grad_norm": 0.703125, "learning_rate": 4.355371900826447e-05, "loss": 0.1197, "step": 490 }, { "epoch": 0.16, "grad_norm": 1.734375, "learning_rate": 4.338842975206612e-05, "loss": 0.0878, "step": 500 }, { "epoch": 0.16, "eval_loss": 0.10691406577825546, "eval_runtime": 110.9181, "eval_samples_per_second": 4.508, "eval_steps_per_second": 0.144, "step": 500 }, { "epoch": 0.1632, "grad_norm": 1.3125, "learning_rate": 4.322314049586777e-05, "loss": 0.1102, "step": 510 }, { "epoch": 0.1664, "grad_norm": 1.171875, "learning_rate": 4.305785123966942e-05, "loss": 0.0982, "step": 520 }, { "epoch": 0.1696, "grad_norm": 0.90625, "learning_rate": 4.289256198347107e-05, "loss": 0.1096, "step": 530 }, { "epoch": 0.1728, "grad_norm": 0.93359375, "learning_rate": 4.2727272727272724e-05, "loss": 0.1269, "step": 540 }, { "epoch": 0.176, "grad_norm": 1.609375, "learning_rate": 4.256198347107438e-05, "loss": 0.0994, "step": 550 }, { "epoch": 0.176, "eval_loss": 0.10447265952825546, "eval_runtime": 110.3941, "eval_samples_per_second": 4.529, "eval_steps_per_second": 0.145, "step": 550 }, { "epoch": 0.1792, "grad_norm": 0.703125, "learning_rate": 4.239669421487604e-05, "loss": 0.1118, "step": 560 }, { "epoch": 0.1824, "grad_norm": 1.234375, "learning_rate": 4.223140495867769e-05, "loss": 0.098, "step": 570 }, { "epoch": 0.1856, "grad_norm": 0.9375, "learning_rate": 4.206611570247934e-05, "loss": 0.0811, "step": 580 }, { "epoch": 0.1888, "grad_norm": 1.3984375, "learning_rate": 4.1900826446280994e-05, "loss": 0.1208, "step": 590 }, { "epoch": 0.192, "grad_norm": 1.34375, "learning_rate": 4.1735537190082645e-05, "loss": 0.1275, "step": 600 }, { "epoch": 0.192, "eval_loss": 0.10120312869548798, "eval_runtime": 109.2003, "eval_samples_per_second": 4.579, "eval_steps_per_second": 0.147, "step": 600 }, { "epoch": 0.1952, "grad_norm": 0.5859375, "learning_rate": 4.1570247933884296e-05, "loss": 0.0872, "step": 610 }, { "epoch": 0.1984, "grad_norm": 1.0390625, "learning_rate": 4.1404958677685954e-05, "loss": 0.1025, "step": 620 }, { "epoch": 0.2016, "grad_norm": 1.0546875, "learning_rate": 4.1239669421487606e-05, "loss": 0.0932, "step": 630 }, { "epoch": 0.2048, "grad_norm": 1.3046875, "learning_rate": 4.107438016528926e-05, "loss": 0.0838, "step": 640 }, { "epoch": 0.208, "grad_norm": 1.0703125, "learning_rate": 4.0909090909090915e-05, "loss": 0.0921, "step": 650 }, { "epoch": 0.208, "eval_loss": 0.10271484404802322, "eval_runtime": 110.3072, "eval_samples_per_second": 4.533, "eval_steps_per_second": 0.145, "step": 650 }, { "epoch": 0.2112, "grad_norm": 1.453125, "learning_rate": 4.0743801652892566e-05, "loss": 0.0842, "step": 660 }, { "epoch": 0.2144, "grad_norm": 1.9609375, "learning_rate": 4.057851239669422e-05, "loss": 0.0969, "step": 670 }, { "epoch": 0.2176, "grad_norm": 1.5859375, "learning_rate": 4.041322314049587e-05, "loss": 0.1037, "step": 680 }, { "epoch": 0.2208, "grad_norm": 0.96875, "learning_rate": 4.024793388429752e-05, "loss": 0.0872, "step": 690 }, { "epoch": 0.224, "grad_norm": 0.703125, "learning_rate": 4.008264462809918e-05, "loss": 0.0722, "step": 700 }, { "epoch": 0.224, "eval_loss": 0.10465820133686066, "eval_runtime": 109.5448, "eval_samples_per_second": 4.564, "eval_steps_per_second": 0.146, "step": 700 }, { "epoch": 0.2272, "grad_norm": 0.66015625, "learning_rate": 3.991735537190083e-05, "loss": 0.0926, "step": 710 }, { "epoch": 0.2304, "grad_norm": 3.234375, "learning_rate": 3.975206611570248e-05, "loss": 0.0945, "step": 720 }, { "epoch": 0.2336, "grad_norm": 0.5390625, "learning_rate": 3.958677685950413e-05, "loss": 0.076, "step": 730 }, { "epoch": 0.2368, "grad_norm": 1.0234375, "learning_rate": 3.942148760330578e-05, "loss": 0.0959, "step": 740 }, { "epoch": 0.24, "grad_norm": 0.455078125, "learning_rate": 3.925619834710744e-05, "loss": 0.0916, "step": 750 }, { "epoch": 0.24, "eval_loss": 0.10287109017372131, "eval_runtime": 122.5858, "eval_samples_per_second": 4.079, "eval_steps_per_second": 0.131, "step": 750 }, { "epoch": 0.2432, "grad_norm": 1.609375, "learning_rate": 3.909090909090909e-05, "loss": 0.0891, "step": 760 }, { "epoch": 0.2464, "grad_norm": 0.8359375, "learning_rate": 3.892561983471075e-05, "loss": 0.0954, "step": 770 }, { "epoch": 0.2496, "grad_norm": 1.203125, "learning_rate": 3.87603305785124e-05, "loss": 0.11, "step": 780 }, { "epoch": 0.2528, "grad_norm": 0.365234375, "learning_rate": 3.859504132231405e-05, "loss": 0.067, "step": 790 }, { "epoch": 0.256, "grad_norm": 1.1171875, "learning_rate": 3.8429752066115703e-05, "loss": 0.0909, "step": 800 }, { "epoch": 0.256, "eval_loss": 0.09786719083786011, "eval_runtime": 112.8442, "eval_samples_per_second": 4.431, "eval_steps_per_second": 0.142, "step": 800 }, { "epoch": 0.2592, "grad_norm": 0.61328125, "learning_rate": 3.8264462809917355e-05, "loss": 0.0801, "step": 810 }, { "epoch": 0.2624, "grad_norm": 1.9296875, "learning_rate": 3.8099173553719006e-05, "loss": 0.1088, "step": 820 }, { "epoch": 0.2656, "grad_norm": 1.7421875, "learning_rate": 3.7933884297520664e-05, "loss": 0.08, "step": 830 }, { "epoch": 0.2688, "grad_norm": 1.1875, "learning_rate": 3.7768595041322315e-05, "loss": 0.1055, "step": 840 }, { "epoch": 0.272, "grad_norm": 0.85546875, "learning_rate": 3.760330578512397e-05, "loss": 0.0951, "step": 850 }, { "epoch": 0.272, "eval_loss": 0.09604492038488388, "eval_runtime": 109.2932, "eval_samples_per_second": 4.575, "eval_steps_per_second": 0.146, "step": 850 }, { "epoch": 0.2752, "grad_norm": 1.7265625, "learning_rate": 3.7438016528925624e-05, "loss": 0.116, "step": 860 }, { "epoch": 0.2784, "grad_norm": 1.640625, "learning_rate": 3.7272727272727276e-05, "loss": 0.0771, "step": 870 }, { "epoch": 0.2816, "grad_norm": 0.92578125, "learning_rate": 3.710743801652893e-05, "loss": 0.08, "step": 880 }, { "epoch": 0.2848, "grad_norm": 1.7890625, "learning_rate": 3.694214876033058e-05, "loss": 0.105, "step": 890 }, { "epoch": 0.288, "grad_norm": 0.60546875, "learning_rate": 3.6776859504132236e-05, "loss": 0.0686, "step": 900 }, { "epoch": 0.288, "eval_loss": 0.08956640958786011, "eval_runtime": 109.9587, "eval_samples_per_second": 4.547, "eval_steps_per_second": 0.146, "step": 900 }, { "epoch": 0.2912, "grad_norm": 0.484375, "learning_rate": 3.661157024793389e-05, "loss": 0.0867, "step": 910 }, { "epoch": 0.2944, "grad_norm": 1.25, "learning_rate": 3.644628099173554e-05, "loss": 0.0975, "step": 920 }, { "epoch": 0.2976, "grad_norm": 1.625, "learning_rate": 3.628099173553719e-05, "loss": 0.0905, "step": 930 }, { "epoch": 0.3008, "grad_norm": 0.44140625, "learning_rate": 3.611570247933884e-05, "loss": 0.0983, "step": 940 }, { "epoch": 0.304, "grad_norm": 0.51953125, "learning_rate": 3.59504132231405e-05, "loss": 0.0772, "step": 950 }, { "epoch": 0.304, "eval_loss": 0.09039648622274399, "eval_runtime": 111.9644, "eval_samples_per_second": 4.466, "eval_steps_per_second": 0.143, "step": 950 }, { "epoch": 0.3072, "grad_norm": 0.828125, "learning_rate": 3.578512396694215e-05, "loss": 0.0907, "step": 960 }, { "epoch": 0.3104, "grad_norm": 0.84765625, "learning_rate": 3.56198347107438e-05, "loss": 0.0939, "step": 970 }, { "epoch": 0.3136, "grad_norm": 2.203125, "learning_rate": 3.545454545454546e-05, "loss": 0.0984, "step": 980 }, { "epoch": 0.3168, "grad_norm": 0.494140625, "learning_rate": 3.528925619834711e-05, "loss": 0.0924, "step": 990 }, { "epoch": 0.32, "grad_norm": 0.91015625, "learning_rate": 3.512396694214876e-05, "loss": 0.0761, "step": 1000 }, { "epoch": 0.32, "eval_loss": 0.09162402153015137, "eval_runtime": 112.9796, "eval_samples_per_second": 4.426, "eval_steps_per_second": 0.142, "step": 1000 }, { "epoch": 0.3232, "grad_norm": 0.7265625, "learning_rate": 3.495867768595041e-05, "loss": 0.0845, "step": 1010 }, { "epoch": 0.3264, "grad_norm": 1.0859375, "learning_rate": 3.4793388429752064e-05, "loss": 0.0852, "step": 1020 }, { "epoch": 0.3296, "grad_norm": 1.40625, "learning_rate": 3.4628099173553716e-05, "loss": 0.1208, "step": 1030 }, { "epoch": 0.3328, "grad_norm": 1.3203125, "learning_rate": 3.4462809917355374e-05, "loss": 0.0807, "step": 1040 }, { "epoch": 0.336, "grad_norm": 1.515625, "learning_rate": 3.429752066115703e-05, "loss": 0.0756, "step": 1050 }, { "epoch": 0.336, "eval_loss": 0.09120605140924454, "eval_runtime": 111.5564, "eval_samples_per_second": 4.482, "eval_steps_per_second": 0.143, "step": 1050 }, { "epoch": 0.3392, "grad_norm": 0.84765625, "learning_rate": 3.413223140495868e-05, "loss": 0.1092, "step": 1060 }, { "epoch": 0.3424, "grad_norm": 2.671875, "learning_rate": 3.3966942148760334e-05, "loss": 0.1217, "step": 1070 }, { "epoch": 0.3456, "grad_norm": 0.9921875, "learning_rate": 3.3801652892561985e-05, "loss": 0.0921, "step": 1080 }, { "epoch": 0.3488, "grad_norm": 1.0703125, "learning_rate": 3.3636363636363636e-05, "loss": 0.1023, "step": 1090 }, { "epoch": 0.352, "grad_norm": 2.34375, "learning_rate": 3.347107438016529e-05, "loss": 0.107, "step": 1100 }, { "epoch": 0.352, "eval_loss": 0.08991601318120956, "eval_runtime": 109.5083, "eval_samples_per_second": 4.566, "eval_steps_per_second": 0.146, "step": 1100 }, { "epoch": 0.3552, "grad_norm": 0.98828125, "learning_rate": 3.3305785123966946e-05, "loss": 0.0888, "step": 1110 }, { "epoch": 0.3584, "grad_norm": 2.640625, "learning_rate": 3.31404958677686e-05, "loss": 0.0896, "step": 1120 }, { "epoch": 0.3616, "grad_norm": 0.4375, "learning_rate": 3.297520661157025e-05, "loss": 0.072, "step": 1130 }, { "epoch": 0.3648, "grad_norm": 1.515625, "learning_rate": 3.28099173553719e-05, "loss": 0.1046, "step": 1140 }, { "epoch": 0.368, "grad_norm": 0.6640625, "learning_rate": 3.264462809917356e-05, "loss": 0.0791, "step": 1150 }, { "epoch": 0.368, "eval_loss": 0.08706250041723251, "eval_runtime": 108.4302, "eval_samples_per_second": 4.611, "eval_steps_per_second": 0.148, "step": 1150 }, { "epoch": 0.3712, "grad_norm": 1.4140625, "learning_rate": 3.247933884297521e-05, "loss": 0.0791, "step": 1160 }, { "epoch": 0.3744, "grad_norm": 1.734375, "learning_rate": 3.231404958677686e-05, "loss": 0.0854, "step": 1170 }, { "epoch": 0.3776, "grad_norm": 1.09375, "learning_rate": 3.214876033057852e-05, "loss": 0.0912, "step": 1180 }, { "epoch": 0.3808, "grad_norm": 0.48828125, "learning_rate": 3.198347107438017e-05, "loss": 0.1034, "step": 1190 }, { "epoch": 0.384, "grad_norm": 1.3515625, "learning_rate": 3.181818181818182e-05, "loss": 0.0863, "step": 1200 }, { "epoch": 0.384, "eval_loss": 0.08619336038827896, "eval_runtime": 108.0491, "eval_samples_per_second": 4.628, "eval_steps_per_second": 0.148, "step": 1200 }, { "epoch": 0.3872, "grad_norm": 0.80078125, "learning_rate": 3.165289256198347e-05, "loss": 0.0904, "step": 1210 }, { "epoch": 0.3904, "grad_norm": 0.7890625, "learning_rate": 3.148760330578512e-05, "loss": 0.1262, "step": 1220 }, { "epoch": 0.3936, "grad_norm": 1.21875, "learning_rate": 3.1322314049586774e-05, "loss": 0.08, "step": 1230 }, { "epoch": 0.3968, "grad_norm": 1.609375, "learning_rate": 3.1157024793388425e-05, "loss": 0.0861, "step": 1240 }, { "epoch": 0.4, "grad_norm": 1.28125, "learning_rate": 3.099173553719008e-05, "loss": 0.0803, "step": 1250 }, { "epoch": 0.4, "eval_loss": 0.08419531583786011, "eval_runtime": 110.4296, "eval_samples_per_second": 4.528, "eval_steps_per_second": 0.145, "step": 1250 }, { "epoch": 0.4032, "grad_norm": 1.140625, "learning_rate": 3.082644628099174e-05, "loss": 0.0889, "step": 1260 }, { "epoch": 0.4064, "grad_norm": 0.7890625, "learning_rate": 3.066115702479339e-05, "loss": 0.0879, "step": 1270 }, { "epoch": 0.4096, "grad_norm": 1.015625, "learning_rate": 3.0495867768595044e-05, "loss": 0.0776, "step": 1280 }, { "epoch": 0.4128, "grad_norm": 0.8984375, "learning_rate": 3.0330578512396695e-05, "loss": 0.0903, "step": 1290 }, { "epoch": 0.416, "grad_norm": 0.7578125, "learning_rate": 3.016528925619835e-05, "loss": 0.0877, "step": 1300 }, { "epoch": 0.416, "eval_loss": 0.08515820652246475, "eval_runtime": 107.6996, "eval_samples_per_second": 4.643, "eval_steps_per_second": 0.149, "step": 1300 }, { "epoch": 0.4192, "grad_norm": 0.84765625, "learning_rate": 3e-05, "loss": 0.0912, "step": 1310 }, { "epoch": 0.4224, "grad_norm": 0.671875, "learning_rate": 2.9834710743801652e-05, "loss": 0.0634, "step": 1320 }, { "epoch": 0.4256, "grad_norm": 1.03125, "learning_rate": 2.9669421487603307e-05, "loss": 0.076, "step": 1330 }, { "epoch": 0.4288, "grad_norm": 1.7421875, "learning_rate": 2.950413223140496e-05, "loss": 0.0911, "step": 1340 }, { "epoch": 0.432, "grad_norm": 0.859375, "learning_rate": 2.9338842975206616e-05, "loss": 0.0804, "step": 1350 }, { "epoch": 0.432, "eval_loss": 0.0840449184179306, "eval_runtime": 106.9192, "eval_samples_per_second": 4.676, "eval_steps_per_second": 0.15, "step": 1350 }, { "epoch": 0.4352, "grad_norm": 0.74609375, "learning_rate": 2.9173553719008267e-05, "loss": 0.0779, "step": 1360 }, { "epoch": 0.4384, "grad_norm": 0.93359375, "learning_rate": 2.9008264462809918e-05, "loss": 0.0749, "step": 1370 }, { "epoch": 0.4416, "grad_norm": 1.359375, "learning_rate": 2.8842975206611573e-05, "loss": 0.0772, "step": 1380 }, { "epoch": 0.4448, "grad_norm": 0.86328125, "learning_rate": 2.8677685950413224e-05, "loss": 0.0853, "step": 1390 }, { "epoch": 0.448, "grad_norm": 1.5390625, "learning_rate": 2.8512396694214875e-05, "loss": 0.1136, "step": 1400 }, { "epoch": 0.448, "eval_loss": 0.0832226574420929, "eval_runtime": 109.8641, "eval_samples_per_second": 4.551, "eval_steps_per_second": 0.146, "step": 1400 }, { "epoch": 0.4512, "grad_norm": 0.59765625, "learning_rate": 2.834710743801653e-05, "loss": 0.0655, "step": 1410 }, { "epoch": 0.4544, "grad_norm": 2.03125, "learning_rate": 2.818181818181818e-05, "loss": 0.0952, "step": 1420 }, { "epoch": 0.4576, "grad_norm": 1.1796875, "learning_rate": 2.8016528925619832e-05, "loss": 0.0812, "step": 1430 }, { "epoch": 0.4608, "grad_norm": 0.54296875, "learning_rate": 2.785123966942149e-05, "loss": 0.0649, "step": 1440 }, { "epoch": 0.464, "grad_norm": 0.671875, "learning_rate": 2.7685950413223145e-05, "loss": 0.0799, "step": 1450 }, { "epoch": 0.464, "eval_loss": 0.08352148532867432, "eval_runtime": 109.4479, "eval_samples_per_second": 4.568, "eval_steps_per_second": 0.146, "step": 1450 }, { "epoch": 0.4672, "grad_norm": 0.62890625, "learning_rate": 2.7520661157024796e-05, "loss": 0.0699, "step": 1460 }, { "epoch": 0.4704, "grad_norm": 1.734375, "learning_rate": 2.7355371900826447e-05, "loss": 0.0688, "step": 1470 }, { "epoch": 0.4736, "grad_norm": 0.93359375, "learning_rate": 2.7190082644628102e-05, "loss": 0.085, "step": 1480 }, { "epoch": 0.4768, "grad_norm": 0.490234375, "learning_rate": 2.7024793388429753e-05, "loss": 0.0874, "step": 1490 }, { "epoch": 0.48, "grad_norm": 1.703125, "learning_rate": 2.6859504132231405e-05, "loss": 0.0896, "step": 1500 }, { "epoch": 0.48, "eval_loss": 0.08368749916553497, "eval_runtime": 110.4491, "eval_samples_per_second": 4.527, "eval_steps_per_second": 0.145, "step": 1500 }, { "epoch": 0.4832, "grad_norm": 0.828125, "learning_rate": 2.669421487603306e-05, "loss": 0.1036, "step": 1510 }, { "epoch": 0.4864, "grad_norm": 1.78125, "learning_rate": 2.652892561983471e-05, "loss": 0.0917, "step": 1520 }, { "epoch": 0.4896, "grad_norm": 1.1328125, "learning_rate": 2.636363636363636e-05, "loss": 0.0985, "step": 1530 }, { "epoch": 0.4928, "grad_norm": 1.171875, "learning_rate": 2.619834710743802e-05, "loss": 0.0881, "step": 1540 }, { "epoch": 0.496, "grad_norm": 0.7734375, "learning_rate": 2.6033057851239674e-05, "loss": 0.078, "step": 1550 }, { "epoch": 0.496, "eval_loss": 0.08341991901397705, "eval_runtime": 109.2192, "eval_samples_per_second": 4.578, "eval_steps_per_second": 0.146, "step": 1550 }, { "epoch": 0.4992, "grad_norm": 1.2265625, "learning_rate": 2.5867768595041325e-05, "loss": 0.0784, "step": 1560 }, { "epoch": 0.5024, "grad_norm": 0.71484375, "learning_rate": 2.5702479338842977e-05, "loss": 0.0748, "step": 1570 }, { "epoch": 0.5056, "grad_norm": 1.203125, "learning_rate": 2.553719008264463e-05, "loss": 0.0776, "step": 1580 }, { "epoch": 0.5088, "grad_norm": 1.40625, "learning_rate": 2.5371900826446283e-05, "loss": 0.0927, "step": 1590 }, { "epoch": 0.512, "grad_norm": 0.6640625, "learning_rate": 2.5206611570247934e-05, "loss": 0.0878, "step": 1600 }, { "epoch": 0.512, "eval_loss": 0.08232617378234863, "eval_runtime": 110.112, "eval_samples_per_second": 4.541, "eval_steps_per_second": 0.145, "step": 1600 }, { "epoch": 0.5152, "grad_norm": 1.8125, "learning_rate": 2.504132231404959e-05, "loss": 0.0929, "step": 1610 }, { "epoch": 0.5184, "grad_norm": 0.75, "learning_rate": 2.4876033057851243e-05, "loss": 0.0796, "step": 1620 }, { "epoch": 0.5216, "grad_norm": 0.9921875, "learning_rate": 2.4710743801652894e-05, "loss": 0.0932, "step": 1630 }, { "epoch": 0.5248, "grad_norm": 1.0078125, "learning_rate": 2.4545454545454545e-05, "loss": 0.1084, "step": 1640 }, { "epoch": 0.528, "grad_norm": 1.03125, "learning_rate": 2.43801652892562e-05, "loss": 0.0763, "step": 1650 }, { "epoch": 0.528, "eval_loss": 0.08151757717132568, "eval_runtime": 111.3314, "eval_samples_per_second": 4.491, "eval_steps_per_second": 0.144, "step": 1650 }, { "epoch": 0.5312, "grad_norm": 0.859375, "learning_rate": 2.421487603305785e-05, "loss": 0.062, "step": 1660 }, { "epoch": 0.5344, "grad_norm": 1.0859375, "learning_rate": 2.4049586776859506e-05, "loss": 0.0628, "step": 1670 }, { "epoch": 0.5376, "grad_norm": 0.59375, "learning_rate": 2.3884297520661157e-05, "loss": 0.0908, "step": 1680 }, { "epoch": 0.5408, "grad_norm": 1.3515625, "learning_rate": 2.3719008264462812e-05, "loss": 0.0927, "step": 1690 }, { "epoch": 0.544, "grad_norm": 1.109375, "learning_rate": 2.3553719008264463e-05, "loss": 0.0958, "step": 1700 }, { "epoch": 0.544, "eval_loss": 0.08200781047344208, "eval_runtime": 114.049, "eval_samples_per_second": 4.384, "eval_steps_per_second": 0.14, "step": 1700 }, { "epoch": 0.5472, "grad_norm": 0.53125, "learning_rate": 2.3388429752066114e-05, "loss": 0.0716, "step": 1710 }, { "epoch": 0.5504, "grad_norm": 0.421875, "learning_rate": 2.3223140495867772e-05, "loss": 0.0715, "step": 1720 }, { "epoch": 0.5536, "grad_norm": 2.703125, "learning_rate": 2.3057851239669423e-05, "loss": 0.0941, "step": 1730 }, { "epoch": 0.5568, "grad_norm": 0.89453125, "learning_rate": 2.2892561983471075e-05, "loss": 0.0785, "step": 1740 }, { "epoch": 0.56, "grad_norm": 0.76953125, "learning_rate": 2.272727272727273e-05, "loss": 0.0857, "step": 1750 }, { "epoch": 0.56, "eval_loss": 0.08109960705041885, "eval_runtime": 112.2299, "eval_samples_per_second": 4.455, "eval_steps_per_second": 0.143, "step": 1750 }, { "epoch": 0.5632, "grad_norm": 0.953125, "learning_rate": 2.256198347107438e-05, "loss": 0.0601, "step": 1760 }, { "epoch": 0.5664, "grad_norm": 0.322265625, "learning_rate": 2.2396694214876035e-05, "loss": 0.0866, "step": 1770 }, { "epoch": 0.5696, "grad_norm": 1.015625, "learning_rate": 2.2231404958677686e-05, "loss": 0.0708, "step": 1780 }, { "epoch": 0.5728, "grad_norm": 0.72265625, "learning_rate": 2.206611570247934e-05, "loss": 0.0762, "step": 1790 }, { "epoch": 0.576, "grad_norm": 0.5703125, "learning_rate": 2.1900826446280992e-05, "loss": 0.0831, "step": 1800 }, { "epoch": 0.576, "eval_loss": 0.07973046600818634, "eval_runtime": 108.6444, "eval_samples_per_second": 4.602, "eval_steps_per_second": 0.147, "step": 1800 }, { "epoch": 0.5792, "grad_norm": 0.87109375, "learning_rate": 2.1735537190082643e-05, "loss": 0.0677, "step": 1810 }, { "epoch": 0.5824, "grad_norm": 0.8203125, "learning_rate": 2.1570247933884298e-05, "loss": 0.0738, "step": 1820 }, { "epoch": 0.5856, "grad_norm": 0.58203125, "learning_rate": 2.1404958677685953e-05, "loss": 0.064, "step": 1830 }, { "epoch": 0.5888, "grad_norm": 1.3359375, "learning_rate": 2.1239669421487604e-05, "loss": 0.0777, "step": 1840 }, { "epoch": 0.592, "grad_norm": 1.0078125, "learning_rate": 2.1074380165289255e-05, "loss": 0.0792, "step": 1850 }, { "epoch": 0.592, "eval_loss": 0.08046093583106995, "eval_runtime": 110.6488, "eval_samples_per_second": 4.519, "eval_steps_per_second": 0.145, "step": 1850 }, { "epoch": 0.5952, "grad_norm": 1.28125, "learning_rate": 2.090909090909091e-05, "loss": 0.0778, "step": 1860 }, { "epoch": 0.5984, "grad_norm": 0.8125, "learning_rate": 2.0743801652892564e-05, "loss": 0.0831, "step": 1870 }, { "epoch": 0.6016, "grad_norm": 0.90234375, "learning_rate": 2.0578512396694216e-05, "loss": 0.0721, "step": 1880 }, { "epoch": 0.6048, "grad_norm": 0.69140625, "learning_rate": 2.0413223140495867e-05, "loss": 0.0661, "step": 1890 }, { "epoch": 0.608, "grad_norm": 0.21484375, "learning_rate": 2.024793388429752e-05, "loss": 0.0725, "step": 1900 }, { "epoch": 0.608, "eval_loss": 0.08086328208446503, "eval_runtime": 108.5455, "eval_samples_per_second": 4.606, "eval_steps_per_second": 0.147, "step": 1900 }, { "epoch": 0.6112, "grad_norm": 0.5, "learning_rate": 2.0082644628099173e-05, "loss": 0.0813, "step": 1910 }, { "epoch": 0.6144, "grad_norm": 1.2109375, "learning_rate": 1.9917355371900827e-05, "loss": 0.0724, "step": 1920 }, { "epoch": 0.6176, "grad_norm": 0.921875, "learning_rate": 1.9752066115702482e-05, "loss": 0.0585, "step": 1930 }, { "epoch": 0.6208, "grad_norm": 1.890625, "learning_rate": 1.9586776859504133e-05, "loss": 0.0928, "step": 1940 }, { "epoch": 0.624, "grad_norm": 1.0546875, "learning_rate": 1.9421487603305784e-05, "loss": 0.0795, "step": 1950 }, { "epoch": 0.624, "eval_loss": 0.08059374988079071, "eval_runtime": 107.6528, "eval_samples_per_second": 4.645, "eval_steps_per_second": 0.149, "step": 1950 }, { "epoch": 0.6272, "grad_norm": 0.765625, "learning_rate": 1.925619834710744e-05, "loss": 0.0675, "step": 1960 }, { "epoch": 0.6304, "grad_norm": 1.6328125, "learning_rate": 1.9090909090909094e-05, "loss": 0.0896, "step": 1970 }, { "epoch": 0.6336, "grad_norm": 0.76953125, "learning_rate": 1.8925619834710745e-05, "loss": 0.075, "step": 1980 }, { "epoch": 0.6368, "grad_norm": 1.7890625, "learning_rate": 1.8760330578512396e-05, "loss": 0.0808, "step": 1990 }, { "epoch": 0.64, "grad_norm": 1.21875, "learning_rate": 1.859504132231405e-05, "loss": 0.0774, "step": 2000 }, { "epoch": 0.64, "eval_loss": 0.07988671958446503, "eval_runtime": 107.6118, "eval_samples_per_second": 4.646, "eval_steps_per_second": 0.149, "step": 2000 }, { "epoch": 0.6432, "grad_norm": 0.63671875, "learning_rate": 1.8429752066115705e-05, "loss": 0.0795, "step": 2010 }, { "epoch": 0.6464, "grad_norm": 1.4140625, "learning_rate": 1.8264462809917356e-05, "loss": 0.1115, "step": 2020 }, { "epoch": 0.6496, "grad_norm": 0.8203125, "learning_rate": 1.8099173553719008e-05, "loss": 0.079, "step": 2030 }, { "epoch": 0.6528, "grad_norm": 0.28515625, "learning_rate": 1.7933884297520662e-05, "loss": 0.0646, "step": 2040 }, { "epoch": 0.656, "grad_norm": 1.25, "learning_rate": 1.7768595041322314e-05, "loss": 0.0846, "step": 2050 }, { "epoch": 0.656, "eval_loss": 0.07978320121765137, "eval_runtime": 107.8393, "eval_samples_per_second": 4.637, "eval_steps_per_second": 0.148, "step": 2050 }, { "epoch": 0.6592, "grad_norm": 0.55859375, "learning_rate": 1.7603305785123968e-05, "loss": 0.0949, "step": 2060 }, { "epoch": 0.6624, "grad_norm": 0.6171875, "learning_rate": 1.7438016528925623e-05, "loss": 0.0822, "step": 2070 }, { "epoch": 0.6656, "grad_norm": 0.75390625, "learning_rate": 1.7272727272727274e-05, "loss": 0.0666, "step": 2080 }, { "epoch": 0.6688, "grad_norm": 1.1796875, "learning_rate": 1.7107438016528925e-05, "loss": 0.0745, "step": 2090 }, { "epoch": 0.672, "grad_norm": 1.2265625, "learning_rate": 1.694214876033058e-05, "loss": 0.1087, "step": 2100 }, { "epoch": 0.672, "eval_loss": 0.07949023693799973, "eval_runtime": 112.9895, "eval_samples_per_second": 4.425, "eval_steps_per_second": 0.142, "step": 2100 }, { "epoch": 0.6752, "grad_norm": 1.953125, "learning_rate": 1.6776859504132234e-05, "loss": 0.0917, "step": 2110 }, { "epoch": 0.6784, "grad_norm": 0.56640625, "learning_rate": 1.6611570247933886e-05, "loss": 0.075, "step": 2120 }, { "epoch": 0.6816, "grad_norm": 1.296875, "learning_rate": 1.6446280991735537e-05, "loss": 0.0788, "step": 2130 }, { "epoch": 0.6848, "grad_norm": 1.203125, "learning_rate": 1.628099173553719e-05, "loss": 0.0821, "step": 2140 }, { "epoch": 0.688, "grad_norm": 1.296875, "learning_rate": 1.6115702479338843e-05, "loss": 0.0836, "step": 2150 }, { "epoch": 0.688, "eval_loss": 0.0802304670214653, "eval_runtime": 109.4516, "eval_samples_per_second": 4.568, "eval_steps_per_second": 0.146, "step": 2150 }, { "epoch": 0.6912, "grad_norm": 1.21875, "learning_rate": 1.5950413223140497e-05, "loss": 0.0845, "step": 2160 }, { "epoch": 0.6944, "grad_norm": 0.69921875, "learning_rate": 1.578512396694215e-05, "loss": 0.0701, "step": 2170 }, { "epoch": 0.6976, "grad_norm": 1.5078125, "learning_rate": 1.5619834710743803e-05, "loss": 0.0789, "step": 2180 }, { "epoch": 0.7008, "grad_norm": 1.1015625, "learning_rate": 1.5454545454545454e-05, "loss": 0.0611, "step": 2190 }, { "epoch": 0.704, "grad_norm": 1.578125, "learning_rate": 1.5289256198347106e-05, "loss": 0.0782, "step": 2200 }, { "epoch": 0.704, "eval_loss": 0.07987890392541885, "eval_runtime": 108.1627, "eval_samples_per_second": 4.623, "eval_steps_per_second": 0.148, "step": 2200 }, { "epoch": 0.7072, "grad_norm": 0.69921875, "learning_rate": 1.5123966942148762e-05, "loss": 0.0692, "step": 2210 }, { "epoch": 0.7104, "grad_norm": 0.9140625, "learning_rate": 1.4958677685950415e-05, "loss": 0.0656, "step": 2220 }, { "epoch": 0.7136, "grad_norm": 1.2890625, "learning_rate": 1.4793388429752068e-05, "loss": 0.0854, "step": 2230 }, { "epoch": 0.7168, "grad_norm": 0.5703125, "learning_rate": 1.4628099173553719e-05, "loss": 0.0798, "step": 2240 }, { "epoch": 0.72, "grad_norm": 0.458984375, "learning_rate": 1.4462809917355372e-05, "loss": 0.0885, "step": 2250 }, { "epoch": 0.72, "eval_loss": 0.07889453321695328, "eval_runtime": 110.4069, "eval_samples_per_second": 4.529, "eval_steps_per_second": 0.145, "step": 2250 }, { "epoch": 0.7232, "grad_norm": 1.234375, "learning_rate": 1.4297520661157027e-05, "loss": 0.0813, "step": 2260 }, { "epoch": 0.7264, "grad_norm": 0.76171875, "learning_rate": 1.413223140495868e-05, "loss": 0.0615, "step": 2270 }, { "epoch": 0.7296, "grad_norm": 1.0390625, "learning_rate": 1.396694214876033e-05, "loss": 0.0773, "step": 2280 }, { "epoch": 0.7328, "grad_norm": 1.1484375, "learning_rate": 1.3801652892561984e-05, "loss": 0.0854, "step": 2290 }, { "epoch": 0.736, "grad_norm": 1.390625, "learning_rate": 1.3636363636363637e-05, "loss": 0.0926, "step": 2300 }, { "epoch": 0.736, "eval_loss": 0.07876367121934891, "eval_runtime": 120.6799, "eval_samples_per_second": 4.143, "eval_steps_per_second": 0.133, "step": 2300 }, { "epoch": 0.7392, "grad_norm": 1.046875, "learning_rate": 1.3471074380165291e-05, "loss": 0.0992, "step": 2310 }, { "epoch": 0.7424, "grad_norm": 0.44140625, "learning_rate": 1.3305785123966944e-05, "loss": 0.0629, "step": 2320 }, { "epoch": 0.7456, "grad_norm": 0.7421875, "learning_rate": 1.3140495867768595e-05, "loss": 0.0799, "step": 2330 }, { "epoch": 0.7488, "grad_norm": 1.203125, "learning_rate": 1.2975206611570248e-05, "loss": 0.092, "step": 2340 }, { "epoch": 0.752, "grad_norm": 1.34375, "learning_rate": 1.2809917355371901e-05, "loss": 0.064, "step": 2350 }, { "epoch": 0.752, "eval_loss": 0.07899804413318634, "eval_runtime": 110.2904, "eval_samples_per_second": 4.533, "eval_steps_per_second": 0.145, "step": 2350 }, { "epoch": 0.7552, "grad_norm": 0.6875, "learning_rate": 1.2644628099173556e-05, "loss": 0.0833, "step": 2360 }, { "epoch": 0.7584, "grad_norm": 2.015625, "learning_rate": 1.2479338842975207e-05, "loss": 0.0841, "step": 2370 }, { "epoch": 0.7616, "grad_norm": 1.1875, "learning_rate": 1.231404958677686e-05, "loss": 0.0798, "step": 2380 }, { "epoch": 0.7648, "grad_norm": 1.7421875, "learning_rate": 1.2148760330578513e-05, "loss": 0.0956, "step": 2390 }, { "epoch": 0.768, "grad_norm": 0.99609375, "learning_rate": 1.1983471074380166e-05, "loss": 0.0798, "step": 2400 }, { "epoch": 0.768, "eval_loss": 0.07969531416893005, "eval_runtime": 110.6153, "eval_samples_per_second": 4.52, "eval_steps_per_second": 0.145, "step": 2400 }, { "epoch": 0.7712, "grad_norm": 0.5078125, "learning_rate": 1.1818181818181819e-05, "loss": 0.0711, "step": 2410 }, { "epoch": 0.7744, "grad_norm": 1.4765625, "learning_rate": 1.1652892561983472e-05, "loss": 0.0708, "step": 2420 }, { "epoch": 0.7776, "grad_norm": 0.9609375, "learning_rate": 1.1487603305785125e-05, "loss": 0.0656, "step": 2430 }, { "epoch": 0.7808, "grad_norm": 0.71484375, "learning_rate": 1.1322314049586777e-05, "loss": 0.0723, "step": 2440 }, { "epoch": 0.784, "grad_norm": 1.09375, "learning_rate": 1.115702479338843e-05, "loss": 0.0789, "step": 2450 }, { "epoch": 0.784, "eval_loss": 0.0796191394329071, "eval_runtime": 110.6578, "eval_samples_per_second": 4.518, "eval_steps_per_second": 0.145, "step": 2450 }, { "epoch": 0.7872, "grad_norm": 1.046875, "learning_rate": 1.0991735537190083e-05, "loss": 0.0755, "step": 2460 }, { "epoch": 0.7904, "grad_norm": 1.140625, "learning_rate": 1.0826446280991736e-05, "loss": 0.0833, "step": 2470 }, { "epoch": 0.7936, "grad_norm": 0.5625, "learning_rate": 1.0661157024793389e-05, "loss": 0.1063, "step": 2480 }, { "epoch": 0.7968, "grad_norm": 0.48046875, "learning_rate": 1.0495867768595042e-05, "loss": 0.0747, "step": 2490 }, { "epoch": 0.8, "grad_norm": 1.390625, "learning_rate": 1.0330578512396695e-05, "loss": 0.0997, "step": 2500 }, { "epoch": 0.8, "eval_loss": 0.0799550786614418, "eval_runtime": 106.8244, "eval_samples_per_second": 4.681, "eval_steps_per_second": 0.15, "step": 2500 }, { "epoch": 0.8032, "grad_norm": 1.046875, "learning_rate": 1.0165289256198348e-05, "loss": 0.0737, "step": 2510 }, { "epoch": 0.8064, "grad_norm": 1.1328125, "learning_rate": 1e-05, "loss": 0.099, "step": 2520 }, { "epoch": 0.8096, "grad_norm": 0.58203125, "learning_rate": 9.834710743801654e-06, "loss": 0.0668, "step": 2530 }, { "epoch": 0.8128, "grad_norm": 1.140625, "learning_rate": 9.669421487603305e-06, "loss": 0.0772, "step": 2540 }, { "epoch": 0.816, "grad_norm": 0.7734375, "learning_rate": 9.50413223140496e-06, "loss": 0.0755, "step": 2550 }, { "epoch": 0.816, "eval_loss": 0.07922851294279099, "eval_runtime": 106.4516, "eval_samples_per_second": 4.697, "eval_steps_per_second": 0.15, "step": 2550 }, { "epoch": 0.8192, "grad_norm": 2.125, "learning_rate": 9.338842975206613e-06, "loss": 0.0735, "step": 2560 }, { "epoch": 0.8224, "grad_norm": 1.7265625, "learning_rate": 9.173553719008265e-06, "loss": 0.0879, "step": 2570 }, { "epoch": 0.8256, "grad_norm": 0.2578125, "learning_rate": 9.008264462809918e-06, "loss": 0.075, "step": 2580 }, { "epoch": 0.8288, "grad_norm": 0.9375, "learning_rate": 8.84297520661157e-06, "loss": 0.0718, "step": 2590 }, { "epoch": 0.832, "grad_norm": 0.828125, "learning_rate": 8.677685950413224e-06, "loss": 0.069, "step": 2600 }, { "epoch": 0.832, "eval_loss": 0.07956640422344208, "eval_runtime": 116.0737, "eval_samples_per_second": 4.308, "eval_steps_per_second": 0.138, "step": 2600 }, { "epoch": 0.8352, "grad_norm": 1.7109375, "learning_rate": 8.512396694214875e-06, "loss": 0.1006, "step": 2610 }, { "epoch": 0.8384, "grad_norm": 1.34375, "learning_rate": 8.34710743801653e-06, "loss": 0.0771, "step": 2620 }, { "epoch": 0.8416, "grad_norm": 0.9375, "learning_rate": 8.181818181818183e-06, "loss": 0.0668, "step": 2630 }, { "epoch": 0.8448, "grad_norm": 1.1875, "learning_rate": 8.016528925619834e-06, "loss": 0.0663, "step": 2640 }, { "epoch": 0.848, "grad_norm": 0.466796875, "learning_rate": 7.851239669421489e-06, "loss": 0.0687, "step": 2650 }, { "epoch": 0.848, "eval_loss": 0.07871288806200027, "eval_runtime": 108.1878, "eval_samples_per_second": 4.622, "eval_steps_per_second": 0.148, "step": 2650 }, { "epoch": 0.8512, "grad_norm": 1.0546875, "learning_rate": 7.68595041322314e-06, "loss": 0.0899, "step": 2660 }, { "epoch": 0.8544, "grad_norm": 0.9921875, "learning_rate": 7.520661157024795e-06, "loss": 0.0735, "step": 2670 }, { "epoch": 0.8576, "grad_norm": 0.416015625, "learning_rate": 7.355371900826447e-06, "loss": 0.0905, "step": 2680 }, { "epoch": 0.8608, "grad_norm": 1.2109375, "learning_rate": 7.190082644628099e-06, "loss": 0.0664, "step": 2690 }, { "epoch": 0.864, "grad_norm": 1.09375, "learning_rate": 7.0247933884297525e-06, "loss": 0.0791, "step": 2700 }, { "epoch": 0.864, "eval_loss": 0.07861328125, "eval_runtime": 109.1401, "eval_samples_per_second": 4.581, "eval_steps_per_second": 0.147, "step": 2700 }, { "epoch": 0.8672, "grad_norm": 0.470703125, "learning_rate": 6.859504132231405e-06, "loss": 0.0588, "step": 2710 }, { "epoch": 0.8704, "grad_norm": 0.4453125, "learning_rate": 6.694214876033058e-06, "loss": 0.1022, "step": 2720 }, { "epoch": 0.8736, "grad_norm": 0.77734375, "learning_rate": 6.528925619834711e-06, "loss": 0.0645, "step": 2730 }, { "epoch": 0.8768, "grad_norm": 0.79296875, "learning_rate": 6.363636363636363e-06, "loss": 0.0688, "step": 2740 }, { "epoch": 0.88, "grad_norm": 1.0, "learning_rate": 6.198347107438017e-06, "loss": 0.0763, "step": 2750 }, { "epoch": 0.88, "eval_loss": 0.07813085615634918, "eval_runtime": 107.2687, "eval_samples_per_second": 4.661, "eval_steps_per_second": 0.149, "step": 2750 }, { "epoch": 0.8832, "grad_norm": 1.0625, "learning_rate": 6.03305785123967e-06, "loss": 0.0619, "step": 2760 }, { "epoch": 0.8864, "grad_norm": 0.81640625, "learning_rate": 5.867768595041322e-06, "loss": 0.0861, "step": 2770 }, { "epoch": 0.8896, "grad_norm": 1.140625, "learning_rate": 5.702479338842975e-06, "loss": 0.0573, "step": 2780 }, { "epoch": 0.8928, "grad_norm": 0.6015625, "learning_rate": 5.537190082644628e-06, "loss": 0.0546, "step": 2790 }, { "epoch": 0.896, "grad_norm": 0.408203125, "learning_rate": 5.371900826446282e-06, "loss": 0.0656, "step": 2800 }, { "epoch": 0.896, "eval_loss": 0.0781230479478836, "eval_runtime": 120.1101, "eval_samples_per_second": 4.163, "eval_steps_per_second": 0.133, "step": 2800 }, { "epoch": 0.8992, "grad_norm": 1.4609375, "learning_rate": 5.206611570247935e-06, "loss": 0.0634, "step": 2810 }, { "epoch": 0.9024, "grad_norm": 1.515625, "learning_rate": 5.041322314049587e-06, "loss": 0.093, "step": 2820 }, { "epoch": 0.9056, "grad_norm": 1.8359375, "learning_rate": 4.87603305785124e-06, "loss": 0.0624, "step": 2830 }, { "epoch": 0.9088, "grad_norm": 1.0859375, "learning_rate": 4.710743801652893e-06, "loss": 0.0911, "step": 2840 }, { "epoch": 0.912, "grad_norm": 1.171875, "learning_rate": 4.5454545454545455e-06, "loss": 0.074, "step": 2850 }, { "epoch": 0.912, "eval_loss": 0.07803711295127869, "eval_runtime": 109.0375, "eval_samples_per_second": 4.586, "eval_steps_per_second": 0.147, "step": 2850 }, { "epoch": 0.9152, "grad_norm": 0.62890625, "learning_rate": 4.3801652892561984e-06, "loss": 0.0734, "step": 2860 }, { "epoch": 0.9184, "grad_norm": 0.5703125, "learning_rate": 4.214876033057851e-06, "loss": 0.0521, "step": 2870 }, { "epoch": 0.9216, "grad_norm": 1.453125, "learning_rate": 4.049586776859504e-06, "loss": 0.0843, "step": 2880 }, { "epoch": 0.9248, "grad_norm": 0.302734375, "learning_rate": 3.884297520661157e-06, "loss": 0.0775, "step": 2890 }, { "epoch": 0.928, "grad_norm": 1.1328125, "learning_rate": 3.71900826446281e-06, "loss": 0.0974, "step": 2900 }, { "epoch": 0.928, "eval_loss": 0.07862695306539536, "eval_runtime": 109.8662, "eval_samples_per_second": 4.551, "eval_steps_per_second": 0.146, "step": 2900 }, { "epoch": 0.9312, "grad_norm": 1.3515625, "learning_rate": 3.553719008264463e-06, "loss": 0.0611, "step": 2910 }, { "epoch": 0.9344, "grad_norm": 1.0078125, "learning_rate": 3.3884297520661155e-06, "loss": 0.0697, "step": 2920 }, { "epoch": 0.9376, "grad_norm": 0.56640625, "learning_rate": 3.2231404958677685e-06, "loss": 0.076, "step": 2930 }, { "epoch": 0.9408, "grad_norm": 1.5, "learning_rate": 3.0578512396694214e-06, "loss": 0.084, "step": 2940 }, { "epoch": 0.944, "grad_norm": 1.015625, "learning_rate": 2.8925619834710747e-06, "loss": 0.0769, "step": 2950 }, { "epoch": 0.944, "eval_loss": 0.07859765738248825, "eval_runtime": 109.8176, "eval_samples_per_second": 4.553, "eval_steps_per_second": 0.146, "step": 2950 }, { "epoch": 0.9472, "grad_norm": 1.1640625, "learning_rate": 2.7272727272727272e-06, "loss": 0.0722, "step": 2960 }, { "epoch": 0.9504, "grad_norm": 1.1015625, "learning_rate": 2.56198347107438e-06, "loss": 0.0839, "step": 2970 }, { "epoch": 0.9536, "grad_norm": 1.171875, "learning_rate": 2.3966942148760335e-06, "loss": 0.0818, "step": 2980 }, { "epoch": 0.9568, "grad_norm": 1.5, "learning_rate": 2.231404958677686e-06, "loss": 0.0757, "step": 2990 }, { "epoch": 0.96, "grad_norm": 1.015625, "learning_rate": 2.066115702479339e-06, "loss": 0.0816, "step": 3000 }, { "epoch": 0.96, "eval_loss": 0.07787304371595383, "eval_runtime": 108.0913, "eval_samples_per_second": 4.626, "eval_steps_per_second": 0.148, "step": 3000 } ], "logging_steps": 10, "max_steps": 3125, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.75772653691904e+18, "train_batch_size": 32, "trial_name": null, "trial_params": null }