|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.96, |
|
"eval_steps": 50, |
|
"global_step": 3000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0032, |
|
"grad_norm": 0.2421875, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.0064, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 1e-05, |
|
"loss": 0.6988, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.0096, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 1.5e-05, |
|
"loss": 0.6977, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.0128, |
|
"grad_norm": 0.244140625, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6969, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.016, |
|
"grad_norm": 0.259765625, |
|
"learning_rate": 2.5e-05, |
|
"loss": 0.6941, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.016, |
|
"eval_loss": 0.6939687728881836, |
|
"eval_runtime": 192.7405, |
|
"eval_samples_per_second": 2.594, |
|
"eval_steps_per_second": 0.083, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.0192, |
|
"grad_norm": 0.232421875, |
|
"learning_rate": 3e-05, |
|
"loss": 0.6871, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.0224, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 3.5e-05, |
|
"loss": 0.6828, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.0256, |
|
"grad_norm": 0.8203125, |
|
"learning_rate": 4e-05, |
|
"loss": 0.6687, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.0288, |
|
"grad_norm": 1.2421875, |
|
"learning_rate": 4.5e-05, |
|
"loss": 0.6309, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.032, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 5e-05, |
|
"loss": 0.5781, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.032, |
|
"eval_loss": 0.5389062762260437, |
|
"eval_runtime": 108.3608, |
|
"eval_samples_per_second": 4.614, |
|
"eval_steps_per_second": 0.148, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.0352, |
|
"grad_norm": 2.328125, |
|
"learning_rate": 4.9834710743801654e-05, |
|
"loss": 0.4818, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.0384, |
|
"grad_norm": 2.265625, |
|
"learning_rate": 4.9669421487603305e-05, |
|
"loss": 0.4291, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.0416, |
|
"grad_norm": 2.203125, |
|
"learning_rate": 4.9504132231404956e-05, |
|
"loss": 0.3615, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.0448, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 4.9338842975206614e-05, |
|
"loss": 0.3408, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.048, |
|
"grad_norm": 1.515625, |
|
"learning_rate": 4.917355371900827e-05, |
|
"loss": 0.306, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.048, |
|
"eval_loss": 0.2862031161785126, |
|
"eval_runtime": 110.0641, |
|
"eval_samples_per_second": 4.543, |
|
"eval_steps_per_second": 0.145, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.0512, |
|
"grad_norm": 1.7265625, |
|
"learning_rate": 4.900826446280992e-05, |
|
"loss": 0.2904, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.0544, |
|
"grad_norm": 1.5703125, |
|
"learning_rate": 4.8842975206611575e-05, |
|
"loss": 0.2648, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.0576, |
|
"grad_norm": 1.6875, |
|
"learning_rate": 4.8677685950413226e-05, |
|
"loss": 0.2373, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.0608, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 4.851239669421488e-05, |
|
"loss": 0.1749, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.064, |
|
"grad_norm": 0.8671875, |
|
"learning_rate": 4.834710743801653e-05, |
|
"loss": 0.1556, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.064, |
|
"eval_loss": 0.16703906655311584, |
|
"eval_runtime": 109.9961, |
|
"eval_samples_per_second": 4.546, |
|
"eval_steps_per_second": 0.145, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.0672, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 4.8181818181818186e-05, |
|
"loss": 0.1382, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.0704, |
|
"grad_norm": 1.859375, |
|
"learning_rate": 4.801652892561984e-05, |
|
"loss": 0.125, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.0736, |
|
"grad_norm": 1.3515625, |
|
"learning_rate": 4.785123966942149e-05, |
|
"loss": 0.1729, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.0768, |
|
"grad_norm": 1.1953125, |
|
"learning_rate": 4.768595041322314e-05, |
|
"loss": 0.1252, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 4.75206611570248e-05, |
|
"loss": 0.1108, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"eval_loss": 0.13416016101837158, |
|
"eval_runtime": 110.9679, |
|
"eval_samples_per_second": 4.506, |
|
"eval_steps_per_second": 0.144, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.0832, |
|
"grad_norm": 1.7890625, |
|
"learning_rate": 4.735537190082645e-05, |
|
"loss": 0.1502, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.0864, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 4.71900826446281e-05, |
|
"loss": 0.1291, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.0896, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 4.702479338842976e-05, |
|
"loss": 0.1326, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.0928, |
|
"grad_norm": 1.7265625, |
|
"learning_rate": 4.685950413223141e-05, |
|
"loss": 0.1374, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.096, |
|
"grad_norm": 1.15625, |
|
"learning_rate": 4.669421487603306e-05, |
|
"loss": 0.1081, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.096, |
|
"eval_loss": 0.12271875143051147, |
|
"eval_runtime": 110.6168, |
|
"eval_samples_per_second": 4.52, |
|
"eval_steps_per_second": 0.145, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.0992, |
|
"grad_norm": 1.140625, |
|
"learning_rate": 4.652892561983471e-05, |
|
"loss": 0.1266, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.1024, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 4.636363636363636e-05, |
|
"loss": 0.1368, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.1056, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 4.6198347107438014e-05, |
|
"loss": 0.1064, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.1088, |
|
"grad_norm": 1.4375, |
|
"learning_rate": 4.603305785123967e-05, |
|
"loss": 0.1233, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.112, |
|
"grad_norm": 0.6328125, |
|
"learning_rate": 4.586776859504133e-05, |
|
"loss": 0.0881, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.112, |
|
"eval_loss": 0.1180742159485817, |
|
"eval_runtime": 109.5127, |
|
"eval_samples_per_second": 4.566, |
|
"eval_steps_per_second": 0.146, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.1152, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 4.570247933884298e-05, |
|
"loss": 0.1136, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.1184, |
|
"grad_norm": 0.89453125, |
|
"learning_rate": 4.553719008264463e-05, |
|
"loss": 0.1338, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.1216, |
|
"grad_norm": 2.65625, |
|
"learning_rate": 4.5371900826446284e-05, |
|
"loss": 0.1636, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.1248, |
|
"grad_norm": 2.375, |
|
"learning_rate": 4.5206611570247935e-05, |
|
"loss": 0.1067, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.128, |
|
"grad_norm": 1.078125, |
|
"learning_rate": 4.504132231404959e-05, |
|
"loss": 0.1287, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.128, |
|
"eval_loss": 0.1135859340429306, |
|
"eval_runtime": 115.1853, |
|
"eval_samples_per_second": 4.341, |
|
"eval_steps_per_second": 0.139, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.1312, |
|
"grad_norm": 1.5859375, |
|
"learning_rate": 4.487603305785124e-05, |
|
"loss": 0.122, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.1344, |
|
"grad_norm": 1.6796875, |
|
"learning_rate": 4.4710743801652896e-05, |
|
"loss": 0.1232, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.1376, |
|
"grad_norm": 0.99609375, |
|
"learning_rate": 4.454545454545455e-05, |
|
"loss": 0.11, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.1408, |
|
"grad_norm": 1.6328125, |
|
"learning_rate": 4.43801652892562e-05, |
|
"loss": 0.0937, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.144, |
|
"grad_norm": 1.8671875, |
|
"learning_rate": 4.4214876033057856e-05, |
|
"loss": 0.1162, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.144, |
|
"eval_loss": 0.10819531232118607, |
|
"eval_runtime": 110.4172, |
|
"eval_samples_per_second": 4.528, |
|
"eval_steps_per_second": 0.145, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.1472, |
|
"grad_norm": 3.78125, |
|
"learning_rate": 4.404958677685951e-05, |
|
"loss": 0.1171, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.1504, |
|
"grad_norm": 1.71875, |
|
"learning_rate": 4.388429752066116e-05, |
|
"loss": 0.1032, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.1536, |
|
"grad_norm": 2.796875, |
|
"learning_rate": 4.371900826446281e-05, |
|
"loss": 0.1217, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.1568, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 4.355371900826447e-05, |
|
"loss": 0.1197, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 4.338842975206612e-05, |
|
"loss": 0.0878, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"eval_loss": 0.10691406577825546, |
|
"eval_runtime": 110.9181, |
|
"eval_samples_per_second": 4.508, |
|
"eval_steps_per_second": 0.144, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.1632, |
|
"grad_norm": 1.3125, |
|
"learning_rate": 4.322314049586777e-05, |
|
"loss": 0.1102, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.1664, |
|
"grad_norm": 1.171875, |
|
"learning_rate": 4.305785123966942e-05, |
|
"loss": 0.0982, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.1696, |
|
"grad_norm": 0.90625, |
|
"learning_rate": 4.289256198347107e-05, |
|
"loss": 0.1096, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.1728, |
|
"grad_norm": 0.93359375, |
|
"learning_rate": 4.2727272727272724e-05, |
|
"loss": 0.1269, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.176, |
|
"grad_norm": 1.609375, |
|
"learning_rate": 4.256198347107438e-05, |
|
"loss": 0.0994, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.176, |
|
"eval_loss": 0.10447265952825546, |
|
"eval_runtime": 110.3941, |
|
"eval_samples_per_second": 4.529, |
|
"eval_steps_per_second": 0.145, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.1792, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 4.239669421487604e-05, |
|
"loss": 0.1118, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.1824, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 4.223140495867769e-05, |
|
"loss": 0.098, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.1856, |
|
"grad_norm": 0.9375, |
|
"learning_rate": 4.206611570247934e-05, |
|
"loss": 0.0811, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.1888, |
|
"grad_norm": 1.3984375, |
|
"learning_rate": 4.1900826446280994e-05, |
|
"loss": 0.1208, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.192, |
|
"grad_norm": 1.34375, |
|
"learning_rate": 4.1735537190082645e-05, |
|
"loss": 0.1275, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.192, |
|
"eval_loss": 0.10120312869548798, |
|
"eval_runtime": 109.2003, |
|
"eval_samples_per_second": 4.579, |
|
"eval_steps_per_second": 0.147, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.1952, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 4.1570247933884296e-05, |
|
"loss": 0.0872, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.1984, |
|
"grad_norm": 1.0390625, |
|
"learning_rate": 4.1404958677685954e-05, |
|
"loss": 0.1025, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.2016, |
|
"grad_norm": 1.0546875, |
|
"learning_rate": 4.1239669421487606e-05, |
|
"loss": 0.0932, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.2048, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 4.107438016528926e-05, |
|
"loss": 0.0838, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.208, |
|
"grad_norm": 1.0703125, |
|
"learning_rate": 4.0909090909090915e-05, |
|
"loss": 0.0921, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.208, |
|
"eval_loss": 0.10271484404802322, |
|
"eval_runtime": 110.3072, |
|
"eval_samples_per_second": 4.533, |
|
"eval_steps_per_second": 0.145, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.2112, |
|
"grad_norm": 1.453125, |
|
"learning_rate": 4.0743801652892566e-05, |
|
"loss": 0.0842, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.2144, |
|
"grad_norm": 1.9609375, |
|
"learning_rate": 4.057851239669422e-05, |
|
"loss": 0.0969, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.2176, |
|
"grad_norm": 1.5859375, |
|
"learning_rate": 4.041322314049587e-05, |
|
"loss": 0.1037, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.2208, |
|
"grad_norm": 0.96875, |
|
"learning_rate": 4.024793388429752e-05, |
|
"loss": 0.0872, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.224, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 4.008264462809918e-05, |
|
"loss": 0.0722, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.224, |
|
"eval_loss": 0.10465820133686066, |
|
"eval_runtime": 109.5448, |
|
"eval_samples_per_second": 4.564, |
|
"eval_steps_per_second": 0.146, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.2272, |
|
"grad_norm": 0.66015625, |
|
"learning_rate": 3.991735537190083e-05, |
|
"loss": 0.0926, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.2304, |
|
"grad_norm": 3.234375, |
|
"learning_rate": 3.975206611570248e-05, |
|
"loss": 0.0945, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.2336, |
|
"grad_norm": 0.5390625, |
|
"learning_rate": 3.958677685950413e-05, |
|
"loss": 0.076, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.2368, |
|
"grad_norm": 1.0234375, |
|
"learning_rate": 3.942148760330578e-05, |
|
"loss": 0.0959, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.455078125, |
|
"learning_rate": 3.925619834710744e-05, |
|
"loss": 0.0916, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"eval_loss": 0.10287109017372131, |
|
"eval_runtime": 122.5858, |
|
"eval_samples_per_second": 4.079, |
|
"eval_steps_per_second": 0.131, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.2432, |
|
"grad_norm": 1.609375, |
|
"learning_rate": 3.909090909090909e-05, |
|
"loss": 0.0891, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.2464, |
|
"grad_norm": 0.8359375, |
|
"learning_rate": 3.892561983471075e-05, |
|
"loss": 0.0954, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.2496, |
|
"grad_norm": 1.203125, |
|
"learning_rate": 3.87603305785124e-05, |
|
"loss": 0.11, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.2528, |
|
"grad_norm": 0.365234375, |
|
"learning_rate": 3.859504132231405e-05, |
|
"loss": 0.067, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.256, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 3.8429752066115703e-05, |
|
"loss": 0.0909, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.256, |
|
"eval_loss": 0.09786719083786011, |
|
"eval_runtime": 112.8442, |
|
"eval_samples_per_second": 4.431, |
|
"eval_steps_per_second": 0.142, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.2592, |
|
"grad_norm": 0.61328125, |
|
"learning_rate": 3.8264462809917355e-05, |
|
"loss": 0.0801, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.2624, |
|
"grad_norm": 1.9296875, |
|
"learning_rate": 3.8099173553719006e-05, |
|
"loss": 0.1088, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.2656, |
|
"grad_norm": 1.7421875, |
|
"learning_rate": 3.7933884297520664e-05, |
|
"loss": 0.08, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.2688, |
|
"grad_norm": 1.1875, |
|
"learning_rate": 3.7768595041322315e-05, |
|
"loss": 0.1055, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.272, |
|
"grad_norm": 0.85546875, |
|
"learning_rate": 3.760330578512397e-05, |
|
"loss": 0.0951, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.272, |
|
"eval_loss": 0.09604492038488388, |
|
"eval_runtime": 109.2932, |
|
"eval_samples_per_second": 4.575, |
|
"eval_steps_per_second": 0.146, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.2752, |
|
"grad_norm": 1.7265625, |
|
"learning_rate": 3.7438016528925624e-05, |
|
"loss": 0.116, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.2784, |
|
"grad_norm": 1.640625, |
|
"learning_rate": 3.7272727272727276e-05, |
|
"loss": 0.0771, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.2816, |
|
"grad_norm": 0.92578125, |
|
"learning_rate": 3.710743801652893e-05, |
|
"loss": 0.08, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.2848, |
|
"grad_norm": 1.7890625, |
|
"learning_rate": 3.694214876033058e-05, |
|
"loss": 0.105, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.288, |
|
"grad_norm": 0.60546875, |
|
"learning_rate": 3.6776859504132236e-05, |
|
"loss": 0.0686, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.288, |
|
"eval_loss": 0.08956640958786011, |
|
"eval_runtime": 109.9587, |
|
"eval_samples_per_second": 4.547, |
|
"eval_steps_per_second": 0.146, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.2912, |
|
"grad_norm": 0.484375, |
|
"learning_rate": 3.661157024793389e-05, |
|
"loss": 0.0867, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.2944, |
|
"grad_norm": 1.25, |
|
"learning_rate": 3.644628099173554e-05, |
|
"loss": 0.0975, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.2976, |
|
"grad_norm": 1.625, |
|
"learning_rate": 3.628099173553719e-05, |
|
"loss": 0.0905, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.3008, |
|
"grad_norm": 0.44140625, |
|
"learning_rate": 3.611570247933884e-05, |
|
"loss": 0.0983, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.304, |
|
"grad_norm": 0.51953125, |
|
"learning_rate": 3.59504132231405e-05, |
|
"loss": 0.0772, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.304, |
|
"eval_loss": 0.09039648622274399, |
|
"eval_runtime": 111.9644, |
|
"eval_samples_per_second": 4.466, |
|
"eval_steps_per_second": 0.143, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.3072, |
|
"grad_norm": 0.828125, |
|
"learning_rate": 3.578512396694215e-05, |
|
"loss": 0.0907, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.3104, |
|
"grad_norm": 0.84765625, |
|
"learning_rate": 3.56198347107438e-05, |
|
"loss": 0.0939, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.3136, |
|
"grad_norm": 2.203125, |
|
"learning_rate": 3.545454545454546e-05, |
|
"loss": 0.0984, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.3168, |
|
"grad_norm": 0.494140625, |
|
"learning_rate": 3.528925619834711e-05, |
|
"loss": 0.0924, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.91015625, |
|
"learning_rate": 3.512396694214876e-05, |
|
"loss": 0.0761, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"eval_loss": 0.09162402153015137, |
|
"eval_runtime": 112.9796, |
|
"eval_samples_per_second": 4.426, |
|
"eval_steps_per_second": 0.142, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.3232, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 3.495867768595041e-05, |
|
"loss": 0.0845, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.3264, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 3.4793388429752064e-05, |
|
"loss": 0.0852, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.3296, |
|
"grad_norm": 1.40625, |
|
"learning_rate": 3.4628099173553716e-05, |
|
"loss": 0.1208, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.3328, |
|
"grad_norm": 1.3203125, |
|
"learning_rate": 3.4462809917355374e-05, |
|
"loss": 0.0807, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.336, |
|
"grad_norm": 1.515625, |
|
"learning_rate": 3.429752066115703e-05, |
|
"loss": 0.0756, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.336, |
|
"eval_loss": 0.09120605140924454, |
|
"eval_runtime": 111.5564, |
|
"eval_samples_per_second": 4.482, |
|
"eval_steps_per_second": 0.143, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.3392, |
|
"grad_norm": 0.84765625, |
|
"learning_rate": 3.413223140495868e-05, |
|
"loss": 0.1092, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.3424, |
|
"grad_norm": 2.671875, |
|
"learning_rate": 3.3966942148760334e-05, |
|
"loss": 0.1217, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.3456, |
|
"grad_norm": 0.9921875, |
|
"learning_rate": 3.3801652892561985e-05, |
|
"loss": 0.0921, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.3488, |
|
"grad_norm": 1.0703125, |
|
"learning_rate": 3.3636363636363636e-05, |
|
"loss": 0.1023, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.352, |
|
"grad_norm": 2.34375, |
|
"learning_rate": 3.347107438016529e-05, |
|
"loss": 0.107, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.352, |
|
"eval_loss": 0.08991601318120956, |
|
"eval_runtime": 109.5083, |
|
"eval_samples_per_second": 4.566, |
|
"eval_steps_per_second": 0.146, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.3552, |
|
"grad_norm": 0.98828125, |
|
"learning_rate": 3.3305785123966946e-05, |
|
"loss": 0.0888, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.3584, |
|
"grad_norm": 2.640625, |
|
"learning_rate": 3.31404958677686e-05, |
|
"loss": 0.0896, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.3616, |
|
"grad_norm": 0.4375, |
|
"learning_rate": 3.297520661157025e-05, |
|
"loss": 0.072, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.3648, |
|
"grad_norm": 1.515625, |
|
"learning_rate": 3.28099173553719e-05, |
|
"loss": 0.1046, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.368, |
|
"grad_norm": 0.6640625, |
|
"learning_rate": 3.264462809917356e-05, |
|
"loss": 0.0791, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.368, |
|
"eval_loss": 0.08706250041723251, |
|
"eval_runtime": 108.4302, |
|
"eval_samples_per_second": 4.611, |
|
"eval_steps_per_second": 0.148, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.3712, |
|
"grad_norm": 1.4140625, |
|
"learning_rate": 3.247933884297521e-05, |
|
"loss": 0.0791, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.3744, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 3.231404958677686e-05, |
|
"loss": 0.0854, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.3776, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 3.214876033057852e-05, |
|
"loss": 0.0912, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.3808, |
|
"grad_norm": 0.48828125, |
|
"learning_rate": 3.198347107438017e-05, |
|
"loss": 0.1034, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.384, |
|
"grad_norm": 1.3515625, |
|
"learning_rate": 3.181818181818182e-05, |
|
"loss": 0.0863, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.384, |
|
"eval_loss": 0.08619336038827896, |
|
"eval_runtime": 108.0491, |
|
"eval_samples_per_second": 4.628, |
|
"eval_steps_per_second": 0.148, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.3872, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 3.165289256198347e-05, |
|
"loss": 0.0904, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 0.3904, |
|
"grad_norm": 0.7890625, |
|
"learning_rate": 3.148760330578512e-05, |
|
"loss": 0.1262, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.3936, |
|
"grad_norm": 1.21875, |
|
"learning_rate": 3.1322314049586774e-05, |
|
"loss": 0.08, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.3968, |
|
"grad_norm": 1.609375, |
|
"learning_rate": 3.1157024793388425e-05, |
|
"loss": 0.0861, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 3.099173553719008e-05, |
|
"loss": 0.0803, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"eval_loss": 0.08419531583786011, |
|
"eval_runtime": 110.4296, |
|
"eval_samples_per_second": 4.528, |
|
"eval_steps_per_second": 0.145, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.4032, |
|
"grad_norm": 1.140625, |
|
"learning_rate": 3.082644628099174e-05, |
|
"loss": 0.0889, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.4064, |
|
"grad_norm": 0.7890625, |
|
"learning_rate": 3.066115702479339e-05, |
|
"loss": 0.0879, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 0.4096, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 3.0495867768595044e-05, |
|
"loss": 0.0776, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.4128, |
|
"grad_norm": 0.8984375, |
|
"learning_rate": 3.0330578512396695e-05, |
|
"loss": 0.0903, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 0.416, |
|
"grad_norm": 0.7578125, |
|
"learning_rate": 3.016528925619835e-05, |
|
"loss": 0.0877, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.416, |
|
"eval_loss": 0.08515820652246475, |
|
"eval_runtime": 107.6996, |
|
"eval_samples_per_second": 4.643, |
|
"eval_steps_per_second": 0.149, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.4192, |
|
"grad_norm": 0.84765625, |
|
"learning_rate": 3e-05, |
|
"loss": 0.0912, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 0.4224, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 2.9834710743801652e-05, |
|
"loss": 0.0634, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.4256, |
|
"grad_norm": 1.03125, |
|
"learning_rate": 2.9669421487603307e-05, |
|
"loss": 0.076, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 0.4288, |
|
"grad_norm": 1.7421875, |
|
"learning_rate": 2.950413223140496e-05, |
|
"loss": 0.0911, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.432, |
|
"grad_norm": 0.859375, |
|
"learning_rate": 2.9338842975206616e-05, |
|
"loss": 0.0804, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.432, |
|
"eval_loss": 0.0840449184179306, |
|
"eval_runtime": 106.9192, |
|
"eval_samples_per_second": 4.676, |
|
"eval_steps_per_second": 0.15, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.4352, |
|
"grad_norm": 0.74609375, |
|
"learning_rate": 2.9173553719008267e-05, |
|
"loss": 0.0779, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.4384, |
|
"grad_norm": 0.93359375, |
|
"learning_rate": 2.9008264462809918e-05, |
|
"loss": 0.0749, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 0.4416, |
|
"grad_norm": 1.359375, |
|
"learning_rate": 2.8842975206611573e-05, |
|
"loss": 0.0772, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.4448, |
|
"grad_norm": 0.86328125, |
|
"learning_rate": 2.8677685950413224e-05, |
|
"loss": 0.0853, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 0.448, |
|
"grad_norm": 1.5390625, |
|
"learning_rate": 2.8512396694214875e-05, |
|
"loss": 0.1136, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.448, |
|
"eval_loss": 0.0832226574420929, |
|
"eval_runtime": 109.8641, |
|
"eval_samples_per_second": 4.551, |
|
"eval_steps_per_second": 0.146, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.4512, |
|
"grad_norm": 0.59765625, |
|
"learning_rate": 2.834710743801653e-05, |
|
"loss": 0.0655, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 0.4544, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 2.818181818181818e-05, |
|
"loss": 0.0952, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 0.4576, |
|
"grad_norm": 1.1796875, |
|
"learning_rate": 2.8016528925619832e-05, |
|
"loss": 0.0812, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 0.4608, |
|
"grad_norm": 0.54296875, |
|
"learning_rate": 2.785123966942149e-05, |
|
"loss": 0.0649, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.464, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 2.7685950413223145e-05, |
|
"loss": 0.0799, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.464, |
|
"eval_loss": 0.08352148532867432, |
|
"eval_runtime": 109.4479, |
|
"eval_samples_per_second": 4.568, |
|
"eval_steps_per_second": 0.146, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.4672, |
|
"grad_norm": 0.62890625, |
|
"learning_rate": 2.7520661157024796e-05, |
|
"loss": 0.0699, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 0.4704, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 2.7355371900826447e-05, |
|
"loss": 0.0688, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 0.4736, |
|
"grad_norm": 0.93359375, |
|
"learning_rate": 2.7190082644628102e-05, |
|
"loss": 0.085, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 0.4768, |
|
"grad_norm": 0.490234375, |
|
"learning_rate": 2.7024793388429753e-05, |
|
"loss": 0.0874, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 2.6859504132231405e-05, |
|
"loss": 0.0896, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"eval_loss": 0.08368749916553497, |
|
"eval_runtime": 110.4491, |
|
"eval_samples_per_second": 4.527, |
|
"eval_steps_per_second": 0.145, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.4832, |
|
"grad_norm": 0.828125, |
|
"learning_rate": 2.669421487603306e-05, |
|
"loss": 0.1036, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 0.4864, |
|
"grad_norm": 1.78125, |
|
"learning_rate": 2.652892561983471e-05, |
|
"loss": 0.0917, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 0.4896, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 2.636363636363636e-05, |
|
"loss": 0.0985, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 0.4928, |
|
"grad_norm": 1.171875, |
|
"learning_rate": 2.619834710743802e-05, |
|
"loss": 0.0881, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 0.496, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 2.6033057851239674e-05, |
|
"loss": 0.078, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.496, |
|
"eval_loss": 0.08341991901397705, |
|
"eval_runtime": 109.2192, |
|
"eval_samples_per_second": 4.578, |
|
"eval_steps_per_second": 0.146, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.4992, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 2.5867768595041325e-05, |
|
"loss": 0.0784, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 0.5024, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 2.5702479338842977e-05, |
|
"loss": 0.0748, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 0.5056, |
|
"grad_norm": 1.203125, |
|
"learning_rate": 2.553719008264463e-05, |
|
"loss": 0.0776, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 0.5088, |
|
"grad_norm": 1.40625, |
|
"learning_rate": 2.5371900826446283e-05, |
|
"loss": 0.0927, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 0.512, |
|
"grad_norm": 0.6640625, |
|
"learning_rate": 2.5206611570247934e-05, |
|
"loss": 0.0878, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.512, |
|
"eval_loss": 0.08232617378234863, |
|
"eval_runtime": 110.112, |
|
"eval_samples_per_second": 4.541, |
|
"eval_steps_per_second": 0.145, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.5152, |
|
"grad_norm": 1.8125, |
|
"learning_rate": 2.504132231404959e-05, |
|
"loss": 0.0929, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 0.5184, |
|
"grad_norm": 0.75, |
|
"learning_rate": 2.4876033057851243e-05, |
|
"loss": 0.0796, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 0.5216, |
|
"grad_norm": 0.9921875, |
|
"learning_rate": 2.4710743801652894e-05, |
|
"loss": 0.0932, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 0.5248, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 2.4545454545454545e-05, |
|
"loss": 0.1084, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 0.528, |
|
"grad_norm": 1.03125, |
|
"learning_rate": 2.43801652892562e-05, |
|
"loss": 0.0763, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.528, |
|
"eval_loss": 0.08151757717132568, |
|
"eval_runtime": 111.3314, |
|
"eval_samples_per_second": 4.491, |
|
"eval_steps_per_second": 0.144, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.5312, |
|
"grad_norm": 0.859375, |
|
"learning_rate": 2.421487603305785e-05, |
|
"loss": 0.062, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 0.5344, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 2.4049586776859506e-05, |
|
"loss": 0.0628, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 0.5376, |
|
"grad_norm": 0.59375, |
|
"learning_rate": 2.3884297520661157e-05, |
|
"loss": 0.0908, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 0.5408, |
|
"grad_norm": 1.3515625, |
|
"learning_rate": 2.3719008264462812e-05, |
|
"loss": 0.0927, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 0.544, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 2.3553719008264463e-05, |
|
"loss": 0.0958, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.544, |
|
"eval_loss": 0.08200781047344208, |
|
"eval_runtime": 114.049, |
|
"eval_samples_per_second": 4.384, |
|
"eval_steps_per_second": 0.14, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.5472, |
|
"grad_norm": 0.53125, |
|
"learning_rate": 2.3388429752066114e-05, |
|
"loss": 0.0716, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 0.5504, |
|
"grad_norm": 0.421875, |
|
"learning_rate": 2.3223140495867772e-05, |
|
"loss": 0.0715, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 0.5536, |
|
"grad_norm": 2.703125, |
|
"learning_rate": 2.3057851239669423e-05, |
|
"loss": 0.0941, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 0.5568, |
|
"grad_norm": 0.89453125, |
|
"learning_rate": 2.2892561983471075e-05, |
|
"loss": 0.0785, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 2.272727272727273e-05, |
|
"loss": 0.0857, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"eval_loss": 0.08109960705041885, |
|
"eval_runtime": 112.2299, |
|
"eval_samples_per_second": 4.455, |
|
"eval_steps_per_second": 0.143, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.5632, |
|
"grad_norm": 0.953125, |
|
"learning_rate": 2.256198347107438e-05, |
|
"loss": 0.0601, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 0.5664, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 2.2396694214876035e-05, |
|
"loss": 0.0866, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 0.5696, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 2.2231404958677686e-05, |
|
"loss": 0.0708, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 0.5728, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 2.206611570247934e-05, |
|
"loss": 0.0762, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 0.576, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 2.1900826446280992e-05, |
|
"loss": 0.0831, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.576, |
|
"eval_loss": 0.07973046600818634, |
|
"eval_runtime": 108.6444, |
|
"eval_samples_per_second": 4.602, |
|
"eval_steps_per_second": 0.147, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.5792, |
|
"grad_norm": 0.87109375, |
|
"learning_rate": 2.1735537190082643e-05, |
|
"loss": 0.0677, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 0.5824, |
|
"grad_norm": 0.8203125, |
|
"learning_rate": 2.1570247933884298e-05, |
|
"loss": 0.0738, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 0.5856, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 2.1404958677685953e-05, |
|
"loss": 0.064, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 0.5888, |
|
"grad_norm": 1.3359375, |
|
"learning_rate": 2.1239669421487604e-05, |
|
"loss": 0.0777, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 0.592, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 2.1074380165289255e-05, |
|
"loss": 0.0792, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.592, |
|
"eval_loss": 0.08046093583106995, |
|
"eval_runtime": 110.6488, |
|
"eval_samples_per_second": 4.519, |
|
"eval_steps_per_second": 0.145, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.5952, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 2.090909090909091e-05, |
|
"loss": 0.0778, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 0.5984, |
|
"grad_norm": 0.8125, |
|
"learning_rate": 2.0743801652892564e-05, |
|
"loss": 0.0831, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 0.6016, |
|
"grad_norm": 0.90234375, |
|
"learning_rate": 2.0578512396694216e-05, |
|
"loss": 0.0721, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 0.6048, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 2.0413223140495867e-05, |
|
"loss": 0.0661, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 0.608, |
|
"grad_norm": 0.21484375, |
|
"learning_rate": 2.024793388429752e-05, |
|
"loss": 0.0725, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.608, |
|
"eval_loss": 0.08086328208446503, |
|
"eval_runtime": 108.5455, |
|
"eval_samples_per_second": 4.606, |
|
"eval_steps_per_second": 0.147, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.6112, |
|
"grad_norm": 0.5, |
|
"learning_rate": 2.0082644628099173e-05, |
|
"loss": 0.0813, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 0.6144, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 1.9917355371900827e-05, |
|
"loss": 0.0724, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 0.6176, |
|
"grad_norm": 0.921875, |
|
"learning_rate": 1.9752066115702482e-05, |
|
"loss": 0.0585, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 0.6208, |
|
"grad_norm": 1.890625, |
|
"learning_rate": 1.9586776859504133e-05, |
|
"loss": 0.0928, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 0.624, |
|
"grad_norm": 1.0546875, |
|
"learning_rate": 1.9421487603305784e-05, |
|
"loss": 0.0795, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.624, |
|
"eval_loss": 0.08059374988079071, |
|
"eval_runtime": 107.6528, |
|
"eval_samples_per_second": 4.645, |
|
"eval_steps_per_second": 0.149, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.6272, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 1.925619834710744e-05, |
|
"loss": 0.0675, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 0.6304, |
|
"grad_norm": 1.6328125, |
|
"learning_rate": 1.9090909090909094e-05, |
|
"loss": 0.0896, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 0.6336, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 1.8925619834710745e-05, |
|
"loss": 0.075, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 0.6368, |
|
"grad_norm": 1.7890625, |
|
"learning_rate": 1.8760330578512396e-05, |
|
"loss": 0.0808, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 1.21875, |
|
"learning_rate": 1.859504132231405e-05, |
|
"loss": 0.0774, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"eval_loss": 0.07988671958446503, |
|
"eval_runtime": 107.6118, |
|
"eval_samples_per_second": 4.646, |
|
"eval_steps_per_second": 0.149, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.6432, |
|
"grad_norm": 0.63671875, |
|
"learning_rate": 1.8429752066115705e-05, |
|
"loss": 0.0795, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 0.6464, |
|
"grad_norm": 1.4140625, |
|
"learning_rate": 1.8264462809917356e-05, |
|
"loss": 0.1115, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 0.6496, |
|
"grad_norm": 0.8203125, |
|
"learning_rate": 1.8099173553719008e-05, |
|
"loss": 0.079, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 0.6528, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 1.7933884297520662e-05, |
|
"loss": 0.0646, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 0.656, |
|
"grad_norm": 1.25, |
|
"learning_rate": 1.7768595041322314e-05, |
|
"loss": 0.0846, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 0.656, |
|
"eval_loss": 0.07978320121765137, |
|
"eval_runtime": 107.8393, |
|
"eval_samples_per_second": 4.637, |
|
"eval_steps_per_second": 0.148, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 0.6592, |
|
"grad_norm": 0.55859375, |
|
"learning_rate": 1.7603305785123968e-05, |
|
"loss": 0.0949, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 0.6624, |
|
"grad_norm": 0.6171875, |
|
"learning_rate": 1.7438016528925623e-05, |
|
"loss": 0.0822, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 0.6656, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 1.7272727272727274e-05, |
|
"loss": 0.0666, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 0.6688, |
|
"grad_norm": 1.1796875, |
|
"learning_rate": 1.7107438016528925e-05, |
|
"loss": 0.0745, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 0.672, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 1.694214876033058e-05, |
|
"loss": 0.1087, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.672, |
|
"eval_loss": 0.07949023693799973, |
|
"eval_runtime": 112.9895, |
|
"eval_samples_per_second": 4.425, |
|
"eval_steps_per_second": 0.142, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.6752, |
|
"grad_norm": 1.953125, |
|
"learning_rate": 1.6776859504132234e-05, |
|
"loss": 0.0917, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 0.6784, |
|
"grad_norm": 0.56640625, |
|
"learning_rate": 1.6611570247933886e-05, |
|
"loss": 0.075, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 0.6816, |
|
"grad_norm": 1.296875, |
|
"learning_rate": 1.6446280991735537e-05, |
|
"loss": 0.0788, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 0.6848, |
|
"grad_norm": 1.203125, |
|
"learning_rate": 1.628099173553719e-05, |
|
"loss": 0.0821, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 0.688, |
|
"grad_norm": 1.296875, |
|
"learning_rate": 1.6115702479338843e-05, |
|
"loss": 0.0836, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 0.688, |
|
"eval_loss": 0.0802304670214653, |
|
"eval_runtime": 109.4516, |
|
"eval_samples_per_second": 4.568, |
|
"eval_steps_per_second": 0.146, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 0.6912, |
|
"grad_norm": 1.21875, |
|
"learning_rate": 1.5950413223140497e-05, |
|
"loss": 0.0845, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 0.6944, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 1.578512396694215e-05, |
|
"loss": 0.0701, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 0.6976, |
|
"grad_norm": 1.5078125, |
|
"learning_rate": 1.5619834710743803e-05, |
|
"loss": 0.0789, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 0.7008, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 1.5454545454545454e-05, |
|
"loss": 0.0611, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 0.704, |
|
"grad_norm": 1.578125, |
|
"learning_rate": 1.5289256198347106e-05, |
|
"loss": 0.0782, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.704, |
|
"eval_loss": 0.07987890392541885, |
|
"eval_runtime": 108.1627, |
|
"eval_samples_per_second": 4.623, |
|
"eval_steps_per_second": 0.148, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.7072, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 1.5123966942148762e-05, |
|
"loss": 0.0692, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 0.7104, |
|
"grad_norm": 0.9140625, |
|
"learning_rate": 1.4958677685950415e-05, |
|
"loss": 0.0656, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 0.7136, |
|
"grad_norm": 1.2890625, |
|
"learning_rate": 1.4793388429752068e-05, |
|
"loss": 0.0854, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 0.7168, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 1.4628099173553719e-05, |
|
"loss": 0.0798, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.458984375, |
|
"learning_rate": 1.4462809917355372e-05, |
|
"loss": 0.0885, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"eval_loss": 0.07889453321695328, |
|
"eval_runtime": 110.4069, |
|
"eval_samples_per_second": 4.529, |
|
"eval_steps_per_second": 0.145, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 0.7232, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 1.4297520661157027e-05, |
|
"loss": 0.0813, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 0.7264, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 1.413223140495868e-05, |
|
"loss": 0.0615, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 0.7296, |
|
"grad_norm": 1.0390625, |
|
"learning_rate": 1.396694214876033e-05, |
|
"loss": 0.0773, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 0.7328, |
|
"grad_norm": 1.1484375, |
|
"learning_rate": 1.3801652892561984e-05, |
|
"loss": 0.0854, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 0.736, |
|
"grad_norm": 1.390625, |
|
"learning_rate": 1.3636363636363637e-05, |
|
"loss": 0.0926, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.736, |
|
"eval_loss": 0.07876367121934891, |
|
"eval_runtime": 120.6799, |
|
"eval_samples_per_second": 4.143, |
|
"eval_steps_per_second": 0.133, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.7392, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 1.3471074380165291e-05, |
|
"loss": 0.0992, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 0.7424, |
|
"grad_norm": 0.44140625, |
|
"learning_rate": 1.3305785123966944e-05, |
|
"loss": 0.0629, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 0.7456, |
|
"grad_norm": 0.7421875, |
|
"learning_rate": 1.3140495867768595e-05, |
|
"loss": 0.0799, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 0.7488, |
|
"grad_norm": 1.203125, |
|
"learning_rate": 1.2975206611570248e-05, |
|
"loss": 0.092, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 0.752, |
|
"grad_norm": 1.34375, |
|
"learning_rate": 1.2809917355371901e-05, |
|
"loss": 0.064, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 0.752, |
|
"eval_loss": 0.07899804413318634, |
|
"eval_runtime": 110.2904, |
|
"eval_samples_per_second": 4.533, |
|
"eval_steps_per_second": 0.145, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 0.7552, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 1.2644628099173556e-05, |
|
"loss": 0.0833, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 0.7584, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 1.2479338842975207e-05, |
|
"loss": 0.0841, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 0.7616, |
|
"grad_norm": 1.1875, |
|
"learning_rate": 1.231404958677686e-05, |
|
"loss": 0.0798, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 0.7648, |
|
"grad_norm": 1.7421875, |
|
"learning_rate": 1.2148760330578513e-05, |
|
"loss": 0.0956, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 0.768, |
|
"grad_norm": 0.99609375, |
|
"learning_rate": 1.1983471074380166e-05, |
|
"loss": 0.0798, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.768, |
|
"eval_loss": 0.07969531416893005, |
|
"eval_runtime": 110.6153, |
|
"eval_samples_per_second": 4.52, |
|
"eval_steps_per_second": 0.145, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.7712, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 1.1818181818181819e-05, |
|
"loss": 0.0711, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 0.7744, |
|
"grad_norm": 1.4765625, |
|
"learning_rate": 1.1652892561983472e-05, |
|
"loss": 0.0708, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 0.7776, |
|
"grad_norm": 0.9609375, |
|
"learning_rate": 1.1487603305785125e-05, |
|
"loss": 0.0656, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 0.7808, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 1.1322314049586777e-05, |
|
"loss": 0.0723, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 0.784, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 1.115702479338843e-05, |
|
"loss": 0.0789, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 0.784, |
|
"eval_loss": 0.0796191394329071, |
|
"eval_runtime": 110.6578, |
|
"eval_samples_per_second": 4.518, |
|
"eval_steps_per_second": 0.145, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 0.7872, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 1.0991735537190083e-05, |
|
"loss": 0.0755, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 0.7904, |
|
"grad_norm": 1.140625, |
|
"learning_rate": 1.0826446280991736e-05, |
|
"loss": 0.0833, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 0.7936, |
|
"grad_norm": 0.5625, |
|
"learning_rate": 1.0661157024793389e-05, |
|
"loss": 0.1063, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 0.7968, |
|
"grad_norm": 0.48046875, |
|
"learning_rate": 1.0495867768595042e-05, |
|
"loss": 0.0747, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 1.390625, |
|
"learning_rate": 1.0330578512396695e-05, |
|
"loss": 0.0997, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"eval_loss": 0.0799550786614418, |
|
"eval_runtime": 106.8244, |
|
"eval_samples_per_second": 4.681, |
|
"eval_steps_per_second": 0.15, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.8032, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 1.0165289256198348e-05, |
|
"loss": 0.0737, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 0.8064, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 1e-05, |
|
"loss": 0.099, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 0.8096, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 9.834710743801654e-06, |
|
"loss": 0.0668, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 0.8128, |
|
"grad_norm": 1.140625, |
|
"learning_rate": 9.669421487603305e-06, |
|
"loss": 0.0772, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 0.816, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 9.50413223140496e-06, |
|
"loss": 0.0755, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 0.816, |
|
"eval_loss": 0.07922851294279099, |
|
"eval_runtime": 106.4516, |
|
"eval_samples_per_second": 4.697, |
|
"eval_steps_per_second": 0.15, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 0.8192, |
|
"grad_norm": 2.125, |
|
"learning_rate": 9.338842975206613e-06, |
|
"loss": 0.0735, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 0.8224, |
|
"grad_norm": 1.7265625, |
|
"learning_rate": 9.173553719008265e-06, |
|
"loss": 0.0879, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 0.8256, |
|
"grad_norm": 0.2578125, |
|
"learning_rate": 9.008264462809918e-06, |
|
"loss": 0.075, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 0.8288, |
|
"grad_norm": 0.9375, |
|
"learning_rate": 8.84297520661157e-06, |
|
"loss": 0.0718, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 0.832, |
|
"grad_norm": 0.828125, |
|
"learning_rate": 8.677685950413224e-06, |
|
"loss": 0.069, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.832, |
|
"eval_loss": 0.07956640422344208, |
|
"eval_runtime": 116.0737, |
|
"eval_samples_per_second": 4.308, |
|
"eval_steps_per_second": 0.138, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.8352, |
|
"grad_norm": 1.7109375, |
|
"learning_rate": 8.512396694214875e-06, |
|
"loss": 0.1006, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 0.8384, |
|
"grad_norm": 1.34375, |
|
"learning_rate": 8.34710743801653e-06, |
|
"loss": 0.0771, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 0.8416, |
|
"grad_norm": 0.9375, |
|
"learning_rate": 8.181818181818183e-06, |
|
"loss": 0.0668, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 0.8448, |
|
"grad_norm": 1.1875, |
|
"learning_rate": 8.016528925619834e-06, |
|
"loss": 0.0663, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 0.848, |
|
"grad_norm": 0.466796875, |
|
"learning_rate": 7.851239669421489e-06, |
|
"loss": 0.0687, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 0.848, |
|
"eval_loss": 0.07871288806200027, |
|
"eval_runtime": 108.1878, |
|
"eval_samples_per_second": 4.622, |
|
"eval_steps_per_second": 0.148, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 0.8512, |
|
"grad_norm": 1.0546875, |
|
"learning_rate": 7.68595041322314e-06, |
|
"loss": 0.0899, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 0.8544, |
|
"grad_norm": 0.9921875, |
|
"learning_rate": 7.520661157024795e-06, |
|
"loss": 0.0735, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 0.8576, |
|
"grad_norm": 0.416015625, |
|
"learning_rate": 7.355371900826447e-06, |
|
"loss": 0.0905, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 0.8608, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 7.190082644628099e-06, |
|
"loss": 0.0664, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 0.864, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 7.0247933884297525e-06, |
|
"loss": 0.0791, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.864, |
|
"eval_loss": 0.07861328125, |
|
"eval_runtime": 109.1401, |
|
"eval_samples_per_second": 4.581, |
|
"eval_steps_per_second": 0.147, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.8672, |
|
"grad_norm": 0.470703125, |
|
"learning_rate": 6.859504132231405e-06, |
|
"loss": 0.0588, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 0.8704, |
|
"grad_norm": 0.4453125, |
|
"learning_rate": 6.694214876033058e-06, |
|
"loss": 0.1022, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 0.8736, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 6.528925619834711e-06, |
|
"loss": 0.0645, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 0.8768, |
|
"grad_norm": 0.79296875, |
|
"learning_rate": 6.363636363636363e-06, |
|
"loss": 0.0688, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 1.0, |
|
"learning_rate": 6.198347107438017e-06, |
|
"loss": 0.0763, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"eval_loss": 0.07813085615634918, |
|
"eval_runtime": 107.2687, |
|
"eval_samples_per_second": 4.661, |
|
"eval_steps_per_second": 0.149, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 0.8832, |
|
"grad_norm": 1.0625, |
|
"learning_rate": 6.03305785123967e-06, |
|
"loss": 0.0619, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 0.8864, |
|
"grad_norm": 0.81640625, |
|
"learning_rate": 5.867768595041322e-06, |
|
"loss": 0.0861, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 0.8896, |
|
"grad_norm": 1.140625, |
|
"learning_rate": 5.702479338842975e-06, |
|
"loss": 0.0573, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 0.8928, |
|
"grad_norm": 0.6015625, |
|
"learning_rate": 5.537190082644628e-06, |
|
"loss": 0.0546, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 0.896, |
|
"grad_norm": 0.408203125, |
|
"learning_rate": 5.371900826446282e-06, |
|
"loss": 0.0656, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.896, |
|
"eval_loss": 0.0781230479478836, |
|
"eval_runtime": 120.1101, |
|
"eval_samples_per_second": 4.163, |
|
"eval_steps_per_second": 0.133, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.8992, |
|
"grad_norm": 1.4609375, |
|
"learning_rate": 5.206611570247935e-06, |
|
"loss": 0.0634, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 0.9024, |
|
"grad_norm": 1.515625, |
|
"learning_rate": 5.041322314049587e-06, |
|
"loss": 0.093, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 0.9056, |
|
"grad_norm": 1.8359375, |
|
"learning_rate": 4.87603305785124e-06, |
|
"loss": 0.0624, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 0.9088, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 4.710743801652893e-06, |
|
"loss": 0.0911, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 0.912, |
|
"grad_norm": 1.171875, |
|
"learning_rate": 4.5454545454545455e-06, |
|
"loss": 0.074, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 0.912, |
|
"eval_loss": 0.07803711295127869, |
|
"eval_runtime": 109.0375, |
|
"eval_samples_per_second": 4.586, |
|
"eval_steps_per_second": 0.147, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 0.9152, |
|
"grad_norm": 0.62890625, |
|
"learning_rate": 4.3801652892561984e-06, |
|
"loss": 0.0734, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 0.9184, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 4.214876033057851e-06, |
|
"loss": 0.0521, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 0.9216, |
|
"grad_norm": 1.453125, |
|
"learning_rate": 4.049586776859504e-06, |
|
"loss": 0.0843, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 0.9248, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 3.884297520661157e-06, |
|
"loss": 0.0775, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 0.928, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 3.71900826446281e-06, |
|
"loss": 0.0974, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.928, |
|
"eval_loss": 0.07862695306539536, |
|
"eval_runtime": 109.8662, |
|
"eval_samples_per_second": 4.551, |
|
"eval_steps_per_second": 0.146, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.9312, |
|
"grad_norm": 1.3515625, |
|
"learning_rate": 3.553719008264463e-06, |
|
"loss": 0.0611, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 0.9344, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 3.3884297520661155e-06, |
|
"loss": 0.0697, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 0.9376, |
|
"grad_norm": 0.56640625, |
|
"learning_rate": 3.2231404958677685e-06, |
|
"loss": 0.076, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 0.9408, |
|
"grad_norm": 1.5, |
|
"learning_rate": 3.0578512396694214e-06, |
|
"loss": 0.084, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 0.944, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 2.8925619834710747e-06, |
|
"loss": 0.0769, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 0.944, |
|
"eval_loss": 0.07859765738248825, |
|
"eval_runtime": 109.8176, |
|
"eval_samples_per_second": 4.553, |
|
"eval_steps_per_second": 0.146, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 0.9472, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 2.7272727272727272e-06, |
|
"loss": 0.0722, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 0.9504, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 2.56198347107438e-06, |
|
"loss": 0.0839, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 0.9536, |
|
"grad_norm": 1.171875, |
|
"learning_rate": 2.3966942148760335e-06, |
|
"loss": 0.0818, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 0.9568, |
|
"grad_norm": 1.5, |
|
"learning_rate": 2.231404958677686e-06, |
|
"loss": 0.0757, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 2.066115702479339e-06, |
|
"loss": 0.0816, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"eval_loss": 0.07787304371595383, |
|
"eval_runtime": 108.0913, |
|
"eval_samples_per_second": 4.626, |
|
"eval_steps_per_second": 0.148, |
|
"step": 3000 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 3125, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.75772653691904e+18, |
|
"train_batch_size": 32, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|