{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 7.18562874251497,
  "eval_steps": 500,
  "global_step": 600,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.11976047904191617,
      "grad_norm": 2.8749659061431885,
      "learning_rate": 0.0002,
      "loss": 1.4673,
      "step": 10
    },
    {
      "epoch": 0.23952095808383234,
      "grad_norm": 3.4398715496063232,
      "learning_rate": 0.0002,
      "loss": 1.0019,
      "step": 20
    },
    {
      "epoch": 0.3592814371257485,
      "grad_norm": 1.9059951305389404,
      "learning_rate": 0.0002,
      "loss": 1.0575,
      "step": 30
    },
    {
      "epoch": 0.47904191616766467,
      "grad_norm": 4.149394989013672,
      "learning_rate": 0.0002,
      "loss": 0.8164,
      "step": 40
    },
    {
      "epoch": 0.5988023952095808,
      "grad_norm": 1.4866076707839966,
      "learning_rate": 0.0002,
      "loss": 0.8684,
      "step": 50
    },
    {
      "epoch": 0.718562874251497,
      "grad_norm": 3.1927452087402344,
      "learning_rate": 0.0002,
      "loss": 0.8016,
      "step": 60
    },
    {
      "epoch": 0.8383233532934131,
      "grad_norm": 1.1162314414978027,
      "learning_rate": 0.0002,
      "loss": 0.6809,
      "step": 70
    },
    {
      "epoch": 0.9580838323353293,
      "grad_norm": 2.829102039337158,
      "learning_rate": 0.0002,
      "loss": 0.6962,
      "step": 80
    },
    {
      "epoch": 1.0778443113772456,
      "grad_norm": 1.2642532587051392,
      "learning_rate": 0.0002,
      "loss": 0.5769,
      "step": 90
    },
    {
      "epoch": 1.1976047904191618,
      "grad_norm": 1.3799452781677246,
      "learning_rate": 0.0002,
      "loss": 0.4128,
      "step": 100
    },
    {
      "epoch": 1.3173652694610778,
      "grad_norm": 2.3143367767333984,
      "learning_rate": 0.0002,
      "loss": 0.6441,
      "step": 110
    },
    {
      "epoch": 1.437125748502994,
      "grad_norm": 1.085919976234436,
      "learning_rate": 0.0002,
      "loss": 0.393,
      "step": 120
    },
    {
      "epoch": 1.55688622754491,
      "grad_norm": 1.2423957586288452,
      "learning_rate": 0.0002,
      "loss": 0.5582,
      "step": 130
    },
    {
      "epoch": 1.6766467065868262,
      "grad_norm": 1.2964059114456177,
      "learning_rate": 0.0002,
      "loss": 0.4276,
      "step": 140
    },
    {
      "epoch": 1.7964071856287425,
      "grad_norm": 1.8397400379180908,
      "learning_rate": 0.0002,
      "loss": 0.6162,
      "step": 150
    },
    {
      "epoch": 1.9161676646706587,
      "grad_norm": 1.0209627151489258,
      "learning_rate": 0.0002,
      "loss": 0.4565,
      "step": 160
    },
    {
      "epoch": 2.035928143712575,
      "grad_norm": 0.8725757598876953,
      "learning_rate": 0.0002,
      "loss": 0.4807,
      "step": 170
    },
    {
      "epoch": 2.155688622754491,
      "grad_norm": 1.1269447803497314,
      "learning_rate": 0.0002,
      "loss": 0.3895,
      "step": 180
    },
    {
      "epoch": 2.2754491017964074,
      "grad_norm": 1.528011679649353,
      "learning_rate": 0.0002,
      "loss": 0.3553,
      "step": 190
    },
    {
      "epoch": 2.3952095808383236,
      "grad_norm": 0.8296527862548828,
      "learning_rate": 0.0002,
      "loss": 0.3516,
      "step": 200
    },
    {
      "epoch": 2.5149700598802394,
      "grad_norm": 1.301917552947998,
      "learning_rate": 0.0002,
      "loss": 0.3918,
      "step": 210
    },
    {
      "epoch": 2.6347305389221556,
      "grad_norm": 0.8420801758766174,
      "learning_rate": 0.0002,
      "loss": 0.3497,
      "step": 220
    },
    {
      "epoch": 2.754491017964072,
      "grad_norm": 1.1430580615997314,
      "learning_rate": 0.0002,
      "loss": 0.4311,
      "step": 230
    },
    {
      "epoch": 2.874251497005988,
      "grad_norm": 0.9065356850624084,
      "learning_rate": 0.0002,
      "loss": 0.3551,
      "step": 240
    },
    {
      "epoch": 2.9940119760479043,
      "grad_norm": 1.1302285194396973,
      "learning_rate": 0.0002,
      "loss": 0.3513,
      "step": 250
    },
    {
      "epoch": 3.1137724550898205,
      "grad_norm": 0.9960314631462097,
      "learning_rate": 0.0002,
      "loss": 0.3124,
      "step": 260
    },
    {
      "epoch": 3.2335329341317367,
      "grad_norm": 1.680296778678894,
      "learning_rate": 0.0002,
      "loss": 0.3065,
      "step": 270
    },
    {
      "epoch": 3.3532934131736525,
      "grad_norm": 1.1697853803634644,
      "learning_rate": 0.0002,
      "loss": 0.3009,
      "step": 280
    },
    {
      "epoch": 3.4730538922155687,
      "grad_norm": 1.9219907522201538,
      "learning_rate": 0.0002,
      "loss": 0.2802,
      "step": 290
    },
    {
      "epoch": 3.592814371257485,
      "grad_norm": 1.384773850440979,
      "learning_rate": 0.0002,
      "loss": 0.3419,
      "step": 300
    },
    {
      "epoch": 3.712574850299401,
      "grad_norm": 1.3956997394561768,
      "learning_rate": 0.0002,
      "loss": 0.3172,
      "step": 310
    },
    {
      "epoch": 3.8323353293413174,
      "grad_norm": 1.058669924736023,
      "learning_rate": 0.0002,
      "loss": 0.3723,
      "step": 320
    },
    {
      "epoch": 3.9520958083832336,
      "grad_norm": 1.5626955032348633,
      "learning_rate": 0.0002,
      "loss": 0.325,
      "step": 330
    },
    {
      "epoch": 4.07185628742515,
      "grad_norm": 1.2782564163208008,
      "learning_rate": 0.0002,
      "loss": 0.2912,
      "step": 340
    },
    {
      "epoch": 4.191616766467066,
      "grad_norm": 1.0916423797607422,
      "learning_rate": 0.0002,
      "loss": 0.233,
      "step": 350
    },
    {
      "epoch": 4.311377245508982,
      "grad_norm": 0.8613762855529785,
      "learning_rate": 0.0002,
      "loss": 0.3058,
      "step": 360
    },
    {
      "epoch": 4.431137724550898,
      "grad_norm": 0.6293674111366272,
      "learning_rate": 0.0002,
      "loss": 0.2334,
      "step": 370
    },
    {
      "epoch": 4.550898203592815,
      "grad_norm": 1.6042566299438477,
      "learning_rate": 0.0002,
      "loss": 0.3287,
      "step": 380
    },
    {
      "epoch": 4.6706586826347305,
      "grad_norm": 0.8140411376953125,
      "learning_rate": 0.0002,
      "loss": 0.2372,
      "step": 390
    },
    {
      "epoch": 4.790419161676647,
      "grad_norm": 1.5365833044052124,
      "learning_rate": 0.0002,
      "loss": 0.3266,
      "step": 400
    },
    {
      "epoch": 4.910179640718563,
      "grad_norm": 0.9418448805809021,
      "learning_rate": 0.0002,
      "loss": 0.2513,
      "step": 410
    },
    {
      "epoch": 5.029940119760479,
      "grad_norm": 0.6695829033851624,
      "learning_rate": 0.0002,
      "loss": 0.2688,
      "step": 420
    },
    {
      "epoch": 5.149700598802395,
      "grad_norm": 0.628887414932251,
      "learning_rate": 0.0002,
      "loss": 0.2149,
      "step": 430
    },
    {
      "epoch": 5.269461077844311,
      "grad_norm": 0.964766263961792,
      "learning_rate": 0.0002,
      "loss": 0.2606,
      "step": 440
    },
    {
      "epoch": 5.389221556886228,
      "grad_norm": 0.5990360975265503,
      "learning_rate": 0.0002,
      "loss": 0.2364,
      "step": 450
    },
    {
      "epoch": 5.508982035928144,
      "grad_norm": 0.8189520835876465,
      "learning_rate": 0.0002,
      "loss": 0.2857,
      "step": 460
    },
    {
      "epoch": 5.62874251497006,
      "grad_norm": 0.5583224296569824,
      "learning_rate": 0.0002,
      "loss": 0.2414,
      "step": 470
    },
    {
      "epoch": 5.748502994011976,
      "grad_norm": 0.7695009708404541,
      "learning_rate": 0.0002,
      "loss": 0.2434,
      "step": 480
    },
    {
      "epoch": 5.868263473053892,
      "grad_norm": 0.3456665575504303,
      "learning_rate": 0.0002,
      "loss": 0.2597,
      "step": 490
    },
    {
      "epoch": 5.9880239520958085,
      "grad_norm": 0.7596808671951294,
      "learning_rate": 0.0002,
      "loss": 0.2983,
      "step": 500
    },
    {
      "epoch": 6.107784431137724,
      "grad_norm": 0.9513673782348633,
      "learning_rate": 0.0002,
      "loss": 0.2139,
      "step": 510
    },
    {
      "epoch": 6.227544910179641,
      "grad_norm": 1.0958881378173828,
      "learning_rate": 0.0002,
      "loss": 0.2211,
      "step": 520
    },
    {
      "epoch": 6.347305389221557,
      "grad_norm": 0.6882690787315369,
      "learning_rate": 0.0002,
      "loss": 0.2347,
      "step": 530
    },
    {
      "epoch": 6.467065868263473,
      "grad_norm": 1.0562934875488281,
      "learning_rate": 0.0002,
      "loss": 0.2276,
      "step": 540
    },
    {
      "epoch": 6.586826347305389,
      "grad_norm": 1.1535356044769287,
      "learning_rate": 0.0002,
      "loss": 0.2469,
      "step": 550
    },
    {
      "epoch": 6.706586826347305,
      "grad_norm": 0.9436424970626831,
      "learning_rate": 0.0002,
      "loss": 0.2713,
      "step": 560
    },
    {
      "epoch": 6.826347305389222,
      "grad_norm": 1.0283164978027344,
      "learning_rate": 0.0002,
      "loss": 0.2449,
      "step": 570
    },
    {
      "epoch": 6.946107784431137,
      "grad_norm": 1.3945902585983276,
      "learning_rate": 0.0002,
      "loss": 0.2193,
      "step": 580
    },
    {
      "epoch": 7.065868263473054,
      "grad_norm": 0.5662649869918823,
      "learning_rate": 0.0002,
      "loss": 0.2415,
      "step": 590
    },
    {
      "epoch": 7.18562874251497,
      "grad_norm": 0.4687662720680237,
      "learning_rate": 0.0002,
      "loss": 0.1792,
      "step": 600
    }
  ],
  "logging_steps": 10,
  "max_steps": 600,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 8,
  "save_steps": 100,
  "total_flos": 1753733775298560.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}