|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.032137032305751725, |
|
"eval_steps": 500, |
|
"global_step": 500, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0006427406461150345, |
|
"grad_norm": 0.458984375, |
|
"learning_rate": 1e-05, |
|
"loss": 3.0419, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.001285481292230069, |
|
"grad_norm": 8.8125, |
|
"learning_rate": 2e-05, |
|
"loss": 3.0641, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.0019282219383451036, |
|
"grad_norm": 0.62109375, |
|
"learning_rate": 1.9999950454033063e-05, |
|
"loss": 3.2405, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.002570962584460138, |
|
"grad_norm": 0.5625, |
|
"learning_rate": 1.9999801816623205e-05, |
|
"loss": 2.8447, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.0032137032305751727, |
|
"grad_norm": 0.60546875, |
|
"learning_rate": 1.9999554089243305e-05, |
|
"loss": 2.9348, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.003856443876690207, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 1.9999207274348143e-05, |
|
"loss": 2.966, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.004499184522805242, |
|
"grad_norm": 2.9375, |
|
"learning_rate": 1.9998761375374376e-05, |
|
"loss": 2.8849, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.005141925168920276, |
|
"grad_norm": 0.953125, |
|
"learning_rate": 1.9998216396740497e-05, |
|
"loss": 2.8674, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.0057846658150353105, |
|
"grad_norm": 0.96484375, |
|
"learning_rate": 1.9997572343846814e-05, |
|
"loss": 2.789, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.006427406461150345, |
|
"grad_norm": 0.63671875, |
|
"learning_rate": 1.9996829223075363e-05, |
|
"loss": 2.8306, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.0070701471072653795, |
|
"grad_norm": 0.91796875, |
|
"learning_rate": 1.9995987041789876e-05, |
|
"loss": 2.5481, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.007712887753380414, |
|
"grad_norm": 0.84765625, |
|
"learning_rate": 1.999504580833569e-05, |
|
"loss": 2.6249, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.00835562839949545, |
|
"grad_norm": 0.54296875, |
|
"learning_rate": 1.9994005532039665e-05, |
|
"loss": 2.6078, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.008998369045610483, |
|
"grad_norm": 1.328125, |
|
"learning_rate": 1.9992866223210105e-05, |
|
"loss": 2.4607, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.009641109691725517, |
|
"grad_norm": 1.0703125, |
|
"learning_rate": 1.999162789313664e-05, |
|
"loss": 2.3672, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.010283850337840551, |
|
"grad_norm": 0.796875, |
|
"learning_rate": 1.9990290554090123e-05, |
|
"loss": 2.4135, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.010926590983955587, |
|
"grad_norm": 1.1484375, |
|
"learning_rate": 1.9988854219322507e-05, |
|
"loss": 2.5179, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.011569331630070621, |
|
"grad_norm": 0.55078125, |
|
"learning_rate": 1.9987318903066704e-05, |
|
"loss": 2.3391, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.012212072276185655, |
|
"grad_norm": 1.171875, |
|
"learning_rate": 1.9985684620536466e-05, |
|
"loss": 2.4168, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.01285481292230069, |
|
"grad_norm": 1.4609375, |
|
"learning_rate": 1.9983951387926216e-05, |
|
"loss": 2.2706, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.013497553568415725, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 1.998211922241088e-05, |
|
"loss": 2.2684, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.014140294214530759, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 1.9980188142145755e-05, |
|
"loss": 2.2022, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.014783034860645793, |
|
"grad_norm": 0.52734375, |
|
"learning_rate": 1.997815816626628e-05, |
|
"loss": 2.2821, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.015425775506760829, |
|
"grad_norm": 1.359375, |
|
"learning_rate": 1.9976029314887882e-05, |
|
"loss": 2.1055, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.016068516152875863, |
|
"grad_norm": 0.890625, |
|
"learning_rate": 1.9973801609105757e-05, |
|
"loss": 2.1481, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.0167112567989909, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 1.9971475070994675e-05, |
|
"loss": 2.1277, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.01735399744510593, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 1.9969049723608753e-05, |
|
"loss": 2.1143, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.017996738091220967, |
|
"grad_norm": 0.828125, |
|
"learning_rate": 1.9966525590981228e-05, |
|
"loss": 2.1824, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.018639478737336002, |
|
"grad_norm": 1.6171875, |
|
"learning_rate": 1.9963902698124212e-05, |
|
"loss": 2.1849, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.019282219383451035, |
|
"grad_norm": 0.96875, |
|
"learning_rate": 1.996118107102847e-05, |
|
"loss": 2.1996, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.01992496002956607, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 1.9958360736663117e-05, |
|
"loss": 2.1914, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.020567700675681103, |
|
"grad_norm": 0.62109375, |
|
"learning_rate": 1.99554417229754e-05, |
|
"loss": 2.0519, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.02121044132179614, |
|
"grad_norm": 0.890625, |
|
"learning_rate": 1.995242405889039e-05, |
|
"loss": 2.0804, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.021853181967911174, |
|
"grad_norm": 0.71875, |
|
"learning_rate": 1.99493077743107e-05, |
|
"loss": 2.0424, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.022495922614026206, |
|
"grad_norm": 1.4296875, |
|
"learning_rate": 1.99460929001162e-05, |
|
"loss": 2.1151, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.023138663260141242, |
|
"grad_norm": 0.7421875, |
|
"learning_rate": 1.9942779468163696e-05, |
|
"loss": 1.8499, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.023781403906256278, |
|
"grad_norm": 0.8671875, |
|
"learning_rate": 1.9939367511286635e-05, |
|
"loss": 2.1271, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.02442414455237131, |
|
"grad_norm": 0.75, |
|
"learning_rate": 1.993585706329475e-05, |
|
"loss": 2.0739, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.025066885198486346, |
|
"grad_norm": 1.3671875, |
|
"learning_rate": 1.9932248158973746e-05, |
|
"loss": 2.0379, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.02570962584460138, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 1.992854083408496e-05, |
|
"loss": 1.9566, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.026352366490716414, |
|
"grad_norm": 0.859375, |
|
"learning_rate": 1.992473512536499e-05, |
|
"loss": 2.0377, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.02699510713683145, |
|
"grad_norm": 0.81640625, |
|
"learning_rate": 1.992083107052534e-05, |
|
"loss": 1.9839, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.027637847782946486, |
|
"grad_norm": 0.80859375, |
|
"learning_rate": 1.9916828708252046e-05, |
|
"loss": 1.9477, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.028280588429061518, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 1.9912728078205285e-05, |
|
"loss": 2.0033, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.028923329075176554, |
|
"grad_norm": 1.03125, |
|
"learning_rate": 1.9908529221018994e-05, |
|
"loss": 1.9196, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.029566069721291586, |
|
"grad_norm": 1.0546875, |
|
"learning_rate": 1.9904232178300465e-05, |
|
"loss": 1.9712, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.03020881036740662, |
|
"grad_norm": 1.1875, |
|
"learning_rate": 1.9899836992629922e-05, |
|
"loss": 1.9251, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.030851551013521657, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 1.989534370756011e-05, |
|
"loss": 1.9695, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.03149429165963669, |
|
"grad_norm": 0.9765625, |
|
"learning_rate": 1.989075236761586e-05, |
|
"loss": 1.9208, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.032137032305751725, |
|
"grad_norm": 0.74609375, |
|
"learning_rate": 1.988606301829365e-05, |
|
"loss": 1.9474, |
|
"step": 500 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 10000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 500, |
|
"total_flos": 1.915828644400333e+16, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|