|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.461607677094324, |
|
"eval_steps": 500, |
|
"global_step": 10000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0001461607677094324, |
|
"grad_norm": 6.625, |
|
"learning_rate": 4.8721071863581e-08, |
|
"loss": 1.8813, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.05846430708377296, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 1.94884287454324e-05, |
|
"loss": 1.2825, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.11692861416754592, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 3.89768574908648e-05, |
|
"loss": 1.1339, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.17539292125131886, |
|
"grad_norm": 0.234375, |
|
"learning_rate": 5.84652862362972e-05, |
|
"loss": 1.0996, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.23385722833509184, |
|
"grad_norm": 0.2177734375, |
|
"learning_rate": 7.79537149817296e-05, |
|
"loss": 1.0737, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.2923215354188648, |
|
"grad_norm": 0.2294921875, |
|
"learning_rate": 9.7442143727162e-05, |
|
"loss": 1.054, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.3507858425026377, |
|
"grad_norm": 0.2080078125, |
|
"learning_rate": 0.0001169305724725944, |
|
"loss": 1.0348, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.4092501495864107, |
|
"grad_norm": 0.224609375, |
|
"learning_rate": 0.0001364190012180268, |
|
"loss": 1.0185, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.4677144566701837, |
|
"grad_norm": 0.2275390625, |
|
"learning_rate": 0.0001559074299634592, |
|
"loss": 1.0078, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.5261787637539567, |
|
"grad_norm": 0.2314453125, |
|
"learning_rate": 0.0001753958587088916, |
|
"loss": 0.9962, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.5846430708377296, |
|
"grad_norm": 0.251953125, |
|
"learning_rate": 0.000194884287454324, |
|
"loss": 0.9884, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.6431073779215025, |
|
"grad_norm": 0.2431640625, |
|
"learning_rate": 0.00019996853166916095, |
|
"loss": 0.9802, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 0.7015716850052754, |
|
"grad_norm": 0.232421875, |
|
"learning_rate": 0.00019982537933878626, |
|
"loss": 0.9697, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.7600359920890485, |
|
"grad_norm": 0.236328125, |
|
"learning_rate": 0.0001995667218123705, |
|
"loss": 0.9594, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 0.8185002991728214, |
|
"grad_norm": 0.244140625, |
|
"learning_rate": 0.00019919285837541084, |
|
"loss": 0.9493, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 0.8769646062565943, |
|
"grad_norm": 0.255859375, |
|
"learning_rate": 0.00019870422161498958, |
|
"loss": 0.9399, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.9354289133403674, |
|
"grad_norm": 0.255859375, |
|
"learning_rate": 0.00019810137691923923, |
|
"loss": 0.9322, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 0.9938932204241403, |
|
"grad_norm": 0.24609375, |
|
"learning_rate": 0.00019738502182314765, |
|
"loss": 0.9238, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 1.0523575275079133, |
|
"grad_norm": 0.25390625, |
|
"learning_rate": 0.00019655598520145953, |
|
"loss": 0.8823, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 1.1108218345916863, |
|
"grad_norm": 0.251953125, |
|
"learning_rate": 0.00019561522630960813, |
|
"loss": 0.8803, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 1.1692861416754592, |
|
"grad_norm": 0.2578125, |
|
"learning_rate": 0.00019456383367378742, |
|
"loss": 0.8727, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 1.2277504487592321, |
|
"grad_norm": 0.265625, |
|
"learning_rate": 0.00019340302383144832, |
|
"loss": 0.8737, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 1.286214755843005, |
|
"grad_norm": 0.263671875, |
|
"learning_rate": 0.00019213413992367673, |
|
"loss": 0.8672, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 1.344679062926778, |
|
"grad_norm": 0.275390625, |
|
"learning_rate": 0.00019075865014108194, |
|
"loss": 0.8652, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 1.4031433700105511, |
|
"grad_norm": 0.26171875, |
|
"learning_rate": 0.00018927814602499394, |
|
"loss": 0.8605, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 1.461607677094324, |
|
"grad_norm": 0.275390625, |
|
"learning_rate": 0.00018769434062593454, |
|
"loss": 0.8569, |
|
"step": 10000 |
|
} |
|
], |
|
"logging_steps": 400, |
|
"max_steps": 41046, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 6, |
|
"save_steps": 5000, |
|
"total_flos": 2.2714013468356038e+20, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|