|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.0, |
|
"eval_steps": 500, |
|
"global_step": 1548, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 3.802351951599121, |
|
"learning_rate": 3.2258064516129034e-05, |
|
"loss": 3.7331, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 3.442800760269165, |
|
"learning_rate": 6.451612903225807e-05, |
|
"loss": 2.9581, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 3.616957902908325, |
|
"learning_rate": 9.677419354838711e-05, |
|
"loss": 2.4309, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 2.3956120014190674, |
|
"learning_rate": 0.00012903225806451613, |
|
"loss": 2.2667, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 2.698212146759033, |
|
"learning_rate": 0.00016129032258064516, |
|
"loss": 2.0782, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 2.7342419624328613, |
|
"learning_rate": 0.00019354838709677422, |
|
"loss": 2.0691, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 2.039422035217285, |
|
"learning_rate": 0.00019712849964106247, |
|
"loss": 2.0865, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 2.0298664569854736, |
|
"learning_rate": 0.00019353912419239053, |
|
"loss": 1.9552, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 1.8978090286254883, |
|
"learning_rate": 0.0001899497487437186, |
|
"loss": 2.0111, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 1.9588110446929932, |
|
"learning_rate": 0.00018636037329504667, |
|
"loss": 1.961, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 2.160839796066284, |
|
"learning_rate": 0.00018277099784637474, |
|
"loss": 2.0133, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 2.0899839401245117, |
|
"learning_rate": 0.00017918162239770278, |
|
"loss": 1.958, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 2.029879570007324, |
|
"learning_rate": 0.00017559224694903088, |
|
"loss": 1.9116, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 1.7021583318710327, |
|
"learning_rate": 0.00017200287150035895, |
|
"loss": 1.8842, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 1.5238200426101685, |
|
"learning_rate": 0.00016841349605168702, |
|
"loss": 1.8229, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 1.9168670177459717, |
|
"learning_rate": 0.0001648241206030151, |
|
"loss": 1.9235, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 1.8892511129379272, |
|
"learning_rate": 0.00016123474515434316, |
|
"loss": 1.8859, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 1.7955598831176758, |
|
"learning_rate": 0.00015764536970567123, |
|
"loss": 1.8619, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 1.7658586502075195, |
|
"learning_rate": 0.00015405599425699927, |
|
"loss": 1.8235, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 1.598111629486084, |
|
"learning_rate": 0.00015046661880832737, |
|
"loss": 1.8554, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 1.9352959394454956, |
|
"learning_rate": 0.00014687724335965544, |
|
"loss": 1.801, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 1.9746991395950317, |
|
"learning_rate": 0.0001432878679109835, |
|
"loss": 1.895, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 1.476921796798706, |
|
"learning_rate": 0.00013969849246231157, |
|
"loss": 1.7978, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 1.667672872543335, |
|
"learning_rate": 0.00013610911701363964, |
|
"loss": 1.76, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 1.5855716466903687, |
|
"learning_rate": 0.00013251974156496769, |
|
"loss": 1.8171, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 1.9429579973220825, |
|
"learning_rate": 0.00012893036611629576, |
|
"loss": 1.811, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 1.684810996055603, |
|
"learning_rate": 0.00012534099066762382, |
|
"loss": 1.8148, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 1.6180702447891235, |
|
"learning_rate": 0.00012175161521895191, |
|
"loss": 1.7356, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 1.7795788049697876, |
|
"learning_rate": 0.00011816223977027998, |
|
"loss": 1.7969, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 1.7645013332366943, |
|
"learning_rate": 0.00011457286432160806, |
|
"loss": 1.8198, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 1.713512897491455, |
|
"learning_rate": 0.00011098348887293613, |
|
"loss": 1.7514, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 1.03, |
|
"grad_norm": 1.3435418605804443, |
|
"learning_rate": 0.00010739411342426417, |
|
"loss": 1.6679, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.07, |
|
"grad_norm": 1.5819846391677856, |
|
"learning_rate": 0.00010380473797559225, |
|
"loss": 1.7182, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 1.1, |
|
"grad_norm": 1.3324297666549683, |
|
"learning_rate": 0.00010021536252692032, |
|
"loss": 1.7301, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 1.13, |
|
"grad_norm": 1.4800920486450195, |
|
"learning_rate": 9.662598707824839e-05, |
|
"loss": 1.7123, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 1.16, |
|
"grad_norm": 1.5812880992889404, |
|
"learning_rate": 9.303661162957645e-05, |
|
"loss": 1.7816, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 1.5376070737838745, |
|
"learning_rate": 8.944723618090453e-05, |
|
"loss": 1.7446, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 1.23, |
|
"grad_norm": 1.5884149074554443, |
|
"learning_rate": 8.58578607322326e-05, |
|
"loss": 1.6483, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 1.26, |
|
"grad_norm": 2.03753662109375, |
|
"learning_rate": 8.226848528356066e-05, |
|
"loss": 1.6695, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 1.29, |
|
"grad_norm": 1.7924336194992065, |
|
"learning_rate": 7.867910983488873e-05, |
|
"loss": 1.7109, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.32, |
|
"grad_norm": 1.80973219871521, |
|
"learning_rate": 7.508973438621681e-05, |
|
"loss": 1.7017, |
|
"step": 1025 |
|
}, |
|
{ |
|
"epoch": 1.36, |
|
"grad_norm": 1.7763901948928833, |
|
"learning_rate": 7.150035893754488e-05, |
|
"loss": 1.6669, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 1.39, |
|
"grad_norm": 3.0561070442199707, |
|
"learning_rate": 6.791098348887293e-05, |
|
"loss": 1.6681, |
|
"step": 1075 |
|
}, |
|
{ |
|
"epoch": 1.42, |
|
"grad_norm": 1.7292817831039429, |
|
"learning_rate": 6.4321608040201e-05, |
|
"loss": 1.6787, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 1.45, |
|
"grad_norm": 1.6157281398773193, |
|
"learning_rate": 6.073223259152908e-05, |
|
"loss": 1.6716, |
|
"step": 1125 |
|
}, |
|
{ |
|
"epoch": 1.49, |
|
"grad_norm": 1.5873336791992188, |
|
"learning_rate": 5.714285714285714e-05, |
|
"loss": 1.6724, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 1.52, |
|
"grad_norm": 1.7574703693389893, |
|
"learning_rate": 5.355348169418522e-05, |
|
"loss": 1.6608, |
|
"step": 1175 |
|
}, |
|
{ |
|
"epoch": 1.55, |
|
"grad_norm": 1.7600945234298706, |
|
"learning_rate": 4.996410624551328e-05, |
|
"loss": 1.7204, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 1.58, |
|
"grad_norm": 1.4515677690505981, |
|
"learning_rate": 4.6374730796841356e-05, |
|
"loss": 1.6755, |
|
"step": 1225 |
|
}, |
|
{ |
|
"epoch": 1.61, |
|
"grad_norm": 1.9057530164718628, |
|
"learning_rate": 4.278535534816942e-05, |
|
"loss": 1.6953, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 1.65, |
|
"grad_norm": 1.7482775449752808, |
|
"learning_rate": 3.919597989949749e-05, |
|
"loss": 1.686, |
|
"step": 1275 |
|
}, |
|
{ |
|
"epoch": 1.68, |
|
"grad_norm": 1.8991056680679321, |
|
"learning_rate": 3.560660445082556e-05, |
|
"loss": 1.6611, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 1.71, |
|
"grad_norm": 1.7882860898971558, |
|
"learning_rate": 3.201722900215363e-05, |
|
"loss": 1.7103, |
|
"step": 1325 |
|
}, |
|
{ |
|
"epoch": 1.74, |
|
"grad_norm": 1.6628391742706299, |
|
"learning_rate": 2.84278535534817e-05, |
|
"loss": 1.6925, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 1.78, |
|
"grad_norm": 1.5516228675842285, |
|
"learning_rate": 2.4838478104809766e-05, |
|
"loss": 1.652, |
|
"step": 1375 |
|
}, |
|
{ |
|
"epoch": 1.81, |
|
"grad_norm": 1.5719884634017944, |
|
"learning_rate": 2.1249102656137835e-05, |
|
"loss": 1.6383, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 1.84, |
|
"grad_norm": 1.8628472089767456, |
|
"learning_rate": 1.76597272074659e-05, |
|
"loss": 1.6922, |
|
"step": 1425 |
|
}, |
|
{ |
|
"epoch": 1.87, |
|
"grad_norm": 1.7590694427490234, |
|
"learning_rate": 1.407035175879397e-05, |
|
"loss": 1.687, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 1.91, |
|
"grad_norm": 1.6741101741790771, |
|
"learning_rate": 1.048097631012204e-05, |
|
"loss": 1.6599, |
|
"step": 1475 |
|
}, |
|
{ |
|
"epoch": 1.94, |
|
"grad_norm": 1.7378610372543335, |
|
"learning_rate": 6.891600861450108e-06, |
|
"loss": 1.6765, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 1.97, |
|
"grad_norm": 1.5435367822647095, |
|
"learning_rate": 3.3022254127781766e-06, |
|
"loss": 1.607, |
|
"step": 1525 |
|
} |
|
], |
|
"logging_steps": 25, |
|
"max_steps": 1548, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 500, |
|
"total_flos": 1.0151913205845197e+17, |
|
"train_batch_size": 6, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|