|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 500, |
|
"global_step": 563, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.017761989342806393, |
|
"grad_norm": 10.114191055297852, |
|
"learning_rate": 1.9644760213143874e-05, |
|
"loss": 3.1416, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.035523978685612786, |
|
"grad_norm": 4.123176097869873, |
|
"learning_rate": 1.9289520426287745e-05, |
|
"loss": 2.5072, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.05328596802841918, |
|
"grad_norm": 5.617274761199951, |
|
"learning_rate": 1.8934280639431617e-05, |
|
"loss": 2.5089, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.07104795737122557, |
|
"grad_norm": 5.457020282745361, |
|
"learning_rate": 1.857904085257549e-05, |
|
"loss": 2.4729, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.08880994671403197, |
|
"grad_norm": 5.344240665435791, |
|
"learning_rate": 1.822380106571936e-05, |
|
"loss": 2.2751, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.10657193605683836, |
|
"grad_norm": 4.997218132019043, |
|
"learning_rate": 1.7868561278863233e-05, |
|
"loss": 2.1903, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.12433392539964476, |
|
"grad_norm": 5.089644908905029, |
|
"learning_rate": 1.751332149200711e-05, |
|
"loss": 2.2842, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.14209591474245115, |
|
"grad_norm": 4.242591857910156, |
|
"learning_rate": 1.7158081705150977e-05, |
|
"loss": 2.3067, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.15985790408525755, |
|
"grad_norm": 4.736593246459961, |
|
"learning_rate": 1.680284191829485e-05, |
|
"loss": 2.2627, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.17761989342806395, |
|
"grad_norm": 4.923036575317383, |
|
"learning_rate": 1.644760213143872e-05, |
|
"loss": 2.3868, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.19538188277087035, |
|
"grad_norm": 3.8926291465759277, |
|
"learning_rate": 1.6092362344582596e-05, |
|
"loss": 2.2509, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.21314387211367672, |
|
"grad_norm": 4.4710373878479, |
|
"learning_rate": 1.5737122557726468e-05, |
|
"loss": 2.2431, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.23090586145648312, |
|
"grad_norm": 4.363410949707031, |
|
"learning_rate": 1.5381882770870337e-05, |
|
"loss": 2.197, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.24866785079928952, |
|
"grad_norm": 4.414468288421631, |
|
"learning_rate": 1.502664298401421e-05, |
|
"loss": 2.1243, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.2664298401420959, |
|
"grad_norm": 5.004277229309082, |
|
"learning_rate": 1.4671403197158082e-05, |
|
"loss": 2.2229, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.2841918294849023, |
|
"grad_norm": 5.9036431312561035, |
|
"learning_rate": 1.4316163410301956e-05, |
|
"loss": 2.35, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.3019538188277087, |
|
"grad_norm": 3.8498282432556152, |
|
"learning_rate": 1.3960923623445828e-05, |
|
"loss": 2.197, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.3197158081705151, |
|
"grad_norm": 4.546940326690674, |
|
"learning_rate": 1.3605683836589698e-05, |
|
"loss": 2.2107, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.33747779751332146, |
|
"grad_norm": 4.567042350769043, |
|
"learning_rate": 1.3250444049733571e-05, |
|
"loss": 2.1656, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.3552397868561279, |
|
"grad_norm": 4.599060535430908, |
|
"learning_rate": 1.2895204262877443e-05, |
|
"loss": 2.097, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.37300177619893427, |
|
"grad_norm": 4.4139084815979, |
|
"learning_rate": 1.2539964476021315e-05, |
|
"loss": 2.2216, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.3907637655417407, |
|
"grad_norm": 4.82399320602417, |
|
"learning_rate": 1.2184724689165189e-05, |
|
"loss": 2.2686, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.40852575488454707, |
|
"grad_norm": 5.282867431640625, |
|
"learning_rate": 1.1829484902309059e-05, |
|
"loss": 2.2976, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.42628774422735344, |
|
"grad_norm": 4.4112229347229, |
|
"learning_rate": 1.1474245115452931e-05, |
|
"loss": 2.0539, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.44404973357015987, |
|
"grad_norm": 4.809335708618164, |
|
"learning_rate": 1.1119005328596803e-05, |
|
"loss": 2.0071, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.46181172291296624, |
|
"grad_norm": 4.342746734619141, |
|
"learning_rate": 1.0763765541740677e-05, |
|
"loss": 2.2071, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.47957371225577267, |
|
"grad_norm": 4.607499599456787, |
|
"learning_rate": 1.0408525754884548e-05, |
|
"loss": 2.1116, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.49733570159857904, |
|
"grad_norm": 4.707973957061768, |
|
"learning_rate": 1.0053285968028419e-05, |
|
"loss": 2.3467, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.5150976909413855, |
|
"grad_norm": 5.3809814453125, |
|
"learning_rate": 9.698046181172292e-06, |
|
"loss": 2.265, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.5328596802841918, |
|
"grad_norm": 4.981960296630859, |
|
"learning_rate": 9.342806394316164e-06, |
|
"loss": 2.239, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.5506216696269982, |
|
"grad_norm": 4.437684059143066, |
|
"learning_rate": 8.987566607460036e-06, |
|
"loss": 2.2065, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.5683836589698046, |
|
"grad_norm": 4.121458530426025, |
|
"learning_rate": 8.632326820603908e-06, |
|
"loss": 2.2214, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.5861456483126111, |
|
"grad_norm": 4.4555768966674805, |
|
"learning_rate": 8.27708703374778e-06, |
|
"loss": 2.0929, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.6039076376554174, |
|
"grad_norm": 4.929543972015381, |
|
"learning_rate": 7.921847246891654e-06, |
|
"loss": 1.9948, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.6216696269982238, |
|
"grad_norm": 4.537841320037842, |
|
"learning_rate": 7.566607460035525e-06, |
|
"loss": 2.0598, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.6394316163410302, |
|
"grad_norm": 4.358743667602539, |
|
"learning_rate": 7.2113676731793965e-06, |
|
"loss": 2.2486, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.6571936056838366, |
|
"grad_norm": 5.218609809875488, |
|
"learning_rate": 6.8561278863232685e-06, |
|
"loss": 2.1021, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.6749555950266429, |
|
"grad_norm": 4.320308685302734, |
|
"learning_rate": 6.500888099467141e-06, |
|
"loss": 2.0445, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.6927175843694494, |
|
"grad_norm": 5.573863506317139, |
|
"learning_rate": 6.145648312611013e-06, |
|
"loss": 2.0654, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.7104795737122558, |
|
"grad_norm": 4.323981761932373, |
|
"learning_rate": 5.790408525754885e-06, |
|
"loss": 1.988, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.7282415630550622, |
|
"grad_norm": 5.24467658996582, |
|
"learning_rate": 5.435168738898757e-06, |
|
"loss": 2.2422, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.7460035523978685, |
|
"grad_norm": 4.423385143280029, |
|
"learning_rate": 5.079928952042629e-06, |
|
"loss": 2.2142, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.7637655417406749, |
|
"grad_norm": 4.1253275871276855, |
|
"learning_rate": 4.724689165186501e-06, |
|
"loss": 2.0602, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.7815275310834814, |
|
"grad_norm": 5.4505815505981445, |
|
"learning_rate": 4.3694493783303736e-06, |
|
"loss": 2.1685, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.7992895204262878, |
|
"grad_norm": 4.528077602386475, |
|
"learning_rate": 4.0142095914742455e-06, |
|
"loss": 2.1555, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.8170515097690941, |
|
"grad_norm": 4.204817771911621, |
|
"learning_rate": 3.658969804618118e-06, |
|
"loss": 2.085, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.8348134991119005, |
|
"grad_norm": 5.724953651428223, |
|
"learning_rate": 3.3037300177619897e-06, |
|
"loss": 2.0761, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.8525754884547069, |
|
"grad_norm": 3.909616231918335, |
|
"learning_rate": 2.9484902309058617e-06, |
|
"loss": 2.1404, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.8703374777975134, |
|
"grad_norm": 4.555464744567871, |
|
"learning_rate": 2.5932504440497336e-06, |
|
"loss": 2.1344, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.8880994671403197, |
|
"grad_norm": 4.636111736297607, |
|
"learning_rate": 2.238010657193606e-06, |
|
"loss": 2.1713, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.9058614564831261, |
|
"grad_norm": 4.105838775634766, |
|
"learning_rate": 1.882770870337478e-06, |
|
"loss": 2.0844, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.9236234458259325, |
|
"grad_norm": 5.204021453857422, |
|
"learning_rate": 1.52753108348135e-06, |
|
"loss": 2.0752, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.9413854351687388, |
|
"grad_norm": 5.496901988983154, |
|
"learning_rate": 1.172291296625222e-06, |
|
"loss": 2.164, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.9591474245115453, |
|
"grad_norm": 4.677813529968262, |
|
"learning_rate": 8.170515097690942e-07, |
|
"loss": 1.9237, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.9769094138543517, |
|
"grad_norm": 4.969646453857422, |
|
"learning_rate": 4.618117229129663e-07, |
|
"loss": 2.134, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.9946714031971581, |
|
"grad_norm": 4.424323558807373, |
|
"learning_rate": 1.0657193605683837e-07, |
|
"loss": 2.2305, |
|
"step": 560 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 563, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2438020988928000.0, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|