{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 563, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.017761989342806393, "grad_norm": 10.114191055297852, "learning_rate": 1.9644760213143874e-05, "loss": 3.1416, "step": 10 }, { "epoch": 0.035523978685612786, "grad_norm": 4.123176097869873, "learning_rate": 1.9289520426287745e-05, "loss": 2.5072, "step": 20 }, { "epoch": 0.05328596802841918, "grad_norm": 5.617274761199951, "learning_rate": 1.8934280639431617e-05, "loss": 2.5089, "step": 30 }, { "epoch": 0.07104795737122557, "grad_norm": 5.457020282745361, "learning_rate": 1.857904085257549e-05, "loss": 2.4729, "step": 40 }, { "epoch": 0.08880994671403197, "grad_norm": 5.344240665435791, "learning_rate": 1.822380106571936e-05, "loss": 2.2751, "step": 50 }, { "epoch": 0.10657193605683836, "grad_norm": 4.997218132019043, "learning_rate": 1.7868561278863233e-05, "loss": 2.1903, "step": 60 }, { "epoch": 0.12433392539964476, "grad_norm": 5.089644908905029, "learning_rate": 1.751332149200711e-05, "loss": 2.2842, "step": 70 }, { "epoch": 0.14209591474245115, "grad_norm": 4.242591857910156, "learning_rate": 1.7158081705150977e-05, "loss": 2.3067, "step": 80 }, { "epoch": 0.15985790408525755, "grad_norm": 4.736593246459961, "learning_rate": 1.680284191829485e-05, "loss": 2.2627, "step": 90 }, { "epoch": 0.17761989342806395, "grad_norm": 4.923036575317383, "learning_rate": 1.644760213143872e-05, "loss": 2.3868, "step": 100 }, { "epoch": 0.19538188277087035, "grad_norm": 3.8926291465759277, "learning_rate": 1.6092362344582596e-05, "loss": 2.2509, "step": 110 }, { "epoch": 0.21314387211367672, "grad_norm": 4.4710373878479, "learning_rate": 1.5737122557726468e-05, "loss": 2.2431, "step": 120 }, { "epoch": 0.23090586145648312, "grad_norm": 4.363410949707031, "learning_rate": 1.5381882770870337e-05, "loss": 2.197, "step": 130 }, { "epoch": 0.24866785079928952, "grad_norm": 4.414468288421631, "learning_rate": 1.502664298401421e-05, "loss": 2.1243, "step": 140 }, { "epoch": 0.2664298401420959, "grad_norm": 5.004277229309082, "learning_rate": 1.4671403197158082e-05, "loss": 2.2229, "step": 150 }, { "epoch": 0.2841918294849023, "grad_norm": 5.9036431312561035, "learning_rate": 1.4316163410301956e-05, "loss": 2.35, "step": 160 }, { "epoch": 0.3019538188277087, "grad_norm": 3.8498282432556152, "learning_rate": 1.3960923623445828e-05, "loss": 2.197, "step": 170 }, { "epoch": 0.3197158081705151, "grad_norm": 4.546940326690674, "learning_rate": 1.3605683836589698e-05, "loss": 2.2107, "step": 180 }, { "epoch": 0.33747779751332146, "grad_norm": 4.567042350769043, "learning_rate": 1.3250444049733571e-05, "loss": 2.1656, "step": 190 }, { "epoch": 0.3552397868561279, "grad_norm": 4.599060535430908, "learning_rate": 1.2895204262877443e-05, "loss": 2.097, "step": 200 }, { "epoch": 0.37300177619893427, "grad_norm": 4.4139084815979, "learning_rate": 1.2539964476021315e-05, "loss": 2.2216, "step": 210 }, { "epoch": 0.3907637655417407, "grad_norm": 4.82399320602417, "learning_rate": 1.2184724689165189e-05, "loss": 2.2686, "step": 220 }, { "epoch": 0.40852575488454707, "grad_norm": 5.282867431640625, "learning_rate": 1.1829484902309059e-05, "loss": 2.2976, "step": 230 }, { "epoch": 0.42628774422735344, "grad_norm": 4.4112229347229, "learning_rate": 1.1474245115452931e-05, "loss": 2.0539, "step": 240 }, { "epoch": 0.44404973357015987, "grad_norm": 4.809335708618164, "learning_rate": 
1.1119005328596803e-05, "loss": 2.0071, "step": 250 }, { "epoch": 0.46181172291296624, "grad_norm": 4.342746734619141, "learning_rate": 1.0763765541740677e-05, "loss": 2.2071, "step": 260 }, { "epoch": 0.47957371225577267, "grad_norm": 4.607499599456787, "learning_rate": 1.0408525754884548e-05, "loss": 2.1116, "step": 270 }, { "epoch": 0.49733570159857904, "grad_norm": 4.707973957061768, "learning_rate": 1.0053285968028419e-05, "loss": 2.3467, "step": 280 }, { "epoch": 0.5150976909413855, "grad_norm": 5.3809814453125, "learning_rate": 9.698046181172292e-06, "loss": 2.265, "step": 290 }, { "epoch": 0.5328596802841918, "grad_norm": 4.981960296630859, "learning_rate": 9.342806394316164e-06, "loss": 2.239, "step": 300 }, { "epoch": 0.5506216696269982, "grad_norm": 4.437684059143066, "learning_rate": 8.987566607460036e-06, "loss": 2.2065, "step": 310 }, { "epoch": 0.5683836589698046, "grad_norm": 4.121458530426025, "learning_rate": 8.632326820603908e-06, "loss": 2.2214, "step": 320 }, { "epoch": 0.5861456483126111, "grad_norm": 4.4555768966674805, "learning_rate": 8.27708703374778e-06, "loss": 2.0929, "step": 330 }, { "epoch": 0.6039076376554174, "grad_norm": 4.929543972015381, "learning_rate": 7.921847246891654e-06, "loss": 1.9948, "step": 340 }, { "epoch": 0.6216696269982238, "grad_norm": 4.537841320037842, "learning_rate": 7.566607460035525e-06, "loss": 2.0598, "step": 350 }, { "epoch": 0.6394316163410302, "grad_norm": 4.358743667602539, "learning_rate": 7.2113676731793965e-06, "loss": 2.2486, "step": 360 }, { "epoch": 0.6571936056838366, "grad_norm": 5.218609809875488, "learning_rate": 6.8561278863232685e-06, "loss": 2.1021, "step": 370 }, { "epoch": 0.6749555950266429, "grad_norm": 4.320308685302734, "learning_rate": 6.500888099467141e-06, "loss": 2.0445, "step": 380 }, { "epoch": 0.6927175843694494, "grad_norm": 5.573863506317139, "learning_rate": 6.145648312611013e-06, "loss": 2.0654, "step": 390 }, { "epoch": 0.7104795737122558, "grad_norm": 4.323981761932373, "learning_rate": 5.790408525754885e-06, "loss": 1.988, "step": 400 }, { "epoch": 0.7282415630550622, "grad_norm": 5.24467658996582, "learning_rate": 5.435168738898757e-06, "loss": 2.2422, "step": 410 }, { "epoch": 0.7460035523978685, "grad_norm": 4.423385143280029, "learning_rate": 5.079928952042629e-06, "loss": 2.2142, "step": 420 }, { "epoch": 0.7637655417406749, "grad_norm": 4.1253275871276855, "learning_rate": 4.724689165186501e-06, "loss": 2.0602, "step": 430 }, { "epoch": 0.7815275310834814, "grad_norm": 5.4505815505981445, "learning_rate": 4.3694493783303736e-06, "loss": 2.1685, "step": 440 }, { "epoch": 0.7992895204262878, "grad_norm": 4.528077602386475, "learning_rate": 4.0142095914742455e-06, "loss": 2.1555, "step": 450 }, { "epoch": 0.8170515097690941, "grad_norm": 4.204817771911621, "learning_rate": 3.658969804618118e-06, "loss": 2.085, "step": 460 }, { "epoch": 0.8348134991119005, "grad_norm": 5.724953651428223, "learning_rate": 3.3037300177619897e-06, "loss": 2.0761, "step": 470 }, { "epoch": 0.8525754884547069, "grad_norm": 3.909616231918335, "learning_rate": 2.9484902309058617e-06, "loss": 2.1404, "step": 480 }, { "epoch": 0.8703374777975134, "grad_norm": 4.555464744567871, "learning_rate": 2.5932504440497336e-06, "loss": 2.1344, "step": 490 }, { "epoch": 0.8880994671403197, "grad_norm": 4.636111736297607, "learning_rate": 2.238010657193606e-06, "loss": 2.1713, "step": 500 }, { "epoch": 0.9058614564831261, "grad_norm": 4.105838775634766, "learning_rate": 1.882770870337478e-06, "loss": 2.0844, "step": 510 }, { 
"epoch": 0.9236234458259325, "grad_norm": 5.204021453857422, "learning_rate": 1.52753108348135e-06, "loss": 2.0752, "step": 520 }, { "epoch": 0.9413854351687388, "grad_norm": 5.496901988983154, "learning_rate": 1.172291296625222e-06, "loss": 2.164, "step": 530 }, { "epoch": 0.9591474245115453, "grad_norm": 4.677813529968262, "learning_rate": 8.170515097690942e-07, "loss": 1.9237, "step": 540 }, { "epoch": 0.9769094138543517, "grad_norm": 4.969646453857422, "learning_rate": 4.618117229129663e-07, "loss": 2.134, "step": 550 }, { "epoch": 0.9946714031971581, "grad_norm": 4.424323558807373, "learning_rate": 1.0657193605683837e-07, "loss": 2.2305, "step": 560 } ], "logging_steps": 10, "max_steps": 563, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2438020988928000.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }