{ "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 1730, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.07, "grad_norm": 4.66322660446167, "learning_rate": 2.8901734104046245e-05, "loss": 2.8724, "step": 25 }, { "epoch": 0.14, "grad_norm": 4.364796161651611, "learning_rate": 5.780346820809249e-05, "loss": 1.8669, "step": 50 }, { "epoch": 0.22, "grad_norm": 7.0544962882995605, "learning_rate": 8.670520231213874e-05, "loss": 1.0394, "step": 75 }, { "epoch": 0.29, "grad_norm": 2.588165521621704, "learning_rate": 0.00011560693641618498, "loss": 0.8095, "step": 100 }, { "epoch": 0.36, "grad_norm": 2.311723470687866, "learning_rate": 0.00014450867052023122, "loss": 0.7673, "step": 125 }, { "epoch": 0.43, "grad_norm": 2.0632615089416504, "learning_rate": 0.00017341040462427748, "loss": 0.7738, "step": 150 }, { "epoch": 0.51, "grad_norm": 1.7875710725784302, "learning_rate": 0.00019974309569685292, "loss": 0.7151, "step": 175 }, { "epoch": 0.58, "grad_norm": 1.7314724922180176, "learning_rate": 0.00019653179190751445, "loss": 0.6782, "step": 200 }, { "epoch": 0.65, "grad_norm": 2.1410841941833496, "learning_rate": 0.00019332048811817598, "loss": 0.6733, "step": 225 }, { "epoch": 0.72, "grad_norm": 1.6796475648880005, "learning_rate": 0.0001901091843288375, "loss": 0.711, "step": 250 }, { "epoch": 0.79, "grad_norm": 1.9651137590408325, "learning_rate": 0.00018689788053949903, "loss": 0.6285, "step": 275 }, { "epoch": 0.87, "grad_norm": 1.9334725141525269, "learning_rate": 0.00018368657675016056, "loss": 0.6387, "step": 300 }, { "epoch": 0.94, "grad_norm": 1.9697507619857788, "learning_rate": 0.00018047527296082209, "loss": 0.6203, "step": 325 }, { "epoch": 1.01, "grad_norm": 1.5927447080612183, "learning_rate": 0.0001772639691714836, "loss": 0.6036, "step": 350 }, { "epoch": 1.08, "grad_norm": 1.5775573253631592, "learning_rate": 0.00017405266538214514, "loss": 0.5443, "step": 375 }, { "epoch": 1.16, "grad_norm": 2.265123128890991, "learning_rate": 0.0001708413615928067, "loss": 0.5864, "step": 400 }, { "epoch": 1.23, "grad_norm": 1.9081721305847168, "learning_rate": 0.00016763005780346822, "loss": 0.497, "step": 425 }, { "epoch": 1.3, "grad_norm": 1.7822768688201904, "learning_rate": 0.00016441875401412975, "loss": 0.5652, "step": 450 }, { "epoch": 1.37, "grad_norm": 1.585552453994751, "learning_rate": 0.00016120745022479128, "loss": 0.5356, "step": 475 }, { "epoch": 1.45, "grad_norm": 1.8936752080917358, "learning_rate": 0.0001579961464354528, "loss": 0.4897, "step": 500 }, { "epoch": 1.52, "grad_norm": 1.7502835988998413, "learning_rate": 0.00015478484264611433, "loss": 0.5051, "step": 525 }, { "epoch": 1.59, "grad_norm": 1.6106928586959839, "learning_rate": 0.00015157353885677586, "loss": 0.5034, "step": 550 }, { "epoch": 1.66, "grad_norm": 1.6488401889801025, "learning_rate": 0.00014836223506743738, "loss": 0.4692, "step": 575 }, { "epoch": 1.73, "grad_norm": 1.5491576194763184, "learning_rate": 0.0001451509312780989, "loss": 0.5016, "step": 600 }, { "epoch": 1.81, "grad_norm": 1.7753480672836304, "learning_rate": 0.00014193962748876044, "loss": 0.4836, "step": 625 }, { "epoch": 1.88, "grad_norm": 1.5605947971343994, "learning_rate": 0.00013872832369942197, "loss": 0.5059, "step": 650 }, { "epoch": 1.95, "grad_norm": 1.4864176511764526, "learning_rate": 0.0001355170199100835, "loss": 0.4667, "step": 675 }, { "epoch": 2.02, "grad_norm": 1.6282180547714233, 
"learning_rate": 0.00013230571612074502, "loss": 0.4208, "step": 700 }, { "epoch": 2.1, "grad_norm": 1.9214680194854736, "learning_rate": 0.00012909441233140655, "loss": 0.3733, "step": 725 }, { "epoch": 2.17, "grad_norm": 1.9027807712554932, "learning_rate": 0.00012588310854206808, "loss": 0.3831, "step": 750 }, { "epoch": 2.24, "grad_norm": 1.756855845451355, "learning_rate": 0.0001226718047527296, "loss": 0.3876, "step": 775 }, { "epoch": 2.31, "grad_norm": 1.5269505977630615, "learning_rate": 0.00011946050096339114, "loss": 0.3847, "step": 800 }, { "epoch": 2.38, "grad_norm": 1.7476940155029297, "learning_rate": 0.00011624919717405267, "loss": 0.3604, "step": 825 }, { "epoch": 2.46, "grad_norm": 1.8043824434280396, "learning_rate": 0.0001130378933847142, "loss": 0.384, "step": 850 }, { "epoch": 2.53, "grad_norm": 1.4151132106781006, "learning_rate": 0.00010982658959537572, "loss": 0.3745, "step": 875 }, { "epoch": 2.6, "grad_norm": 1.907926082611084, "learning_rate": 0.00010661528580603725, "loss": 0.372, "step": 900 }, { "epoch": 2.67, "grad_norm": 1.8263049125671387, "learning_rate": 0.00010340398201669879, "loss": 0.3972, "step": 925 }, { "epoch": 2.75, "grad_norm": 1.8957775831222534, "learning_rate": 0.00010019267822736032, "loss": 0.3967, "step": 950 }, { "epoch": 2.82, "grad_norm": 1.772609829902649, "learning_rate": 9.698137443802185e-05, "loss": 0.3775, "step": 975 }, { "epoch": 2.89, "grad_norm": 1.976365566253662, "learning_rate": 9.377007064868337e-05, "loss": 0.4043, "step": 1000 }, { "epoch": 2.96, "grad_norm": 1.689432978630066, "learning_rate": 9.05587668593449e-05, "loss": 0.3226, "step": 1025 }, { "epoch": 3.03, "grad_norm": 1.9092849493026733, "learning_rate": 8.734746307000643e-05, "loss": 0.3758, "step": 1050 }, { "epoch": 3.11, "grad_norm": 1.7606756687164307, "learning_rate": 8.413615928066796e-05, "loss": 0.3095, "step": 1075 }, { "epoch": 3.18, "grad_norm": 1.8509767055511475, "learning_rate": 8.092485549132948e-05, "loss": 0.2712, "step": 1100 }, { "epoch": 3.25, "grad_norm": 1.5414364337921143, "learning_rate": 7.771355170199101e-05, "loss": 0.2695, "step": 1125 }, { "epoch": 3.32, "grad_norm": 1.8055434226989746, "learning_rate": 7.450224791265255e-05, "loss": 0.2828, "step": 1150 }, { "epoch": 3.4, "grad_norm": 2.0254344940185547, "learning_rate": 7.129094412331408e-05, "loss": 0.2729, "step": 1175 }, { "epoch": 3.47, "grad_norm": 2.2866463661193848, "learning_rate": 6.80796403339756e-05, "loss": 0.3022, "step": 1200 }, { "epoch": 3.54, "grad_norm": 1.5647475719451904, "learning_rate": 6.486833654463712e-05, "loss": 0.2893, "step": 1225 }, { "epoch": 3.61, "grad_norm": 1.9078121185302734, "learning_rate": 6.165703275529865e-05, "loss": 0.2751, "step": 1250 }, { "epoch": 3.68, "grad_norm": 1.9310845136642456, "learning_rate": 5.844572896596018e-05, "loss": 0.2614, "step": 1275 }, { "epoch": 3.76, "grad_norm": 1.5173165798187256, "learning_rate": 5.523442517662171e-05, "loss": 0.2781, "step": 1300 }, { "epoch": 3.83, "grad_norm": 2.2908642292022705, "learning_rate": 5.2023121387283234e-05, "loss": 0.2889, "step": 1325 }, { "epoch": 3.9, "grad_norm": 2.0346012115478516, "learning_rate": 4.881181759794477e-05, "loss": 0.2997, "step": 1350 }, { "epoch": 3.97, "grad_norm": 1.6963484287261963, "learning_rate": 4.56005138086063e-05, "loss": 0.2415, "step": 1375 }, { "epoch": 4.05, "grad_norm": 1.552463173866272, "learning_rate": 4.238921001926782e-05, "loss": 0.229, "step": 1400 }, { "epoch": 4.12, "grad_norm": 2.3235974311828613, "learning_rate": 
3.917790622992935e-05, "loss": 0.1927, "step": 1425 }, { "epoch": 4.19, "grad_norm": 1.6171674728393555, "learning_rate": 3.596660244059088e-05, "loss": 0.2116, "step": 1450 }, { "epoch": 4.26, "grad_norm": 1.9670432806015015, "learning_rate": 3.275529865125241e-05, "loss": 0.207, "step": 1475 }, { "epoch": 4.34, "grad_norm": 2.1716980934143066, "learning_rate": 2.9543994861913938e-05, "loss": 0.2023, "step": 1500 }, { "epoch": 4.41, "grad_norm": 2.0103349685668945, "learning_rate": 2.6332691072575465e-05, "loss": 0.2081, "step": 1525 }, { "epoch": 4.48, "grad_norm": 2.488182544708252, "learning_rate": 2.3121387283236996e-05, "loss": 0.2206, "step": 1550 }, { "epoch": 4.55, "grad_norm": 2.150543451309204, "learning_rate": 1.9910083493898523e-05, "loss": 0.1989, "step": 1575 }, { "epoch": 4.62, "grad_norm": 1.952592134475708, "learning_rate": 1.6698779704560053e-05, "loss": 0.2074, "step": 1600 }, { "epoch": 4.7, "grad_norm": 1.8111902475357056, "learning_rate": 1.348747591522158e-05, "loss": 0.2122, "step": 1625 }, { "epoch": 4.77, "grad_norm": 2.595923662185669, "learning_rate": 1.027617212588311e-05, "loss": 0.191, "step": 1650 }, { "epoch": 4.84, "grad_norm": 2.17901611328125, "learning_rate": 7.064868336544637e-06, "loss": 0.2205, "step": 1675 }, { "epoch": 4.91, "grad_norm": 2.284546136856079, "learning_rate": 3.853564547206165e-06, "loss": 0.2239, "step": 1700 }, { "epoch": 4.99, "grad_norm": 2.4415647983551025, "learning_rate": 6.422607578676943e-07, "loss": 0.1967, "step": 1725 } ], "logging_steps": 25, "max_steps": 1730, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "total_flos": 7.5248089823232e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }