{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.996941896024465, "eval_steps": 500, "global_step": 735, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.1, "grad_norm": 0.2513757646083832, "learning_rate": 1.0135135135135135e-05, "loss": 1.5178, "step": 25 }, { "epoch": 0.2, "grad_norm": 0.38634929060935974, "learning_rate": 2.027027027027027e-05, "loss": 1.4603, "step": 50 }, { "epoch": 0.31, "grad_norm": 0.5580678582191467, "learning_rate": 2.995461422087746e-05, "loss": 1.3665, "step": 75 }, { "epoch": 0.41, "grad_norm": 0.6025068163871765, "learning_rate": 2.881996974281392e-05, "loss": 1.2772, "step": 100 }, { "epoch": 0.51, "grad_norm": 0.6379350423812866, "learning_rate": 2.768532526475038e-05, "loss": 1.2555, "step": 125 }, { "epoch": 0.61, "grad_norm": 0.666475236415863, "learning_rate": 2.655068078668684e-05, "loss": 1.2434, "step": 150 }, { "epoch": 0.71, "grad_norm": 0.7137765288352966, "learning_rate": 2.54160363086233e-05, "loss": 1.2304, "step": 175 }, { "epoch": 0.82, "grad_norm": 0.6858211159706116, "learning_rate": 2.428139183055976e-05, "loss": 1.1842, "step": 200 }, { "epoch": 0.92, "grad_norm": 0.6926656365394592, "learning_rate": 2.314674735249622e-05, "loss": 1.1929, "step": 225 }, { "epoch": 1.02, "grad_norm": 0.6960567235946655, "learning_rate": 2.2012102874432675e-05, "loss": 1.1852, "step": 250 }, { "epoch": 1.12, "grad_norm": 0.6939908266067505, "learning_rate": 2.087745839636914e-05, "loss": 1.17, "step": 275 }, { "epoch": 1.22, "grad_norm": 0.7124253511428833, "learning_rate": 1.97428139183056e-05, "loss": 1.1512, "step": 300 }, { "epoch": 1.33, "grad_norm": 0.6936700344085693, "learning_rate": 1.8608169440242055e-05, "loss": 1.1686, "step": 325 }, { "epoch": 1.43, "grad_norm": 0.7236443758010864, "learning_rate": 1.747352496217852e-05, "loss": 1.1702, "step": 350 }, { "epoch": 1.53, "grad_norm": 0.7483662366867065, "learning_rate": 1.6338880484114978e-05, "loss": 1.1655, "step": 375 }, { "epoch": 1.63, "grad_norm": 0.8027617931365967, "learning_rate": 1.5204236006051437e-05, "loss": 1.1616, "step": 400 }, { "epoch": 1.73, "grad_norm": 0.8072408437728882, "learning_rate": 1.4069591527987896e-05, "loss": 1.1828, "step": 425 }, { "epoch": 1.83, "grad_norm": 0.7527270913124084, "learning_rate": 1.2934947049924356e-05, "loss": 1.1429, "step": 450 }, { "epoch": 1.94, "grad_norm": 0.7795941829681396, "learning_rate": 1.1800302571860818e-05, "loss": 1.1738, "step": 475 }, { "epoch": 2.04, "grad_norm": 0.8151872158050537, "learning_rate": 1.0665658093797276e-05, "loss": 1.1376, "step": 500 }, { "epoch": 2.14, "grad_norm": 0.7859562635421753, "learning_rate": 9.531013615733736e-06, "loss": 1.1459, "step": 525 }, { "epoch": 2.24, "grad_norm": 0.8044187426567078, "learning_rate": 8.396369137670198e-06, "loss": 1.1523, "step": 550 }, { "epoch": 2.34, "grad_norm": 0.8209382891654968, "learning_rate": 7.261724659606657e-06, "loss": 1.1165, "step": 575 }, { "epoch": 2.45, "grad_norm": 0.8191347122192383, "learning_rate": 6.127080181543117e-06, "loss": 1.1574, "step": 600 }, { "epoch": 2.55, "grad_norm": 0.864337682723999, "learning_rate": 4.992435703479576e-06, "loss": 1.1237, "step": 625 }, { "epoch": 2.65, "grad_norm": 0.8072352409362793, "learning_rate": 3.857791225416037e-06, "loss": 1.1432, "step": 650 }, { "epoch": 2.75, "grad_norm": 0.8192684054374695, "learning_rate": 2.7231467473524962e-06, "loss": 1.1176, "step": 675 }, { "epoch": 2.85, "grad_norm": 0.7951787710189819, "learning_rate": 1.5885022692889562e-06, "loss": 1.1419, "step": 700 }, { "epoch": 2.96, "grad_norm": 0.8341733813285828, "learning_rate": 4.5385779122541606e-07, "loss": 1.1387, "step": 725 } ], "logging_steps": 25, "max_steps": 735, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "total_flos": 4.82271317563392e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }