{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 7679, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.013022528975126969, "grad_norm": 0.13363803923130035, "learning_rate": 6.666666666666667e-06, "loss": 2.4964, "step": 100 }, { "epoch": 0.026045057950253938, "grad_norm": 0.2602158486843109, "learning_rate": 1.3333333333333333e-05, "loss": 2.484, "step": 200 }, { "epoch": 0.03906758692538091, "grad_norm": 0.3781226873397827, "learning_rate": 2e-05, "loss": 2.4035, "step": 300 }, { "epoch": 0.052090115900507876, "grad_norm": 0.4238300323486328, "learning_rate": 1.999093831153269e-05, "loss": 2.3381, "step": 400 }, { "epoch": 0.06511264487563485, "grad_norm": 0.4826453924179077, "learning_rate": 1.9963769668970327e-05, "loss": 2.3359, "step": 500 }, { "epoch": 0.07813517385076182, "grad_norm": 0.5577328205108643, "learning_rate": 1.991854331106791e-05, "loss": 2.3252, "step": 600 }, { "epoch": 0.09115770282588878, "grad_norm": 0.6093964576721191, "learning_rate": 1.9855341203258605e-05, "loss": 2.3003, "step": 700 }, { "epoch": 0.10418023180101575, "grad_norm": 0.6448187828063965, "learning_rate": 1.9774277889104696e-05, "loss": 2.2692, "step": 800 }, { "epoch": 0.11720276077614272, "grad_norm": 0.6290944218635559, "learning_rate": 1.967550028270599e-05, "loss": 2.2529, "step": 900 }, { "epoch": 0.1302252897512697, "grad_norm": 0.6941649317741394, "learning_rate": 1.9559187402441825e-05, "loss": 2.2357, "step": 1000 }, { "epoch": 0.14324781872639666, "grad_norm": 0.752715528011322, "learning_rate": 1.942555004652934e-05, "loss": 2.2442, "step": 1100 }, { "epoch": 0.15627034770152365, "grad_norm": 0.6779051423072815, "learning_rate": 1.9274830410985915e-05, "loss": 2.2162, "step": 1200 }, { "epoch": 0.1692928766766506, "grad_norm": 0.6820743083953857, "learning_rate": 1.9107301650688188e-05, "loss": 2.2573, "step": 1300 }, { "epoch": 0.18231540565177756, "grad_norm": 1.0248582363128662, "learning_rate": 1.8923267384323184e-05, "loss": 2.2348, "step": 1400 }, { "epoch": 0.19533793462690455, "grad_norm": 0.6467209458351135, "learning_rate": 1.8723061144128728e-05, "loss": 2.2301, "step": 1500 }, { "epoch": 0.2083604636020315, "grad_norm": 0.918159008026123, "learning_rate": 1.8507045771420383e-05, "loss": 2.2039, "step": 1600 }, { "epoch": 0.2213829925771585, "grad_norm": 0.7963811755180359, "learning_rate": 1.8275612759000486e-05, "loss": 2.2139, "step": 1700 }, { "epoch": 0.23440552155228545, "grad_norm": 0.731287956237793, "learning_rate": 1.8029181541640952e-05, "loss": 2.1956, "step": 1800 }, { "epoch": 0.24742805052741243, "grad_norm": 0.7656735777854919, "learning_rate": 1.7768198735925848e-05, "loss": 2.2251, "step": 1900 }, { "epoch": 0.2604505795025394, "grad_norm": 0.9210675358772278, "learning_rate": 1.7493137330831318e-05, "loss": 2.1839, "step": 2000 }, { "epoch": 0.27347310847766637, "grad_norm": 0.6853137016296387, "learning_rate": 1.7204495830509832e-05, "loss": 2.1842, "step": 2100 }, { "epoch": 0.28649563745279333, "grad_norm": 0.6904870271682739, "learning_rate": 1.6902797350832318e-05, "loss": 2.1832, "step": 2200 }, { "epoch": 0.2995181664279203, "grad_norm": 1.0184141397476196, "learning_rate": 1.6588588671325554e-05, "loss": 2.1457, "step": 2300 }, { "epoch": 0.3125406954030473, "grad_norm": 0.8124328851699829, "learning_rate": 1.626243924422303e-05, "loss": 2.1382, "step": 2400 }, { "epoch": 0.32556322437817425, "grad_norm": 0.9830845594406128, "learning_rate": 1.592494016242518e-05, "loss": 2.1469, "step": 2500 }, { "epoch": 0.3385857533533012, "grad_norm": 0.9020227193832397, "learning_rate": 1.5576703088239455e-05, "loss": 2.1907, "step": 2600 }, { "epoch": 0.35160828232842817, "grad_norm": 1.0550016164779663, "learning_rate": 1.5218359144841666e-05, "loss": 2.1908, "step": 2700 }, { "epoch": 0.3646308113035551, "grad_norm": 0.7753348350524902, "learning_rate": 1.4850557772467655e-05, "loss": 2.1503, "step": 2800 }, { "epoch": 0.37765334027868214, "grad_norm": 0.9372894167900085, "learning_rate": 1.4473965551408284e-05, "loss": 2.1639, "step": 2900 }, { "epoch": 0.3906758692538091, "grad_norm": 0.7956731915473938, "learning_rate": 1.4089264993940843e-05, "loss": 2.1498, "step": 3000 }, { "epoch": 0.40369839822893605, "grad_norm": 0.7356022000312805, "learning_rate": 1.3697153307386327e-05, "loss": 2.1705, "step": 3100 }, { "epoch": 0.416720927204063, "grad_norm": 0.864648699760437, "learning_rate": 1.3298341130534323e-05, "loss": 2.1514, "step": 3200 }, { "epoch": 0.42974345617919, "grad_norm": 1.0018072128295898, "learning_rate": 1.2893551245725551e-05, "loss": 2.1069, "step": 3300 }, { "epoch": 0.442765985154317, "grad_norm": 0.9439816474914551, "learning_rate": 1.2483517268926188e-05, "loss": 2.1194, "step": 3400 }, { "epoch": 0.45578851412944393, "grad_norm": 0.8040792346000671, "learning_rate": 1.2068982320167986e-05, "loss": 2.1365, "step": 3500 }, { "epoch": 0.4688110431045709, "grad_norm": 1.0141490697860718, "learning_rate": 1.1650697676763833e-05, "loss": 2.1117, "step": 3600 }, { "epoch": 0.4818335720796979, "grad_norm": 1.0436880588531494, "learning_rate": 1.1229421411739574e-05, "loss": 2.1248, "step": 3700 }, { "epoch": 0.49485610105482486, "grad_norm": 1.00360906124115, "learning_rate": 1.0805917019949665e-05, "loss": 2.1349, "step": 3800 }, { "epoch": 0.5078786300299518, "grad_norm": 0.9457325339317322, "learning_rate": 1.0380952034366703e-05, "loss": 2.1253, "step": 3900 }, { "epoch": 0.5209011590050788, "grad_norm": 0.871446967124939, "learning_rate": 9.955296635052454e-06, "loss": 2.1322, "step": 4000 }, { "epoch": 0.5339236879802057, "grad_norm": 1.199268102645874, "learning_rate": 9.529722253331522e-06, "loss": 2.1386, "step": 4100 }, { "epoch": 0.5469462169553327, "grad_norm": 0.8790176510810852, "learning_rate": 9.105000173697276e-06, "loss": 2.1618, "step": 4200 }, { "epoch": 0.5599687459304596, "grad_norm": 1.089074969291687, "learning_rate": 8.681900135983885e-06, "loss": 2.132, "step": 4300 }, { "epoch": 0.5729912749055867, "grad_norm": 1.0114529132843018, "learning_rate": 8.26118894033779e-06, "loss": 2.1433, "step": 4400 }, { "epoch": 0.5860138038807137, "grad_norm": 1.2954638004302979, "learning_rate": 7.843629057516935e-06, "loss": 2.1213, "step": 4500 }, { "epoch": 0.5990363328558406, "grad_norm": 1.0429459810256958, "learning_rate": 7.429977247036231e-06, "loss": 2.0845, "step": 4600 }, { "epoch": 0.6120588618309676, "grad_norm": 0.9565938115119934, "learning_rate": 7.020983185663779e-06, "loss": 2.1291, "step": 4700 }, { "epoch": 0.6250813908060946, "grad_norm": 0.900711715221405, "learning_rate": 6.617388108753403e-06, "loss": 2.1065, "step": 4800 }, { "epoch": 0.6381039197812215, "grad_norm": 1.0982881784439087, "learning_rate": 6.219923466875894e-06, "loss": 2.1607, "step": 4900 }, { "epoch": 0.6511264487563485, "grad_norm": 1.1638540029525757, "learning_rate": 5.829309600183536e-06, "loss": 2.0958, "step": 5000 }, { "epoch": 0.6641489777314754, "grad_norm": 0.9292285442352295, "learning_rate": 5.446254432910526e-06, "loss": 2.1075, "step": 5100 }, { "epoch": 0.6771715067066024, "grad_norm": 0.8159144520759583, "learning_rate": 5.071452190375194e-06, "loss": 2.1218, "step": 5200 }, { "epoch": 0.6901940356817294, "grad_norm": 1.1008979082107544, "learning_rate": 4.705582140809275e-06, "loss": 2.1127, "step": 5300 }, { "epoch": 0.7032165646568563, "grad_norm": 0.885331928730011, "learning_rate": 4.349307364294512e-06, "loss": 2.1082, "step": 5400 }, { "epoch": 0.7162390936319833, "grad_norm": 0.9930873513221741, "learning_rate": 4.0032735510376055e-06, "loss": 2.1218, "step": 5500 }, { "epoch": 0.7292616226071102, "grad_norm": 0.9115188717842102, "learning_rate": 3.668107831161537e-06, "loss": 2.1659, "step": 5600 }, { "epoch": 0.7422841515822373, "grad_norm": 1.0631247758865356, "learning_rate": 3.344417638133999e-06, "loss": 2.1263, "step": 5700 }, { "epoch": 0.7553066805573643, "grad_norm": 1.2672806978225708, "learning_rate": 3.032789607892811e-06, "loss": 2.1333, "step": 5800 }, { "epoch": 0.7683292095324912, "grad_norm": 1.0414445400238037, "learning_rate": 2.733788515663528e-06, "loss": 2.1135, "step": 5900 }, { "epoch": 0.7813517385076182, "grad_norm": 1.0180047750473022, "learning_rate": 2.447956252395974e-06, "loss": 2.1743, "step": 6000 }, { "epoch": 0.7943742674827452, "grad_norm": 0.8480224609375, "learning_rate": 2.1758108426748847e-06, "loss": 2.1312, "step": 6100 }, { "epoch": 0.8073967964578721, "grad_norm": 1.0538519620895386, "learning_rate": 1.9178455058843938e-06, "loss": 2.0941, "step": 6200 }, { "epoch": 0.8204193254329991, "grad_norm": 0.8907907605171204, "learning_rate": 1.6745277623279766e-06, "loss": 2.1227, "step": 6300 }, { "epoch": 0.833441854408126, "grad_norm": 0.8479962348937988, "learning_rate": 1.446298585923771e-06, "loss": 2.1258, "step": 6400 }, { "epoch": 0.846464383383253, "grad_norm": 1.204111933708191, "learning_rate": 1.2335716050109182e-06, "loss": 2.1328, "step": 6500 }, { "epoch": 0.85948691235838, "grad_norm": 1.111570119857788, "learning_rate": 1.0367323527153462e-06, "loss": 2.1176, "step": 6600 }, { "epoch": 0.8725094413335069, "grad_norm": 0.7793114185333252, "learning_rate": 8.561375682335393e-07, "loss": 2.1142, "step": 6700 }, { "epoch": 0.885531970308634, "grad_norm": 0.8252727389335632, "learning_rate": 6.92114550300661e-07, "loss": 2.1273, "step": 6800 }, { "epoch": 0.8985544992837609, "grad_norm": 0.9007234573364258, "learning_rate": 5.449605640147038e-07, "loss": 2.1098, "step": 6900 }, { "epoch": 0.9115770282588879, "grad_norm": 0.9859702587127686, "learning_rate": 4.149423020917587e-07, "loss": 2.1, "step": 7000 }, { "epoch": 0.9245995572340149, "grad_norm": 1.1626622676849365, "learning_rate": 3.022954015287449e-07, "loss": 2.1123, "step": 7100 }, { "epoch": 0.9376220862091418, "grad_norm": 0.9756370186805725, "learning_rate": 2.0722401654960644e-07, "loss": 2.1186, "step": 7200 }, { "epoch": 0.9506446151842688, "grad_norm": 1.004597783088684, "learning_rate": 1.299004486089095e-07, "loss": 2.0744, "step": 7300 }, { "epoch": 0.9636671441593958, "grad_norm": 1.021781086921692, "learning_rate": 7.046483412342708e-08, "loss": 2.086, "step": 7400 }, { "epoch": 0.9766896731345227, "grad_norm": 0.9634168148040771, "learning_rate": 2.9024890497625356e-08, "loss": 2.1365, "step": 7500 }, { "epoch": 0.9897122021096497, "grad_norm": 1.1906155347824097, "learning_rate": 5.655720903351425e-09, "loss": 2.1136, "step": 7600 }, { "epoch": 1.0, "step": 7679, "total_flos": 1.3987105112064e+17, "train_loss": 2.1709037589008475, "train_runtime": 4004.8154, "train_samples_per_second": 3.835, "train_steps_per_second": 1.917 } ], "logging_steps": 100, "max_steps": 7679, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.3987105112064e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }