{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 1548, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03, "grad_norm": 3.802351951599121, "learning_rate": 3.2258064516129034e-05, "loss": 3.7331, "step": 25 }, { "epoch": 0.06, "grad_norm": 3.442800760269165, "learning_rate": 6.451612903225807e-05, "loss": 2.9581, "step": 50 }, { "epoch": 0.1, "grad_norm": 3.616957902908325, "learning_rate": 9.677419354838711e-05, "loss": 2.4309, "step": 75 }, { "epoch": 0.13, "grad_norm": 2.3956120014190674, "learning_rate": 0.00012903225806451613, "loss": 2.2667, "step": 100 }, { "epoch": 0.16, "grad_norm": 2.698212146759033, "learning_rate": 0.00016129032258064516, "loss": 2.0782, "step": 125 }, { "epoch": 0.19, "grad_norm": 2.7342419624328613, "learning_rate": 0.00019354838709677422, "loss": 2.0691, "step": 150 }, { "epoch": 0.23, "grad_norm": 2.039422035217285, "learning_rate": 0.00019712849964106247, "loss": 2.0865, "step": 175 }, { "epoch": 0.26, "grad_norm": 2.0298664569854736, "learning_rate": 0.00019353912419239053, "loss": 1.9552, "step": 200 }, { "epoch": 0.29, "grad_norm": 1.8978090286254883, "learning_rate": 0.0001899497487437186, "loss": 2.0111, "step": 225 }, { "epoch": 0.32, "grad_norm": 1.9588110446929932, "learning_rate": 0.00018636037329504667, "loss": 1.961, "step": 250 }, { "epoch": 0.36, "grad_norm": 2.160839796066284, "learning_rate": 0.00018277099784637474, "loss": 2.0133, "step": 275 }, { "epoch": 0.39, "grad_norm": 2.0899839401245117, "learning_rate": 0.00017918162239770278, "loss": 1.958, "step": 300 }, { "epoch": 0.42, "grad_norm": 2.029879570007324, "learning_rate": 0.00017559224694903088, "loss": 1.9116, "step": 325 }, { "epoch": 0.45, "grad_norm": 1.7021583318710327, "learning_rate": 0.00017200287150035895, "loss": 1.8842, "step": 350 }, { "epoch": 0.48, "grad_norm": 1.5238200426101685, "learning_rate": 0.00016841349605168702, "loss": 1.8229, "step": 375 }, { "epoch": 0.52, "grad_norm": 1.9168670177459717, "learning_rate": 0.0001648241206030151, "loss": 1.9235, "step": 400 }, { "epoch": 0.55, "grad_norm": 1.8892511129379272, "learning_rate": 0.00016123474515434316, "loss": 1.8859, "step": 425 }, { "epoch": 0.58, "grad_norm": 1.7955598831176758, "learning_rate": 0.00015764536970567123, "loss": 1.8619, "step": 450 }, { "epoch": 0.61, "grad_norm": 1.7658586502075195, "learning_rate": 0.00015405599425699927, "loss": 1.8235, "step": 475 }, { "epoch": 0.65, "grad_norm": 1.598111629486084, "learning_rate": 0.00015046661880832737, "loss": 1.8554, "step": 500 }, { "epoch": 0.68, "grad_norm": 1.9352959394454956, "learning_rate": 0.00014687724335965544, "loss": 1.801, "step": 525 }, { "epoch": 0.71, "grad_norm": 1.9746991395950317, "learning_rate": 0.0001432878679109835, "loss": 1.895, "step": 550 }, { "epoch": 0.74, "grad_norm": 1.476921796798706, "learning_rate": 0.00013969849246231157, "loss": 1.7978, "step": 575 }, { "epoch": 0.78, "grad_norm": 1.667672872543335, "learning_rate": 0.00013610911701363964, "loss": 1.76, "step": 600 }, { "epoch": 0.81, "grad_norm": 1.5855716466903687, "learning_rate": 0.00013251974156496769, "loss": 1.8171, "step": 625 }, { "epoch": 0.84, "grad_norm": 1.9429579973220825, "learning_rate": 0.00012893036611629576, "loss": 1.811, "step": 650 }, { "epoch": 0.87, "grad_norm": 1.684810996055603, "learning_rate": 0.00012534099066762382, "loss": 1.8148, "step": 675 }, { "epoch": 0.9, "grad_norm": 1.6180702447891235, "learning_rate": 
0.00012175161521895191, "loss": 1.7356, "step": 700 }, { "epoch": 0.94, "grad_norm": 1.7795788049697876, "learning_rate": 0.00011816223977027998, "loss": 1.7969, "step": 725 }, { "epoch": 0.97, "grad_norm": 1.7645013332366943, "learning_rate": 0.00011457286432160806, "loss": 1.8198, "step": 750 }, { "epoch": 1.0, "grad_norm": 1.713512897491455, "learning_rate": 0.00011098348887293613, "loss": 1.7514, "step": 775 }, { "epoch": 1.03, "grad_norm": 1.3435418605804443, "learning_rate": 0.00010739411342426417, "loss": 1.6679, "step": 800 }, { "epoch": 1.07, "grad_norm": 1.5819846391677856, "learning_rate": 0.00010380473797559225, "loss": 1.7182, "step": 825 }, { "epoch": 1.1, "grad_norm": 1.3324297666549683, "learning_rate": 0.00010021536252692032, "loss": 1.7301, "step": 850 }, { "epoch": 1.13, "grad_norm": 1.4800920486450195, "learning_rate": 9.662598707824839e-05, "loss": 1.7123, "step": 875 }, { "epoch": 1.16, "grad_norm": 1.5812880992889404, "learning_rate": 9.303661162957645e-05, "loss": 1.7816, "step": 900 }, { "epoch": 1.2, "grad_norm": 1.5376070737838745, "learning_rate": 8.944723618090453e-05, "loss": 1.7446, "step": 925 }, { "epoch": 1.23, "grad_norm": 1.5884149074554443, "learning_rate": 8.58578607322326e-05, "loss": 1.6483, "step": 950 }, { "epoch": 1.26, "grad_norm": 2.03753662109375, "learning_rate": 8.226848528356066e-05, "loss": 1.6695, "step": 975 }, { "epoch": 1.29, "grad_norm": 1.7924336194992065, "learning_rate": 7.867910983488873e-05, "loss": 1.7109, "step": 1000 }, { "epoch": 1.32, "grad_norm": 1.80973219871521, "learning_rate": 7.508973438621681e-05, "loss": 1.7017, "step": 1025 }, { "epoch": 1.36, "grad_norm": 1.7763901948928833, "learning_rate": 7.150035893754488e-05, "loss": 1.6669, "step": 1050 }, { "epoch": 1.39, "grad_norm": 3.0561070442199707, "learning_rate": 6.791098348887293e-05, "loss": 1.6681, "step": 1075 }, { "epoch": 1.42, "grad_norm": 1.7292817831039429, "learning_rate": 6.4321608040201e-05, "loss": 1.6787, "step": 1100 }, { "epoch": 1.45, "grad_norm": 1.6157281398773193, "learning_rate": 6.073223259152908e-05, "loss": 1.6716, "step": 1125 }, { "epoch": 1.49, "grad_norm": 1.5873336791992188, "learning_rate": 5.714285714285714e-05, "loss": 1.6724, "step": 1150 }, { "epoch": 1.52, "grad_norm": 1.7574703693389893, "learning_rate": 5.355348169418522e-05, "loss": 1.6608, "step": 1175 }, { "epoch": 1.55, "grad_norm": 1.7600945234298706, "learning_rate": 4.996410624551328e-05, "loss": 1.7204, "step": 1200 }, { "epoch": 1.58, "grad_norm": 1.4515677690505981, "learning_rate": 4.6374730796841356e-05, "loss": 1.6755, "step": 1225 }, { "epoch": 1.61, "grad_norm": 1.9057530164718628, "learning_rate": 4.278535534816942e-05, "loss": 1.6953, "step": 1250 }, { "epoch": 1.65, "grad_norm": 1.7482775449752808, "learning_rate": 3.919597989949749e-05, "loss": 1.686, "step": 1275 }, { "epoch": 1.68, "grad_norm": 1.8991056680679321, "learning_rate": 3.560660445082556e-05, "loss": 1.6611, "step": 1300 }, { "epoch": 1.71, "grad_norm": 1.7882860898971558, "learning_rate": 3.201722900215363e-05, "loss": 1.7103, "step": 1325 }, { "epoch": 1.74, "grad_norm": 1.6628391742706299, "learning_rate": 2.84278535534817e-05, "loss": 1.6925, "step": 1350 }, { "epoch": 1.78, "grad_norm": 1.5516228675842285, "learning_rate": 2.4838478104809766e-05, "loss": 1.652, "step": 1375 }, { "epoch": 1.81, "grad_norm": 1.5719884634017944, "learning_rate": 2.1249102656137835e-05, "loss": 1.6383, "step": 1400 }, { "epoch": 1.84, "grad_norm": 1.8628472089767456, "learning_rate": 1.76597272074659e-05, "loss": 
1.6922, "step": 1425 }, { "epoch": 1.87, "grad_norm": 1.7590694427490234, "learning_rate": 1.407035175879397e-05, "loss": 1.687, "step": 1450 }, { "epoch": 1.91, "grad_norm": 1.6741101741790771, "learning_rate": 1.048097631012204e-05, "loss": 1.6599, "step": 1475 }, { "epoch": 1.94, "grad_norm": 1.7378610372543335, "learning_rate": 6.891600861450108e-06, "loss": 1.6765, "step": 1500 }, { "epoch": 1.97, "grad_norm": 1.5435367822647095, "learning_rate": 3.3022254127781766e-06, "loss": 1.607, "step": 1525 } ], "logging_steps": 25, "max_steps": 1548, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "total_flos": 1.0151913205845197e+17, "train_batch_size": 6, "trial_name": null, "trial_params": null }