{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.997342781222321, "eval_steps": 500, "global_step": 1692, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.04, "grad_norm": 1.2252470254898071, "learning_rate": 4.411764705882353e-06, "loss": 2.746, "step": 25 }, { "epoch": 0.09, "grad_norm": 1.2458932399749756, "learning_rate": 8.823529411764707e-06, "loss": 2.7375, "step": 50 }, { "epoch": 0.13, "grad_norm": 0.9806848764419556, "learning_rate": 1.323529411764706e-05, "loss": 2.6088, "step": 75 }, { "epoch": 0.18, "grad_norm": 0.7579270601272583, "learning_rate": 1.7647058823529414e-05, "loss": 2.3883, "step": 100 }, { "epoch": 0.22, "grad_norm": 0.5973591804504395, "learning_rate": 2.2058823529411766e-05, "loss": 2.148, "step": 125 }, { "epoch": 0.27, "grad_norm": 0.4005584120750427, "learning_rate": 2.647058823529412e-05, "loss": 1.9697, "step": 150 }, { "epoch": 0.31, "grad_norm": 0.30321258306503296, "learning_rate": 2.990144546649146e-05, "loss": 1.8807, "step": 175 }, { "epoch": 0.35, "grad_norm": 0.33162617683410645, "learning_rate": 2.9408672798948752e-05, "loss": 1.788, "step": 200 }, { "epoch": 0.4, "grad_norm": 0.35974565148353577, "learning_rate": 2.8915900131406044e-05, "loss": 1.7825, "step": 225 }, { "epoch": 0.44, "grad_norm": 0.33461177349090576, "learning_rate": 2.842312746386334e-05, "loss": 1.7241, "step": 250 }, { "epoch": 0.49, "grad_norm": 0.31440624594688416, "learning_rate": 2.793035479632063e-05, "loss": 1.6708, "step": 275 }, { "epoch": 0.53, "grad_norm": 0.32463395595550537, "learning_rate": 2.7437582128777926e-05, "loss": 1.6098, "step": 300 }, { "epoch": 0.58, "grad_norm": 0.35138121247291565, "learning_rate": 2.6944809461235218e-05, "loss": 1.5992, "step": 325 }, { "epoch": 0.62, "grad_norm": 0.3213985562324524, "learning_rate": 2.645203679369251e-05, "loss": 1.5133, "step": 350 }, { "epoch": 0.66, "grad_norm": 0.30699512362480164, "learning_rate": 2.5959264126149805e-05, "loss": 1.4807, "step": 375 }, { "epoch": 0.71, "grad_norm": 0.3282407522201538, "learning_rate": 2.5466491458607097e-05, "loss": 1.4928, "step": 400 }, { "epoch": 0.75, "grad_norm": 0.36608022451400757, "learning_rate": 2.4973718791064392e-05, "loss": 1.4802, "step": 425 }, { "epoch": 0.8, "grad_norm": 0.38351428508758545, "learning_rate": 2.4480946123521684e-05, "loss": 1.4886, "step": 450 }, { "epoch": 0.84, "grad_norm": 0.3251235783100128, "learning_rate": 2.3988173455978975e-05, "loss": 1.4557, "step": 475 }, { "epoch": 0.89, "grad_norm": 0.3361181914806366, "learning_rate": 2.349540078843627e-05, "loss": 1.3857, "step": 500 }, { "epoch": 0.93, "grad_norm": 0.36659279465675354, "learning_rate": 2.3002628120893562e-05, "loss": 1.4257, "step": 525 }, { "epoch": 0.97, "grad_norm": 0.4663073420524597, "learning_rate": 2.2509855453350857e-05, "loss": 1.4516, "step": 550 }, { "epoch": 1.02, "grad_norm": 0.3494812846183777, "learning_rate": 2.201708278580815e-05, "loss": 1.3714, "step": 575 }, { "epoch": 1.06, "grad_norm": 0.3586205542087555, "learning_rate": 2.152431011826544e-05, "loss": 1.3569, "step": 600 }, { "epoch": 1.11, "grad_norm": 0.38619765639305115, "learning_rate": 2.1031537450722736e-05, "loss": 1.3331, "step": 625 }, { "epoch": 1.15, "grad_norm": 0.3586406111717224, "learning_rate": 2.0538764783180025e-05, "loss": 1.3446, "step": 650 }, { "epoch": 1.2, "grad_norm": 0.33255332708358765, "learning_rate": 2.004599211563732e-05, "loss": 1.3421, "step": 675 }, { "epoch": 1.24, "grad_norm": 0.362377792596817, "learning_rate": 1.955321944809461e-05, "loss": 1.3543, "step": 700 }, { "epoch": 1.28, "grad_norm": 0.43340814113616943, "learning_rate": 1.9060446780551903e-05, "loss": 1.3592, "step": 725 }, { "epoch": 1.33, "grad_norm": 0.43384671211242676, "learning_rate": 1.85676741130092e-05, "loss": 1.3393, "step": 750 }, { "epoch": 1.37, "grad_norm": 0.6346977353096008, "learning_rate": 1.807490144546649e-05, "loss": 1.3142, "step": 775 }, { "epoch": 1.42, "grad_norm": 0.37973758578300476, "learning_rate": 1.7582128777923785e-05, "loss": 1.3279, "step": 800 }, { "epoch": 1.46, "grad_norm": 0.3827611207962036, "learning_rate": 1.7089356110381077e-05, "loss": 1.3263, "step": 825 }, { "epoch": 1.51, "grad_norm": 0.4215668737888336, "learning_rate": 1.659658344283837e-05, "loss": 1.3353, "step": 850 }, { "epoch": 1.55, "grad_norm": 0.3658856153488159, "learning_rate": 1.6103810775295664e-05, "loss": 1.3144, "step": 875 }, { "epoch": 1.59, "grad_norm": 0.3511466383934021, "learning_rate": 1.5611038107752956e-05, "loss": 1.2621, "step": 900 }, { "epoch": 1.64, "grad_norm": 0.3667345941066742, "learning_rate": 1.511826544021025e-05, "loss": 1.2563, "step": 925 }, { "epoch": 1.68, "grad_norm": 0.40708670020103455, "learning_rate": 1.4625492772667543e-05, "loss": 1.2945, "step": 950 }, { "epoch": 1.73, "grad_norm": 0.42138373851776123, "learning_rate": 1.4132720105124836e-05, "loss": 1.3062, "step": 975 }, { "epoch": 1.77, "grad_norm": 0.35037311911582947, "learning_rate": 1.3639947437582128e-05, "loss": 1.2745, "step": 1000 }, { "epoch": 1.82, "grad_norm": 0.48709043860435486, "learning_rate": 1.3147174770039422e-05, "loss": 1.3324, "step": 1025 }, { "epoch": 1.86, "grad_norm": 0.38190776109695435, "learning_rate": 1.2654402102496715e-05, "loss": 1.2465, "step": 1050 }, { "epoch": 1.9, "grad_norm": 0.4355185329914093, "learning_rate": 1.2161629434954009e-05, "loss": 1.2555, "step": 1075 }, { "epoch": 1.95, "grad_norm": 0.41925591230392456, "learning_rate": 1.16688567674113e-05, "loss": 1.2977, "step": 1100 }, { "epoch": 1.99, "grad_norm": 0.3973592221736908, "learning_rate": 1.1176084099868594e-05, "loss": 1.2695, "step": 1125 }, { "epoch": 2.04, "grad_norm": 0.35202836990356445, "learning_rate": 1.0683311432325887e-05, "loss": 1.3174, "step": 1150 }, { "epoch": 2.08, "grad_norm": 0.40330058336257935, "learning_rate": 1.019053876478318e-05, "loss": 1.2708, "step": 1175 }, { "epoch": 2.13, "grad_norm": 0.37632250785827637, "learning_rate": 9.697766097240474e-06, "loss": 1.2483, "step": 1200 }, { "epoch": 2.17, "grad_norm": 0.3991636633872986, "learning_rate": 9.204993429697766e-06, "loss": 1.2635, "step": 1225 }, { "epoch": 2.21, "grad_norm": 0.400860071182251, "learning_rate": 8.71222076215506e-06, "loss": 1.2464, "step": 1250 }, { "epoch": 2.26, "grad_norm": 0.40742385387420654, "learning_rate": 8.219448094612353e-06, "loss": 1.2532, "step": 1275 }, { "epoch": 2.3, "grad_norm": 0.4327584207057953, "learning_rate": 7.726675427069646e-06, "loss": 1.2771, "step": 1300 }, { "epoch": 2.35, "grad_norm": 0.3787235915660858, "learning_rate": 7.233902759526938e-06, "loss": 1.2508, "step": 1325 }, { "epoch": 2.39, "grad_norm": 0.423367977142334, "learning_rate": 6.741130091984232e-06, "loss": 1.2382, "step": 1350 }, { "epoch": 2.44, "grad_norm": 0.3932174742221832, "learning_rate": 6.248357424441524e-06, "loss": 1.2759, "step": 1375 }, { "epoch": 2.48, "grad_norm": 0.40788665413856506, "learning_rate": 5.755584756898818e-06, "loss": 1.2471, "step": 1400 }, { "epoch": 2.52, "grad_norm": 0.456950843334198, "learning_rate": 5.26281208935611e-06, "loss": 1.2384, "step": 1425 }, { "epoch": 2.57, "grad_norm": 0.36899831891059875, "learning_rate": 4.770039421813404e-06, "loss": 1.1969, "step": 1450 }, { "epoch": 2.61, "grad_norm": 0.44063258171081543, "learning_rate": 4.277266754270697e-06, "loss": 1.2758, "step": 1475 }, { "epoch": 2.66, "grad_norm": 0.3583914339542389, "learning_rate": 3.784494086727989e-06, "loss": 1.2332, "step": 1500 }, { "epoch": 2.7, "grad_norm": 0.4087056517601013, "learning_rate": 3.2917214191852826e-06, "loss": 1.2646, "step": 1525 }, { "epoch": 2.75, "grad_norm": 0.44617870450019836, "learning_rate": 2.7989487516425756e-06, "loss": 1.2265, "step": 1550 }, { "epoch": 2.79, "grad_norm": 0.4349097013473511, "learning_rate": 2.3061760840998687e-06, "loss": 1.2976, "step": 1575 }, { "epoch": 2.83, "grad_norm": 0.4503318965435028, "learning_rate": 1.8134034165571617e-06, "loss": 1.2467, "step": 1600 }, { "epoch": 2.88, "grad_norm": 0.40000948309898376, "learning_rate": 1.3206307490144548e-06, "loss": 1.2146, "step": 1625 }, { "epoch": 2.92, "grad_norm": 0.39695021510124207, "learning_rate": 8.278580814717477e-07, "loss": 1.263, "step": 1650 }, { "epoch": 2.97, "grad_norm": 0.4137335419654846, "learning_rate": 3.3508541392904077e-07, "loss": 1.2436, "step": 1675 } ], "logging_steps": 25, "max_steps": 1692, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "total_flos": 7269909176254464.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }