{ "best_metric": null, "best_model_checkpoint": null, "epoch": 5.994334277620396, "eval_steps": 500, "global_step": 3174, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.05, "grad_norm": 1.3795244693756104, "learning_rate": 2.358490566037736e-06, "loss": 2.6953, "step": 25 }, { "epoch": 0.09, "grad_norm": 1.2704118490219116, "learning_rate": 4.716981132075472e-06, "loss": 2.7089, "step": 50 }, { "epoch": 0.14, "grad_norm": 1.1683423519134521, "learning_rate": 7.0754716981132075e-06, "loss": 2.6511, "step": 75 }, { "epoch": 0.19, "grad_norm": 1.2082363367080688, "learning_rate": 9.433962264150944e-06, "loss": 2.5193, "step": 100 }, { "epoch": 0.24, "grad_norm": 1.1215194463729858, "learning_rate": 1.179245283018868e-05, "loss": 2.306, "step": 125 }, { "epoch": 0.28, "grad_norm": 0.6498327255249023, "learning_rate": 1.4150943396226415e-05, "loss": 2.1279, "step": 150 }, { "epoch": 0.33, "grad_norm": 0.5128926038742065, "learning_rate": 1.650943396226415e-05, "loss": 1.979, "step": 175 }, { "epoch": 0.38, "grad_norm": 0.4196425676345825, "learning_rate": 1.8867924528301888e-05, "loss": 1.857, "step": 200 }, { "epoch": 0.42, "grad_norm": 0.39977598190307617, "learning_rate": 2.122641509433962e-05, "loss": 1.7905, "step": 225 }, { "epoch": 0.47, "grad_norm": 0.3468642830848694, "learning_rate": 2.358490566037736e-05, "loss": 1.7396, "step": 250 }, { "epoch": 0.52, "grad_norm": 0.3703348934650421, "learning_rate": 2.5943396226415097e-05, "loss": 1.6826, "step": 275 }, { "epoch": 0.57, "grad_norm": 0.3471335470676422, "learning_rate": 2.830188679245283e-05, "loss": 1.6333, "step": 300 }, { "epoch": 0.61, "grad_norm": 0.30928292870521545, "learning_rate": 2.9926470588235295e-05, "loss": 1.5934, "step": 325 }, { "epoch": 0.66, "grad_norm": 0.33998557925224304, "learning_rate": 2.966386554621849e-05, "loss": 1.5503, "step": 350 }, { "epoch": 0.71, "grad_norm": 0.3642776906490326, "learning_rate": 2.940126050420168e-05, "loss": 1.5243, "step": 375 }, { "epoch": 0.76, "grad_norm": 0.31012433767318726, "learning_rate": 2.9138655462184876e-05, "loss": 1.4618, "step": 400 }, { "epoch": 0.8, "grad_norm": 0.4258916974067688, "learning_rate": 2.8876050420168067e-05, "loss": 1.4161, "step": 425 }, { "epoch": 0.85, "grad_norm": 0.3025980591773987, "learning_rate": 2.8613445378151262e-05, "loss": 1.419, "step": 450 }, { "epoch": 0.9, "grad_norm": 0.3354116678237915, "learning_rate": 2.8350840336134453e-05, "loss": 1.3576, "step": 475 }, { "epoch": 0.94, "grad_norm": 0.3400489091873169, "learning_rate": 2.8088235294117648e-05, "loss": 1.3323, "step": 500 }, { "epoch": 0.99, "grad_norm": 0.315164715051651, "learning_rate": 2.7825630252100843e-05, "loss": 1.344, "step": 525 }, { "epoch": 1.04, "grad_norm": 0.3593141734600067, "learning_rate": 2.7563025210084034e-05, "loss": 1.3023, "step": 550 }, { "epoch": 1.09, "grad_norm": 0.4317137598991394, "learning_rate": 2.730042016806723e-05, "loss": 1.3028, "step": 575 }, { "epoch": 1.13, "grad_norm": 0.3506380617618561, "learning_rate": 2.703781512605042e-05, "loss": 1.3026, "step": 600 }, { "epoch": 1.18, "grad_norm": 0.33726122975349426, "learning_rate": 2.6775210084033615e-05, "loss": 1.319, "step": 625 }, { "epoch": 1.23, "grad_norm": 0.37094271183013916, "learning_rate": 2.6512605042016806e-05, "loss": 1.2583, "step": 650 }, { "epoch": 1.27, "grad_norm": 0.41374334692955017, "learning_rate": 2.625e-05, "loss": 1.2486, "step": 675 }, { "epoch": 1.32, "grad_norm": 0.46000728011131287, "learning_rate": 2.5987394957983196e-05, "loss": 1.2463, "step": 700 }, { "epoch": 1.37, "grad_norm": 0.3955087661743164, "learning_rate": 2.5724789915966387e-05, "loss": 1.2397, "step": 725 }, { "epoch": 1.42, "grad_norm": 0.4096736013889313, "learning_rate": 2.546218487394958e-05, "loss": 1.229, "step": 750 }, { "epoch": 1.46, "grad_norm": 0.3845139741897583, "learning_rate": 2.5199579831932773e-05, "loss": 1.2314, "step": 775 }, { "epoch": 1.51, "grad_norm": 0.4077882170677185, "learning_rate": 2.4936974789915968e-05, "loss": 1.2219, "step": 800 }, { "epoch": 1.56, "grad_norm": 0.36021721363067627, "learning_rate": 2.467436974789916e-05, "loss": 1.234, "step": 825 }, { "epoch": 1.61, "grad_norm": 0.36913222074508667, "learning_rate": 2.4411764705882354e-05, "loss": 1.1998, "step": 850 }, { "epoch": 1.65, "grad_norm": 0.35471582412719727, "learning_rate": 2.414915966386555e-05, "loss": 1.1988, "step": 875 }, { "epoch": 1.7, "grad_norm": 0.3558790683746338, "learning_rate": 2.3886554621848737e-05, "loss": 1.2106, "step": 900 }, { "epoch": 1.75, "grad_norm": 0.36467084288597107, "learning_rate": 2.362394957983193e-05, "loss": 1.1717, "step": 925 }, { "epoch": 1.79, "grad_norm": 0.381874680519104, "learning_rate": 2.3361344537815126e-05, "loss": 1.1896, "step": 950 }, { "epoch": 1.84, "grad_norm": 0.3758748769760132, "learning_rate": 2.309873949579832e-05, "loss": 1.1712, "step": 975 }, { "epoch": 1.89, "grad_norm": 0.35793235898017883, "learning_rate": 2.2836134453781513e-05, "loss": 1.1389, "step": 1000 }, { "epoch": 1.94, "grad_norm": 0.44111478328704834, "learning_rate": 2.2573529411764707e-05, "loss": 1.1726, "step": 1025 }, { "epoch": 1.98, "grad_norm": 0.3741939663887024, "learning_rate": 2.2310924369747902e-05, "loss": 1.1607, "step": 1050 }, { "epoch": 2.03, "grad_norm": 0.3894720673561096, "learning_rate": 2.2048319327731093e-05, "loss": 1.2186, "step": 1075 }, { "epoch": 2.08, "grad_norm": 0.3636987805366516, "learning_rate": 2.1785714285714285e-05, "loss": 1.1376, "step": 1100 }, { "epoch": 2.12, "grad_norm": 0.42893752455711365, "learning_rate": 2.152310924369748e-05, "loss": 1.158, "step": 1125 }, { "epoch": 2.17, "grad_norm": 0.3795158863067627, "learning_rate": 2.1260504201680674e-05, "loss": 1.1574, "step": 1150 }, { "epoch": 2.22, "grad_norm": 0.36902275681495667, "learning_rate": 2.0997899159663866e-05, "loss": 1.1523, "step": 1175 }, { "epoch": 2.27, "grad_norm": 0.431219220161438, "learning_rate": 2.073529411764706e-05, "loss": 1.1433, "step": 1200 }, { "epoch": 2.31, "grad_norm": 0.4199659824371338, "learning_rate": 2.0472689075630252e-05, "loss": 1.1481, "step": 1225 }, { "epoch": 2.36, "grad_norm": 0.6324878334999084, "learning_rate": 2.0210084033613447e-05, "loss": 1.1526, "step": 1250 }, { "epoch": 2.41, "grad_norm": 0.523536205291748, "learning_rate": 1.9947478991596638e-05, "loss": 1.1216, "step": 1275 }, { "epoch": 2.46, "grad_norm": 0.5140235424041748, "learning_rate": 1.9684873949579833e-05, "loss": 1.1539, "step": 1300 }, { "epoch": 2.5, "grad_norm": 0.3695720136165619, "learning_rate": 1.9422268907563027e-05, "loss": 1.1666, "step": 1325 }, { "epoch": 2.55, "grad_norm": 0.4080689251422882, "learning_rate": 1.915966386554622e-05, "loss": 1.1037, "step": 1350 }, { "epoch": 2.6, "grad_norm": 0.35790908336639404, "learning_rate": 1.889705882352941e-05, "loss": 1.136, "step": 1375 }, { "epoch": 2.64, "grad_norm": 0.42846861481666565, "learning_rate": 1.8634453781512605e-05, "loss": 1.1325, "step": 1400 }, { "epoch": 2.69, "grad_norm": 0.37662366032600403, "learning_rate": 1.83718487394958e-05, "loss": 1.1439, "step": 1425 }, { "epoch": 2.74, "grad_norm": 0.4963545501232147, "learning_rate": 1.810924369747899e-05, "loss": 1.1701, "step": 1450 }, { "epoch": 2.79, "grad_norm": 0.4511197507381439, "learning_rate": 1.7846638655462186e-05, "loss": 1.1338, "step": 1475 }, { "epoch": 2.83, "grad_norm": 0.44771987199783325, "learning_rate": 1.758403361344538e-05, "loss": 1.1021, "step": 1500 }, { "epoch": 2.88, "grad_norm": 0.4158724248409271, "learning_rate": 1.7321428571428572e-05, "loss": 1.094, "step": 1525 }, { "epoch": 2.93, "grad_norm": 0.43490564823150635, "learning_rate": 1.7058823529411763e-05, "loss": 1.1154, "step": 1550 }, { "epoch": 2.97, "grad_norm": 0.4746383726596832, "learning_rate": 1.6796218487394958e-05, "loss": 1.1311, "step": 1575 }, { "epoch": 3.02, "grad_norm": 0.4157463312149048, "learning_rate": 1.6533613445378153e-05, "loss": 1.1202, "step": 1600 }, { "epoch": 3.07, "grad_norm": 0.38272300362586975, "learning_rate": 1.6271008403361344e-05, "loss": 1.1173, "step": 1625 }, { "epoch": 3.12, "grad_norm": 0.5032052397727966, "learning_rate": 1.600840336134454e-05, "loss": 1.1313, "step": 1650 }, { "epoch": 3.16, "grad_norm": 0.3842039704322815, "learning_rate": 1.5745798319327734e-05, "loss": 1.0984, "step": 1675 }, { "epoch": 3.21, "grad_norm": 0.43160513043403625, "learning_rate": 1.5483193277310925e-05, "loss": 1.1108, "step": 1700 }, { "epoch": 3.26, "grad_norm": 0.420173704624176, "learning_rate": 1.5220588235294118e-05, "loss": 1.144, "step": 1725 }, { "epoch": 3.31, "grad_norm": 0.43490853905677795, "learning_rate": 1.4957983193277311e-05, "loss": 1.0752, "step": 1750 }, { "epoch": 3.35, "grad_norm": 0.45708540081977844, "learning_rate": 1.4695378151260504e-05, "loss": 1.1447, "step": 1775 }, { "epoch": 3.4, "grad_norm": 0.417322039604187, "learning_rate": 1.4432773109243699e-05, "loss": 1.102, "step": 1800 }, { "epoch": 3.45, "grad_norm": 0.4371644854545593, "learning_rate": 1.417016806722689e-05, "loss": 1.1473, "step": 1825 }, { "epoch": 3.49, "grad_norm": 0.4273310899734497, "learning_rate": 1.3907563025210085e-05, "loss": 1.0967, "step": 1850 }, { "epoch": 3.54, "grad_norm": 0.5089781880378723, "learning_rate": 1.3644957983193278e-05, "loss": 1.1297, "step": 1875 }, { "epoch": 3.59, "grad_norm": 0.48617228865623474, "learning_rate": 1.3382352941176471e-05, "loss": 1.0955, "step": 1900 }, { "epoch": 3.64, "grad_norm": 0.4370473623275757, "learning_rate": 1.3119747899159664e-05, "loss": 1.0791, "step": 1925 }, { "epoch": 3.68, "grad_norm": 0.4495941400527954, "learning_rate": 1.2857142857142857e-05, "loss": 1.0648, "step": 1950 }, { "epoch": 3.73, "grad_norm": 0.4138700067996979, "learning_rate": 1.259453781512605e-05, "loss": 1.0948, "step": 1975 }, { "epoch": 3.78, "grad_norm": 0.4161551296710968, "learning_rate": 1.2331932773109243e-05, "loss": 1.0947, "step": 2000 }, { "epoch": 3.82, "grad_norm": 0.3938988745212555, "learning_rate": 1.2069327731092438e-05, "loss": 1.0863, "step": 2025 }, { "epoch": 3.87, "grad_norm": 0.44733569025993347, "learning_rate": 1.180672268907563e-05, "loss": 1.1015, "step": 2050 }, { "epoch": 3.92, "grad_norm": 0.4151917099952698, "learning_rate": 1.1544117647058824e-05, "loss": 1.0817, "step": 2075 }, { "epoch": 3.97, "grad_norm": 0.45207536220550537, "learning_rate": 1.1281512605042017e-05, "loss": 1.0935, "step": 2100 }, { "epoch": 4.01, "grad_norm": 0.43334582448005676, "learning_rate": 1.1018907563025212e-05, "loss": 1.0843, "step": 2125 }, { "epoch": 4.06, "grad_norm": 0.44301116466522217, "learning_rate": 1.0756302521008403e-05, "loss": 1.0617, "step": 2150 }, { "epoch": 4.11, "grad_norm": 0.42584851384162903, "learning_rate": 1.0493697478991596e-05, "loss": 1.102, "step": 2175 }, { "epoch": 4.15, "grad_norm": 0.46070751547813416, "learning_rate": 1.0231092436974791e-05, "loss": 1.0943, "step": 2200 }, { "epoch": 4.2, "grad_norm": 0.43757393956184387, "learning_rate": 9.968487394957983e-06, "loss": 1.082, "step": 2225 }, { "epoch": 4.25, "grad_norm": 0.43552663922309875, "learning_rate": 9.705882352941177e-06, "loss": 1.1033, "step": 2250 }, { "epoch": 4.3, "grad_norm": 0.44868725538253784, "learning_rate": 9.44327731092437e-06, "loss": 1.0912, "step": 2275 }, { "epoch": 4.34, "grad_norm": 0.43542513251304626, "learning_rate": 9.180672268907563e-06, "loss": 1.1113, "step": 2300 }, { "epoch": 4.39, "grad_norm": 0.47481635212898254, "learning_rate": 8.918067226890756e-06, "loss": 1.0455, "step": 2325 }, { "epoch": 4.44, "grad_norm": 0.46137455105781555, "learning_rate": 8.65546218487395e-06, "loss": 1.0898, "step": 2350 }, { "epoch": 4.49, "grad_norm": 0.4473894536495209, "learning_rate": 8.392857142857143e-06, "loss": 1.0836, "step": 2375 }, { "epoch": 4.53, "grad_norm": 0.39784467220306396, "learning_rate": 8.130252100840336e-06, "loss": 1.0629, "step": 2400 }, { "epoch": 4.58, "grad_norm": 0.48481184244155884, "learning_rate": 7.86764705882353e-06, "loss": 1.1173, "step": 2425 }, { "epoch": 4.63, "grad_norm": 0.485196590423584, "learning_rate": 7.605042016806723e-06, "loss": 1.0673, "step": 2450 }, { "epoch": 4.67, "grad_norm": 0.5114961266517639, "learning_rate": 7.342436974789916e-06, "loss": 1.0877, "step": 2475 }, { "epoch": 4.72, "grad_norm": 0.4506637752056122, "learning_rate": 7.07983193277311e-06, "loss": 1.0995, "step": 2500 }, { "epoch": 4.77, "grad_norm": 0.45109784603118896, "learning_rate": 6.817226890756303e-06, "loss": 1.0819, "step": 2525 }, { "epoch": 4.82, "grad_norm": 0.4272564947605133, "learning_rate": 6.554621848739496e-06, "loss": 1.1109, "step": 2550 }, { "epoch": 4.86, "grad_norm": 0.4301404058933258, "learning_rate": 6.29201680672269e-06, "loss": 1.0738, "step": 2575 }, { "epoch": 4.91, "grad_norm": 0.49940961599349976, "learning_rate": 6.029411764705883e-06, "loss": 1.0865, "step": 2600 }, { "epoch": 4.96, "grad_norm": 0.41319113969802856, "learning_rate": 5.7773109243697485e-06, "loss": 1.0535, "step": 2625 }, { "epoch": 5.0, "grad_norm": 0.4326096773147583, "learning_rate": 5.5147058823529415e-06, "loss": 1.0745, "step": 2650 }, { "epoch": 5.05, "grad_norm": 0.4360290765762329, "learning_rate": 5.252100840336135e-06, "loss": 1.0745, "step": 2675 }, { "epoch": 5.1, "grad_norm": 0.42354682087898254, "learning_rate": 4.989495798319328e-06, "loss": 1.0685, "step": 2700 }, { "epoch": 5.15, "grad_norm": 0.49250248074531555, "learning_rate": 4.726890756302521e-06, "loss": 1.0841, "step": 2725 }, { "epoch": 5.19, "grad_norm": 0.4505230784416199, "learning_rate": 4.464285714285715e-06, "loss": 1.0935, "step": 2750 }, { "epoch": 5.24, "grad_norm": 0.41872066259384155, "learning_rate": 4.201680672268908e-06, "loss": 1.0827, "step": 2775 }, { "epoch": 5.29, "grad_norm": 0.45635831356048584, "learning_rate": 3.939075630252101e-06, "loss": 1.0973, "step": 2800 }, { "epoch": 5.34, "grad_norm": 0.49893826246261597, "learning_rate": 3.6764705882352942e-06, "loss": 1.0859, "step": 2825 }, { "epoch": 5.38, "grad_norm": 0.5377572774887085, "learning_rate": 3.4138655462184873e-06, "loss": 1.088, "step": 2850 }, { "epoch": 5.43, "grad_norm": 0.45102909207344055, "learning_rate": 3.1512605042016808e-06, "loss": 1.0875, "step": 2875 }, { "epoch": 5.48, "grad_norm": 0.3922051191329956, "learning_rate": 2.8886554621848742e-06, "loss": 1.0708, "step": 2900 }, { "epoch": 5.52, "grad_norm": 0.4416084289550781, "learning_rate": 2.6260504201680673e-06, "loss": 1.0816, "step": 2925 }, { "epoch": 5.57, "grad_norm": 0.5171985626220703, "learning_rate": 2.3634453781512604e-06, "loss": 1.0859, "step": 2950 }, { "epoch": 5.62, "grad_norm": 0.4239521920681, "learning_rate": 2.100840336134454e-06, "loss": 1.0387, "step": 2975 }, { "epoch": 5.67, "grad_norm": 0.5627429485321045, "learning_rate": 1.8382352941176471e-06, "loss": 1.0818, "step": 3000 }, { "epoch": 5.71, "grad_norm": 0.4605351686477661, "learning_rate": 1.5756302521008404e-06, "loss": 1.0637, "step": 3025 }, { "epoch": 5.76, "grad_norm": 0.40121838450431824, "learning_rate": 1.3130252100840336e-06, "loss": 1.039, "step": 3050 }, { "epoch": 5.81, "grad_norm": 0.45940887928009033, "learning_rate": 1.050420168067227e-06, "loss": 1.0434, "step": 3075 }, { "epoch": 5.85, "grad_norm": 0.4496408998966217, "learning_rate": 7.878151260504202e-07, "loss": 1.1024, "step": 3100 }, { "epoch": 5.9, "grad_norm": 0.4458378553390503, "learning_rate": 5.252100840336135e-07, "loss": 1.0948, "step": 3125 }, { "epoch": 5.95, "grad_norm": 0.49208617210388184, "learning_rate": 2.6260504201680673e-07, "loss": 1.0673, "step": 3150 } ], "logging_steps": 25, "max_steps": 3174, "num_input_tokens_seen": 0, "num_train_epochs": 6, "save_steps": 500, "total_flos": 1.3634839262527488e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }