{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.673469387755102, "eval_steps": 500, "global_step": 90, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.04, "learning_rate": 6.666666666666667e-05, "loss": 3.1608, "step": 1 }, { "epoch": 0.08, "learning_rate": 0.00013333333333333334, "loss": 3.7941, "step": 2 }, { "epoch": 0.12, "learning_rate": 0.0002, "loss": 3.2615, "step": 3 }, { "epoch": 0.16, "learning_rate": 0.00019784946236559142, "loss": 2.493, "step": 4 }, { "epoch": 0.2, "learning_rate": 0.0001956989247311828, "loss": 2.478, "step": 5 }, { "epoch": 0.24, "learning_rate": 0.00019354838709677422, "loss": 2.4586, "step": 6 }, { "epoch": 0.29, "learning_rate": 0.0001913978494623656, "loss": 1.8358, "step": 7 }, { "epoch": 0.33, "learning_rate": 0.000189247311827957, "loss": 1.8871, "step": 8 }, { "epoch": 0.37, "learning_rate": 0.0001870967741935484, "loss": 1.8396, "step": 9 }, { "epoch": 0.41, "learning_rate": 0.00018494623655913978, "loss": 1.9403, "step": 10 }, { "epoch": 0.45, "learning_rate": 0.0001827956989247312, "loss": 1.9696, "step": 11 }, { "epoch": 0.49, "learning_rate": 0.00018064516129032257, "loss": 2.0119, "step": 12 }, { "epoch": 0.53, "learning_rate": 0.00017849462365591398, "loss": 1.7039, "step": 13 }, { "epoch": 0.57, "learning_rate": 0.0001763440860215054, "loss": 1.5358, "step": 14 }, { "epoch": 0.61, "learning_rate": 0.00017419354838709678, "loss": 1.4563, "step": 15 }, { "epoch": 0.65, "learning_rate": 0.0001720430107526882, "loss": 1.4858, "step": 16 }, { "epoch": 0.69, "learning_rate": 0.00016989247311827957, "loss": 1.6904, "step": 17 }, { "epoch": 0.73, "learning_rate": 0.00016774193548387098, "loss": 1.9223, "step": 18 }, { "epoch": 0.78, "learning_rate": 0.0001655913978494624, "loss": 1.4318, "step": 19 }, { "epoch": 0.82, "learning_rate": 0.00016344086021505378, "loss": 1.3245, "step": 20 }, { "epoch": 0.86, "learning_rate": 0.00016129032258064516, "loss": 1.2831, "step": 21 }, { "epoch": 0.9, "learning_rate": 0.00015913978494623657, "loss": 1.4268, "step": 22 }, { "epoch": 0.94, "learning_rate": 0.00015698924731182796, "loss": 1.4465, "step": 23 }, { "epoch": 0.98, "learning_rate": 0.00015483870967741937, "loss": 1.5686, "step": 24 }, { "epoch": 1.02, "learning_rate": 0.00015268817204301075, "loss": 1.3262, "step": 25 }, { "epoch": 1.06, "learning_rate": 0.00015053763440860216, "loss": 1.1648, "step": 26 }, { "epoch": 1.1, "learning_rate": 0.00014838709677419355, "loss": 1.1263, "step": 27 }, { "epoch": 1.14, "learning_rate": 0.00014623655913978496, "loss": 1.119, "step": 28 }, { "epoch": 1.18, "learning_rate": 0.00014408602150537637, "loss": 1.1306, "step": 29 }, { "epoch": 1.22, "learning_rate": 0.00014193548387096775, "loss": 1.2271, "step": 30 }, { "epoch": 1.27, "learning_rate": 0.00013978494623655916, "loss": 1.269, "step": 31 }, { "epoch": 1.31, "learning_rate": 0.00013763440860215055, "loss": 0.9227, "step": 32 }, { "epoch": 1.35, "learning_rate": 0.00013548387096774193, "loss": 0.9733, "step": 33 }, { "epoch": 1.39, "learning_rate": 0.00013333333333333334, "loss": 0.8932, "step": 34 }, { "epoch": 1.43, "learning_rate": 0.00013118279569892472, "loss": 0.9639, "step": 35 }, { "epoch": 1.47, "learning_rate": 0.00012903225806451613, "loss": 1.0789, "step": 36 }, { "epoch": 1.51, "learning_rate": 0.00012688172043010752, "loss": 1.2016, "step": 37 }, { "epoch": 1.55, "learning_rate": 0.00012473118279569893, "loss": 0.8486, "step": 38 }, { "epoch": 1.59, "learning_rate": 0.00012258064516129034, "loss": 0.8141, "step": 39 }, { "epoch": 1.63, "learning_rate": 0.00012043010752688172, "loss": 0.8986, "step": 40 }, { "epoch": 1.67, "learning_rate": 0.00011827956989247313, "loss": 0.9075, "step": 41 }, { "epoch": 1.71, "learning_rate": 0.00011612903225806453, "loss": 0.9939, "step": 42 }, { "epoch": 1.76, "learning_rate": 0.00011397849462365593, "loss": 1.1734, "step": 43 }, { "epoch": 1.8, "learning_rate": 0.00011182795698924731, "loss": 0.7703, "step": 44 }, { "epoch": 1.84, "learning_rate": 0.00010967741935483871, "loss": 0.7249, "step": 45 }, { "epoch": 1.88, "learning_rate": 0.00010752688172043011, "loss": 0.6638, "step": 46 }, { "epoch": 1.92, "learning_rate": 0.0001053763440860215, "loss": 0.754, "step": 47 }, { "epoch": 1.96, "learning_rate": 0.0001032258064516129, "loss": 1.1539, "step": 48 }, { "epoch": 2.0, "learning_rate": 0.0001010752688172043, "loss": 0.8483, "step": 49 }, { "epoch": 2.04, "learning_rate": 9.892473118279571e-05, "loss": 0.8251, "step": 50 }, { "epoch": 2.08, "learning_rate": 9.677419354838711e-05, "loss": 0.5105, "step": 51 }, { "epoch": 2.12, "learning_rate": 9.46236559139785e-05, "loss": 0.4452, "step": 52 }, { "epoch": 2.16, "learning_rate": 9.247311827956989e-05, "loss": 0.5524, "step": 53 }, { "epoch": 2.2, "learning_rate": 9.032258064516129e-05, "loss": 0.6445, "step": 54 }, { "epoch": 2.24, "learning_rate": 8.81720430107527e-05, "loss": 0.6312, "step": 55 }, { "epoch": 2.29, "learning_rate": 8.60215053763441e-05, "loss": 0.8308, "step": 56 }, { "epoch": 2.33, "learning_rate": 8.387096774193549e-05, "loss": 0.4779, "step": 57 }, { "epoch": 2.37, "learning_rate": 8.172043010752689e-05, "loss": 0.4451, "step": 58 }, { "epoch": 2.41, "learning_rate": 7.956989247311829e-05, "loss": 0.4612, "step": 59 }, { "epoch": 2.45, "learning_rate": 7.741935483870968e-05, "loss": 0.5381, "step": 60 }, { "epoch": 2.49, "learning_rate": 7.526881720430108e-05, "loss": 0.6287, "step": 61 }, { "epoch": 2.53, "learning_rate": 7.311827956989248e-05, "loss": 0.6218, "step": 62 }, { "epoch": 2.57, "learning_rate": 7.096774193548388e-05, "loss": 0.44, "step": 63 }, { "epoch": 2.61, "learning_rate": 6.881720430107527e-05, "loss": 0.453, "step": 64 }, { "epoch": 2.65, "learning_rate": 6.666666666666667e-05, "loss": 0.4534, "step": 65 }, { "epoch": 2.69, "learning_rate": 6.451612903225807e-05, "loss": 0.5502, "step": 66 }, { "epoch": 2.73, "learning_rate": 6.236559139784946e-05, "loss": 0.5771, "step": 67 }, { "epoch": 2.78, "learning_rate": 6.021505376344086e-05, "loss": 0.5361, "step": 68 }, { "epoch": 2.82, "learning_rate": 5.8064516129032266e-05, "loss": 0.4565, "step": 69 }, { "epoch": 2.86, "learning_rate": 5.5913978494623656e-05, "loss": 0.4856, "step": 70 }, { "epoch": 2.9, "learning_rate": 5.3763440860215054e-05, "loss": 0.4482, "step": 71 }, { "epoch": 2.94, "learning_rate": 5.161290322580645e-05, "loss": 0.545, "step": 72 }, { "epoch": 2.98, "learning_rate": 4.9462365591397855e-05, "loss": 0.59, "step": 73 }, { "epoch": 3.02, "learning_rate": 4.731182795698925e-05, "loss": 0.6918, "step": 74 }, { "epoch": 3.06, "learning_rate": 4.516129032258064e-05, "loss": 0.3778, "step": 75 }, { "epoch": 3.1, "learning_rate": 4.301075268817205e-05, "loss": 0.3713, "step": 76 }, { "epoch": 3.14, "learning_rate": 4.0860215053763444e-05, "loss": 0.3689, "step": 77 }, { "epoch": 3.18, "learning_rate": 3.870967741935484e-05, "loss": 0.3884, "step": 78 }, { "epoch": 3.22, "learning_rate": 3.655913978494624e-05, "loss": 0.4363, "step": 79 }, { "epoch": 3.27, "learning_rate": 3.4408602150537636e-05, "loss": 0.627, "step": 80 }, { "epoch": 3.31, "learning_rate": 3.2258064516129034e-05, "loss": 0.357, "step": 81 }, { "epoch": 3.35, "learning_rate": 3.010752688172043e-05, "loss": 0.3098, "step": 82 }, { "epoch": 3.39, "learning_rate": 2.7956989247311828e-05, "loss": 0.341, "step": 83 }, { "epoch": 3.43, "learning_rate": 2.5806451612903226e-05, "loss": 0.3881, "step": 84 }, { "epoch": 3.47, "learning_rate": 2.3655913978494626e-05, "loss": 0.4509, "step": 85 }, { "epoch": 3.51, "learning_rate": 2.1505376344086024e-05, "loss": 0.6519, "step": 86 }, { "epoch": 3.55, "learning_rate": 1.935483870967742e-05, "loss": 0.3646, "step": 87 }, { "epoch": 3.59, "learning_rate": 1.7204301075268818e-05, "loss": 0.2933, "step": 88 }, { "epoch": 3.63, "learning_rate": 1.5053763440860215e-05, "loss": 0.3629, "step": 89 }, { "epoch": 3.67, "learning_rate": 1.2903225806451613e-05, "loss": 0.3859, "step": 90 } ], "logging_steps": 1, "max_steps": 96, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 10, "total_flos": 1.2623208115765248e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }