Undi95's picture
Upload folder using huggingface_hub
65152cf verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.5003174603174603,
"eval_steps": 500,
"global_step": 394,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0012698412698412698,
"grad_norm": 11.4375,
"learning_rate": 7e-08,
"loss": 2.3377,
"step": 1
},
{
"epoch": 0.0025396825396825397,
"grad_norm": 12.4375,
"learning_rate": 1.4e-07,
"loss": 2.4221,
"step": 2
},
{
"epoch": 0.0038095238095238095,
"grad_norm": 10.375,
"learning_rate": 2.1e-07,
"loss": 2.1995,
"step": 3
},
{
"epoch": 0.005079365079365079,
"grad_norm": 11.25,
"learning_rate": 2.8e-07,
"loss": 2.3242,
"step": 4
},
{
"epoch": 0.006349206349206349,
"grad_norm": 11.0625,
"learning_rate": 3.5000000000000004e-07,
"loss": 2.3848,
"step": 5
},
{
"epoch": 0.007619047619047619,
"grad_norm": 10.875,
"learning_rate": 4.2e-07,
"loss": 2.2233,
"step": 6
},
{
"epoch": 0.008888888888888889,
"grad_norm": 11.125,
"learning_rate": 4.900000000000001e-07,
"loss": 2.3011,
"step": 7
},
{
"epoch": 0.010158730158730159,
"grad_norm": 12.25,
"learning_rate": 5.6e-07,
"loss": 2.4318,
"step": 8
},
{
"epoch": 0.011428571428571429,
"grad_norm": 10.8125,
"learning_rate": 6.3e-07,
"loss": 2.2978,
"step": 9
},
{
"epoch": 0.012698412698412698,
"grad_norm": 11.3125,
"learning_rate": 7.000000000000001e-07,
"loss": 2.4126,
"step": 10
},
{
"epoch": 0.013968253968253968,
"grad_norm": 10.75,
"learning_rate": 7.7e-07,
"loss": 2.2333,
"step": 11
},
{
"epoch": 0.015238095238095238,
"grad_norm": 11.3125,
"learning_rate": 8.4e-07,
"loss": 2.3186,
"step": 12
},
{
"epoch": 0.01650793650793651,
"grad_norm": 11.0,
"learning_rate": 9.1e-07,
"loss": 2.3917,
"step": 13
},
{
"epoch": 0.017777777777777778,
"grad_norm": 10.8125,
"learning_rate": 9.800000000000001e-07,
"loss": 2.3092,
"step": 14
},
{
"epoch": 0.01904761904761905,
"grad_norm": 10.25,
"learning_rate": 1.05e-06,
"loss": 2.1844,
"step": 15
},
{
"epoch": 0.020317460317460317,
"grad_norm": 9.625,
"learning_rate": 1.12e-06,
"loss": 2.2563,
"step": 16
},
{
"epoch": 0.02158730158730159,
"grad_norm": 10.3125,
"learning_rate": 1.19e-06,
"loss": 2.3583,
"step": 17
},
{
"epoch": 0.022857142857142857,
"grad_norm": 10.1875,
"learning_rate": 1.26e-06,
"loss": 2.2967,
"step": 18
},
{
"epoch": 0.02412698412698413,
"grad_norm": 9.6875,
"learning_rate": 1.33e-06,
"loss": 2.2498,
"step": 19
},
{
"epoch": 0.025396825396825397,
"grad_norm": 9.4375,
"learning_rate": 1.4000000000000001e-06,
"loss": 2.2613,
"step": 20
},
{
"epoch": 0.02666666666666667,
"grad_norm": 9.875,
"learning_rate": 1.47e-06,
"loss": 2.2819,
"step": 21
},
{
"epoch": 0.027936507936507936,
"grad_norm": 10.3125,
"learning_rate": 1.54e-06,
"loss": 2.2904,
"step": 22
},
{
"epoch": 0.029206349206349208,
"grad_norm": 8.9375,
"learning_rate": 1.61e-06,
"loss": 2.1933,
"step": 23
},
{
"epoch": 0.030476190476190476,
"grad_norm": 8.9375,
"learning_rate": 1.68e-06,
"loss": 2.225,
"step": 24
},
{
"epoch": 0.031746031746031744,
"grad_norm": 8.75,
"learning_rate": 1.75e-06,
"loss": 2.2458,
"step": 25
},
{
"epoch": 0.03301587301587302,
"grad_norm": 8.5,
"learning_rate": 1.82e-06,
"loss": 2.1693,
"step": 26
},
{
"epoch": 0.03428571428571429,
"grad_norm": 9.1875,
"learning_rate": 1.8900000000000001e-06,
"loss": 2.2753,
"step": 27
},
{
"epoch": 0.035555555555555556,
"grad_norm": 9.5,
"learning_rate": 1.9600000000000003e-06,
"loss": 2.3356,
"step": 28
},
{
"epoch": 0.036825396825396824,
"grad_norm": 8.4375,
"learning_rate": 2.0299999999999996e-06,
"loss": 2.2706,
"step": 29
},
{
"epoch": 0.0380952380952381,
"grad_norm": 7.96875,
"learning_rate": 2.1e-06,
"loss": 2.1023,
"step": 30
},
{
"epoch": 0.03936507936507937,
"grad_norm": 8.375,
"learning_rate": 2.17e-06,
"loss": 2.2069,
"step": 31
},
{
"epoch": 0.040634920634920635,
"grad_norm": 6.9375,
"learning_rate": 2.24e-06,
"loss": 2.1079,
"step": 32
},
{
"epoch": 0.0419047619047619,
"grad_norm": 7.375,
"learning_rate": 2.31e-06,
"loss": 2.2311,
"step": 33
},
{
"epoch": 0.04317460317460318,
"grad_norm": 6.28125,
"learning_rate": 2.38e-06,
"loss": 2.1627,
"step": 34
},
{
"epoch": 0.044444444444444446,
"grad_norm": 6.125,
"learning_rate": 2.45e-06,
"loss": 2.1843,
"step": 35
},
{
"epoch": 0.045714285714285714,
"grad_norm": 6.0,
"learning_rate": 2.52e-06,
"loss": 2.1101,
"step": 36
},
{
"epoch": 0.04698412698412698,
"grad_norm": 5.375,
"learning_rate": 2.5899999999999998e-06,
"loss": 2.0643,
"step": 37
},
{
"epoch": 0.04825396825396826,
"grad_norm": 5.0625,
"learning_rate": 2.66e-06,
"loss": 2.0844,
"step": 38
},
{
"epoch": 0.049523809523809526,
"grad_norm": 4.71875,
"learning_rate": 2.73e-06,
"loss": 2.0624,
"step": 39
},
{
"epoch": 0.050793650793650794,
"grad_norm": 4.6875,
"learning_rate": 2.8000000000000003e-06,
"loss": 2.1232,
"step": 40
},
{
"epoch": 0.05206349206349206,
"grad_norm": 4.71875,
"learning_rate": 2.8699999999999996e-06,
"loss": 2.124,
"step": 41
},
{
"epoch": 0.05333333333333334,
"grad_norm": 4.53125,
"learning_rate": 2.94e-06,
"loss": 1.9933,
"step": 42
},
{
"epoch": 0.054603174603174605,
"grad_norm": 4.65625,
"learning_rate": 3.01e-06,
"loss": 2.0451,
"step": 43
},
{
"epoch": 0.05587301587301587,
"grad_norm": 4.65625,
"learning_rate": 3.08e-06,
"loss": 2.1581,
"step": 44
},
{
"epoch": 0.05714285714285714,
"grad_norm": 4.65625,
"learning_rate": 3.15e-06,
"loss": 2.1967,
"step": 45
},
{
"epoch": 0.058412698412698416,
"grad_norm": 4.3125,
"learning_rate": 3.22e-06,
"loss": 2.0406,
"step": 46
},
{
"epoch": 0.059682539682539684,
"grad_norm": 4.09375,
"learning_rate": 3.29e-06,
"loss": 1.9816,
"step": 47
},
{
"epoch": 0.06095238095238095,
"grad_norm": 4.0625,
"learning_rate": 3.36e-06,
"loss": 1.9638,
"step": 48
},
{
"epoch": 0.06222222222222222,
"grad_norm": 3.796875,
"learning_rate": 3.4299999999999998e-06,
"loss": 1.9013,
"step": 49
},
{
"epoch": 0.06349206349206349,
"grad_norm": 3.78125,
"learning_rate": 3.5e-06,
"loss": 2.0392,
"step": 50
},
{
"epoch": 0.06476190476190476,
"grad_norm": 3.28125,
"learning_rate": 3.57e-06,
"loss": 1.8965,
"step": 51
},
{
"epoch": 0.06603174603174604,
"grad_norm": 3.328125,
"learning_rate": 3.64e-06,
"loss": 1.8598,
"step": 52
},
{
"epoch": 0.0673015873015873,
"grad_norm": 3.15625,
"learning_rate": 3.71e-06,
"loss": 1.9478,
"step": 53
},
{
"epoch": 0.06857142857142857,
"grad_norm": 3.09375,
"learning_rate": 3.7800000000000002e-06,
"loss": 1.8397,
"step": 54
},
{
"epoch": 0.06984126984126984,
"grad_norm": 3.0625,
"learning_rate": 3.85e-06,
"loss": 1.8759,
"step": 55
},
{
"epoch": 0.07111111111111111,
"grad_norm": 3.078125,
"learning_rate": 3.920000000000001e-06,
"loss": 1.9694,
"step": 56
},
{
"epoch": 0.07238095238095238,
"grad_norm": 2.875,
"learning_rate": 3.99e-06,
"loss": 1.8433,
"step": 57
},
{
"epoch": 0.07365079365079365,
"grad_norm": 3.09375,
"learning_rate": 4.059999999999999e-06,
"loss": 1.9218,
"step": 58
},
{
"epoch": 0.07492063492063492,
"grad_norm": 2.890625,
"learning_rate": 4.129999999999999e-06,
"loss": 1.8318,
"step": 59
},
{
"epoch": 0.0761904761904762,
"grad_norm": 2.8125,
"learning_rate": 4.2e-06,
"loss": 1.898,
"step": 60
},
{
"epoch": 0.07746031746031747,
"grad_norm": 2.84375,
"learning_rate": 4.27e-06,
"loss": 1.8301,
"step": 61
},
{
"epoch": 0.07873015873015873,
"grad_norm": 2.78125,
"learning_rate": 4.34e-06,
"loss": 1.8442,
"step": 62
},
{
"epoch": 0.08,
"grad_norm": 2.671875,
"learning_rate": 4.41e-06,
"loss": 1.8299,
"step": 63
},
{
"epoch": 0.08126984126984127,
"grad_norm": 2.8125,
"learning_rate": 4.48e-06,
"loss": 1.8178,
"step": 64
},
{
"epoch": 0.08253968253968254,
"grad_norm": 2.65625,
"learning_rate": 4.5500000000000005e-06,
"loss": 1.7625,
"step": 65
},
{
"epoch": 0.0838095238095238,
"grad_norm": 2.5625,
"learning_rate": 4.62e-06,
"loss": 1.7792,
"step": 66
},
{
"epoch": 0.08507936507936507,
"grad_norm": 2.75,
"learning_rate": 4.69e-06,
"loss": 1.9265,
"step": 67
},
{
"epoch": 0.08634920634920636,
"grad_norm": 2.671875,
"learning_rate": 4.76e-06,
"loss": 1.8957,
"step": 68
},
{
"epoch": 0.08761904761904762,
"grad_norm": 2.765625,
"learning_rate": 4.8299999999999995e-06,
"loss": 1.7738,
"step": 69
},
{
"epoch": 0.08888888888888889,
"grad_norm": 2.625,
"learning_rate": 4.9e-06,
"loss": 1.7845,
"step": 70
},
{
"epoch": 0.09015873015873016,
"grad_norm": 2.421875,
"learning_rate": 4.97e-06,
"loss": 1.6498,
"step": 71
},
{
"epoch": 0.09142857142857143,
"grad_norm": 2.609375,
"learning_rate": 5.04e-06,
"loss": 1.7736,
"step": 72
},
{
"epoch": 0.0926984126984127,
"grad_norm": 2.515625,
"learning_rate": 5.11e-06,
"loss": 1.7067,
"step": 73
},
{
"epoch": 0.09396825396825396,
"grad_norm": 2.75,
"learning_rate": 5.1799999999999995e-06,
"loss": 1.8757,
"step": 74
},
{
"epoch": 0.09523809523809523,
"grad_norm": 2.53125,
"learning_rate": 5.25e-06,
"loss": 1.7789,
"step": 75
},
{
"epoch": 0.09650793650793651,
"grad_norm": 2.5,
"learning_rate": 5.32e-06,
"loss": 1.7109,
"step": 76
},
{
"epoch": 0.09777777777777778,
"grad_norm": 2.578125,
"learning_rate": 5.39e-06,
"loss": 1.8135,
"step": 77
},
{
"epoch": 0.09904761904761905,
"grad_norm": 2.71875,
"learning_rate": 5.46e-06,
"loss": 1.8264,
"step": 78
},
{
"epoch": 0.10031746031746032,
"grad_norm": 2.703125,
"learning_rate": 5.53e-06,
"loss": 1.822,
"step": 79
},
{
"epoch": 0.10158730158730159,
"grad_norm": 2.734375,
"learning_rate": 5.600000000000001e-06,
"loss": 1.7601,
"step": 80
},
{
"epoch": 0.10285714285714286,
"grad_norm": 2.609375,
"learning_rate": 5.67e-06,
"loss": 1.8076,
"step": 81
},
{
"epoch": 0.10412698412698412,
"grad_norm": 2.828125,
"learning_rate": 5.739999999999999e-06,
"loss": 1.7772,
"step": 82
},
{
"epoch": 0.10539682539682539,
"grad_norm": 2.765625,
"learning_rate": 5.8099999999999994e-06,
"loss": 1.8925,
"step": 83
},
{
"epoch": 0.10666666666666667,
"grad_norm": 2.59375,
"learning_rate": 5.88e-06,
"loss": 1.7806,
"step": 84
},
{
"epoch": 0.10793650793650794,
"grad_norm": 2.515625,
"learning_rate": 5.95e-06,
"loss": 1.6426,
"step": 85
},
{
"epoch": 0.10920634920634921,
"grad_norm": 2.65625,
"learning_rate": 6.02e-06,
"loss": 1.7752,
"step": 86
},
{
"epoch": 0.11047619047619048,
"grad_norm": 2.484375,
"learning_rate": 6.09e-06,
"loss": 1.7094,
"step": 87
},
{
"epoch": 0.11174603174603175,
"grad_norm": 2.5,
"learning_rate": 6.16e-06,
"loss": 1.671,
"step": 88
},
{
"epoch": 0.11301587301587301,
"grad_norm": 2.484375,
"learning_rate": 6.23e-06,
"loss": 1.6576,
"step": 89
},
{
"epoch": 0.11428571428571428,
"grad_norm": 2.65625,
"learning_rate": 6.3e-06,
"loss": 1.6634,
"step": 90
},
{
"epoch": 0.11555555555555555,
"grad_norm": 2.640625,
"learning_rate": 6.37e-06,
"loss": 1.6374,
"step": 91
},
{
"epoch": 0.11682539682539683,
"grad_norm": 2.5625,
"learning_rate": 6.44e-06,
"loss": 1.6832,
"step": 92
},
{
"epoch": 0.1180952380952381,
"grad_norm": 2.59375,
"learning_rate": 6.51e-06,
"loss": 1.6738,
"step": 93
},
{
"epoch": 0.11936507936507937,
"grad_norm": 2.484375,
"learning_rate": 6.58e-06,
"loss": 1.6874,
"step": 94
},
{
"epoch": 0.12063492063492064,
"grad_norm": 2.4375,
"learning_rate": 6.65e-06,
"loss": 1.6468,
"step": 95
},
{
"epoch": 0.1219047619047619,
"grad_norm": 2.546875,
"learning_rate": 6.72e-06,
"loss": 1.6261,
"step": 96
},
{
"epoch": 0.12317460317460317,
"grad_norm": 2.328125,
"learning_rate": 6.789999999999999e-06,
"loss": 1.5797,
"step": 97
},
{
"epoch": 0.12444444444444444,
"grad_norm": 2.4375,
"learning_rate": 6.8599999999999995e-06,
"loss": 1.6279,
"step": 98
},
{
"epoch": 0.12571428571428572,
"grad_norm": 2.53125,
"learning_rate": 6.93e-06,
"loss": 1.6165,
"step": 99
},
{
"epoch": 0.12698412698412698,
"grad_norm": 2.4375,
"learning_rate": 7e-06,
"loss": 1.7051,
"step": 100
},
{
"epoch": 0.12825396825396826,
"grad_norm": 2.484375,
"learning_rate": 6.999998140878257e-06,
"loss": 1.6242,
"step": 101
},
{
"epoch": 0.1295238095238095,
"grad_norm": 2.546875,
"learning_rate": 6.999992563515e-06,
"loss": 1.591,
"step": 102
},
{
"epoch": 0.1307936507936508,
"grad_norm": 2.421875,
"learning_rate": 6.999983267916156e-06,
"loss": 1.6395,
"step": 103
},
{
"epoch": 0.13206349206349208,
"grad_norm": 2.4375,
"learning_rate": 6.9999702540916e-06,
"loss": 1.6801,
"step": 104
},
{
"epoch": 0.13333333333333333,
"grad_norm": 2.453125,
"learning_rate": 6.999953522055158e-06,
"loss": 1.6198,
"step": 105
},
{
"epoch": 0.1346031746031746,
"grad_norm": 2.453125,
"learning_rate": 6.999933071824603e-06,
"loss": 1.6125,
"step": 106
},
{
"epoch": 0.13587301587301587,
"grad_norm": 2.34375,
"learning_rate": 6.9999089034216635e-06,
"loss": 1.6474,
"step": 107
},
{
"epoch": 0.13714285714285715,
"grad_norm": 2.421875,
"learning_rate": 6.999881016872011e-06,
"loss": 1.5838,
"step": 108
},
{
"epoch": 0.1384126984126984,
"grad_norm": 2.453125,
"learning_rate": 6.9998494122052754e-06,
"loss": 1.5673,
"step": 109
},
{
"epoch": 0.13968253968253969,
"grad_norm": 2.484375,
"learning_rate": 6.9998140894550295e-06,
"loss": 1.5612,
"step": 110
},
{
"epoch": 0.14095238095238094,
"grad_norm": 2.4375,
"learning_rate": 6.999775048658799e-06,
"loss": 1.5632,
"step": 111
},
{
"epoch": 0.14222222222222222,
"grad_norm": 2.359375,
"learning_rate": 6.999732289858059e-06,
"loss": 1.6273,
"step": 112
},
{
"epoch": 0.1434920634920635,
"grad_norm": 2.40625,
"learning_rate": 6.999685813098235e-06,
"loss": 1.5568,
"step": 113
},
{
"epoch": 0.14476190476190476,
"grad_norm": 2.40625,
"learning_rate": 6.999635618428701e-06,
"loss": 1.5606,
"step": 114
},
{
"epoch": 0.14603174603174604,
"grad_norm": 2.4375,
"learning_rate": 6.999581705902782e-06,
"loss": 1.5716,
"step": 115
},
{
"epoch": 0.1473015873015873,
"grad_norm": 2.546875,
"learning_rate": 6.999524075577753e-06,
"loss": 1.5832,
"step": 116
},
{
"epoch": 0.14857142857142858,
"grad_norm": 2.46875,
"learning_rate": 6.9994627275148364e-06,
"loss": 1.6203,
"step": 117
},
{
"epoch": 0.14984126984126983,
"grad_norm": 2.515625,
"learning_rate": 6.999397661779208e-06,
"loss": 1.5982,
"step": 118
},
{
"epoch": 0.1511111111111111,
"grad_norm": 2.296875,
"learning_rate": 6.999328878439989e-06,
"loss": 1.4827,
"step": 119
},
{
"epoch": 0.1523809523809524,
"grad_norm": 2.359375,
"learning_rate": 6.99925637757025e-06,
"loss": 1.5523,
"step": 120
},
{
"epoch": 0.15365079365079365,
"grad_norm": 2.234375,
"learning_rate": 6.9991801592470155e-06,
"loss": 1.4812,
"step": 121
},
{
"epoch": 0.15492063492063493,
"grad_norm": 2.203125,
"learning_rate": 6.999100223551257e-06,
"loss": 1.5039,
"step": 122
},
{
"epoch": 0.15619047619047619,
"grad_norm": 2.46875,
"learning_rate": 6.9990165705678915e-06,
"loss": 1.5991,
"step": 123
},
{
"epoch": 0.15746031746031747,
"grad_norm": 2.21875,
"learning_rate": 6.9989292003857905e-06,
"loss": 1.5819,
"step": 124
},
{
"epoch": 0.15873015873015872,
"grad_norm": 2.203125,
"learning_rate": 6.998838113097772e-06,
"loss": 1.5072,
"step": 125
},
{
"epoch": 0.16,
"grad_norm": 2.234375,
"learning_rate": 6.998743308800602e-06,
"loss": 1.539,
"step": 126
},
{
"epoch": 0.16126984126984126,
"grad_norm": 2.25,
"learning_rate": 6.998644787594997e-06,
"loss": 1.6346,
"step": 127
},
{
"epoch": 0.16253968253968254,
"grad_norm": 2.015625,
"learning_rate": 6.998542549585622e-06,
"loss": 1.4874,
"step": 128
},
{
"epoch": 0.16380952380952382,
"grad_norm": 2.15625,
"learning_rate": 6.998436594881089e-06,
"loss": 1.4606,
"step": 129
},
{
"epoch": 0.16507936507936508,
"grad_norm": 2.0625,
"learning_rate": 6.99832692359396e-06,
"loss": 1.4556,
"step": 130
},
{
"epoch": 0.16634920634920636,
"grad_norm": 2.140625,
"learning_rate": 6.998213535840745e-06,
"loss": 1.5251,
"step": 131
},
{
"epoch": 0.1676190476190476,
"grad_norm": 2.078125,
"learning_rate": 6.998096431741903e-06,
"loss": 1.6041,
"step": 132
},
{
"epoch": 0.1688888888888889,
"grad_norm": 2.03125,
"learning_rate": 6.997975611421838e-06,
"loss": 1.516,
"step": 133
},
{
"epoch": 0.17015873015873015,
"grad_norm": 1.890625,
"learning_rate": 6.997851075008906e-06,
"loss": 1.401,
"step": 134
},
{
"epoch": 0.17142857142857143,
"grad_norm": 1.984375,
"learning_rate": 6.997722822635408e-06,
"loss": 1.3963,
"step": 135
},
{
"epoch": 0.1726984126984127,
"grad_norm": 1.78125,
"learning_rate": 6.997590854437593e-06,
"loss": 1.4488,
"step": 136
},
{
"epoch": 0.17396825396825397,
"grad_norm": 1.9375,
"learning_rate": 6.9974551705556605e-06,
"loss": 1.4161,
"step": 137
},
{
"epoch": 0.17523809523809525,
"grad_norm": 1.8359375,
"learning_rate": 6.997315771133752e-06,
"loss": 1.4458,
"step": 138
},
{
"epoch": 0.1765079365079365,
"grad_norm": 1.921875,
"learning_rate": 6.997172656319962e-06,
"loss": 1.5626,
"step": 139
},
{
"epoch": 0.17777777777777778,
"grad_norm": 1.8515625,
"learning_rate": 6.997025826266326e-06,
"loss": 1.4079,
"step": 140
},
{
"epoch": 0.17904761904761904,
"grad_norm": 1.890625,
"learning_rate": 6.996875281128833e-06,
"loss": 1.5113,
"step": 141
},
{
"epoch": 0.18031746031746032,
"grad_norm": 1.765625,
"learning_rate": 6.996721021067415e-06,
"loss": 1.4129,
"step": 142
},
{
"epoch": 0.18158730158730158,
"grad_norm": 1.7734375,
"learning_rate": 6.996563046245947e-06,
"loss": 1.416,
"step": 143
},
{
"epoch": 0.18285714285714286,
"grad_norm": 1.890625,
"learning_rate": 6.99640135683226e-06,
"loss": 1.4688,
"step": 144
},
{
"epoch": 0.18412698412698414,
"grad_norm": 1.7890625,
"learning_rate": 6.9962359529981225e-06,
"loss": 1.4409,
"step": 145
},
{
"epoch": 0.1853968253968254,
"grad_norm": 1.796875,
"learning_rate": 6.996066834919252e-06,
"loss": 1.4301,
"step": 146
},
{
"epoch": 0.18666666666666668,
"grad_norm": 1.7109375,
"learning_rate": 6.995894002775314e-06,
"loss": 1.3702,
"step": 147
},
{
"epoch": 0.18793650793650793,
"grad_norm": 1.7578125,
"learning_rate": 6.995717456749914e-06,
"loss": 1.5495,
"step": 148
},
{
"epoch": 0.1892063492063492,
"grad_norm": 1.6953125,
"learning_rate": 6.99553719703061e-06,
"loss": 1.4909,
"step": 149
},
{
"epoch": 0.19047619047619047,
"grad_norm": 1.7109375,
"learning_rate": 6.9953532238089014e-06,
"loss": 1.4744,
"step": 150
},
{
"epoch": 0.19174603174603175,
"grad_norm": 1.6171875,
"learning_rate": 6.995165537280231e-06,
"loss": 1.3965,
"step": 151
},
{
"epoch": 0.19301587301587303,
"grad_norm": 1.65625,
"learning_rate": 6.994974137643991e-06,
"loss": 1.4505,
"step": 152
},
{
"epoch": 0.19428571428571428,
"grad_norm": 1.640625,
"learning_rate": 6.994779025103515e-06,
"loss": 1.4028,
"step": 153
},
{
"epoch": 0.19555555555555557,
"grad_norm": 1.625,
"learning_rate": 6.994580199866081e-06,
"loss": 1.4503,
"step": 154
},
{
"epoch": 0.19682539682539682,
"grad_norm": 1.5234375,
"learning_rate": 6.994377662142914e-06,
"loss": 1.3517,
"step": 155
},
{
"epoch": 0.1980952380952381,
"grad_norm": 1.6875,
"learning_rate": 6.9941714121491785e-06,
"loss": 1.4896,
"step": 156
},
{
"epoch": 0.19936507936507936,
"grad_norm": 1.5,
"learning_rate": 6.993961450103987e-06,
"loss": 1.4458,
"step": 157
},
{
"epoch": 0.20063492063492064,
"grad_norm": 1.6796875,
"learning_rate": 6.993747776230393e-06,
"loss": 1.4939,
"step": 158
},
{
"epoch": 0.2019047619047619,
"grad_norm": 1.4609375,
"learning_rate": 6.9935303907553945e-06,
"loss": 1.3731,
"step": 159
},
{
"epoch": 0.20317460317460317,
"grad_norm": 1.5859375,
"learning_rate": 6.993309293909931e-06,
"loss": 1.4482,
"step": 160
},
{
"epoch": 0.20444444444444446,
"grad_norm": 1.5546875,
"learning_rate": 6.993084485928888e-06,
"loss": 1.4789,
"step": 161
},
{
"epoch": 0.2057142857142857,
"grad_norm": 1.515625,
"learning_rate": 6.992855967051091e-06,
"loss": 1.3584,
"step": 162
},
{
"epoch": 0.206984126984127,
"grad_norm": 1.484375,
"learning_rate": 6.9926237375193055e-06,
"loss": 1.3995,
"step": 163
},
{
"epoch": 0.20825396825396825,
"grad_norm": 1.421875,
"learning_rate": 6.992387797580246e-06,
"loss": 1.4282,
"step": 164
},
{
"epoch": 0.20952380952380953,
"grad_norm": 1.359375,
"learning_rate": 6.992148147484561e-06,
"loss": 1.4075,
"step": 165
},
{
"epoch": 0.21079365079365078,
"grad_norm": 1.4609375,
"learning_rate": 6.991904787486846e-06,
"loss": 1.4205,
"step": 166
},
{
"epoch": 0.21206349206349207,
"grad_norm": 1.4453125,
"learning_rate": 6.991657717845635e-06,
"loss": 1.365,
"step": 167
},
{
"epoch": 0.21333333333333335,
"grad_norm": 1.3984375,
"learning_rate": 6.991406938823403e-06,
"loss": 1.4606,
"step": 168
},
{
"epoch": 0.2146031746031746,
"grad_norm": 1.3671875,
"learning_rate": 6.991152450686569e-06,
"loss": 1.3828,
"step": 169
},
{
"epoch": 0.21587301587301588,
"grad_norm": 1.265625,
"learning_rate": 6.9908942537054875e-06,
"loss": 1.4357,
"step": 170
},
{
"epoch": 0.21714285714285714,
"grad_norm": 1.328125,
"learning_rate": 6.990632348154456e-06,
"loss": 1.3723,
"step": 171
},
{
"epoch": 0.21841269841269842,
"grad_norm": 1.4140625,
"learning_rate": 6.990366734311711e-06,
"loss": 1.4549,
"step": 172
},
{
"epoch": 0.21968253968253967,
"grad_norm": 1.2734375,
"learning_rate": 6.9900974124594295e-06,
"loss": 1.3897,
"step": 173
},
{
"epoch": 0.22095238095238096,
"grad_norm": 1.4375,
"learning_rate": 6.9898243828837265e-06,
"loss": 1.381,
"step": 174
},
{
"epoch": 0.2222222222222222,
"grad_norm": 1.3515625,
"learning_rate": 6.989547645874657e-06,
"loss": 1.3959,
"step": 175
},
{
"epoch": 0.2234920634920635,
"grad_norm": 1.3984375,
"learning_rate": 6.989267201726213e-06,
"loss": 1.3529,
"step": 176
},
{
"epoch": 0.22476190476190477,
"grad_norm": 1.3828125,
"learning_rate": 6.988983050736326e-06,
"loss": 1.3437,
"step": 177
},
{
"epoch": 0.22603174603174603,
"grad_norm": 1.4453125,
"learning_rate": 6.988695193206866e-06,
"loss": 1.3084,
"step": 178
},
{
"epoch": 0.2273015873015873,
"grad_norm": 1.390625,
"learning_rate": 6.98840362944364e-06,
"loss": 1.2895,
"step": 179
},
{
"epoch": 0.22857142857142856,
"grad_norm": 1.34375,
"learning_rate": 6.9881083597563915e-06,
"loss": 1.3205,
"step": 180
},
{
"epoch": 0.22984126984126985,
"grad_norm": 1.4140625,
"learning_rate": 6.987809384458803e-06,
"loss": 1.3976,
"step": 181
},
{
"epoch": 0.2311111111111111,
"grad_norm": 1.328125,
"learning_rate": 6.987506703868491e-06,
"loss": 1.3126,
"step": 182
},
{
"epoch": 0.23238095238095238,
"grad_norm": 1.359375,
"learning_rate": 6.987200318307011e-06,
"loss": 1.4032,
"step": 183
},
{
"epoch": 0.23365079365079366,
"grad_norm": 1.2890625,
"learning_rate": 6.986890228099852e-06,
"loss": 1.3045,
"step": 184
},
{
"epoch": 0.23492063492063492,
"grad_norm": 1.421875,
"learning_rate": 6.986576433576441e-06,
"loss": 1.3976,
"step": 185
},
{
"epoch": 0.2361904761904762,
"grad_norm": 1.53125,
"learning_rate": 6.9862589350701396e-06,
"loss": 1.3839,
"step": 186
},
{
"epoch": 0.23746031746031745,
"grad_norm": 1.40625,
"learning_rate": 6.985937732918243e-06,
"loss": 1.3107,
"step": 187
},
{
"epoch": 0.23873015873015874,
"grad_norm": 1.6328125,
"learning_rate": 6.985612827461983e-06,
"loss": 1.4188,
"step": 188
},
{
"epoch": 0.24,
"grad_norm": 1.3359375,
"learning_rate": 6.9852842190465244e-06,
"loss": 1.2768,
"step": 189
},
{
"epoch": 0.24126984126984127,
"grad_norm": 1.359375,
"learning_rate": 6.984951908020966e-06,
"loss": 1.2394,
"step": 190
},
{
"epoch": 0.24253968253968253,
"grad_norm": 1.4375,
"learning_rate": 6.984615894738339e-06,
"loss": 1.3525,
"step": 191
},
{
"epoch": 0.2438095238095238,
"grad_norm": 1.5546875,
"learning_rate": 6.984276179555611e-06,
"loss": 1.3494,
"step": 192
},
{
"epoch": 0.2450793650793651,
"grad_norm": 1.3671875,
"learning_rate": 6.983932762833678e-06,
"loss": 1.3324,
"step": 193
},
{
"epoch": 0.24634920634920635,
"grad_norm": 1.421875,
"learning_rate": 6.983585644937373e-06,
"loss": 1.2826,
"step": 194
},
{
"epoch": 0.24761904761904763,
"grad_norm": 1.375,
"learning_rate": 6.983234826235456e-06,
"loss": 1.3029,
"step": 195
},
{
"epoch": 0.24888888888888888,
"grad_norm": 1.4453125,
"learning_rate": 6.982880307100624e-06,
"loss": 1.3598,
"step": 196
},
{
"epoch": 0.25015873015873014,
"grad_norm": 1.40625,
"learning_rate": 6.982522087909498e-06,
"loss": 1.3622,
"step": 197
},
{
"epoch": 0.25142857142857145,
"grad_norm": 1.296875,
"learning_rate": 6.9821601690426384e-06,
"loss": 1.2689,
"step": 198
},
{
"epoch": 0.2526984126984127,
"grad_norm": 1.4453125,
"learning_rate": 6.981794550884529e-06,
"loss": 1.3869,
"step": 199
},
{
"epoch": 0.25396825396825395,
"grad_norm": 1.4765625,
"learning_rate": 6.981425233823588e-06,
"loss": 1.3919,
"step": 200
},
{
"epoch": 0.25523809523809526,
"grad_norm": 1.3515625,
"learning_rate": 6.98105221825216e-06,
"loss": 1.3206,
"step": 201
},
{
"epoch": 0.2565079365079365,
"grad_norm": 1.3671875,
"learning_rate": 6.98067550456652e-06,
"loss": 1.386,
"step": 202
},
{
"epoch": 0.2577777777777778,
"grad_norm": 1.421875,
"learning_rate": 6.980295093166873e-06,
"loss": 1.409,
"step": 203
},
{
"epoch": 0.259047619047619,
"grad_norm": 1.375,
"learning_rate": 6.97991098445735e-06,
"loss": 1.3044,
"step": 204
},
{
"epoch": 0.26031746031746034,
"grad_norm": 1.3984375,
"learning_rate": 6.979523178846011e-06,
"loss": 1.29,
"step": 205
},
{
"epoch": 0.2615873015873016,
"grad_norm": 1.3359375,
"learning_rate": 6.979131676744844e-06,
"loss": 1.4239,
"step": 206
},
{
"epoch": 0.26285714285714284,
"grad_norm": 1.4921875,
"learning_rate": 6.978736478569762e-06,
"loss": 1.4212,
"step": 207
},
{
"epoch": 0.26412698412698415,
"grad_norm": 1.5,
"learning_rate": 6.978337584740607e-06,
"loss": 1.3177,
"step": 208
},
{
"epoch": 0.2653968253968254,
"grad_norm": 1.3203125,
"learning_rate": 6.977934995681146e-06,
"loss": 1.3258,
"step": 209
},
{
"epoch": 0.26666666666666666,
"grad_norm": 1.390625,
"learning_rate": 6.977528711819072e-06,
"loss": 1.4282,
"step": 210
},
{
"epoch": 0.2679365079365079,
"grad_norm": 1.4453125,
"learning_rate": 6.977118733586e-06,
"loss": 1.4156,
"step": 211
},
{
"epoch": 0.2692063492063492,
"grad_norm": 1.4296875,
"learning_rate": 6.976705061417477e-06,
"loss": 1.4307,
"step": 212
},
{
"epoch": 0.2704761904761905,
"grad_norm": 1.3671875,
"learning_rate": 6.976287695752965e-06,
"loss": 1.3837,
"step": 213
},
{
"epoch": 0.27174603174603174,
"grad_norm": 1.453125,
"learning_rate": 6.975866637035859e-06,
"loss": 1.3319,
"step": 214
},
{
"epoch": 0.273015873015873,
"grad_norm": 1.4375,
"learning_rate": 6.975441885713471e-06,
"loss": 1.3346,
"step": 215
},
{
"epoch": 0.2742857142857143,
"grad_norm": 1.4296875,
"learning_rate": 6.975013442237037e-06,
"loss": 1.2907,
"step": 216
},
{
"epoch": 0.27555555555555555,
"grad_norm": 1.4296875,
"learning_rate": 6.974581307061718e-06,
"loss": 1.3801,
"step": 217
},
{
"epoch": 0.2768253968253968,
"grad_norm": 1.4765625,
"learning_rate": 6.974145480646593e-06,
"loss": 1.3904,
"step": 218
},
{
"epoch": 0.2780952380952381,
"grad_norm": 1.421875,
"learning_rate": 6.973705963454666e-06,
"loss": 1.3587,
"step": 219
},
{
"epoch": 0.27936507936507937,
"grad_norm": 1.609375,
"learning_rate": 6.973262755952861e-06,
"loss": 1.3843,
"step": 220
},
{
"epoch": 0.2806349206349206,
"grad_norm": 1.5078125,
"learning_rate": 6.9728158586120195e-06,
"loss": 1.3899,
"step": 221
},
{
"epoch": 0.2819047619047619,
"grad_norm": 1.4296875,
"learning_rate": 6.9723652719069074e-06,
"loss": 1.3172,
"step": 222
},
{
"epoch": 0.2831746031746032,
"grad_norm": 1.3828125,
"learning_rate": 6.971910996316207e-06,
"loss": 1.3525,
"step": 223
},
{
"epoch": 0.28444444444444444,
"grad_norm": 1.3828125,
"learning_rate": 6.97145303232252e-06,
"loss": 1.2543,
"step": 224
},
{
"epoch": 0.2857142857142857,
"grad_norm": 1.4921875,
"learning_rate": 6.970991380412367e-06,
"loss": 1.3654,
"step": 225
},
{
"epoch": 0.286984126984127,
"grad_norm": 1.546875,
"learning_rate": 6.9705260410761876e-06,
"loss": 1.3518,
"step": 226
},
{
"epoch": 0.28825396825396826,
"grad_norm": 1.3984375,
"learning_rate": 6.970057014808337e-06,
"loss": 1.406,
"step": 227
},
{
"epoch": 0.2895238095238095,
"grad_norm": 1.3828125,
"learning_rate": 6.9695843021070855e-06,
"loss": 1.2749,
"step": 228
},
{
"epoch": 0.29079365079365077,
"grad_norm": 1.4609375,
"learning_rate": 6.969107903474625e-06,
"loss": 1.3623,
"step": 229
},
{
"epoch": 0.2920634920634921,
"grad_norm": 1.375,
"learning_rate": 6.9686278194170586e-06,
"loss": 1.3651,
"step": 230
},
{
"epoch": 0.29333333333333333,
"grad_norm": 1.40625,
"learning_rate": 6.968144050444407e-06,
"loss": 1.3215,
"step": 231
},
{
"epoch": 0.2946031746031746,
"grad_norm": 1.484375,
"learning_rate": 6.967656597070603e-06,
"loss": 1.3236,
"step": 232
},
{
"epoch": 0.2958730158730159,
"grad_norm": 1.4140625,
"learning_rate": 6.9671654598134965e-06,
"loss": 1.2878,
"step": 233
},
{
"epoch": 0.29714285714285715,
"grad_norm": 1.40625,
"learning_rate": 6.96667063919485e-06,
"loss": 1.257,
"step": 234
},
{
"epoch": 0.2984126984126984,
"grad_norm": 1.4296875,
"learning_rate": 6.966172135740339e-06,
"loss": 1.3465,
"step": 235
},
{
"epoch": 0.29968253968253966,
"grad_norm": 1.484375,
"learning_rate": 6.96566994997955e-06,
"loss": 1.3257,
"step": 236
},
{
"epoch": 0.30095238095238097,
"grad_norm": 1.421875,
"learning_rate": 6.965164082445983e-06,
"loss": 1.367,
"step": 237
},
{
"epoch": 0.3022222222222222,
"grad_norm": 1.4609375,
"learning_rate": 6.96465453367705e-06,
"loss": 1.3181,
"step": 238
},
{
"epoch": 0.3034920634920635,
"grad_norm": 1.3671875,
"learning_rate": 6.964141304214072e-06,
"loss": 1.3274,
"step": 239
},
{
"epoch": 0.3047619047619048,
"grad_norm": 1.328125,
"learning_rate": 6.963624394602281e-06,
"loss": 1.2907,
"step": 240
},
{
"epoch": 0.30603174603174604,
"grad_norm": 1.4375,
"learning_rate": 6.963103805390821e-06,
"loss": 1.3028,
"step": 241
},
{
"epoch": 0.3073015873015873,
"grad_norm": 1.3984375,
"learning_rate": 6.9625795371327375e-06,
"loss": 1.3416,
"step": 242
},
{
"epoch": 0.30857142857142855,
"grad_norm": 1.4140625,
"learning_rate": 6.962051590384995e-06,
"loss": 1.3399,
"step": 243
},
{
"epoch": 0.30984126984126986,
"grad_norm": 1.421875,
"learning_rate": 6.961519965708457e-06,
"loss": 1.2766,
"step": 244
},
{
"epoch": 0.3111111111111111,
"grad_norm": 1.265625,
"learning_rate": 6.9609846636679e-06,
"loss": 1.1575,
"step": 245
},
{
"epoch": 0.31238095238095237,
"grad_norm": 1.34375,
"learning_rate": 6.960445684832004e-06,
"loss": 1.276,
"step": 246
},
{
"epoch": 0.3136507936507936,
"grad_norm": 1.3984375,
"learning_rate": 6.959903029773356e-06,
"loss": 1.2703,
"step": 247
},
{
"epoch": 0.31492063492063493,
"grad_norm": 1.4375,
"learning_rate": 6.9593566990684474e-06,
"loss": 1.3219,
"step": 248
},
{
"epoch": 0.3161904761904762,
"grad_norm": 1.4296875,
"learning_rate": 6.9588066932976785e-06,
"loss": 1.314,
"step": 249
},
{
"epoch": 0.31746031746031744,
"grad_norm": 1.390625,
"learning_rate": 6.958253013045348e-06,
"loss": 1.2881,
"step": 250
},
{
"epoch": 0.31873015873015875,
"grad_norm": 1.3984375,
"learning_rate": 6.957695658899663e-06,
"loss": 1.336,
"step": 251
},
{
"epoch": 0.32,
"grad_norm": 1.5078125,
"learning_rate": 6.95713463145273e-06,
"loss": 1.3237,
"step": 252
},
{
"epoch": 0.32126984126984126,
"grad_norm": 1.3515625,
"learning_rate": 6.956569931300559e-06,
"loss": 1.2288,
"step": 253
},
{
"epoch": 0.3225396825396825,
"grad_norm": 1.375,
"learning_rate": 6.956001559043064e-06,
"loss": 1.3324,
"step": 254
},
{
"epoch": 0.3238095238095238,
"grad_norm": 1.40625,
"learning_rate": 6.955429515284058e-06,
"loss": 1.329,
"step": 255
},
{
"epoch": 0.3250793650793651,
"grad_norm": 1.515625,
"learning_rate": 6.954853800631254e-06,
"loss": 1.2696,
"step": 256
},
{
"epoch": 0.32634920634920633,
"grad_norm": 1.3984375,
"learning_rate": 6.954274415696267e-06,
"loss": 1.3572,
"step": 257
},
{
"epoch": 0.32761904761904764,
"grad_norm": 1.34375,
"learning_rate": 6.953691361094606e-06,
"loss": 1.2582,
"step": 258
},
{
"epoch": 0.3288888888888889,
"grad_norm": 1.3828125,
"learning_rate": 6.953104637445686e-06,
"loss": 1.2598,
"step": 259
},
{
"epoch": 0.33015873015873015,
"grad_norm": 1.5390625,
"learning_rate": 6.952514245372815e-06,
"loss": 1.274,
"step": 260
},
{
"epoch": 0.3314285714285714,
"grad_norm": 1.390625,
"learning_rate": 6.951920185503199e-06,
"loss": 1.2684,
"step": 261
},
{
"epoch": 0.3326984126984127,
"grad_norm": 1.4453125,
"learning_rate": 6.951322458467938e-06,
"loss": 1.2921,
"step": 262
},
{
"epoch": 0.33396825396825397,
"grad_norm": 1.4609375,
"learning_rate": 6.950721064902034e-06,
"loss": 1.3409,
"step": 263
},
{
"epoch": 0.3352380952380952,
"grad_norm": 1.375,
"learning_rate": 6.95011600544438e-06,
"loss": 1.2827,
"step": 264
},
{
"epoch": 0.33650793650793653,
"grad_norm": 1.390625,
"learning_rate": 6.9495072807377634e-06,
"loss": 1.3019,
"step": 265
},
{
"epoch": 0.3377777777777778,
"grad_norm": 1.328125,
"learning_rate": 6.948894891428866e-06,
"loss": 1.2564,
"step": 266
},
{
"epoch": 0.33904761904761904,
"grad_norm": 1.28125,
"learning_rate": 6.948278838168263e-06,
"loss": 1.2545,
"step": 267
},
{
"epoch": 0.3403174603174603,
"grad_norm": 1.34375,
"learning_rate": 6.947659121610421e-06,
"loss": 1.29,
"step": 268
},
{
"epoch": 0.3415873015873016,
"grad_norm": 1.2890625,
"learning_rate": 6.947035742413701e-06,
"loss": 1.1939,
"step": 269
},
{
"epoch": 0.34285714285714286,
"grad_norm": 1.4921875,
"learning_rate": 6.9464087012403534e-06,
"loss": 1.2794,
"step": 270
},
{
"epoch": 0.3441269841269841,
"grad_norm": 1.359375,
"learning_rate": 6.945777998756516e-06,
"loss": 1.2046,
"step": 271
},
{
"epoch": 0.3453968253968254,
"grad_norm": 1.4140625,
"learning_rate": 6.94514363563222e-06,
"loss": 1.2897,
"step": 272
},
{
"epoch": 0.3466666666666667,
"grad_norm": 1.375,
"learning_rate": 6.944505612541386e-06,
"loss": 1.3137,
"step": 273
},
{
"epoch": 0.34793650793650793,
"grad_norm": 1.390625,
"learning_rate": 6.94386393016182e-06,
"loss": 1.2758,
"step": 274
},
{
"epoch": 0.3492063492063492,
"grad_norm": 1.34375,
"learning_rate": 6.943218589175216e-06,
"loss": 1.2078,
"step": 275
},
{
"epoch": 0.3504761904761905,
"grad_norm": 1.2890625,
"learning_rate": 6.942569590267157e-06,
"loss": 1.2597,
"step": 276
},
{
"epoch": 0.35174603174603175,
"grad_norm": 1.4140625,
"learning_rate": 6.9419169341271085e-06,
"loss": 1.2439,
"step": 277
},
{
"epoch": 0.353015873015873,
"grad_norm": 1.3359375,
"learning_rate": 6.9412606214484245e-06,
"loss": 1.2509,
"step": 278
},
{
"epoch": 0.35428571428571426,
"grad_norm": 1.296875,
"learning_rate": 6.9406006529283425e-06,
"loss": 1.2124,
"step": 279
},
{
"epoch": 0.35555555555555557,
"grad_norm": 1.59375,
"learning_rate": 6.939937029267983e-06,
"loss": 1.2922,
"step": 280
},
{
"epoch": 0.3568253968253968,
"grad_norm": 1.3671875,
"learning_rate": 6.93926975117235e-06,
"loss": 1.3179,
"step": 281
},
{
"epoch": 0.3580952380952381,
"grad_norm": 1.28125,
"learning_rate": 6.93859881935033e-06,
"loss": 1.2535,
"step": 282
},
{
"epoch": 0.3593650793650794,
"grad_norm": 1.3046875,
"learning_rate": 6.937924234514692e-06,
"loss": 1.2865,
"step": 283
},
{
"epoch": 0.36063492063492064,
"grad_norm": 1.28125,
"learning_rate": 6.9372459973820815e-06,
"loss": 1.2395,
"step": 284
},
{
"epoch": 0.3619047619047619,
"grad_norm": 1.4296875,
"learning_rate": 6.936564108673031e-06,
"loss": 1.2963,
"step": 285
},
{
"epoch": 0.36317460317460315,
"grad_norm": 1.3671875,
"learning_rate": 6.935878569111948e-06,
"loss": 1.2609,
"step": 286
},
{
"epoch": 0.36444444444444446,
"grad_norm": 1.25,
"learning_rate": 6.935189379427116e-06,
"loss": 1.1527,
"step": 287
},
{
"epoch": 0.3657142857142857,
"grad_norm": 1.34375,
"learning_rate": 6.934496540350704e-06,
"loss": 1.3306,
"step": 288
},
{
"epoch": 0.36698412698412697,
"grad_norm": 1.28125,
"learning_rate": 6.933800052618749e-06,
"loss": 1.1579,
"step": 289
},
{
"epoch": 0.3682539682539683,
"grad_norm": 1.3984375,
"learning_rate": 6.933099916971171e-06,
"loss": 1.2648,
"step": 290
},
{
"epoch": 0.36952380952380953,
"grad_norm": 1.375,
"learning_rate": 6.932396134151762e-06,
"loss": 1.2618,
"step": 291
},
{
"epoch": 0.3707936507936508,
"grad_norm": 1.265625,
"learning_rate": 6.9316887049081885e-06,
"loss": 1.2435,
"step": 292
},
{
"epoch": 0.37206349206349204,
"grad_norm": 1.3203125,
"learning_rate": 6.930977629991993e-06,
"loss": 1.196,
"step": 293
},
{
"epoch": 0.37333333333333335,
"grad_norm": 1.296875,
"learning_rate": 6.93026291015859e-06,
"loss": 1.1568,
"step": 294
},
{
"epoch": 0.3746031746031746,
"grad_norm": 1.234375,
"learning_rate": 6.929544546167265e-06,
"loss": 1.2868,
"step": 295
},
{
"epoch": 0.37587301587301586,
"grad_norm": 1.1640625,
"learning_rate": 6.928822538781175e-06,
"loss": 1.1996,
"step": 296
},
{
"epoch": 0.37714285714285717,
"grad_norm": 1.296875,
"learning_rate": 6.92809688876735e-06,
"loss": 1.2809,
"step": 297
},
{
"epoch": 0.3784126984126984,
"grad_norm": 1.34375,
"learning_rate": 6.9273675968966874e-06,
"loss": 1.204,
"step": 298
},
{
"epoch": 0.3796825396825397,
"grad_norm": 1.328125,
"learning_rate": 6.926634663943954e-06,
"loss": 1.1716,
"step": 299
},
{
"epoch": 0.38095238095238093,
"grad_norm": 1.2265625,
"learning_rate": 6.925898090687786e-06,
"loss": 1.2552,
"step": 300
},
{
"epoch": 0.38222222222222224,
"grad_norm": 1.2265625,
"learning_rate": 6.9251578779106855e-06,
"loss": 1.179,
"step": 301
},
{
"epoch": 0.3834920634920635,
"grad_norm": 1.1640625,
"learning_rate": 6.9244140263990194e-06,
"loss": 1.1177,
"step": 302
},
{
"epoch": 0.38476190476190475,
"grad_norm": 1.2734375,
"learning_rate": 6.9236665369430255e-06,
"loss": 1.2488,
"step": 303
},
{
"epoch": 0.38603174603174606,
"grad_norm": 1.203125,
"learning_rate": 6.9229154103368015e-06,
"loss": 1.208,
"step": 304
},
{
"epoch": 0.3873015873015873,
"grad_norm": 1.2265625,
"learning_rate": 6.92216064737831e-06,
"loss": 1.1659,
"step": 305
},
{
"epoch": 0.38857142857142857,
"grad_norm": 1.2109375,
"learning_rate": 6.9214022488693786e-06,
"loss": 1.2404,
"step": 306
},
{
"epoch": 0.3898412698412698,
"grad_norm": 1.2421875,
"learning_rate": 6.920640215615697e-06,
"loss": 1.2515,
"step": 307
},
{
"epoch": 0.39111111111111113,
"grad_norm": 1.21875,
"learning_rate": 6.919874548426813e-06,
"loss": 1.2701,
"step": 308
},
{
"epoch": 0.3923809523809524,
"grad_norm": 1.1796875,
"learning_rate": 6.919105248116138e-06,
"loss": 1.253,
"step": 309
},
{
"epoch": 0.39365079365079364,
"grad_norm": 1.15625,
"learning_rate": 6.918332315500942e-06,
"loss": 1.1332,
"step": 310
},
{
"epoch": 0.3949206349206349,
"grad_norm": 1.2578125,
"learning_rate": 6.917555751402356e-06,
"loss": 1.2004,
"step": 311
},
{
"epoch": 0.3961904761904762,
"grad_norm": 1.1953125,
"learning_rate": 6.916775556645364e-06,
"loss": 1.2051,
"step": 312
},
{
"epoch": 0.39746031746031746,
"grad_norm": 1.2890625,
"learning_rate": 6.915991732058812e-06,
"loss": 1.249,
"step": 313
},
{
"epoch": 0.3987301587301587,
"grad_norm": 1.1171875,
"learning_rate": 6.915204278475399e-06,
"loss": 1.2079,
"step": 314
},
{
"epoch": 0.4,
"grad_norm": 1.25,
"learning_rate": 6.914413196731681e-06,
"loss": 1.2628,
"step": 315
},
{
"epoch": 0.4012698412698413,
"grad_norm": 1.5078125,
"learning_rate": 6.913618487668069e-06,
"loss": 1.3653,
"step": 316
},
{
"epoch": 0.40253968253968253,
"grad_norm": 1.09375,
"learning_rate": 6.912820152128825e-06,
"loss": 1.1615,
"step": 317
},
{
"epoch": 0.4038095238095238,
"grad_norm": 1.2265625,
"learning_rate": 6.912018190962065e-06,
"loss": 1.1839,
"step": 318
},
{
"epoch": 0.4050793650793651,
"grad_norm": 1.109375,
"learning_rate": 6.911212605019757e-06,
"loss": 1.1813,
"step": 319
},
{
"epoch": 0.40634920634920635,
"grad_norm": 1.234375,
"learning_rate": 6.910403395157719e-06,
"loss": 1.2426,
"step": 320
},
{
"epoch": 0.4076190476190476,
"grad_norm": 1.2421875,
"learning_rate": 6.909590562235621e-06,
"loss": 1.2174,
"step": 321
},
{
"epoch": 0.4088888888888889,
"grad_norm": 1.1640625,
"learning_rate": 6.908774107116979e-06,
"loss": 1.2622,
"step": 322
},
{
"epoch": 0.41015873015873017,
"grad_norm": 1.15625,
"learning_rate": 6.907954030669158e-06,
"loss": 1.2048,
"step": 323
},
{
"epoch": 0.4114285714285714,
"grad_norm": 1.1796875,
"learning_rate": 6.907130333763371e-06,
"loss": 1.3177,
"step": 324
},
{
"epoch": 0.4126984126984127,
"grad_norm": 1.15625,
"learning_rate": 6.906303017274677e-06,
"loss": 1.2328,
"step": 325
},
{
"epoch": 0.413968253968254,
"grad_norm": 1.2890625,
"learning_rate": 6.90547208208198e-06,
"loss": 1.2359,
"step": 326
},
{
"epoch": 0.41523809523809524,
"grad_norm": 1.09375,
"learning_rate": 6.904637529068028e-06,
"loss": 1.1834,
"step": 327
},
{
"epoch": 0.4165079365079365,
"grad_norm": 1.2265625,
"learning_rate": 6.9037993591194145e-06,
"loss": 1.1702,
"step": 328
},
{
"epoch": 0.4177777777777778,
"grad_norm": 1.1484375,
"learning_rate": 6.902957573126571e-06,
"loss": 1.3178,
"step": 329
},
{
"epoch": 0.41904761904761906,
"grad_norm": 1.21875,
"learning_rate": 6.902112171983775e-06,
"loss": 1.2117,
"step": 330
},
{
"epoch": 0.4203174603174603,
"grad_norm": 1.1328125,
"learning_rate": 6.901263156589144e-06,
"loss": 1.2687,
"step": 331
},
{
"epoch": 0.42158730158730157,
"grad_norm": 1.1171875,
"learning_rate": 6.90041052784463e-06,
"loss": 1.2933,
"step": 332
},
{
"epoch": 0.4228571428571429,
"grad_norm": 1.125,
"learning_rate": 6.899554286656032e-06,
"loss": 1.2205,
"step": 333
},
{
"epoch": 0.42412698412698413,
"grad_norm": 1.1796875,
"learning_rate": 6.89869443393298e-06,
"loss": 1.2044,
"step": 334
},
{
"epoch": 0.4253968253968254,
"grad_norm": 1.140625,
"learning_rate": 6.897830970588943e-06,
"loss": 1.2005,
"step": 335
},
{
"epoch": 0.4266666666666667,
"grad_norm": 1.328125,
"learning_rate": 6.896963897541227e-06,
"loss": 1.2374,
"step": 336
},
{
"epoch": 0.42793650793650795,
"grad_norm": 1.3671875,
"learning_rate": 6.896093215710971e-06,
"loss": 1.2029,
"step": 337
},
{
"epoch": 0.4292063492063492,
"grad_norm": 1.0390625,
"learning_rate": 6.89521892602315e-06,
"loss": 1.1452,
"step": 338
},
{
"epoch": 0.43047619047619046,
"grad_norm": 1.1015625,
"learning_rate": 6.894341029406567e-06,
"loss": 1.2151,
"step": 339
},
{
"epoch": 0.43174603174603177,
"grad_norm": 1.09375,
"learning_rate": 6.893459526793863e-06,
"loss": 1.225,
"step": 340
},
{
"epoch": 0.433015873015873,
"grad_norm": 1.125,
"learning_rate": 6.8925744191215055e-06,
"loss": 1.2256,
"step": 341
},
{
"epoch": 0.4342857142857143,
"grad_norm": 1.0390625,
"learning_rate": 6.8916857073297935e-06,
"loss": 1.1827,
"step": 342
},
{
"epoch": 0.43555555555555553,
"grad_norm": 1.078125,
"learning_rate": 6.890793392362855e-06,
"loss": 1.2185,
"step": 343
},
{
"epoch": 0.43682539682539684,
"grad_norm": 1.015625,
"learning_rate": 6.889897475168645e-06,
"loss": 1.1634,
"step": 344
},
{
"epoch": 0.4380952380952381,
"grad_norm": 1.0546875,
"learning_rate": 6.888997956698947e-06,
"loss": 1.2444,
"step": 345
},
{
"epoch": 0.43936507936507935,
"grad_norm": 1.0,
"learning_rate": 6.888094837909369e-06,
"loss": 1.1364,
"step": 346
},
{
"epoch": 0.44063492063492066,
"grad_norm": 1.03125,
"learning_rate": 6.887188119759343e-06,
"loss": 1.2109,
"step": 347
},
{
"epoch": 0.4419047619047619,
"grad_norm": 1.09375,
"learning_rate": 6.886277803212125e-06,
"loss": 1.2345,
"step": 348
},
{
"epoch": 0.44317460317460317,
"grad_norm": 1.015625,
"learning_rate": 6.885363889234797e-06,
"loss": 1.1564,
"step": 349
},
{
"epoch": 0.4444444444444444,
"grad_norm": 1.0625,
"learning_rate": 6.884446378798258e-06,
"loss": 1.1894,
"step": 350
},
{
"epoch": 0.44571428571428573,
"grad_norm": 1.0390625,
"learning_rate": 6.8835252728772335e-06,
"loss": 1.194,
"step": 351
},
{
"epoch": 0.446984126984127,
"grad_norm": 1.234375,
"learning_rate": 6.882600572450261e-06,
"loss": 1.2596,
"step": 352
},
{
"epoch": 0.44825396825396824,
"grad_norm": 1.171875,
"learning_rate": 6.881672278499705e-06,
"loss": 1.1671,
"step": 353
},
{
"epoch": 0.44952380952380955,
"grad_norm": 0.9765625,
"learning_rate": 6.880740392011738e-06,
"loss": 1.0679,
"step": 354
},
{
"epoch": 0.4507936507936508,
"grad_norm": 1.2734375,
"learning_rate": 6.879804913976361e-06,
"loss": 1.2133,
"step": 355
},
{
"epoch": 0.45206349206349206,
"grad_norm": 1.1171875,
"learning_rate": 6.87886584538738e-06,
"loss": 1.2645,
"step": 356
},
{
"epoch": 0.4533333333333333,
"grad_norm": 1.0546875,
"learning_rate": 6.87792318724242e-06,
"loss": 1.2126,
"step": 357
},
{
"epoch": 0.4546031746031746,
"grad_norm": 1.1875,
"learning_rate": 6.87697694054292e-06,
"loss": 1.2352,
"step": 358
},
{
"epoch": 0.4558730158730159,
"grad_norm": 0.99609375,
"learning_rate": 6.8760271062941286e-06,
"loss": 1.1688,
"step": 359
},
{
"epoch": 0.45714285714285713,
"grad_norm": 0.96875,
"learning_rate": 6.875073685505108e-06,
"loss": 1.2012,
"step": 360
},
{
"epoch": 0.45841269841269844,
"grad_norm": 1.1328125,
"learning_rate": 6.874116679188731e-06,
"loss": 1.2021,
"step": 361
},
{
"epoch": 0.4596825396825397,
"grad_norm": 1.078125,
"learning_rate": 6.873156088361677e-06,
"loss": 1.2262,
"step": 362
},
{
"epoch": 0.46095238095238095,
"grad_norm": 1.0,
"learning_rate": 6.872191914044435e-06,
"loss": 1.1552,
"step": 363
},
{
"epoch": 0.4622222222222222,
"grad_norm": 1.03125,
"learning_rate": 6.8712241572613e-06,
"loss": 1.2148,
"step": 364
},
{
"epoch": 0.4634920634920635,
"grad_norm": 1.0078125,
"learning_rate": 6.870252819040374e-06,
"loss": 1.2414,
"step": 365
},
{
"epoch": 0.46476190476190476,
"grad_norm": 0.9609375,
"learning_rate": 6.869277900413564e-06,
"loss": 1.1179,
"step": 366
},
{
"epoch": 0.466031746031746,
"grad_norm": 1.0859375,
"learning_rate": 6.868299402416579e-06,
"loss": 1.0943,
"step": 367
},
{
"epoch": 0.46730158730158733,
"grad_norm": 1.0546875,
"learning_rate": 6.867317326088932e-06,
"loss": 1.1381,
"step": 368
},
{
"epoch": 0.4685714285714286,
"grad_norm": 1.015625,
"learning_rate": 6.866331672473937e-06,
"loss": 1.174,
"step": 369
},
{
"epoch": 0.46984126984126984,
"grad_norm": 0.9921875,
"learning_rate": 6.865342442618709e-06,
"loss": 1.2216,
"step": 370
},
{
"epoch": 0.4711111111111111,
"grad_norm": 1.0078125,
"learning_rate": 6.86434963757416e-06,
"loss": 1.1455,
"step": 371
},
{
"epoch": 0.4723809523809524,
"grad_norm": 1.1015625,
"learning_rate": 6.863353258395003e-06,
"loss": 1.1913,
"step": 372
},
{
"epoch": 0.47365079365079366,
"grad_norm": 1.0078125,
"learning_rate": 6.8623533061397456e-06,
"loss": 1.2139,
"step": 373
},
{
"epoch": 0.4749206349206349,
"grad_norm": 1.15625,
"learning_rate": 6.861349781870693e-06,
"loss": 1.1511,
"step": 374
},
{
"epoch": 0.47619047619047616,
"grad_norm": 1.0625,
"learning_rate": 6.8603426866539436e-06,
"loss": 1.15,
"step": 375
},
{
"epoch": 0.4774603174603175,
"grad_norm": 1.0546875,
"learning_rate": 6.859332021559393e-06,
"loss": 1.1681,
"step": 376
},
{
"epoch": 0.47873015873015873,
"grad_norm": 1.1484375,
"learning_rate": 6.858317787660723e-06,
"loss": 1.2131,
"step": 377
},
{
"epoch": 0.48,
"grad_norm": 1.0390625,
"learning_rate": 6.857299986035413e-06,
"loss": 1.1533,
"step": 378
},
{
"epoch": 0.4812698412698413,
"grad_norm": 1.078125,
"learning_rate": 6.856278617764729e-06,
"loss": 1.1488,
"step": 379
},
{
"epoch": 0.48253968253968255,
"grad_norm": 1.203125,
"learning_rate": 6.855253683933727e-06,
"loss": 1.12,
"step": 380
},
{
"epoch": 0.4838095238095238,
"grad_norm": 1.0703125,
"learning_rate": 6.85422518563125e-06,
"loss": 1.2726,
"step": 381
},
{
"epoch": 0.48507936507936505,
"grad_norm": 1.0078125,
"learning_rate": 6.85319312394993e-06,
"loss": 1.2218,
"step": 382
},
{
"epoch": 0.48634920634920636,
"grad_norm": 1.125,
"learning_rate": 6.852157499986183e-06,
"loss": 1.1984,
"step": 383
},
{
"epoch": 0.4876190476190476,
"grad_norm": 1.0625,
"learning_rate": 6.85111831484021e-06,
"loss": 1.1689,
"step": 384
},
{
"epoch": 0.4888888888888889,
"grad_norm": 0.94140625,
"learning_rate": 6.8500755696159925e-06,
"loss": 1.1334,
"step": 385
},
{
"epoch": 0.4901587301587302,
"grad_norm": 1.0546875,
"learning_rate": 6.849029265421299e-06,
"loss": 1.0929,
"step": 386
},
{
"epoch": 0.49142857142857144,
"grad_norm": 1.0625,
"learning_rate": 6.847979403367674e-06,
"loss": 1.2601,
"step": 387
},
{
"epoch": 0.4926984126984127,
"grad_norm": 1.0234375,
"learning_rate": 6.846925984570446e-06,
"loss": 1.2579,
"step": 388
},
{
"epoch": 0.49396825396825395,
"grad_norm": 1.0078125,
"learning_rate": 6.8458690101487195e-06,
"loss": 1.1605,
"step": 389
},
{
"epoch": 0.49523809523809526,
"grad_norm": 0.96875,
"learning_rate": 6.844808481225377e-06,
"loss": 1.1947,
"step": 390
},
{
"epoch": 0.4965079365079365,
"grad_norm": 0.8828125,
"learning_rate": 6.8437443989270756e-06,
"loss": 1.0904,
"step": 391
},
{
"epoch": 0.49777777777777776,
"grad_norm": 0.921875,
"learning_rate": 6.84267676438425e-06,
"loss": 1.1382,
"step": 392
},
{
"epoch": 0.4990476190476191,
"grad_norm": 1.0703125,
"learning_rate": 6.8416055787311076e-06,
"loss": 1.252,
"step": 393
},
{
"epoch": 0.5003174603174603,
"grad_norm": 0.91796875,
"learning_rate": 6.840530843105628e-06,
"loss": 1.1412,
"step": 394
}
],
"logging_steps": 1,
"max_steps": 3148,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 394,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 8.734007407002255e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}