TokenBender's picture
Upload folder using huggingface_hub
973ffd8 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.30039011703511054,
"eval_steps": 500,
"global_step": 462,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"grad_norm": 0.0303955078125,
"learning_rate": 4.2553191489361704e-07,
"loss": 1.11,
"step": 1
},
{
"epoch": 0.0,
"grad_norm": 0.0294189453125,
"learning_rate": 8.510638297872341e-07,
"loss": 0.9825,
"step": 2
},
{
"epoch": 0.0,
"grad_norm": 0.028076171875,
"learning_rate": 1.276595744680851e-06,
"loss": 1.0375,
"step": 3
},
{
"epoch": 0.0,
"grad_norm": 0.036865234375,
"learning_rate": 1.7021276595744682e-06,
"loss": 1.042,
"step": 4
},
{
"epoch": 0.0,
"grad_norm": 0.0311279296875,
"learning_rate": 2.1276595744680853e-06,
"loss": 0.9769,
"step": 5
},
{
"epoch": 0.0,
"grad_norm": 0.036865234375,
"learning_rate": 2.553191489361702e-06,
"loss": 0.9316,
"step": 6
},
{
"epoch": 0.0,
"grad_norm": 0.0299072265625,
"learning_rate": 2.978723404255319e-06,
"loss": 1.0077,
"step": 7
},
{
"epoch": 0.01,
"grad_norm": 0.03369140625,
"learning_rate": 3.4042553191489363e-06,
"loss": 1.0346,
"step": 8
},
{
"epoch": 0.01,
"grad_norm": 0.0301513671875,
"learning_rate": 3.8297872340425535e-06,
"loss": 1.0193,
"step": 9
},
{
"epoch": 0.01,
"grad_norm": 0.0267333984375,
"learning_rate": 4.255319148936171e-06,
"loss": 1.0297,
"step": 10
},
{
"epoch": 0.01,
"grad_norm": 0.0291748046875,
"learning_rate": 4.680851063829788e-06,
"loss": 1.0868,
"step": 11
},
{
"epoch": 0.01,
"grad_norm": 0.0296630859375,
"learning_rate": 5.106382978723404e-06,
"loss": 1.0641,
"step": 12
},
{
"epoch": 0.01,
"grad_norm": 0.028564453125,
"learning_rate": 5.531914893617022e-06,
"loss": 0.9389,
"step": 13
},
{
"epoch": 0.01,
"grad_norm": 0.03515625,
"learning_rate": 5.957446808510638e-06,
"loss": 1.033,
"step": 14
},
{
"epoch": 0.01,
"grad_norm": 0.0283203125,
"learning_rate": 6.382978723404256e-06,
"loss": 1.0216,
"step": 15
},
{
"epoch": 0.01,
"grad_norm": 0.0322265625,
"learning_rate": 6.808510638297873e-06,
"loss": 1.1086,
"step": 16
},
{
"epoch": 0.01,
"grad_norm": 0.0322265625,
"learning_rate": 7.234042553191491e-06,
"loss": 1.0577,
"step": 17
},
{
"epoch": 0.01,
"grad_norm": 0.030517578125,
"learning_rate": 7.659574468085107e-06,
"loss": 1.0733,
"step": 18
},
{
"epoch": 0.01,
"grad_norm": 0.0303955078125,
"learning_rate": 8.085106382978723e-06,
"loss": 0.9865,
"step": 19
},
{
"epoch": 0.01,
"grad_norm": 0.0291748046875,
"learning_rate": 8.510638297872341e-06,
"loss": 1.0125,
"step": 20
},
{
"epoch": 0.01,
"grad_norm": 0.034423828125,
"learning_rate": 8.936170212765958e-06,
"loss": 1.1245,
"step": 21
},
{
"epoch": 0.01,
"grad_norm": 0.0260009765625,
"learning_rate": 9.361702127659576e-06,
"loss": 1.0024,
"step": 22
},
{
"epoch": 0.01,
"grad_norm": 0.0306396484375,
"learning_rate": 9.787234042553192e-06,
"loss": 1.0131,
"step": 23
},
{
"epoch": 0.02,
"grad_norm": 0.033447265625,
"learning_rate": 1.0212765957446808e-05,
"loss": 1.0171,
"step": 24
},
{
"epoch": 0.02,
"grad_norm": 0.033203125,
"learning_rate": 1.0638297872340426e-05,
"loss": 0.9613,
"step": 25
},
{
"epoch": 0.02,
"grad_norm": 0.03857421875,
"learning_rate": 1.1063829787234044e-05,
"loss": 1.1312,
"step": 26
},
{
"epoch": 0.02,
"grad_norm": 0.036865234375,
"learning_rate": 1.1489361702127662e-05,
"loss": 1.0187,
"step": 27
},
{
"epoch": 0.02,
"grad_norm": 0.03515625,
"learning_rate": 1.1914893617021277e-05,
"loss": 0.9934,
"step": 28
},
{
"epoch": 0.02,
"grad_norm": 0.036376953125,
"learning_rate": 1.2340425531914895e-05,
"loss": 1.0872,
"step": 29
},
{
"epoch": 0.02,
"grad_norm": 0.035888671875,
"learning_rate": 1.2765957446808513e-05,
"loss": 0.9591,
"step": 30
},
{
"epoch": 0.02,
"grad_norm": 0.033203125,
"learning_rate": 1.3191489361702127e-05,
"loss": 0.9589,
"step": 31
},
{
"epoch": 0.02,
"grad_norm": 0.040771484375,
"learning_rate": 1.3617021276595745e-05,
"loss": 1.0093,
"step": 32
},
{
"epoch": 0.02,
"grad_norm": 0.0400390625,
"learning_rate": 1.4042553191489363e-05,
"loss": 1.0195,
"step": 33
},
{
"epoch": 0.02,
"grad_norm": 0.038330078125,
"learning_rate": 1.4468085106382981e-05,
"loss": 0.8936,
"step": 34
},
{
"epoch": 0.02,
"grad_norm": 0.04443359375,
"learning_rate": 1.4893617021276596e-05,
"loss": 0.9958,
"step": 35
},
{
"epoch": 0.02,
"grad_norm": 0.045654296875,
"learning_rate": 1.5319148936170214e-05,
"loss": 0.9279,
"step": 36
},
{
"epoch": 0.02,
"grad_norm": 0.03662109375,
"learning_rate": 1.5744680851063832e-05,
"loss": 1.0153,
"step": 37
},
{
"epoch": 0.02,
"grad_norm": 0.04638671875,
"learning_rate": 1.6170212765957446e-05,
"loss": 0.9862,
"step": 38
},
{
"epoch": 0.03,
"grad_norm": 0.042236328125,
"learning_rate": 1.6595744680851064e-05,
"loss": 1.0962,
"step": 39
},
{
"epoch": 0.03,
"grad_norm": 0.040771484375,
"learning_rate": 1.7021276595744682e-05,
"loss": 0.956,
"step": 40
},
{
"epoch": 0.03,
"grad_norm": 0.035888671875,
"learning_rate": 1.74468085106383e-05,
"loss": 1.0559,
"step": 41
},
{
"epoch": 0.03,
"grad_norm": 0.04736328125,
"learning_rate": 1.7872340425531915e-05,
"loss": 1.0014,
"step": 42
},
{
"epoch": 0.03,
"grad_norm": 0.051025390625,
"learning_rate": 1.8297872340425533e-05,
"loss": 1.0252,
"step": 43
},
{
"epoch": 0.03,
"grad_norm": 0.04931640625,
"learning_rate": 1.872340425531915e-05,
"loss": 0.9541,
"step": 44
},
{
"epoch": 0.03,
"grad_norm": 0.0556640625,
"learning_rate": 1.914893617021277e-05,
"loss": 0.9603,
"step": 45
},
{
"epoch": 0.03,
"grad_norm": 0.0419921875,
"learning_rate": 1.9574468085106384e-05,
"loss": 1.0601,
"step": 46
},
{
"epoch": 0.03,
"grad_norm": 0.0478515625,
"learning_rate": 2e-05,
"loss": 0.9919,
"step": 47
},
{
"epoch": 0.03,
"grad_norm": 0.047119140625,
"learning_rate": 1.9999977801976743e-05,
"loss": 1.0247,
"step": 48
},
{
"epoch": 0.03,
"grad_norm": 0.048095703125,
"learning_rate": 1.999991120800551e-05,
"loss": 0.9936,
"step": 49
},
{
"epoch": 0.03,
"grad_norm": 0.05419921875,
"learning_rate": 1.9999800218381958e-05,
"loss": 1.0315,
"step": 50
},
{
"epoch": 0.03,
"grad_norm": 0.0478515625,
"learning_rate": 1.9999644833598836e-05,
"loss": 0.9392,
"step": 51
},
{
"epoch": 0.03,
"grad_norm": 0.0546875,
"learning_rate": 1.9999445054345993e-05,
"loss": 1.0716,
"step": 52
},
{
"epoch": 0.03,
"grad_norm": 0.05224609375,
"learning_rate": 1.9999200881510366e-05,
"loss": 0.9724,
"step": 53
},
{
"epoch": 0.04,
"grad_norm": 0.04736328125,
"learning_rate": 1.999891231617599e-05,
"loss": 0.9966,
"step": 54
},
{
"epoch": 0.04,
"grad_norm": 0.049072265625,
"learning_rate": 1.9998579359623977e-05,
"loss": 0.969,
"step": 55
},
{
"epoch": 0.04,
"grad_norm": 0.051513671875,
"learning_rate": 1.9998202013332525e-05,
"loss": 0.972,
"step": 56
},
{
"epoch": 0.04,
"grad_norm": 0.043701171875,
"learning_rate": 1.99977802789769e-05,
"loss": 0.9705,
"step": 57
},
{
"epoch": 0.04,
"grad_norm": 0.044189453125,
"learning_rate": 1.999731415842944e-05,
"loss": 1.002,
"step": 58
},
{
"epoch": 0.04,
"grad_norm": 0.039794921875,
"learning_rate": 1.9996803653759534e-05,
"loss": 0.9508,
"step": 59
},
{
"epoch": 0.04,
"grad_norm": 0.03759765625,
"learning_rate": 1.9996248767233616e-05,
"loss": 0.9232,
"step": 60
},
{
"epoch": 0.04,
"grad_norm": 0.0390625,
"learning_rate": 1.9995649501315172e-05,
"loss": 1.0054,
"step": 61
},
{
"epoch": 0.04,
"grad_norm": 0.034423828125,
"learning_rate": 1.9995005858664696e-05,
"loss": 0.9685,
"step": 62
},
{
"epoch": 0.04,
"grad_norm": 0.03369140625,
"learning_rate": 1.9994317842139715e-05,
"loss": 0.9313,
"step": 63
},
{
"epoch": 0.04,
"grad_norm": 0.0311279296875,
"learning_rate": 1.9993585454794748e-05,
"loss": 0.9463,
"step": 64
},
{
"epoch": 0.04,
"grad_norm": 0.0311279296875,
"learning_rate": 1.9992808699881303e-05,
"loss": 0.9049,
"step": 65
},
{
"epoch": 0.04,
"grad_norm": 0.0322265625,
"learning_rate": 1.999198758084787e-05,
"loss": 0.9088,
"step": 66
},
{
"epoch": 0.04,
"grad_norm": 0.033203125,
"learning_rate": 1.9991122101339885e-05,
"loss": 0.9369,
"step": 67
},
{
"epoch": 0.04,
"grad_norm": 0.0478515625,
"learning_rate": 1.9990212265199738e-05,
"loss": 0.9902,
"step": 68
},
{
"epoch": 0.04,
"grad_norm": 0.03466796875,
"learning_rate": 1.9989258076466743e-05,
"loss": 0.9569,
"step": 69
},
{
"epoch": 0.05,
"grad_norm": 0.042724609375,
"learning_rate": 1.998825953937712e-05,
"loss": 0.9779,
"step": 70
},
{
"epoch": 0.05,
"grad_norm": 0.0380859375,
"learning_rate": 1.9987216658363983e-05,
"loss": 0.9505,
"step": 71
},
{
"epoch": 0.05,
"grad_norm": 0.036376953125,
"learning_rate": 1.9986129438057306e-05,
"loss": 0.9374,
"step": 72
},
{
"epoch": 0.05,
"grad_norm": 0.0361328125,
"learning_rate": 1.998499788328392e-05,
"loss": 1.0086,
"step": 73
},
{
"epoch": 0.05,
"grad_norm": 0.034912109375,
"learning_rate": 1.9983821999067478e-05,
"loss": 1.046,
"step": 74
},
{
"epoch": 0.05,
"grad_norm": 0.031005859375,
"learning_rate": 1.998260179062844e-05,
"loss": 0.9375,
"step": 75
},
{
"epoch": 0.05,
"grad_norm": 0.032958984375,
"learning_rate": 1.9981337263384057e-05,
"loss": 0.9514,
"step": 76
},
{
"epoch": 0.05,
"grad_norm": 0.031982421875,
"learning_rate": 1.9980028422948323e-05,
"loss": 0.8629,
"step": 77
},
{
"epoch": 0.05,
"grad_norm": 0.03125,
"learning_rate": 1.9978675275131975e-05,
"loss": 0.933,
"step": 78
},
{
"epoch": 0.05,
"grad_norm": 0.0299072265625,
"learning_rate": 1.9977277825942453e-05,
"loss": 0.9408,
"step": 79
},
{
"epoch": 0.05,
"grad_norm": 0.031494140625,
"learning_rate": 1.997583608158388e-05,
"loss": 1.0041,
"step": 80
},
{
"epoch": 0.05,
"grad_norm": 0.031982421875,
"learning_rate": 1.997435004845703e-05,
"loss": 0.9605,
"step": 81
},
{
"epoch": 0.05,
"grad_norm": 0.03271484375,
"learning_rate": 1.99728197331593e-05,
"loss": 0.9256,
"step": 82
},
{
"epoch": 0.05,
"grad_norm": 0.034912109375,
"learning_rate": 1.9971245142484693e-05,
"loss": 1.0026,
"step": 83
},
{
"epoch": 0.05,
"grad_norm": 0.031494140625,
"learning_rate": 1.996962628342376e-05,
"loss": 0.9789,
"step": 84
},
{
"epoch": 0.06,
"grad_norm": 0.0341796875,
"learning_rate": 1.99679631631636e-05,
"loss": 0.9437,
"step": 85
},
{
"epoch": 0.06,
"grad_norm": 0.030517578125,
"learning_rate": 1.996625578908781e-05,
"loss": 0.9487,
"step": 86
},
{
"epoch": 0.06,
"grad_norm": 0.033447265625,
"learning_rate": 1.9964504168776454e-05,
"loss": 0.9645,
"step": 87
},
{
"epoch": 0.06,
"grad_norm": 0.03271484375,
"learning_rate": 1.9962708310006032e-05,
"loss": 0.9967,
"step": 88
},
{
"epoch": 0.06,
"grad_norm": 0.0296630859375,
"learning_rate": 1.996086822074945e-05,
"loss": 1.0195,
"step": 89
},
{
"epoch": 0.06,
"grad_norm": 0.030517578125,
"learning_rate": 1.9958983909175977e-05,
"loss": 0.8769,
"step": 90
},
{
"epoch": 0.06,
"grad_norm": 0.031494140625,
"learning_rate": 1.995705538365121e-05,
"loss": 0.8407,
"step": 91
},
{
"epoch": 0.06,
"grad_norm": 0.033203125,
"learning_rate": 1.995508265273704e-05,
"loss": 0.9368,
"step": 92
},
{
"epoch": 0.06,
"grad_norm": 0.031982421875,
"learning_rate": 1.9953065725191613e-05,
"loss": 0.9308,
"step": 93
},
{
"epoch": 0.06,
"grad_norm": 0.03076171875,
"learning_rate": 1.9951004609969286e-05,
"loss": 0.9235,
"step": 94
},
{
"epoch": 0.06,
"grad_norm": 0.032958984375,
"learning_rate": 1.9948899316220603e-05,
"loss": 0.9008,
"step": 95
},
{
"epoch": 0.06,
"grad_norm": 0.03173828125,
"learning_rate": 1.9946749853292233e-05,
"loss": 0.9735,
"step": 96
},
{
"epoch": 0.06,
"grad_norm": 0.033447265625,
"learning_rate": 1.994455623072694e-05,
"loss": 0.9328,
"step": 97
},
{
"epoch": 0.06,
"grad_norm": 0.033203125,
"learning_rate": 1.994231845826354e-05,
"loss": 0.8967,
"step": 98
},
{
"epoch": 0.06,
"grad_norm": 0.03173828125,
"learning_rate": 1.994003654583686e-05,
"loss": 0.8363,
"step": 99
},
{
"epoch": 0.07,
"grad_norm": 0.033447265625,
"learning_rate": 1.993771050357769e-05,
"loss": 0.9072,
"step": 100
},
{
"epoch": 0.07,
"grad_norm": 0.03369140625,
"learning_rate": 1.9935340341812737e-05,
"loss": 0.9502,
"step": 101
},
{
"epoch": 0.07,
"grad_norm": 0.03271484375,
"learning_rate": 1.993292607106458e-05,
"loss": 0.8794,
"step": 102
},
{
"epoch": 0.07,
"grad_norm": 0.03466796875,
"learning_rate": 1.9930467702051632e-05,
"loss": 0.9601,
"step": 103
},
{
"epoch": 0.07,
"grad_norm": 0.0341796875,
"learning_rate": 1.9927965245688073e-05,
"loss": 0.9099,
"step": 104
},
{
"epoch": 0.07,
"grad_norm": 0.033935546875,
"learning_rate": 1.9925418713083824e-05,
"loss": 0.929,
"step": 105
},
{
"epoch": 0.07,
"grad_norm": 0.033447265625,
"learning_rate": 1.992282811554448e-05,
"loss": 0.9046,
"step": 106
},
{
"epoch": 0.07,
"grad_norm": 0.031005859375,
"learning_rate": 1.9920193464571277e-05,
"loss": 0.9393,
"step": 107
},
{
"epoch": 0.07,
"grad_norm": 0.03515625,
"learning_rate": 1.9917514771861015e-05,
"loss": 0.9933,
"step": 108
},
{
"epoch": 0.07,
"grad_norm": 0.035400390625,
"learning_rate": 1.9914792049306034e-05,
"loss": 0.8865,
"step": 109
},
{
"epoch": 0.07,
"grad_norm": 0.032958984375,
"learning_rate": 1.9912025308994146e-05,
"loss": 0.9158,
"step": 110
},
{
"epoch": 0.07,
"grad_norm": 0.035888671875,
"learning_rate": 1.990921456320859e-05,
"loss": 0.9143,
"step": 111
},
{
"epoch": 0.07,
"grad_norm": 0.03369140625,
"learning_rate": 1.9906359824427953e-05,
"loss": 0.9707,
"step": 112
},
{
"epoch": 0.07,
"grad_norm": 0.0341796875,
"learning_rate": 1.9903461105326155e-05,
"loss": 0.8894,
"step": 113
},
{
"epoch": 0.07,
"grad_norm": 0.0341796875,
"learning_rate": 1.9900518418772364e-05,
"loss": 0.966,
"step": 114
},
{
"epoch": 0.07,
"grad_norm": 0.035888671875,
"learning_rate": 1.989753177783094e-05,
"loss": 0.9201,
"step": 115
},
{
"epoch": 0.08,
"grad_norm": 0.0341796875,
"learning_rate": 1.9894501195761393e-05,
"loss": 0.9299,
"step": 116
},
{
"epoch": 0.08,
"grad_norm": 0.03369140625,
"learning_rate": 1.9891426686018308e-05,
"loss": 0.8812,
"step": 117
},
{
"epoch": 0.08,
"grad_norm": 0.035888671875,
"learning_rate": 1.9888308262251286e-05,
"loss": 0.9995,
"step": 118
},
{
"epoch": 0.08,
"grad_norm": 0.031982421875,
"learning_rate": 1.9885145938304905e-05,
"loss": 0.8804,
"step": 119
},
{
"epoch": 0.08,
"grad_norm": 0.03955078125,
"learning_rate": 1.988193972821863e-05,
"loss": 0.9021,
"step": 120
},
{
"epoch": 0.08,
"grad_norm": 0.039306640625,
"learning_rate": 1.987868964622676e-05,
"loss": 0.8066,
"step": 121
},
{
"epoch": 0.08,
"grad_norm": 0.0361328125,
"learning_rate": 1.9875395706758388e-05,
"loss": 0.909,
"step": 122
},
{
"epoch": 0.08,
"grad_norm": 0.037109375,
"learning_rate": 1.987205792443729e-05,
"loss": 0.8611,
"step": 123
},
{
"epoch": 0.08,
"grad_norm": 0.037841796875,
"learning_rate": 1.9868676314081907e-05,
"loss": 0.9249,
"step": 124
},
{
"epoch": 0.08,
"grad_norm": 0.0322265625,
"learning_rate": 1.986525089070525e-05,
"loss": 0.837,
"step": 125
},
{
"epoch": 0.08,
"grad_norm": 0.03515625,
"learning_rate": 1.986178166951484e-05,
"loss": 0.8653,
"step": 126
},
{
"epoch": 0.08,
"grad_norm": 0.03662109375,
"learning_rate": 1.9858268665912653e-05,
"loss": 0.9011,
"step": 127
},
{
"epoch": 0.08,
"grad_norm": 0.035888671875,
"learning_rate": 1.9854711895495034e-05,
"loss": 0.9942,
"step": 128
},
{
"epoch": 0.08,
"grad_norm": 0.032958984375,
"learning_rate": 1.985111137405264e-05,
"loss": 0.9303,
"step": 129
},
{
"epoch": 0.08,
"grad_norm": 0.03369140625,
"learning_rate": 1.9847467117570364e-05,
"loss": 0.9206,
"step": 130
},
{
"epoch": 0.09,
"grad_norm": 0.033935546875,
"learning_rate": 1.9843779142227258e-05,
"loss": 0.8366,
"step": 131
},
{
"epoch": 0.09,
"grad_norm": 0.0380859375,
"learning_rate": 1.9840047464396477e-05,
"loss": 0.8988,
"step": 132
},
{
"epoch": 0.09,
"grad_norm": 0.06005859375,
"learning_rate": 1.98362721006452e-05,
"loss": 0.9719,
"step": 133
},
{
"epoch": 0.09,
"grad_norm": 0.03466796875,
"learning_rate": 1.983245306773454e-05,
"loss": 0.9629,
"step": 134
},
{
"epoch": 0.09,
"grad_norm": 0.0380859375,
"learning_rate": 1.98285903826195e-05,
"loss": 0.8384,
"step": 135
},
{
"epoch": 0.09,
"grad_norm": 0.034423828125,
"learning_rate": 1.9824684062448876e-05,
"loss": 0.8031,
"step": 136
},
{
"epoch": 0.09,
"grad_norm": 0.036376953125,
"learning_rate": 1.982073412456518e-05,
"loss": 0.8623,
"step": 137
},
{
"epoch": 0.09,
"grad_norm": 0.033935546875,
"learning_rate": 1.981674058650458e-05,
"loss": 0.8357,
"step": 138
},
{
"epoch": 0.09,
"grad_norm": 0.036376953125,
"learning_rate": 1.98127034659968e-05,
"loss": 0.9306,
"step": 139
},
{
"epoch": 0.09,
"grad_norm": 0.03564453125,
"learning_rate": 1.9808622780965064e-05,
"loss": 0.9464,
"step": 140
},
{
"epoch": 0.09,
"grad_norm": 0.033935546875,
"learning_rate": 1.9804498549526e-05,
"loss": 0.9146,
"step": 141
},
{
"epoch": 0.09,
"grad_norm": 0.034912109375,
"learning_rate": 1.980033078998956e-05,
"loss": 0.8999,
"step": 142
},
{
"epoch": 0.09,
"grad_norm": 0.03564453125,
"learning_rate": 1.9796119520858957e-05,
"loss": 0.9932,
"step": 143
},
{
"epoch": 0.09,
"grad_norm": 0.035888671875,
"learning_rate": 1.9791864760830554e-05,
"loss": 0.8976,
"step": 144
},
{
"epoch": 0.09,
"grad_norm": 0.03369140625,
"learning_rate": 1.9787566528793806e-05,
"loss": 0.9024,
"step": 145
},
{
"epoch": 0.09,
"grad_norm": 0.033447265625,
"learning_rate": 1.9783224843831162e-05,
"loss": 0.8262,
"step": 146
},
{
"epoch": 0.1,
"grad_norm": 0.036376953125,
"learning_rate": 1.977883972521799e-05,
"loss": 0.9491,
"step": 147
},
{
"epoch": 0.1,
"grad_norm": 0.0361328125,
"learning_rate": 1.9774411192422486e-05,
"loss": 0.9347,
"step": 148
},
{
"epoch": 0.1,
"grad_norm": 0.0390625,
"learning_rate": 1.9769939265105573e-05,
"loss": 0.8401,
"step": 149
},
{
"epoch": 0.1,
"grad_norm": 0.03466796875,
"learning_rate": 1.976542396312085e-05,
"loss": 0.8949,
"step": 150
},
{
"epoch": 0.1,
"grad_norm": 0.03369140625,
"learning_rate": 1.976086530651447e-05,
"loss": 0.8675,
"step": 151
},
{
"epoch": 0.1,
"grad_norm": 0.0322265625,
"learning_rate": 1.975626331552507e-05,
"loss": 0.8617,
"step": 152
},
{
"epoch": 0.1,
"grad_norm": 0.034423828125,
"learning_rate": 1.9751618010583665e-05,
"loss": 0.8374,
"step": 153
},
{
"epoch": 0.1,
"grad_norm": 0.036865234375,
"learning_rate": 1.974692941231357e-05,
"loss": 0.8396,
"step": 154
},
{
"epoch": 0.1,
"grad_norm": 0.034912109375,
"learning_rate": 1.974219754153032e-05,
"loss": 0.9553,
"step": 155
},
{
"epoch": 0.1,
"grad_norm": 0.0341796875,
"learning_rate": 1.9737422419241538e-05,
"loss": 0.8821,
"step": 156
},
{
"epoch": 0.1,
"grad_norm": 0.03466796875,
"learning_rate": 1.9732604066646882e-05,
"loss": 0.8778,
"step": 157
},
{
"epoch": 0.1,
"grad_norm": 0.03955078125,
"learning_rate": 1.9727742505137936e-05,
"loss": 0.8552,
"step": 158
},
{
"epoch": 0.1,
"grad_norm": 0.03515625,
"learning_rate": 1.9722837756298112e-05,
"loss": 0.9358,
"step": 159
},
{
"epoch": 0.1,
"grad_norm": 0.03466796875,
"learning_rate": 1.9717889841902553e-05,
"loss": 0.9171,
"step": 160
},
{
"epoch": 0.1,
"grad_norm": 0.033447265625,
"learning_rate": 1.971289878391804e-05,
"loss": 0.8395,
"step": 161
},
{
"epoch": 0.11,
"grad_norm": 0.035400390625,
"learning_rate": 1.97078646045029e-05,
"loss": 0.8955,
"step": 162
},
{
"epoch": 0.11,
"grad_norm": 0.0419921875,
"learning_rate": 1.9702787326006906e-05,
"loss": 0.8192,
"step": 163
},
{
"epoch": 0.11,
"grad_norm": 0.037353515625,
"learning_rate": 1.9697666970971153e-05,
"loss": 0.8264,
"step": 164
},
{
"epoch": 0.11,
"grad_norm": 0.03857421875,
"learning_rate": 1.9692503562128004e-05,
"loss": 0.9093,
"step": 165
},
{
"epoch": 0.11,
"grad_norm": 0.044189453125,
"learning_rate": 1.9687297122400952e-05,
"loss": 0.9446,
"step": 166
},
{
"epoch": 0.11,
"grad_norm": 0.03466796875,
"learning_rate": 1.9682047674904527e-05,
"loss": 0.8802,
"step": 167
},
{
"epoch": 0.11,
"grad_norm": 0.03564453125,
"learning_rate": 1.9676755242944202e-05,
"loss": 0.9152,
"step": 168
},
{
"epoch": 0.11,
"grad_norm": 0.032958984375,
"learning_rate": 1.9671419850016283e-05,
"loss": 0.8396,
"step": 169
},
{
"epoch": 0.11,
"grad_norm": 0.0341796875,
"learning_rate": 1.9666041519807802e-05,
"loss": 0.7976,
"step": 170
},
{
"epoch": 0.11,
"grad_norm": 0.036376953125,
"learning_rate": 1.966062027619643e-05,
"loss": 0.8979,
"step": 171
},
{
"epoch": 0.11,
"grad_norm": 0.032470703125,
"learning_rate": 1.9655156143250328e-05,
"loss": 0.8632,
"step": 172
},
{
"epoch": 0.11,
"grad_norm": 0.0390625,
"learning_rate": 1.96496491452281e-05,
"loss": 0.9456,
"step": 173
},
{
"epoch": 0.11,
"grad_norm": 0.0361328125,
"learning_rate": 1.9644099306578636e-05,
"loss": 0.837,
"step": 174
},
{
"epoch": 0.11,
"grad_norm": 0.031982421875,
"learning_rate": 1.9638506651941024e-05,
"loss": 0.7911,
"step": 175
},
{
"epoch": 0.11,
"grad_norm": 0.03564453125,
"learning_rate": 1.963287120614444e-05,
"loss": 0.8926,
"step": 176
},
{
"epoch": 0.12,
"grad_norm": 0.0341796875,
"learning_rate": 1.9627192994208038e-05,
"loss": 0.8054,
"step": 177
},
{
"epoch": 0.12,
"grad_norm": 0.034423828125,
"learning_rate": 1.962147204134083e-05,
"loss": 0.9226,
"step": 178
},
{
"epoch": 0.12,
"grad_norm": 0.035400390625,
"learning_rate": 1.9615708372941588e-05,
"loss": 0.8987,
"step": 179
},
{
"epoch": 0.12,
"grad_norm": 0.040283203125,
"learning_rate": 1.960990201459872e-05,
"loss": 0.8729,
"step": 180
},
{
"epoch": 0.12,
"grad_norm": 0.039306640625,
"learning_rate": 1.960405299209016e-05,
"loss": 0.9454,
"step": 181
},
{
"epoch": 0.12,
"grad_norm": 0.035888671875,
"learning_rate": 1.9598161331383258e-05,
"loss": 0.9157,
"step": 182
},
{
"epoch": 0.12,
"grad_norm": 0.03857421875,
"learning_rate": 1.9592227058634655e-05,
"loss": 0.8724,
"step": 183
},
{
"epoch": 0.12,
"grad_norm": 0.0361328125,
"learning_rate": 1.958625020019018e-05,
"loss": 0.8446,
"step": 184
},
{
"epoch": 0.12,
"grad_norm": 0.0341796875,
"learning_rate": 1.9580230782584722e-05,
"loss": 0.8441,
"step": 185
},
{
"epoch": 0.12,
"grad_norm": 0.037841796875,
"learning_rate": 1.957416883254211e-05,
"loss": 0.9078,
"step": 186
},
{
"epoch": 0.12,
"grad_norm": 0.037109375,
"learning_rate": 1.9568064376975013e-05,
"loss": 0.9075,
"step": 187
},
{
"epoch": 0.12,
"grad_norm": 0.036376953125,
"learning_rate": 1.956191744298479e-05,
"loss": 0.8932,
"step": 188
},
{
"epoch": 0.12,
"grad_norm": 0.03466796875,
"learning_rate": 1.955572805786141e-05,
"loss": 0.8577,
"step": 189
},
{
"epoch": 0.12,
"grad_norm": 0.03564453125,
"learning_rate": 1.9549496249083288e-05,
"loss": 0.8257,
"step": 190
},
{
"epoch": 0.12,
"grad_norm": 0.03369140625,
"learning_rate": 1.954322204431719e-05,
"loss": 0.7848,
"step": 191
},
{
"epoch": 0.12,
"grad_norm": 0.037353515625,
"learning_rate": 1.953690547141811e-05,
"loss": 0.8617,
"step": 192
},
{
"epoch": 0.13,
"grad_norm": 0.034912109375,
"learning_rate": 1.953054655842913e-05,
"loss": 0.7992,
"step": 193
},
{
"epoch": 0.13,
"grad_norm": 0.03515625,
"learning_rate": 1.9524145333581315e-05,
"loss": 0.8101,
"step": 194
},
{
"epoch": 0.13,
"grad_norm": 0.038330078125,
"learning_rate": 1.951770182529357e-05,
"loss": 0.8669,
"step": 195
},
{
"epoch": 0.13,
"grad_norm": 0.03564453125,
"learning_rate": 1.951121606217252e-05,
"loss": 0.8589,
"step": 196
},
{
"epoch": 0.13,
"grad_norm": 0.036376953125,
"learning_rate": 1.9504688073012397e-05,
"loss": 0.9205,
"step": 197
},
{
"epoch": 0.13,
"grad_norm": 0.039794921875,
"learning_rate": 1.9498117886794885e-05,
"loss": 0.9052,
"step": 198
},
{
"epoch": 0.13,
"grad_norm": 0.037109375,
"learning_rate": 1.9491505532689017e-05,
"loss": 0.8167,
"step": 199
},
{
"epoch": 0.13,
"grad_norm": 0.03662109375,
"learning_rate": 1.948485104005103e-05,
"loss": 0.9358,
"step": 200
},
{
"epoch": 0.13,
"grad_norm": 0.03759765625,
"learning_rate": 1.947815443842424e-05,
"loss": 0.8639,
"step": 201
},
{
"epoch": 0.13,
"grad_norm": 0.034423828125,
"learning_rate": 1.9471415757538918e-05,
"loss": 0.8684,
"step": 202
},
{
"epoch": 0.13,
"grad_norm": 0.032470703125,
"learning_rate": 1.946463502731213e-05,
"loss": 0.7762,
"step": 203
},
{
"epoch": 0.13,
"grad_norm": 0.034912109375,
"learning_rate": 1.9457812277847645e-05,
"loss": 0.8664,
"step": 204
},
{
"epoch": 0.13,
"grad_norm": 0.038330078125,
"learning_rate": 1.945094753943577e-05,
"loss": 0.9964,
"step": 205
},
{
"epoch": 0.13,
"grad_norm": 0.037353515625,
"learning_rate": 1.944404084255324e-05,
"loss": 0.8768,
"step": 206
},
{
"epoch": 0.13,
"grad_norm": 0.0380859375,
"learning_rate": 1.9437092217863043e-05,
"loss": 0.8999,
"step": 207
},
{
"epoch": 0.14,
"grad_norm": 0.036376953125,
"learning_rate": 1.9430101696214335e-05,
"loss": 0.8437,
"step": 208
},
{
"epoch": 0.14,
"grad_norm": 0.037841796875,
"learning_rate": 1.9423069308642267e-05,
"loss": 0.8273,
"step": 209
},
{
"epoch": 0.14,
"grad_norm": 0.04052734375,
"learning_rate": 1.9415995086367858e-05,
"loss": 0.9275,
"step": 210
},
{
"epoch": 0.14,
"grad_norm": 0.036865234375,
"learning_rate": 1.940887906079786e-05,
"loss": 0.8938,
"step": 211
},
{
"epoch": 0.14,
"grad_norm": 0.033203125,
"learning_rate": 1.9401721263524616e-05,
"loss": 0.8414,
"step": 212
},
{
"epoch": 0.14,
"grad_norm": 0.037109375,
"learning_rate": 1.9394521726325907e-05,
"loss": 0.9055,
"step": 213
},
{
"epoch": 0.14,
"grad_norm": 0.042236328125,
"learning_rate": 1.938728048116484e-05,
"loss": 0.9002,
"step": 214
},
{
"epoch": 0.14,
"grad_norm": 0.034912109375,
"learning_rate": 1.9379997560189677e-05,
"loss": 0.8598,
"step": 215
},
{
"epoch": 0.14,
"grad_norm": 0.038330078125,
"learning_rate": 1.9372672995733706e-05,
"loss": 0.8557,
"step": 216
},
{
"epoch": 0.14,
"grad_norm": 0.036376953125,
"learning_rate": 1.9365306820315104e-05,
"loss": 0.9001,
"step": 217
},
{
"epoch": 0.14,
"grad_norm": 0.037109375,
"learning_rate": 1.9357899066636774e-05,
"loss": 0.842,
"step": 218
},
{
"epoch": 0.14,
"grad_norm": 0.036865234375,
"learning_rate": 1.935044976758621e-05,
"loss": 0.8759,
"step": 219
},
{
"epoch": 0.14,
"grad_norm": 0.03759765625,
"learning_rate": 1.9342958956235365e-05,
"loss": 0.8306,
"step": 220
},
{
"epoch": 0.14,
"grad_norm": 0.03759765625,
"learning_rate": 1.933542666584047e-05,
"loss": 0.8322,
"step": 221
},
{
"epoch": 0.14,
"grad_norm": 0.03515625,
"learning_rate": 1.9327852929841918e-05,
"loss": 0.8149,
"step": 222
},
{
"epoch": 0.14,
"grad_norm": 0.03955078125,
"learning_rate": 1.9320237781864106e-05,
"loss": 0.8458,
"step": 223
},
{
"epoch": 0.15,
"grad_norm": 0.03759765625,
"learning_rate": 1.9312581255715276e-05,
"loss": 0.84,
"step": 224
},
{
"epoch": 0.15,
"grad_norm": 0.038818359375,
"learning_rate": 1.9304883385387383e-05,
"loss": 0.8254,
"step": 225
},
{
"epoch": 0.15,
"grad_norm": 0.03466796875,
"learning_rate": 1.9297144205055925e-05,
"loss": 0.8898,
"step": 226
},
{
"epoch": 0.15,
"grad_norm": 0.037353515625,
"learning_rate": 1.9289363749079798e-05,
"loss": 0.8231,
"step": 227
},
{
"epoch": 0.15,
"grad_norm": 0.041015625,
"learning_rate": 1.928154205200116e-05,
"loss": 0.8764,
"step": 228
},
{
"epoch": 0.15,
"grad_norm": 0.037353515625,
"learning_rate": 1.9273679148545246e-05,
"loss": 0.8436,
"step": 229
},
{
"epoch": 0.15,
"grad_norm": 0.037841796875,
"learning_rate": 1.9265775073620244e-05,
"loss": 0.8622,
"step": 230
},
{
"epoch": 0.15,
"grad_norm": 0.036865234375,
"learning_rate": 1.9257829862317118e-05,
"loss": 0.8484,
"step": 231
},
{
"epoch": 0.15,
"grad_norm": 0.037841796875,
"learning_rate": 1.9249843549909467e-05,
"loss": 0.8765,
"step": 232
},
{
"epoch": 0.15,
"grad_norm": 0.03759765625,
"learning_rate": 1.9241816171853362e-05,
"loss": 0.8762,
"step": 233
},
{
"epoch": 0.15,
"grad_norm": 0.03955078125,
"learning_rate": 1.9233747763787187e-05,
"loss": 0.8716,
"step": 234
},
{
"epoch": 0.15,
"grad_norm": 0.04443359375,
"learning_rate": 1.9225638361531482e-05,
"loss": 0.8453,
"step": 235
},
{
"epoch": 0.15,
"grad_norm": 0.037109375,
"learning_rate": 1.9217488001088784e-05,
"loss": 0.7992,
"step": 236
},
{
"epoch": 0.15,
"grad_norm": 0.036865234375,
"learning_rate": 1.920929671864348e-05,
"loss": 0.9607,
"step": 237
},
{
"epoch": 0.15,
"grad_norm": 0.037841796875,
"learning_rate": 1.920106455056162e-05,
"loss": 0.8416,
"step": 238
},
{
"epoch": 0.16,
"grad_norm": 0.0380859375,
"learning_rate": 1.9192791533390778e-05,
"loss": 0.7983,
"step": 239
},
{
"epoch": 0.16,
"grad_norm": 0.040283203125,
"learning_rate": 1.9184477703859876e-05,
"loss": 0.8942,
"step": 240
},
{
"epoch": 0.16,
"grad_norm": 0.037841796875,
"learning_rate": 1.9176123098879035e-05,
"loss": 0.8849,
"step": 241
},
{
"epoch": 0.16,
"grad_norm": 0.037109375,
"learning_rate": 1.9167727755539393e-05,
"loss": 0.83,
"step": 242
},
{
"epoch": 0.16,
"grad_norm": 0.036865234375,
"learning_rate": 1.9159291711112962e-05,
"loss": 0.7999,
"step": 243
},
{
"epoch": 0.16,
"grad_norm": 0.035400390625,
"learning_rate": 1.9150815003052436e-05,
"loss": 0.8281,
"step": 244
},
{
"epoch": 0.16,
"grad_norm": 0.038818359375,
"learning_rate": 1.9142297668991053e-05,
"loss": 0.884,
"step": 245
},
{
"epoch": 0.16,
"grad_norm": 0.044189453125,
"learning_rate": 1.913373974674241e-05,
"loss": 0.8701,
"step": 246
},
{
"epoch": 0.16,
"grad_norm": 0.039306640625,
"learning_rate": 1.9125141274300293e-05,
"loss": 0.8734,
"step": 247
},
{
"epoch": 0.16,
"grad_norm": 0.03857421875,
"learning_rate": 1.9116502289838524e-05,
"loss": 0.8851,
"step": 248
},
{
"epoch": 0.16,
"grad_norm": 0.044189453125,
"learning_rate": 1.910782283171078e-05,
"loss": 0.9402,
"step": 249
},
{
"epoch": 0.16,
"grad_norm": 0.038818359375,
"learning_rate": 1.909910293845042e-05,
"loss": 0.831,
"step": 250
},
{
"epoch": 0.16,
"grad_norm": 0.038330078125,
"learning_rate": 1.909034264877032e-05,
"loss": 0.8093,
"step": 251
},
{
"epoch": 0.16,
"grad_norm": 0.039794921875,
"learning_rate": 1.9081542001562713e-05,
"loss": 0.9085,
"step": 252
},
{
"epoch": 0.16,
"grad_norm": 0.038330078125,
"learning_rate": 1.9072701035898985e-05,
"loss": 0.8466,
"step": 253
},
{
"epoch": 0.17,
"grad_norm": 0.0419921875,
"learning_rate": 1.906381979102953e-05,
"loss": 0.8938,
"step": 254
},
{
"epoch": 0.17,
"grad_norm": 0.0400390625,
"learning_rate": 1.9054898306383568e-05,
"loss": 0.8787,
"step": 255
},
{
"epoch": 0.17,
"grad_norm": 0.040283203125,
"learning_rate": 1.904593662156896e-05,
"loss": 0.882,
"step": 256
},
{
"epoch": 0.17,
"grad_norm": 0.03759765625,
"learning_rate": 1.903693477637204e-05,
"loss": 0.7803,
"step": 257
},
{
"epoch": 0.17,
"grad_norm": 0.037353515625,
"learning_rate": 1.902789281075745e-05,
"loss": 0.8078,
"step": 258
},
{
"epoch": 0.17,
"grad_norm": 0.03857421875,
"learning_rate": 1.9018810764867935e-05,
"loss": 0.8318,
"step": 259
},
{
"epoch": 0.17,
"grad_norm": 0.04150390625,
"learning_rate": 1.900968867902419e-05,
"loss": 0.8728,
"step": 260
},
{
"epoch": 0.17,
"grad_norm": 0.0390625,
"learning_rate": 1.9000526593724678e-05,
"loss": 0.836,
"step": 261
},
{
"epoch": 0.17,
"grad_norm": 0.0380859375,
"learning_rate": 1.8991324549645424e-05,
"loss": 0.9197,
"step": 262
},
{
"epoch": 0.17,
"grad_norm": 0.04150390625,
"learning_rate": 1.898208258763987e-05,
"loss": 0.7965,
"step": 263
},
{
"epoch": 0.17,
"grad_norm": 0.041015625,
"learning_rate": 1.897280074873868e-05,
"loss": 0.8078,
"step": 264
},
{
"epoch": 0.17,
"grad_norm": 0.042236328125,
"learning_rate": 1.8963479074149537e-05,
"loss": 0.9035,
"step": 265
},
{
"epoch": 0.17,
"grad_norm": 0.040771484375,
"learning_rate": 1.8954117605257e-05,
"loss": 0.8515,
"step": 266
},
{
"epoch": 0.17,
"grad_norm": 0.041015625,
"learning_rate": 1.8944716383622288e-05,
"loss": 0.8147,
"step": 267
},
{
"epoch": 0.17,
"grad_norm": 0.040771484375,
"learning_rate": 1.8935275450983102e-05,
"loss": 0.8121,
"step": 268
},
{
"epoch": 0.17,
"grad_norm": 0.04150390625,
"learning_rate": 1.8925794849253462e-05,
"loss": 0.858,
"step": 269
},
{
"epoch": 0.18,
"grad_norm": 0.044921875,
"learning_rate": 1.8916274620523482e-05,
"loss": 0.8502,
"step": 270
},
{
"epoch": 0.18,
"grad_norm": 0.04150390625,
"learning_rate": 1.8906714807059218e-05,
"loss": 0.8438,
"step": 271
},
{
"epoch": 0.18,
"grad_norm": 0.0390625,
"learning_rate": 1.889711545130246e-05,
"loss": 0.8464,
"step": 272
},
{
"epoch": 0.18,
"grad_norm": 0.05712890625,
"learning_rate": 1.8887476595870558e-05,
"loss": 0.8227,
"step": 273
},
{
"epoch": 0.18,
"grad_norm": 0.036865234375,
"learning_rate": 1.887779828355621e-05,
"loss": 0.8546,
"step": 274
},
{
"epoch": 0.18,
"grad_norm": 0.037109375,
"learning_rate": 1.8868080557327305e-05,
"loss": 0.8932,
"step": 275
},
{
"epoch": 0.18,
"grad_norm": 0.041748046875,
"learning_rate": 1.8858323460326704e-05,
"loss": 0.889,
"step": 276
},
{
"epoch": 0.18,
"grad_norm": 0.039306640625,
"learning_rate": 1.8848527035872057e-05,
"loss": 0.8174,
"step": 277
},
{
"epoch": 0.18,
"grad_norm": 0.038818359375,
"learning_rate": 1.883869132745561e-05,
"loss": 0.8183,
"step": 278
},
{
"epoch": 0.18,
"grad_norm": 0.0390625,
"learning_rate": 1.8828816378744035e-05,
"loss": 0.8924,
"step": 279
},
{
"epoch": 0.18,
"grad_norm": 0.038330078125,
"learning_rate": 1.8818902233578188e-05,
"loss": 0.7906,
"step": 280
},
{
"epoch": 0.18,
"grad_norm": 0.04248046875,
"learning_rate": 1.8808948935972965e-05,
"loss": 0.8118,
"step": 281
},
{
"epoch": 0.18,
"grad_norm": 0.03857421875,
"learning_rate": 1.8798956530117058e-05,
"loss": 0.8512,
"step": 282
},
{
"epoch": 0.18,
"grad_norm": 0.044677734375,
"learning_rate": 1.8788925060372806e-05,
"loss": 0.8224,
"step": 283
},
{
"epoch": 0.18,
"grad_norm": 0.047607421875,
"learning_rate": 1.8778854571275972e-05,
"loss": 0.8207,
"step": 284
},
{
"epoch": 0.19,
"grad_norm": 0.038330078125,
"learning_rate": 1.876874510753554e-05,
"loss": 0.8011,
"step": 285
},
{
"epoch": 0.19,
"grad_norm": 0.04296875,
"learning_rate": 1.875859671403354e-05,
"loss": 0.8132,
"step": 286
},
{
"epoch": 0.19,
"grad_norm": 0.042236328125,
"learning_rate": 1.874840943582482e-05,
"loss": 0.9056,
"step": 287
},
{
"epoch": 0.19,
"grad_norm": 0.0400390625,
"learning_rate": 1.8738183318136867e-05,
"loss": 0.8353,
"step": 288
},
{
"epoch": 0.19,
"grad_norm": 0.040771484375,
"learning_rate": 1.872791840636961e-05,
"loss": 0.7943,
"step": 289
},
{
"epoch": 0.19,
"grad_norm": 0.045654296875,
"learning_rate": 1.871761474609519e-05,
"loss": 0.8207,
"step": 290
},
{
"epoch": 0.19,
"grad_norm": 0.04345703125,
"learning_rate": 1.8707272383057785e-05,
"loss": 0.8415,
"step": 291
},
{
"epoch": 0.19,
"grad_norm": 0.04052734375,
"learning_rate": 1.8696891363173405e-05,
"loss": 0.797,
"step": 292
},
{
"epoch": 0.19,
"grad_norm": 0.046142578125,
"learning_rate": 1.8686471732529667e-05,
"loss": 0.8248,
"step": 293
},
{
"epoch": 0.19,
"grad_norm": 0.041259765625,
"learning_rate": 1.8676013537385614e-05,
"loss": 0.76,
"step": 294
},
{
"epoch": 0.19,
"grad_norm": 0.04150390625,
"learning_rate": 1.8665516824171497e-05,
"loss": 0.8362,
"step": 295
},
{
"epoch": 0.19,
"grad_norm": 0.040771484375,
"learning_rate": 1.865498163948858e-05,
"loss": 0.8093,
"step": 296
},
{
"epoch": 0.19,
"grad_norm": 0.0380859375,
"learning_rate": 1.864440803010891e-05,
"loss": 0.7735,
"step": 297
},
{
"epoch": 0.19,
"grad_norm": 0.041015625,
"learning_rate": 1.863379604297513e-05,
"loss": 0.8824,
"step": 298
},
{
"epoch": 0.19,
"grad_norm": 0.039794921875,
"learning_rate": 1.862314572520028e-05,
"loss": 0.8157,
"step": 299
},
{
"epoch": 0.2,
"grad_norm": 0.03857421875,
"learning_rate": 1.861245712406755e-05,
"loss": 0.8084,
"step": 300
},
{
"epoch": 0.2,
"grad_norm": 0.049072265625,
"learning_rate": 1.86017302870301e-05,
"loss": 0.7976,
"step": 301
},
{
"epoch": 0.2,
"grad_norm": 0.041259765625,
"learning_rate": 1.8590965261710856e-05,
"loss": 0.8406,
"step": 302
},
{
"epoch": 0.2,
"grad_norm": 0.041015625,
"learning_rate": 1.858016209590227e-05,
"loss": 0.8145,
"step": 303
},
{
"epoch": 0.2,
"grad_norm": 0.038818359375,
"learning_rate": 1.8569320837566128e-05,
"loss": 0.8142,
"step": 304
},
{
"epoch": 0.2,
"grad_norm": 0.04052734375,
"learning_rate": 1.8558441534833327e-05,
"loss": 0.8894,
"step": 305
},
{
"epoch": 0.2,
"grad_norm": 0.04296875,
"learning_rate": 1.8547524236003675e-05,
"loss": 0.8793,
"step": 306
},
{
"epoch": 0.2,
"grad_norm": 0.0380859375,
"learning_rate": 1.8536568989545662e-05,
"loss": 0.868,
"step": 307
},
{
"epoch": 0.2,
"grad_norm": 0.0419921875,
"learning_rate": 1.8525575844096243e-05,
"loss": 0.8572,
"step": 308
},
{
"epoch": 0.2,
"grad_norm": 0.04931640625,
"learning_rate": 1.8514544848460653e-05,
"loss": 0.7933,
"step": 309
},
{
"epoch": 0.2,
"grad_norm": 0.0390625,
"learning_rate": 1.8503476051612138e-05,
"loss": 0.8017,
"step": 310
},
{
"epoch": 0.2,
"grad_norm": 0.042724609375,
"learning_rate": 1.8492369502691785e-05,
"loss": 0.8317,
"step": 311
},
{
"epoch": 0.2,
"grad_norm": 0.04052734375,
"learning_rate": 1.8481225251008284e-05,
"loss": 0.8201,
"step": 312
},
{
"epoch": 0.2,
"grad_norm": 0.041748046875,
"learning_rate": 1.8470043346037698e-05,
"loss": 0.8258,
"step": 313
},
{
"epoch": 0.2,
"grad_norm": 0.0419921875,
"learning_rate": 1.8458823837423274e-05,
"loss": 0.8402,
"step": 314
},
{
"epoch": 0.2,
"grad_norm": 0.044921875,
"learning_rate": 1.8447566774975187e-05,
"loss": 0.9293,
"step": 315
},
{
"epoch": 0.21,
"grad_norm": 0.048583984375,
"learning_rate": 1.8436272208670346e-05,
"loss": 0.8716,
"step": 316
},
{
"epoch": 0.21,
"grad_norm": 0.0458984375,
"learning_rate": 1.842494018865216e-05,
"loss": 0.8868,
"step": 317
},
{
"epoch": 0.21,
"grad_norm": 0.047607421875,
"learning_rate": 1.841357076523032e-05,
"loss": 0.9027,
"step": 318
},
{
"epoch": 0.21,
"grad_norm": 0.044189453125,
"learning_rate": 1.840216398888057e-05,
"loss": 0.7936,
"step": 319
},
{
"epoch": 0.21,
"grad_norm": 0.0458984375,
"learning_rate": 1.8390719910244487e-05,
"loss": 0.8498,
"step": 320
},
{
"epoch": 0.21,
"grad_norm": 0.044677734375,
"learning_rate": 1.8379238580129256e-05,
"loss": 0.798,
"step": 321
},
{
"epoch": 0.21,
"grad_norm": 0.044677734375,
"learning_rate": 1.836772004950744e-05,
"loss": 0.8746,
"step": 322
},
{
"epoch": 0.21,
"grad_norm": 0.04541015625,
"learning_rate": 1.8356164369516772e-05,
"loss": 0.8658,
"step": 323
},
{
"epoch": 0.21,
"grad_norm": 0.0400390625,
"learning_rate": 1.834457159145989e-05,
"loss": 0.8299,
"step": 324
},
{
"epoch": 0.21,
"grad_norm": 0.0458984375,
"learning_rate": 1.8332941766804152e-05,
"loss": 0.8723,
"step": 325
},
{
"epoch": 0.21,
"grad_norm": 0.04150390625,
"learning_rate": 1.832127494718138e-05,
"loss": 0.8311,
"step": 326
},
{
"epoch": 0.21,
"grad_norm": 0.0439453125,
"learning_rate": 1.830957118438764e-05,
"loss": 0.8159,
"step": 327
},
{
"epoch": 0.21,
"grad_norm": 0.044921875,
"learning_rate": 1.829783053038301e-05,
"loss": 0.8351,
"step": 328
},
{
"epoch": 0.21,
"grad_norm": 0.0419921875,
"learning_rate": 1.8286053037291356e-05,
"loss": 0.7679,
"step": 329
},
{
"epoch": 0.21,
"grad_norm": 0.04345703125,
"learning_rate": 1.8274238757400096e-05,
"loss": 0.7848,
"step": 330
},
{
"epoch": 0.22,
"grad_norm": 0.04150390625,
"learning_rate": 1.826238774315995e-05,
"loss": 0.8741,
"step": 331
},
{
"epoch": 0.22,
"grad_norm": 0.042236328125,
"learning_rate": 1.8250500047184744e-05,
"loss": 0.8517,
"step": 332
},
{
"epoch": 0.22,
"grad_norm": 0.046875,
"learning_rate": 1.8238575722251144e-05,
"loss": 0.8602,
"step": 333
},
{
"epoch": 0.22,
"grad_norm": 0.041259765625,
"learning_rate": 1.8226614821298444e-05,
"loss": 0.8087,
"step": 334
},
{
"epoch": 0.22,
"grad_norm": 0.042724609375,
"learning_rate": 1.821461739742831e-05,
"loss": 0.8301,
"step": 335
},
{
"epoch": 0.22,
"grad_norm": 0.046875,
"learning_rate": 1.820258350390456e-05,
"loss": 0.8342,
"step": 336
},
{
"epoch": 0.22,
"grad_norm": 0.043701171875,
"learning_rate": 1.819051319415293e-05,
"loss": 0.8249,
"step": 337
},
{
"epoch": 0.22,
"grad_norm": 0.041748046875,
"learning_rate": 1.817840652176082e-05,
"loss": 0.7909,
"step": 338
},
{
"epoch": 0.22,
"grad_norm": 0.04248046875,
"learning_rate": 1.8166263540477068e-05,
"loss": 0.8071,
"step": 339
},
{
"epoch": 0.22,
"grad_norm": 0.043212890625,
"learning_rate": 1.815408430421171e-05,
"loss": 0.7983,
"step": 340
},
{
"epoch": 0.22,
"grad_norm": 0.041748046875,
"learning_rate": 1.8141868867035745e-05,
"loss": 0.7877,
"step": 341
},
{
"epoch": 0.22,
"grad_norm": 0.04443359375,
"learning_rate": 1.8129617283180878e-05,
"loss": 0.9056,
"step": 342
},
{
"epoch": 0.22,
"grad_norm": 0.043212890625,
"learning_rate": 1.81173296070393e-05,
"loss": 0.8708,
"step": 343
},
{
"epoch": 0.22,
"grad_norm": 0.04541015625,
"learning_rate": 1.8105005893163436e-05,
"loss": 0.8387,
"step": 344
},
{
"epoch": 0.22,
"grad_norm": 0.042724609375,
"learning_rate": 1.8092646196265705e-05,
"loss": 0.8578,
"step": 345
},
{
"epoch": 0.22,
"grad_norm": 0.042236328125,
"learning_rate": 1.808025057121827e-05,
"loss": 0.8642,
"step": 346
},
{
"epoch": 0.23,
"grad_norm": 0.048095703125,
"learning_rate": 1.8067819073052813e-05,
"loss": 0.8058,
"step": 347
},
{
"epoch": 0.23,
"grad_norm": 0.041259765625,
"learning_rate": 1.8055351756960262e-05,
"loss": 0.8128,
"step": 348
},
{
"epoch": 0.23,
"grad_norm": 0.04296875,
"learning_rate": 1.804284867829058e-05,
"loss": 0.8387,
"step": 349
},
{
"epoch": 0.23,
"grad_norm": 0.0458984375,
"learning_rate": 1.8030309892552488e-05,
"loss": 0.9106,
"step": 350
},
{
"epoch": 0.23,
"grad_norm": 0.041259765625,
"learning_rate": 1.801773545541324e-05,
"loss": 0.752,
"step": 351
},
{
"epoch": 0.23,
"grad_norm": 0.044189453125,
"learning_rate": 1.800512542269836e-05,
"loss": 0.881,
"step": 352
},
{
"epoch": 0.23,
"grad_norm": 0.045166015625,
"learning_rate": 1.7992479850391416e-05,
"loss": 0.8004,
"step": 353
},
{
"epoch": 0.23,
"grad_norm": 0.046875,
"learning_rate": 1.797979879463375e-05,
"loss": 0.8075,
"step": 354
},
{
"epoch": 0.23,
"grad_norm": 0.044677734375,
"learning_rate": 1.796708231172423e-05,
"loss": 0.8315,
"step": 355
},
{
"epoch": 0.23,
"grad_norm": 0.041259765625,
"learning_rate": 1.795433045811901e-05,
"loss": 0.8506,
"step": 356
},
{
"epoch": 0.23,
"grad_norm": 0.051025390625,
"learning_rate": 1.7941543290431286e-05,
"loss": 0.8314,
"step": 357
},
{
"epoch": 0.23,
"grad_norm": 0.045166015625,
"learning_rate": 1.792872086543103e-05,
"loss": 0.7697,
"step": 358
},
{
"epoch": 0.23,
"grad_norm": 0.04541015625,
"learning_rate": 1.7915863240044727e-05,
"loss": 0.9001,
"step": 359
},
{
"epoch": 0.23,
"grad_norm": 0.04443359375,
"learning_rate": 1.7902970471355162e-05,
"loss": 0.7685,
"step": 360
},
{
"epoch": 0.23,
"grad_norm": 0.0458984375,
"learning_rate": 1.7890042616601125e-05,
"loss": 0.8105,
"step": 361
},
{
"epoch": 0.24,
"grad_norm": 0.044677734375,
"learning_rate": 1.7877079733177185e-05,
"loss": 0.9061,
"step": 362
},
{
"epoch": 0.24,
"grad_norm": 0.043212890625,
"learning_rate": 1.7864081878633414e-05,
"loss": 0.813,
"step": 363
},
{
"epoch": 0.24,
"grad_norm": 0.04345703125,
"learning_rate": 1.785104911067515e-05,
"loss": 0.8197,
"step": 364
},
{
"epoch": 0.24,
"grad_norm": 0.044189453125,
"learning_rate": 1.783798148716273e-05,
"loss": 0.894,
"step": 365
},
{
"epoch": 0.24,
"grad_norm": 0.04296875,
"learning_rate": 1.782487906611124e-05,
"loss": 0.7809,
"step": 366
},
{
"epoch": 0.24,
"grad_norm": 0.04296875,
"learning_rate": 1.781174190569024e-05,
"loss": 0.8428,
"step": 367
},
{
"epoch": 0.24,
"grad_norm": 0.049072265625,
"learning_rate": 1.7798570064223536e-05,
"loss": 0.8276,
"step": 368
},
{
"epoch": 0.24,
"grad_norm": 0.04541015625,
"learning_rate": 1.7785363600188894e-05,
"loss": 0.7937,
"step": 369
},
{
"epoch": 0.24,
"grad_norm": 0.03955078125,
"learning_rate": 1.7772122572217796e-05,
"loss": 0.7835,
"step": 370
},
{
"epoch": 0.24,
"grad_norm": 0.04833984375,
"learning_rate": 1.7758847039095167e-05,
"loss": 0.8456,
"step": 371
},
{
"epoch": 0.24,
"grad_norm": 0.0439453125,
"learning_rate": 1.774553705975913e-05,
"loss": 0.8483,
"step": 372
},
{
"epoch": 0.24,
"grad_norm": 0.041259765625,
"learning_rate": 1.773219269330073e-05,
"loss": 0.7902,
"step": 373
},
{
"epoch": 0.24,
"grad_norm": 0.04296875,
"learning_rate": 1.7718813998963678e-05,
"loss": 0.8734,
"step": 374
},
{
"epoch": 0.24,
"grad_norm": 0.044921875,
"learning_rate": 1.7705401036144086e-05,
"loss": 0.8646,
"step": 375
},
{
"epoch": 0.24,
"grad_norm": 0.04345703125,
"learning_rate": 1.7691953864390208e-05,
"loss": 0.8005,
"step": 376
},
{
"epoch": 0.25,
"grad_norm": 0.0419921875,
"learning_rate": 1.7678472543402166e-05,
"loss": 0.8701,
"step": 377
},
{
"epoch": 0.25,
"grad_norm": 0.045166015625,
"learning_rate": 1.7664957133031705e-05,
"loss": 0.8099,
"step": 378
},
{
"epoch": 0.25,
"grad_norm": 0.057373046875,
"learning_rate": 1.7651407693281896e-05,
"loss": 0.8524,
"step": 379
},
{
"epoch": 0.25,
"grad_norm": 0.05224609375,
"learning_rate": 1.7637824284306898e-05,
"loss": 0.8456,
"step": 380
},
{
"epoch": 0.25,
"grad_norm": 0.05078125,
"learning_rate": 1.762420696641167e-05,
"loss": 0.7977,
"step": 381
},
{
"epoch": 0.25,
"grad_norm": 0.044189453125,
"learning_rate": 1.7610555800051727e-05,
"loss": 0.7834,
"step": 382
},
{
"epoch": 0.25,
"grad_norm": 0.045166015625,
"learning_rate": 1.759687084583285e-05,
"loss": 0.7946,
"step": 383
},
{
"epoch": 0.25,
"grad_norm": 0.04443359375,
"learning_rate": 1.7583152164510827e-05,
"loss": 0.7456,
"step": 384
},
{
"epoch": 0.25,
"grad_norm": 0.044677734375,
"learning_rate": 1.7569399816991174e-05,
"loss": 0.8358,
"step": 385
},
{
"epoch": 0.25,
"grad_norm": 0.047119140625,
"learning_rate": 1.7555613864328876e-05,
"loss": 0.7976,
"step": 386
},
{
"epoch": 0.25,
"grad_norm": 0.09423828125,
"learning_rate": 1.754179436772812e-05,
"loss": 0.9486,
"step": 387
},
{
"epoch": 0.25,
"grad_norm": 0.046142578125,
"learning_rate": 1.7527941388542006e-05,
"loss": 0.7898,
"step": 388
},
{
"epoch": 0.25,
"grad_norm": 0.04150390625,
"learning_rate": 1.751405498827228e-05,
"loss": 0.7644,
"step": 389
},
{
"epoch": 0.25,
"grad_norm": 0.04931640625,
"learning_rate": 1.7500135228569067e-05,
"loss": 0.8363,
"step": 390
},
{
"epoch": 0.25,
"grad_norm": 0.047119140625,
"learning_rate": 1.748618217123061e-05,
"loss": 0.801,
"step": 391
},
{
"epoch": 0.25,
"grad_norm": 0.044677734375,
"learning_rate": 1.7472195878202955e-05,
"loss": 0.8487,
"step": 392
},
{
"epoch": 0.26,
"grad_norm": 0.046142578125,
"learning_rate": 1.7458176411579715e-05,
"loss": 0.8884,
"step": 393
},
{
"epoch": 0.26,
"grad_norm": 0.0439453125,
"learning_rate": 1.7444123833601784e-05,
"loss": 0.8484,
"step": 394
},
{
"epoch": 0.26,
"grad_norm": 0.043701171875,
"learning_rate": 1.743003820665705e-05,
"loss": 0.8325,
"step": 395
},
{
"epoch": 0.26,
"grad_norm": 0.048095703125,
"learning_rate": 1.741591959328013e-05,
"loss": 0.8061,
"step": 396
},
{
"epoch": 0.26,
"grad_norm": 0.047607421875,
"learning_rate": 1.7401768056152083e-05,
"loss": 0.7888,
"step": 397
},
{
"epoch": 0.26,
"grad_norm": 0.047119140625,
"learning_rate": 1.7387583658100144e-05,
"loss": 0.8564,
"step": 398
},
{
"epoch": 0.26,
"grad_norm": 0.044677734375,
"learning_rate": 1.737336646209742e-05,
"loss": 0.8412,
"step": 399
},
{
"epoch": 0.26,
"grad_norm": 0.0439453125,
"learning_rate": 1.7359116531262654e-05,
"loss": 0.9182,
"step": 400
},
{
"epoch": 0.26,
"grad_norm": 0.047119140625,
"learning_rate": 1.73448339288599e-05,
"loss": 0.8653,
"step": 401
},
{
"epoch": 0.26,
"grad_norm": 0.0478515625,
"learning_rate": 1.7330518718298263e-05,
"loss": 0.8174,
"step": 402
},
{
"epoch": 0.26,
"grad_norm": 0.05859375,
"learning_rate": 1.7316170963131627e-05,
"loss": 0.8621,
"step": 403
},
{
"epoch": 0.26,
"grad_norm": 0.047119140625,
"learning_rate": 1.7301790727058344e-05,
"loss": 0.7991,
"step": 404
},
{
"epoch": 0.26,
"grad_norm": 0.046142578125,
"learning_rate": 1.728737807392098e-05,
"loss": 0.8706,
"step": 405
},
{
"epoch": 0.26,
"grad_norm": 0.046142578125,
"learning_rate": 1.727293306770602e-05,
"loss": 0.824,
"step": 406
},
{
"epoch": 0.26,
"grad_norm": 0.0498046875,
"learning_rate": 1.7258455772543573e-05,
"loss": 0.9865,
"step": 407
},
{
"epoch": 0.27,
"grad_norm": 0.0537109375,
"learning_rate": 1.7243946252707115e-05,
"loss": 0.844,
"step": 408
},
{
"epoch": 0.27,
"grad_norm": 0.04052734375,
"learning_rate": 1.7229404572613174e-05,
"loss": 0.7566,
"step": 409
},
{
"epoch": 0.27,
"grad_norm": 0.0439453125,
"learning_rate": 1.721483079682106e-05,
"loss": 0.8393,
"step": 410
},
{
"epoch": 0.27,
"grad_norm": 0.0458984375,
"learning_rate": 1.7200224990032577e-05,
"loss": 0.7992,
"step": 411
},
{
"epoch": 0.27,
"grad_norm": 0.04443359375,
"learning_rate": 1.7185587217091727e-05,
"loss": 0.8862,
"step": 412
},
{
"epoch": 0.27,
"grad_norm": 0.046630859375,
"learning_rate": 1.7170917542984445e-05,
"loss": 0.8859,
"step": 413
},
{
"epoch": 0.27,
"grad_norm": 0.050537109375,
"learning_rate": 1.7156216032838275e-05,
"loss": 0.8738,
"step": 414
},
{
"epoch": 0.27,
"grad_norm": 0.047119140625,
"learning_rate": 1.7141482751922117e-05,
"loss": 0.8702,
"step": 415
},
{
"epoch": 0.27,
"grad_norm": 0.044677734375,
"learning_rate": 1.7126717765645908e-05,
"loss": 0.8496,
"step": 416
},
{
"epoch": 0.27,
"grad_norm": 0.047607421875,
"learning_rate": 1.7111921139560356e-05,
"loss": 0.8402,
"step": 417
},
{
"epoch": 0.27,
"grad_norm": 0.044677734375,
"learning_rate": 1.7097092939356622e-05,
"loss": 0.8719,
"step": 418
},
{
"epoch": 0.27,
"grad_norm": 0.05126953125,
"learning_rate": 1.7082233230866064e-05,
"loss": 0.865,
"step": 419
},
{
"epoch": 0.27,
"grad_norm": 0.0517578125,
"learning_rate": 1.7067342080059904e-05,
"loss": 0.8876,
"step": 420
},
{
"epoch": 0.27,
"grad_norm": 0.0478515625,
"learning_rate": 1.7052419553048965e-05,
"loss": 0.8594,
"step": 421
},
{
"epoch": 0.27,
"grad_norm": 0.041015625,
"learning_rate": 1.703746571608337e-05,
"loss": 0.7774,
"step": 422
},
{
"epoch": 0.28,
"grad_norm": 0.04638671875,
"learning_rate": 1.7022480635552243e-05,
"loss": 0.8357,
"step": 423
},
{
"epoch": 0.28,
"grad_norm": 0.044189453125,
"learning_rate": 1.700746437798342e-05,
"loss": 0.8365,
"step": 424
},
{
"epoch": 0.28,
"grad_norm": 0.0517578125,
"learning_rate": 1.6992417010043144e-05,
"loss": 0.7916,
"step": 425
},
{
"epoch": 0.28,
"grad_norm": 0.0439453125,
"learning_rate": 1.6977338598535776e-05,
"loss": 0.886,
"step": 426
},
{
"epoch": 0.28,
"grad_norm": 0.04638671875,
"learning_rate": 1.696222921040351e-05,
"loss": 0.8391,
"step": 427
},
{
"epoch": 0.28,
"grad_norm": 0.045166015625,
"learning_rate": 1.6947088912726054e-05,
"loss": 0.8403,
"step": 428
},
{
"epoch": 0.28,
"grad_norm": 0.046875,
"learning_rate": 1.693191777272034e-05,
"loss": 0.8048,
"step": 429
},
{
"epoch": 0.28,
"grad_norm": 0.049072265625,
"learning_rate": 1.6916715857740234e-05,
"loss": 0.7742,
"step": 430
},
{
"epoch": 0.28,
"grad_norm": 0.045166015625,
"learning_rate": 1.690148323527623e-05,
"loss": 0.7859,
"step": 431
},
{
"epoch": 0.28,
"grad_norm": 0.045166015625,
"learning_rate": 1.688621997295515e-05,
"loss": 0.7956,
"step": 432
},
{
"epoch": 0.28,
"grad_norm": 0.04443359375,
"learning_rate": 1.6870926138539837e-05,
"loss": 0.8672,
"step": 433
},
{
"epoch": 0.28,
"grad_norm": 0.060546875,
"learning_rate": 1.6855601799928877e-05,
"loss": 0.848,
"step": 434
},
{
"epoch": 0.28,
"grad_norm": 0.046875,
"learning_rate": 1.6840247025156272e-05,
"loss": 0.8125,
"step": 435
},
{
"epoch": 0.28,
"grad_norm": 0.052001953125,
"learning_rate": 1.6824861882391154e-05,
"loss": 0.8359,
"step": 436
},
{
"epoch": 0.28,
"grad_norm": 0.048583984375,
"learning_rate": 1.6809446439937472e-05,
"loss": 0.877,
"step": 437
},
{
"epoch": 0.28,
"grad_norm": 0.04833984375,
"learning_rate": 1.6794000766233697e-05,
"loss": 0.8408,
"step": 438
},
{
"epoch": 0.29,
"grad_norm": 0.04833984375,
"learning_rate": 1.6778524929852513e-05,
"loss": 0.8381,
"step": 439
},
{
"epoch": 0.29,
"grad_norm": 0.052490234375,
"learning_rate": 1.676301899950052e-05,
"loss": 0.782,
"step": 440
},
{
"epoch": 0.29,
"grad_norm": 0.04345703125,
"learning_rate": 1.674748304401791e-05,
"loss": 0.8621,
"step": 441
},
{
"epoch": 0.29,
"grad_norm": 0.04345703125,
"learning_rate": 1.673191713237819e-05,
"loss": 0.8012,
"step": 442
},
{
"epoch": 0.29,
"grad_norm": 0.05322265625,
"learning_rate": 1.671632133368785e-05,
"loss": 0.8245,
"step": 443
},
{
"epoch": 0.29,
"grad_norm": 0.0478515625,
"learning_rate": 1.670069571718607e-05,
"loss": 0.7882,
"step": 444
},
{
"epoch": 0.29,
"grad_norm": 0.045166015625,
"learning_rate": 1.6685040352244414e-05,
"loss": 0.8387,
"step": 445
},
{
"epoch": 0.29,
"grad_norm": 0.055419921875,
"learning_rate": 1.666935530836651e-05,
"loss": 0.7766,
"step": 446
},
{
"epoch": 0.29,
"grad_norm": 0.04931640625,
"learning_rate": 1.665364065518775e-05,
"loss": 0.8204,
"step": 447
},
{
"epoch": 0.29,
"grad_norm": 0.046630859375,
"learning_rate": 1.6637896462474986e-05,
"loss": 0.8133,
"step": 448
},
{
"epoch": 0.29,
"grad_norm": 0.048828125,
"learning_rate": 1.662212280012621e-05,
"loss": 0.85,
"step": 449
},
{
"epoch": 0.29,
"grad_norm": 0.045654296875,
"learning_rate": 1.660631973817024e-05,
"loss": 0.8247,
"step": 450
},
{
"epoch": 0.29,
"grad_norm": 0.048583984375,
"learning_rate": 1.6590487346766426e-05,
"loss": 0.8977,
"step": 451
},
{
"epoch": 0.29,
"grad_norm": 0.05712890625,
"learning_rate": 1.657462569620433e-05,
"loss": 0.8456,
"step": 452
},
{
"epoch": 0.29,
"grad_norm": 0.048583984375,
"learning_rate": 1.6558734856903406e-05,
"loss": 0.8369,
"step": 453
},
{
"epoch": 0.3,
"grad_norm": 0.04638671875,
"learning_rate": 1.6542814899412694e-05,
"loss": 0.8055,
"step": 454
},
{
"epoch": 0.3,
"grad_norm": 0.04443359375,
"learning_rate": 1.6526865894410526e-05,
"loss": 0.8358,
"step": 455
},
{
"epoch": 0.3,
"grad_norm": 0.046630859375,
"learning_rate": 1.651088791270416e-05,
"loss": 0.8094,
"step": 456
},
{
"epoch": 0.3,
"grad_norm": 0.04833984375,
"learning_rate": 1.6494881025229535e-05,
"loss": 0.8518,
"step": 457
},
{
"epoch": 0.3,
"grad_norm": 0.048583984375,
"learning_rate": 1.647884530305089e-05,
"loss": 0.9644,
"step": 458
},
{
"epoch": 0.3,
"grad_norm": 0.046630859375,
"learning_rate": 1.6462780817360502e-05,
"loss": 0.8415,
"step": 459
},
{
"epoch": 0.3,
"grad_norm": 0.050537109375,
"learning_rate": 1.644668763947833e-05,
"loss": 0.8764,
"step": 460
},
{
"epoch": 0.3,
"grad_norm": 0.044921875,
"learning_rate": 1.6430565840851723e-05,
"loss": 0.7737,
"step": 461
},
{
"epoch": 0.3,
"grad_norm": 0.047607421875,
"learning_rate": 1.641441549305509e-05,
"loss": 0.7559,
"step": 462
}
],
"logging_steps": 1,
"max_steps": 1538,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 462,
"total_flos": 1.2674443629129892e+18,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}