Airoboros-13b-SuperHOT-8k / trainer_state.json
Peeepy's picture
Upload 11 files
32dd952
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0275229357798166,
"global_step": 330,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01,
"learning_rate": 5.555555555555555e-07,
"loss": 0.6498,
"step": 1
},
{
"epoch": 0.02,
"learning_rate": 1.111111111111111e-06,
"loss": 0.6642,
"step": 2
},
{
"epoch": 0.03,
"learning_rate": 1.6666666666666667e-06,
"loss": 0.6479,
"step": 3
},
{
"epoch": 0.04,
"learning_rate": 2.222222222222222e-06,
"loss": 0.5813,
"step": 4
},
{
"epoch": 0.05,
"learning_rate": 2.7777777777777783e-06,
"loss": 0.6306,
"step": 5
},
{
"epoch": 0.06,
"learning_rate": 3.3333333333333333e-06,
"loss": 0.6881,
"step": 6
},
{
"epoch": 0.06,
"learning_rate": 3.88888888888889e-06,
"loss": 0.6066,
"step": 7
},
{
"epoch": 0.07,
"learning_rate": 4.444444444444444e-06,
"loss": 0.5688,
"step": 8
},
{
"epoch": 0.08,
"learning_rate": 5e-06,
"loss": 0.5619,
"step": 9
},
{
"epoch": 0.09,
"learning_rate": 5.555555555555557e-06,
"loss": 0.5144,
"step": 10
},
{
"epoch": 0.1,
"learning_rate": 6.111111111111112e-06,
"loss": 0.5744,
"step": 11
},
{
"epoch": 0.11,
"learning_rate": 6.666666666666667e-06,
"loss": 0.6051,
"step": 12
},
{
"epoch": 0.12,
"learning_rate": 7.222222222222223e-06,
"loss": 0.5917,
"step": 13
},
{
"epoch": 0.13,
"learning_rate": 7.77777777777778e-06,
"loss": 0.5239,
"step": 14
},
{
"epoch": 0.14,
"learning_rate": 8.333333333333334e-06,
"loss": 0.5253,
"step": 15
},
{
"epoch": 0.15,
"learning_rate": 8.888888888888888e-06,
"loss": 0.4143,
"step": 16
},
{
"epoch": 0.16,
"learning_rate": 9.444444444444445e-06,
"loss": 0.4822,
"step": 17
},
{
"epoch": 0.17,
"learning_rate": 1e-05,
"loss": 0.4627,
"step": 18
},
{
"epoch": 0.17,
"learning_rate": 9.999858783596665e-06,
"loss": 0.5389,
"step": 19
},
{
"epoch": 0.18,
"learning_rate": 9.999435142363484e-06,
"loss": 0.5544,
"step": 20
},
{
"epoch": 0.19,
"learning_rate": 9.998729100230497e-06,
"loss": 0.5013,
"step": 21
},
{
"epoch": 0.2,
"learning_rate": 9.997740697079595e-06,
"loss": 0.5035,
"step": 22
},
{
"epoch": 0.21,
"learning_rate": 9.99646998874227e-06,
"loss": 0.4924,
"step": 23
},
{
"epoch": 0.22,
"learning_rate": 9.994917046996472e-06,
"loss": 0.507,
"step": 24
},
{
"epoch": 0.23,
"learning_rate": 9.993081959562539e-06,
"loss": 0.4876,
"step": 25
},
{
"epoch": 0.24,
"learning_rate": 9.990964830098246e-06,
"loss": 0.555,
"step": 26
},
{
"epoch": 0.25,
"learning_rate": 9.98856577819296e-06,
"loss": 0.4692,
"step": 27
},
{
"epoch": 0.26,
"learning_rate": 9.985884939360873e-06,
"loss": 0.466,
"step": 28
},
{
"epoch": 0.27,
"learning_rate": 9.98292246503335e-06,
"loss": 0.4132,
"step": 29
},
{
"epoch": 0.28,
"learning_rate": 9.979678522550382e-06,
"loss": 0.4741,
"step": 30
},
{
"epoch": 0.28,
"learning_rate": 9.976153295151123e-06,
"loss": 0.5424,
"step": 31
},
{
"epoch": 0.29,
"learning_rate": 9.972346981963546e-06,
"loss": 0.4949,
"step": 32
},
{
"epoch": 0.3,
"learning_rate": 9.968259797993197e-06,
"loss": 0.5126,
"step": 33
},
{
"epoch": 0.31,
"learning_rate": 9.963891974111042e-06,
"loss": 0.4202,
"step": 34
},
{
"epoch": 0.32,
"learning_rate": 9.959243757040434e-06,
"loss": 0.4736,
"step": 35
},
{
"epoch": 0.33,
"learning_rate": 9.95431540934317e-06,
"loss": 0.3851,
"step": 36
},
{
"epoch": 0.34,
"learning_rate": 9.949107209404664e-06,
"loss": 0.5542,
"step": 37
},
{
"epoch": 0.35,
"learning_rate": 9.943619451418225e-06,
"loss": 0.5095,
"step": 38
},
{
"epoch": 0.36,
"learning_rate": 9.937852445368427e-06,
"loss": 0.4338,
"step": 39
},
{
"epoch": 0.37,
"learning_rate": 9.931806517013612e-06,
"loss": 0.45,
"step": 40
},
{
"epoch": 0.38,
"learning_rate": 9.925482007867485e-06,
"loss": 0.4735,
"step": 41
},
{
"epoch": 0.39,
"learning_rate": 9.918879275179819e-06,
"loss": 0.4735,
"step": 42
},
{
"epoch": 0.39,
"learning_rate": 9.911998691916275e-06,
"loss": 0.5041,
"step": 43
},
{
"epoch": 0.4,
"learning_rate": 9.904840646737346e-06,
"loss": 0.4432,
"step": 44
},
{
"epoch": 0.41,
"learning_rate": 9.89740554397639e-06,
"loss": 0.4457,
"step": 45
},
{
"epoch": 0.42,
"learning_rate": 9.889693803616793e-06,
"loss": 0.4692,
"step": 46
},
{
"epoch": 0.43,
"learning_rate": 9.881705861268252e-06,
"loss": 0.4469,
"step": 47
},
{
"epoch": 0.44,
"learning_rate": 9.873442168142158e-06,
"loss": 0.4389,
"step": 48
},
{
"epoch": 0.45,
"learning_rate": 9.864903191026125e-06,
"loss": 0.4195,
"step": 49
},
{
"epoch": 0.46,
"learning_rate": 9.856089412257605e-06,
"loss": 0.4635,
"step": 50
},
{
"epoch": 0.47,
"learning_rate": 9.847001329696653e-06,
"loss": 0.4944,
"step": 51
},
{
"epoch": 0.48,
"learning_rate": 9.837639456697802e-06,
"loss": 0.4917,
"step": 52
},
{
"epoch": 0.49,
"learning_rate": 9.828004322081067e-06,
"loss": 0.4197,
"step": 53
},
{
"epoch": 0.5,
"learning_rate": 9.818096470102067e-06,
"loss": 0.4294,
"step": 54
},
{
"epoch": 0.5,
"learning_rate": 9.807916460421294e-06,
"loss": 0.4839,
"step": 55
},
{
"epoch": 0.51,
"learning_rate": 9.797464868072489e-06,
"loss": 0.4918,
"step": 56
},
{
"epoch": 0.52,
"learning_rate": 9.78674228343016e-06,
"loss": 0.5003,
"step": 57
},
{
"epoch": 0.53,
"learning_rate": 9.775749312176249e-06,
"loss": 0.4864,
"step": 58
},
{
"epoch": 0.54,
"learning_rate": 9.764486575265893e-06,
"loss": 0.4977,
"step": 59
},
{
"epoch": 0.55,
"learning_rate": 9.752954708892379e-06,
"loss": 0.4125,
"step": 60
},
{
"epoch": 0.56,
"learning_rate": 9.741154364451179e-06,
"loss": 0.4442,
"step": 61
},
{
"epoch": 0.57,
"learning_rate": 9.729086208503174e-06,
"loss": 0.4624,
"step": 62
},
{
"epoch": 0.58,
"learning_rate": 9.716750922736998e-06,
"loss": 0.3754,
"step": 63
},
{
"epoch": 0.59,
"learning_rate": 9.704149203930522e-06,
"loss": 0.4218,
"step": 64
},
{
"epoch": 0.6,
"learning_rate": 9.691281763911513e-06,
"loss": 0.4997,
"step": 65
},
{
"epoch": 0.61,
"learning_rate": 9.67814932951741e-06,
"loss": 0.3829,
"step": 66
},
{
"epoch": 0.61,
"learning_rate": 9.664752642554272e-06,
"loss": 0.3637,
"step": 67
},
{
"epoch": 0.62,
"learning_rate": 9.651092459754879e-06,
"loss": 0.5271,
"step": 68
},
{
"epoch": 0.63,
"learning_rate": 9.637169552735985e-06,
"loss": 0.4425,
"step": 69
},
{
"epoch": 0.64,
"learning_rate": 9.622984707954732e-06,
"loss": 0.4109,
"step": 70
},
{
"epoch": 0.65,
"learning_rate": 9.608538726664224e-06,
"loss": 0.3699,
"step": 71
},
{
"epoch": 0.66,
"learning_rate": 9.593832424868271e-06,
"loss": 0.4199,
"step": 72
},
{
"epoch": 0.67,
"learning_rate": 9.578866633275289e-06,
"loss": 0.4548,
"step": 73
},
{
"epoch": 0.68,
"learning_rate": 9.563642197251382e-06,
"loss": 0.4464,
"step": 74
},
{
"epoch": 0.69,
"learning_rate": 9.548159976772593e-06,
"loss": 0.4286,
"step": 75
},
{
"epoch": 0.7,
"learning_rate": 9.532420846376316e-06,
"loss": 0.397,
"step": 76
},
{
"epoch": 0.71,
"learning_rate": 9.516425695111906e-06,
"loss": 0.4522,
"step": 77
},
{
"epoch": 0.72,
"learning_rate": 9.500175426490455e-06,
"loss": 0.4763,
"step": 78
},
{
"epoch": 0.72,
"learning_rate": 9.48367095843376e-06,
"loss": 0.4386,
"step": 79
},
{
"epoch": 0.73,
"learning_rate": 9.466913223222467e-06,
"loss": 0.411,
"step": 80
},
{
"epoch": 0.74,
"learning_rate": 9.449903167443415e-06,
"loss": 0.3719,
"step": 81
},
{
"epoch": 0.75,
"learning_rate": 9.432641751936162e-06,
"loss": 0.3693,
"step": 82
},
{
"epoch": 0.76,
"learning_rate": 9.415129951738713e-06,
"loss": 0.3955,
"step": 83
},
{
"epoch": 0.77,
"learning_rate": 9.397368756032445e-06,
"loss": 0.4196,
"step": 84
},
{
"epoch": 0.78,
"learning_rate": 9.379359168086231e-06,
"loss": 0.4527,
"step": 85
},
{
"epoch": 0.79,
"learning_rate": 9.361102205199762e-06,
"loss": 0.4054,
"step": 86
},
{
"epoch": 0.8,
"learning_rate": 9.34259889864609e-06,
"loss": 0.4001,
"step": 87
},
{
"epoch": 0.81,
"learning_rate": 9.32385029361338e-06,
"loss": 0.4189,
"step": 88
},
{
"epoch": 0.82,
"learning_rate": 9.304857449145858e-06,
"loss": 0.439,
"step": 89
},
{
"epoch": 0.83,
"learning_rate": 9.285621438083997e-06,
"loss": 0.4221,
"step": 90
},
{
"epoch": 0.83,
"learning_rate": 9.26614334700392e-06,
"loss": 0.4208,
"step": 91
},
{
"epoch": 0.84,
"learning_rate": 9.246424276156008e-06,
"loss": 0.4707,
"step": 92
},
{
"epoch": 0.85,
"learning_rate": 9.226465339402768e-06,
"loss": 0.3372,
"step": 93
},
{
"epoch": 0.86,
"learning_rate": 9.206267664155906e-06,
"loss": 0.3637,
"step": 94
},
{
"epoch": 0.87,
"learning_rate": 9.185832391312644e-06,
"loss": 0.4146,
"step": 95
},
{
"epoch": 0.88,
"learning_rate": 9.165160675191272e-06,
"loss": 0.393,
"step": 96
},
{
"epoch": 0.89,
"learning_rate": 9.144253683465953e-06,
"loss": 0.4712,
"step": 97
},
{
"epoch": 0.9,
"learning_rate": 9.123112597100759e-06,
"loss": 0.3904,
"step": 98
},
{
"epoch": 0.91,
"learning_rate": 9.101738610282956e-06,
"loss": 0.4192,
"step": 99
},
{
"epoch": 0.92,
"learning_rate": 9.080132930355567e-06,
"loss": 0.4039,
"step": 100
},
{
"epoch": 0.93,
"learning_rate": 9.058296777749154e-06,
"loss": 0.4516,
"step": 101
},
{
"epoch": 0.94,
"learning_rate": 9.03623138591289e-06,
"loss": 0.3348,
"step": 102
},
{
"epoch": 0.94,
"learning_rate": 9.013938001244885e-06,
"loss": 0.4256,
"step": 103
},
{
"epoch": 0.95,
"learning_rate": 8.99141788302178e-06,
"loss": 0.3774,
"step": 104
},
{
"epoch": 0.96,
"learning_rate": 8.968672303327614e-06,
"loss": 0.4471,
"step": 105
},
{
"epoch": 0.97,
"learning_rate": 8.94570254698197e-06,
"loss": 0.3771,
"step": 106
},
{
"epoch": 0.98,
"learning_rate": 8.922509911467395e-06,
"loss": 0.4137,
"step": 107
},
{
"epoch": 0.99,
"learning_rate": 8.899095706856122e-06,
"loss": 0.4269,
"step": 108
},
{
"epoch": 1.0,
"learning_rate": 8.875461255736055e-06,
"loss": 0.4102,
"step": 109
},
{
"epoch": 1.01,
"learning_rate": 8.851607893136065e-06,
"loss": 0.2619,
"step": 110
},
{
"epoch": 1.02,
"learning_rate": 8.827536966450584e-06,
"loss": 0.2864,
"step": 111
},
{
"epoch": 1.03,
"learning_rate": 8.803249835363486e-06,
"loss": 0.2712,
"step": 112
},
{
"epoch": 1.04,
"learning_rate": 8.778747871771293e-06,
"loss": 0.2186,
"step": 113
},
{
"epoch": 1.05,
"learning_rate": 8.754032459705672e-06,
"loss": 0.2585,
"step": 114
},
{
"epoch": 1.06,
"learning_rate": 8.729104995255265e-06,
"loss": 0.2711,
"step": 115
},
{
"epoch": 1.06,
"learning_rate": 8.703966886486819e-06,
"loss": 0.2801,
"step": 116
},
{
"epoch": 1.07,
"learning_rate": 8.67861955336566e-06,
"loss": 0.2586,
"step": 117
},
{
"epoch": 1.08,
"learning_rate": 8.65306442767547e-06,
"loss": 0.3134,
"step": 118
},
{
"epoch": 1.09,
"learning_rate": 8.627302952937431e-06,
"loss": 0.2067,
"step": 119
},
{
"epoch": 1.1,
"learning_rate": 8.601336584328659e-06,
"loss": 0.2637,
"step": 120
},
{
"epoch": 1.11,
"learning_rate": 8.575166788600031e-06,
"loss": 0.2445,
"step": 121
},
{
"epoch": 1.12,
"learning_rate": 8.548795043993316e-06,
"loss": 0.2456,
"step": 122
},
{
"epoch": 1.13,
"learning_rate": 8.522222840157687e-06,
"loss": 0.2963,
"step": 123
},
{
"epoch": 1.14,
"learning_rate": 8.495451678065563e-06,
"loss": 0.2781,
"step": 124
},
{
"epoch": 1.15,
"learning_rate": 8.468483069927832e-06,
"loss": 0.2572,
"step": 125
},
{
"epoch": 1.16,
"learning_rate": 8.441318539108433e-06,
"loss": 0.2473,
"step": 126
},
{
"epoch": 1.17,
"learning_rate": 8.413959620038306e-06,
"loss": 0.1962,
"step": 127
},
{
"epoch": 1.17,
"learning_rate": 8.386407858128707e-06,
"loss": 0.2644,
"step": 128
},
{
"epoch": 1.18,
"learning_rate": 8.358664809683926e-06,
"loss": 0.2451,
"step": 129
},
{
"epoch": 1.19,
"learning_rate": 8.330732041813367e-06,
"loss": 0.2801,
"step": 130
},
{
"epoch": 1.2,
"learning_rate": 8.302611132343042e-06,
"loss": 0.2425,
"step": 131
},
{
"epoch": 1.21,
"learning_rate": 8.274303669726427e-06,
"loss": 0.2113,
"step": 132
},
{
"epoch": 1.22,
"learning_rate": 8.245811252954741e-06,
"loss": 0.2918,
"step": 133
},
{
"epoch": 1.23,
"learning_rate": 8.217135491466636e-06,
"loss": 0.2356,
"step": 134
},
{
"epoch": 1.24,
"learning_rate": 8.18827800505727e-06,
"loss": 0.2809,
"step": 135
},
{
"epoch": 1.25,
"learning_rate": 8.15924042378682e-06,
"loss": 0.2559,
"step": 136
},
{
"epoch": 1.26,
"learning_rate": 8.130024387888402e-06,
"loss": 0.2446,
"step": 137
},
{
"epoch": 1.27,
"learning_rate": 8.100631547675417e-06,
"loss": 0.2314,
"step": 138
},
{
"epoch": 1.28,
"learning_rate": 8.071063563448341e-06,
"loss": 0.2729,
"step": 139
},
{
"epoch": 1.28,
"learning_rate": 8.041322105400923e-06,
"loss": 0.2017,
"step": 140
},
{
"epoch": 1.29,
"learning_rate": 8.01140885352586e-06,
"loss": 0.2195,
"step": 141
},
{
"epoch": 1.3,
"learning_rate": 7.981325497519892e-06,
"loss": 0.2934,
"step": 142
},
{
"epoch": 1.31,
"learning_rate": 7.951073736688348e-06,
"loss": 0.2598,
"step": 143
},
{
"epoch": 1.32,
"learning_rate": 7.920655279849173e-06,
"loss": 0.2776,
"step": 144
},
{
"epoch": 1.33,
"learning_rate": 7.890071845236395e-06,
"loss": 0.2622,
"step": 145
},
{
"epoch": 1.34,
"learning_rate": 7.859325160403073e-06,
"loss": 0.2621,
"step": 146
},
{
"epoch": 1.35,
"learning_rate": 7.8284169621237e-06,
"loss": 0.2499,
"step": 147
},
{
"epoch": 1.36,
"learning_rate": 7.797348996296116e-06,
"loss": 0.28,
"step": 148
},
{
"epoch": 1.37,
"learning_rate": 7.766123017842877e-06,
"loss": 0.1906,
"step": 149
},
{
"epoch": 1.38,
"learning_rate": 7.734740790612137e-06,
"loss": 0.2373,
"step": 150
},
{
"epoch": 1.39,
"learning_rate": 7.703204087277989e-06,
"loss": 0.2,
"step": 151
},
{
"epoch": 1.39,
"learning_rate": 7.671514689240366e-06,
"loss": 0.2678,
"step": 152
},
{
"epoch": 1.4,
"learning_rate": 7.639674386524395e-06,
"loss": 0.2295,
"step": 153
},
{
"epoch": 1.41,
"learning_rate": 7.607684977679284e-06,
"loss": 0.2417,
"step": 154
},
{
"epoch": 1.42,
"learning_rate": 7.575548269676741e-06,
"loss": 0.2662,
"step": 155
},
{
"epoch": 1.43,
"learning_rate": 7.543266077808893e-06,
"loss": 0.2645,
"step": 156
},
{
"epoch": 1.44,
"learning_rate": 7.510840225585749e-06,
"loss": 0.2757,
"step": 157
},
{
"epoch": 1.45,
"learning_rate": 7.478272544632204e-06,
"loss": 0.2324,
"step": 158
},
{
"epoch": 1.46,
"learning_rate": 7.44556487458456e-06,
"loss": 0.2655,
"step": 159
},
{
"epoch": 1.47,
"learning_rate": 7.412719062986632e-06,
"loss": 0.2091,
"step": 160
},
{
"epoch": 1.48,
"learning_rate": 7.379736965185369e-06,
"loss": 0.2293,
"step": 161
},
{
"epoch": 1.49,
"learning_rate": 7.3466204442260605e-06,
"loss": 0.246,
"step": 162
},
{
"epoch": 1.5,
"learning_rate": 7.313371370747104e-06,
"loss": 0.2362,
"step": 163
},
{
"epoch": 1.5,
"learning_rate": 7.279991622874319e-06,
"loss": 0.2074,
"step": 164
},
{
"epoch": 1.51,
"learning_rate": 7.24648308611489e-06,
"loss": 0.2035,
"step": 165
},
{
"epoch": 1.52,
"learning_rate": 7.212847653250828e-06,
"loss": 0.3013,
"step": 166
},
{
"epoch": 1.53,
"learning_rate": 7.1790872242320775e-06,
"loss": 0.2286,
"step": 167
},
{
"epoch": 1.54,
"learning_rate": 7.145203706069183e-06,
"loss": 0.2005,
"step": 168
},
{
"epoch": 1.55,
"learning_rate": 7.1111990127255684e-06,
"loss": 0.2351,
"step": 169
},
{
"epoch": 1.56,
"learning_rate": 7.0770750650094335e-06,
"loss": 0.2378,
"step": 170
},
{
"epoch": 1.57,
"learning_rate": 7.042833790465241e-06,
"loss": 0.2557,
"step": 171
},
{
"epoch": 1.58,
"learning_rate": 7.008477123264849e-06,
"loss": 0.1742,
"step": 172
},
{
"epoch": 1.59,
"learning_rate": 6.974007004098243e-06,
"loss": 0.2862,
"step": 173
},
{
"epoch": 1.6,
"learning_rate": 6.939425380063924e-06,
"loss": 0.2359,
"step": 174
},
{
"epoch": 1.61,
"learning_rate": 6.9047342045589224e-06,
"loss": 0.2646,
"step": 175
},
{
"epoch": 1.61,
"learning_rate": 6.869935437168449e-06,
"loss": 0.2223,
"step": 176
},
{
"epoch": 1.62,
"learning_rate": 6.835031043555211e-06,
"loss": 0.2292,
"step": 177
},
{
"epoch": 1.63,
"learning_rate": 6.800022995348381e-06,
"loss": 0.2395,
"step": 178
},
{
"epoch": 1.64,
"learning_rate": 6.76491327003222e-06,
"loss": 0.2201,
"step": 179
},
{
"epoch": 1.65,
"learning_rate": 6.729703850834381e-06,
"loss": 0.2071,
"step": 180
},
{
"epoch": 1.66,
"learning_rate": 6.694396726613883e-06,
"loss": 0.2461,
"step": 181
},
{
"epoch": 1.67,
"learning_rate": 6.65899389174876e-06,
"loss": 0.2804,
"step": 182
},
{
"epoch": 1.68,
"learning_rate": 6.6234973460234184e-06,
"loss": 0.2859,
"step": 183
},
{
"epoch": 1.69,
"learning_rate": 6.587909094515663e-06,
"loss": 0.2358,
"step": 184
},
{
"epoch": 1.7,
"learning_rate": 6.552231147483448e-06,
"loss": 0.229,
"step": 185
},
{
"epoch": 1.71,
"learning_rate": 6.5164655202513135e-06,
"loss": 0.2673,
"step": 186
},
{
"epoch": 1.72,
"learning_rate": 6.480614233096558e-06,
"loss": 0.2169,
"step": 187
},
{
"epoch": 1.72,
"learning_rate": 6.444679311135112e-06,
"loss": 0.1908,
"step": 188
},
{
"epoch": 1.73,
"learning_rate": 6.408662784207149e-06,
"loss": 0.2167,
"step": 189
},
{
"epoch": 1.74,
"learning_rate": 6.372566686762427e-06,
"loss": 0.2533,
"step": 190
},
{
"epoch": 1.75,
"learning_rate": 6.336393057745365e-06,
"loss": 0.2361,
"step": 191
},
{
"epoch": 1.76,
"learning_rate": 6.300143940479881e-06,
"loss": 0.2396,
"step": 192
},
{
"epoch": 1.77,
"learning_rate": 6.2638213825539595e-06,
"loss": 0.2716,
"step": 193
},
{
"epoch": 1.78,
"learning_rate": 6.227427435703997e-06,
"loss": 0.2187,
"step": 194
},
{
"epoch": 1.79,
"learning_rate": 6.190964155698903e-06,
"loss": 0.2419,
"step": 195
},
{
"epoch": 1.8,
"learning_rate": 6.154433602223979e-06,
"loss": 0.2547,
"step": 196
},
{
"epoch": 1.81,
"learning_rate": 6.117837838764579e-06,
"loss": 0.2534,
"step": 197
},
{
"epoch": 1.82,
"learning_rate": 6.0811789324895365e-06,
"loss": 0.255,
"step": 198
},
{
"epoch": 1.83,
"learning_rate": 6.044458954134411e-06,
"loss": 0.2383,
"step": 199
},
{
"epoch": 1.83,
"learning_rate": 6.0076799778845105e-06,
"loss": 0.2407,
"step": 200
},
{
"epoch": 1.84,
"learning_rate": 5.970844081257734e-06,
"loss": 0.1883,
"step": 201
},
{
"epoch": 1.85,
"learning_rate": 5.933953344987215e-06,
"loss": 0.3112,
"step": 202
},
{
"epoch": 1.86,
"learning_rate": 5.897009852903792e-06,
"loss": 0.2424,
"step": 203
},
{
"epoch": 1.87,
"learning_rate": 5.860015691818292e-06,
"loss": 0.2663,
"step": 204
},
{
"epoch": 1.88,
"learning_rate": 5.82297295140367e-06,
"loss": 0.2616,
"step": 205
},
{
"epoch": 1.89,
"learning_rate": 5.78588372407695e-06,
"loss": 0.2067,
"step": 206
},
{
"epoch": 1.9,
"learning_rate": 5.748750104881051e-06,
"loss": 0.2092,
"step": 207
},
{
"epoch": 1.91,
"learning_rate": 5.711574191366427e-06,
"loss": 0.2383,
"step": 208
},
{
"epoch": 1.92,
"learning_rate": 5.674358083472598e-06,
"loss": 0.2377,
"step": 209
},
{
"epoch": 1.93,
"learning_rate": 5.637103883409525e-06,
"loss": 0.2415,
"step": 210
},
{
"epoch": 1.94,
"learning_rate": 5.599813695538866e-06,
"loss": 0.2478,
"step": 211
},
{
"epoch": 1.94,
"learning_rate": 5.562489626255104e-06,
"loss": 0.2752,
"step": 212
},
{
"epoch": 1.95,
"learning_rate": 5.52513378386657e-06,
"loss": 0.2133,
"step": 213
},
{
"epoch": 1.96,
"learning_rate": 5.487748278476342e-06,
"loss": 0.2043,
"step": 214
},
{
"epoch": 1.97,
"learning_rate": 5.450335221863068e-06,
"loss": 0.2664,
"step": 215
},
{
"epoch": 1.98,
"learning_rate": 5.412896727361663e-06,
"loss": 0.2617,
"step": 216
},
{
"epoch": 1.99,
"learning_rate": 5.375434909743942e-06,
"loss": 0.2001,
"step": 217
},
{
"epoch": 2.0,
"learning_rate": 5.337951885099167e-06,
"loss": 0.2353,
"step": 218
},
{
"epoch": 2.01,
"learning_rate": 5.300449770714502e-06,
"loss": 0.112,
"step": 219
},
{
"epoch": 2.02,
"learning_rate": 5.262930684955439e-06,
"loss": 0.1022,
"step": 220
},
{
"epoch": 2.03,
"learning_rate": 5.225396747146112e-06,
"loss": 0.1263,
"step": 221
},
{
"epoch": 2.04,
"learning_rate": 5.187850077449604e-06,
"loss": 0.1059,
"step": 222
},
{
"epoch": 2.05,
"learning_rate": 5.150292796748174e-06,
"loss": 0.1101,
"step": 223
},
{
"epoch": 2.06,
"learning_rate": 5.112727026523461e-06,
"loss": 0.1274,
"step": 224
},
{
"epoch": 2.06,
"learning_rate": 5.075154888736653e-06,
"loss": 0.1604,
"step": 225
},
{
"epoch": 2.07,
"learning_rate": 5.03757850570861e-06,
"loss": 0.1067,
"step": 226
},
{
"epoch": 2.08,
"learning_rate": 5e-06,
"loss": 0.1124,
"step": 227
},
{
"epoch": 2.09,
"learning_rate": 4.9624214942913916e-06,
"loss": 0.0976,
"step": 228
},
{
"epoch": 2.1,
"learning_rate": 4.924845111263349e-06,
"loss": 0.1094,
"step": 229
},
{
"epoch": 2.11,
"learning_rate": 4.88727297347654e-06,
"loss": 0.1156,
"step": 230
},
{
"epoch": 2.12,
"learning_rate": 4.8497072032518274e-06,
"loss": 0.1201,
"step": 231
},
{
"epoch": 2.13,
"learning_rate": 4.8121499225503974e-06,
"loss": 0.1053,
"step": 232
},
{
"epoch": 2.14,
"learning_rate": 4.774603252853889e-06,
"loss": 0.1344,
"step": 233
},
{
"epoch": 2.15,
"learning_rate": 4.737069315044562e-06,
"loss": 0.1171,
"step": 234
},
{
"epoch": 2.16,
"learning_rate": 4.699550229285499e-06,
"loss": 0.0906,
"step": 235
},
{
"epoch": 2.17,
"learning_rate": 4.662048114900837e-06,
"loss": 0.1313,
"step": 236
},
{
"epoch": 2.17,
"learning_rate": 4.624565090256059e-06,
"loss": 0.1096,
"step": 237
},
{
"epoch": 2.18,
"learning_rate": 4.587103272638339e-06,
"loss": 0.1185,
"step": 238
},
{
"epoch": 2.19,
"learning_rate": 4.549664778136933e-06,
"loss": 0.1372,
"step": 239
},
{
"epoch": 2.2,
"learning_rate": 4.512251721523659e-06,
"loss": 0.1042,
"step": 240
},
{
"epoch": 2.21,
"learning_rate": 4.4748662161334335e-06,
"loss": 0.1185,
"step": 241
},
{
"epoch": 2.22,
"learning_rate": 4.437510373744897e-06,
"loss": 0.1448,
"step": 242
},
{
"epoch": 2.23,
"learning_rate": 4.400186304461136e-06,
"loss": 0.113,
"step": 243
},
{
"epoch": 2.24,
"learning_rate": 4.362896116590475e-06,
"loss": 0.1211,
"step": 244
},
{
"epoch": 2.25,
"learning_rate": 4.325641916527405e-06,
"loss": 0.132,
"step": 245
},
{
"epoch": 2.26,
"learning_rate": 4.2884258086335755e-06,
"loss": 0.1105,
"step": 246
},
{
"epoch": 2.27,
"learning_rate": 4.25124989511895e-06,
"loss": 0.0987,
"step": 247
},
{
"epoch": 2.28,
"learning_rate": 4.214116275923051e-06,
"loss": 0.1197,
"step": 248
},
{
"epoch": 2.28,
"learning_rate": 4.17702704859633e-06,
"loss": 0.1334,
"step": 249
},
{
"epoch": 2.29,
"learning_rate": 4.1399843081817085e-06,
"loss": 0.1468,
"step": 250
},
{
"epoch": 2.3,
"learning_rate": 4.1029901470962105e-06,
"loss": 0.1161,
"step": 251
},
{
"epoch": 2.31,
"learning_rate": 4.066046655012786e-06,
"loss": 0.1106,
"step": 252
},
{
"epoch": 2.32,
"learning_rate": 4.029155918742268e-06,
"loss": 0.1234,
"step": 253
},
{
"epoch": 2.33,
"learning_rate": 3.992320022115492e-06,
"loss": 0.1261,
"step": 254
},
{
"epoch": 2.34,
"learning_rate": 3.955541045865591e-06,
"loss": 0.1011,
"step": 255
},
{
"epoch": 2.35,
"learning_rate": 3.918821067510464e-06,
"loss": 0.1058,
"step": 256
},
{
"epoch": 2.36,
"learning_rate": 3.882162161235421e-06,
"loss": 0.1002,
"step": 257
},
{
"epoch": 2.37,
"learning_rate": 3.845566397776022e-06,
"loss": 0.1023,
"step": 258
},
{
"epoch": 2.38,
"learning_rate": 3.8090358443010993e-06,
"loss": 0.1174,
"step": 259
},
{
"epoch": 2.39,
"learning_rate": 3.7725725642960047e-06,
"loss": 0.111,
"step": 260
},
{
"epoch": 2.39,
"learning_rate": 3.7361786174460414e-06,
"loss": 0.0948,
"step": 261
},
{
"epoch": 2.4,
"learning_rate": 3.6998560595201188e-06,
"loss": 0.0972,
"step": 262
},
{
"epoch": 2.41,
"learning_rate": 3.6636069422546363e-06,
"loss": 0.0982,
"step": 263
},
{
"epoch": 2.42,
"learning_rate": 3.627433313237576e-06,
"loss": 0.1374,
"step": 264
},
{
"epoch": 2.43,
"learning_rate": 3.5913372157928515e-06,
"loss": 0.0988,
"step": 265
},
{
"epoch": 2.44,
"learning_rate": 3.555320688864889e-06,
"loss": 0.1281,
"step": 266
},
{
"epoch": 2.45,
"learning_rate": 3.519385766903442e-06,
"loss": 0.1273,
"step": 267
},
{
"epoch": 2.46,
"learning_rate": 3.483534479748688e-06,
"loss": 0.1256,
"step": 268
},
{
"epoch": 2.47,
"learning_rate": 3.447768852516554e-06,
"loss": 0.1131,
"step": 269
},
{
"epoch": 2.48,
"learning_rate": 3.4120909054843375e-06,
"loss": 0.0974,
"step": 270
},
{
"epoch": 2.49,
"learning_rate": 3.3765026539765832e-06,
"loss": 0.1233,
"step": 271
},
{
"epoch": 2.5,
"learning_rate": 3.3410061082512422e-06,
"loss": 0.1175,
"step": 272
},
{
"epoch": 2.5,
"learning_rate": 3.3056032733861188e-06,
"loss": 0.1178,
"step": 273
},
{
"epoch": 2.51,
"learning_rate": 3.2702961491656197e-06,
"loss": 0.11,
"step": 274
},
{
"epoch": 2.52,
"learning_rate": 3.2350867299677802e-06,
"loss": 0.1091,
"step": 275
},
{
"epoch": 2.53,
"learning_rate": 3.1999770046516198e-06,
"loss": 0.1146,
"step": 276
},
{
"epoch": 2.54,
"learning_rate": 3.164968956444791e-06,
"loss": 0.0945,
"step": 277
},
{
"epoch": 2.55,
"learning_rate": 3.130064562831553e-06,
"loss": 0.1216,
"step": 278
},
{
"epoch": 2.56,
"learning_rate": 3.0952657954410792e-06,
"loss": 0.1273,
"step": 279
},
{
"epoch": 2.57,
"learning_rate": 3.0605746199360755e-06,
"loss": 0.1161,
"step": 280
},
{
"epoch": 2.58,
"learning_rate": 3.0259929959017585e-06,
"loss": 0.1395,
"step": 281
},
{
"epoch": 2.59,
"learning_rate": 2.991522876735154e-06,
"loss": 0.1224,
"step": 282
},
{
"epoch": 2.6,
"learning_rate": 2.95716620953476e-06,
"loss": 0.1052,
"step": 283
},
{
"epoch": 2.61,
"learning_rate": 2.9229249349905686e-06,
"loss": 0.1383,
"step": 284
},
{
"epoch": 2.61,
"learning_rate": 2.8888009872744332e-06,
"loss": 0.0985,
"step": 285
},
{
"epoch": 2.62,
"learning_rate": 2.8547962939308187e-06,
"loss": 0.1164,
"step": 286
},
{
"epoch": 2.63,
"learning_rate": 2.8209127757679246e-06,
"loss": 0.095,
"step": 287
},
{
"epoch": 2.64,
"learning_rate": 2.787152346749173e-06,
"loss": 0.1308,
"step": 288
},
{
"epoch": 2.65,
"learning_rate": 2.7535169138851124e-06,
"loss": 0.1336,
"step": 289
},
{
"epoch": 2.66,
"learning_rate": 2.720008377125682e-06,
"loss": 0.0952,
"step": 290
},
{
"epoch": 2.67,
"learning_rate": 2.686628629252899e-06,
"loss": 0.134,
"step": 291
},
{
"epoch": 2.68,
"learning_rate": 2.6533795557739407e-06,
"loss": 0.1008,
"step": 292
},
{
"epoch": 2.69,
"learning_rate": 2.6202630348146323e-06,
"loss": 0.0938,
"step": 293
},
{
"epoch": 2.7,
"learning_rate": 2.5872809370133704e-06,
"loss": 0.1146,
"step": 294
},
{
"epoch": 2.71,
"learning_rate": 2.5544351254154407e-06,
"loss": 0.0909,
"step": 295
},
{
"epoch": 2.72,
"learning_rate": 2.5217274553677975e-06,
"loss": 0.1054,
"step": 296
},
{
"epoch": 2.72,
"learning_rate": 2.489159774414252e-06,
"loss": 0.0909,
"step": 297
},
{
"epoch": 2.73,
"learning_rate": 2.4567339221911086e-06,
"loss": 0.0918,
"step": 298
},
{
"epoch": 2.74,
"learning_rate": 2.424451730323261e-06,
"loss": 0.1043,
"step": 299
},
{
"epoch": 2.75,
"learning_rate": 2.3923150223207176e-06,
"loss": 0.1139,
"step": 300
},
{
"epoch": 2.76,
"learning_rate": 2.3603256134756066e-06,
"loss": 0.0958,
"step": 301
},
{
"epoch": 2.77,
"learning_rate": 2.328485310759635e-06,
"loss": 0.1115,
"step": 302
},
{
"epoch": 2.78,
"learning_rate": 2.296795912722014e-06,
"loss": 0.092,
"step": 303
},
{
"epoch": 2.79,
"learning_rate": 2.265259209387867e-06,
"loss": 0.1011,
"step": 304
},
{
"epoch": 2.8,
"learning_rate": 2.2338769821571225e-06,
"loss": 0.106,
"step": 305
},
{
"epoch": 2.81,
"learning_rate": 2.202651003703885e-06,
"loss": 0.1053,
"step": 306
},
{
"epoch": 2.82,
"learning_rate": 2.1715830378763025e-06,
"loss": 0.1314,
"step": 307
},
{
"epoch": 2.83,
"learning_rate": 2.140674839596931e-06,
"loss": 0.1252,
"step": 308
},
{
"epoch": 2.83,
"learning_rate": 2.109928154763606e-06,
"loss": 0.1032,
"step": 309
},
{
"epoch": 2.84,
"learning_rate": 2.0793447201508288e-06,
"loss": 0.0911,
"step": 310
},
{
"epoch": 2.85,
"learning_rate": 2.0489262633116536e-06,
"loss": 0.0865,
"step": 311
},
{
"epoch": 2.86,
"learning_rate": 2.01867450248011e-06,
"loss": 0.1087,
"step": 312
},
{
"epoch": 2.87,
"learning_rate": 1.9885911464741413e-06,
"loss": 0.1136,
"step": 313
},
{
"epoch": 2.88,
"learning_rate": 1.9586778945990785e-06,
"loss": 0.0873,
"step": 314
},
{
"epoch": 2.89,
"learning_rate": 1.928936436551661e-06,
"loss": 0.1155,
"step": 315
},
{
"epoch": 2.9,
"learning_rate": 1.8993684523245842e-06,
"loss": 0.117,
"step": 316
},
{
"epoch": 2.91,
"learning_rate": 1.8699756121115997e-06,
"loss": 0.1083,
"step": 317
},
{
"epoch": 2.92,
"learning_rate": 1.8407595762131814e-06,
"loss": 0.1099,
"step": 318
},
{
"epoch": 2.93,
"learning_rate": 1.811721994942731e-06,
"loss": 0.1183,
"step": 319
},
{
"epoch": 2.94,
"learning_rate": 1.7828645085333645e-06,
"loss": 0.1168,
"step": 320
},
{
"epoch": 2.94,
"learning_rate": 1.7541887470452606e-06,
"loss": 0.0985,
"step": 321
},
{
"epoch": 2.95,
"learning_rate": 1.7256963302735752e-06,
"loss": 0.1012,
"step": 322
},
{
"epoch": 2.96,
"learning_rate": 1.6973888676569594e-06,
"loss": 0.1107,
"step": 323
},
{
"epoch": 2.97,
"learning_rate": 1.6692679581866334e-06,
"loss": 0.1301,
"step": 324
},
{
"epoch": 2.98,
"learning_rate": 1.6413351903160763e-06,
"loss": 0.1022,
"step": 325
},
{
"epoch": 2.99,
"learning_rate": 1.6135921418712959e-06,
"loss": 0.1213,
"step": 326
},
{
"epoch": 3.0,
"learning_rate": 1.5860403799616951e-06,
"loss": 0.1093,
"step": 327
},
{
"epoch": 3.01,
"learning_rate": 1.5586814608915673e-06,
"loss": 0.0554,
"step": 328
},
{
"epoch": 3.02,
"learning_rate": 1.5315169300721694e-06,
"loss": 0.084,
"step": 329
},
{
"epoch": 3.03,
"learning_rate": 1.5045483219344387e-06,
"loss": 0.0741,
"step": 330
}
],
"max_steps": 436,
"num_train_epochs": 4,
"total_flos": 1.2638375430617825e+18,
"trial_name": null,
"trial_params": null
}