diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,5850 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 831, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0012033694344163659, + "grad_norm": 12.4375, + "learning_rate": 1.9999928539313353e-05, + "loss": 1.9787, + "step": 1 + }, + { + "epoch": 0.0024067388688327317, + "grad_norm": 8.375, + "learning_rate": 1.9999714158274743e-05, + "loss": 3.438, + "step": 2 + }, + { + "epoch": 0.0036101083032490976, + "grad_norm": 12.9375, + "learning_rate": 1.9999356859948126e-05, + "loss": 1.7121, + "step": 3 + }, + { + "epoch": 0.0048134777376654635, + "grad_norm": 12.6875, + "learning_rate": 1.9998856649440058e-05, + "loss": 3.1317, + "step": 4 + }, + { + "epoch": 0.006016847172081829, + "grad_norm": 8.8125, + "learning_rate": 1.9998213533899625e-05, + "loss": 2.1194, + "step": 5 + }, + { + "epoch": 0.007220216606498195, + "grad_norm": 10.3125, + "learning_rate": 1.9997427522518315e-05, + "loss": 2.833, + "step": 6 + }, + { + "epoch": 0.00842358604091456, + "grad_norm": 14.1875, + "learning_rate": 1.9996498626529914e-05, + "loss": 2.1864, + "step": 7 + }, + { + "epoch": 0.009626955475330927, + "grad_norm": 8.6875, + "learning_rate": 1.999542685921033e-05, + "loss": 2.7296, + "step": 8 + }, + { + "epoch": 0.010830324909747292, + "grad_norm": 9.4375, + "learning_rate": 1.9994212235877407e-05, + "loss": 1.9642, + "step": 9 + }, + { + "epoch": 0.012033694344163659, + "grad_norm": 6.46875, + "learning_rate": 1.9992854773890714e-05, + "loss": 2.6336, + "step": 10 + }, + { + "epoch": 0.013237063778580024, + "grad_norm": 7.875, + "learning_rate": 1.9991354492651283e-05, + "loss": 1.7843, + "step": 11 + }, + { + "epoch": 0.01444043321299639, + "grad_norm": 4.8125, + "learning_rate": 1.9989711413601332e-05, + "loss": 2.5641, + "step": 12 + }, + { + "epoch": 0.015643802647412757, + "grad_norm": 13.9375, + "learning_rate": 1.998792556022398e-05, + "loss": 2.2421, + "step": 13 + }, + { + "epoch": 0.01684717208182912, + "grad_norm": 3.984375, + "learning_rate": 1.9985996958042887e-05, + "loss": 2.4817, + "step": 14 + }, + { + "epoch": 0.018050541516245487, + "grad_norm": 15.875, + "learning_rate": 1.9983925634621894e-05, + "loss": 2.2535, + "step": 15 + }, + { + "epoch": 0.019253910950661854, + "grad_norm": 3.5625, + "learning_rate": 1.9981711619564654e-05, + "loss": 2.3963, + "step": 16 + }, + { + "epoch": 0.02045728038507822, + "grad_norm": 10.75, + "learning_rate": 1.997935494451416e-05, + "loss": 2.03, + "step": 17 + }, + { + "epoch": 0.021660649819494584, + "grad_norm": 3.15625, + "learning_rate": 1.997685564315234e-05, + "loss": 2.344, + "step": 18 + }, + { + "epoch": 0.02286401925391095, + "grad_norm": 12.25, + "learning_rate": 1.9974213751199556e-05, + "loss": 1.928, + "step": 19 + }, + { + "epoch": 0.024067388688327317, + "grad_norm": 2.90625, + "learning_rate": 1.9971429306414087e-05, + "loss": 2.3099, + "step": 20 + }, + { + "epoch": 0.02527075812274368, + "grad_norm": 7.84375, + "learning_rate": 1.99685023485916e-05, + "loss": 1.7257, + "step": 21 + }, + { + "epoch": 0.026474127557160047, + "grad_norm": 2.671875, + "learning_rate": 1.9965432919564583e-05, + "loss": 2.2477, + "step": 22 + }, + { + "epoch": 0.027677496991576414, + "grad_norm": 8.625, + "learning_rate": 1.9962221063201734e-05, + "loss": 1.8483, + "step": 23 + }, + { + "epoch": 0.02888086642599278, + "grad_norm": 3.21875, + "learning_rate": 1.995886682540734e-05, + "loss": 2.1731, + "step": 24 + }, + { + "epoch": 0.030084235860409144, + "grad_norm": 7.625, + "learning_rate": 1.9955370254120635e-05, + "loss": 1.5969, + "step": 25 + }, + { + "epoch": 0.031287605294825514, + "grad_norm": 2.609375, + "learning_rate": 1.9951731399315095e-05, + "loss": 2.2097, + "step": 26 + }, + { + "epoch": 0.032490974729241874, + "grad_norm": 9.0, + "learning_rate": 1.994795031299773e-05, + "loss": 1.4314, + "step": 27 + }, + { + "epoch": 0.03369434416365824, + "grad_norm": 2.59375, + "learning_rate": 1.9944027049208347e-05, + "loss": 2.14, + "step": 28 + }, + { + "epoch": 0.03489771359807461, + "grad_norm": 9.6875, + "learning_rate": 1.993996166401877e-05, + "loss": 1.7424, + "step": 29 + }, + { + "epoch": 0.036101083032490974, + "grad_norm": 2.234375, + "learning_rate": 1.993575421553204e-05, + "loss": 2.1018, + "step": 30 + }, + { + "epoch": 0.03730445246690734, + "grad_norm": 12.0, + "learning_rate": 1.9931404763881598e-05, + "loss": 1.9668, + "step": 31 + }, + { + "epoch": 0.03850782190132371, + "grad_norm": 2.25, + "learning_rate": 1.9926913371230393e-05, + "loss": 2.1388, + "step": 32 + }, + { + "epoch": 0.039711191335740074, + "grad_norm": 12.5625, + "learning_rate": 1.992228010177003e-05, + "loss": 2.1242, + "step": 33 + }, + { + "epoch": 0.04091456077015644, + "grad_norm": 2.1875, + "learning_rate": 1.9917505021719833e-05, + "loss": 2.0531, + "step": 34 + }, + { + "epoch": 0.0421179302045728, + "grad_norm": 11.0625, + "learning_rate": 1.99125881993259e-05, + "loss": 2.0355, + "step": 35 + }, + { + "epoch": 0.04332129963898917, + "grad_norm": 2.0625, + "learning_rate": 1.990752970486014e-05, + "loss": 2.1009, + "step": 36 + }, + { + "epoch": 0.044524669073405534, + "grad_norm": 8.9375, + "learning_rate": 1.990232961061924e-05, + "loss": 1.6248, + "step": 37 + }, + { + "epoch": 0.0457280385078219, + "grad_norm": 1.984375, + "learning_rate": 1.989698799092366e-05, + "loss": 2.0782, + "step": 38 + }, + { + "epoch": 0.04693140794223827, + "grad_norm": 9.625, + "learning_rate": 1.9891504922116572e-05, + "loss": 1.6215, + "step": 39 + }, + { + "epoch": 0.048134777376654635, + "grad_norm": 2.484375, + "learning_rate": 1.988588048256274e-05, + "loss": 2.0563, + "step": 40 + }, + { + "epoch": 0.049338146811071, + "grad_norm": 9.1875, + "learning_rate": 1.9880114752647434e-05, + "loss": 2.0851, + "step": 41 + }, + { + "epoch": 0.05054151624548736, + "grad_norm": 2.09375, + "learning_rate": 1.9874207814775252e-05, + "loss": 2.0189, + "step": 42 + }, + { + "epoch": 0.05174488567990373, + "grad_norm": 9.375, + "learning_rate": 1.9868159753368964e-05, + "loss": 1.8673, + "step": 43 + }, + { + "epoch": 0.052948255114320095, + "grad_norm": 1.8828125, + "learning_rate": 1.9861970654868292e-05, + "loss": 2.0445, + "step": 44 + }, + { + "epoch": 0.05415162454873646, + "grad_norm": 8.625, + "learning_rate": 1.9855640607728684e-05, + "loss": 2.1062, + "step": 45 + }, + { + "epoch": 0.05535499398315283, + "grad_norm": 2.234375, + "learning_rate": 1.9849169702420044e-05, + "loss": 2.0244, + "step": 46 + }, + { + "epoch": 0.056558363417569195, + "grad_norm": 9.0625, + "learning_rate": 1.9842558031425434e-05, + "loss": 1.9904, + "step": 47 + }, + { + "epoch": 0.05776173285198556, + "grad_norm": 2.296875, + "learning_rate": 1.983580568923977e-05, + "loss": 2.0685, + "step": 48 + }, + { + "epoch": 0.05896510228640193, + "grad_norm": 10.5625, + "learning_rate": 1.982891277236845e-05, + "loss": 1.6353, + "step": 49 + }, + { + "epoch": 0.06016847172081829, + "grad_norm": 1.8046875, + "learning_rate": 1.9821879379325985e-05, + "loss": 1.9923, + "step": 50 + }, + { + "epoch": 0.061371841155234655, + "grad_norm": 8.125, + "learning_rate": 1.9814705610634602e-05, + "loss": 1.9369, + "step": 51 + }, + { + "epoch": 0.06257521058965103, + "grad_norm": 1.9765625, + "learning_rate": 1.9807391568822785e-05, + "loss": 1.9696, + "step": 52 + }, + { + "epoch": 0.06377858002406739, + "grad_norm": 9.5, + "learning_rate": 1.9799937358423826e-05, + "loss": 2.1856, + "step": 53 + }, + { + "epoch": 0.06498194945848375, + "grad_norm": 2.375, + "learning_rate": 1.9792343085974316e-05, + "loss": 2.0433, + "step": 54 + }, + { + "epoch": 0.06618531889290012, + "grad_norm": 9.25, + "learning_rate": 1.9784608860012652e-05, + "loss": 2.0544, + "step": 55 + }, + { + "epoch": 0.06738868832731648, + "grad_norm": 1.875, + "learning_rate": 1.9776734791077442e-05, + "loss": 1.9472, + "step": 56 + }, + { + "epoch": 0.06859205776173286, + "grad_norm": 11.3125, + "learning_rate": 1.976872099170597e-05, + "loss": 1.8057, + "step": 57 + }, + { + "epoch": 0.06979542719614922, + "grad_norm": 2.125, + "learning_rate": 1.976056757643255e-05, + "loss": 1.9963, + "step": 58 + }, + { + "epoch": 0.07099879663056559, + "grad_norm": 11.375, + "learning_rate": 1.9752274661786916e-05, + "loss": 1.8518, + "step": 59 + }, + { + "epoch": 0.07220216606498195, + "grad_norm": 1.8125, + "learning_rate": 1.9743842366292544e-05, + "loss": 1.9508, + "step": 60 + }, + { + "epoch": 0.07340553549939831, + "grad_norm": 9.3125, + "learning_rate": 1.9735270810464958e-05, + "loss": 1.9033, + "step": 61 + }, + { + "epoch": 0.07460890493381468, + "grad_norm": 1.625, + "learning_rate": 1.9726560116810006e-05, + "loss": 1.9837, + "step": 62 + }, + { + "epoch": 0.07581227436823104, + "grad_norm": 15.0, + "learning_rate": 1.971771040982213e-05, + "loss": 1.8616, + "step": 63 + }, + { + "epoch": 0.07701564380264742, + "grad_norm": 1.6875, + "learning_rate": 1.9708721815982543e-05, + "loss": 1.9847, + "step": 64 + }, + { + "epoch": 0.07821901323706378, + "grad_norm": 7.59375, + "learning_rate": 1.9699594463757475e-05, + "loss": 1.7556, + "step": 65 + }, + { + "epoch": 0.07942238267148015, + "grad_norm": 1.6796875, + "learning_rate": 1.9690328483596287e-05, + "loss": 1.9431, + "step": 66 + }, + { + "epoch": 0.08062575210589651, + "grad_norm": 8.25, + "learning_rate": 1.968092400792965e-05, + "loss": 2.1827, + "step": 67 + }, + { + "epoch": 0.08182912154031288, + "grad_norm": 2.0625, + "learning_rate": 1.9671381171167616e-05, + "loss": 1.9171, + "step": 68 + }, + { + "epoch": 0.08303249097472924, + "grad_norm": 8.0625, + "learning_rate": 1.9661700109697718e-05, + "loss": 1.8986, + "step": 69 + }, + { + "epoch": 0.0842358604091456, + "grad_norm": 2.1875, + "learning_rate": 1.9651880961883025e-05, + "loss": 2.0004, + "step": 70 + }, + { + "epoch": 0.08543922984356198, + "grad_norm": 8.875, + "learning_rate": 1.964192386806013e-05, + "loss": 1.4482, + "step": 71 + }, + { + "epoch": 0.08664259927797834, + "grad_norm": 2.28125, + "learning_rate": 1.9631828970537196e-05, + "loss": 1.918, + "step": 72 + }, + { + "epoch": 0.08784596871239471, + "grad_norm": 11.375, + "learning_rate": 1.9621596413591885e-05, + "loss": 1.8718, + "step": 73 + }, + { + "epoch": 0.08904933814681107, + "grad_norm": 1.6015625, + "learning_rate": 1.96112263434693e-05, + "loss": 1.9367, + "step": 74 + }, + { + "epoch": 0.09025270758122744, + "grad_norm": 10.4375, + "learning_rate": 1.960071890837991e-05, + "loss": 1.8661, + "step": 75 + }, + { + "epoch": 0.0914560770156438, + "grad_norm": 1.75, + "learning_rate": 1.9590074258497423e-05, + "loss": 1.9062, + "step": 76 + }, + { + "epoch": 0.09265944645006016, + "grad_norm": 8.625, + "learning_rate": 1.957929254595664e-05, + "loss": 2.0265, + "step": 77 + }, + { + "epoch": 0.09386281588447654, + "grad_norm": 2.125, + "learning_rate": 1.9568373924851267e-05, + "loss": 1.9558, + "step": 78 + }, + { + "epoch": 0.0950661853188929, + "grad_norm": 10.9375, + "learning_rate": 1.9557318551231745e-05, + "loss": 1.7762, + "step": 79 + }, + { + "epoch": 0.09626955475330927, + "grad_norm": 1.875, + "learning_rate": 1.9546126583102983e-05, + "loss": 1.8972, + "step": 80 + }, + { + "epoch": 0.09747292418772563, + "grad_norm": 9.625, + "learning_rate": 1.953479818042214e-05, + "loss": 2.105, + "step": 81 + }, + { + "epoch": 0.098676293622142, + "grad_norm": 2.28125, + "learning_rate": 1.952333350509629e-05, + "loss": 1.9388, + "step": 82 + }, + { + "epoch": 0.09987966305655836, + "grad_norm": 7.34375, + "learning_rate": 1.9511732720980156e-05, + "loss": 1.7736, + "step": 83 + }, + { + "epoch": 0.10108303249097472, + "grad_norm": 2.375, + "learning_rate": 1.949999599387373e-05, + "loss": 1.8761, + "step": 84 + }, + { + "epoch": 0.1022864019253911, + "grad_norm": 7.71875, + "learning_rate": 1.9488123491519935e-05, + "loss": 1.8785, + "step": 85 + }, + { + "epoch": 0.10348977135980746, + "grad_norm": 2.125, + "learning_rate": 1.94761153836022e-05, + "loss": 1.9391, + "step": 86 + }, + { + "epoch": 0.10469314079422383, + "grad_norm": 7.875, + "learning_rate": 1.9463971841742057e-05, + "loss": 1.3668, + "step": 87 + }, + { + "epoch": 0.10589651022864019, + "grad_norm": 2.296875, + "learning_rate": 1.9451693039496665e-05, + "loss": 1.9245, + "step": 88 + }, + { + "epoch": 0.10709987966305656, + "grad_norm": 8.5, + "learning_rate": 1.9439279152356363e-05, + "loss": 2.1398, + "step": 89 + }, + { + "epoch": 0.10830324909747292, + "grad_norm": 2.453125, + "learning_rate": 1.9426730357742123e-05, + "loss": 1.9277, + "step": 90 + }, + { + "epoch": 0.1095066185318893, + "grad_norm": 9.3125, + "learning_rate": 1.9414046835003043e-05, + "loss": 1.6924, + "step": 91 + }, + { + "epoch": 0.11070998796630566, + "grad_norm": 2.390625, + "learning_rate": 1.9401228765413774e-05, + "loss": 1.8802, + "step": 92 + }, + { + "epoch": 0.11191335740072202, + "grad_norm": 9.0625, + "learning_rate": 1.938827633217193e-05, + "loss": 2.0444, + "step": 93 + }, + { + "epoch": 0.11311672683513839, + "grad_norm": 2.109375, + "learning_rate": 1.9375189720395454e-05, + "loss": 1.8791, + "step": 94 + }, + { + "epoch": 0.11432009626955475, + "grad_norm": 8.0625, + "learning_rate": 1.936196911712001e-05, + "loss": 1.5531, + "step": 95 + }, + { + "epoch": 0.11552346570397112, + "grad_norm": 1.9765625, + "learning_rate": 1.934861471129627e-05, + "loss": 1.886, + "step": 96 + }, + { + "epoch": 0.11672683513838748, + "grad_norm": 12.375, + "learning_rate": 1.9335126693787237e-05, + "loss": 1.464, + "step": 97 + }, + { + "epoch": 0.11793020457280386, + "grad_norm": 1.71875, + "learning_rate": 1.9321505257365508e-05, + "loss": 1.8344, + "step": 98 + }, + { + "epoch": 0.11913357400722022, + "grad_norm": 9.25, + "learning_rate": 1.930775059671053e-05, + "loss": 2.2084, + "step": 99 + }, + { + "epoch": 0.12033694344163658, + "grad_norm": 2.203125, + "learning_rate": 1.9293862908405795e-05, + "loss": 1.8806, + "step": 100 + }, + { + "epoch": 0.12154031287605295, + "grad_norm": 8.125, + "learning_rate": 1.927984239093605e-05, + "loss": 1.5898, + "step": 101 + }, + { + "epoch": 0.12274368231046931, + "grad_norm": 1.8515625, + "learning_rate": 1.926568924468446e-05, + "loss": 1.8556, + "step": 102 + }, + { + "epoch": 0.12394705174488568, + "grad_norm": 7.90625, + "learning_rate": 1.9251403671929738e-05, + "loss": 1.7307, + "step": 103 + }, + { + "epoch": 0.12515042117930206, + "grad_norm": 2.046875, + "learning_rate": 1.9236985876843243e-05, + "loss": 1.8625, + "step": 104 + }, + { + "epoch": 0.1263537906137184, + "grad_norm": 7.875, + "learning_rate": 1.922243606548609e-05, + "loss": 2.0208, + "step": 105 + }, + { + "epoch": 0.12755716004813478, + "grad_norm": 2.453125, + "learning_rate": 1.9207754445806176e-05, + "loss": 1.9865, + "step": 106 + }, + { + "epoch": 0.12876052948255115, + "grad_norm": 10.0, + "learning_rate": 1.9192941227635232e-05, + "loss": 1.6236, + "step": 107 + }, + { + "epoch": 0.1299638989169675, + "grad_norm": 3.140625, + "learning_rate": 1.91779966226858e-05, + "loss": 1.9087, + "step": 108 + }, + { + "epoch": 0.13116726835138387, + "grad_norm": 8.3125, + "learning_rate": 1.9162920844548227e-05, + "loss": 1.8531, + "step": 109 + }, + { + "epoch": 0.13237063778580024, + "grad_norm": 2.21875, + "learning_rate": 1.914771410868761e-05, + "loss": 1.8767, + "step": 110 + }, + { + "epoch": 0.13357400722021662, + "grad_norm": 9.8125, + "learning_rate": 1.91323766324407e-05, + "loss": 1.867, + "step": 111 + }, + { + "epoch": 0.13477737665463296, + "grad_norm": 2.46875, + "learning_rate": 1.9116908635012813e-05, + "loss": 1.8482, + "step": 112 + }, + { + "epoch": 0.13598074608904934, + "grad_norm": 7.53125, + "learning_rate": 1.91013103374747e-05, + "loss": 1.7155, + "step": 113 + }, + { + "epoch": 0.1371841155234657, + "grad_norm": 2.390625, + "learning_rate": 1.9085581962759366e-05, + "loss": 1.9164, + "step": 114 + }, + { + "epoch": 0.13838748495788206, + "grad_norm": 7.40625, + "learning_rate": 1.9069723735658903e-05, + "loss": 1.7375, + "step": 115 + }, + { + "epoch": 0.13959085439229843, + "grad_norm": 2.203125, + "learning_rate": 1.905373588282127e-05, + "loss": 1.9173, + "step": 116 + }, + { + "epoch": 0.1407942238267148, + "grad_norm": 11.875, + "learning_rate": 1.903761863274706e-05, + "loss": 2.1522, + "step": 117 + }, + { + "epoch": 0.14199759326113118, + "grad_norm": 2.171875, + "learning_rate": 1.9021372215786218e-05, + "loss": 1.9097, + "step": 118 + }, + { + "epoch": 0.14320096269554752, + "grad_norm": 11.875, + "learning_rate": 1.9004996864134767e-05, + "loss": 1.7723, + "step": 119 + }, + { + "epoch": 0.1444043321299639, + "grad_norm": 2.234375, + "learning_rate": 1.8988492811831485e-05, + "loss": 1.8475, + "step": 120 + }, + { + "epoch": 0.14560770156438027, + "grad_norm": 8.125, + "learning_rate": 1.8971860294754554e-05, + "loss": 1.8492, + "step": 121 + }, + { + "epoch": 0.14681107099879662, + "grad_norm": 1.5703125, + "learning_rate": 1.8955099550618194e-05, + "loss": 1.8399, + "step": 122 + }, + { + "epoch": 0.148014440433213, + "grad_norm": 11.6875, + "learning_rate": 1.8938210818969257e-05, + "loss": 2.1739, + "step": 123 + }, + { + "epoch": 0.14921780986762936, + "grad_norm": 1.8984375, + "learning_rate": 1.8921194341183815e-05, + "loss": 1.8505, + "step": 124 + }, + { + "epoch": 0.15042117930204574, + "grad_norm": 9.0625, + "learning_rate": 1.8904050360463708e-05, + "loss": 1.9786, + "step": 125 + }, + { + "epoch": 0.15162454873646208, + "grad_norm": 1.6171875, + "learning_rate": 1.8886779121833065e-05, + "loss": 1.8304, + "step": 126 + }, + { + "epoch": 0.15282791817087846, + "grad_norm": 8.9375, + "learning_rate": 1.886938087213479e-05, + "loss": 1.6926, + "step": 127 + }, + { + "epoch": 0.15403128760529483, + "grad_norm": 1.78125, + "learning_rate": 1.885185586002707e-05, + "loss": 1.8204, + "step": 128 + }, + { + "epoch": 0.1552346570397112, + "grad_norm": 9.0625, + "learning_rate": 1.8834204335979777e-05, + "loss": 1.8364, + "step": 129 + }, + { + "epoch": 0.15643802647412755, + "grad_norm": 2.03125, + "learning_rate": 1.8816426552270922e-05, + "loss": 1.8379, + "step": 130 + }, + { + "epoch": 0.15764139590854392, + "grad_norm": 11.5625, + "learning_rate": 1.8798522762983026e-05, + "loss": 1.5707, + "step": 131 + }, + { + "epoch": 0.1588447653429603, + "grad_norm": 1.703125, + "learning_rate": 1.8780493223999508e-05, + "loss": 1.8673, + "step": 132 + }, + { + "epoch": 0.16004813477737664, + "grad_norm": 8.625, + "learning_rate": 1.8762338193001013e-05, + "loss": 2.0149, + "step": 133 + }, + { + "epoch": 0.16125150421179302, + "grad_norm": 1.890625, + "learning_rate": 1.8744057929461736e-05, + "loss": 1.8588, + "step": 134 + }, + { + "epoch": 0.1624548736462094, + "grad_norm": 9.8125, + "learning_rate": 1.8725652694645714e-05, + "loss": 1.6913, + "step": 135 + }, + { + "epoch": 0.16365824308062576, + "grad_norm": 2.265625, + "learning_rate": 1.8707122751603098e-05, + "loss": 1.8392, + "step": 136 + }, + { + "epoch": 0.1648616125150421, + "grad_norm": 10.0, + "learning_rate": 1.868846836516637e-05, + "loss": 1.7601, + "step": 137 + }, + { + "epoch": 0.16606498194945848, + "grad_norm": 2.1875, + "learning_rate": 1.8669689801946585e-05, + "loss": 1.7836, + "step": 138 + }, + { + "epoch": 0.16726835138387486, + "grad_norm": 12.0, + "learning_rate": 1.8650787330329546e-05, + "loss": 1.9955, + "step": 139 + }, + { + "epoch": 0.1684717208182912, + "grad_norm": 2.296875, + "learning_rate": 1.863176122047198e-05, + "loss": 1.8541, + "step": 140 + }, + { + "epoch": 0.16967509025270758, + "grad_norm": 15.75, + "learning_rate": 1.861261174429765e-05, + "loss": 1.9102, + "step": 141 + }, + { + "epoch": 0.17087845968712395, + "grad_norm": 2.734375, + "learning_rate": 1.8593339175493514e-05, + "loss": 1.9752, + "step": 142 + }, + { + "epoch": 0.17208182912154033, + "grad_norm": 8.0, + "learning_rate": 1.8573943789505762e-05, + "loss": 1.7416, + "step": 143 + }, + { + "epoch": 0.17328519855595667, + "grad_norm": 1.7890625, + "learning_rate": 1.8554425863535915e-05, + "loss": 1.8621, + "step": 144 + }, + { + "epoch": 0.17448856799037304, + "grad_norm": 9.875, + "learning_rate": 1.8534785676536856e-05, + "loss": 1.4806, + "step": 145 + }, + { + "epoch": 0.17569193742478942, + "grad_norm": 1.8046875, + "learning_rate": 1.851502350920883e-05, + "loss": 1.8894, + "step": 146 + }, + { + "epoch": 0.17689530685920576, + "grad_norm": 8.5625, + "learning_rate": 1.849513964399545e-05, + "loss": 1.5029, + "step": 147 + }, + { + "epoch": 0.17809867629362214, + "grad_norm": 2.125, + "learning_rate": 1.8475134365079642e-05, + "loss": 1.8094, + "step": 148 + }, + { + "epoch": 0.1793020457280385, + "grad_norm": 9.5, + "learning_rate": 1.8455007958379604e-05, + "loss": 1.325, + "step": 149 + }, + { + "epoch": 0.18050541516245489, + "grad_norm": 1.765625, + "learning_rate": 1.8434760711544707e-05, + "loss": 1.8688, + "step": 150 + }, + { + "epoch": 0.18170878459687123, + "grad_norm": 8.25, + "learning_rate": 1.8414392913951382e-05, + "loss": 1.7627, + "step": 151 + }, + { + "epoch": 0.1829121540312876, + "grad_norm": 2.0, + "learning_rate": 1.8393904856698987e-05, + "loss": 1.843, + "step": 152 + }, + { + "epoch": 0.18411552346570398, + "grad_norm": 9.9375, + "learning_rate": 1.8373296832605647e-05, + "loss": 1.5985, + "step": 153 + }, + { + "epoch": 0.18531889290012032, + "grad_norm": 2.0625, + "learning_rate": 1.835256913620408e-05, + "loss": 1.8304, + "step": 154 + }, + { + "epoch": 0.1865222623345367, + "grad_norm": 7.40625, + "learning_rate": 1.8331722063737365e-05, + "loss": 1.7801, + "step": 155 + }, + { + "epoch": 0.18772563176895307, + "grad_norm": 2.171875, + "learning_rate": 1.8310755913154726e-05, + "loss": 1.8439, + "step": 156 + }, + { + "epoch": 0.18892900120336945, + "grad_norm": 8.125, + "learning_rate": 1.8289670984107263e-05, + "loss": 1.5793, + "step": 157 + }, + { + "epoch": 0.1901323706377858, + "grad_norm": 1.7734375, + "learning_rate": 1.826846757794368e-05, + "loss": 1.8254, + "step": 158 + }, + { + "epoch": 0.19133574007220217, + "grad_norm": 10.6875, + "learning_rate": 1.8247145997705977e-05, + "loss": 1.5562, + "step": 159 + }, + { + "epoch": 0.19253910950661854, + "grad_norm": 2.078125, + "learning_rate": 1.8225706548125094e-05, + "loss": 1.8317, + "step": 160 + }, + { + "epoch": 0.19374247894103488, + "grad_norm": 8.125, + "learning_rate": 1.8204149535616596e-05, + "loss": 1.7808, + "step": 161 + }, + { + "epoch": 0.19494584837545126, + "grad_norm": 2.109375, + "learning_rate": 1.8182475268276265e-05, + "loss": 1.8565, + "step": 162 + }, + { + "epoch": 0.19614921780986763, + "grad_norm": 9.0, + "learning_rate": 1.8160684055875704e-05, + "loss": 1.6122, + "step": 163 + }, + { + "epoch": 0.197352587244284, + "grad_norm": 2.1875, + "learning_rate": 1.813877620985792e-05, + "loss": 1.8802, + "step": 164 + }, + { + "epoch": 0.19855595667870035, + "grad_norm": 10.8125, + "learning_rate": 1.8116752043332848e-05, + "loss": 1.8362, + "step": 165 + }, + { + "epoch": 0.19975932611311673, + "grad_norm": 1.84375, + "learning_rate": 1.8094611871072906e-05, + "loss": 1.9092, + "step": 166 + }, + { + "epoch": 0.2009626955475331, + "grad_norm": 10.25, + "learning_rate": 1.8072356009508473e-05, + "loss": 2.1806, + "step": 167 + }, + { + "epoch": 0.20216606498194944, + "grad_norm": 1.9375, + "learning_rate": 1.8049984776723383e-05, + "loss": 1.8313, + "step": 168 + }, + { + "epoch": 0.20336943441636582, + "grad_norm": 8.6875, + "learning_rate": 1.8027498492450367e-05, + "loss": 1.8958, + "step": 169 + }, + { + "epoch": 0.2045728038507822, + "grad_norm": 1.9375, + "learning_rate": 1.8004897478066482e-05, + "loss": 1.8194, + "step": 170 + }, + { + "epoch": 0.20577617328519857, + "grad_norm": 8.875, + "learning_rate": 1.7982182056588536e-05, + "loss": 1.4826, + "step": 171 + }, + { + "epoch": 0.2069795427196149, + "grad_norm": 1.8984375, + "learning_rate": 1.795935255266845e-05, + "loss": 1.8559, + "step": 172 + }, + { + "epoch": 0.20818291215403129, + "grad_norm": 10.8125, + "learning_rate": 1.7936409292588627e-05, + "loss": 1.6988, + "step": 173 + }, + { + "epoch": 0.20938628158844766, + "grad_norm": 2.3125, + "learning_rate": 1.791335260425729e-05, + "loss": 1.8318, + "step": 174 + }, + { + "epoch": 0.21058965102286403, + "grad_norm": 7.78125, + "learning_rate": 1.7890182817203806e-05, + "loss": 1.7352, + "step": 175 + }, + { + "epoch": 0.21179302045728038, + "grad_norm": 2.65625, + "learning_rate": 1.786690026257394e-05, + "loss": 1.8497, + "step": 176 + }, + { + "epoch": 0.21299638989169675, + "grad_norm": 6.6875, + "learning_rate": 1.7843505273125164e-05, + "loss": 1.4856, + "step": 177 + }, + { + "epoch": 0.21419975932611313, + "grad_norm": 1.9296875, + "learning_rate": 1.7819998183221883e-05, + "loss": 1.8217, + "step": 178 + }, + { + "epoch": 0.21540312876052947, + "grad_norm": 9.375, + "learning_rate": 1.7796379328830652e-05, + "loss": 1.9262, + "step": 179 + }, + { + "epoch": 0.21660649819494585, + "grad_norm": 2.15625, + "learning_rate": 1.7772649047515384e-05, + "loss": 1.8533, + "step": 180 + }, + { + "epoch": 0.21780986762936222, + "grad_norm": 7.4375, + "learning_rate": 1.7748807678432514e-05, + "loss": 1.869, + "step": 181 + }, + { + "epoch": 0.2190132370637786, + "grad_norm": 3.3125, + "learning_rate": 1.7724855562326167e-05, + "loss": 1.7709, + "step": 182 + }, + { + "epoch": 0.22021660649819494, + "grad_norm": 6.625, + "learning_rate": 1.7700793041523272e-05, + "loss": 1.3658, + "step": 183 + }, + { + "epoch": 0.2214199759326113, + "grad_norm": 2.484375, + "learning_rate": 1.7676620459928683e-05, + "loss": 1.9054, + "step": 184 + }, + { + "epoch": 0.2226233453670277, + "grad_norm": 9.1875, + "learning_rate": 1.7652338163020257e-05, + "loss": 1.9153, + "step": 185 + }, + { + "epoch": 0.22382671480144403, + "grad_norm": 2.03125, + "learning_rate": 1.7627946497843917e-05, + "loss": 1.9365, + "step": 186 + }, + { + "epoch": 0.2250300842358604, + "grad_norm": 7.15625, + "learning_rate": 1.7603445813008685e-05, + "loss": 1.5009, + "step": 187 + }, + { + "epoch": 0.22623345367027678, + "grad_norm": 2.765625, + "learning_rate": 1.7578836458681718e-05, + "loss": 1.862, + "step": 188 + }, + { + "epoch": 0.22743682310469315, + "grad_norm": 11.0, + "learning_rate": 1.755411878658329e-05, + "loss": 1.7548, + "step": 189 + }, + { + "epoch": 0.2286401925391095, + "grad_norm": 1.71875, + "learning_rate": 1.7529293149981758e-05, + "loss": 1.8371, + "step": 190 + }, + { + "epoch": 0.22984356197352587, + "grad_norm": 10.9375, + "learning_rate": 1.7504359903688537e-05, + "loss": 1.9448, + "step": 191 + }, + { + "epoch": 0.23104693140794225, + "grad_norm": 1.8046875, + "learning_rate": 1.7479319404053004e-05, + "loss": 1.8082, + "step": 192 + }, + { + "epoch": 0.2322503008423586, + "grad_norm": 9.5, + "learning_rate": 1.7454172008957417e-05, + "loss": 1.55, + "step": 193 + }, + { + "epoch": 0.23345367027677497, + "grad_norm": 1.6875, + "learning_rate": 1.7428918077811802e-05, + "loss": 1.8567, + "step": 194 + }, + { + "epoch": 0.23465703971119134, + "grad_norm": 9.1875, + "learning_rate": 1.740355797154881e-05, + "loss": 1.6953, + "step": 195 + }, + { + "epoch": 0.2358604091456077, + "grad_norm": 1.578125, + "learning_rate": 1.7378092052618565e-05, + "loss": 1.781, + "step": 196 + }, + { + "epoch": 0.23706377858002406, + "grad_norm": 7.90625, + "learning_rate": 1.7352520684983474e-05, + "loss": 1.9668, + "step": 197 + }, + { + "epoch": 0.23826714801444043, + "grad_norm": 2.0625, + "learning_rate": 1.7326844234113037e-05, + "loss": 1.8419, + "step": 198 + }, + { + "epoch": 0.2394705174488568, + "grad_norm": 14.4375, + "learning_rate": 1.7301063066978617e-05, + "loss": 1.7109, + "step": 199 + }, + { + "epoch": 0.24067388688327315, + "grad_norm": 1.8203125, + "learning_rate": 1.727517755204819e-05, + "loss": 1.8454, + "step": 200 + }, + { + "epoch": 0.24187725631768953, + "grad_norm": 7.65625, + "learning_rate": 1.72491880592811e-05, + "loss": 1.6736, + "step": 201 + }, + { + "epoch": 0.2430806257521059, + "grad_norm": 1.7734375, + "learning_rate": 1.7223094960122733e-05, + "loss": 1.8484, + "step": 202 + }, + { + "epoch": 0.24428399518652227, + "grad_norm": 8.3125, + "learning_rate": 1.719689862749926e-05, + "loss": 2.0111, + "step": 203 + }, + { + "epoch": 0.24548736462093862, + "grad_norm": 1.7421875, + "learning_rate": 1.7170599435812253e-05, + "loss": 1.8034, + "step": 204 + }, + { + "epoch": 0.246690734055355, + "grad_norm": 8.8125, + "learning_rate": 1.714419776093338e-05, + "loss": 1.6535, + "step": 205 + }, + { + "epoch": 0.24789410348977137, + "grad_norm": 1.8125, + "learning_rate": 1.7117693980198996e-05, + "loss": 1.7763, + "step": 206 + }, + { + "epoch": 0.2490974729241877, + "grad_norm": 9.5625, + "learning_rate": 1.709108847240478e-05, + "loss": 1.4858, + "step": 207 + }, + { + "epoch": 0.2503008423586041, + "grad_norm": 2.015625, + "learning_rate": 1.7064381617800302e-05, + "loss": 1.8895, + "step": 208 + }, + { + "epoch": 0.25150421179302046, + "grad_norm": 9.375, + "learning_rate": 1.7037573798083598e-05, + "loss": 1.7483, + "step": 209 + }, + { + "epoch": 0.2527075812274368, + "grad_norm": 1.78125, + "learning_rate": 1.7010665396395706e-05, + "loss": 1.8537, + "step": 210 + }, + { + "epoch": 0.2539109506618532, + "grad_norm": 7.625, + "learning_rate": 1.6983656797315197e-05, + "loss": 1.4985, + "step": 211 + }, + { + "epoch": 0.25511432009626955, + "grad_norm": 1.765625, + "learning_rate": 1.6956548386852684e-05, + "loss": 1.8389, + "step": 212 + }, + { + "epoch": 0.2563176895306859, + "grad_norm": 8.3125, + "learning_rate": 1.6929340552445283e-05, + "loss": 2.014, + "step": 213 + }, + { + "epoch": 0.2575210589651023, + "grad_norm": 2.21875, + "learning_rate": 1.6902033682951104e-05, + "loss": 1.8123, + "step": 214 + }, + { + "epoch": 0.25872442839951865, + "grad_norm": 8.75, + "learning_rate": 1.6874628168643683e-05, + "loss": 1.4427, + "step": 215 + }, + { + "epoch": 0.259927797833935, + "grad_norm": 1.703125, + "learning_rate": 1.6847124401206384e-05, + "loss": 1.8153, + "step": 216 + }, + { + "epoch": 0.2611311672683514, + "grad_norm": 10.4375, + "learning_rate": 1.681952277372683e-05, + "loss": 1.7849, + "step": 217 + }, + { + "epoch": 0.26233453670276774, + "grad_norm": 1.7890625, + "learning_rate": 1.6791823680691276e-05, + "loss": 1.8759, + "step": 218 + }, + { + "epoch": 0.26353790613718414, + "grad_norm": 7.96875, + "learning_rate": 1.676402751797896e-05, + "loss": 1.5881, + "step": 219 + }, + { + "epoch": 0.2647412755716005, + "grad_norm": 1.7890625, + "learning_rate": 1.673613468285646e-05, + "loss": 1.7757, + "step": 220 + }, + { + "epoch": 0.26594464500601683, + "grad_norm": 9.125, + "learning_rate": 1.6708145573972005e-05, + "loss": 1.6333, + "step": 221 + }, + { + "epoch": 0.26714801444043323, + "grad_norm": 2.03125, + "learning_rate": 1.6680060591349774e-05, + "loss": 1.7887, + "step": 222 + }, + { + "epoch": 0.2683513838748496, + "grad_norm": 11.125, + "learning_rate": 1.6651880136384215e-05, + "loss": 1.8873, + "step": 223 + }, + { + "epoch": 0.2695547533092659, + "grad_norm": 1.8203125, + "learning_rate": 1.662360461183424e-05, + "loss": 1.8092, + "step": 224 + }, + { + "epoch": 0.27075812274368233, + "grad_norm": 8.5, + "learning_rate": 1.659523442181754e-05, + "loss": 1.7207, + "step": 225 + }, + { + "epoch": 0.2719614921780987, + "grad_norm": 1.8828125, + "learning_rate": 1.6566769971804763e-05, + "loss": 1.7972, + "step": 226 + }, + { + "epoch": 0.273164861612515, + "grad_norm": 9.5, + "learning_rate": 1.653821166861374e-05, + "loss": 1.6875, + "step": 227 + }, + { + "epoch": 0.2743682310469314, + "grad_norm": 1.859375, + "learning_rate": 1.6509559920403663e-05, + "loss": 1.8418, + "step": 228 + }, + { + "epoch": 0.27557160048134777, + "grad_norm": 7.71875, + "learning_rate": 1.6480815136669248e-05, + "loss": 1.5936, + "step": 229 + }, + { + "epoch": 0.2767749699157641, + "grad_norm": 2.109375, + "learning_rate": 1.6451977728234894e-05, + "loss": 1.8199, + "step": 230 + }, + { + "epoch": 0.2779783393501805, + "grad_norm": 8.75, + "learning_rate": 1.64230481072488e-05, + "loss": 1.7345, + "step": 231 + }, + { + "epoch": 0.27918170878459686, + "grad_norm": 2.046875, + "learning_rate": 1.639402668717709e-05, + "loss": 1.8464, + "step": 232 + }, + { + "epoch": 0.28038507821901326, + "grad_norm": 8.3125, + "learning_rate": 1.6364913882797875e-05, + "loss": 1.6354, + "step": 233 + }, + { + "epoch": 0.2815884476534296, + "grad_norm": 1.9609375, + "learning_rate": 1.633571011019536e-05, + "loss": 1.809, + "step": 234 + }, + { + "epoch": 0.28279181708784595, + "grad_norm": 7.6875, + "learning_rate": 1.630641578675387e-05, + "loss": 1.9231, + "step": 235 + }, + { + "epoch": 0.28399518652226236, + "grad_norm": 1.84375, + "learning_rate": 1.62770313311519e-05, + "loss": 1.8013, + "step": 236 + }, + { + "epoch": 0.2851985559566787, + "grad_norm": 8.75, + "learning_rate": 1.6247557163356127e-05, + "loss": 1.5578, + "step": 237 + }, + { + "epoch": 0.28640192539109505, + "grad_norm": 2.46875, + "learning_rate": 1.62179937046154e-05, + "loss": 1.77, + "step": 238 + }, + { + "epoch": 0.28760529482551145, + "grad_norm": 7.53125, + "learning_rate": 1.6188341377454735e-05, + "loss": 1.4921, + "step": 239 + }, + { + "epoch": 0.2888086642599278, + "grad_norm": 1.8125, + "learning_rate": 1.6158600605669264e-05, + "loss": 1.7778, + "step": 240 + }, + { + "epoch": 0.29001203369434414, + "grad_norm": 10.875, + "learning_rate": 1.6128771814318178e-05, + "loss": 1.7163, + "step": 241 + }, + { + "epoch": 0.29121540312876054, + "grad_norm": 2.109375, + "learning_rate": 1.6098855429718662e-05, + "loss": 1.7586, + "step": 242 + }, + { + "epoch": 0.2924187725631769, + "grad_norm": 8.5, + "learning_rate": 1.606885187943979e-05, + "loss": 1.5912, + "step": 243 + }, + { + "epoch": 0.29362214199759323, + "grad_norm": 2.34375, + "learning_rate": 1.6038761592296435e-05, + "loss": 1.7934, + "step": 244 + }, + { + "epoch": 0.29482551143200963, + "grad_norm": 8.6875, + "learning_rate": 1.60085849983431e-05, + "loss": 1.6993, + "step": 245 + }, + { + "epoch": 0.296028880866426, + "grad_norm": 2.21875, + "learning_rate": 1.597832252886781e-05, + "loss": 1.8313, + "step": 246 + }, + { + "epoch": 0.2972322503008424, + "grad_norm": 9.1875, + "learning_rate": 1.594797461638594e-05, + "loss": 1.8327, + "step": 247 + }, + { + "epoch": 0.29843561973525873, + "grad_norm": 2.140625, + "learning_rate": 1.591754169463402e-05, + "loss": 1.747, + "step": 248 + }, + { + "epoch": 0.2996389891696751, + "grad_norm": 8.25, + "learning_rate": 1.5887024198563552e-05, + "loss": 1.5689, + "step": 249 + }, + { + "epoch": 0.3008423586040915, + "grad_norm": 1.625, + "learning_rate": 1.5856422564334772e-05, + "loss": 1.7359, + "step": 250 + }, + { + "epoch": 0.3020457280385078, + "grad_norm": 9.75, + "learning_rate": 1.5825737229310448e-05, + "loss": 1.636, + "step": 251 + }, + { + "epoch": 0.30324909747292417, + "grad_norm": 2.5625, + "learning_rate": 1.5794968632049598e-05, + "loss": 1.8081, + "step": 252 + }, + { + "epoch": 0.30445246690734057, + "grad_norm": 7.59375, + "learning_rate": 1.576411721230124e-05, + "loss": 1.9024, + "step": 253 + }, + { + "epoch": 0.3056558363417569, + "grad_norm": 2.078125, + "learning_rate": 1.57331834109981e-05, + "loss": 1.8062, + "step": 254 + }, + { + "epoch": 0.30685920577617326, + "grad_norm": 7.5, + "learning_rate": 1.570216767025032e-05, + "loss": 1.7436, + "step": 255 + }, + { + "epoch": 0.30806257521058966, + "grad_norm": 1.8828125, + "learning_rate": 1.5671070433339116e-05, + "loss": 1.8097, + "step": 256 + }, + { + "epoch": 0.309265944645006, + "grad_norm": 7.6875, + "learning_rate": 1.5639892144710477e-05, + "loss": 1.5525, + "step": 257 + }, + { + "epoch": 0.3104693140794224, + "grad_norm": 1.90625, + "learning_rate": 1.5608633249968783e-05, + "loss": 1.824, + "step": 258 + }, + { + "epoch": 0.31167268351383876, + "grad_norm": 8.125, + "learning_rate": 1.557729419587045e-05, + "loss": 1.8183, + "step": 259 + }, + { + "epoch": 0.3128760529482551, + "grad_norm": 2.296875, + "learning_rate": 1.5545875430317546e-05, + "loss": 1.7785, + "step": 260 + }, + { + "epoch": 0.3140794223826715, + "grad_norm": 9.125, + "learning_rate": 1.5514377402351376e-05, + "loss": 1.7519, + "step": 261 + }, + { + "epoch": 0.31528279181708785, + "grad_norm": 2.296875, + "learning_rate": 1.548280056214609e-05, + "loss": 1.8864, + "step": 262 + }, + { + "epoch": 0.3164861612515042, + "grad_norm": 9.4375, + "learning_rate": 1.545114536100222e-05, + "loss": 1.7366, + "step": 263 + }, + { + "epoch": 0.3176895306859206, + "grad_norm": 1.796875, + "learning_rate": 1.541941225134025e-05, + "loss": 1.8211, + "step": 264 + }, + { + "epoch": 0.31889290012033694, + "grad_norm": 7.5, + "learning_rate": 1.5387601686694134e-05, + "loss": 1.9038, + "step": 265 + }, + { + "epoch": 0.3200962695547533, + "grad_norm": 1.9765625, + "learning_rate": 1.5355714121704846e-05, + "loss": 1.7855, + "step": 266 + }, + { + "epoch": 0.3212996389891697, + "grad_norm": 7.5625, + "learning_rate": 1.532375001211383e-05, + "loss": 2.1078, + "step": 267 + }, + { + "epoch": 0.32250300842358604, + "grad_norm": 1.9140625, + "learning_rate": 1.529170981475653e-05, + "loss": 1.8535, + "step": 268 + }, + { + "epoch": 0.3237063778580024, + "grad_norm": 13.3125, + "learning_rate": 1.525959398755585e-05, + "loss": 2.0128, + "step": 269 + }, + { + "epoch": 0.3249097472924188, + "grad_norm": 1.6875, + "learning_rate": 1.5227402989515607e-05, + "loss": 1.8164, + "step": 270 + }, + { + "epoch": 0.32611311672683513, + "grad_norm": 8.3125, + "learning_rate": 1.519513728071396e-05, + "loss": 1.7586, + "step": 271 + }, + { + "epoch": 0.32731648616125153, + "grad_norm": 2.015625, + "learning_rate": 1.5162797322296855e-05, + "loss": 1.8448, + "step": 272 + }, + { + "epoch": 0.3285198555956679, + "grad_norm": 12.6875, + "learning_rate": 1.5130383576471415e-05, + "loss": 1.5937, + "step": 273 + }, + { + "epoch": 0.3297232250300842, + "grad_norm": 1.875, + "learning_rate": 1.5097896506499349e-05, + "loss": 1.8645, + "step": 274 + }, + { + "epoch": 0.3309265944645006, + "grad_norm": 6.875, + "learning_rate": 1.5065336576690318e-05, + "loss": 1.5509, + "step": 275 + }, + { + "epoch": 0.33212996389891697, + "grad_norm": 1.6796875, + "learning_rate": 1.5032704252395315e-05, + "loss": 1.7767, + "step": 276 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 8.8125, + "learning_rate": 1.5000000000000002e-05, + "loss": 1.5383, + "step": 277 + }, + { + "epoch": 0.3345367027677497, + "grad_norm": 1.84375, + "learning_rate": 1.496722428691804e-05, + "loss": 1.8775, + "step": 278 + }, + { + "epoch": 0.33574007220216606, + "grad_norm": 8.0625, + "learning_rate": 1.4934377581584425e-05, + "loss": 1.6032, + "step": 279 + }, + { + "epoch": 0.3369434416365824, + "grad_norm": 2.109375, + "learning_rate": 1.490146035344878e-05, + "loss": 1.8144, + "step": 280 + }, + { + "epoch": 0.3381468110709988, + "grad_norm": 11.6875, + "learning_rate": 1.4868473072968645e-05, + "loss": 2.3891, + "step": 281 + }, + { + "epoch": 0.33935018050541516, + "grad_norm": 1.4765625, + "learning_rate": 1.4835416211602771e-05, + "loss": 1.7891, + "step": 282 + }, + { + "epoch": 0.3405535499398315, + "grad_norm": 9.8125, + "learning_rate": 1.4802290241804355e-05, + "loss": 1.9642, + "step": 283 + }, + { + "epoch": 0.3417569193742479, + "grad_norm": 2.03125, + "learning_rate": 1.4769095637014308e-05, + "loss": 1.784, + "step": 284 + }, + { + "epoch": 0.34296028880866425, + "grad_norm": 8.9375, + "learning_rate": 1.473583287165448e-05, + "loss": 1.9608, + "step": 285 + }, + { + "epoch": 0.34416365824308065, + "grad_norm": 1.640625, + "learning_rate": 1.4702502421120884e-05, + "loss": 1.7739, + "step": 286 + }, + { + "epoch": 0.345367027677497, + "grad_norm": 9.3125, + "learning_rate": 1.4669104761776892e-05, + "loss": 1.7284, + "step": 287 + }, + { + "epoch": 0.34657039711191334, + "grad_norm": 2.203125, + "learning_rate": 1.463564037094644e-05, + "loss": 1.7732, + "step": 288 + }, + { + "epoch": 0.34777376654632974, + "grad_norm": 7.78125, + "learning_rate": 1.4602109726907197e-05, + "loss": 1.5802, + "step": 289 + }, + { + "epoch": 0.3489771359807461, + "grad_norm": 1.796875, + "learning_rate": 1.4568513308883732e-05, + "loss": 1.7499, + "step": 290 + }, + { + "epoch": 0.35018050541516244, + "grad_norm": 8.4375, + "learning_rate": 1.4534851597040666e-05, + "loss": 1.7473, + "step": 291 + }, + { + "epoch": 0.35138387484957884, + "grad_norm": 2.375, + "learning_rate": 1.4501125072475804e-05, + "loss": 1.7983, + "step": 292 + }, + { + "epoch": 0.3525872442839952, + "grad_norm": 10.625, + "learning_rate": 1.4467334217213274e-05, + "loss": 2.0496, + "step": 293 + }, + { + "epoch": 0.35379061371841153, + "grad_norm": 1.953125, + "learning_rate": 1.4433479514196615e-05, + "loss": 1.7396, + "step": 294 + }, + { + "epoch": 0.35499398315282793, + "grad_norm": 6.75, + "learning_rate": 1.439956144728189e-05, + "loss": 1.4716, + "step": 295 + }, + { + "epoch": 0.3561973525872443, + "grad_norm": 1.8671875, + "learning_rate": 1.4365580501230776e-05, + "loss": 1.8019, + "step": 296 + }, + { + "epoch": 0.3574007220216607, + "grad_norm": 9.875, + "learning_rate": 1.4331537161703612e-05, + "loss": 2.123, + "step": 297 + }, + { + "epoch": 0.358604091456077, + "grad_norm": 1.921875, + "learning_rate": 1.4297431915252487e-05, + "loss": 1.7762, + "step": 298 + }, + { + "epoch": 0.35980746089049337, + "grad_norm": 8.0625, + "learning_rate": 1.4263265249314269e-05, + "loss": 1.5948, + "step": 299 + }, + { + "epoch": 0.36101083032490977, + "grad_norm": 1.7734375, + "learning_rate": 1.422903765220363e-05, + "loss": 1.8318, + "step": 300 + }, + { + "epoch": 0.3622141997593261, + "grad_norm": 14.9375, + "learning_rate": 1.41947496131061e-05, + "loss": 1.6696, + "step": 301 + }, + { + "epoch": 0.36341756919374246, + "grad_norm": 1.7578125, + "learning_rate": 1.4160401622071039e-05, + "loss": 1.7549, + "step": 302 + }, + { + "epoch": 0.36462093862815886, + "grad_norm": 9.75, + "learning_rate": 1.4125994170004644e-05, + "loss": 1.749, + "step": 303 + }, + { + "epoch": 0.3658243080625752, + "grad_norm": 2.6875, + "learning_rate": 1.4091527748662957e-05, + "loss": 1.8525, + "step": 304 + }, + { + "epoch": 0.36702767749699156, + "grad_norm": 9.3125, + "learning_rate": 1.4057002850644796e-05, + "loss": 1.5852, + "step": 305 + }, + { + "epoch": 0.36823104693140796, + "grad_norm": 1.890625, + "learning_rate": 1.402241996938475e-05, + "loss": 1.7885, + "step": 306 + }, + { + "epoch": 0.3694344163658243, + "grad_norm": 9.4375, + "learning_rate": 1.3987779599146105e-05, + "loss": 1.7472, + "step": 307 + }, + { + "epoch": 0.37063778580024065, + "grad_norm": 1.859375, + "learning_rate": 1.3953082235013788e-05, + "loss": 1.7533, + "step": 308 + }, + { + "epoch": 0.37184115523465705, + "grad_norm": 9.0625, + "learning_rate": 1.3918328372887295e-05, + "loss": 1.6258, + "step": 309 + }, + { + "epoch": 0.3730445246690734, + "grad_norm": 2.21875, + "learning_rate": 1.3883518509473598e-05, + "loss": 1.7508, + "step": 310 + }, + { + "epoch": 0.3742478941034898, + "grad_norm": 8.125, + "learning_rate": 1.3848653142280037e-05, + "loss": 1.9308, + "step": 311 + }, + { + "epoch": 0.37545126353790614, + "grad_norm": 2.140625, + "learning_rate": 1.381373276960724e-05, + "loss": 1.8421, + "step": 312 + }, + { + "epoch": 0.3766546329723225, + "grad_norm": 10.0625, + "learning_rate": 1.377875789054196e-05, + "loss": 1.6391, + "step": 313 + }, + { + "epoch": 0.3778580024067389, + "grad_norm": 1.6640625, + "learning_rate": 1.3743729004949972e-05, + "loss": 1.7977, + "step": 314 + }, + { + "epoch": 0.37906137184115524, + "grad_norm": 11.875, + "learning_rate": 1.3708646613468925e-05, + "loss": 1.6365, + "step": 315 + }, + { + "epoch": 0.3802647412755716, + "grad_norm": 1.9921875, + "learning_rate": 1.3673511217501172e-05, + "loss": 1.7404, + "step": 316 + }, + { + "epoch": 0.381468110709988, + "grad_norm": 8.25, + "learning_rate": 1.3638323319206617e-05, + "loss": 1.4718, + "step": 317 + }, + { + "epoch": 0.38267148014440433, + "grad_norm": 2.59375, + "learning_rate": 1.3603083421495535e-05, + "loss": 1.8494, + "step": 318 + }, + { + "epoch": 0.3838748495788207, + "grad_norm": 9.8125, + "learning_rate": 1.3567792028021382e-05, + "loss": 1.7267, + "step": 319 + }, + { + "epoch": 0.3850782190132371, + "grad_norm": 1.765625, + "learning_rate": 1.3532449643173604e-05, + "loss": 1.7702, + "step": 320 + }, + { + "epoch": 0.3862815884476534, + "grad_norm": 6.25, + "learning_rate": 1.3497056772070417e-05, + "loss": 1.6415, + "step": 321 + }, + { + "epoch": 0.38748495788206977, + "grad_norm": 2.109375, + "learning_rate": 1.3461613920551598e-05, + "loss": 1.7866, + "step": 322 + }, + { + "epoch": 0.38868832731648617, + "grad_norm": 10.375, + "learning_rate": 1.3426121595171242e-05, + "loss": 1.7979, + "step": 323 + }, + { + "epoch": 0.3898916967509025, + "grad_norm": 1.8359375, + "learning_rate": 1.3390580303190541e-05, + "loss": 1.7856, + "step": 324 + }, + { + "epoch": 0.3910950661853189, + "grad_norm": 9.75, + "learning_rate": 1.335499055257052e-05, + "loss": 1.7654, + "step": 325 + }, + { + "epoch": 0.39229843561973526, + "grad_norm": 2.1875, + "learning_rate": 1.3319352851964787e-05, + "loss": 1.8444, + "step": 326 + }, + { + "epoch": 0.3935018050541516, + "grad_norm": 8.3125, + "learning_rate": 1.3283667710712245e-05, + "loss": 1.3942, + "step": 327 + }, + { + "epoch": 0.394705174488568, + "grad_norm": 1.9453125, + "learning_rate": 1.3247935638829838e-05, + "loss": 1.7739, + "step": 328 + }, + { + "epoch": 0.39590854392298436, + "grad_norm": 8.0625, + "learning_rate": 1.3212157147005244e-05, + "loss": 1.7018, + "step": 329 + }, + { + "epoch": 0.3971119133574007, + "grad_norm": 1.4140625, + "learning_rate": 1.3176332746589587e-05, + "loss": 1.7783, + "step": 330 + }, + { + "epoch": 0.3983152827918171, + "grad_norm": 10.5625, + "learning_rate": 1.3140462949590107e-05, + "loss": 2.0592, + "step": 331 + }, + { + "epoch": 0.39951865222623345, + "grad_norm": 1.71875, + "learning_rate": 1.3104548268662873e-05, + "loss": 1.75, + "step": 332 + }, + { + "epoch": 0.4007220216606498, + "grad_norm": 10.375, + "learning_rate": 1.3068589217105441e-05, + "loss": 1.879, + "step": 333 + }, + { + "epoch": 0.4019253910950662, + "grad_norm": 1.9765625, + "learning_rate": 1.3032586308849512e-05, + "loss": 1.875, + "step": 334 + }, + { + "epoch": 0.40312876052948254, + "grad_norm": 10.3125, + "learning_rate": 1.2996540058453589e-05, + "loss": 1.745, + "step": 335 + }, + { + "epoch": 0.4043321299638989, + "grad_norm": 1.8515625, + "learning_rate": 1.2960450981095643e-05, + "loss": 1.75, + "step": 336 + }, + { + "epoch": 0.4055354993983153, + "grad_norm": 10.375, + "learning_rate": 1.2924319592565713e-05, + "loss": 1.891, + "step": 337 + }, + { + "epoch": 0.40673886883273164, + "grad_norm": 1.671875, + "learning_rate": 1.2888146409258575e-05, + "loss": 1.7498, + "step": 338 + }, + { + "epoch": 0.40794223826714804, + "grad_norm": 8.5625, + "learning_rate": 1.2851931948166328e-05, + "loss": 1.7661, + "step": 339 + }, + { + "epoch": 0.4091456077015644, + "grad_norm": 2.125, + "learning_rate": 1.281567672687102e-05, + "loss": 1.7895, + "step": 340 + }, + { + "epoch": 0.41034897713598073, + "grad_norm": 8.4375, + "learning_rate": 1.2779381263537262e-05, + "loss": 1.4515, + "step": 341 + }, + { + "epoch": 0.41155234657039713, + "grad_norm": 2.078125, + "learning_rate": 1.2743046076904795e-05, + "loss": 1.7657, + "step": 342 + }, + { + "epoch": 0.4127557160048135, + "grad_norm": 11.625, + "learning_rate": 1.2706671686281094e-05, + "loss": 2.1425, + "step": 343 + }, + { + "epoch": 0.4139590854392298, + "grad_norm": 2.109375, + "learning_rate": 1.2670258611533947e-05, + "loss": 1.7438, + "step": 344 + }, + { + "epoch": 0.4151624548736462, + "grad_norm": 10.9375, + "learning_rate": 1.2633807373084022e-05, + "loss": 1.7648, + "step": 345 + }, + { + "epoch": 0.41636582430806257, + "grad_norm": 1.828125, + "learning_rate": 1.2597318491897416e-05, + "loss": 1.8152, + "step": 346 + }, + { + "epoch": 0.4175691937424789, + "grad_norm": 8.0, + "learning_rate": 1.2560792489478244e-05, + "loss": 1.6368, + "step": 347 + }, + { + "epoch": 0.4187725631768953, + "grad_norm": 1.5625, + "learning_rate": 1.2524229887861132e-05, + "loss": 1.7436, + "step": 348 + }, + { + "epoch": 0.41997593261131166, + "grad_norm": 8.1875, + "learning_rate": 1.2487631209603819e-05, + "loss": 1.6639, + "step": 349 + }, + { + "epoch": 0.42117930204572807, + "grad_norm": 1.921875, + "learning_rate": 1.245099697777963e-05, + "loss": 1.766, + "step": 350 + }, + { + "epoch": 0.4223826714801444, + "grad_norm": 8.125, + "learning_rate": 1.241432771597004e-05, + "loss": 1.4623, + "step": 351 + }, + { + "epoch": 0.42358604091456076, + "grad_norm": 1.8984375, + "learning_rate": 1.237762394825718e-05, + "loss": 1.8042, + "step": 352 + }, + { + "epoch": 0.42478941034897716, + "grad_norm": 7.8125, + "learning_rate": 1.234088619921633e-05, + "loss": 1.6498, + "step": 353 + }, + { + "epoch": 0.4259927797833935, + "grad_norm": 2.53125, + "learning_rate": 1.230411499390845e-05, + "loss": 1.7521, + "step": 354 + }, + { + "epoch": 0.42719614921780985, + "grad_norm": 7.59375, + "learning_rate": 1.2267310857872654e-05, + "loss": 1.6759, + "step": 355 + }, + { + "epoch": 0.42839951865222625, + "grad_norm": 2.203125, + "learning_rate": 1.2230474317118708e-05, + "loss": 1.8461, + "step": 356 + }, + { + "epoch": 0.4296028880866426, + "grad_norm": 8.8125, + "learning_rate": 1.2193605898119513e-05, + "loss": 1.7665, + "step": 357 + }, + { + "epoch": 0.43080625752105894, + "grad_norm": 1.5234375, + "learning_rate": 1.2156706127803578e-05, + "loss": 1.7748, + "step": 358 + }, + { + "epoch": 0.43200962695547535, + "grad_norm": 8.25, + "learning_rate": 1.2119775533547482e-05, + "loss": 1.6212, + "step": 359 + }, + { + "epoch": 0.4332129963898917, + "grad_norm": 1.53125, + "learning_rate": 1.2082814643168357e-05, + "loss": 1.7921, + "step": 360 + }, + { + "epoch": 0.43441636582430804, + "grad_norm": 12.3125, + "learning_rate": 1.2045823984916317e-05, + "loss": 1.6924, + "step": 361 + }, + { + "epoch": 0.43561973525872444, + "grad_norm": 2.234375, + "learning_rate": 1.2008804087466931e-05, + "loss": 1.8069, + "step": 362 + }, + { + "epoch": 0.4368231046931408, + "grad_norm": 8.625, + "learning_rate": 1.1971755479913665e-05, + "loss": 1.656, + "step": 363 + }, + { + "epoch": 0.4380264741275572, + "grad_norm": 1.953125, + "learning_rate": 1.1934678691760296e-05, + "loss": 1.7714, + "step": 364 + }, + { + "epoch": 0.43922984356197353, + "grad_norm": 9.375, + "learning_rate": 1.1897574252913377e-05, + "loss": 1.5384, + "step": 365 + }, + { + "epoch": 0.4404332129963899, + "grad_norm": 1.84375, + "learning_rate": 1.1860442693674648e-05, + "loss": 1.7601, + "step": 366 + }, + { + "epoch": 0.4416365824308063, + "grad_norm": 10.125, + "learning_rate": 1.182328454473344e-05, + "loss": 1.7158, + "step": 367 + }, + { + "epoch": 0.4428399518652226, + "grad_norm": 2.1875, + "learning_rate": 1.1786100337159132e-05, + "loss": 1.8104, + "step": 368 + }, + { + "epoch": 0.44404332129963897, + "grad_norm": 10.0625, + "learning_rate": 1.1748890602393521e-05, + "loss": 1.9543, + "step": 369 + }, + { + "epoch": 0.4452466907340554, + "grad_norm": 1.625, + "learning_rate": 1.1711655872243247e-05, + "loss": 1.6937, + "step": 370 + }, + { + "epoch": 0.4464500601684717, + "grad_norm": 9.6875, + "learning_rate": 1.1674396678872186e-05, + "loss": 1.8733, + "step": 371 + }, + { + "epoch": 0.44765342960288806, + "grad_norm": 2.296875, + "learning_rate": 1.1637113554793846e-05, + "loss": 1.871, + "step": 372 + }, + { + "epoch": 0.44885679903730447, + "grad_norm": 7.1875, + "learning_rate": 1.1599807032863756e-05, + "loss": 1.685, + "step": 373 + }, + { + "epoch": 0.4500601684717208, + "grad_norm": 1.859375, + "learning_rate": 1.1562477646271856e-05, + "loss": 1.7347, + "step": 374 + }, + { + "epoch": 0.45126353790613716, + "grad_norm": 7.65625, + "learning_rate": 1.152512592853486e-05, + "loss": 1.7329, + "step": 375 + }, + { + "epoch": 0.45246690734055356, + "grad_norm": 1.6484375, + "learning_rate": 1.1487752413488646e-05, + "loss": 1.7809, + "step": 376 + }, + { + "epoch": 0.4536702767749699, + "grad_norm": 8.1875, + "learning_rate": 1.1450357635280628e-05, + "loss": 1.4815, + "step": 377 + }, + { + "epoch": 0.4548736462093863, + "grad_norm": 1.875, + "learning_rate": 1.141294212836211e-05, + "loss": 1.7814, + "step": 378 + }, + { + "epoch": 0.45607701564380265, + "grad_norm": 10.0625, + "learning_rate": 1.1375506427480658e-05, + "loss": 1.5316, + "step": 379 + }, + { + "epoch": 0.457280385078219, + "grad_norm": 2.234375, + "learning_rate": 1.1338051067672444e-05, + "loss": 1.7501, + "step": 380 + }, + { + "epoch": 0.4584837545126354, + "grad_norm": 7.875, + "learning_rate": 1.1300576584254617e-05, + "loss": 1.7606, + "step": 381 + }, + { + "epoch": 0.45968712394705175, + "grad_norm": 1.953125, + "learning_rate": 1.1263083512817644e-05, + "loss": 1.7445, + "step": 382 + }, + { + "epoch": 0.4608904933814681, + "grad_norm": 8.5, + "learning_rate": 1.1225572389217643e-05, + "loss": 1.7416, + "step": 383 + }, + { + "epoch": 0.4620938628158845, + "grad_norm": 1.6171875, + "learning_rate": 1.1188043749568752e-05, + "loss": 1.7761, + "step": 384 + }, + { + "epoch": 0.46329723225030084, + "grad_norm": 8.6875, + "learning_rate": 1.1150498130235435e-05, + "loss": 1.8021, + "step": 385 + }, + { + "epoch": 0.4645006016847172, + "grad_norm": 2.09375, + "learning_rate": 1.1112936067824847e-05, + "loss": 1.8076, + "step": 386 + }, + { + "epoch": 0.4657039711191336, + "grad_norm": 9.0, + "learning_rate": 1.1075358099179136e-05, + "loss": 1.8252, + "step": 387 + }, + { + "epoch": 0.46690734055354993, + "grad_norm": 1.7890625, + "learning_rate": 1.1037764761367795e-05, + "loss": 1.7513, + "step": 388 + }, + { + "epoch": 0.4681107099879663, + "grad_norm": 8.9375, + "learning_rate": 1.1000156591679971e-05, + "loss": 1.6586, + "step": 389 + }, + { + "epoch": 0.4693140794223827, + "grad_norm": 2.078125, + "learning_rate": 1.0962534127616784e-05, + "loss": 1.8155, + "step": 390 + }, + { + "epoch": 0.470517448856799, + "grad_norm": 9.0625, + "learning_rate": 1.0924897906883663e-05, + "loss": 1.5809, + "step": 391 + }, + { + "epoch": 0.4717208182912154, + "grad_norm": 2.015625, + "learning_rate": 1.088724846738264e-05, + "loss": 1.7288, + "step": 392 + }, + { + "epoch": 0.4729241877256318, + "grad_norm": 9.0, + "learning_rate": 1.0849586347204677e-05, + "loss": 1.8313, + "step": 393 + }, + { + "epoch": 0.4741275571600481, + "grad_norm": 2.140625, + "learning_rate": 1.0811912084621968e-05, + "loss": 1.7223, + "step": 394 + }, + { + "epoch": 0.4753309265944645, + "grad_norm": 6.96875, + "learning_rate": 1.0774226218080244e-05, + "loss": 1.5261, + "step": 395 + }, + { + "epoch": 0.47653429602888087, + "grad_norm": 2.078125, + "learning_rate": 1.0736529286191087e-05, + "loss": 1.8465, + "step": 396 + }, + { + "epoch": 0.4777376654632972, + "grad_norm": 9.0, + "learning_rate": 1.0698821827724225e-05, + "loss": 1.6954, + "step": 397 + }, + { + "epoch": 0.4789410348977136, + "grad_norm": 1.7578125, + "learning_rate": 1.0661104381599833e-05, + "loss": 1.7999, + "step": 398 + }, + { + "epoch": 0.48014440433212996, + "grad_norm": 10.75, + "learning_rate": 1.0623377486880831e-05, + "loss": 1.7214, + "step": 399 + }, + { + "epoch": 0.4813477737665463, + "grad_norm": 1.9921875, + "learning_rate": 1.058564168276518e-05, + "loss": 1.7619, + "step": 400 + }, + { + "epoch": 0.4825511432009627, + "grad_norm": 11.375, + "learning_rate": 1.054789750857817e-05, + "loss": 1.9203, + "step": 401 + }, + { + "epoch": 0.48375451263537905, + "grad_norm": 1.78125, + "learning_rate": 1.0510145503764727e-05, + "loss": 1.7714, + "step": 402 + }, + { + "epoch": 0.48495788206979545, + "grad_norm": 7.78125, + "learning_rate": 1.0472386207881684e-05, + "loss": 1.748, + "step": 403 + }, + { + "epoch": 0.4861612515042118, + "grad_norm": 2.265625, + "learning_rate": 1.0434620160590086e-05, + "loss": 1.7704, + "step": 404 + }, + { + "epoch": 0.48736462093862815, + "grad_norm": 9.3125, + "learning_rate": 1.0396847901647469e-05, + "loss": 1.8945, + "step": 405 + }, + { + "epoch": 0.48856799037304455, + "grad_norm": 1.703125, + "learning_rate": 1.0359069970900139e-05, + "loss": 1.6917, + "step": 406 + }, + { + "epoch": 0.4897713598074609, + "grad_norm": 7.875, + "learning_rate": 1.0321286908275476e-05, + "loss": 1.6683, + "step": 407 + }, + { + "epoch": 0.49097472924187724, + "grad_norm": 1.921875, + "learning_rate": 1.0283499253774201e-05, + "loss": 1.7628, + "step": 408 + }, + { + "epoch": 0.49217809867629364, + "grad_norm": 9.8125, + "learning_rate": 1.0245707547462654e-05, + "loss": 1.7848, + "step": 409 + }, + { + "epoch": 0.49338146811071, + "grad_norm": 1.765625, + "learning_rate": 1.0207912329465097e-05, + "loss": 1.7637, + "step": 410 + }, + { + "epoch": 0.49458483754512633, + "grad_norm": 10.6875, + "learning_rate": 1.0170114139955975e-05, + "loss": 1.8309, + "step": 411 + }, + { + "epoch": 0.49578820697954273, + "grad_norm": 1.8515625, + "learning_rate": 1.01323135191522e-05, + "loss": 1.7202, + "step": 412 + }, + { + "epoch": 0.4969915764139591, + "grad_norm": 7.9375, + "learning_rate": 1.0094511007305445e-05, + "loss": 1.4065, + "step": 413 + }, + { + "epoch": 0.4981949458483754, + "grad_norm": 1.953125, + "learning_rate": 1.005670714469439e-05, + "loss": 1.8112, + "step": 414 + }, + { + "epoch": 0.4993983152827918, + "grad_norm": 8.75, + "learning_rate": 1.0018902471617037e-05, + "loss": 1.9798, + "step": 415 + }, + { + "epoch": 0.5006016847172082, + "grad_norm": 2.0, + "learning_rate": 9.981097528382964e-06, + "loss": 1.7527, + "step": 416 + }, + { + "epoch": 0.5018050541516246, + "grad_norm": 9.75, + "learning_rate": 9.943292855305611e-06, + "loss": 1.4855, + "step": 417 + }, + { + "epoch": 0.5030084235860409, + "grad_norm": 2.203125, + "learning_rate": 9.905488992694558e-06, + "loss": 1.7516, + "step": 418 + }, + { + "epoch": 0.5042117930204573, + "grad_norm": 7.21875, + "learning_rate": 9.867686480847801e-06, + "loss": 1.5753, + "step": 419 + }, + { + "epoch": 0.5054151624548736, + "grad_norm": 1.875, + "learning_rate": 9.829885860044028e-06, + "loss": 1.7993, + "step": 420 + }, + { + "epoch": 0.50661853188929, + "grad_norm": 7.6875, + "learning_rate": 9.792087670534908e-06, + "loss": 1.6177, + "step": 421 + }, + { + "epoch": 0.5078219013237064, + "grad_norm": 2.296875, + "learning_rate": 9.754292452537348e-06, + "loss": 1.7433, + "step": 422 + }, + { + "epoch": 0.5090252707581228, + "grad_norm": 9.375, + "learning_rate": 9.716500746225802e-06, + "loss": 1.6121, + "step": 423 + }, + { + "epoch": 0.5102286401925391, + "grad_norm": 3.15625, + "learning_rate": 9.678713091724527e-06, + "loss": 1.8498, + "step": 424 + }, + { + "epoch": 0.5114320096269555, + "grad_norm": 8.4375, + "learning_rate": 9.640930029099863e-06, + "loss": 2.0478, + "step": 425 + }, + { + "epoch": 0.5126353790613718, + "grad_norm": 1.8359375, + "learning_rate": 9.603152098352538e-06, + "loss": 1.7512, + "step": 426 + }, + { + "epoch": 0.5138387484957883, + "grad_norm": 8.75, + "learning_rate": 9.565379839409916e-06, + "loss": 1.9938, + "step": 427 + }, + { + "epoch": 0.5150421179302046, + "grad_norm": 1.8515625, + "learning_rate": 9.527613792118318e-06, + "loss": 1.7771, + "step": 428 + }, + { + "epoch": 0.516245487364621, + "grad_norm": 8.6875, + "learning_rate": 9.489854496235278e-06, + "loss": 1.6869, + "step": 429 + }, + { + "epoch": 0.5174488567990373, + "grad_norm": 2.03125, + "learning_rate": 9.452102491421835e-06, + "loss": 1.7586, + "step": 430 + }, + { + "epoch": 0.5186522262334536, + "grad_norm": 8.625, + "learning_rate": 9.414358317234826e-06, + "loss": 1.4468, + "step": 431 + }, + { + "epoch": 0.51985559566787, + "grad_norm": 2.296875, + "learning_rate": 9.376622513119174e-06, + "loss": 1.8391, + "step": 432 + }, + { + "epoch": 0.5210589651022864, + "grad_norm": 7.71875, + "learning_rate": 9.338895618400168e-06, + "loss": 1.6689, + "step": 433 + }, + { + "epoch": 0.5222623345367028, + "grad_norm": 1.84375, + "learning_rate": 9.301178172275776e-06, + "loss": 1.793, + "step": 434 + }, + { + "epoch": 0.5234657039711191, + "grad_norm": 7.1875, + "learning_rate": 9.263470713808917e-06, + "loss": 1.7688, + "step": 435 + }, + { + "epoch": 0.5246690734055355, + "grad_norm": 1.8125, + "learning_rate": 9.22577378191976e-06, + "loss": 1.7634, + "step": 436 + }, + { + "epoch": 0.5258724428399518, + "grad_norm": 6.75, + "learning_rate": 9.188087915378037e-06, + "loss": 1.5367, + "step": 437 + }, + { + "epoch": 0.5270758122743683, + "grad_norm": 1.859375, + "learning_rate": 9.150413652795325e-06, + "loss": 1.7128, + "step": 438 + }, + { + "epoch": 0.5282791817087846, + "grad_norm": 8.3125, + "learning_rate": 9.112751532617361e-06, + "loss": 1.776, + "step": 439 + }, + { + "epoch": 0.529482551143201, + "grad_norm": 1.7109375, + "learning_rate": 9.07510209311634e-06, + "loss": 1.7408, + "step": 440 + }, + { + "epoch": 0.5306859205776173, + "grad_norm": 8.875, + "learning_rate": 9.037465872383219e-06, + "loss": 1.732, + "step": 441 + }, + { + "epoch": 0.5318892900120337, + "grad_norm": 1.765625, + "learning_rate": 8.999843408320034e-06, + "loss": 1.7082, + "step": 442 + }, + { + "epoch": 0.53309265944645, + "grad_norm": 8.5625, + "learning_rate": 8.962235238632208e-06, + "loss": 1.4712, + "step": 443 + }, + { + "epoch": 0.5342960288808665, + "grad_norm": 2.15625, + "learning_rate": 8.924641900820864e-06, + "loss": 1.7716, + "step": 444 + }, + { + "epoch": 0.5354993983152828, + "grad_norm": 7.71875, + "learning_rate": 8.887063932175156e-06, + "loss": 1.729, + "step": 445 + }, + { + "epoch": 0.5367027677496992, + "grad_norm": 1.96875, + "learning_rate": 8.849501869764569e-06, + "loss": 1.7342, + "step": 446 + }, + { + "epoch": 0.5379061371841155, + "grad_norm": 9.3125, + "learning_rate": 8.811956250431253e-06, + "loss": 1.7085, + "step": 447 + }, + { + "epoch": 0.5391095066185319, + "grad_norm": 2.015625, + "learning_rate": 8.77442761078236e-06, + "loss": 1.77, + "step": 448 + }, + { + "epoch": 0.5403128760529483, + "grad_norm": 8.375, + "learning_rate": 8.73691648718236e-06, + "loss": 1.9969, + "step": 449 + }, + { + "epoch": 0.5415162454873647, + "grad_norm": 2.671875, + "learning_rate": 8.699423415745383e-06, + "loss": 1.7322, + "step": 450 + }, + { + "epoch": 0.542719614921781, + "grad_norm": 10.1875, + "learning_rate": 8.661948932327558e-06, + "loss": 1.7757, + "step": 451 + }, + { + "epoch": 0.5439229843561973, + "grad_norm": 1.6484375, + "learning_rate": 8.624493572519345e-06, + "loss": 1.7314, + "step": 452 + }, + { + "epoch": 0.5451263537906137, + "grad_norm": 10.25, + "learning_rate": 8.587057871637891e-06, + "loss": 1.517, + "step": 453 + }, + { + "epoch": 0.54632972322503, + "grad_norm": 1.9609375, + "learning_rate": 8.549642364719373e-06, + "loss": 1.8205, + "step": 454 + }, + { + "epoch": 0.5475330926594465, + "grad_norm": 19.75, + "learning_rate": 8.512247586511354e-06, + "loss": 2.0706, + "step": 455 + }, + { + "epoch": 0.5487364620938628, + "grad_norm": 2.078125, + "learning_rate": 8.474874071465144e-06, + "loss": 1.7824, + "step": 456 + }, + { + "epoch": 0.5499398315282792, + "grad_norm": 10.0625, + "learning_rate": 8.437522353728147e-06, + "loss": 1.5444, + "step": 457 + }, + { + "epoch": 0.5511432009626955, + "grad_norm": 2.09375, + "learning_rate": 8.400192967136245e-06, + "loss": 1.7308, + "step": 458 + }, + { + "epoch": 0.5523465703971119, + "grad_norm": 7.1875, + "learning_rate": 8.36288644520616e-06, + "loss": 1.3597, + "step": 459 + }, + { + "epoch": 0.5535499398315282, + "grad_norm": 1.875, + "learning_rate": 8.325603321127819e-06, + "loss": 1.7225, + "step": 460 + }, + { + "epoch": 0.5547533092659447, + "grad_norm": 9.25, + "learning_rate": 8.288344127756755e-06, + "loss": 1.15, + "step": 461 + }, + { + "epoch": 0.555956678700361, + "grad_norm": 1.75, + "learning_rate": 8.251109397606482e-06, + "loss": 1.8114, + "step": 462 + }, + { + "epoch": 0.5571600481347774, + "grad_norm": 13.5625, + "learning_rate": 8.213899662840871e-06, + "loss": 1.638, + "step": 463 + }, + { + "epoch": 0.5583634175691937, + "grad_norm": 2.140625, + "learning_rate": 8.176715455266564e-06, + "loss": 1.7039, + "step": 464 + }, + { + "epoch": 0.5595667870036101, + "grad_norm": 9.375, + "learning_rate": 8.139557306325359e-06, + "loss": 1.7151, + "step": 465 + }, + { + "epoch": 0.5607701564380265, + "grad_norm": 2.09375, + "learning_rate": 8.102425747086623e-06, + "loss": 1.7974, + "step": 466 + }, + { + "epoch": 0.5619735258724429, + "grad_norm": 8.6875, + "learning_rate": 8.065321308239706e-06, + "loss": 1.5067, + "step": 467 + }, + { + "epoch": 0.5631768953068592, + "grad_norm": 2.296875, + "learning_rate": 8.028244520086338e-06, + "loss": 1.7269, + "step": 468 + }, + { + "epoch": 0.5643802647412756, + "grad_norm": 10.6875, + "learning_rate": 7.99119591253307e-06, + "loss": 1.3304, + "step": 469 + }, + { + "epoch": 0.5655836341756919, + "grad_norm": 2.21875, + "learning_rate": 7.954176015083687e-06, + "loss": 1.7198, + "step": 470 + }, + { + "epoch": 0.5667870036101083, + "grad_norm": 11.3125, + "learning_rate": 7.91718535683165e-06, + "loss": 1.7933, + "step": 471 + }, + { + "epoch": 0.5679903730445247, + "grad_norm": 2.28125, + "learning_rate": 7.88022446645252e-06, + "loss": 1.7562, + "step": 472 + }, + { + "epoch": 0.5691937424789411, + "grad_norm": 8.6875, + "learning_rate": 7.843293872196425e-06, + "loss": 1.7976, + "step": 473 + }, + { + "epoch": 0.5703971119133574, + "grad_norm": 1.734375, + "learning_rate": 7.806394101880488e-06, + "loss": 1.8179, + "step": 474 + }, + { + "epoch": 0.5716004813477737, + "grad_norm": 8.6875, + "learning_rate": 7.769525682881295e-06, + "loss": 1.7221, + "step": 475 + }, + { + "epoch": 0.5728038507821901, + "grad_norm": 1.4921875, + "learning_rate": 7.73268914212735e-06, + "loss": 1.786, + "step": 476 + }, + { + "epoch": 0.5740072202166066, + "grad_norm": 9.625, + "learning_rate": 7.695885006091552e-06, + "loss": 1.8582, + "step": 477 + }, + { + "epoch": 0.5752105896510229, + "grad_norm": 2.015625, + "learning_rate": 7.659113800783672e-06, + "loss": 1.7797, + "step": 478 + }, + { + "epoch": 0.5764139590854392, + "grad_norm": 7.03125, + "learning_rate": 7.622376051742824e-06, + "loss": 1.1962, + "step": 479 + }, + { + "epoch": 0.5776173285198556, + "grad_norm": 1.9453125, + "learning_rate": 7.585672284029962e-06, + "loss": 1.7732, + "step": 480 + }, + { + "epoch": 0.5788206979542719, + "grad_norm": 8.0, + "learning_rate": 7.549003022220374e-06, + "loss": 1.4953, + "step": 481 + }, + { + "epoch": 0.5800240673886883, + "grad_norm": 3.171875, + "learning_rate": 7.512368790396186e-06, + "loss": 1.75, + "step": 482 + }, + { + "epoch": 0.5812274368231047, + "grad_norm": 8.25, + "learning_rate": 7.475770112138867e-06, + "loss": 1.7405, + "step": 483 + }, + { + "epoch": 0.5824308062575211, + "grad_norm": 1.796875, + "learning_rate": 7.43920751052176e-06, + "loss": 1.7973, + "step": 484 + }, + { + "epoch": 0.5836341756919374, + "grad_norm": 8.875, + "learning_rate": 7.402681508102585e-06, + "loss": 1.9704, + "step": 485 + }, + { + "epoch": 0.5848375451263538, + "grad_norm": 1.7578125, + "learning_rate": 7.366192626915982e-06, + "loss": 1.6921, + "step": 486 + }, + { + "epoch": 0.5860409145607701, + "grad_norm": 8.8125, + "learning_rate": 7.329741388466056e-06, + "loss": 1.6404, + "step": 487 + }, + { + "epoch": 0.5872442839951865, + "grad_norm": 1.703125, + "learning_rate": 7.293328313718912e-06, + "loss": 1.7947, + "step": 488 + }, + { + "epoch": 0.5884476534296029, + "grad_norm": 9.5625, + "learning_rate": 7.256953923095209e-06, + "loss": 1.6754, + "step": 489 + }, + { + "epoch": 0.5896510228640193, + "grad_norm": 1.84375, + "learning_rate": 7.220618736462739e-06, + "loss": 1.7515, + "step": 490 + }, + { + "epoch": 0.5908543922984356, + "grad_norm": 8.625, + "learning_rate": 7.184323273128981e-06, + "loss": 1.612, + "step": 491 + }, + { + "epoch": 0.592057761732852, + "grad_norm": 1.703125, + "learning_rate": 7.1480680518336766e-06, + "loss": 1.7887, + "step": 492 + }, + { + "epoch": 0.5932611311672683, + "grad_norm": 11.375, + "learning_rate": 7.111853590741431e-06, + "loss": 2.0435, + "step": 493 + }, + { + "epoch": 0.5944645006016848, + "grad_norm": 1.8828125, + "learning_rate": 7.075680407434289e-06, + "loss": 1.7261, + "step": 494 + }, + { + "epoch": 0.5956678700361011, + "grad_norm": 7.96875, + "learning_rate": 7.039549018904362e-06, + "loss": 1.8012, + "step": 495 + }, + { + "epoch": 0.5968712394705175, + "grad_norm": 1.7734375, + "learning_rate": 7.0034599415464135e-06, + "loss": 1.77, + "step": 496 + }, + { + "epoch": 0.5980746089049338, + "grad_norm": 10.625, + "learning_rate": 6.967413691150493e-06, + "loss": 2.0642, + "step": 497 + }, + { + "epoch": 0.5992779783393501, + "grad_norm": 2.828125, + "learning_rate": 6.931410782894563e-06, + "loss": 1.806, + "step": 498 + }, + { + "epoch": 0.6004813477737665, + "grad_norm": 11.8125, + "learning_rate": 6.895451731337129e-06, + "loss": 1.8019, + "step": 499 + }, + { + "epoch": 0.601684717208183, + "grad_norm": 2.484375, + "learning_rate": 6.859537050409895e-06, + "loss": 1.7507, + "step": 500 + }, + { + "epoch": 0.6028880866425993, + "grad_norm": 8.5625, + "learning_rate": 6.823667253410417e-06, + "loss": 1.726, + "step": 501 + }, + { + "epoch": 0.6040914560770156, + "grad_norm": 1.9375, + "learning_rate": 6.787842852994757e-06, + "loss": 1.7218, + "step": 502 + }, + { + "epoch": 0.605294825511432, + "grad_norm": 9.6875, + "learning_rate": 6.752064361170165e-06, + "loss": 1.4246, + "step": 503 + }, + { + "epoch": 0.6064981949458483, + "grad_norm": 2.078125, + "learning_rate": 6.716332289287759e-06, + "loss": 1.7802, + "step": 504 + }, + { + "epoch": 0.6077015643802648, + "grad_norm": 9.3125, + "learning_rate": 6.6806471480352175e-06, + "loss": 1.8945, + "step": 505 + }, + { + "epoch": 0.6089049338146811, + "grad_norm": 1.6328125, + "learning_rate": 6.64500944742948e-06, + "loss": 1.7197, + "step": 506 + }, + { + "epoch": 0.6101083032490975, + "grad_norm": 13.0, + "learning_rate": 6.609419696809463e-06, + "loss": 1.7151, + "step": 507 + }, + { + "epoch": 0.6113116726835138, + "grad_norm": 1.9609375, + "learning_rate": 6.5738784048287615e-06, + "loss": 1.7689, + "step": 508 + }, + { + "epoch": 0.6125150421179302, + "grad_norm": 13.1875, + "learning_rate": 6.5383860794484065e-06, + "loss": 1.9819, + "step": 509 + }, + { + "epoch": 0.6137184115523465, + "grad_norm": 2.359375, + "learning_rate": 6.502943227929586e-06, + "loss": 1.7717, + "step": 510 + }, + { + "epoch": 0.614921780986763, + "grad_norm": 6.9375, + "learning_rate": 6.4675503568263955e-06, + "loss": 1.5605, + "step": 511 + }, + { + "epoch": 0.6161251504211793, + "grad_norm": 1.6328125, + "learning_rate": 6.432207971978619e-06, + "loss": 1.7743, + "step": 512 + }, + { + "epoch": 0.6173285198555957, + "grad_norm": 7.875, + "learning_rate": 6.396916578504468e-06, + "loss": 1.5416, + "step": 513 + }, + { + "epoch": 0.618531889290012, + "grad_norm": 1.78125, + "learning_rate": 6.3616766807933875e-06, + "loss": 1.6756, + "step": 514 + }, + { + "epoch": 0.6197352587244284, + "grad_norm": 10.5625, + "learning_rate": 6.326488782498831e-06, + "loss": 1.5457, + "step": 515 + }, + { + "epoch": 0.6209386281588448, + "grad_norm": 1.9609375, + "learning_rate": 6.291353386531074e-06, + "loss": 1.705, + "step": 516 + }, + { + "epoch": 0.6221419975932612, + "grad_norm": 9.125, + "learning_rate": 6.256270995050026e-06, + "loss": 1.7022, + "step": 517 + }, + { + "epoch": 0.6233453670276775, + "grad_norm": 2.03125, + "learning_rate": 6.221242109458043e-06, + "loss": 1.825, + "step": 518 + }, + { + "epoch": 0.6245487364620939, + "grad_norm": 8.0, + "learning_rate": 6.186267230392762e-06, + "loss": 1.5879, + "step": 519 + }, + { + "epoch": 0.6257521058965102, + "grad_norm": 2.21875, + "learning_rate": 6.151346857719964e-06, + "loss": 1.8472, + "step": 520 + }, + { + "epoch": 0.6269554753309265, + "grad_norm": 6.375, + "learning_rate": 6.116481490526407e-06, + "loss": 1.6695, + "step": 521 + }, + { + "epoch": 0.628158844765343, + "grad_norm": 1.7890625, + "learning_rate": 6.081671627112704e-06, + "loss": 1.7865, + "step": 522 + }, + { + "epoch": 0.6293622141997594, + "grad_norm": 8.8125, + "learning_rate": 6.046917764986213e-06, + "loss": 1.6878, + "step": 523 + }, + { + "epoch": 0.6305655836341757, + "grad_norm": 1.78125, + "learning_rate": 6.012220400853899e-06, + "loss": 1.835, + "step": 524 + }, + { + "epoch": 0.631768953068592, + "grad_norm": 8.0625, + "learning_rate": 5.977580030615254e-06, + "loss": 1.5589, + "step": 525 + }, + { + "epoch": 0.6329723225030084, + "grad_norm": 2.09375, + "learning_rate": 5.942997149355208e-06, + "loss": 1.7318, + "step": 526 + }, + { + "epoch": 0.6341756919374247, + "grad_norm": 11.25, + "learning_rate": 5.9084722513370485e-06, + "loss": 1.6077, + "step": 527 + }, + { + "epoch": 0.6353790613718412, + "grad_norm": 1.5703125, + "learning_rate": 5.874005829995358e-06, + "loss": 1.7789, + "step": 528 + }, + { + "epoch": 0.6365824308062575, + "grad_norm": 7.6875, + "learning_rate": 5.839598377928964e-06, + "loss": 1.3707, + "step": 529 + }, + { + "epoch": 0.6377858002406739, + "grad_norm": 2.328125, + "learning_rate": 5.8052503868939005e-06, + "loss": 1.7943, + "step": 530 + }, + { + "epoch": 0.6389891696750902, + "grad_norm": 8.9375, + "learning_rate": 5.7709623477963696e-06, + "loss": 1.7155, + "step": 531 + }, + { + "epoch": 0.6401925391095066, + "grad_norm": 1.4921875, + "learning_rate": 5.736734750685737e-06, + "loss": 1.7964, + "step": 532 + }, + { + "epoch": 0.641395908543923, + "grad_norm": 7.53125, + "learning_rate": 5.702568084747513e-06, + "loss": 1.529, + "step": 533 + }, + { + "epoch": 0.6425992779783394, + "grad_norm": 2.0625, + "learning_rate": 5.6684628382963905e-06, + "loss": 1.7383, + "step": 534 + }, + { + "epoch": 0.6438026474127557, + "grad_norm": 8.875, + "learning_rate": 5.6344194987692304e-06, + "loss": 1.7915, + "step": 535 + }, + { + "epoch": 0.6450060168471721, + "grad_norm": 1.8671875, + "learning_rate": 5.60043855271811e-06, + "loss": 1.7456, + "step": 536 + }, + { + "epoch": 0.6462093862815884, + "grad_norm": 8.25, + "learning_rate": 5.566520485803388e-06, + "loss": 1.8949, + "step": 537 + }, + { + "epoch": 0.6474127557160048, + "grad_norm": 2.03125, + "learning_rate": 5.53266578278673e-06, + "loss": 1.7912, + "step": 538 + }, + { + "epoch": 0.6486161251504212, + "grad_norm": 11.3125, + "learning_rate": 5.498874927524196e-06, + "loss": 1.7826, + "step": 539 + }, + { + "epoch": 0.6498194945848376, + "grad_norm": 1.7890625, + "learning_rate": 5.465148402959339e-06, + "loss": 1.76, + "step": 540 + }, + { + "epoch": 0.6510228640192539, + "grad_norm": 6.96875, + "learning_rate": 5.43148669111627e-06, + "loss": 1.4643, + "step": 541 + }, + { + "epoch": 0.6522262334536703, + "grad_norm": 2.1875, + "learning_rate": 5.397890273092807e-06, + "loss": 1.7544, + "step": 542 + }, + { + "epoch": 0.6534296028880866, + "grad_norm": 8.5625, + "learning_rate": 5.364359629053566e-06, + "loss": 1.5244, + "step": 543 + }, + { + "epoch": 0.6546329723225031, + "grad_norm": 2.140625, + "learning_rate": 5.33089523822311e-06, + "loss": 1.7565, + "step": 544 + }, + { + "epoch": 0.6558363417569194, + "grad_norm": 7.40625, + "learning_rate": 5.29749757887912e-06, + "loss": 1.675, + "step": 545 + }, + { + "epoch": 0.6570397111913358, + "grad_norm": 2.171875, + "learning_rate": 5.264167128345523e-06, + "loss": 1.7383, + "step": 546 + }, + { + "epoch": 0.6582430806257521, + "grad_norm": 8.8125, + "learning_rate": 5.230904362985694e-06, + "loss": 1.6509, + "step": 547 + }, + { + "epoch": 0.6594464500601684, + "grad_norm": 1.8515625, + "learning_rate": 5.197709758195648e-06, + "loss": 1.7565, + "step": 548 + }, + { + "epoch": 0.6606498194945848, + "grad_norm": 8.9375, + "learning_rate": 5.164583788397234e-06, + "loss": 1.5013, + "step": 549 + }, + { + "epoch": 0.6618531889290012, + "grad_norm": 1.84375, + "learning_rate": 5.131526927031356e-06, + "loss": 1.7796, + "step": 550 + }, + { + "epoch": 0.6630565583634176, + "grad_norm": 7.0625, + "learning_rate": 5.098539646551226e-06, + "loss": 1.6768, + "step": 551 + }, + { + "epoch": 0.6642599277978339, + "grad_norm": 2.34375, + "learning_rate": 5.0656224184155764e-06, + "loss": 1.8201, + "step": 552 + }, + { + "epoch": 0.6654632972322503, + "grad_norm": 7.09375, + "learning_rate": 5.032775713081963e-06, + "loss": 1.6698, + "step": 553 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 1.9140625, + "learning_rate": 5.000000000000003e-06, + "loss": 1.7584, + "step": 554 + }, + { + "epoch": 0.6678700361010831, + "grad_norm": 8.125, + "learning_rate": 4.967295747604685e-06, + "loss": 1.7075, + "step": 555 + }, + { + "epoch": 0.6690734055354994, + "grad_norm": 2.171875, + "learning_rate": 4.934663423309685e-06, + "loss": 1.8019, + "step": 556 + }, + { + "epoch": 0.6702767749699158, + "grad_norm": 8.9375, + "learning_rate": 4.902103493500654e-06, + "loss": 1.7358, + "step": 557 + }, + { + "epoch": 0.6714801444043321, + "grad_norm": 2.28125, + "learning_rate": 4.869616423528588e-06, + "loss": 1.8208, + "step": 558 + }, + { + "epoch": 0.6726835138387485, + "grad_norm": 8.9375, + "learning_rate": 4.837202677703149e-06, + "loss": 1.66, + "step": 559 + }, + { + "epoch": 0.6738868832731648, + "grad_norm": 2.21875, + "learning_rate": 4.804862719286044e-06, + "loss": 1.7842, + "step": 560 + }, + { + "epoch": 0.6750902527075813, + "grad_norm": 9.6875, + "learning_rate": 4.772597010484396e-06, + "loss": 1.7979, + "step": 561 + }, + { + "epoch": 0.6762936221419976, + "grad_norm": 1.7890625, + "learning_rate": 4.740406012444153e-06, + "loss": 1.7748, + "step": 562 + }, + { + "epoch": 0.677496991576414, + "grad_norm": 9.75, + "learning_rate": 4.7082901852434734e-06, + "loss": 1.78, + "step": 563 + }, + { + "epoch": 0.6787003610108303, + "grad_norm": 2.125, + "learning_rate": 4.6762499878861764e-06, + "loss": 1.7373, + "step": 564 + }, + { + "epoch": 0.6799037304452467, + "grad_norm": 16.375, + "learning_rate": 4.644285878295161e-06, + "loss": 1.8036, + "step": 565 + }, + { + "epoch": 0.681107099879663, + "grad_norm": 1.875, + "learning_rate": 4.612398313305867e-06, + "loss": 1.8106, + "step": 566 + }, + { + "epoch": 0.6823104693140795, + "grad_norm": 20.75, + "learning_rate": 4.580587748659753e-06, + "loss": 1.8474, + "step": 567 + }, + { + "epoch": 0.6835138387484958, + "grad_norm": 2.015625, + "learning_rate": 4.548854638997778e-06, + "loss": 1.8194, + "step": 568 + }, + { + "epoch": 0.6847172081829122, + "grad_norm": 8.5, + "learning_rate": 4.517199437853909e-06, + "loss": 1.7905, + "step": 569 + }, + { + "epoch": 0.6859205776173285, + "grad_norm": 2.6875, + "learning_rate": 4.485622597648624e-06, + "loss": 1.8076, + "step": 570 + }, + { + "epoch": 0.6871239470517448, + "grad_norm": 7.09375, + "learning_rate": 4.454124569682459e-06, + "loss": 1.5847, + "step": 571 + }, + { + "epoch": 0.6883273164861613, + "grad_norm": 1.65625, + "learning_rate": 4.4227058041295515e-06, + "loss": 1.7217, + "step": 572 + }, + { + "epoch": 0.6895306859205776, + "grad_norm": 7.40625, + "learning_rate": 4.391366750031217e-06, + "loss": 1.6804, + "step": 573 + }, + { + "epoch": 0.690734055354994, + "grad_norm": 2.109375, + "learning_rate": 4.3601078552895245e-06, + "loss": 1.747, + "step": 574 + }, + { + "epoch": 0.6919374247894103, + "grad_norm": 8.875, + "learning_rate": 4.3289295666608865e-06, + "loss": 1.7562, + "step": 575 + }, + { + "epoch": 0.6931407942238267, + "grad_norm": 2.09375, + "learning_rate": 4.297832329749687e-06, + "loss": 1.7482, + "step": 576 + }, + { + "epoch": 0.694344163658243, + "grad_norm": 10.0625, + "learning_rate": 4.2668165890019044e-06, + "loss": 1.6526, + "step": 577 + }, + { + "epoch": 0.6955475330926595, + "grad_norm": 1.609375, + "learning_rate": 4.235882787698763e-06, + "loss": 1.7288, + "step": 578 + }, + { + "epoch": 0.6967509025270758, + "grad_norm": 8.8125, + "learning_rate": 4.205031367950402e-06, + "loss": 1.9839, + "step": 579 + }, + { + "epoch": 0.6979542719614922, + "grad_norm": 1.7109375, + "learning_rate": 4.174262770689552e-06, + "loss": 1.8001, + "step": 580 + }, + { + "epoch": 0.6991576413959085, + "grad_norm": 8.125, + "learning_rate": 4.143577435665229e-06, + "loss": 1.7003, + "step": 581 + }, + { + "epoch": 0.7003610108303249, + "grad_norm": 2.171875, + "learning_rate": 4.112975801436454e-06, + "loss": 1.757, + "step": 582 + }, + { + "epoch": 0.7015643802647413, + "grad_norm": 9.0, + "learning_rate": 4.082458305365982e-06, + "loss": 1.8433, + "step": 583 + }, + { + "epoch": 0.7027677496991577, + "grad_norm": 1.9296875, + "learning_rate": 4.052025383614061e-06, + "loss": 1.7098, + "step": 584 + }, + { + "epoch": 0.703971119133574, + "grad_norm": 7.625, + "learning_rate": 4.021677471132192e-06, + "loss": 1.5424, + "step": 585 + }, + { + "epoch": 0.7051744885679904, + "grad_norm": 1.796875, + "learning_rate": 3.991415001656906e-06, + "loss": 1.7097, + "step": 586 + }, + { + "epoch": 0.7063778580024067, + "grad_norm": 7.9375, + "learning_rate": 3.9612384077035705e-06, + "loss": 1.8125, + "step": 587 + }, + { + "epoch": 0.7075812274368231, + "grad_norm": 1.734375, + "learning_rate": 3.931148120560211e-06, + "loss": 1.7072, + "step": 588 + }, + { + "epoch": 0.7087845968712395, + "grad_norm": 9.6875, + "learning_rate": 3.90114457028134e-06, + "loss": 1.7082, + "step": 589 + }, + { + "epoch": 0.7099879663056559, + "grad_norm": 2.421875, + "learning_rate": 3.871228185681822e-06, + "loss": 1.7654, + "step": 590 + }, + { + "epoch": 0.7111913357400722, + "grad_norm": 7.46875, + "learning_rate": 3.84139939433074e-06, + "loss": 1.66, + "step": 591 + }, + { + "epoch": 0.7123947051744886, + "grad_norm": 1.890625, + "learning_rate": 3.811658622545268e-06, + "loss": 1.6727, + "step": 592 + }, + { + "epoch": 0.7135980746089049, + "grad_norm": 9.125, + "learning_rate": 3.782006295384604e-06, + "loss": 1.4466, + "step": 593 + }, + { + "epoch": 0.7148014440433214, + "grad_norm": 2.046875, + "learning_rate": 3.7524428366438757e-06, + "loss": 1.7239, + "step": 594 + }, + { + "epoch": 0.7160048134777377, + "grad_norm": 7.9375, + "learning_rate": 3.722968668848098e-06, + "loss": 1.8132, + "step": 595 + }, + { + "epoch": 0.717208182912154, + "grad_norm": 1.7890625, + "learning_rate": 3.6935842132461307e-06, + "loss": 1.7111, + "step": 596 + }, + { + "epoch": 0.7184115523465704, + "grad_norm": 10.25, + "learning_rate": 3.664289889804643e-06, + "loss": 1.9077, + "step": 597 + }, + { + "epoch": 0.7196149217809867, + "grad_norm": 2.828125, + "learning_rate": 3.635086117202128e-06, + "loss": 1.8594, + "step": 598 + }, + { + "epoch": 0.7208182912154031, + "grad_norm": 7.4375, + "learning_rate": 3.6059733128229125e-06, + "loss": 1.6636, + "step": 599 + }, + { + "epoch": 0.7220216606498195, + "grad_norm": 2.421875, + "learning_rate": 3.576951892751197e-06, + "loss": 1.7859, + "step": 600 + }, + { + "epoch": 0.7232250300842359, + "grad_norm": 7.28125, + "learning_rate": 3.548022271765107e-06, + "loss": 1.6168, + "step": 601 + }, + { + "epoch": 0.7244283995186522, + "grad_norm": 1.9140625, + "learning_rate": 3.5191848633307545e-06, + "loss": 1.7513, + "step": 602 + }, + { + "epoch": 0.7256317689530686, + "grad_norm": 6.96875, + "learning_rate": 3.490440079596341e-06, + "loss": 1.413, + "step": 603 + }, + { + "epoch": 0.7268351383874849, + "grad_norm": 2.125, + "learning_rate": 3.4617883313862633e-06, + "loss": 1.7607, + "step": 604 + }, + { + "epoch": 0.7280385078219013, + "grad_norm": 8.0625, + "learning_rate": 3.433230028195239e-06, + "loss": 1.4379, + "step": 605 + }, + { + "epoch": 0.7292418772563177, + "grad_norm": 2.21875, + "learning_rate": 3.4047655781824605e-06, + "loss": 1.7282, + "step": 606 + }, + { + "epoch": 0.7304452466907341, + "grad_norm": 8.3125, + "learning_rate": 3.376395388165762e-06, + "loss": 1.6588, + "step": 607 + }, + { + "epoch": 0.7316486161251504, + "grad_norm": 1.8359375, + "learning_rate": 3.3481198636157908e-06, + "loss": 1.7713, + "step": 608 + }, + { + "epoch": 0.7328519855595668, + "grad_norm": 7.75, + "learning_rate": 3.3199394086502257e-06, + "loss": 1.1923, + "step": 609 + }, + { + "epoch": 0.7340553549939831, + "grad_norm": 2.546875, + "learning_rate": 3.2918544260279985e-06, + "loss": 1.773, + "step": 610 + }, + { + "epoch": 0.7352587244283996, + "grad_norm": 8.625, + "learning_rate": 3.2638653171435387e-06, + "loss": 1.6534, + "step": 611 + }, + { + "epoch": 0.7364620938628159, + "grad_norm": 2.71875, + "learning_rate": 3.2359724820210394e-06, + "loss": 1.8335, + "step": 612 + }, + { + "epoch": 0.7376654632972323, + "grad_norm": 7.875, + "learning_rate": 3.2081763193087247e-06, + "loss": 1.5612, + "step": 613 + }, + { + "epoch": 0.7388688327316486, + "grad_norm": 1.84375, + "learning_rate": 3.180477226273172e-06, + "loss": 1.7428, + "step": 614 + }, + { + "epoch": 0.740072202166065, + "grad_norm": 8.25, + "learning_rate": 3.1528755987936188e-06, + "loss": 1.465, + "step": 615 + }, + { + "epoch": 0.7412755716004813, + "grad_norm": 2.21875, + "learning_rate": 3.1253718313563207e-06, + "loss": 1.7722, + "step": 616 + }, + { + "epoch": 0.7424789410348978, + "grad_norm": 8.4375, + "learning_rate": 3.097966317048895e-06, + "loss": 1.6246, + "step": 617 + }, + { + "epoch": 0.7436823104693141, + "grad_norm": 2.09375, + "learning_rate": 3.070659447554719e-06, + "loss": 1.7443, + "step": 618 + }, + { + "epoch": 0.7448856799037304, + "grad_norm": 9.1875, + "learning_rate": 3.0434516131473214e-06, + "loss": 1.6198, + "step": 619 + }, + { + "epoch": 0.7460890493381468, + "grad_norm": 1.8828125, + "learning_rate": 3.016343202684807e-06, + "loss": 1.7547, + "step": 620 + }, + { + "epoch": 0.7472924187725631, + "grad_norm": 7.0625, + "learning_rate": 2.9893346036042968e-06, + "loss": 1.6661, + "step": 621 + }, + { + "epoch": 0.7484957882069796, + "grad_norm": 1.4765625, + "learning_rate": 2.962426201916402e-06, + "loss": 1.7177, + "step": 622 + }, + { + "epoch": 0.7496991576413959, + "grad_norm": 8.0625, + "learning_rate": 2.9356183821996976e-06, + "loss": 1.6522, + "step": 623 + }, + { + "epoch": 0.7509025270758123, + "grad_norm": 1.65625, + "learning_rate": 2.9089115275952217e-06, + "loss": 1.7324, + "step": 624 + }, + { + "epoch": 0.7521058965102286, + "grad_norm": 11.875, + "learning_rate": 2.882306019801008e-06, + "loss": 1.4715, + "step": 625 + }, + { + "epoch": 0.753309265944645, + "grad_norm": 1.6484375, + "learning_rate": 2.855802239066623e-06, + "loss": 1.7237, + "step": 626 + }, + { + "epoch": 0.7545126353790613, + "grad_norm": 9.5, + "learning_rate": 2.8294005641877486e-06, + "loss": 1.8183, + "step": 627 + }, + { + "epoch": 0.7557160048134778, + "grad_norm": 2.234375, + "learning_rate": 2.8031013725007415e-06, + "loss": 1.7527, + "step": 628 + }, + { + "epoch": 0.7569193742478941, + "grad_norm": 8.125, + "learning_rate": 2.776905039877268e-06, + "loss": 1.7001, + "step": 629 + }, + { + "epoch": 0.7581227436823105, + "grad_norm": 1.65625, + "learning_rate": 2.750811940718906e-06, + "loss": 1.6861, + "step": 630 + }, + { + "epoch": 0.7593261131167268, + "grad_norm": 8.75, + "learning_rate": 2.724822447951814e-06, + "loss": 1.7897, + "step": 631 + }, + { + "epoch": 0.7605294825511432, + "grad_norm": 1.828125, + "learning_rate": 2.6989369330213865e-06, + "loss": 1.7686, + "step": 632 + }, + { + "epoch": 0.7617328519855595, + "grad_norm": 7.5, + "learning_rate": 2.6731557658869668e-06, + "loss": 1.7332, + "step": 633 + }, + { + "epoch": 0.762936221419976, + "grad_norm": 2.265625, + "learning_rate": 2.647479315016528e-06, + "loss": 1.7653, + "step": 634 + }, + { + "epoch": 0.7641395908543923, + "grad_norm": 8.125, + "learning_rate": 2.621907947381438e-06, + "loss": 1.5852, + "step": 635 + }, + { + "epoch": 0.7653429602888087, + "grad_norm": 2.59375, + "learning_rate": 2.596442028451194e-06, + "loss": 1.7552, + "step": 636 + }, + { + "epoch": 0.766546329723225, + "grad_norm": 9.1875, + "learning_rate": 2.5710819221882e-06, + "loss": 1.5638, + "step": 637 + }, + { + "epoch": 0.7677496991576414, + "grad_norm": 1.8359375, + "learning_rate": 2.5458279910425865e-06, + "loss": 1.7789, + "step": 638 + }, + { + "epoch": 0.7689530685920578, + "grad_norm": 13.1875, + "learning_rate": 2.5206805959469984e-06, + "loss": 1.8226, + "step": 639 + }, + { + "epoch": 0.7701564380264742, + "grad_norm": 2.046875, + "learning_rate": 2.4956400963114647e-06, + "loss": 1.7214, + "step": 640 + }, + { + "epoch": 0.7713598074608905, + "grad_norm": 8.125, + "learning_rate": 2.4707068500182442e-06, + "loss": 1.5876, + "step": 641 + }, + { + "epoch": 0.7725631768953068, + "grad_norm": 2.078125, + "learning_rate": 2.445881213416713e-06, + "loss": 1.7379, + "step": 642 + }, + { + "epoch": 0.7737665463297232, + "grad_norm": 10.3125, + "learning_rate": 2.4211635413182845e-06, + "loss": 1.7739, + "step": 643 + }, + { + "epoch": 0.7749699157641395, + "grad_norm": 2.515625, + "learning_rate": 2.3965541869913188e-06, + "loss": 1.7963, + "step": 644 + }, + { + "epoch": 0.776173285198556, + "grad_norm": 8.0, + "learning_rate": 2.3720535021560864e-06, + "loss": 1.8607, + "step": 645 + }, + { + "epoch": 0.7773766546329723, + "grad_norm": 2.046875, + "learning_rate": 2.3476618369797457e-06, + "loss": 1.7694, + "step": 646 + }, + { + "epoch": 0.7785800240673887, + "grad_norm": 11.0625, + "learning_rate": 2.32337954007132e-06, + "loss": 1.8392, + "step": 647 + }, + { + "epoch": 0.779783393501805, + "grad_norm": 2.15625, + "learning_rate": 2.299206958476731e-06, + "loss": 1.7709, + "step": 648 + }, + { + "epoch": 0.7809867629362214, + "grad_norm": 7.15625, + "learning_rate": 2.2751444376738373e-06, + "loss": 1.7899, + "step": 649 + }, + { + "epoch": 0.7821901323706378, + "grad_norm": 2.046875, + "learning_rate": 2.251192321567488e-06, + "loss": 1.7695, + "step": 650 + }, + { + "epoch": 0.7833935018050542, + "grad_norm": 9.625, + "learning_rate": 2.2273509524846193e-06, + "loss": 1.5654, + "step": 651 + }, + { + "epoch": 0.7845968712394705, + "grad_norm": 2.03125, + "learning_rate": 2.2036206711693508e-06, + "loss": 1.734, + "step": 652 + }, + { + "epoch": 0.7858002406738869, + "grad_norm": 9.4375, + "learning_rate": 2.180001816778118e-06, + "loss": 1.7517, + "step": 653 + }, + { + "epoch": 0.7870036101083032, + "grad_norm": 1.8359375, + "learning_rate": 2.1564947268748382e-06, + "loss": 1.7432, + "step": 654 + }, + { + "epoch": 0.7882069795427196, + "grad_norm": 9.125, + "learning_rate": 2.133099737426064e-06, + "loss": 1.8342, + "step": 655 + }, + { + "epoch": 0.789410348977136, + "grad_norm": 1.75, + "learning_rate": 2.1098171827961965e-06, + "loss": 1.7589, + "step": 656 + }, + { + "epoch": 0.7906137184115524, + "grad_norm": 7.0, + "learning_rate": 2.086647395742709e-06, + "loss": 1.7399, + "step": 657 + }, + { + "epoch": 0.7918170878459687, + "grad_norm": 2.03125, + "learning_rate": 2.0635907074113737e-06, + "loss": 1.6705, + "step": 658 + }, + { + "epoch": 0.7930204572803851, + "grad_norm": 11.0625, + "learning_rate": 2.040647447331553e-06, + "loss": 1.6823, + "step": 659 + }, + { + "epoch": 0.7942238267148014, + "grad_norm": 1.71875, + "learning_rate": 2.0178179434114674e-06, + "loss": 1.7502, + "step": 660 + }, + { + "epoch": 0.7954271961492179, + "grad_norm": 11.8125, + "learning_rate": 1.9951025219335183e-06, + "loss": 1.7313, + "step": 661 + }, + { + "epoch": 0.7966305655836342, + "grad_norm": 1.546875, + "learning_rate": 1.972501507549637e-06, + "loss": 1.7425, + "step": 662 + }, + { + "epoch": 0.7978339350180506, + "grad_norm": 12.5, + "learning_rate": 1.95001522327662e-06, + "loss": 1.6735, + "step": 663 + }, + { + "epoch": 0.7990373044524669, + "grad_norm": 1.765625, + "learning_rate": 1.927643990491528e-06, + "loss": 1.7718, + "step": 664 + }, + { + "epoch": 0.8002406738868832, + "grad_norm": 11.125, + "learning_rate": 1.905388128927098e-06, + "loss": 1.7697, + "step": 665 + }, + { + "epoch": 0.8014440433212996, + "grad_norm": 1.59375, + "learning_rate": 1.883247956667157e-06, + "loss": 1.7468, + "step": 666 + }, + { + "epoch": 0.802647412755716, + "grad_norm": 6.53125, + "learning_rate": 1.8612237901420838e-06, + "loss": 1.455, + "step": 667 + }, + { + "epoch": 0.8038507821901324, + "grad_norm": 2.03125, + "learning_rate": 1.839315944124298e-06, + "loss": 1.7169, + "step": 668 + }, + { + "epoch": 0.8050541516245487, + "grad_norm": 7.25, + "learning_rate": 1.8175247317237365e-06, + "loss": 1.4329, + "step": 669 + }, + { + "epoch": 0.8062575210589651, + "grad_norm": 1.734375, + "learning_rate": 1.7958504643834062e-06, + "loss": 1.7497, + "step": 670 + }, + { + "epoch": 0.8074608904933814, + "grad_norm": 7.375, + "learning_rate": 1.774293451874909e-06, + "loss": 1.4798, + "step": 671 + }, + { + "epoch": 0.8086642599277978, + "grad_norm": 2.09375, + "learning_rate": 1.7528540022940288e-06, + "loss": 1.7344, + "step": 672 + }, + { + "epoch": 0.8098676293622142, + "grad_norm": 10.125, + "learning_rate": 1.731532422056319e-06, + "loss": 1.9758, + "step": 673 + }, + { + "epoch": 0.8110709987966306, + "grad_norm": 1.9921875, + "learning_rate": 1.71032901589274e-06, + "loss": 1.7087, + "step": 674 + }, + { + "epoch": 0.8122743682310469, + "grad_norm": 11.25, + "learning_rate": 1.6892440868452763e-06, + "loss": 1.5607, + "step": 675 + }, + { + "epoch": 0.8134777376654633, + "grad_norm": 1.46875, + "learning_rate": 1.6682779362626378e-06, + "loss": 1.7157, + "step": 676 + }, + { + "epoch": 0.8146811070998796, + "grad_norm": 11.0625, + "learning_rate": 1.6474308637959235e-06, + "loss": 1.7396, + "step": 677 + }, + { + "epoch": 0.8158844765342961, + "grad_norm": 1.875, + "learning_rate": 1.6267031673943546e-06, + "loss": 1.7405, + "step": 678 + }, + { + "epoch": 0.8170878459687124, + "grad_norm": 8.0, + "learning_rate": 1.6060951433010186e-06, + "loss": 1.5432, + "step": 679 + }, + { + "epoch": 0.8182912154031288, + "grad_norm": 1.7578125, + "learning_rate": 1.5856070860486205e-06, + "loss": 1.8063, + "step": 680 + }, + { + "epoch": 0.8194945848375451, + "grad_norm": 8.6875, + "learning_rate": 1.5652392884552947e-06, + "loss": 1.6438, + "step": 681 + }, + { + "epoch": 0.8206979542719615, + "grad_norm": 1.375, + "learning_rate": 1.544992041620398e-06, + "loss": 1.7676, + "step": 682 + }, + { + "epoch": 0.8219013237063778, + "grad_norm": 12.0625, + "learning_rate": 1.5248656349203628e-06, + "loss": 1.6503, + "step": 683 + }, + { + "epoch": 0.8231046931407943, + "grad_norm": 1.9140625, + "learning_rate": 1.5048603560045549e-06, + "loss": 1.7596, + "step": 684 + }, + { + "epoch": 0.8243080625752106, + "grad_norm": 10.375, + "learning_rate": 1.4849764907911712e-06, + "loss": 1.867, + "step": 685 + }, + { + "epoch": 0.825511432009627, + "grad_norm": 1.984375, + "learning_rate": 1.4652143234631465e-06, + "loss": 1.7849, + "step": 686 + }, + { + "epoch": 0.8267148014440433, + "grad_norm": 7.34375, + "learning_rate": 1.4455741364640863e-06, + "loss": 1.7476, + "step": 687 + }, + { + "epoch": 0.8279181708784596, + "grad_norm": 1.6328125, + "learning_rate": 1.426056210494241e-06, + "loss": 1.7788, + "step": 688 + }, + { + "epoch": 0.8291215403128761, + "grad_norm": 7.5, + "learning_rate": 1.4066608245064872e-06, + "loss": 1.5044, + "step": 689 + }, + { + "epoch": 0.8303249097472925, + "grad_norm": 1.890625, + "learning_rate": 1.3873882557023488e-06, + "loss": 1.7591, + "step": 690 + }, + { + "epoch": 0.8315282791817088, + "grad_norm": 10.5625, + "learning_rate": 1.3682387795280228e-06, + "loss": 1.868, + "step": 691 + }, + { + "epoch": 0.8327316486161251, + "grad_norm": 1.6015625, + "learning_rate": 1.3492126696704544e-06, + "loss": 1.7034, + "step": 692 + }, + { + "epoch": 0.8339350180505415, + "grad_norm": 7.9375, + "learning_rate": 1.3303101980534183e-06, + "loss": 1.4884, + "step": 693 + }, + { + "epoch": 0.8351383874849578, + "grad_norm": 1.578125, + "learning_rate": 1.3115316348336348e-06, + "loss": 1.7203, + "step": 694 + }, + { + "epoch": 0.8363417569193743, + "grad_norm": 9.3125, + "learning_rate": 1.2928772483969054e-06, + "loss": 1.865, + "step": 695 + }, + { + "epoch": 0.8375451263537906, + "grad_norm": 2.390625, + "learning_rate": 1.2743473053542842e-06, + "loss": 1.8178, + "step": 696 + }, + { + "epoch": 0.838748495788207, + "grad_norm": 7.90625, + "learning_rate": 1.2559420705382664e-06, + "loss": 1.6496, + "step": 697 + }, + { + "epoch": 0.8399518652226233, + "grad_norm": 1.859375, + "learning_rate": 1.237661806998991e-06, + "loss": 1.8212, + "step": 698 + }, + { + "epoch": 0.8411552346570397, + "grad_norm": 8.5, + "learning_rate": 1.2195067760004952e-06, + "loss": 1.4821, + "step": 699 + }, + { + "epoch": 0.8423586040914561, + "grad_norm": 1.859375, + "learning_rate": 1.2014772370169747e-06, + "loss": 1.793, + "step": 700 + }, + { + "epoch": 0.8435619735258725, + "grad_norm": 9.5625, + "learning_rate": 1.1835734477290784e-06, + "loss": 1.6435, + "step": 701 + }, + { + "epoch": 0.8447653429602888, + "grad_norm": 2.140625, + "learning_rate": 1.1657956640202217e-06, + "loss": 1.7819, + "step": 702 + }, + { + "epoch": 0.8459687123947052, + "grad_norm": 8.3125, + "learning_rate": 1.148144139972931e-06, + "loss": 1.4722, + "step": 703 + }, + { + "epoch": 0.8471720818291215, + "grad_norm": 1.9921875, + "learning_rate": 1.1306191278652112e-06, + "loss": 1.7809, + "step": 704 + }, + { + "epoch": 0.8483754512635379, + "grad_norm": 9.375, + "learning_rate": 1.1132208781669418e-06, + "loss": 1.7592, + "step": 705 + }, + { + "epoch": 0.8495788206979543, + "grad_norm": 1.6796875, + "learning_rate": 1.0959496395362946e-06, + "loss": 1.8302, + "step": 706 + }, + { + "epoch": 0.8507821901323707, + "grad_norm": 8.3125, + "learning_rate": 1.0788056588161854e-06, + "loss": 1.3777, + "step": 707 + }, + { + "epoch": 0.851985559566787, + "grad_norm": 1.6015625, + "learning_rate": 1.0617891810307458e-06, + "loss": 1.7717, + "step": 708 + }, + { + "epoch": 0.8531889290012034, + "grad_norm": 8.6875, + "learning_rate": 1.0449004493818083e-06, + "loss": 1.6946, + "step": 709 + }, + { + "epoch": 0.8543922984356197, + "grad_norm": 1.7578125, + "learning_rate": 1.0281397052454457e-06, + "loss": 1.7454, + "step": 710 + }, + { + "epoch": 0.855595667870036, + "grad_norm": 10.1875, + "learning_rate": 1.0115071881685134e-06, + "loss": 1.7094, + "step": 711 + }, + { + "epoch": 0.8567990373044525, + "grad_norm": 1.7265625, + "learning_rate": 9.950031358652313e-07, + "loss": 1.7548, + "step": 712 + }, + { + "epoch": 0.8580024067388689, + "grad_norm": 6.5625, + "learning_rate": 9.786277842137837e-07, + "loss": 1.4388, + "step": 713 + }, + { + "epoch": 0.8592057761732852, + "grad_norm": 2.546875, + "learning_rate": 9.623813672529437e-07, + "loss": 1.861, + "step": 714 + }, + { + "epoch": 0.8604091456077015, + "grad_norm": 7.9375, + "learning_rate": 9.462641171787313e-07, + "loss": 1.6033, + "step": 715 + }, + { + "epoch": 0.8616125150421179, + "grad_norm": 1.7890625, + "learning_rate": 9.302762643411e-07, + "loss": 1.7625, + "step": 716 + }, + { + "epoch": 0.8628158844765343, + "grad_norm": 8.0625, + "learning_rate": 9.144180372406342e-07, + "loss": 1.5054, + "step": 717 + }, + { + "epoch": 0.8640192539109507, + "grad_norm": 1.4921875, + "learning_rate": 8.986896625253006e-07, + "loss": 1.7148, + "step": 718 + }, + { + "epoch": 0.865222623345367, + "grad_norm": 6.78125, + "learning_rate": 8.830913649871875e-07, + "loss": 1.7363, + "step": 719 + }, + { + "epoch": 0.8664259927797834, + "grad_norm": 1.84375, + "learning_rate": 8.676233675593038e-07, + "loss": 1.7645, + "step": 720 + }, + { + "epoch": 0.8676293622141997, + "grad_norm": 9.3125, + "learning_rate": 8.522858913123944e-07, + "loss": 1.7745, + "step": 721 + }, + { + "epoch": 0.8688327316486161, + "grad_norm": 1.984375, + "learning_rate": 8.370791554517743e-07, + "loss": 1.8315, + "step": 722 + }, + { + "epoch": 0.8700361010830325, + "grad_norm": 8.125, + "learning_rate": 8.220033773142022e-07, + "loss": 1.7244, + "step": 723 + }, + { + "epoch": 0.8712394705174489, + "grad_norm": 1.9140625, + "learning_rate": 8.070587723647705e-07, + "loss": 1.8274, + "step": 724 + }, + { + "epoch": 0.8724428399518652, + "grad_norm": 7.59375, + "learning_rate": 7.922455541938245e-07, + "loss": 1.7885, + "step": 725 + }, + { + "epoch": 0.8736462093862816, + "grad_norm": 1.921875, + "learning_rate": 7.77563934513913e-07, + "loss": 1.7429, + "step": 726 + }, + { + "epoch": 0.8748495788206979, + "grad_norm": 8.9375, + "learning_rate": 7.630141231567589e-07, + "loss": 1.5492, + "step": 727 + }, + { + "epoch": 0.8760529482551144, + "grad_norm": 1.625, + "learning_rate": 7.485963280702646e-07, + "loss": 1.7734, + "step": 728 + }, + { + "epoch": 0.8772563176895307, + "grad_norm": 11.5, + "learning_rate": 7.343107553155404e-07, + "loss": 1.7769, + "step": 729 + }, + { + "epoch": 0.8784596871239471, + "grad_norm": 2.15625, + "learning_rate": 7.201576090639529e-07, + "loss": 1.7938, + "step": 730 + }, + { + "epoch": 0.8796630565583634, + "grad_norm": 7.46875, + "learning_rate": 7.061370915942101e-07, + "loss": 1.6373, + "step": 731 + }, + { + "epoch": 0.8808664259927798, + "grad_norm": 1.796875, + "learning_rate": 6.922494032894744e-07, + "loss": 1.7239, + "step": 732 + }, + { + "epoch": 0.8820697954271961, + "grad_norm": 9.0625, + "learning_rate": 6.784947426344923e-07, + "loss": 1.7693, + "step": 733 + }, + { + "epoch": 0.8832731648616126, + "grad_norm": 2.015625, + "learning_rate": 6.648733062127643e-07, + "loss": 1.7668, + "step": 734 + }, + { + "epoch": 0.8844765342960289, + "grad_norm": 9.0, + "learning_rate": 6.513852887037319e-07, + "loss": 1.5196, + "step": 735 + }, + { + "epoch": 0.8856799037304453, + "grad_norm": 1.6875, + "learning_rate": 6.380308828799919e-07, + "loss": 1.735, + "step": 736 + }, + { + "epoch": 0.8868832731648616, + "grad_norm": 8.25, + "learning_rate": 6.248102796045475e-07, + "loss": 1.7167, + "step": 737 + }, + { + "epoch": 0.8880866425992779, + "grad_norm": 2.0625, + "learning_rate": 6.117236678280736e-07, + "loss": 1.7921, + "step": 738 + }, + { + "epoch": 0.8892900120336944, + "grad_norm": 13.5625, + "learning_rate": 5.98771234586224e-07, + "loss": 2.1283, + "step": 739 + }, + { + "epoch": 0.8904933814681107, + "grad_norm": 2.078125, + "learning_rate": 5.859531649969563e-07, + "loss": 1.7826, + "step": 740 + }, + { + "epoch": 0.8916967509025271, + "grad_norm": 9.3125, + "learning_rate": 5.732696422578787e-07, + "loss": 1.4974, + "step": 741 + }, + { + "epoch": 0.8929001203369434, + "grad_norm": 1.7421875, + "learning_rate": 5.60720847643641e-07, + "loss": 1.7384, + "step": 742 + }, + { + "epoch": 0.8941034897713598, + "grad_norm": 8.6875, + "learning_rate": 5.483069605033365e-07, + "loss": 1.7097, + "step": 743 + }, + { + "epoch": 0.8953068592057761, + "grad_norm": 1.921875, + "learning_rate": 5.360281582579474e-07, + "loss": 1.7171, + "step": 744 + }, + { + "epoch": 0.8965102286401926, + "grad_norm": 8.3125, + "learning_rate": 5.238846163978018e-07, + "loss": 1.5687, + "step": 745 + }, + { + "epoch": 0.8977135980746089, + "grad_norm": 2.28125, + "learning_rate": 5.11876508480067e-07, + "loss": 1.7238, + "step": 746 + }, + { + "epoch": 0.8989169675090253, + "grad_norm": 8.5, + "learning_rate": 5.000040061262712e-07, + "loss": 1.6323, + "step": 747 + }, + { + "epoch": 0.9001203369434416, + "grad_norm": 1.75, + "learning_rate": 4.882672790198473e-07, + "loss": 1.7586, + "step": 748 + }, + { + "epoch": 0.901323706377858, + "grad_norm": 9.3125, + "learning_rate": 4.766664949037103e-07, + "loss": 1.4364, + "step": 749 + }, + { + "epoch": 0.9025270758122743, + "grad_norm": 1.65625, + "learning_rate": 4.652018195778629e-07, + "loss": 1.753, + "step": 750 + }, + { + "epoch": 0.9037304452466908, + "grad_norm": 9.875, + "learning_rate": 4.538734168970149e-07, + "loss": 1.9226, + "step": 751 + }, + { + "epoch": 0.9049338146811071, + "grad_norm": 1.7578125, + "learning_rate": 4.4268144876825846e-07, + "loss": 1.7851, + "step": 752 + }, + { + "epoch": 0.9061371841155235, + "grad_norm": 9.25, + "learning_rate": 4.3162607514873556e-07, + "loss": 1.483, + "step": 753 + }, + { + "epoch": 0.9073405535499398, + "grad_norm": 1.765625, + "learning_rate": 4.207074540433631e-07, + "loss": 1.7761, + "step": 754 + }, + { + "epoch": 0.9085439229843562, + "grad_norm": 7.8125, + "learning_rate": 4.09925741502577e-07, + "loss": 1.559, + "step": 755 + }, + { + "epoch": 0.9097472924187726, + "grad_norm": 1.65625, + "learning_rate": 3.9928109162008953e-07, + "loss": 1.6966, + "step": 756 + }, + { + "epoch": 0.910950661853189, + "grad_norm": 6.875, + "learning_rate": 3.887736565307032e-07, + "loss": 1.6578, + "step": 757 + }, + { + "epoch": 0.9121540312876053, + "grad_norm": 2.328125, + "learning_rate": 3.7840358640812036e-07, + "loss": 1.7345, + "step": 758 + }, + { + "epoch": 0.9133574007220217, + "grad_norm": 10.125, + "learning_rate": 3.68171029462806e-07, + "loss": 1.6894, + "step": 759 + }, + { + "epoch": 0.914560770156438, + "grad_norm": 1.84375, + "learning_rate": 3.580761319398729e-07, + "loss": 1.8096, + "step": 760 + }, + { + "epoch": 0.9157641395908543, + "grad_norm": 8.625, + "learning_rate": 3.481190381169808e-07, + "loss": 1.755, + "step": 761 + }, + { + "epoch": 0.9169675090252708, + "grad_norm": 2.046875, + "learning_rate": 3.3829989030228163e-07, + "loss": 1.817, + "step": 762 + }, + { + "epoch": 0.9181708784596871, + "grad_norm": 6.90625, + "learning_rate": 3.286188288323844e-07, + "loss": 1.5882, + "step": 763 + }, + { + "epoch": 0.9193742478941035, + "grad_norm": 1.7421875, + "learning_rate": 3.190759920703512e-07, + "loss": 1.7517, + "step": 764 + }, + { + "epoch": 0.9205776173285198, + "grad_norm": 7.3125, + "learning_rate": 3.096715164037123e-07, + "loss": 1.9038, + "step": 765 + }, + { + "epoch": 0.9217809867629362, + "grad_norm": 1.7421875, + "learning_rate": 3.0040553624252844e-07, + "loss": 1.7996, + "step": 766 + }, + { + "epoch": 0.9229843561973526, + "grad_norm": 8.75, + "learning_rate": 2.9127818401745833e-07, + "loss": 1.4735, + "step": 767 + }, + { + "epoch": 0.924187725631769, + "grad_norm": 1.5703125, + "learning_rate": 2.822895901778744e-07, + "loss": 1.7877, + "step": 768 + }, + { + "epoch": 0.9253910950661853, + "grad_norm": 10.6875, + "learning_rate": 2.7343988318999536e-07, + "loss": 1.6788, + "step": 769 + }, + { + "epoch": 0.9265944645006017, + "grad_norm": 1.9296875, + "learning_rate": 2.6472918953504566e-07, + "loss": 1.7739, + "step": 770 + }, + { + "epoch": 0.927797833935018, + "grad_norm": 7.0, + "learning_rate": 2.5615763370745894e-07, + "loss": 1.5671, + "step": 771 + }, + { + "epoch": 0.9290012033694344, + "grad_norm": 1.828125, + "learning_rate": 2.477253382130862e-07, + "loss": 1.7519, + "step": 772 + }, + { + "epoch": 0.9302045728038508, + "grad_norm": 8.4375, + "learning_rate": 2.394324235674517e-07, + "loss": 1.7307, + "step": 773 + }, + { + "epoch": 0.9314079422382672, + "grad_norm": 3.8125, + "learning_rate": 2.3127900829403305e-07, + "loss": 1.776, + "step": 774 + }, + { + "epoch": 0.9326113116726835, + "grad_norm": 13.375, + "learning_rate": 2.2326520892255953e-07, + "loss": 1.7122, + "step": 775 + }, + { + "epoch": 0.9338146811070999, + "grad_norm": 2.109375, + "learning_rate": 2.1539113998735094e-07, + "loss": 1.7175, + "step": 776 + }, + { + "epoch": 0.9350180505415162, + "grad_norm": 8.875, + "learning_rate": 2.0765691402568455e-07, + "loss": 1.575, + "step": 777 + }, + { + "epoch": 0.9362214199759326, + "grad_norm": 2.328125, + "learning_rate": 2.000626415761786e-07, + "loss": 1.7733, + "step": 778 + }, + { + "epoch": 0.937424789410349, + "grad_norm": 11.3125, + "learning_rate": 1.9260843117721695e-07, + "loss": 1.5334, + "step": 779 + }, + { + "epoch": 0.9386281588447654, + "grad_norm": 1.9453125, + "learning_rate": 1.8529438936540022e-07, + "loss": 1.7955, + "step": 780 + }, + { + "epoch": 0.9398315282791817, + "grad_norm": 9.625, + "learning_rate": 1.7812062067401713e-07, + "loss": 1.0221, + "step": 781 + }, + { + "epoch": 0.941034897713598, + "grad_norm": 1.9609375, + "learning_rate": 1.710872276315556e-07, + "loss": 1.7049, + "step": 782 + }, + { + "epoch": 0.9422382671480144, + "grad_norm": 7.90625, + "learning_rate": 1.6419431076023505e-07, + "loss": 1.8431, + "step": 783 + }, + { + "epoch": 0.9434416365824309, + "grad_norm": 2.203125, + "learning_rate": 1.5744196857456874e-07, + "loss": 1.7725, + "step": 784 + }, + { + "epoch": 0.9446450060168472, + "grad_norm": 6.65625, + "learning_rate": 1.5083029757995914e-07, + "loss": 1.5299, + "step": 785 + }, + { + "epoch": 0.9458483754512635, + "grad_norm": 2.328125, + "learning_rate": 1.4435939227131712e-07, + "loss": 1.7676, + "step": 786 + }, + { + "epoch": 0.9470517448856799, + "grad_norm": 10.0625, + "learning_rate": 1.3802934513170828e-07, + "loss": 1.6324, + "step": 787 + }, + { + "epoch": 0.9482551143200962, + "grad_norm": 1.640625, + "learning_rate": 1.3184024663103755e-07, + "loss": 1.7018, + "step": 788 + }, + { + "epoch": 0.9494584837545126, + "grad_norm": 7.90625, + "learning_rate": 1.25792185224749e-07, + "loss": 1.3774, + "step": 789 + }, + { + "epoch": 0.950661853188929, + "grad_norm": 1.9453125, + "learning_rate": 1.198852473525669e-07, + "loss": 1.7961, + "step": 790 + }, + { + "epoch": 0.9518652226233454, + "grad_norm": 7.40625, + "learning_rate": 1.1411951743726002e-07, + "loss": 1.8785, + "step": 791 + }, + { + "epoch": 0.9530685920577617, + "grad_norm": 1.921875, + "learning_rate": 1.0849507788343038e-07, + "loss": 1.7485, + "step": 792 + }, + { + "epoch": 0.9542719614921781, + "grad_norm": 9.0625, + "learning_rate": 1.030120090763409e-07, + "loss": 1.7257, + "step": 793 + }, + { + "epoch": 0.9554753309265944, + "grad_norm": 1.6640625, + "learning_rate": 9.767038938076511e-08, + "loss": 1.7644, + "step": 794 + }, + { + "epoch": 0.9566787003610109, + "grad_norm": 9.375, + "learning_rate": 9.247029513986482e-08, + "loss": 1.9214, + "step": 795 + }, + { + "epoch": 0.9578820697954272, + "grad_norm": 1.703125, + "learning_rate": 8.741180067409982e-08, + "loss": 1.7783, + "step": 796 + }, + { + "epoch": 0.9590854392298436, + "grad_norm": 8.625, + "learning_rate": 8.249497828016872e-08, + "loss": 1.6367, + "step": 797 + }, + { + "epoch": 0.9602888086642599, + "grad_norm": 1.9765625, + "learning_rate": 7.771989822997206e-08, + "loss": 1.789, + "step": 798 + }, + { + "epoch": 0.9614921780986763, + "grad_norm": 6.75, + "learning_rate": 7.30866287696097e-08, + "loss": 1.5397, + "step": 799 + }, + { + "epoch": 0.9626955475330926, + "grad_norm": 1.9609375, + "learning_rate": 6.859523611840612e-08, + "loss": 1.727, + "step": 800 + }, + { + "epoch": 0.9638989169675091, + "grad_norm": 7.3125, + "learning_rate": 6.424578446796004e-08, + "loss": 1.7173, + "step": 801 + }, + { + "epoch": 0.9651022864019254, + "grad_norm": 1.7109375, + "learning_rate": 6.003833598123287e-08, + "loss": 1.7525, + "step": 802 + }, + { + "epoch": 0.9663056558363418, + "grad_norm": 7.90625, + "learning_rate": 5.597295079165621e-08, + "loss": 1.4858, + "step": 803 + }, + { + "epoch": 0.9675090252707581, + "grad_norm": 1.890625, + "learning_rate": 5.204968700227242e-08, + "loss": 1.7038, + "step": 804 + }, + { + "epoch": 0.9687123947051745, + "grad_norm": 9.4375, + "learning_rate": 4.826860068490868e-08, + "loss": 1.6603, + "step": 805 + }, + { + "epoch": 0.9699157641395909, + "grad_norm": 1.8671875, + "learning_rate": 4.4629745879367634e-08, + "loss": 1.7481, + "step": 806 + }, + { + "epoch": 0.9711191335740073, + "grad_norm": 8.75, + "learning_rate": 4.113317459266242e-08, + "loss": 1.605, + "step": 807 + }, + { + "epoch": 0.9723225030084236, + "grad_norm": 1.7578125, + "learning_rate": 3.777893679827061e-08, + "loss": 1.7071, + "step": 808 + }, + { + "epoch": 0.97352587244284, + "grad_norm": 8.4375, + "learning_rate": 3.456708043541812e-08, + "loss": 1.9034, + "step": 809 + }, + { + "epoch": 0.9747292418772563, + "grad_norm": 1.59375, + "learning_rate": 3.1497651408399774e-08, + "loss": 1.7136, + "step": 810 + }, + { + "epoch": 0.9759326113116726, + "grad_norm": 11.3125, + "learning_rate": 2.8570693585914246e-08, + "loss": 1.8149, + "step": 811 + }, + { + "epoch": 0.9771359807460891, + "grad_norm": 2.046875, + "learning_rate": 2.578624880044567e-08, + "loss": 1.8252, + "step": 812 + }, + { + "epoch": 0.9783393501805054, + "grad_norm": 7.875, + "learning_rate": 2.314435684766081e-08, + "loss": 1.8397, + "step": 813 + }, + { + "epoch": 0.9795427196149218, + "grad_norm": 1.890625, + "learning_rate": 2.0645055485842837e-08, + "loss": 1.7662, + "step": 814 + }, + { + "epoch": 0.9807460890493381, + "grad_norm": 7.03125, + "learning_rate": 1.8288380435349527e-08, + "loss": 1.67, + "step": 815 + }, + { + "epoch": 0.9819494584837545, + "grad_norm": 2.140625, + "learning_rate": 1.6074365378105915e-08, + "loss": 1.7614, + "step": 816 + }, + { + "epoch": 0.9831528279181708, + "grad_norm": 8.4375, + "learning_rate": 1.400304195711688e-08, + "loss": 1.5358, + "step": 817 + }, + { + "epoch": 0.9843561973525873, + "grad_norm": 2.046875, + "learning_rate": 1.2074439776021962e-08, + "loss": 1.7641, + "step": 818 + }, + { + "epoch": 0.9855595667870036, + "grad_norm": 6.96875, + "learning_rate": 1.0288586398670141e-08, + "loss": 1.5657, + "step": 819 + }, + { + "epoch": 0.98676293622142, + "grad_norm": 1.828125, + "learning_rate": 8.64550734872016e-09, + "loss": 1.7538, + "step": 820 + }, + { + "epoch": 0.9879663056558363, + "grad_norm": 7.875, + "learning_rate": 7.145226109286363e-09, + "loss": 1.5632, + "step": 821 + }, + { + "epoch": 0.9891696750902527, + "grad_norm": 2.203125, + "learning_rate": 5.787764122592299e-09, + "loss": 1.8164, + "step": 822 + }, + { + "epoch": 0.9903730445246691, + "grad_norm": 8.9375, + "learning_rate": 4.573140789672082e-09, + "loss": 1.886, + "step": 823 + }, + { + "epoch": 0.9915764139590855, + "grad_norm": 1.828125, + "learning_rate": 3.5013734700883874e-09, + "loss": 1.7194, + "step": 824 + }, + { + "epoch": 0.9927797833935018, + "grad_norm": 9.125, + "learning_rate": 2.5724774816870966e-09, + "loss": 1.3841, + "step": 825 + }, + { + "epoch": 0.9939831528279182, + "grad_norm": 2.03125, + "learning_rate": 1.7864661003774708e-09, + "loss": 1.7312, + "step": 826 + }, + { + "epoch": 0.9951865222623345, + "grad_norm": 7.09375, + "learning_rate": 1.1433505599434126e-09, + "loss": 1.4796, + "step": 827 + }, + { + "epoch": 0.9963898916967509, + "grad_norm": 1.4921875, + "learning_rate": 6.431400518780439e-10, + "loss": 1.6919, + "step": 828 + }, + { + "epoch": 0.9975932611311673, + "grad_norm": 7.53125, + "learning_rate": 2.8584172526047173e-10, + "loss": 1.6439, + "step": 829 + }, + { + "epoch": 0.9987966305655837, + "grad_norm": 1.90625, + "learning_rate": 7.146068664698469e-11, + "loss": 1.7275, + "step": 830 + }, + { + "epoch": 1.0, + "grad_norm": 8.5, + "learning_rate": 0.0, + "loss": 1.8022, + "step": 831 + } + ], + "logging_steps": 1, + "max_steps": 831, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 8.843230151337574e+16, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}