{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 831, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0012033694344163659, "grad_norm": 12.4375, "learning_rate": 1.9999928539313353e-05, "loss": 1.9787, "step": 1 }, { "epoch": 0.0024067388688327317, "grad_norm": 5.34375, "learning_rate": 1.9999714158274743e-05, "loss": 2.6581, "step": 2 }, { "epoch": 0.0036101083032490976, "grad_norm": 13.1875, "learning_rate": 1.9999356859948126e-05, "loss": 1.7041, "step": 3 }, { "epoch": 0.0048134777376654635, "grad_norm": 14.375, "learning_rate": 1.9998856649440058e-05, "loss": 2.4025, "step": 4 }, { "epoch": 0.006016847172081829, "grad_norm": 9.5625, "learning_rate": 1.9998213533899625e-05, "loss": 2.1206, "step": 5 }, { "epoch": 0.007220216606498195, "grad_norm": 7.78125, "learning_rate": 1.9997427522518315e-05, "loss": 2.262, "step": 6 }, { "epoch": 0.00842358604091456, "grad_norm": 14.6875, "learning_rate": 1.9996498626529914e-05, "loss": 2.2089, "step": 7 }, { "epoch": 0.009626955475330927, "grad_norm": 3.921875, "learning_rate": 1.999542685921033e-05, "loss": 2.1776, "step": 8 }, { "epoch": 0.010830324909747292, "grad_norm": 9.875, "learning_rate": 1.9994212235877407e-05, "loss": 1.9749, "step": 9 }, { "epoch": 0.012033694344163659, "grad_norm": 3.1875, "learning_rate": 1.9992854773890714e-05, "loss": 2.1631, "step": 10 }, { "epoch": 0.013237063778580024, "grad_norm": 7.6875, "learning_rate": 1.9991354492651283e-05, "loss": 1.7845, "step": 11 }, { "epoch": 0.01444043321299639, "grad_norm": 2.3125, "learning_rate": 1.9989711413601332e-05, "loss": 2.1931, "step": 12 }, { "epoch": 0.015643802647412757, "grad_norm": 13.875, "learning_rate": 1.998792556022398e-05, "loss": 2.2409, "step": 13 }, { "epoch": 0.01684717208182912, "grad_norm": 2.203125, "learning_rate": 1.9985996958042887e-05, "loss": 2.1679, "step": 14 }, { "epoch": 0.018050541516245487, "grad_norm": 15.4375, "learning_rate": 1.9983925634621894e-05, "loss": 2.2523, "step": 15 }, { "epoch": 0.019253910950661854, "grad_norm": 1.71875, "learning_rate": 1.9981711619564654e-05, "loss": 2.1399, "step": 16 }, { "epoch": 0.02045728038507822, "grad_norm": 10.8125, "learning_rate": 1.997935494451416e-05, "loss": 2.0425, "step": 17 }, { "epoch": 0.021660649819494584, "grad_norm": 2.109375, "learning_rate": 1.997685564315234e-05, "loss": 2.1166, "step": 18 }, { "epoch": 0.02286401925391095, "grad_norm": 12.625, "learning_rate": 1.9974213751199556e-05, "loss": 1.9304, "step": 19 }, { "epoch": 0.024067388688327317, "grad_norm": 2.0625, "learning_rate": 1.9971429306414087e-05, "loss": 2.1412, "step": 20 }, { "epoch": 0.02527075812274368, "grad_norm": 8.1875, "learning_rate": 1.99685023485916e-05, "loss": 1.7462, "step": 21 }, { "epoch": 0.026474127557160047, "grad_norm": 1.640625, "learning_rate": 1.9965432919564583e-05, "loss": 2.1127, "step": 22 }, { "epoch": 0.027677496991576414, "grad_norm": 9.0625, "learning_rate": 1.9962221063201734e-05, "loss": 1.8733, "step": 23 }, { "epoch": 0.02888086642599278, "grad_norm": 1.84375, "learning_rate": 1.995886682540734e-05, "loss": 2.0519, "step": 24 }, { "epoch": 0.030084235860409144, "grad_norm": 7.53125, "learning_rate": 1.9955370254120635e-05, "loss": 1.591, "step": 25 }, { "epoch": 0.031287605294825514, "grad_norm": 1.84375, "learning_rate": 1.9951731399315095e-05, "loss": 2.1164, "step": 26 }, { "epoch": 0.032490974729241874, "grad_norm": 8.75, "learning_rate": 1.994795031299773e-05, "loss": 1.4183, "step": 27 }, { "epoch": 0.03369434416365824, "grad_norm": 2.03125, "learning_rate": 1.9944027049208347e-05, "loss": 2.0505, "step": 28 }, { "epoch": 0.03489771359807461, "grad_norm": 9.4375, "learning_rate": 1.993996166401877e-05, "loss": 1.7333, "step": 29 }, { "epoch": 0.036101083032490974, "grad_norm": 1.84375, "learning_rate": 1.993575421553204e-05, "loss": 2.0512, "step": 30 }, { "epoch": 0.03730445246690734, "grad_norm": 11.3125, "learning_rate": 1.9931404763881598e-05, "loss": 1.9456, "step": 31 }, { "epoch": 0.03850782190132371, "grad_norm": 2.015625, "learning_rate": 1.9926913371230393e-05, "loss": 2.0971, "step": 32 }, { "epoch": 0.039711191335740074, "grad_norm": 11.9375, "learning_rate": 1.992228010177003e-05, "loss": 2.1031, "step": 33 }, { "epoch": 0.04091456077015644, "grad_norm": 2.015625, "learning_rate": 1.9917505021719833e-05, "loss": 2.0079, "step": 34 }, { "epoch": 0.0421179302045728, "grad_norm": 10.0625, "learning_rate": 1.99125881993259e-05, "loss": 2.0108, "step": 35 }, { "epoch": 0.04332129963898917, "grad_norm": 2.03125, "learning_rate": 1.990752970486014e-05, "loss": 2.0955, "step": 36 }, { "epoch": 0.044524669073405534, "grad_norm": 9.1875, "learning_rate": 1.990232961061924e-05, "loss": 1.6093, "step": 37 }, { "epoch": 0.0457280385078219, "grad_norm": 1.9140625, "learning_rate": 1.989698799092366e-05, "loss": 2.0615, "step": 38 }, { "epoch": 0.04693140794223827, "grad_norm": 9.5625, "learning_rate": 1.9891504922116572e-05, "loss": 1.5992, "step": 39 }, { "epoch": 0.048134777376654635, "grad_norm": 2.5, "learning_rate": 1.988588048256274e-05, "loss": 2.0622, "step": 40 }, { "epoch": 0.049338146811071, "grad_norm": 8.5625, "learning_rate": 1.9880114752647434e-05, "loss": 2.0544, "step": 41 }, { "epoch": 0.05054151624548736, "grad_norm": 2.03125, "learning_rate": 1.9874207814775252e-05, "loss": 2.0189, "step": 42 }, { "epoch": 0.05174488567990373, "grad_norm": 8.9375, "learning_rate": 1.9868159753368964e-05, "loss": 1.8381, "step": 43 }, { "epoch": 0.052948255114320095, "grad_norm": 1.84375, "learning_rate": 1.9861970654868292e-05, "loss": 2.0512, "step": 44 }, { "epoch": 0.05415162454873646, "grad_norm": 8.0, "learning_rate": 1.9855640607728684e-05, "loss": 2.0973, "step": 45 }, { "epoch": 0.05535499398315283, "grad_norm": 2.140625, "learning_rate": 1.9849169702420044e-05, "loss": 2.0381, "step": 46 }, { "epoch": 0.056558363417569195, "grad_norm": 8.6875, "learning_rate": 1.9842558031425434e-05, "loss": 1.9673, "step": 47 }, { "epoch": 0.05776173285198556, "grad_norm": 2.015625, "learning_rate": 1.983580568923977e-05, "loss": 2.0981, "step": 48 }, { "epoch": 0.05896510228640193, "grad_norm": 10.1875, "learning_rate": 1.982891277236845e-05, "loss": 1.6095, "step": 49 }, { "epoch": 0.06016847172081829, "grad_norm": 1.6640625, "learning_rate": 1.9821879379325985e-05, "loss": 2.0325, "step": 50 }, { "epoch": 0.061371841155234655, "grad_norm": 7.96875, "learning_rate": 1.9814705610634602e-05, "loss": 1.9149, "step": 51 }, { "epoch": 0.06257521058965103, "grad_norm": 2.015625, "learning_rate": 1.9807391568822785e-05, "loss": 1.9993, "step": 52 }, { "epoch": 0.06377858002406739, "grad_norm": 8.75, "learning_rate": 1.9799937358423826e-05, "loss": 2.1704, "step": 53 }, { "epoch": 0.06498194945848375, "grad_norm": 2.296875, "learning_rate": 1.9792343085974316e-05, "loss": 2.0963, "step": 54 }, { "epoch": 0.06618531889290012, "grad_norm": 9.125, "learning_rate": 1.9784608860012652e-05, "loss": 2.0504, "step": 55 }, { "epoch": 0.06738868832731648, "grad_norm": 1.953125, "learning_rate": 1.9776734791077442e-05, "loss": 1.9826, "step": 56 }, { "epoch": 0.06859205776173286, "grad_norm": 10.875, "learning_rate": 1.976872099170597e-05, "loss": 1.8005, "step": 57 }, { "epoch": 0.06979542719614922, "grad_norm": 1.859375, "learning_rate": 1.976056757643255e-05, "loss": 2.0221, "step": 58 }, { "epoch": 0.07099879663056559, "grad_norm": 10.75, "learning_rate": 1.9752274661786916e-05, "loss": 1.8314, "step": 59 }, { "epoch": 0.07220216606498195, "grad_norm": 1.6796875, "learning_rate": 1.9743842366292544e-05, "loss": 1.9793, "step": 60 }, { "epoch": 0.07340553549939831, "grad_norm": 9.0, "learning_rate": 1.9735270810464958e-05, "loss": 1.8859, "step": 61 }, { "epoch": 0.07460890493381468, "grad_norm": 1.4921875, "learning_rate": 1.9726560116810006e-05, "loss": 2.0158, "step": 62 }, { "epoch": 0.07581227436823104, "grad_norm": 14.8125, "learning_rate": 1.971771040982213e-05, "loss": 1.8607, "step": 63 }, { "epoch": 0.07701564380264742, "grad_norm": 1.5078125, "learning_rate": 1.9708721815982543e-05, "loss": 2.0328, "step": 64 }, { "epoch": 0.07821901323706378, "grad_norm": 7.53125, "learning_rate": 1.9699594463757475e-05, "loss": 1.7419, "step": 65 }, { "epoch": 0.07942238267148015, "grad_norm": 1.5390625, "learning_rate": 1.9690328483596287e-05, "loss": 1.9668, "step": 66 }, { "epoch": 0.08062575210589651, "grad_norm": 8.125, "learning_rate": 1.968092400792965e-05, "loss": 2.1749, "step": 67 }, { "epoch": 0.08182912154031288, "grad_norm": 1.984375, "learning_rate": 1.9671381171167616e-05, "loss": 1.9551, "step": 68 }, { "epoch": 0.08303249097472924, "grad_norm": 7.8125, "learning_rate": 1.9661700109697718e-05, "loss": 1.8759, "step": 69 }, { "epoch": 0.0842358604091456, "grad_norm": 2.015625, "learning_rate": 1.9651880961883025e-05, "loss": 2.0612, "step": 70 }, { "epoch": 0.08543922984356198, "grad_norm": 8.8125, "learning_rate": 1.964192386806013e-05, "loss": 1.4317, "step": 71 }, { "epoch": 0.08664259927797834, "grad_norm": 2.03125, "learning_rate": 1.9631828970537196e-05, "loss": 1.9923, "step": 72 }, { "epoch": 0.08784596871239471, "grad_norm": 11.3125, "learning_rate": 1.9621596413591885e-05, "loss": 1.8632, "step": 73 }, { "epoch": 0.08904933814681107, "grad_norm": 1.6171875, "learning_rate": 1.96112263434693e-05, "loss": 2.0025, "step": 74 }, { "epoch": 0.09025270758122744, "grad_norm": 9.875, "learning_rate": 1.960071890837991e-05, "loss": 1.8546, "step": 75 }, { "epoch": 0.0914560770156438, "grad_norm": 1.6171875, "learning_rate": 1.9590074258497423e-05, "loss": 1.9668, "step": 76 }, { "epoch": 0.09265944645006016, "grad_norm": 8.5, "learning_rate": 1.957929254595664e-05, "loss": 2.0109, "step": 77 }, { "epoch": 0.09386281588447654, "grad_norm": 2.140625, "learning_rate": 1.9568373924851267e-05, "loss": 2.0018, "step": 78 }, { "epoch": 0.0950661853188929, "grad_norm": 11.0, "learning_rate": 1.9557318551231745e-05, "loss": 1.7631, "step": 79 }, { "epoch": 0.09626955475330927, "grad_norm": 1.625, "learning_rate": 1.9546126583102983e-05, "loss": 1.9526, "step": 80 }, { "epoch": 0.09747292418772563, "grad_norm": 9.5625, "learning_rate": 1.953479818042214e-05, "loss": 2.1091, "step": 81 }, { "epoch": 0.098676293622142, "grad_norm": 1.6640625, "learning_rate": 1.952333350509629e-05, "loss": 2.0067, "step": 82 }, { "epoch": 0.09987966305655836, "grad_norm": 7.4375, "learning_rate": 1.9511732720980156e-05, "loss": 1.7628, "step": 83 }, { "epoch": 0.10108303249097472, "grad_norm": 2.234375, "learning_rate": 1.949999599387373e-05, "loss": 1.943, "step": 84 }, { "epoch": 0.1022864019253911, "grad_norm": 7.71875, "learning_rate": 1.9488123491519935e-05, "loss": 1.8745, "step": 85 }, { "epoch": 0.10348977135980746, "grad_norm": 2.015625, "learning_rate": 1.94761153836022e-05, "loss": 1.9994, "step": 86 }, { "epoch": 0.10469314079422383, "grad_norm": 7.625, "learning_rate": 1.9463971841742057e-05, "loss": 1.3599, "step": 87 }, { "epoch": 0.10589651022864019, "grad_norm": 2.15625, "learning_rate": 1.9451693039496665e-05, "loss": 2.0137, "step": 88 }, { "epoch": 0.10709987966305656, "grad_norm": 8.375, "learning_rate": 1.9439279152356363e-05, "loss": 2.134, "step": 89 }, { "epoch": 0.10830324909747292, "grad_norm": 2.234375, "learning_rate": 1.9426730357742123e-05, "loss": 1.9876, "step": 90 }, { "epoch": 0.1095066185318893, "grad_norm": 9.4375, "learning_rate": 1.9414046835003043e-05, "loss": 1.682, "step": 91 }, { "epoch": 0.11070998796630566, "grad_norm": 2.46875, "learning_rate": 1.9401228765413774e-05, "loss": 1.9372, "step": 92 }, { "epoch": 0.11191335740072202, "grad_norm": 8.9375, "learning_rate": 1.938827633217193e-05, "loss": 2.0301, "step": 93 }, { "epoch": 0.11311672683513839, "grad_norm": 2.125, "learning_rate": 1.9375189720395454e-05, "loss": 1.9371, "step": 94 }, { "epoch": 0.11432009626955475, "grad_norm": 8.0625, "learning_rate": 1.936196911712001e-05, "loss": 1.5517, "step": 95 }, { "epoch": 0.11552346570397112, "grad_norm": 1.96875, "learning_rate": 1.934861471129627e-05, "loss": 1.9661, "step": 96 }, { "epoch": 0.11672683513838748, "grad_norm": 12.1875, "learning_rate": 1.9335126693787237e-05, "loss": 1.45, "step": 97 }, { "epoch": 0.11793020457280386, "grad_norm": 1.71875, "learning_rate": 1.9321505257365508e-05, "loss": 1.9117, "step": 98 }, { "epoch": 0.11913357400722022, "grad_norm": 9.1875, "learning_rate": 1.930775059671053e-05, "loss": 2.2048, "step": 99 }, { "epoch": 0.12033694344163658, "grad_norm": 2.359375, "learning_rate": 1.9293862908405795e-05, "loss": 1.9723, "step": 100 }, { "epoch": 0.12154031287605295, "grad_norm": 7.96875, "learning_rate": 1.927984239093605e-05, "loss": 1.5856, "step": 101 }, { "epoch": 0.12274368231046931, "grad_norm": 1.875, "learning_rate": 1.926568924468446e-05, "loss": 1.9171, "step": 102 }, { "epoch": 0.12394705174488568, "grad_norm": 7.46875, "learning_rate": 1.9251403671929738e-05, "loss": 1.7228, "step": 103 }, { "epoch": 0.12515042117930206, "grad_norm": 1.8515625, "learning_rate": 1.9236985876843243e-05, "loss": 1.9522, "step": 104 }, { "epoch": 0.1263537906137184, "grad_norm": 7.71875, "learning_rate": 1.922243606548609e-05, "loss": 2.0209, "step": 105 }, { "epoch": 0.12755716004813478, "grad_norm": 2.28125, "learning_rate": 1.9207754445806176e-05, "loss": 2.0477, "step": 106 }, { "epoch": 0.12876052948255115, "grad_norm": 9.5625, "learning_rate": 1.9192941227635232e-05, "loss": 1.6086, "step": 107 }, { "epoch": 0.1299638989169675, "grad_norm": 2.8125, "learning_rate": 1.91779966226858e-05, "loss": 1.9732, "step": 108 }, { "epoch": 0.13116726835138387, "grad_norm": 8.3125, "learning_rate": 1.9162920844548227e-05, "loss": 1.8482, "step": 109 }, { "epoch": 0.13237063778580024, "grad_norm": 2.0625, "learning_rate": 1.914771410868761e-05, "loss": 1.9297, "step": 110 }, { "epoch": 0.13357400722021662, "grad_norm": 9.6875, "learning_rate": 1.91323766324407e-05, "loss": 1.8445, "step": 111 }, { "epoch": 0.13477737665463296, "grad_norm": 1.890625, "learning_rate": 1.9116908635012813e-05, "loss": 1.9342, "step": 112 }, { "epoch": 0.13598074608904934, "grad_norm": 7.28125, "learning_rate": 1.91013103374747e-05, "loss": 1.712, "step": 113 }, { "epoch": 0.1371841155234657, "grad_norm": 2.109375, "learning_rate": 1.9085581962759366e-05, "loss": 1.9928, "step": 114 }, { "epoch": 0.13838748495788206, "grad_norm": 6.96875, "learning_rate": 1.9069723735658903e-05, "loss": 1.7337, "step": 115 }, { "epoch": 0.13959085439229843, "grad_norm": 2.203125, "learning_rate": 1.905373588282127e-05, "loss": 1.9959, "step": 116 }, { "epoch": 0.1407942238267148, "grad_norm": 11.9375, "learning_rate": 1.903761863274706e-05, "loss": 2.1487, "step": 117 }, { "epoch": 0.14199759326113118, "grad_norm": 2.671875, "learning_rate": 1.9021372215786218e-05, "loss": 1.9969, "step": 118 }, { "epoch": 0.14320096269554752, "grad_norm": 11.5625, "learning_rate": 1.9004996864134767e-05, "loss": 1.7683, "step": 119 }, { "epoch": 0.1444043321299639, "grad_norm": 2.234375, "learning_rate": 1.8988492811831485e-05, "loss": 1.9063, "step": 120 }, { "epoch": 0.14560770156438027, "grad_norm": 8.0, "learning_rate": 1.8971860294754554e-05, "loss": 1.8438, "step": 121 }, { "epoch": 0.14681107099879662, "grad_norm": 1.734375, "learning_rate": 1.8955099550618194e-05, "loss": 1.9308, "step": 122 }, { "epoch": 0.148014440433213, "grad_norm": 11.6875, "learning_rate": 1.8938210818969257e-05, "loss": 2.1651, "step": 123 }, { "epoch": 0.14921780986762936, "grad_norm": 1.78125, "learning_rate": 1.8921194341183815e-05, "loss": 1.9401, "step": 124 }, { "epoch": 0.15042117930204574, "grad_norm": 8.9375, "learning_rate": 1.8904050360463708e-05, "loss": 1.9641, "step": 125 }, { "epoch": 0.15162454873646208, "grad_norm": 1.5078125, "learning_rate": 1.8886779121833065e-05, "loss": 1.9155, "step": 126 }, { "epoch": 0.15282791817087846, "grad_norm": 8.75, "learning_rate": 1.886938087213479e-05, "loss": 1.692, "step": 127 }, { "epoch": 0.15403128760529483, "grad_norm": 1.7578125, "learning_rate": 1.885185586002707e-05, "loss": 1.9295, "step": 128 }, { "epoch": 0.1552346570397112, "grad_norm": 8.8125, "learning_rate": 1.8834204335979777e-05, "loss": 1.8224, "step": 129 }, { "epoch": 0.15643802647412755, "grad_norm": 1.9453125, "learning_rate": 1.8816426552270922e-05, "loss": 1.9151, "step": 130 }, { "epoch": 0.15764139590854392, "grad_norm": 11.5, "learning_rate": 1.8798522762983026e-05, "loss": 1.5583, "step": 131 }, { "epoch": 0.1588447653429603, "grad_norm": 1.640625, "learning_rate": 1.8780493223999508e-05, "loss": 1.9645, "step": 132 }, { "epoch": 0.16004813477737664, "grad_norm": 8.375, "learning_rate": 1.8762338193001013e-05, "loss": 2.0126, "step": 133 }, { "epoch": 0.16125150421179302, "grad_norm": 1.7578125, "learning_rate": 1.8744057929461736e-05, "loss": 1.929, "step": 134 }, { "epoch": 0.1624548736462094, "grad_norm": 9.5625, "learning_rate": 1.8725652694645714e-05, "loss": 1.6815, "step": 135 }, { "epoch": 0.16365824308062576, "grad_norm": 2.15625, "learning_rate": 1.8707122751603098e-05, "loss": 1.9207, "step": 136 }, { "epoch": 0.1648616125150421, "grad_norm": 9.9375, "learning_rate": 1.868846836516637e-05, "loss": 1.7623, "step": 137 }, { "epoch": 0.16606498194945848, "grad_norm": 2.0625, "learning_rate": 1.8669689801946585e-05, "loss": 1.8817, "step": 138 }, { "epoch": 0.16726835138387486, "grad_norm": 12.0, "learning_rate": 1.8650787330329546e-05, "loss": 1.9991, "step": 139 }, { "epoch": 0.1684717208182912, "grad_norm": 2.046875, "learning_rate": 1.863176122047198e-05, "loss": 1.9419, "step": 140 }, { "epoch": 0.16967509025270758, "grad_norm": 15.5625, "learning_rate": 1.861261174429765e-05, "loss": 1.8991, "step": 141 }, { "epoch": 0.17087845968712395, "grad_norm": 2.53125, "learning_rate": 1.8593339175493514e-05, "loss": 2.0773, "step": 142 }, { "epoch": 0.17208182912154033, "grad_norm": 8.0625, "learning_rate": 1.8573943789505762e-05, "loss": 1.736, "step": 143 }, { "epoch": 0.17328519855595667, "grad_norm": 1.703125, "learning_rate": 1.8554425863535915e-05, "loss": 1.9491, "step": 144 }, { "epoch": 0.17448856799037304, "grad_norm": 9.9375, "learning_rate": 1.8534785676536856e-05, "loss": 1.4771, "step": 145 }, { "epoch": 0.17569193742478942, "grad_norm": 1.7109375, "learning_rate": 1.851502350920883e-05, "loss": 1.9472, "step": 146 }, { "epoch": 0.17689530685920576, "grad_norm": 8.375, "learning_rate": 1.849513964399545e-05, "loss": 1.4921, "step": 147 }, { "epoch": 0.17809867629362214, "grad_norm": 1.9921875, "learning_rate": 1.8475134365079642e-05, "loss": 1.8783, "step": 148 }, { "epoch": 0.1793020457280385, "grad_norm": 9.1875, "learning_rate": 1.8455007958379604e-05, "loss": 1.3073, "step": 149 }, { "epoch": 0.18050541516245489, "grad_norm": 1.6171875, "learning_rate": 1.8434760711544707e-05, "loss": 1.9601, "step": 150 }, { "epoch": 0.18170878459687123, "grad_norm": 8.0625, "learning_rate": 1.8414392913951382e-05, "loss": 1.7576, "step": 151 }, { "epoch": 0.1829121540312876, "grad_norm": 2.046875, "learning_rate": 1.8393904856698987e-05, "loss": 1.9558, "step": 152 }, { "epoch": 0.18411552346570398, "grad_norm": 9.9375, "learning_rate": 1.8373296832605647e-05, "loss": 1.5748, "step": 153 }, { "epoch": 0.18531889290012032, "grad_norm": 2.078125, "learning_rate": 1.835256913620408e-05, "loss": 1.9188, "step": 154 }, { "epoch": 0.1865222623345367, "grad_norm": 7.28125, "learning_rate": 1.8331722063737365e-05, "loss": 1.7746, "step": 155 }, { "epoch": 0.18772563176895307, "grad_norm": 2.171875, "learning_rate": 1.8310755913154726e-05, "loss": 1.9605, "step": 156 }, { "epoch": 0.18892900120336945, "grad_norm": 7.875, "learning_rate": 1.8289670984107263e-05, "loss": 1.5612, "step": 157 }, { "epoch": 0.1901323706377858, "grad_norm": 1.515625, "learning_rate": 1.826846757794368e-05, "loss": 1.938, "step": 158 }, { "epoch": 0.19133574007220217, "grad_norm": 10.75, "learning_rate": 1.8247145997705977e-05, "loss": 1.5543, "step": 159 }, { "epoch": 0.19253910950661854, "grad_norm": 2.203125, "learning_rate": 1.8225706548125094e-05, "loss": 1.9203, "step": 160 }, { "epoch": 0.19374247894103488, "grad_norm": 7.90625, "learning_rate": 1.8204149535616596e-05, "loss": 1.7661, "step": 161 }, { "epoch": 0.19494584837545126, "grad_norm": 2.109375, "learning_rate": 1.8182475268276265e-05, "loss": 1.9409, "step": 162 }, { "epoch": 0.19614921780986763, "grad_norm": 8.625, "learning_rate": 1.8160684055875704e-05, "loss": 1.6057, "step": 163 }, { "epoch": 0.197352587244284, "grad_norm": 2.125, "learning_rate": 1.813877620985792e-05, "loss": 1.9911, "step": 164 }, { "epoch": 0.19855595667870035, "grad_norm": 11.0, "learning_rate": 1.8116752043332848e-05, "loss": 1.8332, "step": 165 }, { "epoch": 0.19975932611311673, "grad_norm": 2.078125, "learning_rate": 1.8094611871072906e-05, "loss": 2.0064, "step": 166 }, { "epoch": 0.2009626955475331, "grad_norm": 9.875, "learning_rate": 1.8072356009508473e-05, "loss": 2.1622, "step": 167 }, { "epoch": 0.20216606498194944, "grad_norm": 1.6171875, "learning_rate": 1.8049984776723383e-05, "loss": 1.9138, "step": 168 }, { "epoch": 0.20336943441636582, "grad_norm": 8.5, "learning_rate": 1.8027498492450367e-05, "loss": 1.8894, "step": 169 }, { "epoch": 0.2045728038507822, "grad_norm": 1.7734375, "learning_rate": 1.8004897478066482e-05, "loss": 1.9079, "step": 170 }, { "epoch": 0.20577617328519857, "grad_norm": 9.0, "learning_rate": 1.7982182056588536e-05, "loss": 1.4771, "step": 171 }, { "epoch": 0.2069795427196149, "grad_norm": 1.9296875, "learning_rate": 1.795935255266845e-05, "loss": 1.9463, "step": 172 }, { "epoch": 0.20818291215403129, "grad_norm": 10.375, "learning_rate": 1.7936409292588627e-05, "loss": 1.6885, "step": 173 }, { "epoch": 0.20938628158844766, "grad_norm": 2.03125, "learning_rate": 1.791335260425729e-05, "loss": 1.9168, "step": 174 }, { "epoch": 0.21058965102286403, "grad_norm": 7.75, "learning_rate": 1.7890182817203806e-05, "loss": 1.7355, "step": 175 }, { "epoch": 0.21179302045728038, "grad_norm": 2.609375, "learning_rate": 1.786690026257394e-05, "loss": 1.9454, "step": 176 }, { "epoch": 0.21299638989169675, "grad_norm": 6.71875, "learning_rate": 1.7843505273125164e-05, "loss": 1.4808, "step": 177 }, { "epoch": 0.21419975932611313, "grad_norm": 1.75, "learning_rate": 1.7819998183221883e-05, "loss": 1.9103, "step": 178 }, { "epoch": 0.21540312876052947, "grad_norm": 9.125, "learning_rate": 1.7796379328830652e-05, "loss": 1.9194, "step": 179 }, { "epoch": 0.21660649819494585, "grad_norm": 1.890625, "learning_rate": 1.7772649047515384e-05, "loss": 1.9517, "step": 180 }, { "epoch": 0.21780986762936222, "grad_norm": 7.375, "learning_rate": 1.7748807678432514e-05, "loss": 1.8614, "step": 181 }, { "epoch": 0.2190132370637786, "grad_norm": 3.03125, "learning_rate": 1.7724855562326167e-05, "loss": 1.8591, "step": 182 }, { "epoch": 0.22021660649819494, "grad_norm": 6.6875, "learning_rate": 1.7700793041523272e-05, "loss": 1.3666, "step": 183 }, { "epoch": 0.2214199759326113, "grad_norm": 2.359375, "learning_rate": 1.7676620459928683e-05, "loss": 2.0232, "step": 184 }, { "epoch": 0.2226233453670277, "grad_norm": 8.9375, "learning_rate": 1.7652338163020257e-05, "loss": 1.9097, "step": 185 }, { "epoch": 0.22382671480144403, "grad_norm": 1.9453125, "learning_rate": 1.7627946497843917e-05, "loss": 2.0247, "step": 186 }, { "epoch": 0.2250300842358604, "grad_norm": 7.0, "learning_rate": 1.7603445813008685e-05, "loss": 1.4858, "step": 187 }, { "epoch": 0.22623345367027678, "grad_norm": 2.3125, "learning_rate": 1.7578836458681718e-05, "loss": 1.958, "step": 188 }, { "epoch": 0.22743682310469315, "grad_norm": 11.3125, "learning_rate": 1.755411878658329e-05, "loss": 1.7506, "step": 189 }, { "epoch": 0.2286401925391095, "grad_norm": 1.703125, "learning_rate": 1.7529293149981758e-05, "loss": 1.9309, "step": 190 }, { "epoch": 0.22984356197352587, "grad_norm": 10.875, "learning_rate": 1.7504359903688537e-05, "loss": 1.9346, "step": 191 }, { "epoch": 0.23104693140794225, "grad_norm": 1.7421875, "learning_rate": 1.7479319404053004e-05, "loss": 1.9085, "step": 192 }, { "epoch": 0.2322503008423586, "grad_norm": 9.4375, "learning_rate": 1.7454172008957417e-05, "loss": 1.5559, "step": 193 }, { "epoch": 0.23345367027677497, "grad_norm": 1.6640625, "learning_rate": 1.7428918077811802e-05, "loss": 1.9636, "step": 194 }, { "epoch": 0.23465703971119134, "grad_norm": 8.8125, "learning_rate": 1.740355797154881e-05, "loss": 1.6825, "step": 195 }, { "epoch": 0.2358604091456077, "grad_norm": 1.5078125, "learning_rate": 1.7378092052618565e-05, "loss": 1.889, "step": 196 }, { "epoch": 0.23706377858002406, "grad_norm": 7.71875, "learning_rate": 1.7352520684983474e-05, "loss": 1.9605, "step": 197 }, { "epoch": 0.23826714801444043, "grad_norm": 1.8671875, "learning_rate": 1.7326844234113037e-05, "loss": 1.9375, "step": 198 }, { "epoch": 0.2394705174488568, "grad_norm": 13.875, "learning_rate": 1.7301063066978617e-05, "loss": 1.6995, "step": 199 }, { "epoch": 0.24067388688327315, "grad_norm": 1.84375, "learning_rate": 1.727517755204819e-05, "loss": 1.9733, "step": 200 }, { "epoch": 0.24187725631768953, "grad_norm": 7.59375, "learning_rate": 1.72491880592811e-05, "loss": 1.6648, "step": 201 }, { "epoch": 0.2430806257521059, "grad_norm": 1.75, "learning_rate": 1.7223094960122733e-05, "loss": 1.9393, "step": 202 }, { "epoch": 0.24428399518652227, "grad_norm": 7.96875, "learning_rate": 1.719689862749926e-05, "loss": 2.0018, "step": 203 }, { "epoch": 0.24548736462093862, "grad_norm": 1.828125, "learning_rate": 1.7170599435812253e-05, "loss": 1.914, "step": 204 }, { "epoch": 0.246690734055355, "grad_norm": 8.8125, "learning_rate": 1.714419776093338e-05, "loss": 1.6501, "step": 205 }, { "epoch": 0.24789410348977137, "grad_norm": 1.8046875, "learning_rate": 1.7117693980198996e-05, "loss": 1.8677, "step": 206 }, { "epoch": 0.2490974729241877, "grad_norm": 9.4375, "learning_rate": 1.709108847240478e-05, "loss": 1.4796, "step": 207 }, { "epoch": 0.2503008423586041, "grad_norm": 1.875, "learning_rate": 1.7064381617800302e-05, "loss": 1.963, "step": 208 }, { "epoch": 0.25150421179302046, "grad_norm": 9.1875, "learning_rate": 1.7037573798083598e-05, "loss": 1.7358, "step": 209 }, { "epoch": 0.2527075812274368, "grad_norm": 1.6875, "learning_rate": 1.7010665396395706e-05, "loss": 1.9424, "step": 210 }, { "epoch": 0.2539109506618532, "grad_norm": 7.25, "learning_rate": 1.6983656797315197e-05, "loss": 1.4907, "step": 211 }, { "epoch": 0.25511432009626955, "grad_norm": 1.796875, "learning_rate": 1.6956548386852684e-05, "loss": 1.9358, "step": 212 }, { "epoch": 0.2563176895306859, "grad_norm": 8.0625, "learning_rate": 1.6929340552445283e-05, "loss": 2.0059, "step": 213 }, { "epoch": 0.2575210589651023, "grad_norm": 2.203125, "learning_rate": 1.6902033682951104e-05, "loss": 1.9074, "step": 214 }, { "epoch": 0.25872442839951865, "grad_norm": 8.5, "learning_rate": 1.6874628168643683e-05, "loss": 1.4401, "step": 215 }, { "epoch": 0.259927797833935, "grad_norm": 1.625, "learning_rate": 1.6847124401206384e-05, "loss": 1.9238, "step": 216 }, { "epoch": 0.2611311672683514, "grad_norm": 10.0, "learning_rate": 1.681952277372683e-05, "loss": 1.7667, "step": 217 }, { "epoch": 0.26233453670276774, "grad_norm": 1.7734375, "learning_rate": 1.6791823680691276e-05, "loss": 1.9808, "step": 218 }, { "epoch": 0.26353790613718414, "grad_norm": 7.84375, "learning_rate": 1.676402751797896e-05, "loss": 1.5813, "step": 219 }, { "epoch": 0.2647412755716005, "grad_norm": 1.7109375, "learning_rate": 1.673613468285646e-05, "loss": 1.8665, "step": 220 }, { "epoch": 0.26594464500601683, "grad_norm": 9.0, "learning_rate": 1.6708145573972005e-05, "loss": 1.6345, "step": 221 }, { "epoch": 0.26714801444043323, "grad_norm": 2.046875, "learning_rate": 1.6680060591349774e-05, "loss": 1.9092, "step": 222 }, { "epoch": 0.2683513838748496, "grad_norm": 10.6875, "learning_rate": 1.6651880136384215e-05, "loss": 1.8779, "step": 223 }, { "epoch": 0.2695547533092659, "grad_norm": 1.8203125, "learning_rate": 1.662360461183424e-05, "loss": 1.8846, "step": 224 }, { "epoch": 0.27075812274368233, "grad_norm": 8.5625, "learning_rate": 1.659523442181754e-05, "loss": 1.7232, "step": 225 }, { "epoch": 0.2719614921780987, "grad_norm": 1.6328125, "learning_rate": 1.6566769971804763e-05, "loss": 1.8834, "step": 226 }, { "epoch": 0.273164861612515, "grad_norm": 9.3125, "learning_rate": 1.653821166861374e-05, "loss": 1.675, "step": 227 }, { "epoch": 0.2743682310469314, "grad_norm": 1.90625, "learning_rate": 1.6509559920403663e-05, "loss": 1.9348, "step": 228 }, { "epoch": 0.27557160048134777, "grad_norm": 7.625, "learning_rate": 1.6480815136669248e-05, "loss": 1.5852, "step": 229 }, { "epoch": 0.2767749699157641, "grad_norm": 2.03125, "learning_rate": 1.6451977728234894e-05, "loss": 1.9191, "step": 230 }, { "epoch": 0.2779783393501805, "grad_norm": 8.5625, "learning_rate": 1.64230481072488e-05, "loss": 1.7214, "step": 231 }, { "epoch": 0.27918170878459686, "grad_norm": 1.953125, "learning_rate": 1.639402668717709e-05, "loss": 1.9314, "step": 232 }, { "epoch": 0.28038507821901326, "grad_norm": 8.3125, "learning_rate": 1.6364913882797875e-05, "loss": 1.6346, "step": 233 }, { "epoch": 0.2815884476534296, "grad_norm": 1.921875, "learning_rate": 1.633571011019536e-05, "loss": 1.9149, "step": 234 }, { "epoch": 0.28279181708784595, "grad_norm": 7.59375, "learning_rate": 1.630641578675387e-05, "loss": 1.9227, "step": 235 }, { "epoch": 0.28399518652226236, "grad_norm": 1.84375, "learning_rate": 1.62770313311519e-05, "loss": 1.9066, "step": 236 }, { "epoch": 0.2851985559566787, "grad_norm": 8.4375, "learning_rate": 1.6247557163356127e-05, "loss": 1.5464, "step": 237 }, { "epoch": 0.28640192539109505, "grad_norm": 2.578125, "learning_rate": 1.62179937046154e-05, "loss": 1.8708, "step": 238 }, { "epoch": 0.28760529482551145, "grad_norm": 7.59375, "learning_rate": 1.6188341377454735e-05, "loss": 1.4924, "step": 239 }, { "epoch": 0.2888086642599278, "grad_norm": 1.7890625, "learning_rate": 1.6158600605669264e-05, "loss": 1.8812, "step": 240 }, { "epoch": 0.29001203369434414, "grad_norm": 10.5, "learning_rate": 1.6128771814318178e-05, "loss": 1.7115, "step": 241 }, { "epoch": 0.29121540312876054, "grad_norm": 2.03125, "learning_rate": 1.6098855429718662e-05, "loss": 1.8493, "step": 242 }, { "epoch": 0.2924187725631769, "grad_norm": 8.25, "learning_rate": 1.606885187943979e-05, "loss": 1.5891, "step": 243 }, { "epoch": 0.29362214199759323, "grad_norm": 2.359375, "learning_rate": 1.6038761592296435e-05, "loss": 1.8819, "step": 244 }, { "epoch": 0.29482551143200963, "grad_norm": 8.5, "learning_rate": 1.60085849983431e-05, "loss": 1.6887, "step": 245 }, { "epoch": 0.296028880866426, "grad_norm": 2.09375, "learning_rate": 1.597832252886781e-05, "loss": 1.9356, "step": 246 }, { "epoch": 0.2972322503008424, "grad_norm": 8.9375, "learning_rate": 1.594797461638594e-05, "loss": 1.8267, "step": 247 }, { "epoch": 0.29843561973525873, "grad_norm": 1.8984375, "learning_rate": 1.591754169463402e-05, "loss": 1.8325, "step": 248 }, { "epoch": 0.2996389891696751, "grad_norm": 8.125, "learning_rate": 1.5887024198563552e-05, "loss": 1.5603, "step": 249 }, { "epoch": 0.3008423586040915, "grad_norm": 1.578125, "learning_rate": 1.5856422564334772e-05, "loss": 1.842, "step": 250 }, { "epoch": 0.3020457280385078, "grad_norm": 9.625, "learning_rate": 1.5825737229310448e-05, "loss": 1.6354, "step": 251 }, { "epoch": 0.30324909747292417, "grad_norm": 2.53125, "learning_rate": 1.5794968632049598e-05, "loss": 1.8604, "step": 252 }, { "epoch": 0.30445246690734057, "grad_norm": 7.4375, "learning_rate": 1.576411721230124e-05, "loss": 1.8925, "step": 253 }, { "epoch": 0.3056558363417569, "grad_norm": 2.0625, "learning_rate": 1.57331834109981e-05, "loss": 1.9078, "step": 254 }, { "epoch": 0.30685920577617326, "grad_norm": 7.21875, "learning_rate": 1.570216767025032e-05, "loss": 1.7336, "step": 255 }, { "epoch": 0.30806257521058966, "grad_norm": 1.9375, "learning_rate": 1.5671070433339116e-05, "loss": 1.9041, "step": 256 }, { "epoch": 0.309265944645006, "grad_norm": 7.5625, "learning_rate": 1.5639892144710477e-05, "loss": 1.55, "step": 257 }, { "epoch": 0.3104693140794224, "grad_norm": 1.8125, "learning_rate": 1.5608633249968783e-05, "loss": 1.9125, "step": 258 }, { "epoch": 0.31167268351383876, "grad_norm": 8.0, "learning_rate": 1.557729419587045e-05, "loss": 1.8085, "step": 259 }, { "epoch": 0.3128760529482551, "grad_norm": 2.140625, "learning_rate": 1.5545875430317546e-05, "loss": 1.8648, "step": 260 }, { "epoch": 0.3140794223826715, "grad_norm": 9.0625, "learning_rate": 1.5514377402351376e-05, "loss": 1.7423, "step": 261 }, { "epoch": 0.31528279181708785, "grad_norm": 2.53125, "learning_rate": 1.548280056214609e-05, "loss": 1.9913, "step": 262 }, { "epoch": 0.3164861612515042, "grad_norm": 9.3125, "learning_rate": 1.545114536100222e-05, "loss": 1.7276, "step": 263 }, { "epoch": 0.3176895306859206, "grad_norm": 1.765625, "learning_rate": 1.541941225134025e-05, "loss": 1.9106, "step": 264 }, { "epoch": 0.31889290012033694, "grad_norm": 7.375, "learning_rate": 1.5387601686694134e-05, "loss": 1.9022, "step": 265 }, { "epoch": 0.3200962695547533, "grad_norm": 1.9453125, "learning_rate": 1.5355714121704846e-05, "loss": 1.8894, "step": 266 }, { "epoch": 0.3212996389891697, "grad_norm": 7.40625, "learning_rate": 1.532375001211383e-05, "loss": 2.1068, "step": 267 }, { "epoch": 0.32250300842358604, "grad_norm": 1.890625, "learning_rate": 1.529170981475653e-05, "loss": 1.9553, "step": 268 }, { "epoch": 0.3237063778580024, "grad_norm": 12.75, "learning_rate": 1.525959398755585e-05, "loss": 2.0006, "step": 269 }, { "epoch": 0.3249097472924188, "grad_norm": 1.671875, "learning_rate": 1.5227402989515607e-05, "loss": 1.8988, "step": 270 }, { "epoch": 0.32611311672683513, "grad_norm": 8.3125, "learning_rate": 1.519513728071396e-05, "loss": 1.7496, "step": 271 }, { "epoch": 0.32731648616125153, "grad_norm": 1.984375, "learning_rate": 1.5162797322296855e-05, "loss": 1.9465, "step": 272 }, { "epoch": 0.3285198555956679, "grad_norm": 12.8125, "learning_rate": 1.5130383576471415e-05, "loss": 1.5856, "step": 273 }, { "epoch": 0.3297232250300842, "grad_norm": 2.015625, "learning_rate": 1.5097896506499349e-05, "loss": 1.9516, "step": 274 }, { "epoch": 0.3309265944645006, "grad_norm": 6.90625, "learning_rate": 1.5065336576690318e-05, "loss": 1.5428, "step": 275 }, { "epoch": 0.33212996389891697, "grad_norm": 1.640625, "learning_rate": 1.5032704252395315e-05, "loss": 1.8789, "step": 276 }, { "epoch": 0.3333333333333333, "grad_norm": 8.625, "learning_rate": 1.5000000000000002e-05, "loss": 1.532, "step": 277 }, { "epoch": 0.3345367027677497, "grad_norm": 1.8203125, "learning_rate": 1.496722428691804e-05, "loss": 1.9783, "step": 278 }, { "epoch": 0.33574007220216606, "grad_norm": 8.0625, "learning_rate": 1.4934377581584425e-05, "loss": 1.594, "step": 279 }, { "epoch": 0.3369434416365824, "grad_norm": 2.109375, "learning_rate": 1.490146035344878e-05, "loss": 1.9084, "step": 280 }, { "epoch": 0.3381468110709988, "grad_norm": 11.6875, "learning_rate": 1.4868473072968645e-05, "loss": 2.3898, "step": 281 }, { "epoch": 0.33935018050541516, "grad_norm": 1.4765625, "learning_rate": 1.4835416211602771e-05, "loss": 1.898, "step": 282 }, { "epoch": 0.3405535499398315, "grad_norm": 9.8125, "learning_rate": 1.4802290241804355e-05, "loss": 1.9441, "step": 283 }, { "epoch": 0.3417569193742479, "grad_norm": 1.9296875, "learning_rate": 1.4769095637014308e-05, "loss": 1.8702, "step": 284 }, { "epoch": 0.34296028880866425, "grad_norm": 8.6875, "learning_rate": 1.473583287165448e-05, "loss": 1.9498, "step": 285 }, { "epoch": 0.34416365824308065, "grad_norm": 1.6328125, "learning_rate": 1.4702502421120884e-05, "loss": 1.856, "step": 286 }, { "epoch": 0.345367027677497, "grad_norm": 9.0, "learning_rate": 1.4669104761776892e-05, "loss": 1.716, "step": 287 }, { "epoch": 0.34657039711191334, "grad_norm": 2.125, "learning_rate": 1.463564037094644e-05, "loss": 1.8696, "step": 288 }, { "epoch": 0.34777376654632974, "grad_norm": 7.9375, "learning_rate": 1.4602109726907197e-05, "loss": 1.5692, "step": 289 }, { "epoch": 0.3489771359807461, "grad_norm": 1.671875, "learning_rate": 1.4568513308883732e-05, "loss": 1.8459, "step": 290 }, { "epoch": 0.35018050541516244, "grad_norm": 8.4375, "learning_rate": 1.4534851597040666e-05, "loss": 1.7531, "step": 291 }, { "epoch": 0.35138387484957884, "grad_norm": 2.359375, "learning_rate": 1.4501125072475804e-05, "loss": 1.8967, "step": 292 }, { "epoch": 0.3525872442839952, "grad_norm": 10.625, "learning_rate": 1.4467334217213274e-05, "loss": 2.0254, "step": 293 }, { "epoch": 0.35379061371841153, "grad_norm": 2.046875, "learning_rate": 1.4433479514196615e-05, "loss": 1.8432, "step": 294 }, { "epoch": 0.35499398315282793, "grad_norm": 6.78125, "learning_rate": 1.439956144728189e-05, "loss": 1.4684, "step": 295 }, { "epoch": 0.3561973525872443, "grad_norm": 1.8203125, "learning_rate": 1.4365580501230776e-05, "loss": 1.9181, "step": 296 }, { "epoch": 0.3574007220216607, "grad_norm": 9.6875, "learning_rate": 1.4331537161703612e-05, "loss": 2.1226, "step": 297 }, { "epoch": 0.358604091456077, "grad_norm": 2.0, "learning_rate": 1.4297431915252487e-05, "loss": 1.876, "step": 298 }, { "epoch": 0.35980746089049337, "grad_norm": 7.96875, "learning_rate": 1.4263265249314269e-05, "loss": 1.5929, "step": 299 }, { "epoch": 0.36101083032490977, "grad_norm": 1.9609375, "learning_rate": 1.422903765220363e-05, "loss": 1.9327, "step": 300 }, { "epoch": 0.3622141997593261, "grad_norm": 13.9375, "learning_rate": 1.41947496131061e-05, "loss": 1.6594, "step": 301 }, { "epoch": 0.36341756919374246, "grad_norm": 1.703125, "learning_rate": 1.4160401622071039e-05, "loss": 1.8538, "step": 302 }, { "epoch": 0.36462093862815886, "grad_norm": 9.3125, "learning_rate": 1.4125994170004644e-05, "loss": 1.7329, "step": 303 }, { "epoch": 0.3658243080625752, "grad_norm": 2.453125, "learning_rate": 1.4091527748662957e-05, "loss": 1.9571, "step": 304 }, { "epoch": 0.36702767749699156, "grad_norm": 9.1875, "learning_rate": 1.4057002850644796e-05, "loss": 1.5898, "step": 305 }, { "epoch": 0.36823104693140796, "grad_norm": 1.8203125, "learning_rate": 1.402241996938475e-05, "loss": 1.8661, "step": 306 }, { "epoch": 0.3694344163658243, "grad_norm": 9.5, "learning_rate": 1.3987779599146105e-05, "loss": 1.7381, "step": 307 }, { "epoch": 0.37063778580024065, "grad_norm": 1.9609375, "learning_rate": 1.3953082235013788e-05, "loss": 1.8554, "step": 308 }, { "epoch": 0.37184115523465705, "grad_norm": 8.9375, "learning_rate": 1.3918328372887295e-05, "loss": 1.6175, "step": 309 }, { "epoch": 0.3730445246690734, "grad_norm": 2.1875, "learning_rate": 1.3883518509473598e-05, "loss": 1.8526, "step": 310 }, { "epoch": 0.3742478941034898, "grad_norm": 7.96875, "learning_rate": 1.3848653142280037e-05, "loss": 1.9204, "step": 311 }, { "epoch": 0.37545126353790614, "grad_norm": 2.046875, "learning_rate": 1.381373276960724e-05, "loss": 1.9565, "step": 312 }, { "epoch": 0.3766546329723225, "grad_norm": 9.8125, "learning_rate": 1.377875789054196e-05, "loss": 1.6281, "step": 313 }, { "epoch": 0.3778580024067389, "grad_norm": 1.59375, "learning_rate": 1.3743729004949972e-05, "loss": 1.9128, "step": 314 }, { "epoch": 0.37906137184115524, "grad_norm": 11.8125, "learning_rate": 1.3708646613468925e-05, "loss": 1.6389, "step": 315 }, { "epoch": 0.3802647412755716, "grad_norm": 1.9296875, "learning_rate": 1.3673511217501172e-05, "loss": 1.8232, "step": 316 }, { "epoch": 0.381468110709988, "grad_norm": 8.25, "learning_rate": 1.3638323319206617e-05, "loss": 1.4705, "step": 317 }, { "epoch": 0.38267148014440433, "grad_norm": 2.4375, "learning_rate": 1.3603083421495535e-05, "loss": 1.9497, "step": 318 }, { "epoch": 0.3838748495788207, "grad_norm": 9.875, "learning_rate": 1.3567792028021382e-05, "loss": 1.7197, "step": 319 }, { "epoch": 0.3850782190132371, "grad_norm": 1.8046875, "learning_rate": 1.3532449643173604e-05, "loss": 1.8622, "step": 320 }, { "epoch": 0.3862815884476534, "grad_norm": 6.15625, "learning_rate": 1.3497056772070417e-05, "loss": 1.6377, "step": 321 }, { "epoch": 0.38748495788206977, "grad_norm": 2.03125, "learning_rate": 1.3461613920551598e-05, "loss": 1.8915, "step": 322 }, { "epoch": 0.38868832731648617, "grad_norm": 10.375, "learning_rate": 1.3426121595171242e-05, "loss": 1.7872, "step": 323 }, { "epoch": 0.3898916967509025, "grad_norm": 1.84375, "learning_rate": 1.3390580303190541e-05, "loss": 1.9056, "step": 324 }, { "epoch": 0.3910950661853189, "grad_norm": 9.875, "learning_rate": 1.335499055257052e-05, "loss": 1.7587, "step": 325 }, { "epoch": 0.39229843561973526, "grad_norm": 2.28125, "learning_rate": 1.3319352851964787e-05, "loss": 1.9716, "step": 326 }, { "epoch": 0.3935018050541516, "grad_norm": 8.125, "learning_rate": 1.3283667710712245e-05, "loss": 1.3889, "step": 327 }, { "epoch": 0.394705174488568, "grad_norm": 1.8359375, "learning_rate": 1.3247935638829838e-05, "loss": 1.8637, "step": 328 }, { "epoch": 0.39590854392298436, "grad_norm": 7.90625, "learning_rate": 1.3212157147005244e-05, "loss": 1.702, "step": 329 }, { "epoch": 0.3971119133574007, "grad_norm": 1.3203125, "learning_rate": 1.3176332746589587e-05, "loss": 1.8744, "step": 330 }, { "epoch": 0.3983152827918171, "grad_norm": 10.625, "learning_rate": 1.3140462949590107e-05, "loss": 2.0545, "step": 331 }, { "epoch": 0.39951865222623345, "grad_norm": 1.734375, "learning_rate": 1.3104548268662873e-05, "loss": 1.8685, "step": 332 }, { "epoch": 0.4007220216606498, "grad_norm": 10.25, "learning_rate": 1.3068589217105441e-05, "loss": 1.8625, "step": 333 }, { "epoch": 0.4019253910950662, "grad_norm": 1.9453125, "learning_rate": 1.3032586308849512e-05, "loss": 1.98, "step": 334 }, { "epoch": 0.40312876052948254, "grad_norm": 10.625, "learning_rate": 1.2996540058453589e-05, "loss": 1.7419, "step": 335 }, { "epoch": 0.4043321299638989, "grad_norm": 1.9453125, "learning_rate": 1.2960450981095643e-05, "loss": 1.8723, "step": 336 }, { "epoch": 0.4055354993983153, "grad_norm": 9.5, "learning_rate": 1.2924319592565713e-05, "loss": 1.8843, "step": 337 }, { "epoch": 0.40673886883273164, "grad_norm": 1.671875, "learning_rate": 1.2888146409258575e-05, "loss": 1.8432, "step": 338 }, { "epoch": 0.40794223826714804, "grad_norm": 8.5, "learning_rate": 1.2851931948166328e-05, "loss": 1.7638, "step": 339 }, { "epoch": 0.4091456077015644, "grad_norm": 2.046875, "learning_rate": 1.281567672687102e-05, "loss": 1.9141, "step": 340 }, { "epoch": 0.41034897713598073, "grad_norm": 8.5, "learning_rate": 1.2779381263537262e-05, "loss": 1.4478, "step": 341 }, { "epoch": 0.41155234657039713, "grad_norm": 2.046875, "learning_rate": 1.2743046076904795e-05, "loss": 1.897, "step": 342 }, { "epoch": 0.4127557160048135, "grad_norm": 10.875, "learning_rate": 1.2706671686281094e-05, "loss": 2.1328, "step": 343 }, { "epoch": 0.4139590854392298, "grad_norm": 2.03125, "learning_rate": 1.2670258611533947e-05, "loss": 1.841, "step": 344 }, { "epoch": 0.4151624548736462, "grad_norm": 11.1875, "learning_rate": 1.2633807373084022e-05, "loss": 1.7572, "step": 345 }, { "epoch": 0.41636582430806257, "grad_norm": 1.8046875, "learning_rate": 1.2597318491897416e-05, "loss": 1.9017, "step": 346 }, { "epoch": 0.4175691937424789, "grad_norm": 7.90625, "learning_rate": 1.2560792489478244e-05, "loss": 1.6313, "step": 347 }, { "epoch": 0.4187725631768953, "grad_norm": 1.6328125, "learning_rate": 1.2524229887861132e-05, "loss": 1.8519, "step": 348 }, { "epoch": 0.41997593261131166, "grad_norm": 8.1875, "learning_rate": 1.2487631209603819e-05, "loss": 1.6599, "step": 349 }, { "epoch": 0.42117930204572807, "grad_norm": 1.9765625, "learning_rate": 1.245099697777963e-05, "loss": 1.8785, "step": 350 }, { "epoch": 0.4223826714801444, "grad_norm": 7.96875, "learning_rate": 1.241432771597004e-05, "loss": 1.4527, "step": 351 }, { "epoch": 0.42358604091456076, "grad_norm": 1.8671875, "learning_rate": 1.237762394825718e-05, "loss": 1.9006, "step": 352 }, { "epoch": 0.42478941034897716, "grad_norm": 7.6875, "learning_rate": 1.234088619921633e-05, "loss": 1.6453, "step": 353 }, { "epoch": 0.4259927797833935, "grad_norm": 2.515625, "learning_rate": 1.230411499390845e-05, "loss": 1.8704, "step": 354 }, { "epoch": 0.42719614921780985, "grad_norm": 7.59375, "learning_rate": 1.2267310857872654e-05, "loss": 1.669, "step": 355 }, { "epoch": 0.42839951865222625, "grad_norm": 2.1875, "learning_rate": 1.2230474317118708e-05, "loss": 1.9757, "step": 356 }, { "epoch": 0.4296028880866426, "grad_norm": 8.875, "learning_rate": 1.2193605898119513e-05, "loss": 1.7546, "step": 357 }, { "epoch": 0.43080625752105894, "grad_norm": 1.5859375, "learning_rate": 1.2156706127803578e-05, "loss": 1.8807, "step": 358 }, { "epoch": 0.43200962695547535, "grad_norm": 7.90625, "learning_rate": 1.2119775533547482e-05, "loss": 1.6133, "step": 359 }, { "epoch": 0.4332129963898917, "grad_norm": 1.5234375, "learning_rate": 1.2082814643168357e-05, "loss": 1.8919, "step": 360 }, { "epoch": 0.43441636582430804, "grad_norm": 12.0625, "learning_rate": 1.2045823984916317e-05, "loss": 1.6771, "step": 361 }, { "epoch": 0.43561973525872444, "grad_norm": 2.25, "learning_rate": 1.2008804087466931e-05, "loss": 1.9265, "step": 362 }, { "epoch": 0.4368231046931408, "grad_norm": 8.625, "learning_rate": 1.1971755479913665e-05, "loss": 1.6558, "step": 363 }, { "epoch": 0.4380264741275572, "grad_norm": 1.6484375, "learning_rate": 1.1934678691760296e-05, "loss": 1.8653, "step": 364 }, { "epoch": 0.43922984356197353, "grad_norm": 9.5, "learning_rate": 1.1897574252913377e-05, "loss": 1.529, "step": 365 }, { "epoch": 0.4404332129963899, "grad_norm": 1.78125, "learning_rate": 1.1860442693674648e-05, "loss": 1.8504, "step": 366 }, { "epoch": 0.4416365824308063, "grad_norm": 10.0625, "learning_rate": 1.182328454473344e-05, "loss": 1.7066, "step": 367 }, { "epoch": 0.4428399518652226, "grad_norm": 2.15625, "learning_rate": 1.1786100337159132e-05, "loss": 1.9005, "step": 368 }, { "epoch": 0.44404332129963897, "grad_norm": 10.125, "learning_rate": 1.1748890602393521e-05, "loss": 1.9596, "step": 369 }, { "epoch": 0.4452466907340554, "grad_norm": 1.59375, "learning_rate": 1.1711655872243247e-05, "loss": 1.7943, "step": 370 }, { "epoch": 0.4464500601684717, "grad_norm": 9.6875, "learning_rate": 1.1674396678872186e-05, "loss": 1.8632, "step": 371 }, { "epoch": 0.44765342960288806, "grad_norm": 2.34375, "learning_rate": 1.1637113554793846e-05, "loss": 1.9607, "step": 372 }, { "epoch": 0.44885679903730447, "grad_norm": 6.96875, "learning_rate": 1.1599807032863756e-05, "loss": 1.6864, "step": 373 }, { "epoch": 0.4500601684717208, "grad_norm": 1.8125, "learning_rate": 1.1562477646271856e-05, "loss": 1.8366, "step": 374 }, { "epoch": 0.45126353790613716, "grad_norm": 7.5625, "learning_rate": 1.152512592853486e-05, "loss": 1.7295, "step": 375 }, { "epoch": 0.45246690734055356, "grad_norm": 1.5546875, "learning_rate": 1.1487752413488646e-05, "loss": 1.8746, "step": 376 }, { "epoch": 0.4536702767749699, "grad_norm": 7.9375, "learning_rate": 1.1450357635280628e-05, "loss": 1.4717, "step": 377 }, { "epoch": 0.4548736462093863, "grad_norm": 1.7421875, "learning_rate": 1.141294212836211e-05, "loss": 1.8848, "step": 378 }, { "epoch": 0.45607701564380265, "grad_norm": 9.9375, "learning_rate": 1.1375506427480658e-05, "loss": 1.5209, "step": 379 }, { "epoch": 0.457280385078219, "grad_norm": 2.296875, "learning_rate": 1.1338051067672444e-05, "loss": 1.8625, "step": 380 }, { "epoch": 0.4584837545126354, "grad_norm": 7.71875, "learning_rate": 1.1300576584254617e-05, "loss": 1.7493, "step": 381 }, { "epoch": 0.45968712394705175, "grad_norm": 2.03125, "learning_rate": 1.1263083512817644e-05, "loss": 1.8731, "step": 382 }, { "epoch": 0.4608904933814681, "grad_norm": 8.5, "learning_rate": 1.1225572389217643e-05, "loss": 1.7396, "step": 383 }, { "epoch": 0.4620938628158845, "grad_norm": 1.7109375, "learning_rate": 1.1188043749568752e-05, "loss": 1.8687, "step": 384 }, { "epoch": 0.46329723225030084, "grad_norm": 8.625, "learning_rate": 1.1150498130235435e-05, "loss": 1.7904, "step": 385 }, { "epoch": 0.4645006016847172, "grad_norm": 2.0625, "learning_rate": 1.1112936067824847e-05, "loss": 1.9042, "step": 386 }, { "epoch": 0.4657039711191336, "grad_norm": 8.75, "learning_rate": 1.1075358099179136e-05, "loss": 1.8215, "step": 387 }, { "epoch": 0.46690734055354993, "grad_norm": 1.7109375, "learning_rate": 1.1037764761367795e-05, "loss": 1.8536, "step": 388 }, { "epoch": 0.4681107099879663, "grad_norm": 8.6875, "learning_rate": 1.1000156591679971e-05, "loss": 1.6523, "step": 389 }, { "epoch": 0.4693140794223827, "grad_norm": 2.125, "learning_rate": 1.0962534127616784e-05, "loss": 1.9216, "step": 390 }, { "epoch": 0.470517448856799, "grad_norm": 9.0, "learning_rate": 1.0924897906883663e-05, "loss": 1.571, "step": 391 }, { "epoch": 0.4717208182912154, "grad_norm": 1.953125, "learning_rate": 1.088724846738264e-05, "loss": 1.8167, "step": 392 }, { "epoch": 0.4729241877256318, "grad_norm": 9.0, "learning_rate": 1.0849586347204677e-05, "loss": 1.8241, "step": 393 }, { "epoch": 0.4741275571600481, "grad_norm": 2.203125, "learning_rate": 1.0811912084621968e-05, "loss": 1.84, "step": 394 }, { "epoch": 0.4753309265944645, "grad_norm": 6.78125, "learning_rate": 1.0774226218080244e-05, "loss": 1.5207, "step": 395 }, { "epoch": 0.47653429602888087, "grad_norm": 2.125, "learning_rate": 1.0736529286191087e-05, "loss": 1.9606, "step": 396 }, { "epoch": 0.4777376654632972, "grad_norm": 8.875, "learning_rate": 1.0698821827724225e-05, "loss": 1.6881, "step": 397 }, { "epoch": 0.4789410348977136, "grad_norm": 1.703125, "learning_rate": 1.0661104381599833e-05, "loss": 1.9175, "step": 398 }, { "epoch": 0.48014440433212996, "grad_norm": 10.6875, "learning_rate": 1.0623377486880831e-05, "loss": 1.7195, "step": 399 }, { "epoch": 0.4813477737665463, "grad_norm": 1.8828125, "learning_rate": 1.058564168276518e-05, "loss": 1.837, "step": 400 }, { "epoch": 0.4825511432009627, "grad_norm": 11.0, "learning_rate": 1.054789750857817e-05, "loss": 1.9014, "step": 401 }, { "epoch": 0.48375451263537905, "grad_norm": 1.7421875, "learning_rate": 1.0510145503764727e-05, "loss": 1.8715, "step": 402 }, { "epoch": 0.48495788206979545, "grad_norm": 7.625, "learning_rate": 1.0472386207881684e-05, "loss": 1.7432, "step": 403 }, { "epoch": 0.4861612515042118, "grad_norm": 2.1875, "learning_rate": 1.0434620160590086e-05, "loss": 1.8686, "step": 404 }, { "epoch": 0.48736462093862815, "grad_norm": 9.25, "learning_rate": 1.0396847901647469e-05, "loss": 1.8893, "step": 405 }, { "epoch": 0.48856799037304455, "grad_norm": 1.6484375, "learning_rate": 1.0359069970900139e-05, "loss": 1.8032, "step": 406 }, { "epoch": 0.4897713598074609, "grad_norm": 7.90625, "learning_rate": 1.0321286908275476e-05, "loss": 1.6637, "step": 407 }, { "epoch": 0.49097472924187724, "grad_norm": 1.953125, "learning_rate": 1.0283499253774201e-05, "loss": 1.8709, "step": 408 }, { "epoch": 0.49217809867629364, "grad_norm": 9.75, "learning_rate": 1.0245707547462654e-05, "loss": 1.7803, "step": 409 }, { "epoch": 0.49338146811071, "grad_norm": 1.7109375, "learning_rate": 1.0207912329465097e-05, "loss": 1.8821, "step": 410 }, { "epoch": 0.49458483754512633, "grad_norm": 10.5, "learning_rate": 1.0170114139955975e-05, "loss": 1.8227, "step": 411 }, { "epoch": 0.49578820697954273, "grad_norm": 1.8828125, "learning_rate": 1.01323135191522e-05, "loss": 1.8133, "step": 412 }, { "epoch": 0.4969915764139591, "grad_norm": 7.78125, "learning_rate": 1.0094511007305445e-05, "loss": 1.4014, "step": 413 }, { "epoch": 0.4981949458483754, "grad_norm": 2.09375, "learning_rate": 1.005670714469439e-05, "loss": 1.9023, "step": 414 }, { "epoch": 0.4993983152827918, "grad_norm": 8.5, "learning_rate": 1.0018902471617037e-05, "loss": 1.9721, "step": 415 }, { "epoch": 0.5006016847172082, "grad_norm": 1.890625, "learning_rate": 9.981097528382964e-06, "loss": 1.8523, "step": 416 }, { "epoch": 0.5018050541516246, "grad_norm": 9.4375, "learning_rate": 9.943292855305611e-06, "loss": 1.4807, "step": 417 }, { "epoch": 0.5030084235860409, "grad_norm": 2.0625, "learning_rate": 9.905488992694558e-06, "loss": 1.8578, "step": 418 }, { "epoch": 0.5042117930204573, "grad_norm": 7.03125, "learning_rate": 9.867686480847801e-06, "loss": 1.5719, "step": 419 }, { "epoch": 0.5054151624548736, "grad_norm": 1.6640625, "learning_rate": 9.829885860044028e-06, "loss": 1.8964, "step": 420 }, { "epoch": 0.50661853188929, "grad_norm": 7.65625, "learning_rate": 9.792087670534908e-06, "loss": 1.6174, "step": 421 }, { "epoch": 0.5078219013237064, "grad_norm": 2.359375, "learning_rate": 9.754292452537348e-06, "loss": 1.8476, "step": 422 }, { "epoch": 0.5090252707581228, "grad_norm": 9.375, "learning_rate": 9.716500746225802e-06, "loss": 1.6067, "step": 423 }, { "epoch": 0.5102286401925391, "grad_norm": 3.296875, "learning_rate": 9.678713091724527e-06, "loss": 1.9678, "step": 424 }, { "epoch": 0.5114320096269555, "grad_norm": 8.5, "learning_rate": 9.640930029099863e-06, "loss": 2.0364, "step": 425 }, { "epoch": 0.5126353790613718, "grad_norm": 1.84375, "learning_rate": 9.603152098352538e-06, "loss": 1.8445, "step": 426 }, { "epoch": 0.5138387484957883, "grad_norm": 8.625, "learning_rate": 9.565379839409916e-06, "loss": 1.9894, "step": 427 }, { "epoch": 0.5150421179302046, "grad_norm": 1.8984375, "learning_rate": 9.527613792118318e-06, "loss": 1.881, "step": 428 }, { "epoch": 0.516245487364621, "grad_norm": 8.625, "learning_rate": 9.489854496235278e-06, "loss": 1.6797, "step": 429 }, { "epoch": 0.5174488567990373, "grad_norm": 2.125, "learning_rate": 9.452102491421835e-06, "loss": 1.8709, "step": 430 }, { "epoch": 0.5186522262334536, "grad_norm": 8.4375, "learning_rate": 9.414358317234826e-06, "loss": 1.4383, "step": 431 }, { "epoch": 0.51985559566787, "grad_norm": 2.4375, "learning_rate": 9.376622513119174e-06, "loss": 1.958, "step": 432 }, { "epoch": 0.5210589651022864, "grad_norm": 7.5, "learning_rate": 9.338895618400168e-06, "loss": 1.6565, "step": 433 }, { "epoch": 0.5222623345367028, "grad_norm": 1.859375, "learning_rate": 9.301178172275776e-06, "loss": 1.8772, "step": 434 }, { "epoch": 0.5234657039711191, "grad_norm": 7.1875, "learning_rate": 9.263470713808917e-06, "loss": 1.7663, "step": 435 }, { "epoch": 0.5246690734055355, "grad_norm": 1.90625, "learning_rate": 9.22577378191976e-06, "loss": 1.8776, "step": 436 }, { "epoch": 0.5258724428399518, "grad_norm": 6.71875, "learning_rate": 9.188087915378037e-06, "loss": 1.5338, "step": 437 }, { "epoch": 0.5270758122743683, "grad_norm": 1.796875, "learning_rate": 9.150413652795325e-06, "loss": 1.8267, "step": 438 }, { "epoch": 0.5282791817087846, "grad_norm": 8.0, "learning_rate": 9.112751532617361e-06, "loss": 1.7629, "step": 439 }, { "epoch": 0.529482551143201, "grad_norm": 1.625, "learning_rate": 9.07510209311634e-06, "loss": 1.8474, "step": 440 }, { "epoch": 0.5306859205776173, "grad_norm": 8.6875, "learning_rate": 9.037465872383219e-06, "loss": 1.7209, "step": 441 }, { "epoch": 0.5318892900120337, "grad_norm": 1.6953125, "learning_rate": 8.999843408320034e-06, "loss": 1.8088, "step": 442 }, { "epoch": 0.53309265944645, "grad_norm": 8.375, "learning_rate": 8.962235238632208e-06, "loss": 1.4698, "step": 443 }, { "epoch": 0.5342960288808665, "grad_norm": 2.1875, "learning_rate": 8.924641900820864e-06, "loss": 1.9038, "step": 444 }, { "epoch": 0.5354993983152828, "grad_norm": 7.65625, "learning_rate": 8.887063932175156e-06, "loss": 1.7247, "step": 445 }, { "epoch": 0.5367027677496992, "grad_norm": 2.265625, "learning_rate": 8.849501869764569e-06, "loss": 1.8362, "step": 446 }, { "epoch": 0.5379061371841155, "grad_norm": 9.3125, "learning_rate": 8.811956250431253e-06, "loss": 1.6821, "step": 447 }, { "epoch": 0.5391095066185319, "grad_norm": 2.015625, "learning_rate": 8.77442761078236e-06, "loss": 1.8706, "step": 448 }, { "epoch": 0.5403128760529483, "grad_norm": 8.25, "learning_rate": 8.73691648718236e-06, "loss": 1.9886, "step": 449 }, { "epoch": 0.5415162454873647, "grad_norm": 2.640625, "learning_rate": 8.699423415745383e-06, "loss": 1.8496, "step": 450 }, { "epoch": 0.542719614921781, "grad_norm": 10.0, "learning_rate": 8.661948932327558e-06, "loss": 1.768, "step": 451 }, { "epoch": 0.5439229843561973, "grad_norm": 1.6796875, "learning_rate": 8.624493572519345e-06, "loss": 1.8441, "step": 452 }, { "epoch": 0.5451263537906137, "grad_norm": 10.125, "learning_rate": 8.587057871637891e-06, "loss": 1.5176, "step": 453 }, { "epoch": 0.54632972322503, "grad_norm": 2.0625, "learning_rate": 8.549642364719373e-06, "loss": 1.9311, "step": 454 }, { "epoch": 0.5475330926594465, "grad_norm": 17.25, "learning_rate": 8.512247586511354e-06, "loss": 2.0458, "step": 455 }, { "epoch": 0.5487364620938628, "grad_norm": 2.1875, "learning_rate": 8.474874071465144e-06, "loss": 1.8916, "step": 456 }, { "epoch": 0.5499398315282792, "grad_norm": 10.125, "learning_rate": 8.437522353728147e-06, "loss": 1.5376, "step": 457 }, { "epoch": 0.5511432009626955, "grad_norm": 2.078125, "learning_rate": 8.400192967136245e-06, "loss": 1.8623, "step": 458 }, { "epoch": 0.5523465703971119, "grad_norm": 7.1875, "learning_rate": 8.36288644520616e-06, "loss": 1.3575, "step": 459 }, { "epoch": 0.5535499398315282, "grad_norm": 1.9375, "learning_rate": 8.325603321127819e-06, "loss": 1.8188, "step": 460 }, { "epoch": 0.5547533092659447, "grad_norm": 9.1875, "learning_rate": 8.288344127756755e-06, "loss": 1.1446, "step": 461 }, { "epoch": 0.555956678700361, "grad_norm": 1.5859375, "learning_rate": 8.251109397606482e-06, "loss": 1.8927, "step": 462 }, { "epoch": 0.5571600481347774, "grad_norm": 12.6875, "learning_rate": 8.213899662840871e-06, "loss": 1.6248, "step": 463 }, { "epoch": 0.5583634175691937, "grad_norm": 2.15625, "learning_rate": 8.176715455266564e-06, "loss": 1.8177, "step": 464 }, { "epoch": 0.5595667870036101, "grad_norm": 9.375, "learning_rate": 8.139557306325359e-06, "loss": 1.7076, "step": 465 }, { "epoch": 0.5607701564380265, "grad_norm": 2.0, "learning_rate": 8.102425747086623e-06, "loss": 1.894, "step": 466 }, { "epoch": 0.5619735258724429, "grad_norm": 8.5625, "learning_rate": 8.065321308239706e-06, "loss": 1.4971, "step": 467 }, { "epoch": 0.5631768953068592, "grad_norm": 2.375, "learning_rate": 8.028244520086338e-06, "loss": 1.8354, "step": 468 }, { "epoch": 0.5643802647412756, "grad_norm": 10.8125, "learning_rate": 7.99119591253307e-06, "loss": 1.3236, "step": 469 }, { "epoch": 0.5655836341756919, "grad_norm": 2.3125, "learning_rate": 7.954176015083687e-06, "loss": 1.8395, "step": 470 }, { "epoch": 0.5667870036101083, "grad_norm": 11.875, "learning_rate": 7.91718535683165e-06, "loss": 1.783, "step": 471 }, { "epoch": 0.5679903730445247, "grad_norm": 2.515625, "learning_rate": 7.88022446645252e-06, "loss": 1.8556, "step": 472 }, { "epoch": 0.5691937424789411, "grad_norm": 8.6875, "learning_rate": 7.843293872196425e-06, "loss": 1.789, "step": 473 }, { "epoch": 0.5703971119133574, "grad_norm": 1.75, "learning_rate": 7.806394101880488e-06, "loss": 1.92, "step": 474 }, { "epoch": 0.5716004813477737, "grad_norm": 8.5625, "learning_rate": 7.769525682881295e-06, "loss": 1.7191, "step": 475 }, { "epoch": 0.5728038507821901, "grad_norm": 1.46875, "learning_rate": 7.73268914212735e-06, "loss": 1.8853, "step": 476 }, { "epoch": 0.5740072202166066, "grad_norm": 9.4375, "learning_rate": 7.695885006091552e-06, "loss": 1.8473, "step": 477 }, { "epoch": 0.5752105896510229, "grad_norm": 1.921875, "learning_rate": 7.659113800783672e-06, "loss": 1.8681, "step": 478 }, { "epoch": 0.5764139590854392, "grad_norm": 7.0625, "learning_rate": 7.622376051742824e-06, "loss": 1.1885, "step": 479 }, { "epoch": 0.5776173285198556, "grad_norm": 2.09375, "learning_rate": 7.585672284029962e-06, "loss": 1.894, "step": 480 }, { "epoch": 0.5788206979542719, "grad_norm": 7.875, "learning_rate": 7.549003022220374e-06, "loss": 1.4822, "step": 481 }, { "epoch": 0.5800240673886883, "grad_norm": 3.140625, "learning_rate": 7.512368790396186e-06, "loss": 1.8693, "step": 482 }, { "epoch": 0.5812274368231047, "grad_norm": 7.9375, "learning_rate": 7.475770112138867e-06, "loss": 1.7295, "step": 483 }, { "epoch": 0.5824308062575211, "grad_norm": 1.84375, "learning_rate": 7.43920751052176e-06, "loss": 1.9136, "step": 484 }, { "epoch": 0.5836341756919374, "grad_norm": 8.5625, "learning_rate": 7.402681508102585e-06, "loss": 1.9603, "step": 485 }, { "epoch": 0.5848375451263538, "grad_norm": 1.8828125, "learning_rate": 7.366192626915982e-06, "loss": 1.783, "step": 486 }, { "epoch": 0.5860409145607701, "grad_norm": 8.625, "learning_rate": 7.329741388466056e-06, "loss": 1.6332, "step": 487 }, { "epoch": 0.5872442839951865, "grad_norm": 1.7109375, "learning_rate": 7.293328313718912e-06, "loss": 1.8852, "step": 488 }, { "epoch": 0.5884476534296029, "grad_norm": 9.5, "learning_rate": 7.256953923095209e-06, "loss": 1.667, "step": 489 }, { "epoch": 0.5896510228640193, "grad_norm": 2.015625, "learning_rate": 7.220618736462739e-06, "loss": 1.8578, "step": 490 }, { "epoch": 0.5908543922984356, "grad_norm": 8.5625, "learning_rate": 7.184323273128981e-06, "loss": 1.6134, "step": 491 }, { "epoch": 0.592057761732852, "grad_norm": 1.640625, "learning_rate": 7.1480680518336766e-06, "loss": 1.8897, "step": 492 }, { "epoch": 0.5932611311672683, "grad_norm": 11.25, "learning_rate": 7.111853590741431e-06, "loss": 2.0435, "step": 493 }, { "epoch": 0.5944645006016848, "grad_norm": 1.9140625, "learning_rate": 7.075680407434289e-06, "loss": 1.8304, "step": 494 }, { "epoch": 0.5956678700361011, "grad_norm": 7.84375, "learning_rate": 7.039549018904362e-06, "loss": 1.7986, "step": 495 }, { "epoch": 0.5968712394705175, "grad_norm": 1.7421875, "learning_rate": 7.0034599415464135e-06, "loss": 1.8652, "step": 496 }, { "epoch": 0.5980746089049338, "grad_norm": 10.625, "learning_rate": 6.967413691150493e-06, "loss": 2.0518, "step": 497 }, { "epoch": 0.5992779783393501, "grad_norm": 3.0, "learning_rate": 6.931410782894563e-06, "loss": 1.8989, "step": 498 }, { "epoch": 0.6004813477737665, "grad_norm": 11.625, "learning_rate": 6.895451731337129e-06, "loss": 1.7888, "step": 499 }, { "epoch": 0.601684717208183, "grad_norm": 2.609375, "learning_rate": 6.859537050409895e-06, "loss": 1.8564, "step": 500 }, { "epoch": 0.6028880866425993, "grad_norm": 8.4375, "learning_rate": 6.823667253410417e-06, "loss": 1.7225, "step": 501 }, { "epoch": 0.6040914560770156, "grad_norm": 2.0, "learning_rate": 6.787842852994757e-06, "loss": 1.8348, "step": 502 }, { "epoch": 0.605294825511432, "grad_norm": 9.625, "learning_rate": 6.752064361170165e-06, "loss": 1.415, "step": 503 }, { "epoch": 0.6064981949458483, "grad_norm": 2.078125, "learning_rate": 6.716332289287759e-06, "loss": 1.8862, "step": 504 }, { "epoch": 0.6077015643802648, "grad_norm": 9.25, "learning_rate": 6.6806471480352175e-06, "loss": 1.9002, "step": 505 }, { "epoch": 0.6089049338146811, "grad_norm": 1.6171875, "learning_rate": 6.64500944742948e-06, "loss": 1.808, "step": 506 }, { "epoch": 0.6101083032490975, "grad_norm": 13.125, "learning_rate": 6.609419696809463e-06, "loss": 1.7093, "step": 507 }, { "epoch": 0.6113116726835138, "grad_norm": 1.9609375, "learning_rate": 6.5738784048287615e-06, "loss": 1.8882, "step": 508 }, { "epoch": 0.6125150421179302, "grad_norm": 13.25, "learning_rate": 6.5383860794484065e-06, "loss": 1.9821, "step": 509 }, { "epoch": 0.6137184115523465, "grad_norm": 2.046875, "learning_rate": 6.502943227929586e-06, "loss": 1.8888, "step": 510 }, { "epoch": 0.614921780986763, "grad_norm": 6.8125, "learning_rate": 6.4675503568263955e-06, "loss": 1.5587, "step": 511 }, { "epoch": 0.6161251504211793, "grad_norm": 1.6328125, "learning_rate": 6.432207971978619e-06, "loss": 1.8976, "step": 512 }, { "epoch": 0.6173285198555957, "grad_norm": 7.84375, "learning_rate": 6.396916578504468e-06, "loss": 1.5405, "step": 513 }, { "epoch": 0.618531889290012, "grad_norm": 1.8125, "learning_rate": 6.3616766807933875e-06, "loss": 1.7942, "step": 514 }, { "epoch": 0.6197352587244284, "grad_norm": 10.75, "learning_rate": 6.326488782498831e-06, "loss": 1.5382, "step": 515 }, { "epoch": 0.6209386281588448, "grad_norm": 1.96875, "learning_rate": 6.291353386531074e-06, "loss": 1.8092, "step": 516 }, { "epoch": 0.6221419975932612, "grad_norm": 8.75, "learning_rate": 6.256270995050026e-06, "loss": 1.6977, "step": 517 }, { "epoch": 0.6233453670276775, "grad_norm": 1.9609375, "learning_rate": 6.221242109458043e-06, "loss": 1.9296, "step": 518 }, { "epoch": 0.6245487364620939, "grad_norm": 7.90625, "learning_rate": 6.186267230392762e-06, "loss": 1.5804, "step": 519 }, { "epoch": 0.6257521058965102, "grad_norm": 2.0625, "learning_rate": 6.151346857719964e-06, "loss": 1.9559, "step": 520 }, { "epoch": 0.6269554753309265, "grad_norm": 6.28125, "learning_rate": 6.116481490526407e-06, "loss": 1.6613, "step": 521 }, { "epoch": 0.628158844765343, "grad_norm": 1.859375, "learning_rate": 6.081671627112704e-06, "loss": 1.894, "step": 522 }, { "epoch": 0.6293622141997594, "grad_norm": 8.625, "learning_rate": 6.046917764986213e-06, "loss": 1.6817, "step": 523 }, { "epoch": 0.6305655836341757, "grad_norm": 1.84375, "learning_rate": 6.012220400853899e-06, "loss": 1.946, "step": 524 }, { "epoch": 0.631768953068592, "grad_norm": 7.9375, "learning_rate": 5.977580030615254e-06, "loss": 1.5564, "step": 525 }, { "epoch": 0.6329723225030084, "grad_norm": 2.171875, "learning_rate": 5.942997149355208e-06, "loss": 1.8387, "step": 526 }, { "epoch": 0.6341756919374247, "grad_norm": 11.4375, "learning_rate": 5.9084722513370485e-06, "loss": 1.6062, "step": 527 }, { "epoch": 0.6353790613718412, "grad_norm": 1.6640625, "learning_rate": 5.874005829995358e-06, "loss": 1.8847, "step": 528 }, { "epoch": 0.6365824308062575, "grad_norm": 7.65625, "learning_rate": 5.839598377928964e-06, "loss": 1.3544, "step": 529 }, { "epoch": 0.6377858002406739, "grad_norm": 2.3125, "learning_rate": 5.8052503868939005e-06, "loss": 1.9021, "step": 530 }, { "epoch": 0.6389891696750902, "grad_norm": 8.75, "learning_rate": 5.7709623477963696e-06, "loss": 1.71, "step": 531 }, { "epoch": 0.6401925391095066, "grad_norm": 1.484375, "learning_rate": 5.736734750685737e-06, "loss": 1.9201, "step": 532 }, { "epoch": 0.641395908543923, "grad_norm": 7.40625, "learning_rate": 5.702568084747513e-06, "loss": 1.5156, "step": 533 }, { "epoch": 0.6425992779783394, "grad_norm": 2.0625, "learning_rate": 5.6684628382963905e-06, "loss": 1.834, "step": 534 }, { "epoch": 0.6438026474127557, "grad_norm": 8.625, "learning_rate": 5.6344194987692304e-06, "loss": 1.7765, "step": 535 }, { "epoch": 0.6450060168471721, "grad_norm": 1.8828125, "learning_rate": 5.60043855271811e-06, "loss": 1.8613, "step": 536 }, { "epoch": 0.6462093862815884, "grad_norm": 8.0, "learning_rate": 5.566520485803388e-06, "loss": 1.89, "step": 537 }, { "epoch": 0.6474127557160048, "grad_norm": 2.0, "learning_rate": 5.53266578278673e-06, "loss": 1.8957, "step": 538 }, { "epoch": 0.6486161251504212, "grad_norm": 11.375, "learning_rate": 5.498874927524196e-06, "loss": 1.7665, "step": 539 }, { "epoch": 0.6498194945848376, "grad_norm": 1.78125, "learning_rate": 5.465148402959339e-06, "loss": 1.8616, "step": 540 }, { "epoch": 0.6510228640192539, "grad_norm": 6.90625, "learning_rate": 5.43148669111627e-06, "loss": 1.4651, "step": 541 }, { "epoch": 0.6522262334536703, "grad_norm": 2.21875, "learning_rate": 5.397890273092807e-06, "loss": 1.8635, "step": 542 }, { "epoch": 0.6534296028880866, "grad_norm": 8.5, "learning_rate": 5.364359629053566e-06, "loss": 1.5246, "step": 543 }, { "epoch": 0.6546329723225031, "grad_norm": 1.9375, "learning_rate": 5.33089523822311e-06, "loss": 1.8426, "step": 544 }, { "epoch": 0.6558363417569194, "grad_norm": 7.3125, "learning_rate": 5.29749757887912e-06, "loss": 1.6715, "step": 545 }, { "epoch": 0.6570397111913358, "grad_norm": 2.09375, "learning_rate": 5.264167128345523e-06, "loss": 1.8278, "step": 546 }, { "epoch": 0.6582430806257521, "grad_norm": 8.75, "learning_rate": 5.230904362985694e-06, "loss": 1.6496, "step": 547 }, { "epoch": 0.6594464500601684, "grad_norm": 1.8984375, "learning_rate": 5.197709758195648e-06, "loss": 1.8517, "step": 548 }, { "epoch": 0.6606498194945848, "grad_norm": 9.0625, "learning_rate": 5.164583788397234e-06, "loss": 1.4924, "step": 549 }, { "epoch": 0.6618531889290012, "grad_norm": 2.015625, "learning_rate": 5.131526927031356e-06, "loss": 1.8934, "step": 550 }, { "epoch": 0.6630565583634176, "grad_norm": 6.96875, "learning_rate": 5.098539646551226e-06, "loss": 1.6779, "step": 551 }, { "epoch": 0.6642599277978339, "grad_norm": 2.328125, "learning_rate": 5.0656224184155764e-06, "loss": 1.9336, "step": 552 }, { "epoch": 0.6654632972322503, "grad_norm": 7.125, "learning_rate": 5.032775713081963e-06, "loss": 1.6682, "step": 553 }, { "epoch": 0.6666666666666666, "grad_norm": 1.96875, "learning_rate": 5.000000000000003e-06, "loss": 1.8408, "step": 554 }, { "epoch": 0.6678700361010831, "grad_norm": 7.875, "learning_rate": 4.967295747604685e-06, "loss": 1.7067, "step": 555 }, { "epoch": 0.6690734055354994, "grad_norm": 1.953125, "learning_rate": 4.934663423309685e-06, "loss": 1.9034, "step": 556 }, { "epoch": 0.6702767749699158, "grad_norm": 8.6875, "learning_rate": 4.902103493500654e-06, "loss": 1.7299, "step": 557 }, { "epoch": 0.6714801444043321, "grad_norm": 2.328125, "learning_rate": 4.869616423528588e-06, "loss": 1.9405, "step": 558 }, { "epoch": 0.6726835138387485, "grad_norm": 8.625, "learning_rate": 4.837202677703149e-06, "loss": 1.6553, "step": 559 }, { "epoch": 0.6738868832731648, "grad_norm": 2.203125, "learning_rate": 4.804862719286044e-06, "loss": 1.888, "step": 560 }, { "epoch": 0.6750902527075813, "grad_norm": 9.6875, "learning_rate": 4.772597010484396e-06, "loss": 1.7892, "step": 561 }, { "epoch": 0.6762936221419976, "grad_norm": 1.8984375, "learning_rate": 4.740406012444153e-06, "loss": 1.887, "step": 562 }, { "epoch": 0.677496991576414, "grad_norm": 10.0, "learning_rate": 4.7082901852434734e-06, "loss": 1.7795, "step": 563 }, { "epoch": 0.6787003610108303, "grad_norm": 2.234375, "learning_rate": 4.6762499878861764e-06, "loss": 1.8282, "step": 564 }, { "epoch": 0.6799037304452467, "grad_norm": 16.0, "learning_rate": 4.644285878295161e-06, "loss": 1.7961, "step": 565 }, { "epoch": 0.681107099879663, "grad_norm": 1.859375, "learning_rate": 4.612398313305867e-06, "loss": 1.9233, "step": 566 }, { "epoch": 0.6823104693140795, "grad_norm": 21.75, "learning_rate": 4.580587748659753e-06, "loss": 1.8356, "step": 567 }, { "epoch": 0.6835138387484958, "grad_norm": 2.03125, "learning_rate": 4.548854638997778e-06, "loss": 1.9244, "step": 568 }, { "epoch": 0.6847172081829122, "grad_norm": 8.375, "learning_rate": 4.517199437853909e-06, "loss": 1.7831, "step": 569 }, { "epoch": 0.6859205776173285, "grad_norm": 2.6875, "learning_rate": 4.485622597648624e-06, "loss": 1.9176, "step": 570 }, { "epoch": 0.6871239470517448, "grad_norm": 6.8125, "learning_rate": 4.454124569682459e-06, "loss": 1.5729, "step": 571 }, { "epoch": 0.6883273164861613, "grad_norm": 1.59375, "learning_rate": 4.4227058041295515e-06, "loss": 1.8203, "step": 572 }, { "epoch": 0.6895306859205776, "grad_norm": 7.4375, "learning_rate": 4.391366750031217e-06, "loss": 1.6795, "step": 573 }, { "epoch": 0.690734055354994, "grad_norm": 2.15625, "learning_rate": 4.3601078552895245e-06, "loss": 1.8442, "step": 574 }, { "epoch": 0.6919374247894103, "grad_norm": 9.0625, "learning_rate": 4.3289295666608865e-06, "loss": 1.7537, "step": 575 }, { "epoch": 0.6931407942238267, "grad_norm": 1.890625, "learning_rate": 4.297832329749687e-06, "loss": 1.8637, "step": 576 }, { "epoch": 0.694344163658243, "grad_norm": 9.875, "learning_rate": 4.2668165890019044e-06, "loss": 1.6435, "step": 577 }, { "epoch": 0.6955475330926595, "grad_norm": 1.625, "learning_rate": 4.235882787698763e-06, "loss": 1.8364, "step": 578 }, { "epoch": 0.6967509025270758, "grad_norm": 8.6875, "learning_rate": 4.205031367950402e-06, "loss": 1.9807, "step": 579 }, { "epoch": 0.6979542719614922, "grad_norm": 1.7890625, "learning_rate": 4.174262770689552e-06, "loss": 1.9126, "step": 580 }, { "epoch": 0.6991576413959085, "grad_norm": 8.0625, "learning_rate": 4.143577435665229e-06, "loss": 1.6958, "step": 581 }, { "epoch": 0.7003610108303249, "grad_norm": 2.1875, "learning_rate": 4.112975801436454e-06, "loss": 1.8707, "step": 582 }, { "epoch": 0.7015643802647413, "grad_norm": 8.6875, "learning_rate": 4.082458305365982e-06, "loss": 1.8376, "step": 583 }, { "epoch": 0.7027677496991577, "grad_norm": 1.8359375, "learning_rate": 4.052025383614061e-06, "loss": 1.8117, "step": 584 }, { "epoch": 0.703971119133574, "grad_norm": 7.4375, "learning_rate": 4.021677471132192e-06, "loss": 1.5338, "step": 585 }, { "epoch": 0.7051744885679904, "grad_norm": 1.875, "learning_rate": 3.991415001656906e-06, "loss": 1.8334, "step": 586 }, { "epoch": 0.7063778580024067, "grad_norm": 7.9375, "learning_rate": 3.9612384077035705e-06, "loss": 1.8104, "step": 587 }, { "epoch": 0.7075812274368231, "grad_norm": 1.8046875, "learning_rate": 3.931148120560211e-06, "loss": 1.8165, "step": 588 }, { "epoch": 0.7087845968712395, "grad_norm": 9.6875, "learning_rate": 3.90114457028134e-06, "loss": 1.7101, "step": 589 }, { "epoch": 0.7099879663056559, "grad_norm": 2.484375, "learning_rate": 3.871228185681822e-06, "loss": 1.8789, "step": 590 }, { "epoch": 0.7111913357400722, "grad_norm": 7.53125, "learning_rate": 3.84139939433074e-06, "loss": 1.6553, "step": 591 }, { "epoch": 0.7123947051744886, "grad_norm": 2.0, "learning_rate": 3.811658622545268e-06, "loss": 1.8035, "step": 592 }, { "epoch": 0.7135980746089049, "grad_norm": 9.0, "learning_rate": 3.782006295384604e-06, "loss": 1.4303, "step": 593 }, { "epoch": 0.7148014440433214, "grad_norm": 2.0625, "learning_rate": 3.7524428366438757e-06, "loss": 1.8162, "step": 594 }, { "epoch": 0.7160048134777377, "grad_norm": 7.90625, "learning_rate": 3.722968668848098e-06, "loss": 1.8102, "step": 595 }, { "epoch": 0.717208182912154, "grad_norm": 1.8203125, "learning_rate": 3.6935842132461307e-06, "loss": 1.8109, "step": 596 }, { "epoch": 0.7184115523465704, "grad_norm": 10.25, "learning_rate": 3.664289889804643e-06, "loss": 1.9009, "step": 597 }, { "epoch": 0.7196149217809867, "grad_norm": 2.703125, "learning_rate": 3.635086117202128e-06, "loss": 1.9618, "step": 598 }, { "epoch": 0.7208182912154031, "grad_norm": 7.4375, "learning_rate": 3.6059733128229125e-06, "loss": 1.6618, "step": 599 }, { "epoch": 0.7220216606498195, "grad_norm": 2.234375, "learning_rate": 3.576951892751197e-06, "loss": 1.8906, "step": 600 }, { "epoch": 0.7232250300842359, "grad_norm": 7.25, "learning_rate": 3.548022271765107e-06, "loss": 1.6154, "step": 601 }, { "epoch": 0.7244283995186522, "grad_norm": 1.90625, "learning_rate": 3.5191848633307545e-06, "loss": 1.856, "step": 602 }, { "epoch": 0.7256317689530686, "grad_norm": 6.90625, "learning_rate": 3.490440079596341e-06, "loss": 1.4066, "step": 603 }, { "epoch": 0.7268351383874849, "grad_norm": 2.046875, "learning_rate": 3.4617883313862633e-06, "loss": 1.8758, "step": 604 }, { "epoch": 0.7280385078219013, "grad_norm": 7.8125, "learning_rate": 3.433230028195239e-06, "loss": 1.4305, "step": 605 }, { "epoch": 0.7292418772563177, "grad_norm": 2.015625, "learning_rate": 3.4047655781824605e-06, "loss": 1.8426, "step": 606 }, { "epoch": 0.7304452466907341, "grad_norm": 8.25, "learning_rate": 3.376395388165762e-06, "loss": 1.6548, "step": 607 }, { "epoch": 0.7316486161251504, "grad_norm": 1.9375, "learning_rate": 3.3481198636157908e-06, "loss": 1.8894, "step": 608 }, { "epoch": 0.7328519855595668, "grad_norm": 7.6875, "learning_rate": 3.3199394086502257e-06, "loss": 1.1895, "step": 609 }, { "epoch": 0.7340553549939831, "grad_norm": 2.609375, "learning_rate": 3.2918544260279985e-06, "loss": 1.8858, "step": 610 }, { "epoch": 0.7352587244283996, "grad_norm": 8.5, "learning_rate": 3.2638653171435387e-06, "loss": 1.6574, "step": 611 }, { "epoch": 0.7364620938628159, "grad_norm": 2.890625, "learning_rate": 3.2359724820210394e-06, "loss": 1.9466, "step": 612 }, { "epoch": 0.7376654632972323, "grad_norm": 7.625, "learning_rate": 3.2081763193087247e-06, "loss": 1.5547, "step": 613 }, { "epoch": 0.7388688327316486, "grad_norm": 1.828125, "learning_rate": 3.180477226273172e-06, "loss": 1.8555, "step": 614 }, { "epoch": 0.740072202166065, "grad_norm": 8.125, "learning_rate": 3.1528755987936188e-06, "loss": 1.4619, "step": 615 }, { "epoch": 0.7412755716004813, "grad_norm": 2.125, "learning_rate": 3.1253718313563207e-06, "loss": 1.8658, "step": 616 }, { "epoch": 0.7424789410348978, "grad_norm": 8.25, "learning_rate": 3.097966317048895e-06, "loss": 1.6176, "step": 617 }, { "epoch": 0.7436823104693141, "grad_norm": 2.15625, "learning_rate": 3.070659447554719e-06, "loss": 1.859, "step": 618 }, { "epoch": 0.7448856799037304, "grad_norm": 9.1875, "learning_rate": 3.0434516131473214e-06, "loss": 1.6177, "step": 619 }, { "epoch": 0.7460890493381468, "grad_norm": 1.9140625, "learning_rate": 3.016343202684807e-06, "loss": 1.8524, "step": 620 }, { "epoch": 0.7472924187725631, "grad_norm": 7.0, "learning_rate": 2.9893346036042968e-06, "loss": 1.6677, "step": 621 }, { "epoch": 0.7484957882069796, "grad_norm": 1.4609375, "learning_rate": 2.962426201916402e-06, "loss": 1.8322, "step": 622 }, { "epoch": 0.7496991576413959, "grad_norm": 7.96875, "learning_rate": 2.9356183821996976e-06, "loss": 1.6472, "step": 623 }, { "epoch": 0.7509025270758123, "grad_norm": 1.8046875, "learning_rate": 2.9089115275952217e-06, "loss": 1.8495, "step": 624 }, { "epoch": 0.7521058965102286, "grad_norm": 12.0625, "learning_rate": 2.882306019801008e-06, "loss": 1.4577, "step": 625 }, { "epoch": 0.753309265944645, "grad_norm": 1.6328125, "learning_rate": 2.855802239066623e-06, "loss": 1.8251, "step": 626 }, { "epoch": 0.7545126353790613, "grad_norm": 9.375, "learning_rate": 2.8294005641877486e-06, "loss": 1.8152, "step": 627 }, { "epoch": 0.7557160048134778, "grad_norm": 2.3125, "learning_rate": 2.8031013725007415e-06, "loss": 1.8742, "step": 628 }, { "epoch": 0.7569193742478941, "grad_norm": 7.96875, "learning_rate": 2.776905039877268e-06, "loss": 1.6975, "step": 629 }, { "epoch": 0.7581227436823105, "grad_norm": 1.6796875, "learning_rate": 2.750811940718906e-06, "loss": 1.7944, "step": 630 }, { "epoch": 0.7593261131167268, "grad_norm": 8.6875, "learning_rate": 2.724822447951814e-06, "loss": 1.7846, "step": 631 }, { "epoch": 0.7605294825511432, "grad_norm": 1.8671875, "learning_rate": 2.6989369330213865e-06, "loss": 1.8771, "step": 632 }, { "epoch": 0.7617328519855595, "grad_norm": 7.5, "learning_rate": 2.6731557658869668e-06, "loss": 1.7278, "step": 633 }, { "epoch": 0.762936221419976, "grad_norm": 2.265625, "learning_rate": 2.647479315016528e-06, "loss": 1.8712, "step": 634 }, { "epoch": 0.7641395908543923, "grad_norm": 8.0, "learning_rate": 2.621907947381438e-06, "loss": 1.5809, "step": 635 }, { "epoch": 0.7653429602888087, "grad_norm": 2.546875, "learning_rate": 2.596442028451194e-06, "loss": 1.8584, "step": 636 }, { "epoch": 0.766546329723225, "grad_norm": 9.25, "learning_rate": 2.5710819221882e-06, "loss": 1.5645, "step": 637 }, { "epoch": 0.7677496991576414, "grad_norm": 1.8984375, "learning_rate": 2.5458279910425865e-06, "loss": 1.9015, "step": 638 }, { "epoch": 0.7689530685920578, "grad_norm": 13.125, "learning_rate": 2.5206805959469984e-06, "loss": 1.8187, "step": 639 }, { "epoch": 0.7701564380264742, "grad_norm": 2.140625, "learning_rate": 2.4956400963114647e-06, "loss": 1.8437, "step": 640 }, { "epoch": 0.7713598074608905, "grad_norm": 8.0625, "learning_rate": 2.4707068500182442e-06, "loss": 1.5862, "step": 641 }, { "epoch": 0.7725631768953068, "grad_norm": 1.890625, "learning_rate": 2.445881213416713e-06, "loss": 1.8355, "step": 642 }, { "epoch": 0.7737665463297232, "grad_norm": 10.0625, "learning_rate": 2.4211635413182845e-06, "loss": 1.7649, "step": 643 }, { "epoch": 0.7749699157641395, "grad_norm": 2.46875, "learning_rate": 2.3965541869913188e-06, "loss": 1.9244, "step": 644 }, { "epoch": 0.776173285198556, "grad_norm": 7.9375, "learning_rate": 2.3720535021560864e-06, "loss": 1.8585, "step": 645 }, { "epoch": 0.7773766546329723, "grad_norm": 1.953125, "learning_rate": 2.3476618369797457e-06, "loss": 1.8733, "step": 646 }, { "epoch": 0.7785800240673887, "grad_norm": 9.875, "learning_rate": 2.32337954007132e-06, "loss": 1.8191, "step": 647 }, { "epoch": 0.779783393501805, "grad_norm": 2.09375, "learning_rate": 2.299206958476731e-06, "loss": 1.8745, "step": 648 }, { "epoch": 0.7809867629362214, "grad_norm": 7.03125, "learning_rate": 2.2751444376738373e-06, "loss": 1.7859, "step": 649 }, { "epoch": 0.7821901323706378, "grad_norm": 1.9453125, "learning_rate": 2.251192321567488e-06, "loss": 1.9035, "step": 650 }, { "epoch": 0.7833935018050542, "grad_norm": 9.5, "learning_rate": 2.2273509524846193e-06, "loss": 1.5491, "step": 651 }, { "epoch": 0.7845968712394705, "grad_norm": 2.078125, "learning_rate": 2.2036206711693508e-06, "loss": 1.8475, "step": 652 }, { "epoch": 0.7858002406738869, "grad_norm": 9.5625, "learning_rate": 2.180001816778118e-06, "loss": 1.7437, "step": 653 }, { "epoch": 0.7870036101083032, "grad_norm": 1.6171875, "learning_rate": 2.1564947268748382e-06, "loss": 1.8555, "step": 654 }, { "epoch": 0.7882069795427196, "grad_norm": 9.0625, "learning_rate": 2.133099737426064e-06, "loss": 1.8266, "step": 655 }, { "epoch": 0.789410348977136, "grad_norm": 1.7734375, "learning_rate": 2.1098171827961965e-06, "loss": 1.8497, "step": 656 }, { "epoch": 0.7906137184115524, "grad_norm": 6.90625, "learning_rate": 2.086647395742709e-06, "loss": 1.7402, "step": 657 }, { "epoch": 0.7918170878459687, "grad_norm": 1.9765625, "learning_rate": 2.0635907074113737e-06, "loss": 1.7807, "step": 658 }, { "epoch": 0.7930204572803851, "grad_norm": 11.125, "learning_rate": 2.040647447331553e-06, "loss": 1.6841, "step": 659 }, { "epoch": 0.7942238267148014, "grad_norm": 1.7734375, "learning_rate": 2.0178179434114674e-06, "loss": 1.8595, "step": 660 }, { "epoch": 0.7954271961492179, "grad_norm": 11.125, "learning_rate": 1.9951025219335183e-06, "loss": 1.7197, "step": 661 }, { "epoch": 0.7966305655836342, "grad_norm": 1.6015625, "learning_rate": 1.972501507549637e-06, "loss": 1.8435, "step": 662 }, { "epoch": 0.7978339350180506, "grad_norm": 12.75, "learning_rate": 1.95001522327662e-06, "loss": 1.6737, "step": 663 }, { "epoch": 0.7990373044524669, "grad_norm": 1.828125, "learning_rate": 1.927643990491528e-06, "loss": 1.8645, "step": 664 }, { "epoch": 0.8002406738868832, "grad_norm": 11.0625, "learning_rate": 1.905388128927098e-06, "loss": 1.7747, "step": 665 }, { "epoch": 0.8014440433212996, "grad_norm": 1.65625, "learning_rate": 1.883247956667157e-06, "loss": 1.885, "step": 666 }, { "epoch": 0.802647412755716, "grad_norm": 6.5625, "learning_rate": 1.8612237901420838e-06, "loss": 1.459, "step": 667 }, { "epoch": 0.8038507821901324, "grad_norm": 2.171875, "learning_rate": 1.839315944124298e-06, "loss": 1.8284, "step": 668 }, { "epoch": 0.8050541516245487, "grad_norm": 7.125, "learning_rate": 1.8175247317237365e-06, "loss": 1.4277, "step": 669 }, { "epoch": 0.8062575210589651, "grad_norm": 1.7734375, "learning_rate": 1.7958504643834062e-06, "loss": 1.8519, "step": 670 }, { "epoch": 0.8074608904933814, "grad_norm": 7.34375, "learning_rate": 1.774293451874909e-06, "loss": 1.4755, "step": 671 }, { "epoch": 0.8086642599277978, "grad_norm": 1.890625, "learning_rate": 1.7528540022940288e-06, "loss": 1.8477, "step": 672 }, { "epoch": 0.8098676293622142, "grad_norm": 10.3125, "learning_rate": 1.731532422056319e-06, "loss": 1.9744, "step": 673 }, { "epoch": 0.8110709987966306, "grad_norm": 1.9609375, "learning_rate": 1.71032901589274e-06, "loss": 1.7982, "step": 674 }, { "epoch": 0.8122743682310469, "grad_norm": 11.4375, "learning_rate": 1.6892440868452763e-06, "loss": 1.5589, "step": 675 }, { "epoch": 0.8134777376654633, "grad_norm": 1.546875, "learning_rate": 1.6682779362626378e-06, "loss": 1.8309, "step": 676 }, { "epoch": 0.8146811070998796, "grad_norm": 11.125, "learning_rate": 1.6474308637959235e-06, "loss": 1.74, "step": 677 }, { "epoch": 0.8158844765342961, "grad_norm": 1.8203125, "learning_rate": 1.6267031673943546e-06, "loss": 1.8395, "step": 678 }, { "epoch": 0.8170878459687124, "grad_norm": 8.1875, "learning_rate": 1.6060951433010186e-06, "loss": 1.5354, "step": 679 }, { "epoch": 0.8182912154031288, "grad_norm": 1.765625, "learning_rate": 1.5856070860486205e-06, "loss": 1.9064, "step": 680 }, { "epoch": 0.8194945848375451, "grad_norm": 8.6875, "learning_rate": 1.5652392884552947e-06, "loss": 1.6362, "step": 681 }, { "epoch": 0.8206979542719615, "grad_norm": 1.375, "learning_rate": 1.544992041620398e-06, "loss": 1.8921, "step": 682 }, { "epoch": 0.8219013237063778, "grad_norm": 12.0, "learning_rate": 1.5248656349203628e-06, "loss": 1.6462, "step": 683 }, { "epoch": 0.8231046931407943, "grad_norm": 1.9921875, "learning_rate": 1.5048603560045549e-06, "loss": 1.8653, "step": 684 }, { "epoch": 0.8243080625752106, "grad_norm": 10.4375, "learning_rate": 1.4849764907911712e-06, "loss": 1.8637, "step": 685 }, { "epoch": 0.825511432009627, "grad_norm": 2.09375, "learning_rate": 1.4652143234631465e-06, "loss": 1.8923, "step": 686 }, { "epoch": 0.8267148014440433, "grad_norm": 7.46875, "learning_rate": 1.4455741364640863e-06, "loss": 1.7424, "step": 687 }, { "epoch": 0.8279181708784596, "grad_norm": 1.6796875, "learning_rate": 1.426056210494241e-06, "loss": 1.8606, "step": 688 }, { "epoch": 0.8291215403128761, "grad_norm": 8.0625, "learning_rate": 1.4066608245064872e-06, "loss": 1.5042, "step": 689 }, { "epoch": 0.8303249097472925, "grad_norm": 2.015625, "learning_rate": 1.3873882557023488e-06, "loss": 1.8733, "step": 690 }, { "epoch": 0.8315282791817088, "grad_norm": 10.25, "learning_rate": 1.3682387795280228e-06, "loss": 1.8601, "step": 691 }, { "epoch": 0.8327316486161251, "grad_norm": 1.6328125, "learning_rate": 1.3492126696704544e-06, "loss": 1.7998, "step": 692 }, { "epoch": 0.8339350180505415, "grad_norm": 7.90625, "learning_rate": 1.3303101980534183e-06, "loss": 1.4791, "step": 693 }, { "epoch": 0.8351383874849578, "grad_norm": 1.625, "learning_rate": 1.3115316348336348e-06, "loss": 1.845, "step": 694 }, { "epoch": 0.8363417569193743, "grad_norm": 9.3125, "learning_rate": 1.2928772483969054e-06, "loss": 1.8637, "step": 695 }, { "epoch": 0.8375451263537906, "grad_norm": 2.3125, "learning_rate": 1.2743473053542842e-06, "loss": 1.9164, "step": 696 }, { "epoch": 0.838748495788207, "grad_norm": 7.78125, "learning_rate": 1.2559420705382664e-06, "loss": 1.6432, "step": 697 }, { "epoch": 0.8399518652226233, "grad_norm": 1.9296875, "learning_rate": 1.237661806998991e-06, "loss": 1.9078, "step": 698 }, { "epoch": 0.8411552346570397, "grad_norm": 8.5625, "learning_rate": 1.2195067760004952e-06, "loss": 1.478, "step": 699 }, { "epoch": 0.8423586040914561, "grad_norm": 1.7734375, "learning_rate": 1.2014772370169747e-06, "loss": 1.8989, "step": 700 }, { "epoch": 0.8435619735258725, "grad_norm": 9.8125, "learning_rate": 1.1835734477290784e-06, "loss": 1.6463, "step": 701 }, { "epoch": 0.8447653429602888, "grad_norm": 2.1875, "learning_rate": 1.1657956640202217e-06, "loss": 1.9037, "step": 702 }, { "epoch": 0.8459687123947052, "grad_norm": 8.4375, "learning_rate": 1.148144139972931e-06, "loss": 1.4785, "step": 703 }, { "epoch": 0.8471720818291215, "grad_norm": 1.953125, "learning_rate": 1.1306191278652112e-06, "loss": 1.8928, "step": 704 }, { "epoch": 0.8483754512635379, "grad_norm": 9.1875, "learning_rate": 1.1132208781669418e-06, "loss": 1.7606, "step": 705 }, { "epoch": 0.8495788206979543, "grad_norm": 1.6015625, "learning_rate": 1.0959496395362946e-06, "loss": 1.9227, "step": 706 }, { "epoch": 0.8507821901323707, "grad_norm": 8.5, "learning_rate": 1.0788056588161854e-06, "loss": 1.3676, "step": 707 }, { "epoch": 0.851985559566787, "grad_norm": 1.6484375, "learning_rate": 1.0617891810307458e-06, "loss": 1.883, "step": 708 }, { "epoch": 0.8531889290012034, "grad_norm": 8.6875, "learning_rate": 1.0449004493818083e-06, "loss": 1.693, "step": 709 }, { "epoch": 0.8543922984356197, "grad_norm": 1.7734375, "learning_rate": 1.0281397052454457e-06, "loss": 1.8337, "step": 710 }, { "epoch": 0.855595667870036, "grad_norm": 10.0, "learning_rate": 1.0115071881685134e-06, "loss": 1.7055, "step": 711 }, { "epoch": 0.8567990373044525, "grad_norm": 1.7421875, "learning_rate": 9.950031358652313e-07, "loss": 1.857, "step": 712 }, { "epoch": 0.8580024067388689, "grad_norm": 6.4375, "learning_rate": 9.786277842137837e-07, "loss": 1.4346, "step": 713 }, { "epoch": 0.8592057761732852, "grad_norm": 2.546875, "learning_rate": 9.623813672529437e-07, "loss": 1.9645, "step": 714 }, { "epoch": 0.8604091456077015, "grad_norm": 7.75, "learning_rate": 9.462641171787313e-07, "loss": 1.6015, "step": 715 }, { "epoch": 0.8616125150421179, "grad_norm": 1.8046875, "learning_rate": 9.302762643411e-07, "loss": 1.8621, "step": 716 }, { "epoch": 0.8628158844765343, "grad_norm": 7.96875, "learning_rate": 9.144180372406342e-07, "loss": 1.4974, "step": 717 }, { "epoch": 0.8640192539109507, "grad_norm": 1.4765625, "learning_rate": 8.986896625253006e-07, "loss": 1.8188, "step": 718 }, { "epoch": 0.865222623345367, "grad_norm": 6.75, "learning_rate": 8.830913649871875e-07, "loss": 1.7289, "step": 719 }, { "epoch": 0.8664259927797834, "grad_norm": 1.8515625, "learning_rate": 8.676233675593038e-07, "loss": 1.8617, "step": 720 }, { "epoch": 0.8676293622141997, "grad_norm": 9.1875, "learning_rate": 8.522858913123944e-07, "loss": 1.7697, "step": 721 }, { "epoch": 0.8688327316486161, "grad_norm": 1.9453125, "learning_rate": 8.370791554517743e-07, "loss": 1.9422, "step": 722 }, { "epoch": 0.8700361010830325, "grad_norm": 7.9375, "learning_rate": 8.220033773142022e-07, "loss": 1.7187, "step": 723 }, { "epoch": 0.8712394705174489, "grad_norm": 1.984375, "learning_rate": 8.070587723647705e-07, "loss": 1.9435, "step": 724 }, { "epoch": 0.8724428399518652, "grad_norm": 7.5, "learning_rate": 7.922455541938245e-07, "loss": 1.7793, "step": 725 }, { "epoch": 0.8736462093862816, "grad_norm": 2.0, "learning_rate": 7.77563934513913e-07, "loss": 1.8336, "step": 726 }, { "epoch": 0.8748495788206979, "grad_norm": 8.9375, "learning_rate": 7.630141231567589e-07, "loss": 1.546, "step": 727 }, { "epoch": 0.8760529482551144, "grad_norm": 1.5625, "learning_rate": 7.485963280702646e-07, "loss": 1.8681, "step": 728 }, { "epoch": 0.8772563176895307, "grad_norm": 11.5625, "learning_rate": 7.343107553155404e-07, "loss": 1.773, "step": 729 }, { "epoch": 0.8784596871239471, "grad_norm": 2.15625, "learning_rate": 7.201576090639529e-07, "loss": 1.9156, "step": 730 }, { "epoch": 0.8796630565583634, "grad_norm": 7.34375, "learning_rate": 7.061370915942101e-07, "loss": 1.6258, "step": 731 }, { "epoch": 0.8808664259927798, "grad_norm": 1.7109375, "learning_rate": 6.922494032894744e-07, "loss": 1.8233, "step": 732 }, { "epoch": 0.8820697954271961, "grad_norm": 9.0625, "learning_rate": 6.784947426344923e-07, "loss": 1.7625, "step": 733 }, { "epoch": 0.8832731648616126, "grad_norm": 2.09375, "learning_rate": 6.648733062127643e-07, "loss": 1.8679, "step": 734 }, { "epoch": 0.8844765342960289, "grad_norm": 9.0625, "learning_rate": 6.513852887037319e-07, "loss": 1.5156, "step": 735 }, { "epoch": 0.8856799037304453, "grad_norm": 1.7421875, "learning_rate": 6.380308828799919e-07, "loss": 1.8636, "step": 736 }, { "epoch": 0.8868832731648616, "grad_norm": 8.125, "learning_rate": 6.248102796045475e-07, "loss": 1.7208, "step": 737 }, { "epoch": 0.8880866425992779, "grad_norm": 2.234375, "learning_rate": 6.117236678280736e-07, "loss": 1.9117, "step": 738 }, { "epoch": 0.8892900120336944, "grad_norm": 13.3125, "learning_rate": 5.98771234586224e-07, "loss": 2.1236, "step": 739 }, { "epoch": 0.8904933814681107, "grad_norm": 1.984375, "learning_rate": 5.859531649969563e-07, "loss": 1.8878, "step": 740 }, { "epoch": 0.8916967509025271, "grad_norm": 9.0, "learning_rate": 5.732696422578787e-07, "loss": 1.488, "step": 741 }, { "epoch": 0.8929001203369434, "grad_norm": 1.7734375, "learning_rate": 5.60720847643641e-07, "loss": 1.8455, "step": 742 }, { "epoch": 0.8941034897713598, "grad_norm": 8.5625, "learning_rate": 5.483069605033365e-07, "loss": 1.7038, "step": 743 }, { "epoch": 0.8953068592057761, "grad_norm": 2.03125, "learning_rate": 5.360281582579474e-07, "loss": 1.8326, "step": 744 }, { "epoch": 0.8965102286401926, "grad_norm": 9.125, "learning_rate": 5.238846163978018e-07, "loss": 1.5662, "step": 745 }, { "epoch": 0.8977135980746089, "grad_norm": 2.28125, "learning_rate": 5.11876508480067e-07, "loss": 1.8304, "step": 746 }, { "epoch": 0.8989169675090253, "grad_norm": 8.4375, "learning_rate": 5.000040061262712e-07, "loss": 1.6297, "step": 747 }, { "epoch": 0.9001203369434416, "grad_norm": 1.828125, "learning_rate": 4.882672790198473e-07, "loss": 1.8719, "step": 748 }, { "epoch": 0.901323706377858, "grad_norm": 9.3125, "learning_rate": 4.766664949037103e-07, "loss": 1.437, "step": 749 }, { "epoch": 0.9025270758122743, "grad_norm": 1.5546875, "learning_rate": 4.652018195778629e-07, "loss": 1.8454, "step": 750 }, { "epoch": 0.9037304452466908, "grad_norm": 9.375, "learning_rate": 4.538734168970149e-07, "loss": 1.9147, "step": 751 }, { "epoch": 0.9049338146811071, "grad_norm": 1.8203125, "learning_rate": 4.4268144876825846e-07, "loss": 1.8809, "step": 752 }, { "epoch": 0.9061371841155235, "grad_norm": 9.125, "learning_rate": 4.3162607514873556e-07, "loss": 1.4797, "step": 753 }, { "epoch": 0.9073405535499398, "grad_norm": 1.7109375, "learning_rate": 4.207074540433631e-07, "loss": 1.874, "step": 754 }, { "epoch": 0.9085439229843562, "grad_norm": 7.71875, "learning_rate": 4.09925741502577e-07, "loss": 1.5552, "step": 755 }, { "epoch": 0.9097472924187726, "grad_norm": 1.5859375, "learning_rate": 3.9928109162008953e-07, "loss": 1.8081, "step": 756 }, { "epoch": 0.910950661853189, "grad_norm": 6.875, "learning_rate": 3.887736565307032e-07, "loss": 1.6507, "step": 757 }, { "epoch": 0.9121540312876053, "grad_norm": 2.4375, "learning_rate": 3.7840358640812036e-07, "loss": 1.8591, "step": 758 }, { "epoch": 0.9133574007220217, "grad_norm": 10.0, "learning_rate": 3.68171029462806e-07, "loss": 1.6767, "step": 759 }, { "epoch": 0.914560770156438, "grad_norm": 1.984375, "learning_rate": 3.580761319398729e-07, "loss": 1.9217, "step": 760 }, { "epoch": 0.9157641395908543, "grad_norm": 8.4375, "learning_rate": 3.481190381169808e-07, "loss": 1.7456, "step": 761 }, { "epoch": 0.9169675090252708, "grad_norm": 1.796875, "learning_rate": 3.3829989030228163e-07, "loss": 1.9188, "step": 762 }, { "epoch": 0.9181708784596871, "grad_norm": 6.84375, "learning_rate": 3.286188288323844e-07, "loss": 1.5842, "step": 763 }, { "epoch": 0.9193742478941035, "grad_norm": 1.7109375, "learning_rate": 3.190759920703512e-07, "loss": 1.8547, "step": 764 }, { "epoch": 0.9205776173285198, "grad_norm": 7.25, "learning_rate": 3.096715164037123e-07, "loss": 1.906, "step": 765 }, { "epoch": 0.9217809867629362, "grad_norm": 1.7734375, "learning_rate": 3.0040553624252844e-07, "loss": 1.8969, "step": 766 }, { "epoch": 0.9229843561973526, "grad_norm": 8.75, "learning_rate": 2.9127818401745833e-07, "loss": 1.4738, "step": 767 }, { "epoch": 0.924187725631769, "grad_norm": 1.578125, "learning_rate": 2.822895901778744e-07, "loss": 1.8895, "step": 768 }, { "epoch": 0.9253910950661853, "grad_norm": 10.5, "learning_rate": 2.7343988318999536e-07, "loss": 1.6742, "step": 769 }, { "epoch": 0.9265944645006017, "grad_norm": 2.125, "learning_rate": 2.6472918953504566e-07, "loss": 1.8744, "step": 770 }, { "epoch": 0.927797833935018, "grad_norm": 7.0, "learning_rate": 2.5615763370745894e-07, "loss": 1.5675, "step": 771 }, { "epoch": 0.9290012033694344, "grad_norm": 1.9765625, "learning_rate": 2.477253382130862e-07, "loss": 1.8616, "step": 772 }, { "epoch": 0.9302045728038508, "grad_norm": 8.375, "learning_rate": 2.394324235674517e-07, "loss": 1.7312, "step": 773 }, { "epoch": 0.9314079422382672, "grad_norm": 2.515625, "learning_rate": 2.3127900829403305e-07, "loss": 1.8756, "step": 774 }, { "epoch": 0.9326113116726835, "grad_norm": 13.625, "learning_rate": 2.2326520892255953e-07, "loss": 1.7167, "step": 775 }, { "epoch": 0.9338146811070999, "grad_norm": 2.1875, "learning_rate": 2.1539113998735094e-07, "loss": 1.822, "step": 776 }, { "epoch": 0.9350180505415162, "grad_norm": 8.75, "learning_rate": 2.0765691402568455e-07, "loss": 1.5693, "step": 777 }, { "epoch": 0.9362214199759326, "grad_norm": 2.203125, "learning_rate": 2.000626415761786e-07, "loss": 1.8813, "step": 778 }, { "epoch": 0.937424789410349, "grad_norm": 11.25, "learning_rate": 1.9260843117721695e-07, "loss": 1.5309, "step": 779 }, { "epoch": 0.9386281588447654, "grad_norm": 2.078125, "learning_rate": 1.8529438936540022e-07, "loss": 1.8917, "step": 780 }, { "epoch": 0.9398315282791817, "grad_norm": 9.6875, "learning_rate": 1.7812062067401713e-07, "loss": 1.0129, "step": 781 }, { "epoch": 0.941034897713598, "grad_norm": 1.8671875, "learning_rate": 1.710872276315556e-07, "loss": 1.802, "step": 782 }, { "epoch": 0.9422382671480144, "grad_norm": 7.71875, "learning_rate": 1.6419431076023505e-07, "loss": 1.8398, "step": 783 }, { "epoch": 0.9434416365824309, "grad_norm": 2.25, "learning_rate": 1.5744196857456874e-07, "loss": 1.9039, "step": 784 }, { "epoch": 0.9446450060168472, "grad_norm": 6.65625, "learning_rate": 1.5083029757995914e-07, "loss": 1.5238, "step": 785 }, { "epoch": 0.9458483754512635, "grad_norm": 2.671875, "learning_rate": 1.4435939227131712e-07, "loss": 1.8906, "step": 786 }, { "epoch": 0.9470517448856799, "grad_norm": 10.125, "learning_rate": 1.3802934513170828e-07, "loss": 1.628, "step": 787 }, { "epoch": 0.9482551143200962, "grad_norm": 1.6953125, "learning_rate": 1.3184024663103755e-07, "loss": 1.8095, "step": 788 }, { "epoch": 0.9494584837545126, "grad_norm": 7.78125, "learning_rate": 1.25792185224749e-07, "loss": 1.3659, "step": 789 }, { "epoch": 0.950661853188929, "grad_norm": 1.7890625, "learning_rate": 1.198852473525669e-07, "loss": 1.9008, "step": 790 }, { "epoch": 0.9518652226233454, "grad_norm": 7.25, "learning_rate": 1.1411951743726002e-07, "loss": 1.8708, "step": 791 }, { "epoch": 0.9530685920577617, "grad_norm": 1.9609375, "learning_rate": 1.0849507788343038e-07, "loss": 1.8636, "step": 792 }, { "epoch": 0.9542719614921781, "grad_norm": 8.9375, "learning_rate": 1.030120090763409e-07, "loss": 1.7204, "step": 793 }, { "epoch": 0.9554753309265944, "grad_norm": 1.6328125, "learning_rate": 9.767038938076511e-08, "loss": 1.8623, "step": 794 }, { "epoch": 0.9566787003610109, "grad_norm": 9.3125, "learning_rate": 9.247029513986482e-08, "loss": 1.9118, "step": 795 }, { "epoch": 0.9578820697954272, "grad_norm": 1.59375, "learning_rate": 8.741180067409982e-08, "loss": 1.888, "step": 796 }, { "epoch": 0.9590854392298436, "grad_norm": 8.625, "learning_rate": 8.249497828016872e-08, "loss": 1.6362, "step": 797 }, { "epoch": 0.9602888086642599, "grad_norm": 2.0625, "learning_rate": 7.771989822997206e-08, "loss": 1.8862, "step": 798 }, { "epoch": 0.9614921780986763, "grad_norm": 6.65625, "learning_rate": 7.30866287696097e-08, "loss": 1.5293, "step": 799 }, { "epoch": 0.9626955475330926, "grad_norm": 1.8828125, "learning_rate": 6.859523611840612e-08, "loss": 1.8288, "step": 800 }, { "epoch": 0.9638989169675091, "grad_norm": 7.25, "learning_rate": 6.424578446796004e-08, "loss": 1.7089, "step": 801 }, { "epoch": 0.9651022864019254, "grad_norm": 1.71875, "learning_rate": 6.003833598123287e-08, "loss": 1.8746, "step": 802 }, { "epoch": 0.9663056558363418, "grad_norm": 7.84375, "learning_rate": 5.597295079165621e-08, "loss": 1.4806, "step": 803 }, { "epoch": 0.9675090252707581, "grad_norm": 1.9140625, "learning_rate": 5.204968700227242e-08, "loss": 1.8057, "step": 804 }, { "epoch": 0.9687123947051745, "grad_norm": 9.625, "learning_rate": 4.826860068490868e-08, "loss": 1.6486, "step": 805 }, { "epoch": 0.9699157641395909, "grad_norm": 1.7890625, "learning_rate": 4.4629745879367634e-08, "loss": 1.8444, "step": 806 }, { "epoch": 0.9711191335740073, "grad_norm": 8.75, "learning_rate": 4.113317459266242e-08, "loss": 1.6037, "step": 807 }, { "epoch": 0.9723225030084236, "grad_norm": 1.8515625, "learning_rate": 3.777893679827061e-08, "loss": 1.8095, "step": 808 }, { "epoch": 0.97352587244284, "grad_norm": 8.375, "learning_rate": 3.456708043541812e-08, "loss": 1.8942, "step": 809 }, { "epoch": 0.9747292418772563, "grad_norm": 1.5859375, "learning_rate": 3.1497651408399774e-08, "loss": 1.8051, "step": 810 }, { "epoch": 0.9759326113116726, "grad_norm": 11.0, "learning_rate": 2.8570693585914246e-08, "loss": 1.8053, "step": 811 }, { "epoch": 0.9771359807460891, "grad_norm": 2.109375, "learning_rate": 2.578624880044567e-08, "loss": 1.9297, "step": 812 }, { "epoch": 0.9783393501805054, "grad_norm": 7.78125, "learning_rate": 2.314435684766081e-08, "loss": 1.8373, "step": 813 }, { "epoch": 0.9795427196149218, "grad_norm": 1.703125, "learning_rate": 2.0645055485842837e-08, "loss": 1.9003, "step": 814 }, { "epoch": 0.9807460890493381, "grad_norm": 7.0, "learning_rate": 1.8288380435349527e-08, "loss": 1.6641, "step": 815 }, { "epoch": 0.9819494584837545, "grad_norm": 2.140625, "learning_rate": 1.6074365378105915e-08, "loss": 1.8634, "step": 816 }, { "epoch": 0.9831528279181708, "grad_norm": 8.5, "learning_rate": 1.400304195711688e-08, "loss": 1.5292, "step": 817 }, { "epoch": 0.9843561973525873, "grad_norm": 2.125, "learning_rate": 1.2074439776021962e-08, "loss": 1.8817, "step": 818 }, { "epoch": 0.9855595667870036, "grad_norm": 6.9375, "learning_rate": 1.0288586398670141e-08, "loss": 1.5644, "step": 819 }, { "epoch": 0.98676293622142, "grad_norm": 1.71875, "learning_rate": 8.64550734872016e-09, "loss": 1.8427, "step": 820 }, { "epoch": 0.9879663056558363, "grad_norm": 7.75, "learning_rate": 7.145226109286363e-09, "loss": 1.5548, "step": 821 }, { "epoch": 0.9891696750902527, "grad_norm": 2.078125, "learning_rate": 5.787764122592299e-09, "loss": 1.9276, "step": 822 }, { "epoch": 0.9903730445246691, "grad_norm": 8.75, "learning_rate": 4.573140789672082e-09, "loss": 1.8879, "step": 823 }, { "epoch": 0.9915764139590855, "grad_norm": 1.9375, "learning_rate": 3.5013734700883874e-09, "loss": 1.8432, "step": 824 }, { "epoch": 0.9927797833935018, "grad_norm": 9.0625, "learning_rate": 2.5724774816870966e-09, "loss": 1.3733, "step": 825 }, { "epoch": 0.9939831528279182, "grad_norm": 1.9140625, "learning_rate": 1.7864661003774708e-09, "loss": 1.822, "step": 826 }, { "epoch": 0.9951865222623345, "grad_norm": 6.9375, "learning_rate": 1.1433505599434126e-09, "loss": 1.4707, "step": 827 }, { "epoch": 0.9963898916967509, "grad_norm": 1.5546875, "learning_rate": 6.431400518780439e-10, "loss": 1.8107, "step": 828 }, { "epoch": 0.9975932611311673, "grad_norm": 7.375, "learning_rate": 2.8584172526047173e-10, "loss": 1.6397, "step": 829 }, { "epoch": 0.9987966305655837, "grad_norm": 2.125, "learning_rate": 7.146068664698469e-11, "loss": 1.8256, "step": 830 }, { "epoch": 1.0, "grad_norm": 8.5, "learning_rate": 0.0, "loss": 1.7932, "step": 831 } ], "logging_steps": 1, "max_steps": 831, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 7.262293843351962e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }