{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9997177001976099,
  "eval_steps": 133,
  "global_step": 1328,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0007527994730403689,
      "grad_norm": 36.36020673883064,
      "learning_rate": 5.000000000000001e-07,
      "loss": 2.0808,
      "step": 1
    },
    {
      "epoch": 0.0015055989460807378,
      "grad_norm": 41.49766742673561,
      "learning_rate": 1.0000000000000002e-06,
      "loss": 2.0787,
      "step": 2
    },
    {
      "epoch": 0.0022583984191211067,
      "grad_norm": 29.396842470362714,
      "learning_rate": 1.5e-06,
      "loss": 1.9621,
      "step": 3
    },
    {
      "epoch": 0.0030111978921614755,
      "grad_norm": 22.162520153974473,
      "learning_rate": 2.0000000000000003e-06,
      "loss": 1.8945,
      "step": 4
    },
    {
      "epoch": 0.0037639973652018443,
      "grad_norm": 29.318403975263223,
      "learning_rate": 2.5e-06,
      "loss": 2.1412,
      "step": 5
    },
    {
      "epoch": 0.0045167968382422135,
      "grad_norm": 18.707385510288105,
      "learning_rate": 3e-06,
      "loss": 2.1158,
      "step": 6
    },
    {
      "epoch": 0.005269596311282582,
      "grad_norm": 19.505666042431155,
      "learning_rate": 3.5e-06,
      "loss": 1.8249,
      "step": 7
    },
    {
      "epoch": 0.006022395784322951,
      "grad_norm": 12.946325162286865,
      "learning_rate": 4.000000000000001e-06,
      "loss": 1.7453,
      "step": 8
    },
    {
      "epoch": 0.00677519525736332,
      "grad_norm": 48.622192972274625,
      "learning_rate": 4.5e-06,
      "loss": 1.6962,
      "step": 9
    },
    {
      "epoch": 0.007527994730403689,
      "grad_norm": 19.565469972794624,
      "learning_rate": 5e-06,
      "loss": 1.6878,
      "step": 10
    },
    {
      "epoch": 0.008280794203444058,
      "grad_norm": 75.67524968927123,
      "learning_rate": 5.500000000000001e-06,
      "loss": 1.6624,
      "step": 11
    },
    {
      "epoch": 0.009033593676484427,
      "grad_norm": 30.445369285190505,
      "learning_rate": 6e-06,
      "loss": 1.6926,
      "step": 12
    },
    {
      "epoch": 0.009786393149524796,
      "grad_norm": 9.286678675822756,
      "learning_rate": 6.5000000000000004e-06,
      "loss": 1.493,
      "step": 13
    },
    {
      "epoch": 0.010539192622565164,
      "grad_norm": 8.800345354001589,
      "learning_rate": 7e-06,
      "loss": 1.4314,
      "step": 14
    },
    {
      "epoch": 0.011291992095605533,
      "grad_norm": 17.285960388196333,
      "learning_rate": 7.500000000000001e-06,
      "loss": 1.552,
      "step": 15
    },
    {
      "epoch": 0.012044791568645902,
      "grad_norm": 9.20149362304735,
      "learning_rate": 8.000000000000001e-06,
      "loss": 1.4517,
      "step": 16
    },
    {
      "epoch": 0.012797591041686271,
      "grad_norm": 12.357494553067204,
      "learning_rate": 8.5e-06,
      "loss": 1.5248,
      "step": 17
    },
    {
      "epoch": 0.01355039051472664,
      "grad_norm": 5.930012552972285,
      "learning_rate": 9e-06,
      "loss": 1.3537,
      "step": 18
    },
    {
      "epoch": 0.014303189987767008,
      "grad_norm": 7.406734075765084,
      "learning_rate": 9.5e-06,
      "loss": 1.437,
      "step": 19
    },
    {
      "epoch": 0.015055989460807377,
      "grad_norm": 7.89125237928406,
      "learning_rate": 1e-05,
      "loss": 1.4079,
      "step": 20
    },
    {
      "epoch": 0.015808788933847746,
      "grad_norm": 9.23489645591984,
      "learning_rate": 1.0500000000000001e-05,
      "loss": 1.4561,
      "step": 21
    },
    {
      "epoch": 0.016561588406888116,
      "grad_norm": 6.965625824093193,
      "learning_rate": 1.1000000000000001e-05,
      "loss": 1.3439,
      "step": 22
    },
    {
      "epoch": 0.017314387879928485,
      "grad_norm": 7.080158120837606,
      "learning_rate": 1.15e-05,
      "loss": 1.3862,
      "step": 23
    },
    {
      "epoch": 0.018067187352968854,
      "grad_norm": 7.7009738518492235,
      "learning_rate": 1.2e-05,
      "loss": 1.3187,
      "step": 24
    },
    {
      "epoch": 0.018819986826009223,
      "grad_norm": 6.953266502376254,
      "learning_rate": 1.25e-05,
      "loss": 1.4262,
      "step": 25
    },
    {
      "epoch": 0.019572786299049592,
      "grad_norm": 5.092442953921055,
      "learning_rate": 1.3000000000000001e-05,
      "loss": 1.2701,
      "step": 26
    },
    {
      "epoch": 0.020325585772089958,
      "grad_norm": 5.561143584438669,
      "learning_rate": 1.3500000000000001e-05,
      "loss": 1.4132,
      "step": 27
    },
    {
      "epoch": 0.021078385245130327,
      "grad_norm": 9.291190855019563,
      "learning_rate": 1.4e-05,
      "loss": 1.3376,
      "step": 28
    },
    {
      "epoch": 0.021831184718170697,
      "grad_norm": 6.741096041673811,
      "learning_rate": 1.45e-05,
      "loss": 1.3535,
      "step": 29
    },
    {
      "epoch": 0.022583984191211066,
      "grad_norm": 6.345428967677617,
      "learning_rate": 1.5000000000000002e-05,
      "loss": 1.281,
      "step": 30
    },
    {
      "epoch": 0.023336783664251435,
      "grad_norm": 6.240759122218503,
      "learning_rate": 1.55e-05,
      "loss": 1.3036,
      "step": 31
    },
    {
      "epoch": 0.024089583137291804,
      "grad_norm": 7.655198830811222,
      "learning_rate": 1.6000000000000003e-05,
      "loss": 1.3841,
      "step": 32
    },
    {
      "epoch": 0.024842382610332173,
      "grad_norm": 5.018774253201613,
      "learning_rate": 1.65e-05,
      "loss": 1.278,
      "step": 33
    },
    {
      "epoch": 0.025595182083372543,
      "grad_norm": 5.460243707216422,
      "learning_rate": 1.7e-05,
      "loss": 1.3331,
      "step": 34
    },
    {
      "epoch": 0.026347981556412912,
      "grad_norm": 4.624338654763614,
      "learning_rate": 1.7500000000000002e-05,
      "loss": 1.2542,
      "step": 35
    },
    {
      "epoch": 0.02710078102945328,
      "grad_norm": 7.701628120385831,
      "learning_rate": 1.8e-05,
      "loss": 1.298,
      "step": 36
    },
    {
      "epoch": 0.027853580502493647,
      "grad_norm": 5.136489226357356,
      "learning_rate": 1.8500000000000002e-05,
      "loss": 1.1864,
      "step": 37
    },
    {
      "epoch": 0.028606379975534016,
      "grad_norm": 6.874578456975717,
      "learning_rate": 1.9e-05,
      "loss": 1.2732,
      "step": 38
    },
    {
      "epoch": 0.029359179448574385,
      "grad_norm": 5.811036013054745,
      "learning_rate": 1.95e-05,
      "loss": 1.2806,
      "step": 39
    },
    {
      "epoch": 0.030111978921614754,
      "grad_norm": 5.51821322763505,
      "learning_rate": 2e-05,
      "loss": 1.3575,
      "step": 40
    },
    {
      "epoch": 0.030864778394655124,
      "grad_norm": 6.6228648443339475,
      "learning_rate": 1.999997025336748e-05,
      "loss": 1.2004,
      "step": 41
    },
    {
      "epoch": 0.03161757786769549,
      "grad_norm": 5.963103240408259,
      "learning_rate": 1.9999881013646893e-05,
      "loss": 1.2557,
      "step": 42
    },
    {
      "epoch": 0.03237037734073586,
      "grad_norm": 6.519925159232462,
      "learning_rate": 1.999973228136915e-05,
      "loss": 1.2194,
      "step": 43
    },
    {
      "epoch": 0.03312317681377623,
      "grad_norm": 5.898983728146134,
      "learning_rate": 1.9999524057419116e-05,
      "loss": 1.228,
      "step": 44
    },
    {
      "epoch": 0.0338759762868166,
      "grad_norm": 7.997013591911161,
      "learning_rate": 1.9999256343035577e-05,
      "loss": 1.2152,
      "step": 45
    },
    {
      "epoch": 0.03462877575985697,
      "grad_norm": 11.068558967470823,
      "learning_rate": 1.9998929139811257e-05,
      "loss": 1.2682,
      "step": 46
    },
    {
      "epoch": 0.03538157523289734,
      "grad_norm": 6.025642126702577,
      "learning_rate": 1.9998542449692794e-05,
      "loss": 1.2186,
      "step": 47
    },
    {
      "epoch": 0.03613437470593771,
      "grad_norm": 5.1234335564506015,
      "learning_rate": 1.999809627498073e-05,
      "loss": 1.0775,
      "step": 48
    },
    {
      "epoch": 0.03688717417897808,
      "grad_norm": 6.9589319498297835,
      "learning_rate": 1.9997590618329507e-05,
      "loss": 1.1942,
      "step": 49
    },
    {
      "epoch": 0.037639973652018446,
      "grad_norm": 4.899408391501564,
      "learning_rate": 1.999702548274744e-05,
      "loss": 1.1736,
      "step": 50
    },
    {
      "epoch": 0.038392773125058816,
      "grad_norm": 6.523950562265359,
      "learning_rate": 1.999640087159671e-05,
      "loss": 1.1497,
      "step": 51
    },
    {
      "epoch": 0.039145572598099185,
      "grad_norm": 6.080223023749499,
      "learning_rate": 1.999571678859333e-05,
      "loss": 1.1685,
      "step": 52
    },
    {
      "epoch": 0.03989837207113955,
      "grad_norm": 4.810923884376871,
      "learning_rate": 1.9994973237807133e-05,
      "loss": 1.1577,
      "step": 53
    },
    {
      "epoch": 0.040651171544179916,
      "grad_norm": 9.585902843233736,
      "learning_rate": 1.999417022366174e-05,
      "loss": 1.2533,
      "step": 54
    },
    {
      "epoch": 0.041403971017220285,
      "grad_norm": 7.121805505321927,
      "learning_rate": 1.9993307750934555e-05,
      "loss": 1.2452,
      "step": 55
    },
    {
      "epoch": 0.042156770490260655,
      "grad_norm": 5.119582554676219,
      "learning_rate": 1.99923858247567e-05,
      "loss": 1.1369,
      "step": 56
    },
    {
      "epoch": 0.042909569963301024,
      "grad_norm": 6.992826625922773,
      "learning_rate": 1.999140445061302e-05,
      "loss": 1.0986,
      "step": 57
    },
    {
      "epoch": 0.04366236943634139,
      "grad_norm": 4.249338096365068,
      "learning_rate": 1.9990363634342032e-05,
      "loss": 1.1033,
      "step": 58
    },
    {
      "epoch": 0.04441516890938176,
      "grad_norm": 5.775070285941309,
      "learning_rate": 1.9989263382135882e-05,
      "loss": 1.1086,
      "step": 59
    },
    {
      "epoch": 0.04516796838242213,
      "grad_norm": 6.663400587483444,
      "learning_rate": 1.9988103700540345e-05,
      "loss": 1.1803,
      "step": 60
    },
    {
      "epoch": 0.0459207678554625,
      "grad_norm": 5.183870397182474,
      "learning_rate": 1.998688459645473e-05,
      "loss": 1.1531,
      "step": 61
    },
    {
      "epoch": 0.04667356732850287,
      "grad_norm": 3.5395456165525427,
      "learning_rate": 1.9985606077131895e-05,
      "loss": 1.0054,
      "step": 62
    },
    {
      "epoch": 0.04742636680154324,
      "grad_norm": 3.8150354882109503,
      "learning_rate": 1.998426815017817e-05,
      "loss": 1.1239,
      "step": 63
    },
    {
      "epoch": 0.04817916627458361,
      "grad_norm": 5.03910810217068,
      "learning_rate": 1.998287082355331e-05,
      "loss": 1.1786,
      "step": 64
    },
    {
      "epoch": 0.04893196574762398,
      "grad_norm": 5.5510267998080645,
      "learning_rate": 1.9981414105570473e-05,
      "loss": 1.1524,
      "step": 65
    },
    {
      "epoch": 0.04968476522066435,
      "grad_norm": 4.565720771118459,
      "learning_rate": 1.997989800489615e-05,
      "loss": 1.154,
      "step": 66
    },
    {
      "epoch": 0.050437564693704716,
      "grad_norm": 4.530477664369838,
      "learning_rate": 1.997832253055012e-05,
      "loss": 1.1039,
      "step": 67
    },
    {
      "epoch": 0.051190364166745085,
      "grad_norm": 4.123668763760377,
      "learning_rate": 1.9976687691905394e-05,
      "loss": 1.15,
      "step": 68
    },
    {
      "epoch": 0.051943163639785454,
      "grad_norm": 7.025385301726715,
      "learning_rate": 1.997499349868816e-05,
      "loss": 1.0612,
      "step": 69
    },
    {
      "epoch": 0.052695963112825824,
      "grad_norm": 4.176310522867936,
      "learning_rate": 1.997323996097772e-05,
      "loss": 1.0894,
      "step": 70
    },
    {
      "epoch": 0.05344876258586619,
      "grad_norm": 3.246977668511193,
      "learning_rate": 1.9971427089206458e-05,
      "loss": 1.1196,
      "step": 71
    },
    {
      "epoch": 0.05420156205890656,
      "grad_norm": 4.176862011800632,
      "learning_rate": 1.9969554894159723e-05,
      "loss": 1.1921,
      "step": 72
    },
    {
      "epoch": 0.054954361531946924,
      "grad_norm": 8.805463366473532,
      "learning_rate": 1.9967623386975826e-05,
      "loss": 1.0246,
      "step": 73
    },
    {
      "epoch": 0.05570716100498729,
      "grad_norm": 3.64619595168792,
      "learning_rate": 1.996563257914593e-05,
      "loss": 1.0822,
      "step": 74
    },
    {
      "epoch": 0.05645996047802766,
      "grad_norm": 4.217544603504828,
      "learning_rate": 1.9963582482514003e-05,
      "loss": 1.1187,
      "step": 75
    },
    {
      "epoch": 0.05721275995106803,
      "grad_norm": 5.678985309075052,
      "learning_rate": 1.9961473109276735e-05,
      "loss": 1.2067,
      "step": 76
    },
    {
      "epoch": 0.0579655594241084,
      "grad_norm": 5.001296565321351,
      "learning_rate": 1.9959304471983477e-05,
      "loss": 1.1527,
      "step": 77
    },
    {
      "epoch": 0.05871835889714877,
      "grad_norm": 5.604515835465151,
      "learning_rate": 1.9957076583536166e-05,
      "loss": 1.1761,
      "step": 78
    },
    {
      "epoch": 0.05947115837018914,
      "grad_norm": 5.095593228417783,
      "learning_rate": 1.995478945718923e-05,
      "loss": 1.1628,
      "step": 79
    },
    {
      "epoch": 0.06022395784322951,
      "grad_norm": 6.597427071965454,
      "learning_rate": 1.9952443106549535e-05,
      "loss": 1.2414,
      "step": 80
    },
    {
      "epoch": 0.06097675731626988,
      "grad_norm": 6.9875665495378225,
      "learning_rate": 1.9950037545576288e-05,
      "loss": 1.2225,
      "step": 81
    },
    {
      "epoch": 0.06172955678931025,
      "grad_norm": 4.687344949995792,
      "learning_rate": 1.994757278858095e-05,
      "loss": 1.1402,
      "step": 82
    },
    {
      "epoch": 0.062482356262350616,
      "grad_norm": 5.277541712626479,
      "learning_rate": 1.994504885022717e-05,
      "loss": 1.1022,
      "step": 83
    },
    {
      "epoch": 0.06323515573539099,
      "grad_norm": 2.9552521360386907,
      "learning_rate": 1.9942465745530687e-05,
      "loss": 1.0843,
      "step": 84
    },
    {
      "epoch": 0.06398795520843135,
      "grad_norm": 3.454331479730833,
      "learning_rate": 1.9939823489859226e-05,
      "loss": 1.0417,
      "step": 85
    },
    {
      "epoch": 0.06474075468147172,
      "grad_norm": 6.494959979859043,
      "learning_rate": 1.9937122098932428e-05,
      "loss": 1.1774,
      "step": 86
    },
    {
      "epoch": 0.06549355415451209,
      "grad_norm": 5.232583927463086,
      "learning_rate": 1.9934361588821757e-05,
      "loss": 1.1037,
      "step": 87
    },
    {
      "epoch": 0.06624635362755246,
      "grad_norm": 4.2279381075948175,
      "learning_rate": 1.993154197595038e-05,
      "loss": 1.1474,
      "step": 88
    },
    {
      "epoch": 0.06699915310059283,
      "grad_norm": 4.148924762168968,
      "learning_rate": 1.99286632770931e-05,
      "loss": 1.1898,
      "step": 89
    },
    {
      "epoch": 0.0677519525736332,
      "grad_norm": 5.154636179011585,
      "learning_rate": 1.9925725509376236e-05,
      "loss": 1.1342,
      "step": 90
    },
    {
      "epoch": 0.06850475204667357,
      "grad_norm": 5.217031244017699,
      "learning_rate": 1.9922728690277528e-05,
      "loss": 1.1692,
      "step": 91
    },
    {
      "epoch": 0.06925755151971394,
      "grad_norm": 4.338496940394976,
      "learning_rate": 1.991967283762603e-05,
      "loss": 1.141,
      "step": 92
    },
    {
      "epoch": 0.07001035099275431,
      "grad_norm": 4.197923670398518,
      "learning_rate": 1.9916557969602007e-05,
      "loss": 1.1289,
      "step": 93
    },
    {
      "epoch": 0.07076315046579468,
      "grad_norm": 3.743275306079982,
      "learning_rate": 1.991338410473683e-05,
      "loss": 1.1073,
      "step": 94
    },
    {
      "epoch": 0.07151594993883505,
      "grad_norm": 8.539568294151485,
      "learning_rate": 1.991015126191285e-05,
      "loss": 1.1257,
      "step": 95
    },
    {
      "epoch": 0.07226874941187542,
      "grad_norm": 3.5627327495855856,
      "learning_rate": 1.9906859460363307e-05,
      "loss": 1.063,
      "step": 96
    },
    {
      "epoch": 0.07302154888491579,
      "grad_norm": 3.7469430274257376,
      "learning_rate": 1.9903508719672208e-05,
      "loss": 1.041,
      "step": 97
    },
    {
      "epoch": 0.07377434835795615,
      "grad_norm": 5.4214290416554025,
      "learning_rate": 1.9900099059774197e-05,
      "loss": 1.1798,
      "step": 98
    },
    {
      "epoch": 0.07452714783099652,
      "grad_norm": 6.3484542327964615,
      "learning_rate": 1.989663050095446e-05,
      "loss": 1.0986,
      "step": 99
    },
    {
      "epoch": 0.07527994730403689,
      "grad_norm": 3.8840684282304982,
      "learning_rate": 1.989310306384858e-05,
      "loss": 1.1509,
      "step": 100
    },
    {
      "epoch": 0.07603274677707726,
      "grad_norm": 6.221356396939684,
      "learning_rate": 1.9889516769442436e-05,
      "loss": 1.0952,
      "step": 101
    },
    {
      "epoch": 0.07678554625011763,
      "grad_norm": 4.917299687739317,
      "learning_rate": 1.988587163907206e-05,
      "loss": 1.1515,
      "step": 102
    },
    {
      "epoch": 0.077538345723158,
      "grad_norm": 3.71254487396025,
      "learning_rate": 1.988216769442353e-05,
      "loss": 1.1358,
      "step": 103
    },
    {
      "epoch": 0.07829114519619837,
      "grad_norm": 5.144719861197378,
      "learning_rate": 1.9878404957532817e-05,
      "loss": 1.1711,
      "step": 104
    },
    {
      "epoch": 0.07904394466923872,
      "grad_norm": 3.9343091087557105,
      "learning_rate": 1.987458345078567e-05,
      "loss": 1.1338,
      "step": 105
    },
    {
      "epoch": 0.0797967441422791,
      "grad_norm": 5.493391217443812,
      "learning_rate": 1.9870703196917485e-05,
      "loss": 1.1356,
      "step": 106
    },
    {
      "epoch": 0.08054954361531946,
      "grad_norm": 4.196852899871084,
      "learning_rate": 1.9866764219013154e-05,
      "loss": 1.1412,
      "step": 107
    },
    {
      "epoch": 0.08130234308835983,
      "grad_norm": 4.031891342725685,
      "learning_rate": 1.986276654050695e-05,
      "loss": 1.2285,
      "step": 108
    },
    {
      "epoch": 0.0820551425614002,
      "grad_norm": 5.3508039600144865,
      "learning_rate": 1.985871018518236e-05,
      "loss": 1.2115,
      "step": 109
    },
    {
      "epoch": 0.08280794203444057,
      "grad_norm": 4.108835495999991,
      "learning_rate": 1.9854595177171968e-05,
      "loss": 1.1451,
      "step": 110
    },
    {
      "epoch": 0.08356074150748094,
      "grad_norm": 3.302036630754478,
      "learning_rate": 1.9850421540957307e-05,
      "loss": 1.0757,
      "step": 111
    },
    {
      "epoch": 0.08431354098052131,
      "grad_norm": 4.414268844516725,
      "learning_rate": 1.984618930136869e-05,
      "loss": 1.0881,
      "step": 112
    },
    {
      "epoch": 0.08506634045356168,
      "grad_norm": 3.700539962241975,
      "learning_rate": 1.98418984835851e-05,
      "loss": 1.144,
      "step": 113
    },
    {
      "epoch": 0.08581913992660205,
      "grad_norm": 4.9358231906298204,
      "learning_rate": 1.9837549113134015e-05,
      "loss": 1.038,
      "step": 114
    },
    {
      "epoch": 0.08657193939964242,
      "grad_norm": 5.645738001161152,
      "learning_rate": 1.9833141215891253e-05,
      "loss": 1.2081,
      "step": 115
    },
    {
      "epoch": 0.08732473887268279,
      "grad_norm": 5.119377292254036,
      "learning_rate": 1.9828674818080837e-05,
      "loss": 1.1226,
      "step": 116
    },
    {
      "epoch": 0.08807753834572316,
      "grad_norm": 4.1763020003723526,
      "learning_rate": 1.9824149946274827e-05,
      "loss": 1.1374,
      "step": 117
    },
    {
      "epoch": 0.08883033781876352,
      "grad_norm": 5.503112263483898,
      "learning_rate": 1.981956662739316e-05,
      "loss": 1.1168,
      "step": 118
    },
    {
      "epoch": 0.0895831372918039,
      "grad_norm": 3.9664193299971147,
      "learning_rate": 1.98149248887035e-05,
      "loss": 1.1612,
      "step": 119
    },
    {
      "epoch": 0.09033593676484426,
      "grad_norm": 4.46872589770535,
      "learning_rate": 1.9810224757821063e-05,
      "loss": 1.0124,
      "step": 120
    },
    {
      "epoch": 0.09108873623788463,
      "grad_norm": 3.7293231569422702,
      "learning_rate": 1.9805466262708464e-05,
      "loss": 1.1548,
      "step": 121
    },
    {
      "epoch": 0.091841535710925,
      "grad_norm": 4.012485005147592,
      "learning_rate": 1.9800649431675544e-05,
      "loss": 1.1773,
      "step": 122
    },
    {
      "epoch": 0.09259433518396537,
      "grad_norm": 3.5463279826981102,
      "learning_rate": 1.9795774293379206e-05,
      "loss": 1.11,
      "step": 123
    },
    {
      "epoch": 0.09334713465700574,
      "grad_norm": 3.528951778129194,
      "learning_rate": 1.979084087682323e-05,
      "loss": 1.2064,
      "step": 124
    },
    {
      "epoch": 0.09409993413004611,
      "grad_norm": 10.1058424194676,
      "learning_rate": 1.9785849211358133e-05,
      "loss": 1.1444,
      "step": 125
    },
    {
      "epoch": 0.09485273360308648,
      "grad_norm": 6.315992672709579,
      "learning_rate": 1.9780799326680956e-05,
      "loss": 1.1438,
      "step": 126
    },
    {
      "epoch": 0.09560553307612685,
      "grad_norm": 4.060283476245672,
      "learning_rate": 1.9775691252835113e-05,
      "loss": 1.0999,
      "step": 127
    },
    {
      "epoch": 0.09635833254916722,
      "grad_norm": 3.8976057736287313,
      "learning_rate": 1.9770525020210204e-05,
      "loss": 1.1222,
      "step": 128
    },
    {
      "epoch": 0.09711113202220759,
      "grad_norm": 3.9139807812132905,
      "learning_rate": 1.9765300659541837e-05,
      "loss": 1.1164,
      "step": 129
    },
    {
      "epoch": 0.09786393149524795,
      "grad_norm": 4.995087522269767,
      "learning_rate": 1.976001820191143e-05,
      "loss": 1.085,
      "step": 130
    },
    {
      "epoch": 0.09861673096828832,
      "grad_norm": 3.7833873291414823,
      "learning_rate": 1.9754677678746064e-05,
      "loss": 1.0634,
      "step": 131
    },
    {
      "epoch": 0.0993695304413287,
      "grad_norm": 3.334670976301107,
      "learning_rate": 1.9749279121818235e-05,
      "loss": 1.1077,
      "step": 132
    },
    {
      "epoch": 0.10012232991436906,
      "grad_norm": 7.0827146935611065,
      "learning_rate": 1.9743822563245738e-05,
      "loss": 1.1753,
      "step": 133
    },
    {
      "epoch": 0.10012232991436906,
      "eval_loss": 1.0733484029769897,
      "eval_runtime": 600.4408,
      "eval_samples_per_second": 29.786,
      "eval_steps_per_second": 0.466,
      "step": 133
    },
    {
      "epoch": 0.10087512938740943,
      "grad_norm": 3.600867409556725,
      "learning_rate": 1.973830803549141e-05,
      "loss": 0.9933,
      "step": 134
    },
    {
      "epoch": 0.1016279288604498,
      "grad_norm": 3.8689385067834148,
      "learning_rate": 1.9732735571362985e-05,
      "loss": 1.1034,
      "step": 135
    },
    {
      "epoch": 0.10238072833349017,
      "grad_norm": 6.883779237321949,
      "learning_rate": 1.972710520401287e-05,
      "loss": 1.1141,
      "step": 136
    },
    {
      "epoch": 0.10313352780653054,
      "grad_norm": 4.762968226804512,
      "learning_rate": 1.972141696693795e-05,
      "loss": 1.2243,
      "step": 137
    },
    {
      "epoch": 0.10388632727957091,
      "grad_norm": 3.6511081137242902,
      "learning_rate": 1.9715670893979416e-05,
      "loss": 1.0188,
      "step": 138
    },
    {
      "epoch": 0.10463912675261128,
      "grad_norm": 3.5691298823695914,
      "learning_rate": 1.9709867019322528e-05,
      "loss": 1.0551,
      "step": 139
    },
    {
      "epoch": 0.10539192622565165,
      "grad_norm": 3.5475531225746444,
      "learning_rate": 1.9704005377496428e-05,
      "loss": 1.0743,
      "step": 140
    },
    {
      "epoch": 0.10614472569869202,
      "grad_norm": 3.791897303162606,
      "learning_rate": 1.969808600337394e-05,
      "loss": 1.066,
      "step": 141
    },
    {
      "epoch": 0.10689752517173239,
      "grad_norm": 4.693893833681282,
      "learning_rate": 1.969210893217136e-05,
      "loss": 1.1729,
      "step": 142
    },
    {
      "epoch": 0.10765032464477275,
      "grad_norm": 3.2325270058743,
      "learning_rate": 1.9686074199448222e-05,
      "loss": 1.1003,
      "step": 143
    },
    {
      "epoch": 0.10840312411781312,
      "grad_norm": 3.546773416480843,
      "learning_rate": 1.967998184110713e-05,
      "loss": 1.1224,
      "step": 144
    },
    {
      "epoch": 0.10915592359085349,
      "grad_norm": 4.8504525234744245,
      "learning_rate": 1.967383189339352e-05,
      "loss": 1.2487,
      "step": 145
    },
    {
      "epoch": 0.10990872306389385,
      "grad_norm": 3.224089352304579,
      "learning_rate": 1.9667624392895423e-05,
      "loss": 1.0354,
      "step": 146
    },
    {
      "epoch": 0.11066152253693422,
      "grad_norm": 4.260944538348364,
      "learning_rate": 1.9661359376543295e-05,
      "loss": 1.1538,
      "step": 147
    },
    {
      "epoch": 0.11141432200997459,
      "grad_norm": 3.7968699711076965,
      "learning_rate": 1.9655036881609763e-05,
      "loss": 1.1139,
      "step": 148
    },
    {
      "epoch": 0.11216712148301496,
      "grad_norm": 6.106144924923444,
      "learning_rate": 1.9648656945709413e-05,
      "loss": 1.1584,
      "step": 149
    },
    {
      "epoch": 0.11291992095605533,
      "grad_norm": 3.862077116537031,
      "learning_rate": 1.9642219606798566e-05,
      "loss": 1.0728,
      "step": 150
    },
    {
      "epoch": 0.1136727204290957,
      "grad_norm": 3.984219630530602,
      "learning_rate": 1.9635724903175055e-05,
      "loss": 1.1206,
      "step": 151
    },
    {
      "epoch": 0.11442551990213606,
      "grad_norm": 3.669494644891549,
      "learning_rate": 1.9629172873477995e-05,
      "loss": 1.083,
      "step": 152
    },
    {
      "epoch": 0.11517831937517643,
      "grad_norm": 5.42829688475088,
      "learning_rate": 1.9622563556687545e-05,
      "loss": 1.1588,
      "step": 153
    },
    {
      "epoch": 0.1159311188482168,
      "grad_norm": 4.79232246776243,
      "learning_rate": 1.961589699212469e-05,
      "loss": 1.2102,
      "step": 154
    },
    {
      "epoch": 0.11668391832125717,
      "grad_norm": 11.405826556390412,
      "learning_rate": 1.9609173219450998e-05,
      "loss": 1.109,
      "step": 155
    },
    {
      "epoch": 0.11743671779429754,
      "grad_norm": 5.458296218745196,
      "learning_rate": 1.960239227866839e-05,
      "loss": 1.0762,
      "step": 156
    },
    {
      "epoch": 0.11818951726733791,
      "grad_norm": 3.7544350915582854,
      "learning_rate": 1.9595554210118896e-05,
      "loss": 1.1603,
      "step": 157
    },
    {
      "epoch": 0.11894231674037828,
      "grad_norm": 22.120555226575323,
      "learning_rate": 1.9588659054484417e-05,
      "loss": 1.0958,
      "step": 158
    },
    {
      "epoch": 0.11969511621341865,
      "grad_norm": 4.148885042117614,
      "learning_rate": 1.9581706852786492e-05,
      "loss": 1.1157,
      "step": 159
    },
    {
      "epoch": 0.12044791568645902,
      "grad_norm": 3.9732316992633328,
      "learning_rate": 1.9574697646386027e-05,
      "loss": 1.1421,
      "step": 160
    },
    {
      "epoch": 0.12120071515949939,
      "grad_norm": 3.4050988239013296,
      "learning_rate": 1.9567631476983088e-05,
      "loss": 1.1144,
      "step": 161
    },
    {
      "epoch": 0.12195351463253976,
      "grad_norm": 3.6985764801817465,
      "learning_rate": 1.9560508386616624e-05,
      "loss": 1.0498,
      "step": 162
    },
    {
      "epoch": 0.12270631410558012,
      "grad_norm": 3.8528386734401434,
      "learning_rate": 1.9553328417664223e-05,
      "loss": 1.0844,
      "step": 163
    },
    {
      "epoch": 0.1234591135786205,
      "grad_norm": 3.2819992244778184,
      "learning_rate": 1.954609161284186e-05,
      "loss": 1.1187,
      "step": 164
    },
    {
      "epoch": 0.12421191305166086,
      "grad_norm": 3.22953606650776,
      "learning_rate": 1.953879801520366e-05,
      "loss": 1.0609,
      "step": 165
    },
    {
      "epoch": 0.12496471252470123,
      "grad_norm": 6.494288008441105,
      "learning_rate": 1.953144766814161e-05,
      "loss": 1.0842,
      "step": 166
    },
    {
      "epoch": 0.1257175119977416,
      "grad_norm": 4.888963654063504,
      "learning_rate": 1.9524040615385324e-05,
      "loss": 1.1902,
      "step": 167
    },
    {
      "epoch": 0.12647031147078197,
      "grad_norm": 3.6311602708602257,
      "learning_rate": 1.951657690100178e-05,
      "loss": 1.0839,
      "step": 168
    },
    {
      "epoch": 0.12722311094382233,
      "grad_norm": 4.00682080860383,
      "learning_rate": 1.950905656939505e-05,
      "loss": 1.1657,
      "step": 169
    },
    {
      "epoch": 0.1279759104168627,
      "grad_norm": 3.890665804371254,
      "learning_rate": 1.9501479665306046e-05,
      "loss": 1.052,
      "step": 170
    },
    {
      "epoch": 0.12872870988990306,
      "grad_norm": 3.4307975760371585,
      "learning_rate": 1.949384623381224e-05,
      "loss": 1.0919,
      "step": 171
    },
    {
      "epoch": 0.12948150936294345,
      "grad_norm": 4.0455339119695655,
      "learning_rate": 1.9486156320327406e-05,
      "loss": 1.1103,
      "step": 172
    },
    {
      "epoch": 0.1302343088359838,
      "grad_norm": 3.4165115341264296,
      "learning_rate": 1.947840997060136e-05,
      "loss": 1.0946,
      "step": 173
    },
    {
      "epoch": 0.13098710830902419,
      "grad_norm": 4.65681514038902,
      "learning_rate": 1.9470607230719654e-05,
      "loss": 1.1848,
      "step": 174
    },
    {
      "epoch": 0.13173990778206454,
      "grad_norm": 3.9568518534057193,
      "learning_rate": 1.9462748147103342e-05,
      "loss": 1.1567,
      "step": 175
    },
    {
      "epoch": 0.13249270725510492,
      "grad_norm": 4.955380650232782,
      "learning_rate": 1.945483276650868e-05,
      "loss": 1.2039,
      "step": 176
    },
    {
      "epoch": 0.13324550672814528,
      "grad_norm": 3.9264372250335158,
      "learning_rate": 1.9446861136026846e-05,
      "loss": 1.04,
      "step": 177
    },
    {
      "epoch": 0.13399830620118566,
      "grad_norm": 3.405903263698621,
      "learning_rate": 1.9438833303083677e-05,
      "loss": 1.058,
      "step": 178
    },
    {
      "epoch": 0.13475110567422602,
      "grad_norm": 3.9674907430691544,
      "learning_rate": 1.943074931543937e-05,
      "loss": 1.1056,
      "step": 179
    },
    {
      "epoch": 0.1355039051472664,
      "grad_norm": 3.5452086691685096,
      "learning_rate": 1.9422609221188208e-05,
      "loss": 1.0868,
      "step": 180
    },
    {
      "epoch": 0.13625670462030676,
      "grad_norm": 7.650572420624207,
      "learning_rate": 1.9414413068758266e-05,
      "loss": 1.1428,
      "step": 181
    },
    {
      "epoch": 0.13700950409334714,
      "grad_norm": 3.999012518334001,
      "learning_rate": 1.9406160906911137e-05,
      "loss": 1.1722,
      "step": 182
    },
    {
      "epoch": 0.1377623035663875,
      "grad_norm": 3.038212939029898,
      "learning_rate": 1.939785278474162e-05,
      "loss": 1.1286,
      "step": 183
    },
    {
      "epoch": 0.13851510303942788,
      "grad_norm": 3.906578885384695,
      "learning_rate": 1.938948875167745e-05,
      "loss": 1.1803,
      "step": 184
    },
    {
      "epoch": 0.13926790251246823,
      "grad_norm": 3.3509854123706293,
      "learning_rate": 1.9381068857478994e-05,
      "loss": 1.0973,
      "step": 185
    },
    {
      "epoch": 0.14002070198550862,
      "grad_norm": 4.003782189974324,
      "learning_rate": 1.937259315223894e-05,
      "loss": 1.0838,
      "step": 186
    },
    {
      "epoch": 0.14077350145854897,
      "grad_norm": 3.8245109394744583,
      "learning_rate": 1.9364061686382042e-05,
      "loss": 1.1094,
      "step": 187
    },
    {
      "epoch": 0.14152630093158935,
      "grad_norm": 3.013745925688022,
      "learning_rate": 1.9355474510664763e-05,
      "loss": 1.0676,
      "step": 188
    },
    {
      "epoch": 0.1422791004046297,
      "grad_norm": 4.3642368726250425,
      "learning_rate": 1.934683167617502e-05,
      "loss": 1.1752,
      "step": 189
    },
    {
      "epoch": 0.1430318998776701,
      "grad_norm": 4.122723903005718,
      "learning_rate": 1.933813323433186e-05,
      "loss": 1.1305,
      "step": 190
    },
    {
      "epoch": 0.14378469935071045,
      "grad_norm": 4.253380654597697,
      "learning_rate": 1.9329379236885145e-05,
      "loss": 1.0763,
      "step": 191
    },
    {
      "epoch": 0.14453749882375083,
      "grad_norm": 4.458868784252592,
      "learning_rate": 1.9320569735915273e-05,
      "loss": 1.0436,
      "step": 192
    },
    {
      "epoch": 0.1452902982967912,
      "grad_norm": 3.901071568651419,
      "learning_rate": 1.9311704783832835e-05,
      "loss": 1.1949,
      "step": 193
    },
    {
      "epoch": 0.14604309776983157,
      "grad_norm": 3.129172072359172,
      "learning_rate": 1.9302784433378333e-05,
      "loss": 1.1172,
      "step": 194
    },
    {
      "epoch": 0.14679589724287193,
      "grad_norm": 3.8443786959818045,
      "learning_rate": 1.9293808737621837e-05,
      "loss": 1.2054,
      "step": 195
    },
    {
      "epoch": 0.1475486967159123,
      "grad_norm": 6.00072077234027,
      "learning_rate": 1.9284777749962696e-05,
      "loss": 1.1121,
      "step": 196
    },
    {
      "epoch": 0.14830149618895266,
      "grad_norm": 3.7432081921844773,
      "learning_rate": 1.9275691524129203e-05,
      "loss": 1.0584,
      "step": 197
    },
    {
      "epoch": 0.14905429566199305,
      "grad_norm": 3.4135705273299934,
      "learning_rate": 1.926655011417828e-05,
      "loss": 1.0901,
      "step": 198
    },
    {
      "epoch": 0.1498070951350334,
      "grad_norm": 3.4177123004615817,
      "learning_rate": 1.9257353574495164e-05,
      "loss": 1.1038,
      "step": 199
    },
    {
      "epoch": 0.15055989460807379,
      "grad_norm": 3.0832234189207504,
      "learning_rate": 1.9248101959793066e-05,
      "loss": 1.0317,
      "step": 200
    },
    {
      "epoch": 0.15131269408111414,
      "grad_norm": 3.291295738727208,
      "learning_rate": 1.9238795325112867e-05,
      "loss": 1.2094,
      "step": 201
    },
    {
      "epoch": 0.15206549355415452,
      "grad_norm": 3.4703930043192,
      "learning_rate": 1.9229433725822776e-05,
      "loss": 1.1485,
      "step": 202
    },
    {
      "epoch": 0.15281829302719488,
      "grad_norm": 3.927008914244205,
      "learning_rate": 1.9220017217618006e-05,
      "loss": 1.0951,
      "step": 203
    },
    {
      "epoch": 0.15357109250023526,
      "grad_norm": 3.346783632612943,
      "learning_rate": 1.921054585652043e-05,
      "loss": 1.0405,
      "step": 204
    },
    {
      "epoch": 0.15432389197327562,
      "grad_norm": 3.732718718454638,
      "learning_rate": 1.9201019698878272e-05,
      "loss": 1.072,
      "step": 205
    },
    {
      "epoch": 0.155076691446316,
      "grad_norm": 2.8305022942366502,
      "learning_rate": 1.9191438801365763e-05,
      "loss": 1.047,
      "step": 206
    },
    {
      "epoch": 0.15582949091935636,
      "grad_norm": 3.470154225042611,
      "learning_rate": 1.9181803220982776e-05,
      "loss": 1.1342,
      "step": 207
    },
    {
      "epoch": 0.15658229039239674,
      "grad_norm": 3.2370363710969032,
      "learning_rate": 1.917211301505453e-05,
      "loss": 1.0424,
      "step": 208
    },
    {
      "epoch": 0.1573350898654371,
      "grad_norm": 4.036275798333279,
      "learning_rate": 1.916236824123123e-05,
      "loss": 1.1305,
      "step": 209
    },
    {
      "epoch": 0.15808788933847745,
      "grad_norm": 3.105762174117261,
      "learning_rate": 1.915256895748771e-05,
      "loss": 1.0866,
      "step": 210
    },
    {
      "epoch": 0.15884068881151783,
      "grad_norm": 3.241485339921148,
      "learning_rate": 1.914271522212311e-05,
      "loss": 1.0438,
      "step": 211
    },
    {
      "epoch": 0.1595934882845582,
      "grad_norm": 3.7672919645423084,
      "learning_rate": 1.9132807093760523e-05,
      "loss": 1.0428,
      "step": 212
    },
    {
      "epoch": 0.16034628775759857,
      "grad_norm": 3.0743803531471614,
      "learning_rate": 1.9122844631346632e-05,
      "loss": 1.1082,
      "step": 213
    },
    {
      "epoch": 0.16109908723063893,
      "grad_norm": 4.63250823250084,
      "learning_rate": 1.9112827894151386e-05,
      "loss": 1.0954,
      "step": 214
    },
    {
      "epoch": 0.1618518867036793,
      "grad_norm": 3.122767531926535,
      "learning_rate": 1.9102756941767625e-05,
      "loss": 1.0046,
      "step": 215
    },
    {
      "epoch": 0.16260468617671966,
      "grad_norm": 3.268308565573597,
      "learning_rate": 1.9092631834110723e-05,
      "loss": 1.0834,
      "step": 216
    },
    {
      "epoch": 0.16335748564976005,
      "grad_norm": 3.106344816610958,
      "learning_rate": 1.9082452631418265e-05,
      "loss": 1.0597,
      "step": 217
    },
    {
      "epoch": 0.1641102851228004,
      "grad_norm": 2.894735322482825,
      "learning_rate": 1.9072219394249644e-05,
      "loss": 1.0627,
      "step": 218
    },
    {
      "epoch": 0.1648630845958408,
      "grad_norm": 3.1750259084537324,
      "learning_rate": 1.9061932183485726e-05,
      "loss": 0.9958,
      "step": 219
    },
    {
      "epoch": 0.16561588406888114,
      "grad_norm": 3.3713532561993897,
      "learning_rate": 1.9051591060328496e-05,
      "loss": 1.1432,
      "step": 220
    },
    {
      "epoch": 0.16636868354192152,
      "grad_norm": 3.1214606959314875,
      "learning_rate": 1.9041196086300666e-05,
      "loss": 1.1035,
      "step": 221
    },
    {
      "epoch": 0.16712148301496188,
      "grad_norm": 4.111611609451444,
      "learning_rate": 1.903074732324533e-05,
      "loss": 1.1142,
      "step": 222
    },
    {
      "epoch": 0.16787428248800226,
      "grad_norm": 6.34744510464487,
      "learning_rate": 1.902024483332559e-05,
      "loss": 1.1981,
      "step": 223
    },
    {
      "epoch": 0.16862708196104262,
      "grad_norm": 3.853858890566725,
      "learning_rate": 1.900968867902419e-05,
      "loss": 1.1083,
      "step": 224
    },
    {
      "epoch": 0.169379881434083,
      "grad_norm": 3.0972363845249316,
      "learning_rate": 1.8999078923143142e-05,
      "loss": 1.1272,
      "step": 225
    },
    {
      "epoch": 0.17013268090712336,
      "grad_norm": 6.697298803805668,
      "learning_rate": 1.8988415628803345e-05,
      "loss": 1.078,
      "step": 226
    },
    {
      "epoch": 0.17088548038016374,
      "grad_norm": 3.320222370823791,
      "learning_rate": 1.8977698859444217e-05,
      "loss": 1.0549,
      "step": 227
    },
    {
      "epoch": 0.1716382798532041,
      "grad_norm": 3.7206790432455006,
      "learning_rate": 1.8966928678823317e-05,
      "loss": 1.1347,
      "step": 228
    },
    {
      "epoch": 0.17239107932624448,
      "grad_norm": 3.290103204055493,
      "learning_rate": 1.8956105151015966e-05,
      "loss": 1.0179,
      "step": 229
    },
    {
      "epoch": 0.17314387879928483,
      "grad_norm": 3.1386966405501564,
      "learning_rate": 1.894522834041487e-05,
      "loss": 1.1059,
      "step": 230
    },
    {
      "epoch": 0.17389667827232522,
      "grad_norm": 3.2706699814852875,
      "learning_rate": 1.8934298311729728e-05,
      "loss": 1.0172,
      "step": 231
    },
    {
      "epoch": 0.17464947774536557,
      "grad_norm": 3.8269561638109355,
      "learning_rate": 1.8923315129986838e-05,
      "loss": 1.1137,
      "step": 232
    },
    {
      "epoch": 0.17540227721840596,
      "grad_norm": 3.772999914244335,
      "learning_rate": 1.8912278860528742e-05,
      "loss": 1.1287,
      "step": 233
    },
    {
      "epoch": 0.1761550766914463,
      "grad_norm": 5.414372276284227,
      "learning_rate": 1.890118956901381e-05,
      "loss": 1.1443,
      "step": 234
    },
    {
      "epoch": 0.1769078761644867,
      "grad_norm": 3.263109591052546,
      "learning_rate": 1.8890047321415856e-05,
      "loss": 1.0893,
      "step": 235
    },
    {
      "epoch": 0.17766067563752705,
      "grad_norm": 3.3607503214338856,
      "learning_rate": 1.8878852184023754e-05,
      "loss": 1.0175,
      "step": 236
    },
    {
      "epoch": 0.17841347511056743,
      "grad_norm": 4.631748841741912,
      "learning_rate": 1.8867604223441027e-05,
      "loss": 1.1528,
      "step": 237
    },
    {
      "epoch": 0.1791662745836078,
      "grad_norm": 3.7210788472140837,
      "learning_rate": 1.885630350658546e-05,
      "loss": 1.1066,
      "step": 238
    },
    {
      "epoch": 0.17991907405664817,
      "grad_norm": 3.1758770450597247,
      "learning_rate": 1.884495010068872e-05,
      "loss": 1.103,
      "step": 239
    },
    {
      "epoch": 0.18067187352968853,
      "grad_norm": 3.4141865931283037,
      "learning_rate": 1.8833544073295918e-05,
      "loss": 1.091,
      "step": 240
    },
    {
      "epoch": 0.1814246730027289,
      "grad_norm": 3.432096224130023,
      "learning_rate": 1.8822085492265235e-05,
      "loss": 1.1407,
      "step": 241
    },
    {
      "epoch": 0.18217747247576926,
      "grad_norm": 3.3201010603309573,
      "learning_rate": 1.8810574425767512e-05,
      "loss": 1.1171,
      "step": 242
    },
    {
      "epoch": 0.18293027194880965,
      "grad_norm": 3.142047941577179,
      "learning_rate": 1.879901094228584e-05,
      "loss": 1.0008,
      "step": 243
    },
    {
      "epoch": 0.18368307142185,
      "grad_norm": 4.428870556760251,
      "learning_rate": 1.8787395110615163e-05,
      "loss": 1.1599,
      "step": 244
    },
    {
      "epoch": 0.18443587089489039,
      "grad_norm": 3.42062935475352,
      "learning_rate": 1.877572699986185e-05,
      "loss": 1.0318,
      "step": 245
    },
    {
      "epoch": 0.18518867036793074,
      "grad_norm": 2.532845389080576,
      "learning_rate": 1.8764006679443306e-05,
      "loss": 1.0513,
      "step": 246
    },
    {
      "epoch": 0.18594146984097112,
      "grad_norm": 3.4902131357709663,
      "learning_rate": 1.8752234219087538e-05,
      "loss": 1.0851,
      "step": 247
    },
    {
      "epoch": 0.18669426931401148,
      "grad_norm": 3.9584887752768396,
      "learning_rate": 1.8740409688832762e-05,
      "loss": 1.1399,
      "step": 248
    },
    {
      "epoch": 0.18744706878705186,
      "grad_norm": 3.369982891463805,
      "learning_rate": 1.8728533159026972e-05,
      "loss": 1.182,
      "step": 249
    },
    {
      "epoch": 0.18819986826009222,
      "grad_norm": 4.416503857133181,
      "learning_rate": 1.8716604700327516e-05,
      "loss": 1.0763,
      "step": 250
    },
    {
      "epoch": 0.18895266773313257,
      "grad_norm": 2.7507954310960114,
      "learning_rate": 1.8704624383700686e-05,
      "loss": 1.1158,
      "step": 251
    },
    {
      "epoch": 0.18970546720617296,
      "grad_norm": 3.1857061931174875,
      "learning_rate": 1.8692592280421305e-05,
      "loss": 1.1041,
      "step": 252
    },
    {
      "epoch": 0.1904582666792133,
      "grad_norm": 4.04760329251056,
      "learning_rate": 1.8680508462072282e-05,
      "loss": 1.0831,
      "step": 253
    },
    {
      "epoch": 0.1912110661522537,
      "grad_norm": 3.882827890480107,
      "learning_rate": 1.8668373000544197e-05,
      "loss": 1.1167,
      "step": 254
    },
    {
      "epoch": 0.19196386562529405,
      "grad_norm": 3.287198676998183,
      "learning_rate": 1.865618596803487e-05,
      "loss": 1.0305,
      "step": 255
    },
    {
      "epoch": 0.19271666509833443,
      "grad_norm": 3.604326797700127,
      "learning_rate": 1.8643947437048944e-05,
      "loss": 1.0032,
      "step": 256
    },
    {
      "epoch": 0.1934694645713748,
      "grad_norm": 4.379872775610173,
      "learning_rate": 1.863165748039743e-05,
      "loss": 1.008,
      "step": 257
    },
    {
      "epoch": 0.19422226404441517,
      "grad_norm": 3.369497382545173,
      "learning_rate": 1.8619316171197292e-05,
      "loss": 1.067,
      "step": 258
    },
    {
      "epoch": 0.19497506351745553,
      "grad_norm": 4.120227014926847,
      "learning_rate": 1.8606923582871007e-05,
      "loss": 1.0974,
      "step": 259
    },
    {
      "epoch": 0.1957278629904959,
      "grad_norm": 3.0494014526642608,
      "learning_rate": 1.859447978914614e-05,
      "loss": 1.0551,
      "step": 260
    },
    {
      "epoch": 0.19648066246353627,
      "grad_norm": 3.3910652042019724,
      "learning_rate": 1.8581984864054866e-05,
      "loss": 1.0932,
      "step": 261
    },
    {
      "epoch": 0.19723346193657665,
      "grad_norm": 3.9039698571171138,
      "learning_rate": 1.8569438881933587e-05,
      "loss": 1.1064,
      "step": 262
    },
    {
      "epoch": 0.197986261409617,
      "grad_norm": 3.268330588127502,
      "learning_rate": 1.8556841917422443e-05,
      "loss": 0.9784,
      "step": 263
    },
    {
      "epoch": 0.1987390608826574,
      "grad_norm": 4.314068622976358,
      "learning_rate": 1.8544194045464888e-05,
      "loss": 1.0996,
      "step": 264
    },
    {
      "epoch": 0.19949186035569774,
      "grad_norm": 3.1827098111819967,
      "learning_rate": 1.853149534130724e-05,
      "loss": 1.1221,
      "step": 265
    },
    {
      "epoch": 0.20024465982873813,
      "grad_norm": 4.040702246051114,
      "learning_rate": 1.8518745880498242e-05,
      "loss": 1.0216,
      "step": 266
    },
    {
      "epoch": 0.20024465982873813,
      "eval_loss": 1.0534603595733643,
      "eval_runtime": 583.4736,
      "eval_samples_per_second": 30.653,
      "eval_steps_per_second": 0.48,
      "step": 266
    },
    {
      "epoch": 0.20099745930177848,
      "grad_norm": 3.1584439216784266,
      "learning_rate": 1.8505945738888593e-05,
      "loss": 1.0341,
      "step": 267
    },
    {
      "epoch": 0.20175025877481886,
      "grad_norm": 3.2691826063280294,
      "learning_rate": 1.849309499263052e-05,
      "loss": 1.14,
      "step": 268
    },
    {
      "epoch": 0.20250305824785922,
      "grad_norm": 3.6898030714786794,
      "learning_rate": 1.8480193718177305e-05,
      "loss": 1.1653,
      "step": 269
    },
    {
      "epoch": 0.2032558577208996,
      "grad_norm": 3.575806731417126,
      "learning_rate": 1.8467241992282842e-05,
      "loss": 1.1146,
      "step": 270
    },
    {
      "epoch": 0.20400865719393996,
      "grad_norm": 3.1081441368569744,
      "learning_rate": 1.845423989200118e-05,
      "loss": 1.0278,
      "step": 271
    },
    {
      "epoch": 0.20476145666698034,
      "grad_norm": 2.789678804862752,
      "learning_rate": 1.8441187494686055e-05,
      "loss": 1.0891,
      "step": 272
    },
    {
      "epoch": 0.2055142561400207,
      "grad_norm": 3.985369579436919,
      "learning_rate": 1.8428084877990443e-05,
      "loss": 1.1268,
      "step": 273
    },
    {
      "epoch": 0.20626705561306108,
      "grad_norm": 3.22661410744001,
      "learning_rate": 1.841493211986609e-05,
      "loss": 1.1072,
      "step": 274
    },
    {
      "epoch": 0.20701985508610143,
      "grad_norm": 3.1835000593613123,
      "learning_rate": 1.840172929856304e-05,
      "loss": 1.0414,
      "step": 275
    },
    {
      "epoch": 0.20777265455914182,
      "grad_norm": 3.305479541641861,
      "learning_rate": 1.8388476492629198e-05,
      "loss": 1.0443,
      "step": 276
    },
    {
      "epoch": 0.20852545403218217,
      "grad_norm": 4.693809651108469,
      "learning_rate": 1.837517378090983e-05,
      "loss": 0.9937,
      "step": 277
    },
    {
      "epoch": 0.20927825350522256,
      "grad_norm": 3.5123200132532566,
      "learning_rate": 1.836182124254711e-05,
      "loss": 1.0776,
      "step": 278
    },
    {
      "epoch": 0.2100310529782629,
      "grad_norm": 3.3276868884741733,
      "learning_rate": 1.834841895697965e-05,
      "loss": 1.0845,
      "step": 279
    },
    {
      "epoch": 0.2107838524513033,
      "grad_norm": 3.133629495172562,
      "learning_rate": 1.833496700394202e-05,
      "loss": 1.1334,
      "step": 280
    },
    {
      "epoch": 0.21153665192434365,
      "grad_norm": 2.7777396926997957,
      "learning_rate": 1.8321465463464287e-05,
      "loss": 1.0504,
      "step": 281
    },
    {
      "epoch": 0.21228945139738403,
      "grad_norm": 2.727044505380872,
      "learning_rate": 1.8307914415871516e-05,
      "loss": 0.9811,
      "step": 282
    },
    {
      "epoch": 0.2130422508704244,
      "grad_norm": 3.9758603746678594,
      "learning_rate": 1.829431394178332e-05,
      "loss": 0.9935,
      "step": 283
    },
    {
      "epoch": 0.21379505034346477,
      "grad_norm": 3.5295194163983625,
      "learning_rate": 1.8280664122113356e-05,
      "loss": 1.0375,
      "step": 284
    },
    {
      "epoch": 0.21454784981650513,
      "grad_norm": 3.774125316395598,
      "learning_rate": 1.8266965038068856e-05,
      "loss": 1.1335,
      "step": 285
    },
    {
      "epoch": 0.2153006492895455,
      "grad_norm": 4.651024677980998,
      "learning_rate": 1.8253216771150153e-05,
      "loss": 1.0207,
      "step": 286
    },
    {
      "epoch": 0.21605344876258586,
      "grad_norm": 3.099286878703337,
      "learning_rate": 1.823941940315017e-05,
      "loss": 1.0212,
      "step": 287
    },
    {
      "epoch": 0.21680624823562625,
      "grad_norm": 3.0097671512838096,
      "learning_rate": 1.8225573016153945e-05,
      "loss": 1.0461,
      "step": 288
    },
    {
      "epoch": 0.2175590477086666,
      "grad_norm": 2.973750973417238,
      "learning_rate": 1.821167769253817e-05,
      "loss": 1.0388,
      "step": 289
    },
    {
      "epoch": 0.21831184718170699,
      "grad_norm": 2.967274178762896,
      "learning_rate": 1.8197733514970655e-05,
      "loss": 1.0564,
      "step": 290
    },
    {
      "epoch": 0.21906464665474734,
      "grad_norm": 2.5138035874146327,
      "learning_rate": 1.8183740566409867e-05,
      "loss": 1.073,
      "step": 291
    },
    {
      "epoch": 0.2198174461277877,
      "grad_norm": 3.7001058284729433,
      "learning_rate": 1.816969893010442e-05,
      "loss": 1.2111,
      "step": 292
    },
    {
      "epoch": 0.22057024560082808,
      "grad_norm": 3.5282758941825456,
      "learning_rate": 1.8155608689592604e-05,
      "loss": 1.183,
      "step": 293
    },
    {
      "epoch": 0.22132304507386844,
      "grad_norm": 3.312941108339452,
      "learning_rate": 1.8141469928701852e-05,
      "loss": 1.0349,
      "step": 294
    },
    {
      "epoch": 0.22207584454690882,
      "grad_norm": 2.9008531856700417,
      "learning_rate": 1.812728273154827e-05,
      "loss": 1.0123,
      "step": 295
    },
    {
      "epoch": 0.22282864401994917,
      "grad_norm": 3.55932688032634,
      "learning_rate": 1.8113047182536128e-05,
      "loss": 1.1382,
      "step": 296
    },
    {
      "epoch": 0.22358144349298956,
      "grad_norm": 4.056235257189877,
      "learning_rate": 1.8098763366357354e-05,
      "loss": 1.0822,
      "step": 297
    },
    {
      "epoch": 0.2243342429660299,
      "grad_norm": 4.772955312310337,
      "learning_rate": 1.8084431367991032e-05,
      "loss": 1.1983,
      "step": 298
    },
    {
      "epoch": 0.2250870424390703,
      "grad_norm": 4.2810326932753995,
      "learning_rate": 1.8070051272702905e-05,
      "loss": 1.1598,
      "step": 299
    },
    {
      "epoch": 0.22583984191211065,
      "grad_norm": 3.7796974809301473,
      "learning_rate": 1.8055623166044855e-05,
      "loss": 1.1074,
      "step": 300
    },
    {
      "epoch": 0.22659264138515103,
      "grad_norm": 3.6457585818347304,
      "learning_rate": 1.804114713385439e-05,
      "loss": 1.1222,
      "step": 301
    },
    {
      "epoch": 0.2273454408581914,
      "grad_norm": 3.453904748558131,
      "learning_rate": 1.8026623262254164e-05,
      "loss": 1.1338,
      "step": 302
    },
    {
      "epoch": 0.22809824033123177,
      "grad_norm": 3.532636263927634,
      "learning_rate": 1.8012051637651423e-05,
      "loss": 1.059,
      "step": 303
    },
    {
      "epoch": 0.22885103980427213,
      "grad_norm": 3.9412069933016403,
      "learning_rate": 1.7997432346737524e-05,
      "loss": 1.0612,
      "step": 304
    },
    {
      "epoch": 0.2296038392773125,
      "grad_norm": 3.3027808930961835,
      "learning_rate": 1.7982765476487398e-05,
      "loss": 1.1245,
      "step": 305
    },
    {
      "epoch": 0.23035663875035287,
      "grad_norm": 3.36315843459813,
      "learning_rate": 1.7968051114159046e-05,
      "loss": 1.0142,
      "step": 306
    },
    {
      "epoch": 0.23110943822339325,
      "grad_norm": 3.4294122694976426,
      "learning_rate": 1.795328934729302e-05,
      "loss": 1.0978,
      "step": 307
    },
    {
      "epoch": 0.2318622376964336,
      "grad_norm": 3.09886747290162,
      "learning_rate": 1.793848026371188e-05,
      "loss": 1.0259,
      "step": 308
    },
    {
      "epoch": 0.232615037169474,
      "grad_norm": 3.802369083047949,
      "learning_rate": 1.7923623951519708e-05,
      "loss": 1.0032,
      "step": 309
    },
    {
      "epoch": 0.23336783664251434,
      "grad_norm": 3.459231482580547,
      "learning_rate": 1.7908720499101552e-05,
      "loss": 1.1666,
      "step": 310
    },
    {
      "epoch": 0.23412063611555473,
      "grad_norm": 2.642651982530993,
      "learning_rate": 1.7893769995122916e-05,
      "loss": 1.084,
      "step": 311
    },
    {
      "epoch": 0.23487343558859508,
      "grad_norm": 2.9773440456665208,
      "learning_rate": 1.7878772528529232e-05,
      "loss": 1.0423,
      "step": 312
    },
    {
      "epoch": 0.23562623506163546,
      "grad_norm": 3.154468928265403,
      "learning_rate": 1.7863728188545326e-05,
      "loss": 1.1134,
      "step": 313
    },
    {
      "epoch": 0.23637903453467582,
      "grad_norm": 3.7877452098752937,
      "learning_rate": 1.7848637064674887e-05,
      "loss": 1.1711,
      "step": 314
    },
    {
      "epoch": 0.2371318340077162,
      "grad_norm": 3.378663421824846,
      "learning_rate": 1.783349924669994e-05,
      "loss": 1.0739,
      "step": 315
    },
    {
      "epoch": 0.23788463348075656,
      "grad_norm": 4.410599739962371,
      "learning_rate": 1.78183148246803e-05,
      "loss": 1.0029,
      "step": 316
    },
    {
      "epoch": 0.23863743295379694,
      "grad_norm": 3.6348238736424157,
      "learning_rate": 1.7803083888953058e-05,
      "loss": 1.0898,
      "step": 317
    },
    {
      "epoch": 0.2393902324268373,
      "grad_norm": 2.957533946262562,
      "learning_rate": 1.7787806530132022e-05,
      "loss": 1.1463,
      "step": 318
    },
    {
      "epoch": 0.24014303189987768,
      "grad_norm": 6.23103586941611,
      "learning_rate": 1.777248283910719e-05,
      "loss": 1.1414,
      "step": 319
    },
    {
      "epoch": 0.24089583137291803,
      "grad_norm": 2.865456649973816,
      "learning_rate": 1.77571129070442e-05,
      "loss": 1.0555,
      "step": 320
    },
    {
      "epoch": 0.24164863084595842,
      "grad_norm": 2.978113549024132,
      "learning_rate": 1.7741696825383797e-05,
      "loss": 1.1657,
      "step": 321
    },
    {
      "epoch": 0.24240143031899877,
      "grad_norm": 3.6676086137258777,
      "learning_rate": 1.7726234685841283e-05,
      "loss": 1.1312,
      "step": 322
    },
    {
      "epoch": 0.24315422979203916,
      "grad_norm": 3.8360009679535043,
      "learning_rate": 1.7710726580405977e-05,
      "loss": 1.1094,
      "step": 323
    },
    {
      "epoch": 0.2439070292650795,
      "grad_norm": 4.893214293222856,
      "learning_rate": 1.769517260134066e-05,
      "loss": 1.1595,
      "step": 324
    },
    {
      "epoch": 0.2446598287381199,
      "grad_norm": 2.9775759972530706,
      "learning_rate": 1.7679572841181033e-05,
      "loss": 1.0488,
      "step": 325
    },
    {
      "epoch": 0.24541262821116025,
      "grad_norm": 3.2211167234681897,
      "learning_rate": 1.766392739273516e-05,
      "loss": 1.0147,
      "step": 326
    },
    {
      "epoch": 0.24616542768420063,
      "grad_norm": 2.8555966297737934,
      "learning_rate": 1.7648236349082928e-05,
      "loss": 1.0554,
      "step": 327
    },
    {
      "epoch": 0.246918227157241,
      "grad_norm": 3.001112904305115,
      "learning_rate": 1.7632499803575473e-05,
      "loss": 1.1109,
      "step": 328
    },
    {
      "epoch": 0.24767102663028137,
      "grad_norm": 3.79725157083872,
      "learning_rate": 1.7616717849834644e-05,
      "loss": 1.0736,
      "step": 329
    },
    {
      "epoch": 0.24842382610332173,
      "grad_norm": 3.1372105798656764,
      "learning_rate": 1.7600890581752435e-05,
      "loss": 1.08,
      "step": 330
    },
    {
      "epoch": 0.2491766255763621,
      "grad_norm": 3.0523824969950866,
      "learning_rate": 1.758501809349044e-05,
      "loss": 1.0516,
      "step": 331
    },
    {
      "epoch": 0.24992942504940246,
      "grad_norm": 3.090928065217594,
      "learning_rate": 1.756910047947926e-05,
      "loss": 0.967,
      "step": 332
    },
    {
      "epoch": 0.2506822245224428,
      "grad_norm": 3.7066771841864985,
      "learning_rate": 1.755313783441799e-05,
      "loss": 1.0556,
      "step": 333
    },
    {
      "epoch": 0.2514350239954832,
      "grad_norm": 3.0285861591388934,
      "learning_rate": 1.7537130253273613e-05,
      "loss": 1.1683,
      "step": 334
    },
    {
      "epoch": 0.2521878234685236,
      "grad_norm": 2.80325314926076,
      "learning_rate": 1.7521077831280453e-05,
      "loss": 0.9991,
      "step": 335
    },
    {
      "epoch": 0.25294062294156394,
      "grad_norm": 2.995638992681817,
      "learning_rate": 1.7504980663939614e-05,
      "loss": 1.0163,
      "step": 336
    },
    {
      "epoch": 0.2536934224146043,
      "grad_norm": 3.139175462481918,
      "learning_rate": 1.7488838847018397e-05,
      "loss": 1.1129,
      "step": 337
    },
    {
      "epoch": 0.25444622188764465,
      "grad_norm": 2.9364998673824485,
      "learning_rate": 1.7472652476549747e-05,
      "loss": 1.0008,
      "step": 338
    },
    {
      "epoch": 0.25519902136068506,
      "grad_norm": 3.3899427096289974,
      "learning_rate": 1.7456421648831658e-05,
      "loss": 1.0741,
      "step": 339
    },
    {
      "epoch": 0.2559518208337254,
      "grad_norm": 3.333680754858661,
      "learning_rate": 1.744014646042663e-05,
      "loss": 1.0719,
      "step": 340
    },
    {
      "epoch": 0.2567046203067658,
      "grad_norm": 3.6173353051432273,
      "learning_rate": 1.742382700816107e-05,
      "loss": 1.0938,
      "step": 341
    },
    {
      "epoch": 0.25745741977980613,
      "grad_norm": 3.43700516947627,
      "learning_rate": 1.7407463389124728e-05,
      "loss": 1.1011,
      "step": 342
    },
    {
      "epoch": 0.25821021925284654,
      "grad_norm": 7.1179022156761365,
      "learning_rate": 1.739105570067012e-05,
      "loss": 1.1076,
      "step": 343
    },
    {
      "epoch": 0.2589630187258869,
      "grad_norm": 3.6490493958884342,
      "learning_rate": 1.7374604040411934e-05,
      "loss": 0.9959,
      "step": 344
    },
    {
      "epoch": 0.25971581819892725,
      "grad_norm": 3.990073166624143,
      "learning_rate": 1.7358108506226477e-05,
      "loss": 1.0756,
      "step": 345
    },
    {
      "epoch": 0.2604686176719676,
      "grad_norm": 3.7673869409287044,
      "learning_rate": 1.7341569196251065e-05,
      "loss": 1.0857,
      "step": 346
    },
    {
      "epoch": 0.261221417145008,
      "grad_norm": 3.830082557723633,
      "learning_rate": 1.732498620888345e-05,
      "loss": 1.0964,
      "step": 347
    },
    {
      "epoch": 0.26197421661804837,
      "grad_norm": 7.568910658413351,
      "learning_rate": 1.730835964278124e-05,
      "loss": 1.0863,
      "step": 348
    },
    {
      "epoch": 0.2627270160910887,
      "grad_norm": 5.225358882803191,
      "learning_rate": 1.729168959686131e-05,
      "loss": 1.1107,
      "step": 349
    },
    {
      "epoch": 0.2634798155641291,
      "grad_norm": 3.3550233061746355,
      "learning_rate": 1.7274976170299197e-05,
      "loss": 1.1369,
      "step": 350
    },
    {
      "epoch": 0.2642326150371695,
      "grad_norm": 3.5292404069420114,
      "learning_rate": 1.7258219462528543e-05,
      "loss": 1.0442,
      "step": 351
    },
    {
      "epoch": 0.26498541451020985,
      "grad_norm": 4.527087799400158,
      "learning_rate": 1.7241419573240463e-05,
      "loss": 1.1139,
      "step": 352
    },
    {
      "epoch": 0.2657382139832502,
      "grad_norm": 4.109247471170187,
      "learning_rate": 1.7224576602382993e-05,
      "loss": 1.1061,
      "step": 353
    },
    {
      "epoch": 0.26649101345629056,
      "grad_norm": 3.2870155559806946,
      "learning_rate": 1.720769065016046e-05,
      "loss": 1.0581,
      "step": 354
    },
    {
      "epoch": 0.26724381292933097,
      "grad_norm": 3.779535153125096,
      "learning_rate": 1.719076181703291e-05,
      "loss": 1.066,
      "step": 355
    },
    {
      "epoch": 0.2679966124023713,
      "grad_norm": 5.005623870979084,
      "learning_rate": 1.7173790203715494e-05,
      "loss": 1.0451,
      "step": 356
    },
    {
      "epoch": 0.2687494118754117,
      "grad_norm": 3.93851827850133,
      "learning_rate": 1.7156775911177888e-05,
      "loss": 1.0904,
      "step": 357
    },
    {
      "epoch": 0.26950221134845204,
      "grad_norm": 4.547113741305462,
      "learning_rate": 1.713971904064367e-05,
      "loss": 1.113,
      "step": 358
    },
    {
      "epoch": 0.27025501082149245,
      "grad_norm": 3.7171211560510686,
      "learning_rate": 1.712261969358973e-05,
      "loss": 1.0793,
      "step": 359
    },
    {
      "epoch": 0.2710078102945328,
      "grad_norm": 3.5934038171786766,
      "learning_rate": 1.7105477971745668e-05,
      "loss": 1.004,
      "step": 360
    },
    {
      "epoch": 0.27176060976757316,
      "grad_norm": 4.1748700140242985,
      "learning_rate": 1.7088293977093187e-05,
      "loss": 1.1683,
      "step": 361
    },
    {
      "epoch": 0.2725134092406135,
      "grad_norm": 3.435875864417095,
      "learning_rate": 1.7071067811865477e-05,
      "loss": 1.0426,
      "step": 362
    },
    {
      "epoch": 0.2732662087136539,
      "grad_norm": 3.4776749127597126,
      "learning_rate": 1.7053799578546623e-05,
      "loss": 1.0401,
      "step": 363
    },
    {
      "epoch": 0.2740190081866943,
      "grad_norm": 3.2901799263911826,
      "learning_rate": 1.7036489379870982e-05,
      "loss": 1.115,
      "step": 364
    },
    {
      "epoch": 0.27477180765973463,
      "grad_norm": 3.2846213361675645,
      "learning_rate": 1.7019137318822577e-05,
      "loss": 1.0098,
      "step": 365
    },
    {
      "epoch": 0.275524607132775,
      "grad_norm": 3.5406251727492313,
      "learning_rate": 1.7001743498634487e-05,
      "loss": 1.0518,
      "step": 366
    },
    {
      "epoch": 0.2762774066058154,
      "grad_norm": 4.997896297281951,
      "learning_rate": 1.6984308022788227e-05,
      "loss": 1.1053,
      "step": 367
    },
    {
      "epoch": 0.27703020607885576,
      "grad_norm": 3.306159931490668,
      "learning_rate": 1.6966830995013134e-05,
      "loss": 1.0413,
      "step": 368
    },
{ |
|
"epoch": 0.2777830055518961, |
|
"grad_norm": 3.5786298411885014, |
|
"learning_rate": 1.694931251928575e-05, |
|
"loss": 1.0284, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 0.27853580502493647, |
|
"grad_norm": 3.4314746136554755, |
|
"learning_rate": 1.693175269982921e-05, |
|
"loss": 1.0704, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.2792886044979769, |
|
"grad_norm": 3.802313145942647, |
|
"learning_rate": 1.691415164111261e-05, |
|
"loss": 1.0687, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 0.28004140397101723, |
|
"grad_norm": 3.8384682286598952, |
|
"learning_rate": 1.689650944785041e-05, |
|
"loss": 1.0777, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 0.2807942034440576, |
|
"grad_norm": 3.1221086188163487, |
|
"learning_rate": 1.6878826225001756e-05, |
|
"loss": 1.0765, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 0.28154700291709794, |
|
"grad_norm": 3.189592999644019, |
|
"learning_rate": 1.686110207776993e-05, |
|
"loss": 1.0747, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 0.28229980239013835, |
|
"grad_norm": 3.3796395188906714, |
|
"learning_rate": 1.6843337111601663e-05, |
|
"loss": 1.0966, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.2830526018631787, |
|
"grad_norm": 3.620502370043677, |
|
"learning_rate": 1.6825531432186545e-05, |
|
"loss": 1.0919, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 0.28380540133621907, |
|
"grad_norm": 3.342937529880184, |
|
"learning_rate": 1.680768514545637e-05, |
|
"loss": 1.0405, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 0.2845582008092594, |
|
"grad_norm": 3.44340611172516, |
|
"learning_rate": 1.6789798357584524e-05, |
|
"loss": 0.9996, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 0.2853110002822998, |
|
"grad_norm": 3.015178467631497, |
|
"learning_rate": 1.677187117498536e-05, |
|
"loss": 1.0534, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 0.2860637997553402, |
|
"grad_norm": 2.8540969925658084, |
|
"learning_rate": 1.6753903704313527e-05, |
|
"loss": 1.0674, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.28681659922838054, |
|
"grad_norm": 3.2861357016852284, |
|
"learning_rate": 1.6735896052463384e-05, |
|
"loss": 1.0382, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 0.2875693987014209, |
|
"grad_norm": 3.0596062908393047, |
|
"learning_rate": 1.6717848326568327e-05, |
|
"loss": 1.0231, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 0.28832219817446125, |
|
"grad_norm": 3.0906561170405005, |
|
"learning_rate": 1.6699760634000166e-05, |
|
"loss": 1.0676, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 0.28907499764750166, |
|
"grad_norm": 2.3789151459015434, |
|
"learning_rate": 1.66816330823685e-05, |
|
"loss": 1.0296, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 0.289827797120542, |
|
"grad_norm": 3.8593539388223843, |
|
"learning_rate": 1.6663465779520042e-05, |
|
"loss": 1.0445, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.2905805965935824, |
|
"grad_norm": 3.854702956207567, |
|
"learning_rate": 1.6645258833538015e-05, |
|
"loss": 1.1603, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 0.29133339606662273, |
|
"grad_norm": 3.383690393128223, |
|
"learning_rate": 1.6627012352741482e-05, |
|
"loss": 1.0015, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 0.29208619553966314, |
|
"grad_norm": 3.364330036008893, |
|
"learning_rate": 1.6608726445684715e-05, |
|
"loss": 1.0971, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 0.2928389950127035, |
|
"grad_norm": 3.114986206892385, |
|
"learning_rate": 1.659040122115655e-05, |
|
"loss": 0.9818, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 0.29359179448574385, |
|
"grad_norm": 3.0575994965283186, |
|
"learning_rate": 1.6572036788179728e-05, |
|
"loss": 1.0585, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.2943445939587842, |
|
"grad_norm": 3.0998367099456035, |
|
"learning_rate": 1.6553633256010254e-05, |
|
"loss": 0.9561, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 0.2950973934318246, |
|
"grad_norm": 3.3358383840570247, |
|
"learning_rate": 1.653519073413675e-05, |
|
"loss": 1.1339, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 0.295850192904865, |
|
"grad_norm": 3.6575458151303555, |
|
"learning_rate": 1.6516709332279806e-05, |
|
"loss": 1.1176, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 0.29660299237790533, |
|
"grad_norm": 3.223000019134338, |
|
"learning_rate": 1.649818916039131e-05, |
|
"loss": 0.985, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 0.2973557918509457, |
|
"grad_norm": 2.8833925903157156, |
|
"learning_rate": 1.6479630328653814e-05, |
|
"loss": 1.0436, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.2981085913239861, |
|
"grad_norm": 3.451911301191298, |
|
"learning_rate": 1.646103294747987e-05, |
|
"loss": 1.0887, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 0.29886139079702645, |
|
"grad_norm": 2.9713239489773073, |
|
"learning_rate": 1.6442397127511366e-05, |
|
"loss": 1.0673, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 0.2996141902700668, |
|
"grad_norm": 3.040445936124947, |
|
"learning_rate": 1.6423722979618883e-05, |
|
"loss": 1.0726, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 0.30036698974310716, |
|
"grad_norm": 3.468326113161527, |
|
"learning_rate": 1.6405010614901017e-05, |
|
"loss": 1.012, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 0.30036698974310716, |
|
"eval_loss": 1.0304744243621826, |
|
"eval_runtime": 583.2125, |
|
"eval_samples_per_second": 30.666, |
|
"eval_steps_per_second": 0.48, |
|
"step": 399 |
|
}, |
    {
      "epoch": 0.30111978921614757,
      "grad_norm": 2.8552822081266305,
      "learning_rate": 1.6386260144683744e-05,
      "loss": 1.0157,
      "step": 400
    },
    {
      "epoch": 0.3018725886891879,
      "grad_norm": 5.422850816552489,
      "learning_rate": 1.6367471680519734e-05,
      "loss": 1.0109,
      "step": 401
    },
    {
      "epoch": 0.3026253881622283,
      "grad_norm": 3.6553640892361985,
      "learning_rate": 1.6348645334187686e-05,
      "loss": 1.1007,
      "step": 402
    },
    {
      "epoch": 0.30337818763526864,
      "grad_norm": 3.9670270138761645,
      "learning_rate": 1.632978121769169e-05,
      "loss": 1.0345,
      "step": 403
    },
    {
      "epoch": 0.30413098710830905,
      "grad_norm": 3.224743049795719,
      "learning_rate": 1.631087944326053e-05,
      "loss": 1.0532,
      "step": 404
    },
    {
      "epoch": 0.3048837865813494,
      "grad_norm": 3.5913998466341286,
      "learning_rate": 1.6291940123347033e-05,
      "loss": 1.08,
      "step": 405
    },
    {
      "epoch": 0.30563658605438976,
      "grad_norm": 2.906360104068391,
      "learning_rate": 1.6272963370627398e-05,
      "loss": 1.0642,
      "step": 406
    },
    {
      "epoch": 0.3063893855274301,
      "grad_norm": 2.8309108092126616,
      "learning_rate": 1.6253949298000527e-05,
      "loss": 1.0076,
      "step": 407
    },
    {
      "epoch": 0.3071421850004705,
      "grad_norm": 2.913978002284561,
      "learning_rate": 1.6234898018587336e-05,
      "loss": 1.0802,
      "step": 408
    },
    {
      "epoch": 0.3078949844735109,
      "grad_norm": 3.2698030119279538,
      "learning_rate": 1.6215809645730115e-05,
      "loss": 1.1231,
      "step": 409
    },
    {
      "epoch": 0.30864778394655124,
      "grad_norm": 3.1904018234043248,
      "learning_rate": 1.6196684292991827e-05,
      "loss": 1.0841,
      "step": 410
    },
    {
      "epoch": 0.3094005834195916,
      "grad_norm": 3.0466834191213703,
      "learning_rate": 1.6177522074155436e-05,
      "loss": 1.0113,
      "step": 411
    },
    {
      "epoch": 0.310153382892632,
      "grad_norm": 4.011803340356661,
      "learning_rate": 1.615832310322324e-05,
      "loss": 1.126,
      "step": 412
    },
    {
      "epoch": 0.31090618236567236,
      "grad_norm": 2.761867409413426,
      "learning_rate": 1.6139087494416184e-05,
      "loss": 1.123,
      "step": 413
    },
    {
      "epoch": 0.3116589818387127,
      "grad_norm": 2.8570455014305014,
      "learning_rate": 1.6119815362173188e-05,
      "loss": 1.0686,
      "step": 414
    },
    {
      "epoch": 0.31241178131175307,
      "grad_norm": 3.1348742494088633,
      "learning_rate": 1.6100506821150455e-05,
      "loss": 1.0467,
      "step": 415
    },
    {
      "epoch": 0.3131645807847935,
      "grad_norm": 3.072427636664424,
      "learning_rate": 1.6081161986220807e-05,
      "loss": 1.0526,
      "step": 416
    },
    {
      "epoch": 0.31391738025783383,
      "grad_norm": 3.001432948323744,
      "learning_rate": 1.6061780972472978e-05,
      "loss": 1.0589,
      "step": 417
    },
    {
      "epoch": 0.3146701797308742,
      "grad_norm": 2.876397896586471,
      "learning_rate": 1.6042363895210948e-05,
      "loss": 1.0391,
      "step": 418
    },
    {
      "epoch": 0.31542297920391454,
      "grad_norm": 5.019687647291023,
      "learning_rate": 1.6022910869953245e-05,
      "loss": 1.0123,
      "step": 419
    },
    {
      "epoch": 0.3161757786769549,
      "grad_norm": 2.758077610379088,
      "learning_rate": 1.6003422012432275e-05,
      "loss": 1.0637,
      "step": 420
    },
    {
      "epoch": 0.3169285781499953,
      "grad_norm": 4.352286539206582,
      "learning_rate": 1.5983897438593612e-05,
      "loss": 1.0986,
      "step": 421
    },
    {
      "epoch": 0.31768137762303567,
      "grad_norm": 5.425551091825354,
      "learning_rate": 1.596433726459532e-05,
      "loss": 1.0641,
      "step": 422
    },
    {
      "epoch": 0.318434177096076,
      "grad_norm": 3.114326822208182,
      "learning_rate": 1.5944741606807257e-05,
      "loss": 1.0409,
      "step": 423
    },
    {
      "epoch": 0.3191869765691164,
      "grad_norm": 3.4234417375142665,
      "learning_rate": 1.5925110581810396e-05,
      "loss": 1.1171,
      "step": 424
    },
    {
      "epoch": 0.3199397760421568,
      "grad_norm": 2.5707654982863377,
      "learning_rate": 1.590544430639611e-05,
      "loss": 1.0763,
      "step": 425
    },
    {
      "epoch": 0.32069257551519714,
      "grad_norm": 3.4413913127761586,
      "learning_rate": 1.5885742897565494e-05,
      "loss": 1.1053,
      "step": 426
    },
    {
      "epoch": 0.3214453749882375,
      "grad_norm": 2.9975019877398124,
      "learning_rate": 1.586600647252866e-05,
      "loss": 0.9461,
      "step": 427
    },
    {
      "epoch": 0.32219817446127785,
      "grad_norm": 3.190004846812769,
      "learning_rate": 1.5846235148704047e-05,
      "loss": 0.9774,
      "step": 428
    },
    {
      "epoch": 0.32295097393431826,
      "grad_norm": 3.110338821378457,
      "learning_rate": 1.5826429043717716e-05,
      "loss": 1.1223,
      "step": 429
    },
    {
      "epoch": 0.3237037734073586,
      "grad_norm": 2.8781286921631613,
      "learning_rate": 1.580658827540265e-05,
      "loss": 1.1501,
      "step": 430
    },
    {
      "epoch": 0.324456572880399,
      "grad_norm": 3.3316597059677,
      "learning_rate": 1.578671296179806e-05,
      "loss": 1.0884,
      "step": 431
    },
    {
      "epoch": 0.32520937235343933,
      "grad_norm": 3.3183585538326925,
      "learning_rate": 1.5766803221148676e-05,
      "loss": 1.1408,
      "step": 432
    },
    {
      "epoch": 0.32596217182647974,
      "grad_norm": 3.2746550035987485,
      "learning_rate": 1.574685917190404e-05,
      "loss": 1.0604,
      "step": 433
    },
    {
      "epoch": 0.3267149712995201,
      "grad_norm": 2.9205856949478544,
      "learning_rate": 1.572688093271782e-05,
      "loss": 1.0424,
      "step": 434
    },
    {
      "epoch": 0.32746777077256045,
      "grad_norm": 2.8380294657039795,
      "learning_rate": 1.5706868622447084e-05,
      "loss": 1.0492,
      "step": 435
    },
    {
      "epoch": 0.3282205702456008,
      "grad_norm": 3.753517198713471,
      "learning_rate": 1.568682236015159e-05,
      "loss": 1.0763,
      "step": 436
    },
    {
      "epoch": 0.3289733697186412,
      "grad_norm": 3.450360877181349,
      "learning_rate": 1.566674226509311e-05,
      "loss": 1.058,
      "step": 437
    },
    {
      "epoch": 0.3297261691916816,
      "grad_norm": 3.5110811328728104,
      "learning_rate": 1.564662845673468e-05,
      "loss": 1.0606,
      "step": 438
    },
    {
      "epoch": 0.33047896866472193,
      "grad_norm": 3.927731470985521,
      "learning_rate": 1.5626481054739916e-05,
      "loss": 1.1387,
      "step": 439
    },
    {
      "epoch": 0.3312317681377623,
      "grad_norm": 5.154292997922348,
      "learning_rate": 1.560630017897229e-05,
      "loss": 1.0563,
      "step": 440
    },
    {
      "epoch": 0.3319845676108027,
      "grad_norm": 4.313155430470082,
      "learning_rate": 1.558608594949441e-05,
      "loss": 1.1021,
      "step": 441
    },
    {
      "epoch": 0.33273736708384305,
      "grad_norm": 2.7409094235023224,
      "learning_rate": 1.5565838486567343e-05,
      "loss": 0.9847,
      "step": 442
    },
    {
      "epoch": 0.3334901665568834,
      "grad_norm": 2.832698130182746,
      "learning_rate": 1.554555791064985e-05,
      "loss": 1.0113,
      "step": 443
    },
    {
      "epoch": 0.33424296602992376,
      "grad_norm": 3.1151968557597627,
      "learning_rate": 1.55252443423977e-05,
      "loss": 1.0411,
      "step": 444
    },
    {
      "epoch": 0.33499576550296417,
      "grad_norm": 2.9105383861881378,
      "learning_rate": 1.550489790266294e-05,
      "loss": 1.0841,
      "step": 445
    },
    {
      "epoch": 0.3357485649760045,
      "grad_norm": 3.616458297606771,
      "learning_rate": 1.5484518712493188e-05,
      "loss": 1.0521,
      "step": 446
    },
    {
      "epoch": 0.3365013644490449,
      "grad_norm": 3.5478639800192076,
      "learning_rate": 1.5464106893130896e-05,
      "loss": 1.124,
      "step": 447
    },
    {
      "epoch": 0.33725416392208524,
      "grad_norm": 3.029607726981716,
      "learning_rate": 1.5443662566012645e-05,
      "loss": 1.0125,
      "step": 448
    },
    {
      "epoch": 0.33800696339512565,
      "grad_norm": 2.72691607967082,
      "learning_rate": 1.542318585276841e-05,
      "loss": 1.055,
      "step": 449
    },
    {
      "epoch": 0.338759762868166,
      "grad_norm": 4.1070664679867575,
      "learning_rate": 1.5402676875220847e-05,
      "loss": 1.1034,
      "step": 450
    },
    {
      "epoch": 0.33951256234120636,
      "grad_norm": 2.8627575059209147,
      "learning_rate": 1.5382135755384554e-05,
      "loss": 1.0617,
      "step": 451
    },
    {
      "epoch": 0.3402653618142467,
      "grad_norm": 2.846634773819146,
      "learning_rate": 1.5361562615465366e-05,
      "loss": 1.0299,
      "step": 452
    },
    {
      "epoch": 0.3410181612872871,
      "grad_norm": 3.185640397378095,
      "learning_rate": 1.5340957577859605e-05,
      "loss": 1.0712,
      "step": 453
    },
    {
      "epoch": 0.3417709607603275,
      "grad_norm": 2.9033149341984106,
      "learning_rate": 1.5320320765153367e-05,
      "loss": 1.0137,
      "step": 454
    },
    {
      "epoch": 0.34252376023336784,
      "grad_norm": 2.6348325760822235,
      "learning_rate": 1.5299652300121792e-05,
      "loss": 1.0322,
      "step": 455
    },
    {
      "epoch": 0.3432765597064082,
      "grad_norm": 3.141329980975792,
      "learning_rate": 1.5278952305728325e-05,
      "loss": 1.0285,
      "step": 456
    },
    {
      "epoch": 0.3440293591794486,
      "grad_norm": 2.932905735802033,
      "learning_rate": 1.5258220905123997e-05,
      "loss": 0.9916,
      "step": 457
    },
    {
      "epoch": 0.34478215865248896,
      "grad_norm": 3.91581873738193,
      "learning_rate": 1.5237458221646668e-05,
      "loss": 0.9755,
      "step": 458
    },
    {
      "epoch": 0.3455349581255293,
      "grad_norm": 3.0788973407240112,
      "learning_rate": 1.5216664378820327e-05,
      "loss": 0.9842,
      "step": 459
    },
    {
      "epoch": 0.34628775759856967,
      "grad_norm": 3.1508943897589785,
      "learning_rate": 1.5195839500354337e-05,
      "loss": 1.0143,
      "step": 460
    },
    {
      "epoch": 0.34704055707161,
      "grad_norm": 2.755535031332719,
      "learning_rate": 1.5174983710142694e-05,
      "loss": 1.1033,
      "step": 461
    },
    {
      "epoch": 0.34779335654465043,
      "grad_norm": 2.72908305859456,
      "learning_rate": 1.515409713226331e-05,
      "loss": 1.0267,
      "step": 462
    },
    {
      "epoch": 0.3485461560176908,
      "grad_norm": 2.8179620108009664,
      "learning_rate": 1.513317989097725e-05,
      "loss": 1.0323,
      "step": 463
    },
    {
      "epoch": 0.34929895549073114,
      "grad_norm": 3.414258478909775,
      "learning_rate": 1.5112232110728016e-05,
      "loss": 1.0828,
      "step": 464
    },
    {
      "epoch": 0.3500517549637715,
      "grad_norm": 3.2784290028904954,
      "learning_rate": 1.5091253916140789e-05,
      "loss": 1.0864,
      "step": 465
    },
    {
      "epoch": 0.3508045544368119,
      "grad_norm": 4.226014886336044,
      "learning_rate": 1.5070245432021699e-05,
      "loss": 1.0652,
      "step": 466
    },
    {
      "epoch": 0.35155735390985227,
      "grad_norm": 2.4370397137721733,
      "learning_rate": 1.5049206783357082e-05,
      "loss": 0.9975,
      "step": 467
    },
    {
      "epoch": 0.3523101533828926,
      "grad_norm": 3.5305399592217794,
      "learning_rate": 1.502813809531272e-05,
      "loss": 0.9722,
      "step": 468
    },
    {
      "epoch": 0.353062952855933,
      "grad_norm": 3.6714227478521138,
      "learning_rate": 1.5007039493233123e-05,
      "loss": 1.0186,
      "step": 469
    },
    {
      "epoch": 0.3538157523289734,
      "grad_norm": 3.2619665655127292,
      "learning_rate": 1.4985911102640762e-05,
      "loss": 1.0209,
      "step": 470
    },
    {
      "epoch": 0.35456855180201374,
      "grad_norm": 2.8237326768852338,
      "learning_rate": 1.4964753049235333e-05,
      "loss": 1.0689,
      "step": 471
    },
    {
      "epoch": 0.3553213512750541,
      "grad_norm": 3.0898996817280437,
      "learning_rate": 1.4943565458892999e-05,
      "loss": 1.0663,
      "step": 472
    },
    {
      "epoch": 0.35607415074809445,
      "grad_norm": 3.0818327635483347,
      "learning_rate": 1.4922348457665656e-05,
      "loss": 1.1391,
      "step": 473
    },
    {
      "epoch": 0.35682695022113486,
      "grad_norm": 2.6170688801613036,
      "learning_rate": 1.4901102171780175e-05,
      "loss": 1.0734,
      "step": 474
    },
    {
      "epoch": 0.3575797496941752,
      "grad_norm": 2.6182095607098868,
      "learning_rate": 1.487982672763764e-05,
      "loss": 0.9822,
      "step": 475
    },
    {
      "epoch": 0.3583325491672156,
      "grad_norm": 3.531969368207952,
      "learning_rate": 1.4858522251812621e-05,
      "loss": 1.1004,
      "step": 476
    },
    {
      "epoch": 0.35908534864025593,
      "grad_norm": 3.3782017687196575,
      "learning_rate": 1.4837188871052399e-05,
      "loss": 1.0112,
      "step": 477
    },
    {
      "epoch": 0.35983814811329634,
      "grad_norm": 3.0803648867618802,
      "learning_rate": 1.481582671227622e-05,
      "loss": 1.0364,
      "step": 478
    },
    {
      "epoch": 0.3605909475863367,
      "grad_norm": 3.665787032584331,
      "learning_rate": 1.4794435902574543e-05,
      "loss": 1.085,
      "step": 479
    },
    {
      "epoch": 0.36134374705937705,
      "grad_norm": 3.211770330714845,
      "learning_rate": 1.4773016569208283e-05,
      "loss": 1.1324,
      "step": 480
    },
    {
      "epoch": 0.3620965465324174,
      "grad_norm": 3.4731502354467874,
      "learning_rate": 1.4751568839608036e-05,
      "loss": 1.0276,
      "step": 481
    },
    {
      "epoch": 0.3628493460054578,
      "grad_norm": 3.5439955990261334,
      "learning_rate": 1.4730092841373362e-05,
      "loss": 1.0329,
      "step": 482
    },
    {
      "epoch": 0.3636021454784982,
      "grad_norm": 2.778610271559237,
      "learning_rate": 1.4708588702271978e-05,
      "loss": 1.0138,
      "step": 483
    },
    {
      "epoch": 0.36435494495153853,
      "grad_norm": 4.278298948928529,
      "learning_rate": 1.468705655023903e-05,
      "loss": 1.065,
      "step": 484
    },
    {
      "epoch": 0.3651077444245789,
      "grad_norm": 3.5441846422629593,
      "learning_rate": 1.466549651337632e-05,
      "loss": 1.0446,
      "step": 485
    },
    {
      "epoch": 0.3658605438976193,
      "grad_norm": 3.039844008569692,
      "learning_rate": 1.4643908719951551e-05,
      "loss": 1.085,
      "step": 486
    },
    {
      "epoch": 0.36661334337065965,
      "grad_norm": 2.8411313966432052,
      "learning_rate": 1.4622293298397554e-05,
      "loss": 1.1288,
      "step": 487
    },
    {
      "epoch": 0.3673661428437,
      "grad_norm": 3.747615895853388,
      "learning_rate": 1.4600650377311523e-05,
      "loss": 1.0776,
      "step": 488
    },
    {
      "epoch": 0.36811894231674036,
      "grad_norm": 2.7460960649591524,
      "learning_rate": 1.4578980085454268e-05,
      "loss": 1.0613,
      "step": 489
    },
    {
      "epoch": 0.36887174178978077,
      "grad_norm": 3.1896235531083286,
      "learning_rate": 1.4557282551749428e-05,
      "loss": 0.99,
      "step": 490
    },
    {
      "epoch": 0.3696245412628211,
      "grad_norm": 3.159003490883495,
      "learning_rate": 1.4535557905282716e-05,
      "loss": 0.9487,
      "step": 491
    },
    {
      "epoch": 0.3703773407358615,
      "grad_norm": 3.0652639342812833,
      "learning_rate": 1.451380627530115e-05,
      "loss": 0.9943,
      "step": 492
    },
    {
      "epoch": 0.37113014020890184,
      "grad_norm": 3.550087346952904,
      "learning_rate": 1.4492027791212275e-05,
      "loss": 1.0086,
      "step": 493
    },
    {
      "epoch": 0.37188293968194225,
      "grad_norm": 2.6049865869424704,
      "learning_rate": 1.4470222582583404e-05,
      "loss": 0.9236,
      "step": 494
    },
    {
      "epoch": 0.3726357391549826,
      "grad_norm": 3.6347381036624995,
      "learning_rate": 1.4448390779140844e-05,
      "loss": 1.1121,
      "step": 495
    },
    {
      "epoch": 0.37338853862802296,
      "grad_norm": 6.880850065504925,
      "learning_rate": 1.442653251076912e-05,
      "loss": 0.9888,
      "step": 496
    },
    {
      "epoch": 0.3741413381010633,
      "grad_norm": 6.344279229708359,
      "learning_rate": 1.4404647907510212e-05,
      "loss": 1.0726,
      "step": 497
    },
    {
      "epoch": 0.3748941375741037,
      "grad_norm": 2.855607928708251,
      "learning_rate": 1.4382737099562765e-05,
      "loss": 1.0838,
      "step": 498
    },
    {
      "epoch": 0.3756469370471441,
      "grad_norm": 2.9832996495692075,
      "learning_rate": 1.436080021728133e-05,
      "loss": 0.9708,
      "step": 499
    },
    {
      "epoch": 0.37639973652018444,
      "grad_norm": 2.751066005440165,
      "learning_rate": 1.4338837391175582e-05,
      "loss": 1.0608,
      "step": 500
    },
    {
      "epoch": 0.3771525359932248,
      "grad_norm": 3.485859409386025,
      "learning_rate": 1.431684875190955e-05,
      "loss": 1.0913,
      "step": 501
    },
    {
      "epoch": 0.37790533546626515,
      "grad_norm": 3.426005357359028,
      "learning_rate": 1.4294834430300822e-05,
      "loss": 1.0836,
      "step": 502
    },
    {
      "epoch": 0.37865813493930556,
      "grad_norm": 3.9542615717855463,
      "learning_rate": 1.4272794557319785e-05,
      "loss": 1.1442,
      "step": 503
    },
    {
      "epoch": 0.3794109344123459,
      "grad_norm": 2.622153474999588,
      "learning_rate": 1.4250729264088845e-05,
      "loss": 1.0587,
      "step": 504
    },
    {
      "epoch": 0.38016373388538627,
      "grad_norm": 3.087065445678147,
      "learning_rate": 1.4228638681881633e-05,
      "loss": 1.0277,
      "step": 505
    },
    {
      "epoch": 0.3809165333584266,
      "grad_norm": 3.1258129796224625,
      "learning_rate": 1.4206522942122232e-05,
      "loss": 1.0719,
      "step": 506
    },
    {
      "epoch": 0.38166933283146703,
      "grad_norm": 3.5669441503070143,
      "learning_rate": 1.4184382176384403e-05,
      "loss": 1.1192,
      "step": 507
    },
    {
      "epoch": 0.3824221323045074,
      "grad_norm": 2.631996666259307,
      "learning_rate": 1.4162216516390787e-05,
      "loss": 1.0321,
      "step": 508
    },
    {
      "epoch": 0.38317493177754774,
      "grad_norm": 2.7083628383479414,
      "learning_rate": 1.4140026094012136e-05,
      "loss": 1.0325,
      "step": 509
    },
    {
      "epoch": 0.3839277312505881,
      "grad_norm": 2.7679663041389624,
      "learning_rate": 1.4117811041266518e-05,
      "loss": 0.9971,
      "step": 510
    },
    {
      "epoch": 0.3846805307236285,
      "grad_norm": 3.0521936330145834,
      "learning_rate": 1.4095571490318532e-05,
      "loss": 1.0261,
      "step": 511
    },
    {
      "epoch": 0.38543333019666887,
      "grad_norm": 2.7908671493996233,
      "learning_rate": 1.4073307573478528e-05,
      "loss": 1.0099,
      "step": 512
    },
    {
      "epoch": 0.3861861296697092,
      "grad_norm": 2.5789858700673647,
      "learning_rate": 1.405101942320182e-05,
      "loss": 1.0592,
      "step": 513
    },
    {
      "epoch": 0.3869389291427496,
      "grad_norm": 4.887610637172058,
      "learning_rate": 1.4028707172087885e-05,
      "loss": 1.1322,
      "step": 514
    },
    {
      "epoch": 0.38769172861579,
      "grad_norm": 3.542141348173503,
      "learning_rate": 1.40063709528796e-05,
      "loss": 1.0538,
      "step": 515
    },
    {
      "epoch": 0.38844452808883034,
      "grad_norm": 3.290310094881258,
      "learning_rate": 1.3984010898462417e-05,
      "loss": 1.0527,
      "step": 516
    },
    {
      "epoch": 0.3891973275618707,
      "grad_norm": 3.089847663466809,
      "learning_rate": 1.3961627141863603e-05,
      "loss": 1.0758,
      "step": 517
    },
    {
      "epoch": 0.38995012703491105,
      "grad_norm": 3.046285439988099,
      "learning_rate": 1.3939219816251435e-05,
      "loss": 1.0543,
      "step": 518
    },
    {
      "epoch": 0.39070292650795146,
      "grad_norm": 2.907476323401419,
      "learning_rate": 1.3916789054934408e-05,
      "loss": 1.0748,
      "step": 519
    },
    {
      "epoch": 0.3914557259809918,
      "grad_norm": 2.9958486413284344,
      "learning_rate": 1.3894334991360448e-05,
      "loss": 1.1112,
      "step": 520
    },
    {
      "epoch": 0.3922085254540322,
      "grad_norm": 2.66442614074179,
      "learning_rate": 1.3871857759116104e-05,
      "loss": 0.9338,
      "step": 521
    },
    {
      "epoch": 0.39296132492707253,
      "grad_norm": 3.6446504858155215,
      "learning_rate": 1.3849357491925779e-05,
      "loss": 1.1278,
      "step": 522
    },
    {
      "epoch": 0.39371412440011294,
      "grad_norm": 3.22553976414466,
      "learning_rate": 1.3826834323650899e-05,
      "loss": 1.0915,
      "step": 523
    },
    {
      "epoch": 0.3944669238731533,
      "grad_norm": 3.3837299425627343,
      "learning_rate": 1.3804288388289152e-05,
      "loss": 0.9893,
      "step": 524
    },
    {
      "epoch": 0.39521972334619365,
      "grad_norm": 4.383592843892888,
      "learning_rate": 1.378171981997367e-05,
      "loss": 1.1149,
      "step": 525
    },
    {
      "epoch": 0.395972522819234,
      "grad_norm": 2.9287918069708967,
      "learning_rate": 1.3759128752972229e-05,
      "loss": 0.9805,
      "step": 526
    },
    {
      "epoch": 0.3967253222922744,
      "grad_norm": 3.769542827664187,
      "learning_rate": 1.3736515321686468e-05,
      "loss": 0.999,
      "step": 527
    },
    {
      "epoch": 0.3974781217653148,
      "grad_norm": 4.040693234235752,
      "learning_rate": 1.3713879660651069e-05,
      "loss": 1.0457,
      "step": 528
    },
    {
      "epoch": 0.39823092123835513,
      "grad_norm": 3.0216912415405193,
      "learning_rate": 1.3691221904532972e-05,
      "loss": 1.0238,
      "step": 529
    },
    {
      "epoch": 0.3989837207113955,
      "grad_norm": 3.2298304957599444,
      "learning_rate": 1.3668542188130567e-05,
      "loss": 1.1817,
      "step": 530
    },
    {
      "epoch": 0.3997365201844359,
      "grad_norm": 3.205471685042934,
      "learning_rate": 1.364584064637289e-05,
      "loss": 1.052,
      "step": 531
    },
    {
      "epoch": 0.40048931965747625,
      "grad_norm": 3.0582507683769897,
      "learning_rate": 1.3623117414318827e-05,
      "loss": 1.0515,
      "step": 532
    },
    {
      "epoch": 0.40048931965747625,
      "eval_loss": 1.013999581336975,
      "eval_runtime": 583.7469,
      "eval_samples_per_second": 30.638,
      "eval_steps_per_second": 0.48,
      "step": 532
    },
    {
      "epoch": 0.4012421191305166,
      "grad_norm": 3.2249512274563905,
      "learning_rate": 1.3600372627156304e-05,
      "loss": 1.1174,
      "step": 533
    },
    {
      "epoch": 0.40199491860355696,
      "grad_norm": 2.8980939124189193,
      "learning_rate": 1.3577606420201483e-05,
      "loss": 0.9254,
      "step": 534
    },
    {
      "epoch": 0.40274771807659737,
      "grad_norm": 2.6685576816268,
      "learning_rate": 1.3554818928897965e-05,
      "loss": 1.0023,
      "step": 535
    },
    {
      "epoch": 0.4035005175496377,
      "grad_norm": 3.2379228804073947,
      "learning_rate": 1.353201028881598e-05,
      "loss": 1.083,
      "step": 536
    },
    {
      "epoch": 0.4042533170226781,
      "grad_norm": 3.0605669152172728,
      "learning_rate": 1.350918063565157e-05,
      "loss": 0.9634,
      "step": 537
    },
    {
      "epoch": 0.40500611649571844,
      "grad_norm": 3.2347349427551078,
      "learning_rate": 1.3486330105225797e-05,
      "loss": 0.9626,
      "step": 538
    },
    {
      "epoch": 0.40575891596875885,
      "grad_norm": 3.7810473941095264,
      "learning_rate": 1.3463458833483923e-05,
      "loss": 1.1268,
      "step": 539
    },
    {
      "epoch": 0.4065117154417992,
      "grad_norm": 2.726200850867419,
      "learning_rate": 1.344056695649462e-05,
      "loss": 1.1052,
      "step": 540
    },
    {
      "epoch": 0.40726451491483956,
      "grad_norm": 2.5107188818811923,
      "learning_rate": 1.3417654610449131e-05,
      "loss": 1.0178,
      "step": 541
    },
    {
      "epoch": 0.4080173143878799,
      "grad_norm": 3.010490350395698,
      "learning_rate": 1.3394721931660488e-05,
      "loss": 0.9679,
      "step": 542
    },
    {
      "epoch": 0.40877011386092027,
      "grad_norm": 2.465477343951642,
      "learning_rate": 1.3371769056562683e-05,
      "loss": 1.0452,
      "step": 543
    },
    {
      "epoch": 0.4095229133339607,
      "grad_norm": 4.104294474246676,
      "learning_rate": 1.3348796121709862e-05,
      "loss": 1.0685,
      "step": 544
    },
    {
      "epoch": 0.41027571280700104,
      "grad_norm": 2.918854588082266,
      "learning_rate": 1.3325803263775521e-05,
      "loss": 1.0147,
      "step": 545
    },
    {
      "epoch": 0.4110285122800414,
      "grad_norm": 3.31220295919612,
      "learning_rate": 1.3302790619551673e-05,
      "loss": 1.0331,
      "step": 546
    },
    {
      "epoch": 0.41178131175308175,
      "grad_norm": 4.183130206995228,
      "learning_rate": 1.3279758325948054e-05,
      "loss": 1.1075,
      "step": 547
    },
    {
      "epoch": 0.41253411122612216,
      "grad_norm": 3.7338257554064422,
      "learning_rate": 1.32567065199913e-05,
      "loss": 1.0739,
      "step": 548
    },
    {
      "epoch": 0.4132869106991625,
      "grad_norm": 3.1338772898350995,
      "learning_rate": 1.3233635338824132e-05,
      "loss": 1.0575,
      "step": 549
    },
    {
      "epoch": 0.41403971017220287,
      "grad_norm": 2.923412077252181,
      "learning_rate": 1.3210544919704539e-05,
      "loss": 1.0108,
      "step": 550
    },
    {
      "epoch": 0.4147925096452432,
      "grad_norm": 2.383268802641107,
      "learning_rate": 1.318743540000496e-05,
      "loss": 0.9357,
      "step": 551
    },
    {
      "epoch": 0.41554530911828363,
      "grad_norm": 2.8657602020971678,
      "learning_rate": 1.3164306917211475e-05,
      "loss": 1.0201,
      "step": 552
    },
    {
      "epoch": 0.416298108591324,
      "grad_norm": 6.229638816200601,
      "learning_rate": 1.3141159608922984e-05,
      "loss": 1.1212,
      "step": 553
    },
    {
      "epoch": 0.41705090806436435,
      "grad_norm": 3.0334418266757806,
      "learning_rate": 1.3117993612850377e-05,
      "loss": 1.0709,
      "step": 554
    },
    {
      "epoch": 0.4178037075374047,
      "grad_norm": 3.3777028783426464,
      "learning_rate": 1.3094809066815731e-05,
      "loss": 1.0202,
      "step": 555
    },
    {
      "epoch": 0.4185565070104451,
      "grad_norm": 4.869952280964426,
      "learning_rate": 1.3071606108751475e-05,
      "loss": 1.066,
      "step": 556
    },
    {
      "epoch": 0.41930930648348547,
      "grad_norm": 3.207610155370362,
      "learning_rate": 1.3048384876699588e-05,
      "loss": 1.0221,
      "step": 557
    },
    {
      "epoch": 0.4200621059565258,
      "grad_norm": 4.465798720585036,
      "learning_rate": 1.302514550881076e-05,
      "loss": 0.9787,
      "step": 558
    },
    {
      "epoch": 0.4208149054295662,
      "grad_norm": 3.803049002174094,
      "learning_rate": 1.3001888143343578e-05,
      "loss": 1.02,
      "step": 559
    },
    {
      "epoch": 0.4215677049026066,
      "grad_norm": 3.441550189570075,
      "learning_rate": 1.2978612918663702e-05,
      "loss": 1.0819,
      "step": 560
    },
    {
      "epoch": 0.42232050437564694,
      "grad_norm": 2.688153536798873,
      "learning_rate": 1.2955319973243043e-05,
      "loss": 1.0335,
      "step": 561
    },
    {
      "epoch": 0.4230733038486873,
      "grad_norm": 3.4545648939837545,
      "learning_rate": 1.293200944565894e-05,
      "loss": 1.0494,
      "step": 562
    },
    {
      "epoch": 0.42382610332172765,
      "grad_norm": 3.0549486245877584,
      "learning_rate": 1.290868147459333e-05,
      "loss": 1.0133,
      "step": 563
    },
    {
      "epoch": 0.42457890279476806,
      "grad_norm": 2.6796979217078025,
      "learning_rate": 1.2885336198831933e-05,
      "loss": 1.0228,
      "step": 564
    },
    {
      "epoch": 0.4253317022678084,
      "grad_norm": 2.564267991029905,
      "learning_rate": 1.2861973757263416e-05,
      "loss": 0.9532,
      "step": 565
    },
    {
      "epoch": 0.4260845017408488,
      "grad_norm": 3.2351011961272897,
      "learning_rate": 1.2838594288878567e-05,
      "loss": 1.0146,
      "step": 566
    },
    {
      "epoch": 0.42683730121388913,
      "grad_norm": 3.6439280454672107,
      "learning_rate": 1.2815197932769486e-05,
      "loss": 1.0859,
      "step": 567
    },
    {
      "epoch": 0.42759010068692954,
      "grad_norm": 4.0189574487408395,
      "learning_rate": 1.2791784828128727e-05,
      "loss": 1.1235,
      "step": 568
    },
    {
      "epoch": 0.4283429001599699,
      "grad_norm": 3.332167286806501,
      "learning_rate": 1.2768355114248493e-05,
      "loss": 1.008,
      "step": 569
    },
    {
      "epoch": 0.42909569963301025,
      "grad_norm": 2.7378718594417157,
      "learning_rate": 1.274490893051981e-05,
      "loss": 0.9768,
      "step": 570
    },
    {
      "epoch": 0.4298484991060506,
      "grad_norm": 3.5007868628431966,
      "learning_rate": 1.2721446416431676e-05,
      "loss": 1.0619,
      "step": 571
    },
    {
      "epoch": 0.430601298579091,
      "grad_norm": 4.43305491920189,
      "learning_rate": 1.2697967711570243e-05,
      "loss": 1.1178,
      "step": 572
    },
    {
      "epoch": 0.4313540980521314,
      "grad_norm": 3.963447493013407,
      "learning_rate": 1.2674472955618001e-05,
      "loss": 1.013,
      "step": 573
    },
    {
      "epoch": 0.43210689752517173,
      "grad_norm": 2.773126731287594,
      "learning_rate": 1.2650962288352916e-05,
      "loss": 1.0851,
      "step": 574
    },
    {
      "epoch": 0.4328596969982121,
      "grad_norm": 3.639961417112135,
      "learning_rate": 1.2627435849647629e-05,
      "loss": 1.0373,
      "step": 575
    },
    {
      "epoch": 0.4336124964712525,
      "grad_norm": 3.0305761457545684,
      "learning_rate": 1.2603893779468604e-05,
      "loss": 1.0637,
      "step": 576
    },
    {
      "epoch": 0.43436529594429285,
      "grad_norm": 2.4310238944919726,
      "learning_rate": 1.2580336217875303e-05,
      "loss": 1.0316,
      "step": 577
    },
    {
      "epoch": 0.4351180954173332,
      "grad_norm": 3.3566129974599095,
      "learning_rate": 1.2556763305019353e-05,
      "loss": 1.002,
      "step": 578
    },
    {
      "epoch": 0.43587089489037356,
      "grad_norm": 3.332157255311208,
      "learning_rate": 1.2533175181143704e-05,
      "loss": 1.0369,
      "step": 579
    },
    {
      "epoch": 0.43662369436341397,
      "grad_norm": 3.296409695390347,
      "learning_rate": 1.2509571986581814e-05,
      "loss": 1.0249,
      "step": 580
    },
    {
      "epoch": 0.4373764938364543,
      "grad_norm": 3.104522070073551,
      "learning_rate": 1.248595386175679e-05,
      "loss": 1.0295,
      "step": 581
    },
    {
      "epoch": 0.4381292933094947,
      "grad_norm": 3.702230092409405,
      "learning_rate": 1.2462320947180565e-05,
      "loss": 1.0505,
      "step": 582
    },
    {
      "epoch": 0.43888209278253504,
      "grad_norm": 2.9464845086562876,
      "learning_rate": 1.2438673383453073e-05,
      "loss": 1.0431,
      "step": 583
    },
    {
      "epoch": 0.4396348922555754,
      "grad_norm": 3.245101732744253,
      "learning_rate": 1.241501131126138e-05,
      "loss": 1.0038,
      "step": 584
    },
    {
      "epoch": 0.4403876917286158,
      "grad_norm": 3.1816104157227207,
      "learning_rate": 1.239133487137889e-05,
      "loss": 1.0419,
      "step": 585
    },
    {
      "epoch": 0.44114049120165616,
      "grad_norm": 4.017669311902771,
      "learning_rate": 1.2367644204664468e-05,
      "loss": 1.0518,
      "step": 586
    },
    {
      "epoch": 0.4418932906746965,
      "grad_norm": 3.2156530752080363,
      "learning_rate": 1.2343939452061628e-05,
      "loss": 1.0116,
      "step": 587
    },
    {
      "epoch": 0.44264609014773687,
      "grad_norm": 2.7530854094979818,
      "learning_rate": 1.232022075459768e-05,
      "loss": 1.0057,
      "step": 588
    },
    {
      "epoch": 0.4433988896207773,
      "grad_norm": 2.4845646778491157,
      "learning_rate": 1.2296488253382902e-05,
      "loss": 0.9812,
      "step": 589
    },
    {
      "epoch": 0.44415168909381764,
      "grad_norm": 2.771993461799015,
      "learning_rate": 1.2272742089609694e-05,
      "loss": 1.0233,
      "step": 590
    },
    {
      "epoch": 0.444904488566858,
      "grad_norm": 3.2638541440257174,
      "learning_rate": 1.2248982404551733e-05,
      "loss": 1.1117,
      "step": 591
    },
    {
      "epoch": 0.44565728803989835,
      "grad_norm": 3.019401928494119,
      "learning_rate": 1.2225209339563144e-05,
      "loss": 1.0764,
      "step": 592
    },
    {
      "epoch": 0.44641008751293876,
      "grad_norm": 2.5950884293437433,
      "learning_rate": 1.2201423036077657e-05,
      "loss": 0.9995,
      "step": 593
    },
    {
      "epoch": 0.4471628869859791,
      "grad_norm": 3.1737029171437836,
      "learning_rate": 1.2177623635607753e-05,
      "loss": 1.0109,
      "step": 594
    },
    {
      "epoch": 0.44791568645901947,
      "grad_norm": 2.73228114214597,
      "learning_rate": 1.2153811279743841e-05,
      "loss": 0.9672,
      "step": 595
    },
    {
      "epoch": 0.4486684859320598,
      "grad_norm": 3.2642865460623693,
      "learning_rate": 1.2129986110153395e-05,
      "loss": 1.0576,
      "step": 596
    },
    {
      "epoch": 0.44942128540510023,
      "grad_norm": 2.703014706115608,
      "learning_rate": 1.210614826858013e-05,
      "loss": 1.0316,
      "step": 597
    },
    {
      "epoch": 0.4501740848781406,
      "grad_norm": 4.575951156022697,
      "learning_rate": 1.208229789684315e-05,
      "loss": 1.0492,
      "step": 598
    },
    {
      "epoch": 0.45092688435118095,
      "grad_norm": 4.2735868522314036,
      "learning_rate": 1.2058435136836101e-05,
      "loss": 0.9975,
      "step": 599
    },
    {
      "epoch": 0.4516796838242213,
      "grad_norm": 3.152733804810423,
      "learning_rate": 1.2034560130526341e-05,
      "loss": 1.0168,
      "step": 600
    },
    {
      "epoch": 0.4524324832972617,
      "grad_norm": 4.0000821775666475,
      "learning_rate": 1.201067301995407e-05,
      "loss": 1.1154,
      "step": 601
    },
    {
      "epoch": 0.45318528277030207,
      "grad_norm": 4.068503998540418,
      "learning_rate": 1.1986773947231505e-05,
      "loss": 1.0289,
      "step": 602
    },
    {
      "epoch": 0.4539380822433424,
      "grad_norm": 2.6788756306717594,
      "learning_rate": 1.1962863054542045e-05,
      "loss": 0.9589,
      "step": 603
    },
    {
      "epoch": 0.4546908817163828,
      "grad_norm": 5.012791771168218,
      "learning_rate": 1.1938940484139387e-05,
      "loss": 1.1051,
      "step": 604
    },
    {
      "epoch": 0.4554436811894232,
      "grad_norm": 3.521967279714444,
      "learning_rate": 1.1915006378346719e-05,
      "loss": 1.0553,
      "step": 605
    },
    {
      "epoch": 0.45619648066246354,
      "grad_norm": 3.616593755479914,
      "learning_rate": 1.1891060879555847e-05,
      "loss": 1.102,
      "step": 606
    },
    {
      "epoch": 0.4569492801355039,
      "grad_norm": 2.9722037476669496,
      "learning_rate": 1.1867104130226363e-05,
      "loss": 1.0029,
      "step": 607
    },
    {
      "epoch": 0.45770207960854425,
      "grad_norm": 3.4759851155141765,
      "learning_rate": 1.1843136272884795e-05,
      "loss": 1.0905,
      "step": 608
    },
    {
      "epoch": 0.45845487908158467,
      "grad_norm": 2.7210451462852325,
      "learning_rate": 1.1819157450123745e-05,
      "loss": 1.0133,
      "step": 609
    },
    {
      "epoch": 0.459207678554625,
      "grad_norm": 2.572254919121521,
      "learning_rate": 1.1795167804601062e-05,
      "loss": 0.9759,
      "step": 610
    },
    {
      "epoch": 0.4599604780276654,
      "grad_norm": 2.724723345376094,
      "learning_rate": 1.1771167479038978e-05,
      "loss": 1.0532,
      "step": 611
    },
    {
      "epoch": 0.46071327750070573,
      "grad_norm": 3.4328753233271163,
      "learning_rate": 1.1747156616223272e-05,
      "loss": 1.0223,
      "step": 612
    },
    {
      "epoch": 0.46146607697374614,
      "grad_norm": 3.0252287414459125,
      "learning_rate": 1.1723135359002403e-05,
      "loss": 0.9849,
      "step": 613
    },
    {
      "epoch": 0.4622188764467865,
      "grad_norm": 3.0032309037729648,
      "learning_rate": 1.1699103850286668e-05,
      "loss": 1.0028,
      "step": 614
    },
    {
      "epoch": 0.46297167591982685,
      "grad_norm": 2.8243286910732905,
      "learning_rate": 1.1675062233047365e-05,
      "loss": 1.05,
      "step": 615
    },
    {
      "epoch": 0.4637244753928672,
      "grad_norm": 3.5574655975394984,
      "learning_rate": 1.1651010650315923e-05,
      "loss": 1.0497,
      "step": 616
    },
    {
      "epoch": 0.4644772748659076,
      "grad_norm": 3.022425375669109,
      "learning_rate": 1.1626949245183061e-05,
      "loss": 0.9807,
      "step": 617
    },
    {
      "epoch": 0.465230074338948,
      "grad_norm": 2.771846295903394,
      "learning_rate": 1.1602878160797936e-05,
      "loss": 1.076,
      "step": 618
    },
    {
      "epoch": 0.46598287381198833,
      "grad_norm": 2.505321802846061,
      "learning_rate": 1.1578797540367284e-05,
      "loss": 0.9784,
      "step": 619
    },
    {
      "epoch": 0.4667356732850287,
      "grad_norm": 2.983970077564599,
      "learning_rate": 1.155470752715458e-05,
      "loss": 0.959,
      "step": 620
    },
    {
      "epoch": 0.4674884727580691,
      "grad_norm": 4.383473009689115,
      "learning_rate": 1.153060826447918e-05,
      "loss": 0.986,
      "step": 621
    },
    {
      "epoch": 0.46824127223110945,
      "grad_norm": 3.4488742706032367,
      "learning_rate": 1.1506499895715462e-05,
      "loss": 1.0913,
      "step": 622
    },
    {
      "epoch": 0.4689940717041498,
      "grad_norm": 2.6140008129721517,
      "learning_rate": 1.148238256429199e-05,
      "loss": 0.9833,
      "step": 623
    },
    {
      "epoch": 0.46974687117719016,
      "grad_norm": 2.92309907991184,
      "learning_rate": 1.1458256413690634e-05,
      "loss": 1.0083,
      "step": 624
    },
    {
      "epoch": 0.4704996706502306,
      "grad_norm": 2.724708243419771,
      "learning_rate": 1.1434121587445752e-05,
      "loss": 1.0594,
      "step": 625
    },
    {
      "epoch": 0.47125247012327093,
      "grad_norm": 3.0724226680282367,
      "learning_rate": 1.1409978229143297e-05,
      "loss": 1.0204,
      "step": 626
    },
    {
      "epoch": 0.4720052695963113,
      "grad_norm": 3.0671644057682923,
      "learning_rate": 1.1385826482419993e-05,
      "loss": 1.0139,
      "step": 627
    },
    {
      "epoch": 0.47275806906935164,
      "grad_norm": 3.1314240764279364,
      "learning_rate": 1.1361666490962468e-05,
      "loss": 1.0052,
      "step": 628
    },
    {
      "epoch": 0.473510868542392,
      "grad_norm": 3.3855180309761237,
      "learning_rate": 1.1337498398506397e-05,
      "loss": 0.9997,
      "step": 629
    },
    {
      "epoch": 0.4742636680154324,
      "grad_norm": 3.1909270618792958,
      "learning_rate": 1.1313322348835658e-05,
      "loss": 1.0612,
      "step": 630
    },
    {
      "epoch": 0.47501646748847276,
      "grad_norm": 3.783620789630306,
      "learning_rate": 1.1289138485781456e-05,
      "loss": 1.0032,
      "step": 631
    },
    {
      "epoch": 0.4757692669615131,
      "grad_norm": 4.229526141420337,
      "learning_rate": 1.1264946953221496e-05,
      "loss": 1.0331,
      "step": 632
    },
    {
      "epoch": 0.47652206643455347,
      "grad_norm": 4.6378584634680715,
      "learning_rate": 1.12407478950791e-05,
      "loss": 1.0362,
      "step": 633
    },
    {
      "epoch": 0.4772748659075939,
      "grad_norm": 2.6228761321729452,
      "learning_rate": 1.1216541455322367e-05,
      "loss": 1.0012,
      "step": 634
    },
    {
      "epoch": 0.47802766538063424,
      "grad_norm": 2.9654511830251535,
      "learning_rate": 1.1192327777963313e-05,
      "loss": 1.0351,
      "step": 635
    },
    {
      "epoch": 0.4787804648536746,
      "grad_norm": 2.5847394351824406,
      "learning_rate": 1.1168107007057006e-05,
      "loss": 0.9708,
      "step": 636
    },
    {
      "epoch": 0.47953326432671495,
      "grad_norm": 3.838509319191613,
      "learning_rate": 1.1143879286700723e-05,
      "loss": 0.9432,
      "step": 637
    },
    {
      "epoch": 0.48028606379975536,
      "grad_norm": 2.817996686530098,
      "learning_rate": 1.1119644761033079e-05,
      "loss": 0.9694,
      "step": 638
    },
    {
      "epoch": 0.4810388632727957,
      "grad_norm": 3.531105963283091,
      "learning_rate": 1.1095403574233185e-05,
      "loss": 1.0909,
      "step": 639
    },
    {
      "epoch": 0.48179166274583607,
      "grad_norm": 3.3417816286597213,
      "learning_rate": 1.1071155870519777e-05,
      "loss": 1.1137,
      "step": 640
    },
    {
      "epoch": 0.4825444622188764,
      "grad_norm": 3.5562409238146317,
      "learning_rate": 1.1046901794150358e-05,
      "loss": 1.0303,
      "step": 641
    },
    {
      "epoch": 0.48329726169191684,
      "grad_norm": 2.907782910105036,
      "learning_rate": 1.1022641489420342e-05,
      "loss": 1.0336,
      "step": 642
    },
    {
      "epoch": 0.4840500611649572,
      "grad_norm": 2.970163177761591,
      "learning_rate": 1.0998375100662215e-05,
      "loss": 1.0605,
      "step": 643
    },
    {
      "epoch": 0.48480286063799755,
      "grad_norm": 3.2772867501044582,
      "learning_rate": 1.0974102772244638e-05,
      "loss": 1.0235,
      "step": 644
    },
    {
      "epoch": 0.4855556601110379,
      "grad_norm": 3.497468818702157,
      "learning_rate": 1.094982464857162e-05,
      "loss": 1.0848,
      "step": 645
    },
    {
      "epoch": 0.4863084595840783,
      "grad_norm": 3.3225147360774403,
      "learning_rate": 1.0925540874081649e-05,
      "loss": 0.9684,
      "step": 646
    },
    {
      "epoch": 0.48706125905711867,
      "grad_norm": 3.2301417809169575,
      "learning_rate": 1.0901251593246822e-05,
      "loss": 0.9827,
      "step": 647
    },
    {
      "epoch": 0.487814058530159,
      "grad_norm": 3.636896054687057,
      "learning_rate": 1.0876956950572006e-05,
      "loss": 1.065,
      "step": 648
    },
    {
      "epoch": 0.4885668580031994,
      "grad_norm": 2.932810236288748,
      "learning_rate": 1.0852657090593961e-05,
      "loss": 1.0698,
      "step": 649
    },
    {
      "epoch": 0.4893196574762398,
      "grad_norm": 5.0895461182195385,
      "learning_rate": 1.0828352157880489e-05,
      "loss": 1.1485,
      "step": 650
    },
    {
      "epoch": 0.49007245694928014,
      "grad_norm": 3.1415226619078616,
      "learning_rate": 1.0804042297029567e-05,
      "loss": 0.9901,
      "step": 651
    },
    {
      "epoch": 0.4908252564223205,
      "grad_norm": 2.909255583040199,
      "learning_rate": 1.0779727652668496e-05,
      "loss": 1.0655,
      "step": 652
    },
    {
      "epoch": 0.49157805589536085,
      "grad_norm": 3.1548858316494877,
      "learning_rate": 1.075540836945304e-05,
      "loss": 1.0486,
      "step": 653
    },
    {
      "epoch": 0.49233085536840127,
      "grad_norm": 2.7012241871329614,
      "learning_rate": 1.0731084592066548e-05,
      "loss": 0.9917,
      "step": 654
    },
    {
      "epoch": 0.4930836548414416,
      "grad_norm": 3.0157551767075796,
      "learning_rate": 1.0706756465219114e-05,
      "loss": 1.0596,
      "step": 655
    },
    {
      "epoch": 0.493836454314482,
      "grad_norm": 2.8704467999311367,
      "learning_rate": 1.0682424133646712e-05,
      "loss": 1.0269,
      "step": 656
    },
    {
      "epoch": 0.49458925378752233,
      "grad_norm": 3.0763751507732127,
      "learning_rate": 1.0658087742110322e-05,
      "loss": 1.0799,
      "step": 657
    },
    {
      "epoch": 0.49534205326056274,
      "grad_norm": 3.9020236991450856,
      "learning_rate": 1.063374743539509e-05,
      "loss": 1.0025,
      "step": 658
    },
    {
      "epoch": 0.4960948527336031,
      "grad_norm": 2.609346679016363,
      "learning_rate": 1.060940335830944e-05,
      "loss": 0.9835,
      "step": 659
    },
    {
      "epoch": 0.49684765220664345,
      "grad_norm": 3.343369949839618,
      "learning_rate": 1.058505565568424e-05,
      "loss": 0.9486,
      "step": 660
    },
    {
      "epoch": 0.4976004516796838,
      "grad_norm": 3.3313727642441795,
      "learning_rate": 1.0560704472371919e-05,
      "loss": 0.9556,
      "step": 661
    },
    {
      "epoch": 0.4983532511527242,
      "grad_norm": 3.4155555321239355,
      "learning_rate": 1.0536349953245622e-05,
      "loss": 1.0095,
      "step": 662
    },
    {
      "epoch": 0.4991060506257646,
      "grad_norm": 2.480958118144923,
      "learning_rate": 1.0511992243198335e-05,
      "loss": 0.9385,
      "step": 663
    },
    {
      "epoch": 0.49985885009880493,
      "grad_norm": 3.5008683649636816,
      "learning_rate": 1.0487631487142018e-05,
      "loss": 1.0369,
      "step": 664
    },
    {
      "epoch": 0.5006116495718453,
      "grad_norm": 2.663416310535519,
      "learning_rate": 1.0463267830006779e-05,
      "loss": 1.0242,
      "step": 665
    },
    {
      "epoch": 0.5006116495718453,
      "eval_loss": 0.9954760074615479,
      "eval_runtime": 584.4864,
      "eval_samples_per_second": 30.6,
      "eval_steps_per_second": 0.479,
      "step": 665
    },
    {
      "epoch": 0.5013644490448856,
      "grad_norm": 3.1777588089309208,
      "learning_rate": 1.0438901416739955e-05,
      "loss": 1.0186,
      "step": 666
    },
    {
      "epoch": 0.502117248517926,
      "grad_norm": 2.7838827384508944,
      "learning_rate": 1.0414532392305301e-05,
      "loss": 0.9709,
      "step": 667
    },
    {
      "epoch": 0.5028700479909664,
      "grad_norm": 2.921400217102283,
      "learning_rate": 1.03901609016821e-05,
      "loss": 1.0714,
      "step": 668
    },
    {
      "epoch": 0.5036228474640068,
      "grad_norm": 2.7341839637863847,
      "learning_rate": 1.0365787089864303e-05,
      "loss": 0.9927,
      "step": 669
    },
    {
      "epoch": 0.5043756469370472,
      "grad_norm": 4.914922149938216,
      "learning_rate": 1.034141110185968e-05,
      "loss": 1.1115,
      "step": 670
    },
    {
      "epoch": 0.5051284464100875,
      "grad_norm": 2.989888013177945,
      "learning_rate": 1.031703308268894e-05,
      "loss": 0.9944,
      "step": 671
    },
    {
      "epoch": 0.5058812458831279,
      "grad_norm": 2.579049180170171,
      "learning_rate": 1.0292653177384878e-05,
      "loss": 0.9751,
      "step": 672
    },
    {
      "epoch": 0.5066340453561683,
      "grad_norm": 3.959139485335877,
      "learning_rate": 1.0268271530991509e-05,
      "loss": 1.0733,
      "step": 673
    },
    {
      "epoch": 0.5073868448292086,
      "grad_norm": 2.600824437688288,
      "learning_rate": 1.0243888288563213e-05,
      "loss": 0.9694,
      "step": 674
    },
    {
      "epoch": 0.508139644302249,
      "grad_norm": 2.5090320928335075,
      "learning_rate": 1.0219503595163857e-05,
      "loss": 1.0231,
      "step": 675
    },
    {
      "epoch": 0.5088924437752893,
      "grad_norm": 2.6849288835400453,
      "learning_rate": 1.0195117595865947e-05,
      "loss": 1.0139,
      "step": 676
    },
    {
      "epoch": 0.5096452432483297,
      "grad_norm": 3.682029203578967,
      "learning_rate": 1.017073043574975e-05,
      "loss": 0.9919,
      "step": 677
    },
    {
      "epoch": 0.5103980427213701,
      "grad_norm": 3.275453418274055,
      "learning_rate": 1.0146342259902446e-05,
      "loss": 1.0953,
      "step": 678
    },
    {
      "epoch": 0.5111508421944104,
      "grad_norm": 3.0421924924609907,
      "learning_rate": 1.0121953213417256e-05,
      "loss": 1.0479,
      "step": 679
    },
    {
      "epoch": 0.5119036416674508,
      "grad_norm": 3.4230363471287273,
      "learning_rate": 1.0097563441392582e-05,
      "loss": 1.0352,
      "step": 680
    },
    {
      "epoch": 0.5126564411404912,
      "grad_norm": 3.21566163229416,
      "learning_rate": 1.0073173088931143e-05,
      "loss": 0.9818,
      "step": 681
    },
    {
      "epoch": 0.5134092406135315,
      "grad_norm": 2.729633411998688,
      "learning_rate": 1.0048782301139102e-05,
      "loss": 0.9991,
      "step": 682
    },
    {
      "epoch": 0.514162040086572,
      "grad_norm": 2.9671555111696284,
      "learning_rate": 1.0024391223125226e-05,
      "loss": 1.0071,
      "step": 683
    },
    {
      "epoch": 0.5149148395596123,
      "grad_norm": 3.8434580170431665,
      "learning_rate": 1e-05,
      "loss": 1.0889,
      "step": 684
    },
    {
      "epoch": 0.5156676390326527,
      "grad_norm": 2.7977469550737184,
      "learning_rate": 9.975608776874775e-06,
      "loss": 1.029,
      "step": 685
    },
    {
      "epoch": 0.5164204385056931,
      "grad_norm": 3.312999685049618,
      "learning_rate": 9.951217698860902e-06,
      "loss": 1.0791,
      "step": 686
    },
    {
      "epoch": 0.5171732379787334,
      "grad_norm": 2.8531522744975706,
      "learning_rate": 9.926826911068862e-06,
      "loss": 0.9893,
      "step": 687
    },
    {
      "epoch": 0.5179260374517738,
      "grad_norm": 3.2587348054443948,
      "learning_rate": 9.90243655860742e-06,
      "loss": 1.0307,
      "step": 688
    },
    {
      "epoch": 0.5186788369248142,
      "grad_norm": 2.783794808143798,
      "learning_rate": 9.878046786582745e-06,
      "loss": 1.042,
      "step": 689
    },
    {
      "epoch": 0.5194316363978545,
      "grad_norm": 2.6862927134575414,
      "learning_rate": 9.853657740097558e-06,
      "loss": 1.0261,
      "step": 690
    },
    {
      "epoch": 0.5201844358708949,
      "grad_norm": 2.997479231338951,
      "learning_rate": 9.829269564250254e-06,
      "loss": 1.0427,
      "step": 691
    },
    {
      "epoch": 0.5209372353439352,
      "grad_norm": 2.4242881837479464,
      "learning_rate": 9.804882404134057e-06,
      "loss": 0.928,
      "step": 692
    },
    {
      "epoch": 0.5216900348169756,
      "grad_norm": 2.745092956089842,
      "learning_rate": 9.780496404836146e-06,
      "loss": 1.037,
      "step": 693
    },
    {
      "epoch": 0.522442834290016,
      "grad_norm": 3.03917646598378,
      "learning_rate": 9.75611171143679e-06,
      "loss": 0.9335,
      "step": 694
    },
    {
      "epoch": 0.5231956337630563,
      "grad_norm": 2.7717089194586713,
      "learning_rate": 9.731728469008493e-06,
      "loss": 1.0349,
      "step": 695
    },
    {
      "epoch": 0.5239484332360967,
      "grad_norm": 4.154071155669152,
      "learning_rate": 9.707346822615127e-06,
      "loss": 1.0371,
      "step": 696
    },
    {
      "epoch": 0.5247012327091372,
      "grad_norm": 2.888871286357065,
      "learning_rate": 9.682966917311065e-06,
      "loss": 0.9778,
      "step": 697
    },
    {
      "epoch": 0.5254540321821775,
      "grad_norm": 2.598234744411881,
      "learning_rate": 9.658588898140322e-06,
      "loss": 0.9517,
      "step": 698
    },
    {
      "epoch": 0.5262068316552179,
      "grad_norm": 2.87901406826843,
      "learning_rate": 9.634212910135697e-06,
      "loss": 1.0629,
      "step": 699
    },
    {
      "epoch": 0.5269596311282582,
      "grad_norm": 2.3502091823270153,
      "learning_rate": 9.609839098317902e-06,
      "loss": 0.9472,
      "step": 700
    },
    {
      "epoch": 0.5277124306012986,
      "grad_norm": 2.3727300076429043,
      "learning_rate": 9.585467607694702e-06,
      "loss": 0.9752,
      "step": 701
    },
    {
      "epoch": 0.528465230074339,
      "grad_norm": 3.0992227105534766,
      "learning_rate": 9.561098583260047e-06,
      "loss": 1.0605,
      "step": 702
    },
    {
      "epoch": 0.5292180295473793,
      "grad_norm": 2.998573259606581,
      "learning_rate": 9.536732169993225e-06,
      "loss": 1.0572,
      "step": 703
    },
    {
      "epoch": 0.5299708290204197,
      "grad_norm": 3.0645514660070363,
      "learning_rate": 9.512368512857983e-06,
      "loss": 1.0209,
      "step": 704
    },
    {
      "epoch": 0.5307236284934601,
      "grad_norm": 2.7686653578062446,
      "learning_rate": 9.488007756801672e-06,
      "loss": 0.9782,
      "step": 705
    },
    {
      "epoch": 0.5314764279665004,
      "grad_norm": 2.419321874756083,
      "learning_rate": 9.463650046754383e-06,
      "loss": 0.9543,
      "step": 706
    },
    {
      "epoch": 0.5322292274395408,
      "grad_norm": 3.2183835254514923,
      "learning_rate": 9.439295527628083e-06,
      "loss": 1.0494,
      "step": 707
    },
    {
      "epoch": 0.5329820269125811,
      "grad_norm": 2.7493248848894773,
      "learning_rate": 9.414944344315765e-06,
      "loss": 1.0513,
      "step": 708
    },
    {
      "epoch": 0.5337348263856215,
      "grad_norm": 2.5720306128631054,
      "learning_rate": 9.390596641690563e-06,
      "loss": 0.9475,
      "step": 709
    },
    {
      "epoch": 0.5344876258586619,
      "grad_norm": 2.6083152242917804,
      "learning_rate": 9.366252564604914e-06,
      "loss": 0.9755,
      "step": 710
    },
    {
      "epoch": 0.5352404253317022,
      "grad_norm": 2.49703677714488,
      "learning_rate": 9.34191225788968e-06,
      "loss": 0.9997,
      "step": 711
    },
    {
      "epoch": 0.5359932248047427,
      "grad_norm": 3.556514244662279,
      "learning_rate": 9.317575866353293e-06,
      "loss": 1.0298,
      "step": 712
    },
    {
      "epoch": 0.536746024277783,
      "grad_norm": 2.754748122306969,
      "learning_rate": 9.293243534780887e-06,
|
"loss": 1.0231, |
|
"step": 713 |
|
}, |
|
{ |
|
"epoch": 0.5374988237508234, |
|
"grad_norm": 2.458781472253963, |
|
"learning_rate": 9.268915407933457e-06, |
|
"loss": 0.9993, |
|
"step": 714 |
|
}, |
|
{ |
|
"epoch": 0.5382516232238638, |
|
"grad_norm": 3.1095834663708013, |
|
"learning_rate": 9.244591630546964e-06, |
|
"loss": 1.0862, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 0.5390044226969041, |
|
"grad_norm": 3.35070855022117, |
|
"learning_rate": 9.220272347331502e-06, |
|
"loss": 0.9522, |
|
"step": 716 |
|
}, |
|
{ |
|
"epoch": 0.5397572221699445, |
|
"grad_norm": 4.631416487344075, |
|
"learning_rate": 9.195957702970434e-06, |
|
"loss": 1.0246, |
|
"step": 717 |
|
}, |
|
{ |
|
"epoch": 0.5405100216429849, |
|
"grad_norm": 2.7689299663326112, |
|
"learning_rate": 9.171647842119515e-06, |
|
"loss": 1.0256, |
|
"step": 718 |
|
}, |
|
{ |
|
"epoch": 0.5412628211160252, |
|
"grad_norm": 3.049609437321965, |
|
"learning_rate": 9.14734290940604e-06, |
|
"loss": 1.0002, |
|
"step": 719 |
|
}, |
|
{ |
|
"epoch": 0.5420156205890656, |
|
"grad_norm": 2.795960963250181, |
|
"learning_rate": 9.123043049427996e-06, |
|
"loss": 1.0337, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.5427684200621059, |
|
"grad_norm": 2.8255346166814914, |
|
"learning_rate": 9.098748406753181e-06, |
|
"loss": 0.9985, |
|
"step": 721 |
|
}, |
|
{ |
|
"epoch": 0.5435212195351463, |
|
"grad_norm": 2.7816767930156137, |
|
"learning_rate": 9.074459125918356e-06, |
|
"loss": 1.0088, |
|
"step": 722 |
|
}, |
|
{ |
|
"epoch": 0.5442740190081867, |
|
"grad_norm": 2.9163237872445054, |
|
"learning_rate": 9.050175351428381e-06, |
|
"loss": 0.9896, |
|
"step": 723 |
|
}, |
|
{ |
|
"epoch": 0.545026818481227, |
|
"grad_norm": 2.7100301901021266, |
|
"learning_rate": 9.025897227755367e-06, |
|
"loss": 1.0298, |
|
"step": 724 |
|
}, |
|
{ |
|
"epoch": 0.5457796179542674, |
|
"grad_norm": 3.6617428934101586, |
|
"learning_rate": 9.001624899337785e-06, |
|
"loss": 0.9469, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 0.5465324174273078, |
|
"grad_norm": 3.2124650336329474, |
|
"learning_rate": 8.977358510579658e-06, |
|
"loss": 1.0152, |
|
"step": 726 |
|
}, |
|
{ |
|
"epoch": 0.5472852169003481, |
|
"grad_norm": 2.7297134092753077, |
|
"learning_rate": 8.953098205849647e-06, |
|
"loss": 1.0301, |
|
"step": 727 |
|
}, |
|
{ |
|
"epoch": 0.5480380163733886, |
|
"grad_norm": 3.555015522849158, |
|
"learning_rate": 8.928844129480228e-06, |
|
"loss": 1.0335, |
|
"step": 728 |
|
}, |
|
{ |
|
"epoch": 0.5487908158464289, |
|
"grad_norm": 2.9032937240322525, |
|
"learning_rate": 8.904596425766817e-06, |
|
"loss": 1.067, |
|
"step": 729 |
|
}, |
|
{ |
|
"epoch": 0.5495436153194693, |
|
"grad_norm": 2.8183123864129644, |
|
"learning_rate": 8.880355238966923e-06, |
|
"loss": 0.9862, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.5502964147925097, |
|
"grad_norm": 3.380735555760305, |
|
"learning_rate": 8.856120713299284e-06, |
|
"loss": 1.1157, |
|
"step": 731 |
|
}, |
|
{ |
|
"epoch": 0.55104921426555, |
|
"grad_norm": 3.4648885692214804, |
|
"learning_rate": 8.831892992943e-06, |
|
"loss": 1.0331, |
|
"step": 732 |
|
}, |
|
{ |
|
"epoch": 0.5518020137385904, |
|
"grad_norm": 3.0630177296042516, |
|
"learning_rate": 8.807672222036692e-06, |
|
"loss": 1.0538, |
|
"step": 733 |
|
}, |
|
{ |
|
"epoch": 0.5525548132116308, |
|
"grad_norm": 2.549871690933666, |
|
"learning_rate": 8.783458544677633e-06, |
|
"loss": 1.0284, |
|
"step": 734 |
|
}, |
|
{ |
|
"epoch": 0.5533076126846711, |
|
"grad_norm": 3.211702233203899, |
|
"learning_rate": 8.7592521049209e-06, |
|
"loss": 1.0029, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 0.5540604121577115, |
|
"grad_norm": 3.274729428072235, |
|
"learning_rate": 8.735053046778506e-06, |
|
"loss": 0.9601, |
|
"step": 736 |
|
}, |
|
{ |
|
"epoch": 0.5548132116307518, |
|
"grad_norm": 3.115868299982563, |
|
"learning_rate": 8.710861514218545e-06, |
|
"loss": 0.9472, |
|
"step": 737 |
|
}, |
|
{ |
|
"epoch": 0.5555660111037922, |
|
"grad_norm": 2.5731947785457154, |
|
"learning_rate": 8.686677651164345e-06, |
|
"loss": 0.9086, |
|
"step": 738 |
|
}, |
|
{ |
|
"epoch": 0.5563188105768326, |
|
"grad_norm": 2.5740093256448833, |
|
"learning_rate": 8.662501601493607e-06, |
|
"loss": 1.0467, |
|
"step": 739 |
|
}, |
|
{ |
|
"epoch": 0.5570716100498729, |
|
"grad_norm": 4.663822637060085, |
|
"learning_rate": 8.638333509037537e-06, |
|
"loss": 1.0923, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.5578244095229133, |
|
"grad_norm": 2.5224339575220482, |
|
"learning_rate": 8.61417351758001e-06, |
|
"loss": 1.0176, |
|
"step": 741 |
|
}, |
|
{ |
|
"epoch": 0.5585772089959538, |
|
"grad_norm": 2.6320547385384403, |
|
"learning_rate": 8.590021770856708e-06, |
|
"loss": 1.0732, |
|
"step": 742 |
|
}, |
|
{ |
|
"epoch": 0.559330008468994, |
|
"grad_norm": 3.0609203768889, |
|
"learning_rate": 8.565878412554251e-06, |
|
"loss": 1.0025, |
|
"step": 743 |
|
}, |
|
{ |
|
"epoch": 0.5600828079420345, |
|
"grad_norm": 3.0482117277870797, |
|
"learning_rate": 8.541743586309366e-06, |
|
"loss": 1.0, |
|
"step": 744 |
|
}, |
|
{ |
|
"epoch": 0.5608356074150748, |
|
"grad_norm": 2.9710905213118557, |
|
"learning_rate": 8.517617435708011e-06, |
|
"loss": 1.0132, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 0.5615884068881152, |
|
"grad_norm": 3.3175141891426647, |
|
"learning_rate": 8.493500104284539e-06, |
|
"loss": 0.9257, |
|
"step": 746 |
|
}, |
|
{ |
|
"epoch": 0.5623412063611556, |
|
"grad_norm": 2.508169904735289, |
|
"learning_rate": 8.469391735520824e-06, |
|
"loss": 0.938, |
|
"step": 747 |
|
}, |
|
{ |
|
"epoch": 0.5630940058341959, |
|
"grad_norm": 2.707186967130032, |
|
"learning_rate": 8.445292472845423e-06, |
|
"loss": 1.0384, |
|
"step": 748 |
|
}, |
|
{ |
|
"epoch": 0.5638468053072363, |
|
"grad_norm": 2.477550044042414, |
|
"learning_rate": 8.42120245963272e-06, |
|
"loss": 0.9575, |
|
"step": 749 |
|
}, |
|
{ |
|
"epoch": 0.5645996047802767, |
|
"grad_norm": 2.968333454620037, |
|
"learning_rate": 8.397121839202069e-06, |
|
"loss": 0.9827, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.565352404253317, |
|
"grad_norm": 6.9956574767627435, |
|
"learning_rate": 8.373050754816942e-06, |
|
"loss": 1.0719, |
|
"step": 751 |
|
}, |
|
{ |
|
"epoch": 0.5661052037263574, |
|
"grad_norm": 2.9911439956329686, |
|
"learning_rate": 8.348989349684077e-06, |
|
"loss": 0.9937, |
|
"step": 752 |
|
}, |
|
{ |
|
"epoch": 0.5668580031993977, |
|
"grad_norm": 2.6387837783947052, |
|
"learning_rate": 8.324937766952638e-06, |
|
"loss": 0.9247, |
|
"step": 753 |
|
}, |
|
{ |
|
"epoch": 0.5676108026724381, |
|
"grad_norm": 4.394239465779068, |
|
"learning_rate": 8.300896149713334e-06, |
|
"loss": 1.0567, |
|
"step": 754 |
|
}, |
|
{ |
|
"epoch": 0.5683636021454785, |
|
"grad_norm": 3.0051427224594613, |
|
"learning_rate": 8.276864640997602e-06, |
|
"loss": 1.0641, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 0.5691164016185188, |
|
"grad_norm": 3.028211050401354, |
|
"learning_rate": 8.252843383776731e-06, |
|
"loss": 0.9739, |
|
"step": 756 |
|
}, |
|
{ |
|
"epoch": 0.5698692010915593, |
|
"grad_norm": 2.526597328033783, |
|
"learning_rate": 8.228832520961023e-06, |
|
"loss": 0.9153, |
|
"step": 757 |
|
}, |
|
{ |
|
"epoch": 0.5706220005645996, |
|
"grad_norm": 2.701644840333796, |
|
"learning_rate": 8.204832195398941e-06, |
|
"loss": 1.0112, |
|
"step": 758 |
|
}, |
|
{ |
|
"epoch": 0.57137480003764, |
|
"grad_norm": 3.7978352979490464, |
|
"learning_rate": 8.18084254987626e-06, |
|
"loss": 0.9748, |
|
"step": 759 |
|
}, |
|
{ |
|
"epoch": 0.5721275995106804, |
|
"grad_norm": 3.758272789518827, |
|
"learning_rate": 8.15686372711521e-06, |
|
"loss": 1.0222, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.5728803989837207, |
|
"grad_norm": 2.5711172001094353, |
|
"learning_rate": 8.132895869773638e-06, |
|
"loss": 1.0119, |
|
"step": 761 |
|
}, |
|
{ |
|
"epoch": 0.5736331984567611, |
|
"grad_norm": 2.9242712243012012, |
|
"learning_rate": 8.108939120444154e-06, |
|
"loss": 0.9453, |
|
"step": 762 |
|
}, |
|
{ |
|
"epoch": 0.5743859979298015, |
|
"grad_norm": 3.010900564143939, |
|
"learning_rate": 8.084993621653283e-06, |
|
"loss": 0.9995, |
|
"step": 763 |
|
}, |
|
{ |
|
"epoch": 0.5751387974028418, |
|
"grad_norm": 3.013143165423945, |
|
"learning_rate": 8.061059515860616e-06, |
|
"loss": 0.9736, |
|
"step": 764 |
|
}, |
|
{ |
|
"epoch": 0.5758915968758822, |
|
"grad_norm": 3.600869577617628, |
|
"learning_rate": 8.037136945457959e-06, |
|
"loss": 1.0206, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 0.5766443963489225, |
|
"grad_norm": 2.8526015661570656, |
|
"learning_rate": 8.013226052768498e-06, |
|
"loss": 0.9567, |
|
"step": 766 |
|
}, |
|
{ |
|
"epoch": 0.5773971958219629, |
|
"grad_norm": 2.9267086899123322, |
|
"learning_rate": 7.989326980045937e-06, |
|
"loss": 1.021, |
|
"step": 767 |
|
}, |
|
{ |
|
"epoch": 0.5781499952950033, |
|
"grad_norm": 2.489653573594032, |
|
"learning_rate": 7.965439869473664e-06, |
|
"loss": 1.0382, |
|
"step": 768 |
|
}, |
|
{ |
|
"epoch": 0.5789027947680436, |
|
"grad_norm": 2.7327345835934804, |
|
"learning_rate": 7.941564863163899e-06, |
|
"loss": 1.058, |
|
"step": 769 |
|
}, |
|
{ |
|
"epoch": 0.579655594241084, |
|
"grad_norm": 3.209412795643892, |
|
"learning_rate": 7.91770210315685e-06, |
|
"loss": 1.0648, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.5804083937141244, |
|
"grad_norm": 2.9843151463835755, |
|
"learning_rate": 7.893851731419872e-06, |
|
"loss": 1.0647, |
|
"step": 771 |
|
}, |
|
{ |
|
"epoch": 0.5811611931871647, |
|
"grad_norm": 3.647581304182318, |
|
"learning_rate": 7.870013889846608e-06, |
|
"loss": 1.0173, |
|
"step": 772 |
|
}, |
|
{ |
|
"epoch": 0.5819139926602052, |
|
"grad_norm": 2.367016172308078, |
|
"learning_rate": 7.846188720256162e-06, |
|
"loss": 0.9627, |
|
"step": 773 |
|
}, |
|
{ |
|
"epoch": 0.5826667921332455, |
|
"grad_norm": 2.5064853047353646, |
|
"learning_rate": 7.822376364392248e-06, |
|
"loss": 0.9736, |
|
"step": 774 |
|
}, |
|
{ |
|
"epoch": 0.5834195916062859, |
|
"grad_norm": 3.3689106910977555, |
|
"learning_rate": 7.798576963922347e-06, |
|
"loss": 1.033, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 0.5841723910793263, |
|
"grad_norm": 2.545780240126541, |
|
"learning_rate": 7.774790660436857e-06, |
|
"loss": 1.0125, |
|
"step": 776 |
|
}, |
|
{ |
|
"epoch": 0.5849251905523666, |
|
"grad_norm": 2.827075772715696, |
|
"learning_rate": 7.75101759544827e-06, |
|
"loss": 1.1027, |
|
"step": 777 |
|
}, |
|
{ |
|
"epoch": 0.585677990025407, |
|
"grad_norm": 2.70846832173852, |
|
"learning_rate": 7.727257910390309e-06, |
|
"loss": 0.9252, |
|
"step": 778 |
|
}, |
|
{ |
|
"epoch": 0.5864307894984474, |
|
"grad_norm": 2.8457104629747123, |
|
"learning_rate": 7.703511746617098e-06, |
|
"loss": 0.8725, |
|
"step": 779 |
|
}, |
|
{ |
|
"epoch": 0.5871835889714877, |
|
"grad_norm": 3.248698548728837, |
|
"learning_rate": 7.679779245402321e-06, |
|
"loss": 0.9749, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.5879363884445281, |
|
"grad_norm": 3.1331071482094903, |
|
"learning_rate": 7.656060547938375e-06, |
|
"loss": 1.0044, |
|
"step": 781 |
|
}, |
|
{ |
|
"epoch": 0.5886891879175684, |
|
"grad_norm": 3.230980661848815, |
|
"learning_rate": 7.632355795335533e-06, |
|
"loss": 1.0545, |
|
"step": 782 |
|
}, |
|
{ |
|
"epoch": 0.5894419873906088, |
|
"grad_norm": 2.8480133523980578, |
|
"learning_rate": 7.608665128621111e-06, |
|
"loss": 0.9804, |
|
"step": 783 |
|
}, |
|
{ |
|
"epoch": 0.5901947868636492, |
|
"grad_norm": 3.6918447213822363, |
|
"learning_rate": 7.584988688738622e-06, |
|
"loss": 0.9664, |
|
"step": 784 |
|
}, |
|
{ |
|
"epoch": 0.5909475863366895, |
|
"grad_norm": 3.673578400225882, |
|
"learning_rate": 7.561326616546932e-06, |
|
"loss": 1.0105, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 0.59170038580973, |
|
"grad_norm": 3.0681486083575766, |
|
"learning_rate": 7.5376790528194354e-06, |
|
"loss": 0.9658, |
|
"step": 786 |
|
}, |
|
{ |
|
"epoch": 0.5924531852827704, |
|
"grad_norm": 2.969194265691157, |
|
"learning_rate": 7.514046138243211e-06, |
|
"loss": 1.037, |
|
"step": 787 |
|
}, |
|
{ |
|
"epoch": 0.5932059847558107, |
|
"grad_norm": 3.041477785697227, |
|
"learning_rate": 7.490428013418187e-06, |
|
"loss": 1.033, |
|
"step": 788 |
|
}, |
|
{ |
|
"epoch": 0.5939587842288511, |
|
"grad_norm": 2.8679574117693085, |
|
"learning_rate": 7.466824818856296e-06, |
|
"loss": 1.004, |
|
"step": 789 |
|
}, |
|
{ |
|
"epoch": 0.5947115837018914, |
|
"grad_norm": 2.2901391036080345, |
|
"learning_rate": 7.443236694980649e-06, |
|
"loss": 0.8783, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.5954643831749318, |
|
"grad_norm": 3.3270607439893025, |
|
"learning_rate": 7.4196637821247e-06, |
|
"loss": 1.0179, |
|
"step": 791 |
|
}, |
|
{ |
|
"epoch": 0.5962171826479722, |
|
"grad_norm": 2.358053180305501, |
|
"learning_rate": 7.396106220531398e-06, |
|
"loss": 0.9435, |
|
"step": 792 |
|
}, |
|
{ |
|
"epoch": 0.5969699821210125, |
|
"grad_norm": 3.076010451529784, |
|
"learning_rate": 7.372564150352373e-06, |
|
"loss": 0.9858, |
|
"step": 793 |
|
}, |
|
{ |
|
"epoch": 0.5977227815940529, |
|
"grad_norm": 3.20081193030361, |
|
"learning_rate": 7.349037711647089e-06, |
|
"loss": 1.007, |
|
"step": 794 |
|
}, |
|
{ |
|
"epoch": 0.5984755810670932, |
|
"grad_norm": 2.6824310401334444, |
|
"learning_rate": 7.325527044382004e-06, |
|
"loss": 0.9502, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 0.5992283805401336, |
|
"grad_norm": 2.967747752671785, |
|
"learning_rate": 7.3020322884297565e-06, |
|
"loss": 0.9533, |
|
"step": 796 |
|
}, |
|
{ |
|
"epoch": 0.599981180013174, |
|
"grad_norm": 2.3419025363845094, |
|
"learning_rate": 7.278553583568326e-06, |
|
"loss": 1.0294, |
|
"step": 797 |
|
}, |
|
{ |
|
"epoch": 0.6007339794862143, |
|
"grad_norm": 2.8919180147573744, |
|
"learning_rate": 7.2550910694801905e-06, |
|
"loss": 1.0084, |
|
"step": 798 |
|
}, |
|
{ |
|
"epoch": 0.6007339794862143, |
|
"eval_loss": 0.9782312512397766, |
|
"eval_runtime": 585.6058, |
|
"eval_samples_per_second": 30.541, |
|
"eval_steps_per_second": 0.478, |
|
"step": 798 |
|
}, |
|
{ |
|
"epoch": 0.6014867789592547, |
|
"grad_norm": 3.0960911393551, |
|
"learning_rate": 7.2316448857515076e-06, |
|
"loss": 0.9845, |
|
"step": 799 |
|
}, |
|
{ |
|
"epoch": 0.6022395784322951, |
|
"grad_norm": 2.5561402924270347, |
|
"learning_rate": 7.208215171871277e-06, |
|
"loss": 0.9745, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.6029923779053354, |
|
"grad_norm": 2.5104780203751322, |
|
"learning_rate": 7.184802067230518e-06, |
|
"loss": 0.9585, |
|
"step": 801 |
|
}, |
|
{ |
|
"epoch": 0.6037451773783759, |
|
"grad_norm": 2.412017836012546, |
|
"learning_rate": 7.161405711121436e-06, |
|
"loss": 0.874, |
|
"step": 802 |
|
}, |
|
{ |
|
"epoch": 0.6044979768514162, |
|
"grad_norm": 2.5834562133722523, |
|
"learning_rate": 7.1380262427365885e-06, |
|
"loss": 0.8987, |
|
"step": 803 |
|
}, |
|
{ |
|
"epoch": 0.6052507763244566, |
|
"grad_norm": 4.380554220513787, |
|
"learning_rate": 7.114663801168073e-06, |
|
"loss": 1.0493, |
|
"step": 804 |
|
}, |
|
{ |
|
"epoch": 0.606003575797497, |
|
"grad_norm": 2.7164301877546166, |
|
"learning_rate": 7.091318525406671e-06, |
|
"loss": 1.0521, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 0.6067563752705373, |
|
"grad_norm": 2.940119831487498, |
|
"learning_rate": 7.067990554341064e-06, |
|
"loss": 0.9458, |
|
"step": 806 |
|
}, |
|
{ |
|
"epoch": 0.6075091747435777, |
|
"grad_norm": 2.7406400760630176, |
|
"learning_rate": 7.04468002675696e-06, |
|
"loss": 1.0437, |
|
"step": 807 |
|
}, |
|
{ |
|
"epoch": 0.6082619742166181, |
|
"grad_norm": 2.8146647858748937, |
|
"learning_rate": 7.021387081336302e-06, |
|
"loss": 0.9932, |
|
"step": 808 |
|
}, |
|
{ |
|
"epoch": 0.6090147736896584, |
|
"grad_norm": 3.1861410008902857, |
|
"learning_rate": 6.998111856656427e-06, |
|
"loss": 1.0304, |
|
"step": 809 |
|
}, |
|
{ |
|
"epoch": 0.6097675731626988, |
|
"grad_norm": 3.3122100772434804, |
|
"learning_rate": 6.974854491189243e-06, |
|
"loss": 0.9891, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.6105203726357391, |
|
"grad_norm": 2.892082290412832, |
|
"learning_rate": 6.951615123300415e-06, |
|
"loss": 0.9414, |
|
"step": 811 |
|
}, |
|
{ |
|
"epoch": 0.6112731721087795, |
|
"grad_norm": 2.4416813602015925, |
|
"learning_rate": 6.928393891248529e-06, |
|
"loss": 1.004, |
|
"step": 812 |
|
}, |
|
{ |
|
"epoch": 0.6120259715818199, |
|
"grad_norm": 3.8141614574322893, |
|
"learning_rate": 6.905190933184275e-06, |
|
"loss": 0.965, |
|
"step": 813 |
|
}, |
|
{ |
|
"epoch": 0.6127787710548602, |
|
"grad_norm": 2.7579831000313115, |
|
"learning_rate": 6.882006387149625e-06, |
|
"loss": 1.0356, |
|
"step": 814 |
|
}, |
|
{ |
|
"epoch": 0.6135315705279006, |
|
"grad_norm": 3.092633092511484, |
|
"learning_rate": 6.858840391077017e-06, |
|
"loss": 0.9903, |
|
"step": 815 |
|
}, |
|
{ |
|
"epoch": 0.614284370000941, |
|
"grad_norm": 2.9786953925685755, |
|
"learning_rate": 6.8356930827885256e-06, |
|
"loss": 1.0204, |
|
"step": 816 |
|
}, |
|
{ |
|
"epoch": 0.6150371694739813, |
|
"grad_norm": 2.495282333549744, |
|
"learning_rate": 6.812564599995042e-06, |
|
"loss": 0.9635, |
|
"step": 817 |
|
}, |
|
{ |
|
"epoch": 0.6157899689470218, |
|
"grad_norm": 2.663824091107566, |
|
"learning_rate": 6.789455080295464e-06, |
|
"loss": 1.0795, |
|
"step": 818 |
|
}, |
|
{ |
|
"epoch": 0.6165427684200621, |
|
"grad_norm": 2.8628446717330647, |
|
"learning_rate": 6.766364661175872e-06, |
|
"loss": 0.9932, |
|
"step": 819 |
|
}, |
|
{ |
|
"epoch": 0.6172955678931025, |
|
"grad_norm": 2.7547900025423324, |
|
"learning_rate": 6.743293480008703e-06, |
|
"loss": 0.9901, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.6180483673661429, |
|
"grad_norm": 2.9160986440400074, |
|
"learning_rate": 6.720241674051948e-06, |
|
"loss": 1.0101, |
|
"step": 821 |
|
}, |
|
{ |
|
"epoch": 0.6188011668391832, |
|
"grad_norm": 2.7815364890305236, |
|
"learning_rate": 6.697209380448333e-06, |
|
"loss": 1.0153, |
|
"step": 822 |
|
}, |
|
{ |
|
"epoch": 0.6195539663122236, |
|
"grad_norm": 2.814498812667104, |
|
"learning_rate": 6.674196736224481e-06, |
|
"loss": 1.0124, |
|
"step": 823 |
|
}, |
|
{ |
|
"epoch": 0.620306765785264, |
|
"grad_norm": 3.284851022195675, |
|
"learning_rate": 6.651203878290139e-06, |
|
"loss": 1.0318, |
|
"step": 824 |
|
}, |
|
{ |
|
"epoch": 0.6210595652583043, |
|
"grad_norm": 4.043376728595679, |
|
"learning_rate": 6.628230943437319e-06, |
|
"loss": 1.0221, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 0.6218123647313447, |
|
"grad_norm": 2.724946690541462, |
|
"learning_rate": 6.605278068339516e-06, |
|
"loss": 0.9727, |
|
"step": 826 |
|
}, |
|
{ |
|
"epoch": 0.622565164204385, |
|
"grad_norm": 3.175680771426865, |
|
"learning_rate": 6.58234538955087e-06, |
|
"loss": 0.9562, |
|
"step": 827 |
|
}, |
|
{ |
|
"epoch": 0.6233179636774254, |
|
"grad_norm": 3.337757521832563, |
|
"learning_rate": 6.559433043505383e-06, |
|
"loss": 1.0422, |
|
"step": 828 |
|
}, |
|
{ |
|
"epoch": 0.6240707631504658, |
|
"grad_norm": 3.0583079706035456, |
|
"learning_rate": 6.536541166516079e-06, |
|
"loss": 0.9973, |
|
"step": 829 |
|
}, |
|
{ |
|
"epoch": 0.6248235626235061, |
|
"grad_norm": 3.4140875903459422, |
|
"learning_rate": 6.513669894774209e-06, |
|
"loss": 1.0313, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.6255763620965465, |
|
"grad_norm": 2.963263472232558, |
|
"learning_rate": 6.490819364348434e-06, |
|
"loss": 0.9431, |
|
"step": 831 |
|
}, |
|
{ |
|
"epoch": 0.626329161569587, |
|
"grad_norm": 3.885267790943743, |
|
"learning_rate": 6.467989711184021e-06, |
|
"loss": 0.9676, |
|
"step": 832 |
|
}, |
|
{ |
|
"epoch": 0.6270819610426273, |
|
"grad_norm": 2.986705312141074, |
|
"learning_rate": 6.445181071102034e-06, |
|
"loss": 0.9332, |
|
"step": 833 |
|
}, |
|
{ |
|
"epoch": 0.6278347605156677, |
|
"grad_norm": 2.4434737482142848, |
|
"learning_rate": 6.422393579798519e-06, |
|
"loss": 0.9472, |
|
"step": 834 |
|
}, |
|
{ |
|
"epoch": 0.628587559988708, |
|
"grad_norm": 2.9933676178531203, |
|
"learning_rate": 6.399627372843699e-06, |
|
"loss": 1.0365, |
|
"step": 835 |
|
}, |
|
{ |
|
"epoch": 0.6293403594617484, |
|
"grad_norm": 2.423565098566723, |
|
"learning_rate": 6.376882585681174e-06, |
|
"loss": 0.9378, |
|
"step": 836 |
|
}, |
|
{ |
|
"epoch": 0.6300931589347888, |
|
"grad_norm": 3.930550350027819, |
|
"learning_rate": 6.354159353627114e-06, |
|
"loss": 1.0059, |
|
"step": 837 |
|
}, |
|
{ |
|
"epoch": 0.6308459584078291, |
|
"grad_norm": 3.4001506334649667, |
|
"learning_rate": 6.331457811869437e-06, |
|
"loss": 0.9484, |
|
"step": 838 |
|
}, |
|
{ |
|
"epoch": 0.6315987578808695, |
|
"grad_norm": 2.9835541064513698, |
|
"learning_rate": 6.3087780954670306e-06, |
|
"loss": 0.9999, |
|
"step": 839 |
|
}, |
|
{ |
|
"epoch": 0.6323515573539098, |
|
"grad_norm": 2.9869824794113113, |
|
"learning_rate": 6.286120339348935e-06, |
|
"loss": 1.0178, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.6331043568269502, |
|
"grad_norm": 3.336008765122204, |
|
"learning_rate": 6.263484678313536e-06, |
|
"loss": 1.0082, |
|
"step": 841 |
|
}, |
|
{ |
|
"epoch": 0.6338571562999906, |
|
"grad_norm": 2.6343658023196146, |
|
"learning_rate": 6.240871247027774e-06, |
|
"loss": 0.9284, |
|
"step": 842 |
|
}, |
|
{ |
|
"epoch": 0.6346099557730309, |
|
"grad_norm": 2.8587661519934744, |
|
"learning_rate": 6.2182801800263325e-06, |
|
"loss": 0.9708, |
|
"step": 843 |
|
}, |
|
{ |
|
"epoch": 0.6353627552460713, |
|
"grad_norm": 2.68509448532749, |
|
"learning_rate": 6.195711611710851e-06, |
|
"loss": 0.973, |
|
"step": 844 |
|
}, |
|
{ |
|
"epoch": 0.6361155547191117, |
|
"grad_norm": 2.436855010449512, |
|
"learning_rate": 6.173165676349103e-06, |
|
"loss": 0.9015, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 0.636868354192152, |
|
"grad_norm": 3.2521498003954026, |
|
"learning_rate": 6.150642508074225e-06, |
|
"loss": 1.0108, |
|
"step": 846 |
|
}, |
|
{ |
|
"epoch": 0.6376211536651925, |
|
"grad_norm": 3.1006751282783402, |
|
"learning_rate": 6.128142240883899e-06, |
|
"loss": 0.9579, |
|
"step": 847 |
|
}, |
|
{ |
|
"epoch": 0.6383739531382328, |
|
"grad_norm": 2.8978862891079054, |
|
"learning_rate": 6.105665008639557e-06, |
|
"loss": 1.015, |
|
"step": 848 |
|
}, |
|
{ |
|
"epoch": 0.6391267526112732, |
|
"grad_norm": 2.279326073157125, |
|
"learning_rate": 6.083210945065595e-06, |
|
"loss": 0.9875, |
|
"step": 849 |
|
}, |
|
{ |
|
"epoch": 0.6398795520843136, |
|
"grad_norm": 3.0675504012666654, |
|
"learning_rate": 6.0607801837485665e-06, |
|
"loss": 0.9362, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.6406323515573539, |
|
"grad_norm": 2.4679963317993, |
|
"learning_rate": 6.038372858136401e-06, |
|
"loss": 1.0029, |
|
"step": 851 |
|
}, |
|
{ |
|
"epoch": 0.6413851510303943, |
|
"grad_norm": 2.318094472802182, |
|
"learning_rate": 6.015989101537586e-06, |
|
"loss": 0.9346, |
|
"step": 852 |
|
}, |
|
{ |
|
"epoch": 0.6421379505034347, |
|
"grad_norm": 3.5138374389839737, |
|
"learning_rate": 5.993629047120403e-06, |
|
"loss": 1.0261, |
|
"step": 853 |
|
}, |
|
{ |
|
"epoch": 0.642890749976475, |
|
"grad_norm": 2.854935679385697, |
|
"learning_rate": 5.971292827912117e-06, |
|
"loss": 0.9759, |
|
"step": 854 |
|
}, |
|
{ |
|
"epoch": 0.6436435494495154, |
|
"grad_norm": 2.4090683744161834, |
|
"learning_rate": 5.9489805767981845e-06, |
|
"loss": 0.9546, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 0.6443963489225557, |
|
"grad_norm": 2.7843495775159233, |
|
"learning_rate": 5.926692426521474e-06, |
|
"loss": 0.8722, |
|
"step": 856 |
|
}, |
|
{ |
|
"epoch": 0.6451491483955961, |
|
"grad_norm": 2.740863877578159, |
|
"learning_rate": 5.904428509681473e-06, |
|
"loss": 0.9348, |
|
"step": 857 |
|
}, |
|
{ |
|
"epoch": 0.6459019478686365, |
|
"grad_norm": 3.0461938473498833, |
|
"learning_rate": 5.882188958733488e-06, |
|
"loss": 0.9922, |
|
"step": 858 |
|
}, |
|
{ |
|
"epoch": 0.6466547473416768, |
|
"grad_norm": 3.0394454770711294, |
|
"learning_rate": 5.859973905987866e-06, |
|
"loss": 0.9624, |
|
"step": 859 |
|
}, |
|
{ |
|
"epoch": 0.6474075468147172, |
|
"grad_norm": 2.723188343497309, |
|
"learning_rate": 5.837783483609214e-06, |
|
"loss": 1.0125, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.6481603462877576, |
|
"grad_norm": 2.3691610151759432, |
|
"learning_rate": 5.815617823615599e-06, |
|
"loss": 0.9457, |
|
"step": 861 |
|
}, |
|
{ |
|
"epoch": 0.648913145760798, |
|
"grad_norm": 2.667516135651494, |
|
"learning_rate": 5.79347705787777e-06, |
|
"loss": 1.0045, |
|
"step": 862 |
|
}, |
|
{ |
|
"epoch": 0.6496659452338384, |
|
"grad_norm": 3.301425067030213, |
|
"learning_rate": 5.771361318118374e-06, |
|
"loss": 1.0249, |
|
"step": 863 |
|
}, |
|
{ |
|
"epoch": 0.6504187447068787, |
|
"grad_norm": 3.075114609965823, |
|
"learning_rate": 5.749270735911159e-06, |
|
"loss": 1.0088, |
|
"step": 864 |
|
}, |
|
{ |
|
"epoch": 0.6511715441799191, |
|
"grad_norm": 2.349924278634797, |
|
"learning_rate": 5.727205442680218e-06, |
|
"loss": 0.9746, |
|
"step": 865 |
|
}, |
|
{ |
|
"epoch": 0.6519243436529595, |
|
"grad_norm": 2.9453643370218003, |
|
"learning_rate": 5.7051655696991825e-06, |
|
"loss": 0.9492, |
|
"step": 866 |
|
}, |
|
{ |
|
"epoch": 0.6526771431259998, |
|
"grad_norm": 3.003085836693138, |
|
"learning_rate": 5.683151248090455e-06, |
|
"loss": 1.0076, |
|
"step": 867 |
|
}, |
|
{ |
|
"epoch": 0.6534299425990402, |
|
"grad_norm": 4.541919041618707, |
|
"learning_rate": 5.66116260882442e-06, |
|
"loss": 0.9974, |
|
"step": 868 |
|
}, |
|
{ |
|
"epoch": 0.6541827420720806, |
|
"grad_norm": 3.3380757605339038, |
|
"learning_rate": 5.639199782718674e-06, |
|
"loss": 1.0176, |
|
"step": 869 |
|
}, |
|
{ |
|
"epoch": 0.6549355415451209, |
|
"grad_norm": 7.9191564125396345, |
|
"learning_rate": 5.617262900437239e-06, |
|
"loss": 1.0649, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.6556883410181613, |
|
"grad_norm": 3.1847304138480963, |
|
"learning_rate": 5.595352092489791e-06, |
|
"loss": 1.0244, |
|
"step": 871 |
|
}, |
|
{ |
|
"epoch": 0.6564411404912016, |
|
"grad_norm": 2.8005119319971072, |
|
"learning_rate": 5.573467489230879e-06, |
|
"loss": 0.9347, |
|
"step": 872 |
|
}, |
|
{ |
|
"epoch": 0.657193939964242, |
|
"grad_norm": 3.7323965387138887, |
|
"learning_rate": 5.55160922085916e-06, |
|
"loss": 1.0172, |
|
"step": 873 |
|
}, |
|
{ |
|
"epoch": 0.6579467394372824, |
|
"grad_norm": 3.100690883893798, |
|
"learning_rate": 5.529777417416599e-06, |
|
"loss": 1.0025, |
|
"step": 874 |
|
}, |
|
{ |
|
"epoch": 0.6586995389103227, |
|
"grad_norm": 3.061825155807451, |
|
"learning_rate": 5.507972208787728e-06, |
|
"loss": 0.9475, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 0.6594523383833631, |
|
"grad_norm": 3.0772915102988456, |
|
"learning_rate": 5.486193724698854e-06, |
|
"loss": 0.899, |
|
"step": 876 |
|
}, |
|
{ |
|
"epoch": 0.6602051378564034, |
|
"grad_norm": 2.4561283709078596, |
|
"learning_rate": 5.464442094717281e-06, |
|
"loss": 0.9297, |
|
"step": 877 |
|
}, |
|
{ |
|
"epoch": 0.6609579373294439, |
|
"grad_norm": 3.2450122492959106, |
|
"learning_rate": 5.442717448250574e-06, |
|
"loss": 1.0168, |
|
"step": 878 |
|
}, |
|
{ |
|
"epoch": 0.6617107368024843, |
|
"grad_norm": 2.6488414083706213, |
|
"learning_rate": 5.421019914545735e-06, |
|
"loss": 0.9117, |
|
"step": 879 |
|
}, |
|
{ |
|
"epoch": 0.6624635362755246, |
|
"grad_norm": 2.3123514940320677, |
|
"learning_rate": 5.399349622688479e-06, |
|
"loss": 0.985, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.663216335748565, |
|
"grad_norm": 3.4193564013787157, |
|
"learning_rate": 5.3777067016024495e-06, |
|
"loss": 1.0232, |
|
"step": 881 |
|
}, |
|
{ |
|
"epoch": 0.6639691352216054, |
|
"grad_norm": 2.6262517727005013, |
|
"learning_rate": 5.35609128004845e-06, |
|
"loss": 0.9809, |
|
"step": 882 |
|
}, |
|
{ |
|
"epoch": 0.6647219346946457, |
|
"grad_norm": 3.544497471768593, |
|
"learning_rate": 5.33450348662368e-06, |
|
"loss": 1.0073, |
|
"step": 883 |
|
}, |
|
{ |
|
"epoch": 0.6654747341676861, |
|
"grad_norm": 2.208983215094343, |
|
"learning_rate": 5.312943449760975e-06, |
|
"loss": 0.9091, |
|
"step": 884 |
|
}, |
|
{ |
|
"epoch": 0.6662275336407264, |
|
"grad_norm": 2.655612493732194, |
|
"learning_rate": 5.291411297728027e-06, |
|
"loss": 0.9758, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 0.6669803331137668, |
|
"grad_norm": 3.0194835999472405, |
|
"learning_rate": 5.269907158626639e-06, |
|
"loss": 1.0229, |
|
"step": 886 |
|
}, |
|
{ |
|
"epoch": 0.6677331325868072, |
|
"grad_norm": 3.113229923263648, |
|
"learning_rate": 5.248431160391963e-06, |
|
"loss": 0.9857, |
|
"step": 887 |
|
}, |
|
{ |
|
"epoch": 0.6684859320598475, |
|
"grad_norm": 2.870839406559322, |
|
"learning_rate": 5.226983430791722e-06, |
|
"loss": 1.027, |
|
"step": 888 |
|
}, |
|
{ |
|
"epoch": 0.6692387315328879, |
|
"grad_norm": 2.652991785215087, |
|
"learning_rate": 5.205564097425458e-06, |
|
"loss": 1.0588, |
|
"step": 889 |
|
}, |
|
{ |
|
"epoch": 0.6699915310059283, |
|
"grad_norm": 2.895387116149638, |
|
"learning_rate": 5.184173287723782e-06, |
|
"loss": 0.9812, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.6707443304789686, |
|
"grad_norm": 2.764343371936672, |
|
"learning_rate": 5.1628111289476025e-06, |
|
"loss": 1.0278, |
|
"step": 891 |
|
}, |
|
{ |
|
"epoch": 0.671497129952009, |
|
"grad_norm": 3.649390125582586, |
|
"learning_rate": 5.14147774818738e-06, |
|
"loss": 1.0463, |
|
"step": 892 |
|
}, |
|
{ |
|
"epoch": 0.6722499294250494, |
|
"grad_norm": 2.6370061008680907, |
|
"learning_rate": 5.120173272362361e-06, |
|
"loss": 1.0512, |
|
"step": 893 |
|
}, |
|
{ |
|
"epoch": 0.6730027288980898, |
|
"grad_norm": 3.2489475334028652, |
|
"learning_rate": 5.098897828219831e-06, |
|
"loss": 0.9963, |
|
"step": 894 |
|
}, |
|
{ |
|
"epoch": 0.6737555283711302, |
|
"grad_norm": 2.86598289535442, |
|
"learning_rate": 5.0776515423343445e-06, |
|
"loss": 0.9304, |
|
"step": 895 |
|
}, |
|
{ |
|
"epoch": 0.6745083278441705, |
|
"grad_norm": 2.963772691083135, |
|
"learning_rate": 5.0564345411070025e-06, |
|
"loss": 1.0188, |
|
"step": 896 |
|
}, |
|
{ |
|
"epoch": 0.6752611273172109, |
|
"grad_norm": 2.5796584317987916, |
|
"learning_rate": 5.03524695076467e-06, |
|
"loss": 0.9741, |
|
"step": 897 |
|
}, |
|
{ |
|
"epoch": 0.6760139267902513, |
|
"grad_norm": 2.51416923212948, |
|
"learning_rate": 5.014088897359242e-06, |
|
"loss": 1.0096, |
|
"step": 898 |
|
}, |
|
{ |
|
"epoch": 0.6767667262632916, |
|
"grad_norm": 2.6665940615567187, |
|
"learning_rate": 4.992960506766881e-06, |
|
"loss": 0.9346, |
|
"step": 899 |
|
}, |
|
{ |
|
"epoch": 0.677519525736332, |
|
"grad_norm": 3.065335474625922, |
|
"learning_rate": 4.971861904687283e-06, |
|
"loss": 1.0338, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.6782723252093723, |
|
"grad_norm": 2.9639586464498913, |
|
"learning_rate": 4.950793216642923e-06, |
|
"loss": 0.9781, |
|
"step": 901 |
|
}, |
|
{ |
|
"epoch": 0.6790251246824127, |
|
"grad_norm": 3.040680837970922, |
|
"learning_rate": 4.929754567978303e-06, |
|
"loss": 1.0059, |
|
"step": 902 |
|
}, |
|
{ |
|
"epoch": 0.6797779241554531, |
|
"grad_norm": 2.6333146143761716, |
|
"learning_rate": 4.908746083859214e-06, |
|
"loss": 0.985, |
|
"step": 903 |
|
}, |
|
{ |
|
"epoch": 0.6805307236284934, |
|
"grad_norm": 3.292933520792098, |
|
"learning_rate": 4.887767889271987e-06, |
|
"loss": 1.0661, |
|
"step": 904 |
|
}, |
|
{ |
|
"epoch": 0.6812835231015338, |
|
"grad_norm": 2.652137597319615, |
|
"learning_rate": 4.866820109022752e-06, |
|
"loss": 1.0072, |
|
"step": 905 |
|
}, |
|
{ |
|
"epoch": 0.6820363225745742, |
|
"grad_norm": 3.1886078375655136, |
|
"learning_rate": 4.845902867736692e-06, |
|
"loss": 1.0616, |
|
"step": 906 |
|
}, |
|
{ |
|
"epoch": 0.6827891220476145, |
|
"grad_norm": 2.6580714465346036, |
|
"learning_rate": 4.8250162898573046e-06, |
|
"loss": 0.9878, |
|
"step": 907 |
|
}, |
|
{ |
|
"epoch": 0.683541921520655, |
|
"grad_norm": 3.5025560301930834, |
|
"learning_rate": 4.804160499645667e-06, |
|
"loss": 1.0037, |
|
"step": 908 |
|
}, |
|
{ |
|
"epoch": 0.6842947209936953, |
|
"grad_norm": 3.0446356739651215, |
|
"learning_rate": 4.783335621179675e-06, |
|
"loss": 0.9986, |
|
"step": 909 |
|
}, |
|
{ |
|
"epoch": 0.6850475204667357, |
|
"grad_norm": 2.5280116263957524, |
|
"learning_rate": 4.762541778353337e-06, |
|
"loss": 0.9486, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.6858003199397761, |
|
"grad_norm": 3.185618921006941, |
|
"learning_rate": 4.741779094876009e-06, |
|
"loss": 1.0347, |
|
"step": 911 |
|
}, |
|
{ |
|
"epoch": 0.6865531194128164, |
|
"grad_norm": 3.2600894992341525, |
|
"learning_rate": 4.721047694271676e-06, |
|
"loss": 1.1032, |
|
"step": 912 |
|
}, |
|
{ |
|
"epoch": 0.6873059188858568, |
|
"grad_norm": 2.661744895585514, |
|
"learning_rate": 4.700347699878211e-06, |
|
"loss": 0.9778, |
|
"step": 913 |
|
}, |
|
{ |
|
"epoch": 0.6880587183588972, |
|
"grad_norm": 2.806546351632779, |
|
"learning_rate": 4.679679234846636e-06, |
|
"loss": 0.9729, |
|
"step": 914 |
|
}, |
|
{ |
|
"epoch": 0.6888115178319375, |
|
"grad_norm": 3.2198890803288136, |
|
"learning_rate": 4.659042422140399e-06, |
|
"loss": 1.0176, |
|
"step": 915 |
|
}, |
|
{ |
|
"epoch": 0.6895643173049779, |
|
"grad_norm": 3.1995454384698245, |
|
"learning_rate": 4.6384373845346375e-06, |
|
"loss": 0.9504, |
|
"step": 916 |
|
}, |
|
{ |
|
"epoch": 0.6903171167780182, |
|
"grad_norm": 2.206607456533358, |
|
"learning_rate": 4.617864244615448e-06, |
|
"loss": 0.9667, |
|
"step": 917 |
|
}, |
|
{ |
|
"epoch": 0.6910699162510586, |
|
"grad_norm": 2.7105101530672986, |
|
"learning_rate": 4.597323124779155e-06, |
|
"loss": 0.9501, |
|
"step": 918 |
|
}, |
|
{ |
|
"epoch": 0.691822715724099, |
|
"grad_norm": 2.486726858098457, |
|
"learning_rate": 4.576814147231594e-06, |
|
"loss": 0.9677, |
|
"step": 919 |
|
}, |
|
{ |
|
"epoch": 0.6925755151971393, |
|
"grad_norm": 3.5397984883778983, |
|
"learning_rate": 4.556337433987359e-06, |
|
"loss": 1.0204, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.6933283146701797, |
|
"grad_norm": 3.0384236542438336, |
|
"learning_rate": 4.535893106869107e-06, |
|
"loss": 1.1224, |
|
"step": 921 |
|
}, |
|
{ |
|
"epoch": 0.69408111414322, |
|
"grad_norm": 2.8509939761812233, |
|
"learning_rate": 4.515481287506811e-06, |
|
"loss": 0.9704, |
|
"step": 922 |
|
}, |
|
{ |
|
"epoch": 0.6948339136162605, |
|
"grad_norm": 2.612764095608806, |
|
"learning_rate": 4.495102097337062e-06, |
|
"loss": 0.9182, |
|
"step": 923 |
|
}, |
|
{ |
|
"epoch": 0.6955867130893009, |
|
"grad_norm": 2.991760219708861, |
|
"learning_rate": 4.474755657602303e-06, |
|
"loss": 0.9583, |
|
"step": 924 |
|
}, |
|
{ |
|
"epoch": 0.6963395125623412, |
|
"grad_norm": 2.680080595323768, |
|
"learning_rate": 4.454442089350151e-06, |
|
"loss": 1.0236, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 0.6970923120353816, |
|
"grad_norm": 2.4653672558221067, |
|
"learning_rate": 4.434161513432659e-06, |
|
"loss": 0.8888, |
|
"step": 926 |
|
}, |
|
{ |
|
"epoch": 0.697845111508422, |
|
"grad_norm": 3.4498774018544447, |
|
"learning_rate": 4.413914050505591e-06, |
|
"loss": 0.9749, |
|
"step": 927 |
|
}, |
|
{ |
|
"epoch": 0.6985979109814623, |
|
"grad_norm": 2.9269586402584724, |
|
"learning_rate": 4.393699821027716e-06, |
|
"loss": 0.9258, |
|
"step": 928 |
|
}, |
|
{ |
|
"epoch": 0.6993507104545027, |
|
"grad_norm": 3.2070405609265764, |
|
"learning_rate": 4.37351894526009e-06, |
|
"loss": 1.0338, |
|
"step": 929 |
|
}, |
|
{ |
|
"epoch": 0.700103509927543, |
|
"grad_norm": 2.8651840370763333, |
|
"learning_rate": 4.35337154326532e-06, |
|
"loss": 0.8729, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.7008563094005834, |
|
"grad_norm": 2.9774465542399424, |
|
"learning_rate": 4.333257734906889e-06, |
|
"loss": 0.9809, |
|
"step": 931 |
|
}, |
|
{ |
|
"epoch": 0.7008563094005834, |
|
"eval_loss": 0.9652193188667297, |
|
"eval_runtime": 585.081, |
|
"eval_samples_per_second": 30.568, |
|
"eval_steps_per_second": 0.479, |
|
"step": 931 |
|
}, |
|
{ |
|
"epoch": 0.7016091088736238, |
|
"grad_norm": 3.7083258629749887, |
|
"learning_rate": 4.313177639848408e-06, |
|
"loss": 1.044, |
|
"step": 932 |
|
}, |
|
{ |
|
"epoch": 0.7023619083466641, |
|
"grad_norm": 3.682326422466505, |
|
"learning_rate": 4.293131377552923e-06, |
|
"loss": 0.9859, |
|
"step": 933 |
|
}, |
|
{ |
|
"epoch": 0.7031147078197045, |
|
"grad_norm": 2.6777610705109494, |
|
"learning_rate": 4.273119067282184e-06, |
|
"loss": 0.9582, |
|
"step": 934 |
|
}, |
|
{ |
|
"epoch": 0.7038675072927449, |
|
"grad_norm": 2.473756981986894, |
|
"learning_rate": 4.253140828095964e-06, |
|
"loss": 0.9545, |
|
"step": 935 |
|
}, |
|
{ |
|
"epoch": 0.7046203067657852, |
|
"grad_norm": 2.9701523969106978, |
|
"learning_rate": 4.2331967788513295e-06, |
|
"loss": 0.9585, |
|
"step": 936 |
|
}, |
|
{ |
|
"epoch": 0.7053731062388257, |
|
"grad_norm": 2.7968419484911715, |
|
"learning_rate": 4.213287038201943e-06, |
|
"loss": 0.9302, |
|
"step": 937 |
|
}, |
|
{ |
|
"epoch": 0.706125905711866, |
|
"grad_norm": 3.068880113580643, |
|
"learning_rate": 4.193411724597352e-06, |
|
"loss": 1.0344, |
|
"step": 938 |
|
}, |
|
{ |
|
"epoch": 0.7068787051849064, |
|
"grad_norm": 2.6284431085135656, |
|
"learning_rate": 4.173570956282286e-06, |
|
"loss": 0.9452, |
|
"step": 939 |
|
}, |
|
{ |
|
"epoch": 0.7076315046579468, |
|
"grad_norm": 4.4448629730253915, |
|
"learning_rate": 4.153764851295954e-06, |
|
"loss": 0.9522, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.7083843041309871, |
|
"grad_norm": 3.262892448671552, |
|
"learning_rate": 4.1339935274713404e-06, |
|
"loss": 0.9437, |
|
"step": 941 |
|
}, |
|
{ |
|
"epoch": 0.7091371036040275, |
|
"grad_norm": 2.523698427815705, |
|
"learning_rate": 4.114257102434508e-06, |
|
"loss": 0.9616, |
|
"step": 942 |
|
}, |
|
{ |
|
"epoch": 0.7098899030770679, |
|
"grad_norm": 3.44474459550001, |
|
"learning_rate": 4.094555693603891e-06, |
|
"loss": 1.0122, |
|
"step": 943 |
|
}, |
|
{ |
|
"epoch": 0.7106427025501082, |
|
"grad_norm": 2.950294841102305, |
|
"learning_rate": 4.074889418189608e-06, |
|
"loss": 0.9568, |
|
"step": 944 |
|
}, |
|
{ |
|
"epoch": 0.7113955020231486, |
|
"grad_norm": 4.539750688224925, |
|
"learning_rate": 4.055258393192746e-06, |
|
"loss": 0.9975, |
|
"step": 945 |
|
}, |
|
{ |
|
"epoch": 0.7121483014961889, |
|
"grad_norm": 3.2693568806029996, |
|
"learning_rate": 4.0356627354046854e-06, |
|
"loss": 1.0896, |
|
"step": 946 |
|
}, |
|
{ |
|
"epoch": 0.7129011009692293, |
|
"grad_norm": 2.8301942529564275, |
|
"learning_rate": 4.016102561406392e-06, |
|
"loss": 0.9751, |
|
"step": 947 |
|
}, |
|
{ |
|
"epoch": 0.7136539004422697, |
|
"grad_norm": 3.279627709424991, |
|
"learning_rate": 3.996577987567727e-06, |
|
"loss": 0.914, |
|
"step": 948 |
|
}, |
|
{ |
|
"epoch": 0.71440669991531, |
|
"grad_norm": 2.832349441872171, |
|
"learning_rate": 3.977089130046756e-06, |
|
"loss": 0.9785, |
|
"step": 949 |
|
}, |
|
{ |
|
"epoch": 0.7151594993883504, |
|
"grad_norm": 3.1766100419605885, |
|
"learning_rate": 3.957636104789056e-06, |
|
"loss": 1.0199, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.7159122988613909, |
|
"grad_norm": 3.1393936203952255, |
|
"learning_rate": 3.938219027527023e-06, |
|
"loss": 1.0026, |
|
"step": 951 |
|
}, |
|
{ |
|
"epoch": 0.7166650983344311, |
|
"grad_norm": 2.4038458329860415, |
|
"learning_rate": 3.9188380137791934e-06, |
|
"loss": 0.9613, |
|
"step": 952 |
|
}, |
|
{ |
|
"epoch": 0.7174178978074716, |
|
"grad_norm": 2.7702463519768736, |
|
"learning_rate": 3.899493178849544e-06, |
|
"loss": 0.9076, |
|
"step": 953 |
|
}, |
|
{ |
|
"epoch": 0.7181706972805119, |
|
"grad_norm": 2.831283963811367, |
|
"learning_rate": 3.880184637826816e-06, |
|
"loss": 0.9818, |
|
"step": 954 |
|
}, |
|
{ |
|
"epoch": 0.7189234967535523, |
|
"grad_norm": 2.483000422386397, |
|
"learning_rate": 3.860912505583819e-06, |
|
"loss": 0.8982, |
|
"step": 955 |
|
}, |
|
{ |
|
"epoch": 0.7196762962265927, |
|
"grad_norm": 2.699549276031205, |
|
"learning_rate": 3.841676896776764e-06, |
|
"loss": 1.0386, |
|
"step": 956 |
|
}, |
|
{ |
|
"epoch": 0.720429095699633, |
|
"grad_norm": 3.118505420829208, |
|
"learning_rate": 3.822477925844564e-06, |
|
"loss": 0.956, |
|
"step": 957 |
|
}, |
|
{ |
|
"epoch": 0.7211818951726734, |
|
"grad_norm": 3.4585379790093245, |
|
"learning_rate": 3.803315707008176e-06, |
|
"loss": 0.9622, |
|
"step": 958 |
|
}, |
|
{ |
|
"epoch": 0.7219346946457137, |
|
"grad_norm": 2.8026653218513333, |
|
"learning_rate": 3.7841903542698855e-06, |
|
"loss": 1.0212, |
|
"step": 959 |
|
}, |
|
{ |
|
"epoch": 0.7226874941187541, |
|
"grad_norm": 3.049945822314557, |
|
"learning_rate": 3.7651019814126656e-06, |
|
"loss": 0.9308, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.7234402935917945, |
|
"grad_norm": 3.878397734938858, |
|
"learning_rate": 3.7460507019994775e-06, |
|
"loss": 1.0046, |
|
"step": 961 |
|
}, |
|
{ |
|
"epoch": 0.7241930930648348, |
|
"grad_norm": 2.9601126857355506, |
|
"learning_rate": 3.7270366293726033e-06, |
|
"loss": 0.955, |
|
"step": 962 |
|
}, |
|
{ |
|
"epoch": 0.7249458925378752, |
|
"grad_norm": 2.844017367671424, |
|
"learning_rate": 3.7080598766529686e-06, |
|
"loss": 0.9088, |
|
"step": 963 |
|
}, |
|
{ |
|
"epoch": 0.7256986920109156, |
|
"grad_norm": 4.1384896382910625, |
|
"learning_rate": 3.689120556739475e-06, |
|
"loss": 1.0619, |
|
"step": 964 |
|
}, |
|
{ |
|
"epoch": 0.7264514914839559, |
|
"grad_norm": 3.8764277193054784, |
|
"learning_rate": 3.6702187823083147e-06, |
|
"loss": 0.9416, |
|
"step": 965 |
|
}, |
|
{ |
|
"epoch": 0.7272042909569963, |
|
"grad_norm": 2.9803869936914675, |
|
"learning_rate": 3.651354665812313e-06, |
|
"loss": 1.0522, |
|
"step": 966 |
|
}, |
|
{ |
|
"epoch": 0.7279570904300366, |
|
"grad_norm": 3.3176825382488926, |
|
"learning_rate": 3.6325283194802675e-06, |
|
"loss": 0.9702, |
|
"step": 967 |
|
}, |
|
{ |
|
"epoch": 0.7287098899030771, |
|
"grad_norm": 2.6696911093544173, |
|
"learning_rate": 3.613739855316257e-06, |
|
"loss": 0.8698, |
|
"step": 968 |
|
}, |
|
{ |
|
"epoch": 0.7294626893761175, |
|
"grad_norm": 3.1150460867558003, |
|
"learning_rate": 3.594989385098985e-06, |
|
"loss": 0.9889, |
|
"step": 969 |
|
}, |
|
{ |
|
"epoch": 0.7302154888491578, |
|
"grad_norm": 2.935099853350304, |
|
"learning_rate": 3.5762770203811225e-06, |
|
"loss": 0.9321, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.7309682883221982, |
|
"grad_norm": 2.44072267113799, |
|
"learning_rate": 3.557602872488638e-06, |
|
"loss": 0.9667, |
|
"step": 971 |
|
}, |
|
{ |
|
"epoch": 0.7317210877952386, |
|
"grad_norm": 3.0638151270341902, |
|
"learning_rate": 3.5389670525201335e-06, |
|
"loss": 0.9692, |
|
"step": 972 |
|
}, |
|
{ |
|
"epoch": 0.7324738872682789, |
|
"grad_norm": 3.5159680803026143, |
|
"learning_rate": 3.5203696713461866e-06, |
|
"loss": 1.0826, |
|
"step": 973 |
|
}, |
|
{ |
|
"epoch": 0.7332266867413193, |
|
"grad_norm": 2.476026867492786, |
|
"learning_rate": 3.5018108396086945e-06, |
|
"loss": 1.0051, |
|
"step": 974 |
|
}, |
|
{ |
|
"epoch": 0.7339794862143596, |
|
"grad_norm": 2.376830294565953, |
|
"learning_rate": 3.483290667720196e-06, |
|
"loss": 1.0273, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 0.7347322856874, |
|
"grad_norm": 2.2455010627110514, |
|
"learning_rate": 3.4648092658632506e-06, |
|
"loss": 0.9793, |
|
"step": 976 |
|
}, |
|
{ |
|
"epoch": 0.7354850851604404, |
|
"grad_norm": 2.6970914455002535, |
|
"learning_rate": 3.4463667439897486e-06, |
|
"loss": 0.9663, |
|
"step": 977 |
|
}, |
|
{ |
|
"epoch": 0.7362378846334807, |
|
"grad_norm": 3.3981774487654017, |
|
"learning_rate": 3.4279632118202744e-06, |
|
"loss": 0.9573, |
|
"step": 978 |
|
}, |
|
{ |
|
"epoch": 0.7369906841065211, |
|
"grad_norm": 2.5187307744033776, |
|
"learning_rate": 3.4095987788434538e-06, |
|
"loss": 0.9323, |
|
"step": 979 |
|
}, |
|
{ |
|
"epoch": 0.7377434835795615, |
|
"grad_norm": 2.6118138428418853, |
|
"learning_rate": 3.3912735543152864e-06, |
|
"loss": 0.9362, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.7384962830526018, |
|
"grad_norm": 2.751134189690385, |
|
"learning_rate": 3.372987647258521e-06, |
|
"loss": 0.9826, |
|
"step": 981 |
|
}, |
|
{ |
|
"epoch": 0.7392490825256423, |
|
"grad_norm": 2.5860913540515322, |
|
"learning_rate": 3.354741166461989e-06, |
|
"loss": 0.9628, |
|
"step": 982 |
|
}, |
|
{ |
|
"epoch": 0.7400018819986826, |
|
"grad_norm": 2.9459531689153193, |
|
"learning_rate": 3.3365342204799613e-06, |
|
"loss": 0.9987, |
|
"step": 983 |
|
}, |
|
{ |
|
"epoch": 0.740754681471723, |
|
"grad_norm": 3.1706073449088676, |
|
"learning_rate": 3.3183669176315046e-06, |
|
"loss": 0.9705, |
|
"step": 984 |
|
}, |
|
{ |
|
"epoch": 0.7415074809447634, |
|
"grad_norm": 3.400712366138258, |
|
"learning_rate": 3.3002393659998357e-06, |
|
"loss": 1.0182, |
|
"step": 985 |
|
}, |
|
{ |
|
"epoch": 0.7422602804178037, |
|
"grad_norm": 2.162383584339622, |
|
"learning_rate": 3.2821516734316772e-06, |
|
"loss": 0.9337, |
|
"step": 986 |
|
}, |
|
{ |
|
"epoch": 0.7430130798908441, |
|
"grad_norm": 2.252372753973197, |
|
"learning_rate": 3.264103947536619e-06, |
|
"loss": 0.9497, |
|
"step": 987 |
|
}, |
|
{ |
|
"epoch": 0.7437658793638845, |
|
"grad_norm": 3.400700187647833, |
|
"learning_rate": 3.2460962956864727e-06, |
|
"loss": 0.9949, |
|
"step": 988 |
|
}, |
|
{ |
|
"epoch": 0.7445186788369248, |
|
"grad_norm": 3.1714265140894504, |
|
"learning_rate": 3.2281288250146447e-06, |
|
"loss": 1.0138, |
|
"step": 989 |
|
}, |
|
{ |
|
"epoch": 0.7452714783099652, |
|
"grad_norm": 2.4482936996207814, |
|
"learning_rate": 3.210201642415477e-06, |
|
"loss": 0.9914, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.7460242777830055, |
|
"grad_norm": 2.524241576825987, |
|
"learning_rate": 3.1923148545436357e-06, |
|
"loss": 0.8905, |
|
"step": 991 |
|
}, |
|
{ |
|
"epoch": 0.7467770772560459, |
|
"grad_norm": 3.2756497769927715, |
|
"learning_rate": 3.174468567813461e-06, |
|
"loss": 0.9682, |
|
"step": 992 |
|
}, |
|
{ |
|
"epoch": 0.7475298767290863, |
|
"grad_norm": 2.991414090084408, |
|
"learning_rate": 3.1566628883983395e-06, |
|
"loss": 0.983, |
|
"step": 993 |
|
}, |
|
{ |
|
"epoch": 0.7482826762021266, |
|
"grad_norm": 2.8418499968354376, |
|
"learning_rate": 3.138897922230074e-06, |
|
"loss": 0.9612, |
|
"step": 994 |
|
}, |
|
{ |
|
"epoch": 0.749035475675167, |
|
"grad_norm": 3.5608060404370234, |
|
"learning_rate": 3.121173774998245e-06, |
|
"loss": 1.0149, |
|
"step": 995 |
|
}, |
|
{ |
|
"epoch": 0.7497882751482075, |
|
"grad_norm": 3.1105127346719152, |
|
"learning_rate": 3.103490552149595e-06, |
|
"loss": 0.9507, |
|
"step": 996 |
|
}, |
|
{ |
|
"epoch": 0.7505410746212478, |
|
"grad_norm": 2.8041971684545537, |
|
"learning_rate": 3.0858483588873878e-06, |
|
"loss": 0.9625, |
|
"step": 997 |
|
}, |
|
{ |
|
"epoch": 0.7512938740942882, |
|
"grad_norm": 2.5647827030571397, |
|
"learning_rate": 3.0682473001707925e-06, |
|
"loss": 0.8918, |
|
"step": 998 |
|
}, |
|
{ |
|
"epoch": 0.7520466735673285, |
|
"grad_norm": 3.6591031416487105, |
|
"learning_rate": 3.050687480714256e-06, |
|
"loss": 0.998, |
|
"step": 999 |
|
}, |
|
{ |
|
"epoch": 0.7527994730403689, |
|
"grad_norm": 2.9910839287028064, |
|
"learning_rate": 3.0331690049868733e-06, |
|
"loss": 0.9591, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.7535522725134093, |
|
"grad_norm": 2.5843459545257903, |
|
"learning_rate": 3.0156919772117788e-06, |
|
"loss": 0.97, |
|
"step": 1001 |
|
}, |
|
{ |
|
"epoch": 0.7543050719864496, |
|
"grad_norm": 2.625453019920782, |
|
"learning_rate": 2.998256501365514e-06, |
|
"loss": 0.9809, |
|
"step": 1002 |
|
}, |
|
{ |
|
"epoch": 0.75505787145949, |
|
"grad_norm": 2.710505597656893, |
|
"learning_rate": 2.9808626811774222e-06, |
|
"loss": 0.9726, |
|
"step": 1003 |
|
}, |
|
{ |
|
"epoch": 0.7558106709325303, |
|
"grad_norm": 2.611841535968574, |
|
"learning_rate": 2.963510620129021e-06, |
|
"loss": 0.9266, |
|
"step": 1004 |
|
}, |
|
{ |
|
"epoch": 0.7565634704055707, |
|
"grad_norm": 3.2583815916558203, |
|
"learning_rate": 2.9462004214533803e-06, |
|
"loss": 0.9847, |
|
"step": 1005 |
|
}, |
|
{ |
|
"epoch": 0.7573162698786111, |
|
"grad_norm": 2.532735200287804, |
|
"learning_rate": 2.9289321881345257e-06, |
|
"loss": 1.0084, |
|
"step": 1006 |
|
}, |
|
{ |
|
"epoch": 0.7580690693516514, |
|
"grad_norm": 2.857674586577938, |
|
"learning_rate": 2.911706022906816e-06, |
|
"loss": 1.0016, |
|
"step": 1007 |
|
}, |
|
{ |
|
"epoch": 0.7588218688246918, |
|
"grad_norm": 2.533187369034248, |
|
"learning_rate": 2.894522028254334e-06, |
|
"loss": 0.94, |
|
"step": 1008 |
|
}, |
|
{ |
|
"epoch": 0.7595746682977322, |
|
"grad_norm": 2.594116896625209, |
|
"learning_rate": 2.8773803064102758e-06, |
|
"loss": 0.8854, |
|
"step": 1009 |
|
}, |
|
{ |
|
"epoch": 0.7603274677707725, |
|
"grad_norm": 3.120940328980785, |
|
"learning_rate": 2.860280959356336e-06, |
|
"loss": 0.9751, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.761080267243813, |
|
"grad_norm": 2.4297166404754544, |
|
"learning_rate": 2.843224088822113e-06, |
|
"loss": 0.9551, |
|
"step": 1011 |
|
}, |
|
{ |
|
"epoch": 0.7618330667168532, |
|
"grad_norm": 2.7126291505696583, |
|
"learning_rate": 2.8262097962845058e-06, |
|
"loss": 1.009, |
|
"step": 1012 |
|
}, |
|
{ |
|
"epoch": 0.7625858661898937, |
|
"grad_norm": 3.1508191398932888, |
|
"learning_rate": 2.809238182967092e-06, |
|
"loss": 0.9555, |
|
"step": 1013 |
|
}, |
|
{ |
|
"epoch": 0.7633386656629341, |
|
"grad_norm": 2.540908640400342, |
|
"learning_rate": 2.7923093498395438e-06, |
|
"loss": 0.934, |
|
"step": 1014 |
|
}, |
|
{ |
|
"epoch": 0.7640914651359744, |
|
"grad_norm": 2.524639624628765, |
|
"learning_rate": 2.77542339761701e-06, |
|
"loss": 1.022, |
|
"step": 1015 |
|
}, |
|
{ |
|
"epoch": 0.7648442646090148, |
|
"grad_norm": 2.6011323382599048, |
|
"learning_rate": 2.7585804267595383e-06, |
|
"loss": 0.9177, |
|
"step": 1016 |
|
}, |
|
{ |
|
"epoch": 0.7655970640820552, |
|
"grad_norm": 2.4744466687181235, |
|
"learning_rate": 2.74178053747146e-06, |
|
"loss": 0.9263, |
|
"step": 1017 |
|
}, |
|
{ |
|
"epoch": 0.7663498635550955, |
|
"grad_norm": 3.1963960205551545, |
|
"learning_rate": 2.7250238297008026e-06, |
|
"loss": 1.0093, |
|
"step": 1018 |
|
}, |
|
{ |
|
"epoch": 0.7671026630281359, |
|
"grad_norm": 3.240522495596184, |
|
"learning_rate": 2.708310403138692e-06, |
|
"loss": 1.0007, |
|
"step": 1019 |
|
}, |
|
{ |
|
"epoch": 0.7678554625011762, |
|
"grad_norm": 3.056025997152738, |
|
"learning_rate": 2.691640357218759e-06, |
|
"loss": 0.9285, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.7686082619742166, |
|
"grad_norm": 3.010243843142797, |
|
"learning_rate": 2.675013791116551e-06, |
|
"loss": 0.9169, |
|
"step": 1021 |
|
}, |
|
{ |
|
"epoch": 0.769361061447257, |
|
"grad_norm": 2.9212925265086818, |
|
"learning_rate": 2.658430803748936e-06, |
|
"loss": 0.9661, |
|
"step": 1022 |
|
}, |
|
{ |
|
"epoch": 0.7701138609202973, |
|
"grad_norm": 2.6746112820249532, |
|
"learning_rate": 2.6418914937735228e-06, |
|
"loss": 0.9974, |
|
"step": 1023 |
|
}, |
|
{ |
|
"epoch": 0.7708666603933377, |
|
"grad_norm": 2.8886622989206763, |
|
"learning_rate": 2.625395959588067e-06, |
|
"loss": 1.0527, |
|
"step": 1024 |
|
}, |
|
{ |
|
"epoch": 0.7716194598663781, |
|
"grad_norm": 3.198567662487964, |
|
"learning_rate": 2.6089442993298854e-06, |
|
"loss": 1.0188, |
|
"step": 1025 |
|
}, |
|
{ |
|
"epoch": 0.7723722593394184, |
|
"grad_norm": 3.01161211798399, |
|
"learning_rate": 2.592536610875275e-06, |
|
"loss": 0.9717, |
|
"step": 1026 |
|
}, |
|
{ |
|
"epoch": 0.7731250588124589, |
|
"grad_norm": 3.2539981490647474, |
|
"learning_rate": 2.576172991838933e-06, |
|
"loss": 0.9739, |
|
"step": 1027 |
|
}, |
|
{ |
|
"epoch": 0.7738778582854992, |
|
"grad_norm": 2.1667397313948373, |
|
"learning_rate": 2.5598535395733735e-06, |
|
"loss": 0.9793, |
|
"step": 1028 |
|
}, |
|
{ |
|
"epoch": 0.7746306577585396, |
|
"grad_norm": 2.5896902701922144, |
|
"learning_rate": 2.5435783511683444e-06, |
|
"loss": 0.9285, |
|
"step": 1029 |
|
}, |
|
{ |
|
"epoch": 0.77538345723158, |
|
"grad_norm": 3.199288571835217, |
|
"learning_rate": 2.5273475234502565e-06, |
|
"loss": 1.0095, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.7761362567046203, |
|
"grad_norm": 2.7782162251256883, |
|
"learning_rate": 2.511161152981604e-06, |
|
"loss": 0.9286, |
|
"step": 1031 |
|
}, |
|
{ |
|
"epoch": 0.7768890561776607, |
|
"grad_norm": 2.3999364661213196, |
|
"learning_rate": 2.4950193360603868e-06, |
|
"loss": 0.991, |
|
"step": 1032 |
|
}, |
|
{ |
|
"epoch": 0.7776418556507011, |
|
"grad_norm": 3.4885352475618334, |
|
"learning_rate": 2.4789221687195473e-06, |
|
"loss": 0.9268, |
|
"step": 1033 |
|
}, |
|
{ |
|
"epoch": 0.7783946551237414, |
|
"grad_norm": 2.702835220531536, |
|
"learning_rate": 2.4628697467263916e-06, |
|
"loss": 0.918, |
|
"step": 1034 |
|
}, |
|
{ |
|
"epoch": 0.7791474545967818, |
|
"grad_norm": 2.3672163089187066, |
|
"learning_rate": 2.4468621655820125e-06, |
|
"loss": 1.0267, |
|
"step": 1035 |
|
}, |
|
{ |
|
"epoch": 0.7799002540698221, |
|
"grad_norm": 3.3617116952028856, |
|
"learning_rate": 2.430899520520741e-06, |
|
"loss": 1.1033, |
|
"step": 1036 |
|
}, |
|
{ |
|
"epoch": 0.7806530535428625, |
|
"grad_norm": 2.605061703191369, |
|
"learning_rate": 2.414981906509565e-06, |
|
"loss": 0.9952, |
|
"step": 1037 |
|
}, |
|
{ |
|
"epoch": 0.7814058530159029, |
|
"grad_norm": 2.7895797271741127, |
|
"learning_rate": 2.399109418247563e-06, |
|
"loss": 0.95, |
|
"step": 1038 |
|
}, |
|
{ |
|
"epoch": 0.7821586524889432, |
|
"grad_norm": 2.9660703069176355, |
|
"learning_rate": 2.383282150165358e-06, |
|
"loss": 0.9452, |
|
"step": 1039 |
|
}, |
|
{ |
|
"epoch": 0.7829114519619836, |
|
"grad_norm": 2.5443722224387058, |
|
"learning_rate": 2.367500196424529e-06, |
|
"loss": 0.9476, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.783664251435024, |
|
"grad_norm": 3.0632555247984987, |
|
"learning_rate": 2.351763650917074e-06, |
|
"loss": 0.9636, |
|
"step": 1041 |
|
}, |
|
{ |
|
"epoch": 0.7844170509080644, |
|
"grad_norm": 2.574855917383529, |
|
"learning_rate": 2.33607260726484e-06, |
|
"loss": 0.9424, |
|
"step": 1042 |
|
}, |
|
{ |
|
"epoch": 0.7851698503811048, |
|
"grad_norm": 2.6815683468237927, |
|
"learning_rate": 2.3204271588189685e-06, |
|
"loss": 0.9793, |
|
"step": 1043 |
|
}, |
|
{ |
|
"epoch": 0.7859226498541451, |
|
"grad_norm": 2.4551608958468827, |
|
"learning_rate": 2.304827398659342e-06, |
|
"loss": 0.8961, |
|
"step": 1044 |
|
}, |
|
{ |
|
"epoch": 0.7866754493271855, |
|
"grad_norm": 3.1263512494254813, |
|
"learning_rate": 2.289273419594027e-06, |
|
"loss": 0.9426, |
|
"step": 1045 |
|
}, |
|
{ |
|
"epoch": 0.7874282488002259, |
|
"grad_norm": 3.586836506696256, |
|
"learning_rate": 2.2737653141587203e-06, |
|
"loss": 1.021, |
|
"step": 1046 |
|
}, |
|
{ |
|
"epoch": 0.7881810482732662, |
|
"grad_norm": 2.6987281971760178, |
|
"learning_rate": 2.258303174616204e-06, |
|
"loss": 0.999, |
|
"step": 1047 |
|
}, |
|
{ |
|
"epoch": 0.7889338477463066, |
|
"grad_norm": 2.8798605799038755, |
|
"learning_rate": 2.2428870929558012e-06, |
|
"loss": 0.9594, |
|
"step": 1048 |
|
}, |
|
{ |
|
"epoch": 0.7896866472193469, |
|
"grad_norm": 3.113718806953596, |
|
"learning_rate": 2.2275171608928124e-06, |
|
"loss": 0.9488, |
|
"step": 1049 |
|
}, |
|
{ |
|
"epoch": 0.7904394466923873, |
|
"grad_norm": 4.380703171601769, |
|
"learning_rate": 2.2121934698679793e-06, |
|
"loss": 0.963, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.7911922461654277, |
|
"grad_norm": 2.9209377610680325, |
|
"learning_rate": 2.196916111046944e-06, |
|
"loss": 0.9774, |
|
"step": 1051 |
|
}, |
|
{ |
|
"epoch": 0.791945045638468, |
|
"grad_norm": 2.7181240542221903, |
|
"learning_rate": 2.1816851753197023e-06, |
|
"loss": 0.8549, |
|
"step": 1052 |
|
}, |
|
{ |
|
"epoch": 0.7926978451115084, |
|
"grad_norm": 2.4755549085650697, |
|
"learning_rate": 2.166500753300065e-06, |
|
"loss": 0.93, |
|
"step": 1053 |
|
}, |
|
{ |
|
"epoch": 0.7934506445845488, |
|
"grad_norm": 2.757372825800587, |
|
"learning_rate": 2.151362935325115e-06, |
|
"loss": 1.0114, |
|
"step": 1054 |
|
}, |
|
{ |
|
"epoch": 0.7942034440575891, |
|
"grad_norm": 2.915932886320286, |
|
"learning_rate": 2.1362718114546777e-06, |
|
"loss": 1.0286, |
|
"step": 1055 |
|
}, |
|
{ |
|
"epoch": 0.7949562435306295, |
|
"grad_norm": 2.1476012635980957, |
|
"learning_rate": 2.121227471470768e-06, |
|
"loss": 0.9105, |
|
"step": 1056 |
|
}, |
|
{ |
|
"epoch": 0.7957090430036698, |
|
"grad_norm": 3.423807719412449, |
|
"learning_rate": 2.1062300048770847e-06, |
|
"loss": 0.9917, |
|
"step": 1057 |
|
}, |
|
{ |
|
"epoch": 0.7964618424767103, |
|
"grad_norm": 2.640799998383572, |
|
"learning_rate": 2.09127950089845e-06, |
|
"loss": 1.076, |
|
"step": 1058 |
|
}, |
|
{ |
|
"epoch": 0.7972146419497507, |
|
"grad_norm": 2.5105814202222456, |
|
"learning_rate": 2.0763760484802966e-06, |
|
"loss": 0.9166, |
|
"step": 1059 |
|
}, |
|
{ |
|
"epoch": 0.797967441422791, |
|
"grad_norm": 2.4939451327072546, |
|
"learning_rate": 2.0615197362881234e-06, |
|
"loss": 0.9807, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.7987202408958314, |
|
"grad_norm": 2.618028999667555, |
|
"learning_rate": 2.046710652706985e-06, |
|
"loss": 0.9361, |
|
"step": 1061 |
|
}, |
|
{ |
|
"epoch": 0.7994730403688718, |
|
"grad_norm": 2.7939434505437695, |
|
"learning_rate": 2.0319488858409552e-06, |
|
"loss": 0.9447, |
|
"step": 1062 |
|
}, |
|
{ |
|
"epoch": 0.8002258398419121, |
|
"grad_norm": 4.901271577317021, |
|
"learning_rate": 2.0172345235126043e-06, |
|
"loss": 1.0124, |
|
"step": 1063 |
|
}, |
|
{ |
|
"epoch": 0.8009786393149525, |
|
"grad_norm": 3.2082041335316602, |
|
"learning_rate": 2.0025676532624794e-06, |
|
"loss": 0.9451, |
|
"step": 1064 |
|
}, |
|
{ |
|
"epoch": 0.8009786393149525, |
|
"eval_loss": 0.955357015132904, |
|
"eval_runtime": 584.7085, |
|
"eval_samples_per_second": 30.588, |
|
"eval_steps_per_second": 0.479, |
|
"step": 1064 |
|
}, |
|
{
"epoch": 0.8017314387879928,
"grad_norm": 2.386836563345872,
"learning_rate": 1.9879483623485786e-06,
"loss": 0.9693,
"step": 1065
},
{
"epoch": 0.8024842382610332,
"grad_norm": 2.7219810252216106,
"learning_rate": 1.9733767377458377e-06,
"loss": 0.9822,
"step": 1066
},
{
"epoch": 0.8032370377340736,
"grad_norm": 2.718832993746013,
"learning_rate": 1.9588528661456087e-06,
"loss": 0.9315,
"step": 1067
},
{
"epoch": 0.8039898372071139,
"grad_norm": 2.6930921754180974,
"learning_rate": 1.944376833955147e-06,
"loss": 1.0321,
"step": 1068
},
{
"epoch": 0.8047426366801543,
"grad_norm": 2.6093370687591784,
"learning_rate": 1.929948727297096e-06,
"loss": 0.9255,
"step": 1069
},
{
"epoch": 0.8054954361531947,
"grad_norm": 2.638629387462051,
"learning_rate": 1.9155686320089684e-06,
"loss": 0.937,
"step": 1070
},
{
"epoch": 0.806248235626235,
"grad_norm": 2.7812671767987975,
"learning_rate": 1.901236633642649e-06,
"loss": 0.9696,
"step": 1071
},
{
"epoch": 0.8070010350992755,
"grad_norm": 2.5334036447302144,
"learning_rate": 1.8869528174638752e-06,
"loss": 0.9251,
"step": 1072
},
{
"epoch": 0.8077538345723158,
"grad_norm": 5.233440136651286,
"learning_rate": 1.8727172684517325e-06,
"loss": 1.0172,
"step": 1073
},
{
"epoch": 0.8085066340453562,
"grad_norm": 3.2625149189009814,
"learning_rate": 1.8585300712981514e-06,
"loss": 0.9732,
"step": 1074
},
{
"epoch": 0.8092594335183966,
"grad_norm": 2.59047640894747,
"learning_rate": 1.8443913104073984e-06,
"loss": 1.0292,
"step": 1075
},
{
"epoch": 0.8100122329914369,
"grad_norm": 3.406989263786069,
"learning_rate": 1.8303010698955803e-06,
"loss": 0.9894,
"step": 1076
},
{
"epoch": 0.8107650324644773,
"grad_norm": 2.901678111605894,
"learning_rate": 1.8162594335901363e-06,
"loss": 1.0178,
"step": 1077
},
{
"epoch": 0.8115178319375177,
"grad_norm": 3.799031519554636,
"learning_rate": 1.802266485029347e-06,
"loss": 1.0145,
"step": 1078
},
{
"epoch": 0.812270631410558,
"grad_norm": 2.4781380441389986,
"learning_rate": 1.7883223074618316e-06,
"loss": 0.9426,
"step": 1079
},
{
"epoch": 0.8130234308835984,
"grad_norm": 2.887790196949554,
"learning_rate": 1.774426983846058e-06,
"loss": 0.9036,
"step": 1080
},
{
"epoch": 0.8137762303566387,
"grad_norm": 3.0308047796368385,
"learning_rate": 1.760580596849838e-06,
"loss": 0.9406,
"step": 1081
},
{
"epoch": 0.8145290298296791,
"grad_norm": 2.888803583084765,
"learning_rate": 1.746783228849851e-06,
"loss": 0.9907,
"step": 1082
},
{
"epoch": 0.8152818293027195,
"grad_norm": 3.0358192779029864,
"learning_rate": 1.7330349619311415e-06,
"loss": 0.9547,
"step": 1083
},
{
"epoch": 0.8160346287757598,
"grad_norm": 2.5236660699183138,
"learning_rate": 1.7193358778866464e-06,
"loss": 0.968,
"step": 1084
},
{
"epoch": 0.8167874282488002,
"grad_norm": 4.185827467185533,
"learning_rate": 1.7056860582166823e-06,
"loss": 1.0663,
"step": 1085
},
{
"epoch": 0.8175402277218405,
"grad_norm": 2.6918458931338716,
"learning_rate": 1.6920855841284844e-06,
"loss": 0.9741,
"step": 1086
},
{
"epoch": 0.818293027194881,
"grad_norm": 3.3467900392243717,
"learning_rate": 1.6785345365357153e-06,
"loss": 0.9688,
"step": 1087
},
{
"epoch": 0.8190458266679214,
"grad_norm": 2.574503648578952,
"learning_rate": 1.6650329960579792e-06,
"loss": 0.8948,
"step": 1088
},
{
"epoch": 0.8197986261409617,
"grad_norm": 2.906207704777791,
"learning_rate": 1.6515810430203516e-06,
"loss": 0.98,
"step": 1089
},
{
"epoch": 0.8205514256140021,
"grad_norm": 2.3838491968983204,
"learning_rate": 1.638178757452894e-06,
"loss": 0.908,
"step": 1090
},
{
"epoch": 0.8213042250870425,
"grad_norm": 2.6732775130808295,
"learning_rate": 1.624826219090172e-06,
"loss": 0.9864,
"step": 1091
},
{
"epoch": 0.8220570245600828,
"grad_norm": 2.6917421373345367,
"learning_rate": 1.6115235073708024e-06,
"loss": 0.9564,
"step": 1092
},
{
"epoch": 0.8228098240331232,
"grad_norm": 2.8465083001648837,
"learning_rate": 1.5982707014369603e-06,
"loss": 0.9958,
"step": 1093
},
{
"epoch": 0.8235626235061635,
"grad_norm": 2.859612454624599,
"learning_rate": 1.585067880133916e-06,
"loss": 0.996,
"step": 1094
},
{
"epoch": 0.8243154229792039,
"grad_norm": 2.8110918785044547,
"learning_rate": 1.5719151220095596e-06,
"loss": 0.9513,
"step": 1095
},
{
"epoch": 0.8250682224522443,
"grad_norm": 3.041627129196771,
"learning_rate": 1.558812505313947e-06,
"loss": 0.9961,
"step": 1096
},
{
"epoch": 0.8258210219252846,
"grad_norm": 3.120842891505949,
"learning_rate": 1.5457601079988226e-06,
"loss": 0.9401,
"step": 1097
},
{
"epoch": 0.826573821398325,
"grad_norm": 2.753472364352601,
"learning_rate": 1.5327580077171589e-06,
"loss": 0.9639,
"step": 1098
},
{
"epoch": 0.8273266208713654,
"grad_norm": 3.3039694472199703,
"learning_rate": 1.5198062818226967e-06,
"loss": 0.9655,
"step": 1099
},
{
"epoch": 0.8280794203444057,
"grad_norm": 2.566689484677852,
"learning_rate": 1.5069050073694813e-06,
"loss": 0.8969,
"step": 1100
},
{
"epoch": 0.8288322198174461,
"grad_norm": 2.783130384357118,
"learning_rate": 1.4940542611114073e-06,
"loss": 0.9337,
"step": 1101
},
{
"epoch": 0.8295850192904864,
"grad_norm": 2.6527066025848516,
"learning_rate": 1.4812541195017593e-06,
"loss": 1.0317,
"step": 1102
},
{
"epoch": 0.8303378187635269,
"grad_norm": 3.0204607327918818,
"learning_rate": 1.4685046586927598e-06,
"loss": 0.9498,
"step": 1103
},
{
"epoch": 0.8310906182365673,
"grad_norm": 3.8488215000090986,
"learning_rate": 1.4558059545351144e-06,
"loss": 0.928,
"step": 1104
},
{
"epoch": 0.8318434177096076,
"grad_norm": 2.6329112944573865,
"learning_rate": 1.4431580825775604e-06,
"loss": 0.9369,
"step": 1105
},
{
"epoch": 0.832596217182648,
"grad_norm": 3.142878133761454,
"learning_rate": 1.4305611180664157e-06,
"loss": 1.0072,
"step": 1106
},
{
"epoch": 0.8333490166556884,
"grad_norm": 4.196882021549821,
"learning_rate": 1.4180151359451367e-06,
"loss": 0.9551,
"step": 1107
},
{
"epoch": 0.8341018161287287,
"grad_norm": 2.6280348669160283,
"learning_rate": 1.4055202108538657e-06,
"loss": 0.9148,
"step": 1108
},
{
"epoch": 0.8348546156017691,
"grad_norm": 2.51099856856607,
"learning_rate": 1.3930764171289935e-06,
"loss": 0.9765,
"step": 1109
},
{
"epoch": 0.8356074150748094,
"grad_norm": 2.6019099844031737,
"learning_rate": 1.3806838288027113e-06,
"loss": 1.0087,
"step": 1110
},
{
"epoch": 0.8363602145478498,
"grad_norm": 3.4496452020581043,
"learning_rate": 1.3683425196025734e-06,
"loss": 0.8536,
"step": 1111
},
{
"epoch": 0.8371130140208902,
"grad_norm": 2.8099366285977863,
"learning_rate": 1.3560525629510567e-06,
"loss": 0.9274,
"step": 1112
},
{
"epoch": 0.8378658134939305,
"grad_norm": 2.8308845223177586,
"learning_rate": 1.3438140319651283e-06,
"loss": 0.9104,
"step": 1113
},
{
"epoch": 0.8386186129669709,
"grad_norm": 2.3951322032617295,
"learning_rate": 1.331626999455804e-06,
"loss": 0.9188,
"step": 1114
},
{
"epoch": 0.8393714124400113,
"grad_norm": 3.35662443745714,
"learning_rate": 1.3194915379277195e-06,
"loss": 0.9918,
"step": 1115
},
{
"epoch": 0.8401242119130516,
"grad_norm": 3.588345323887508,
"learning_rate": 1.307407719578696e-06,
"loss": 0.9311,
"step": 1116
},
{
"epoch": 0.840877011386092,
"grad_norm": 2.7850061242387656,
"learning_rate": 1.2953756162993158e-06,
"loss": 0.9796,
"step": 1117
},
{
"epoch": 0.8416298108591324,
"grad_norm": 2.571025177959364,
"learning_rate": 1.2833952996724864e-06,
"loss": 0.9475,
"step": 1118
},
{
"epoch": 0.8423826103321728,
"grad_norm": 3.3900850223254237,
"learning_rate": 1.2714668409730312e-06,
"loss": 0.9553,
"step": 1119
},
{
"epoch": 0.8431354098052132,
"grad_norm": 3.1347132178889106,
"learning_rate": 1.259590311167238e-06,
"loss": 1.0453,
"step": 1120
},
{
"epoch": 0.8438882092782535,
"grad_norm": 2.876010919809789,
"learning_rate": 1.2477657809124632e-06,
"loss": 0.8553,
"step": 1121
},
{
"epoch": 0.8446410087512939,
"grad_norm": 2.887022175621213,
"learning_rate": 1.2359933205566987e-06,
"loss": 0.9759,
"step": 1122
},
{
"epoch": 0.8453938082243343,
"grad_norm": 2.434068422278726,
"learning_rate": 1.2242730001381532e-06,
"loss": 0.9128,
"step": 1123
},
{
"epoch": 0.8461466076973746,
"grad_norm": 2.736589284541001,
"learning_rate": 1.2126048893848396e-06,
"loss": 0.9591,
"step": 1124
},
{
"epoch": 0.846899407170415,
"grad_norm": 2.4174009434081496,
"learning_rate": 1.2009890577141625e-06,
"loss": 1.0533,
"step": 1125
},
{
"epoch": 0.8476522066434553,
"grad_norm": 2.854192293105523,
"learning_rate": 1.189425574232491e-06,
"loss": 0.9179,
"step": 1126
},
{
"epoch": 0.8484050061164957,
"grad_norm": 2.7270460687471627,
"learning_rate": 1.1779145077347653e-06,
"loss": 0.92,
"step": 1127
},
{
"epoch": 0.8491578055895361,
"grad_norm": 3.292903949444069,
"learning_rate": 1.1664559267040821e-06,
"loss": 1.0056,
"step": 1128
},
{
"epoch": 0.8499106050625764,
"grad_norm": 3.1573702260917984,
"learning_rate": 1.1550498993112812e-06,
"loss": 0.9981,
"step": 1129
},
{
"epoch": 0.8506634045356168,
"grad_norm": 2.566537819994616,
"learning_rate": 1.143696493414539e-06,
"loss": 1.0057,
"step": 1130
},
{
"epoch": 0.8514162040086571,
"grad_norm": 3.1010812728125057,
"learning_rate": 1.1323957765589766e-06,
"loss": 0.9657,
"step": 1131
},
{
"epoch": 0.8521690034816976,
"grad_norm": 3.7067774109664624,
"learning_rate": 1.121147815976248e-06,
"loss": 1.0551,
"step": 1132
},
{
"epoch": 0.852921802954738,
"grad_norm": 2.7515838856907906,
"learning_rate": 1.109952678584144e-06,
"loss": 0.9818,
"step": 1133
},
{
"epoch": 0.8536746024277783,
"grad_norm": 3.4640444745995205,
"learning_rate": 1.0988104309861913e-06,
"loss": 0.8982,
"step": 1134
},
{
"epoch": 0.8544274019008187,
"grad_norm": 3.7011907670434656,
"learning_rate": 1.0877211394712617e-06,
"loss": 1.0956,
"step": 1135
},
{
"epoch": 0.8551802013738591,
"grad_norm": 3.187205634209682,
"learning_rate": 1.076684870013165e-06,
"loss": 0.9818,
"step": 1136
},
{
"epoch": 0.8559330008468994,
"grad_norm": 3.0989118973167518,
"learning_rate": 1.0657016882702764e-06,
"loss": 0.9672,
"step": 1137
},
{
"epoch": 0.8566858003199398,
"grad_norm": 2.7075586038664605,
"learning_rate": 1.0547716595851298e-06,
"loss": 0.9506,
"step": 1138
},
{
"epoch": 0.8574385997929801,
"grad_norm": 3.1360256431016587,
"learning_rate": 1.0438948489840327e-06,
"loss": 0.9602,
"step": 1139
},
{
"epoch": 0.8581913992660205,
"grad_norm": 3.260413556431712,
"learning_rate": 1.0330713211766864e-06,
"loss": 0.936,
"step": 1140
},
{
"epoch": 0.8589441987390609,
"grad_norm": 2.762546618163618,
"learning_rate": 1.022301140555787e-06,
"loss": 0.9092,
"step": 1141
},
{
"epoch": 0.8596969982121012,
"grad_norm": 3.027720830975655,
"learning_rate": 1.0115843711966577e-06,
"loss": 0.9336,
"step": 1142
},
{
"epoch": 0.8604497976851416,
"grad_norm": 2.4551963085300543,
"learning_rate": 1.000921076856859e-06,
"loss": 0.9253,
"step": 1143
},
{
"epoch": 0.861202597158182,
"grad_norm": 2.5990984905624566,
"learning_rate": 9.903113209758098e-07,
"loss": 0.9247,
"step": 1144
},
{
"epoch": 0.8619553966312223,
"grad_norm": 2.520291676745646,
"learning_rate": 9.79755166674411e-07,
"loss": 0.9327,
"step": 1145
},
{
"epoch": 0.8627081961042627,
"grad_norm": 2.9740303315319645,
"learning_rate": 9.692526767546727e-07,
"loss": 0.936,
"step": 1146
},
{
"epoch": 0.863460995577303,
"grad_norm": 2.8172674563090876,
"learning_rate": 9.588039136993366e-07,
"loss": 0.9249,
"step": 1147
},
{
"epoch": 0.8642137950503435,
"grad_norm": 2.6866056844439314,
"learning_rate": 9.484089396715057e-07,
"loss": 0.9751,
"step": 1148
},
{
"epoch": 0.8649665945233839,
"grad_norm": 2.4806642645953416,
"learning_rate": 9.380678165142732e-07,
"loss": 0.9758,
"step": 1149
},
{
"epoch": 0.8657193939964242,
"grad_norm": 2.673649491913748,
"learning_rate": 9.277806057503592e-07,
"loss": 0.8811,
"step": 1150
},
{
"epoch": 0.8664721934694646,
"grad_norm": 4.406342851150164,
"learning_rate": 9.175473685817371e-07,
"loss": 0.8679,
"step": 1151
},
{
"epoch": 0.867224992942505,
"grad_norm": 3.7985140736623815,
"learning_rate": 9.073681658892775e-07,
"loss": 1.0464,
"step": 1152
},
{
"epoch": 0.8679777924155453,
"grad_norm": 2.5754256214115125,
"learning_rate": 8.972430582323788e-07,
"loss": 0.978,
"step": 1153
},
{
"epoch": 0.8687305918885857,
"grad_norm": 2.6776748283021594,
"learning_rate": 8.871721058486149e-07,
"loss": 0.9401,
"step": 1154
},
{
"epoch": 0.869483391361626,
"grad_norm": 2.5262861152205613,
"learning_rate": 8.771553686533684e-07,
"loss": 0.9202,
"step": 1155
},
{
"epoch": 0.8702361908346664,
"grad_norm": 2.5512587089365097,
"learning_rate": 8.671929062394802e-07,
"loss": 0.9661,
"step": 1156
},
{
"epoch": 0.8709889903077068,
"grad_norm": 3.0309053039828875,
"learning_rate": 8.572847778768912e-07,
"loss": 1.0411,
"step": 1157
},
{
"epoch": 0.8717417897807471,
"grad_norm": 2.870089467163631,
"learning_rate": 8.474310425122923e-07,
"loss": 0.9452,
"step": 1158
},
{
"epoch": 0.8724945892537875,
"grad_norm": 2.7251037878757014,
"learning_rate": 8.376317587687721e-07,
"loss": 0.9643,
"step": 1159
},
{
"epoch": 0.8732473887268279,
"grad_norm": 3.0515241331021206,
"learning_rate": 8.278869849454718e-07,
"loss": 0.9511,
"step": 1160
},
{
"epoch": 0.8740001881998682,
"grad_norm": 2.7559306109031327,
"learning_rate": 8.181967790172274e-07,
"loss": 0.9655,
"step": 1161
},
{
"epoch": 0.8747529876729087,
"grad_norm": 3.005111485431956,
"learning_rate": 8.085611986342423e-07,
"loss": 0.9264,
"step": 1162
},
{
"epoch": 0.875505787145949,
"grad_norm": 3.183291159476253,
"learning_rate": 7.989803011217256e-07,
"loss": 0.9755,
"step": 1163
},
{
"epoch": 0.8762585866189894,
"grad_norm": 3.6302734872126976,
"learning_rate": 7.8945414347957e-07,
"loss": 1.0719,
"step": 1164
},
{
"epoch": 0.8770113860920298,
"grad_norm": 2.200336710545509,
"learning_rate": 7.799827823819972e-07,
"loss": 0.8843,
"step": 1165
},
{
"epoch": 0.8777641855650701,
"grad_norm": 2.7624309633223345,
"learning_rate": 7.705662741772235e-07,
"loss": 1.0182,
"step": 1166
},
{
"epoch": 0.8785169850381105,
"grad_norm": 3.1072712239860523,
"learning_rate": 7.612046748871327e-07,
"loss": 1.0004,
"step": 1167
},
{
"epoch": 0.8792697845111508,
"grad_norm": 2.65712921508899,
"learning_rate": 7.518980402069354e-07,
"loss": 0.9294,
"step": 1168
},
{
"epoch": 0.8800225839841912,
"grad_norm": 2.7368564706264813,
"learning_rate": 7.426464255048393e-07,
"loss": 0.9607,
"step": 1169
},
{
"epoch": 0.8807753834572316,
"grad_norm": 2.441360830618783,
"learning_rate": 7.334498858217231e-07,
"loss": 0.9321,
"step": 1170
},
{
"epoch": 0.8815281829302719,
"grad_norm": 2.788678307102662,
"learning_rate": 7.243084758708007e-07,
"loss": 0.989,
"step": 1171
},
{
"epoch": 0.8822809824033123,
"grad_norm": 2.5226447327049355,
"learning_rate": 7.152222500373052e-07,
"loss": 0.9305,
"step": 1172
},
{
"epoch": 0.8830337818763527,
"grad_norm": 2.7207043507041107,
"learning_rate": 7.06191262378163e-07,
"loss": 0.9421,
"step": 1173
},
{
"epoch": 0.883786581349393,
"grad_norm": 2.623692509214404,
"learning_rate": 6.972155666216684e-07,
"loss": 1.031,
"step": 1174
},
{
"epoch": 0.8845393808224334,
"grad_norm": 2.599723287251219,
"learning_rate": 6.882952161671652e-07,
"loss": 0.9094,
"step": 1175
},
{
"epoch": 0.8852921802954737,
"grad_norm": 2.4580215454865586,
"learning_rate": 6.794302640847294e-07,
"loss": 0.9175,
"step": 1176
},
{
"epoch": 0.8860449797685142,
"grad_norm": 2.4645666417160097,
"learning_rate": 6.706207631148564e-07,
"loss": 0.8983,
"step": 1177
},
{
"epoch": 0.8867977792415546,
"grad_norm": 3.0348308308305962,
"learning_rate": 6.618667656681444e-07,
"loss": 0.9817,
"step": 1178
},
{
"epoch": 0.8875505787145949,
"grad_norm": 2.4790311726644663,
"learning_rate": 6.531683238249809e-07,
"loss": 0.9532,
"step": 1179
},
{
"epoch": 0.8883033781876353,
"grad_norm": 2.6924152713824956,
"learning_rate": 6.445254893352381e-07,
"loss": 1.0677,
"step": 1180
},
{
"epoch": 0.8890561776606757,
"grad_norm": 2.6608700924933135,
"learning_rate": 6.359383136179598e-07,
"loss": 0.8722,
"step": 1181
},
{
"epoch": 0.889808977133716,
"grad_norm": 2.6182698749250695,
"learning_rate": 6.274068477610584e-07,
"loss": 0.9527,
"step": 1182
},
{
"epoch": 0.8905617766067564,
"grad_norm": 6.3305830668644845,
"learning_rate": 6.189311425210087e-07,
"loss": 0.9959,
"step": 1183
},
{
"epoch": 0.8913145760797967,
"grad_norm": 3.0187373017741392,
"learning_rate": 6.105112483225495e-07,
"loss": 1.0358,
"step": 1184
},
{
"epoch": 0.8920673755528371,
"grad_norm": 2.7315069757579913,
"learning_rate": 6.021472152583818e-07,
"loss": 0.9743,
"step": 1185
},
{
"epoch": 0.8928201750258775,
"grad_norm": 2.827226835891074,
"learning_rate": 5.938390930888671e-07,
"loss": 0.928,
"step": 1186
},
{
"epoch": 0.8935729744989178,
"grad_norm": 2.7665857256735125,
"learning_rate": 5.855869312417362e-07,
"loss": 0.8687,
"step": 1187
},
{
"epoch": 0.8943257739719582,
"grad_norm": 2.5980919886852676,
"learning_rate": 5.77390778811796e-07,
"loss": 0.9564,
"step": 1188
},
{
"epoch": 0.8950785734449986,
"grad_norm": 2.6210409082659245,
"learning_rate": 5.692506845606327e-07,
"loss": 0.9545,
"step": 1189
},
{
"epoch": 0.8958313729180389,
"grad_norm": 3.662360614683105,
"learning_rate": 5.611666969163243e-07,
"loss": 0.9616,
"step": 1190
},
{
"epoch": 0.8965841723910793,
"grad_norm": 3.147025077582227,
"learning_rate": 5.53138863973155e-07,
"loss": 1.0046,
"step": 1191
},
{
"epoch": 0.8973369718641196,
"grad_norm": 3.1345035339900438,
"learning_rate": 5.451672334913216e-07,
"loss": 0.9031,
"step": 1192
},
{
"epoch": 0.8980897713371601,
"grad_norm": 2.8518474768796778,
"learning_rate": 5.372518528966575e-07,
"loss": 0.9732,
"step": 1193
},
{
"epoch": 0.8988425708102005,
"grad_norm": 3.062727839155735,
"learning_rate": 5.293927692803458e-07,
"loss": 1.0282,
"step": 1194
},
{
"epoch": 0.8995953702832408,
"grad_norm": 2.9353100401019323,
"learning_rate": 5.215900293986431e-07,
"loss": 1.0028,
"step": 1195
},
{
"epoch": 0.9003481697562812,
"grad_norm": 2.6913340260060594,
"learning_rate": 5.138436796725942e-07,
"loss": 0.9665,
"step": 1196
},
{
"epoch": 0.9011009692293216,
"grad_norm": 2.8934435608082842,
"learning_rate": 5.061537661877636e-07,
"loss": 1.0446,
"step": 1197
},
{
"epoch": 0.9011009692293216,
"eval_loss": 0.9507451057434082,
"eval_runtime": 583.6153,
"eval_samples_per_second": 30.645,
"eval_steps_per_second": 0.48,
"step": 1197
},
{
"epoch": 0.9018537687023619,
"grad_norm": 2.553104219592107,
"learning_rate": 4.98520334693956e-07,
"loss": 0.9547,
"step": 1198
},
{
"epoch": 0.9026065681754023,
"grad_norm": 2.7253032781503657,
"learning_rate": 4.909434306049487e-07,
"loss": 0.9249,
"step": 1199
},
{
"epoch": 0.9033593676484426,
"grad_norm": 3.2709141351515285,
"learning_rate": 4.834230989982214e-07,
"loss": 0.9728,
"step": 1200
},
{
"epoch": 0.904112167121483,
"grad_norm": 4.401571356782588,
"learning_rate": 4.7595938461467706e-07,
"loss": 0.9535,
"step": 1201
},
{
"epoch": 0.9048649665945234,
"grad_norm": 2.5275547478412093,
"learning_rate": 4.6855233185839175e-07,
"loss": 0.9745,
"step": 1202
},
{
"epoch": 0.9056177660675637,
"grad_norm": 2.5001698261652954,
"learning_rate": 4.6120198479634117e-07,
"loss": 0.9821,
"step": 1203
},
{
"epoch": 0.9063705655406041,
"grad_norm": 2.6567511968251454,
"learning_rate": 4.5390838715813956e-07,
"loss": 0.9181,
"step": 1204
},
{
"epoch": 0.9071233650136445,
"grad_norm": 2.462427713605323,
"learning_rate": 4.4667158233577925e-07,
"loss": 0.9857,
"step": 1205
},
{
"epoch": 0.9078761644866848,
"grad_norm": 2.5236048684689476,
"learning_rate": 4.394916133833782e-07,
"loss": 0.9388,
"step": 1206
},
{
"epoch": 0.9086289639597253,
"grad_norm": 2.5204539009176443,
"learning_rate": 4.323685230169128e-07,
"loss": 0.9548,
"step": 1207
},
{
"epoch": 0.9093817634327656,
"grad_norm": 2.7491122618434414,
"learning_rate": 4.253023536139733e-07,
"loss": 0.937,
"step": 1208
},
{
"epoch": 0.910134562905806,
"grad_norm": 2.5076418947884376,
"learning_rate": 4.1829314721351213e-07,
"loss": 0.9159,
"step": 1209
},
{
"epoch": 0.9108873623788464,
"grad_norm": 3.1696179972875766,
"learning_rate": 4.113409455155837e-07,
"loss": 0.877,
"step": 1210
},
{
"epoch": 0.9116401618518867,
"grad_norm": 2.8529238109300548,
"learning_rate": 4.0444578988110715e-07,
"loss": 0.9462,
"step": 1211
},
{
"epoch": 0.9123929613249271,
"grad_norm": 2.90898356943298,
"learning_rate": 3.976077213316132e-07,
"loss": 0.9328,
"step": 1212
},
{
"epoch": 0.9131457607979674,
"grad_norm": 2.6862075191363117,
"learning_rate": 3.908267805490051e-07,
"loss": 1.0102,
"step": 1213
},
{
"epoch": 0.9138985602710078,
"grad_norm": 2.8255615762352377,
"learning_rate": 3.8410300787531385e-07,
"loss": 0.9589,
"step": 1214
},
{
"epoch": 0.9146513597440482,
"grad_norm": 2.862843093830216,
"learning_rate": 3.774364433124578e-07,
"loss": 0.9288,
"step": 1215
},
{
"epoch": 0.9154041592170885,
"grad_norm": 2.6717135457398036,
"learning_rate": 3.708271265220087e-07,
"loss": 0.9827,
"step": 1216
},
{
"epoch": 0.9161569586901289,
"grad_norm": 2.98813894982846,
"learning_rate": 3.642750968249442e-07,
"loss": 1.0676,
"step": 1217
},
{
"epoch": 0.9169097581631693,
"grad_norm": 3.367070226516106,
"learning_rate": 3.5778039320143456e-07,
"loss": 0.9641,
"step": 1218
},
{
"epoch": 0.9176625576362096,
"grad_norm": 3.312081763624184,
"learning_rate": 3.5134305429058935e-07,
"loss": 0.955,
"step": 1219
},
{
"epoch": 0.91841535710925,
"grad_norm": 3.0015816399724873,
"learning_rate": 3.4496311839024133e-07,
"loss": 0.9972,
"step": 1220
},
{
"epoch": 0.9191681565822903,
"grad_norm": 2.851669734888555,
"learning_rate": 3.386406234567086e-07,
"loss": 0.8808,
"step": 1221
},
{
"epoch": 0.9199209560553308,
"grad_norm": 3.579793360196461,
"learning_rate": 3.3237560710458137e-07,
"loss": 0.9801,
"step": 1222
},
{
"epoch": 0.9206737555283712,
"grad_norm": 2.569101602670157,
"learning_rate": 3.261681066064859e-07,
"loss": 0.9929,
"step": 1223
},
{
"epoch": 0.9214265550014115,
"grad_norm": 4.085329324706736,
"learning_rate": 3.2001815889286856e-07,
"loss": 0.91,
"step": 1224
},
{
"epoch": 0.9221793544744519,
"grad_norm": 2.713777124027884,
"learning_rate": 3.1392580055177867e-07,
"loss": 1.0687,
"step": 1225
},
{
"epoch": 0.9229321539474923,
"grad_norm": 2.680582171824577,
"learning_rate": 3.0789106782864285e-07,
"loss": 0.9431,
"step": 1226
},
{
"epoch": 0.9236849534205326,
"grad_norm": 2.7796394938594244,
"learning_rate": 3.019139966260587e-07,
"loss": 0.9721,
"step": 1227
},
{
"epoch": 0.924437752893573,
"grad_norm": 3.0975647783515483,
"learning_rate": 2.959946225035726e-07,
"loss": 0.9531,
"step": 1228
},
{
"epoch": 0.9251905523666133,
"grad_norm": 2.7977963864759587,
"learning_rate": 2.901329806774744e-07,
"loss": 0.9394,
"step": 1229
},
{
"epoch": 0.9259433518396537,
"grad_norm": 2.5453356161001284,
"learning_rate": 2.843291060205855e-07,
"loss": 0.904,
"step": 1230
},
{
"epoch": 0.9266961513126941,
"grad_norm": 2.7541961024247144,
"learning_rate": 2.785830330620509e-07,
"loss": 0.9774,
"step": 1231
},
{
"epoch": 0.9274489507857344,
"grad_norm": 3.084595078404382,
"learning_rate": 2.728947959871353e-07,
"loss": 0.8789,
"step": 1232
},
{
"epoch": 0.9282017502587748,
"grad_norm": 2.838093099116681,
"learning_rate": 2.672644286370163e-07,
"loss": 0.9272,
"step": 1233
},
{
"epoch": 0.9289545497318152,
"grad_norm": 3.2906398456754453,
"learning_rate": 2.616919645085902e-07,
"loss": 0.9266,
"step": 1234
},
{
"epoch": 0.9297073492048555,
"grad_norm": 2.5775786586459555,
"learning_rate": 2.5617743675426354e-07,
"loss": 0.8926,
"step": 1235
},
{
"epoch": 0.930460148677896,
"grad_norm": 2.6722830170773344,
"learning_rate": 2.507208781817638e-07,
"loss": 0.9202,
"step": 1236
},
{
"epoch": 0.9312129481509362,
"grad_norm": 2.4434315642032716,
"learning_rate": 2.453223212539391e-07,
"loss": 0.9665,
"step": 1237
},
{
"epoch": 0.9319657476239767,
"grad_norm": 3.06220669668657,
"learning_rate": 2.399817980885677e-07,
"loss": 0.9852,
"step": 1238
},
{
"epoch": 0.9327185470970171,
"grad_norm": 2.980887110954531,
"learning_rate": 2.3469934045816435e-07,
"loss": 0.9432,
"step": 1239
},
{
"epoch": 0.9334713465700574,
"grad_norm": 2.856068250126512,
"learning_rate": 2.294749797897955e-07,
"loss": 1.0187,
"step": 1240
},
{
"epoch": 0.9342241460430978,
"grad_norm": 3.5481294720376697,
"learning_rate": 2.243087471648886e-07,
"loss": 0.9678,
"step": 1241
},
{
"epoch": 0.9349769455161382,
"grad_norm": 2.990365041826747,
"learning_rate": 2.192006733190466e-07,
"loss": 0.9186,
"step": 1242
},
{
"epoch": 0.9357297449891785,
"grad_norm": 2.7209981946105963,
"learning_rate": 2.1415078864187034e-07,
"loss": 0.8997,
"step": 1243
},
{
"epoch": 0.9364825444622189,
"grad_norm": 2.633970676325884,
"learning_rate": 2.091591231767709e-07,
"loss": 0.9298,
"step": 1244
},
{
"epoch": 0.9372353439352592,
"grad_norm": 2.5883350194236416,
"learning_rate": 2.0422570662079866e-07,
"loss": 0.9546,
"step": 1245
},
{
"epoch": 0.9379881434082996,
"grad_norm": 3.2796875101881517,
"learning_rate": 1.9935056832445676e-07,
"loss": 0.9693,
"step": 1246
},
{
"epoch": 0.93874094288134,
"grad_norm": 2.5962393649368614,
"learning_rate": 1.945337372915368e-07,
"loss": 1.0175,
"step": 1247
},
{
"epoch": 0.9394937423543803,
"grad_norm": 2.7526882993306154,
"learning_rate": 1.8977524217893782e-07,
"loss": 0.9715,
"step": 1248
},
{
"epoch": 0.9402465418274207,
"grad_norm": 2.8840274447370993,
"learning_rate": 1.85075111296501e-07,
"loss": 0.8795,
"step": 1249
},
{
"epoch": 0.9409993413004611,
"grad_norm": 2.9301887630100554,
"learning_rate": 1.804333726068408e-07,
"loss": 0.9652,
"step": 1250
},
{
"epoch": 0.9417521407735014,
"grad_norm": 2.6456393593543144,
"learning_rate": 1.7585005372517504e-07,
"loss": 0.9918,
"step": 1251
},
{
"epoch": 0.9425049402465419,
"grad_norm": 2.8411309787841863,
"learning_rate": 1.7132518191916413e-07,
"loss": 0.9837,
"step": 1252
},
{
"epoch": 0.9432577397195822,
"grad_norm": 3.2850349406325594,
"learning_rate": 1.6685878410874768e-07,
"loss": 0.9916,
"step": 1253
},
{
"epoch": 0.9440105391926226,
"grad_norm": 2.964090383970478,
"learning_rate": 1.6245088686598686e-07,
"loss": 0.9974,
"step": 1254
},
{
"epoch": 0.944763338665663,
"grad_norm": 2.6196165472195543,
"learning_rate": 1.5810151641489912e-07,
"loss": 0.9014,
"step": 1255
},
{
"epoch": 0.9455161381387033,
"grad_norm": 2.7772230721575615,
"learning_rate": 1.5381069863131037e-07,
"loss": 0.9592,
"step": 1256
},
{
"epoch": 0.9462689376117437,
"grad_norm": 2.5463035833372043,
"learning_rate": 1.495784590426963e-07,
"loss": 0.9416,
"step": 1257
},
{
"epoch": 0.947021737084784,
"grad_norm": 3.1995799318479943,
"learning_rate": 1.4540482282803136e-07,
"loss": 1.0492,
"step": 1258
},
{
"epoch": 0.9477745365578244,
"grad_norm": 3.5523139139820374,
"learning_rate": 1.4128981481764115e-07,
"loss": 1.016,
"step": 1259
},
{
"epoch": 0.9485273360308648,
"grad_norm": 3.262601645233588,
"learning_rate": 1.3723345949305245e-07,
"loss": 0.9621,
"step": 1260
},
{
"epoch": 0.9492801355039051,
"grad_norm": 2.456450672228116,
"learning_rate": 1.3323578098684565e-07,
"loss": 0.8501,
"step": 1261
},
{
"epoch": 0.9500329349769455,
"grad_norm": 3.7186153382386595,
"learning_rate": 1.292968030825159e-07,
"loss": 0.9515,
"step": 1262
},
{
"epoch": 0.9507857344499859,
"grad_norm": 3.049161674324443,
"learning_rate": 1.2541654921432998e-07,
"loss": 1.0811,
"step": 1263
},
{
"epoch": 0.9515385339230262,
"grad_norm": 3.291544360767655,
"learning_rate": 1.2159504246718522e-07,
"loss": 0.9972,
"step": 1264
},
{
"epoch": 0.9522913333960666,
"grad_norm": 3.0656467336632085,
"learning_rate": 1.1783230557647075e-07,
"loss": 0.9184,
"step": 1265
},
{
"epoch": 0.9530441328691069,
"grad_norm": 2.628146994913141,
"learning_rate": 1.1412836092793977e-07,
"loss": 0.8738,
"step": 1266
},
{
"epoch": 0.9537969323421474,
"grad_norm": 2.8202267098554836,
"learning_rate": 1.1048323055756649e-07,
"loss": 0.9663,
"step": 1267
},
{
"epoch": 0.9545497318151878,
"grad_norm": 2.430990179547932,
"learning_rate": 1.068969361514216e-07,
"loss": 0.9611,
"step": 1268
},
{
"epoch": 0.9553025312882281,
"grad_norm": 3.248130764229673,
"learning_rate": 1.033694990455425e-07,
"loss": 0.9268,
"step": 1269
},
{
"epoch": 0.9560553307612685,
"grad_norm": 3.7522100183518363,
"learning_rate": 9.990094022580332e-08,
"loss": 0.9114,
"step": 1270
},
{
"epoch": 0.9568081302343089,
"grad_norm": 3.053183197199831,
"learning_rate": 9.649128032779287e-08,
"loss": 0.9182,
"step": 1271
},
{
"epoch": 0.9575609297073492,
"grad_norm": 2.7055521182554965,
"learning_rate": 9.314053963669245e-08,
"loss": 0.9798,
"step": 1272
},
{
"epoch": 0.9583137291803896,
"grad_norm": 4.422847847788482,
"learning_rate": 8.984873808715155e-08,
"loss": 0.9754,
"step": 1273
},
{
"epoch": 0.9590665286534299,
"grad_norm": 2.4097712053201854,
"learning_rate": 8.661589526317238e-08,
"loss": 1.0184,
"step": 1274
},
{
"epoch": 0.9598193281264703,
"grad_norm": 3.3193315006269564,
"learning_rate": 8.344203039799214e-08,
"loss": 0.9705,
"step": 1275
},
{
"epoch": 0.9605721275995107,
"grad_norm": 3.230643135053207,
"learning_rate": 8.032716237396987e-08,
"loss": 1.0181,
"step": 1276
},
{
"epoch": 0.961324927072551,
"grad_norm": 2.748071452140581,
"learning_rate": 7.727130972247199e-08,
"loss": 0.9488,
"step": 1277
},
{
"epoch": 0.9620777265455914,
"grad_norm": 4.801952883628051,
"learning_rate": 7.427449062376468e-08,
"loss": 0.9536,
"step": 1278
},
{
"epoch": 0.9628305260186318,
"grad_norm": 2.2950527457208594,
"learning_rate": 7.133672290690064e-08,
"loss": 1.0118,
"step": 1279
},
{
"epoch": 0.9635833254916721,
"grad_norm": 2.5220242298362603,
"learning_rate": 6.845802404962243e-08,
"loss": 0.8761,
"step": 1280
},
{
"epoch": 0.9643361249647125,
"grad_norm": 3.113693252903444,
"learning_rate": 6.56384111782482e-08,
"loss": 0.9248,
"step": 1281
},
{
"epoch": 0.9650889244377528,
"grad_norm": 3.275264466855922,
"learning_rate": 6.287790106757396e-08,
"loss": 0.9332,
"step": 1282
},
{
"epoch": 0.9658417239107933,
"grad_norm": 3.1198895526228316,
"learning_rate": 6.017651014077807e-08,
"loss": 1.0051,
"step": 1283
},
{
"epoch": 0.9665945233838337,
"grad_norm": 3.2659243106544054,
"learning_rate": 5.753425446931582e-08,
"loss": 0.9792,
"step": 1284
},
{
"epoch": 0.967347322856874,
"grad_norm": 2.7005554860553174,
"learning_rate": 5.495114977282945e-08,
"loss": 1.0284,
"step": 1285
},
{
"epoch": 0.9681001223299144,
"grad_norm": 2.231612566361707,
"learning_rate": 5.2427211419051605e-08,
"loss": 0.8459,
"step": 1286
},
{
"epoch": 0.9688529218029548,
"grad_norm": 2.470625102204559,
"learning_rate": 4.99624544237165e-08,
"loss": 0.9918,
"step": 1287
},
{
"epoch": 0.9696057212759951,
"grad_norm": 2.7251044172160417,
"learning_rate": 4.7556893450466656e-08,
"loss": 0.9955,
"step": 1288
},
{
"epoch": 0.9703585207490355,
"grad_norm": 2.662846919070589,
"learning_rate": 4.5210542810771864e-08,
"loss": 0.9875,
"step": 1289
},
{
"epoch": 0.9711113202220758,
"grad_norm": 3.1981248551397847,
"learning_rate": 4.292341646383813e-08,
"loss": 1.0478,
"step": 1290
},
{
"epoch": 0.9718641196951162,
"grad_norm": 2.738474070013934,
"learning_rate": 4.069552801652443e-08,
"loss": 0.9877,
"step": 1291
},
{
"epoch": 0.9726169191681566,
"grad_norm": 2.7898654824264213,
"learning_rate": 3.852689072326832e-08,
"loss": 0.9641,
"step": 1292
},
{
"epoch": 0.9733697186411969,
"grad_norm": 2.942924173502617,
"learning_rate": 3.641751748600042e-08,
"loss": 0.9441,
"step": 1293
},
{
"epoch": 0.9741225181142373,
"grad_norm": 2.767562137073869,
"learning_rate": 3.436742085407119e-08,
"loss": 0.9486,
"step": 1294
},
{
"epoch": 0.9748753175872776,
"grad_norm": 3.186982969064621,
"learning_rate": 3.2376613024175384e-08,
"loss": 0.9654,
"step": 1295
},
{
"epoch": 0.975628117060318,
"grad_norm": 2.7143614500651503,
"learning_rate": 3.044510584027771e-08,
"loss": 0.9494,
"step": 1296
},
{
"epoch": 0.9763809165333585,
"grad_norm": 4.735419194732088,
"learning_rate": 2.8572910793546183e-08,
"loss": 0.9178,
"step": 1297
},
{
"epoch": 0.9771337160063988,
"grad_norm": 2.683092581543925,
"learning_rate": 2.676003902227886e-08,
"loss": 1.047,
"step": 1298
},
{
"epoch": 0.9778865154794392,
"grad_norm": 3.1217343085874654,
"learning_rate": 2.50065013118439e-08,
"loss": 0.8718,
"step": 1299
},
{
"epoch": 0.9786393149524796,
"grad_norm": 2.3338305591023634,
"learning_rate": 2.3312308094607382e-08,
"loss": 0.9517,
"step": 1300
},
{
"epoch": 0.9793921144255199,
"grad_norm": 3.498090322618458,
"learning_rate": 2.167746944988114e-08,
"loss": 0.9942,
"step": 1301
},
{
"epoch": 0.9801449138985603,
"grad_norm": 3.1250744843303195,
"learning_rate": 2.010199510385058e-08,
"loss": 0.9257,
"step": 1302
},
{
"epoch": 0.9808977133716006,
"grad_norm": 3.0363120214015034,
"learning_rate": 1.8585894429528073e-08,
"loss": 0.975,
"step": 1303
},
{
"epoch": 0.981650512844641,
"grad_norm": 2.3790341152687255,
"learning_rate": 1.7129176446692986e-08,
"loss": 0.9287,
"step": 1304
},
{
"epoch": 0.9824033123176814,
"grad_norm": 2.7257619356627076,
"learning_rate": 1.5731849821833955e-08,
"loss": 0.892,
"step": 1305
},
{
"epoch": 0.9831561117907217,
"grad_norm": 2.36317553749754,
"learning_rate": 1.4393922868105591e-08,
"loss": 0.9281,
"step": 1306
},
{
"epoch": 0.9839089112637621,
"grad_norm": 2.3089545627122505,
"learning_rate": 1.3115403545270744e-08,
"loss": 0.8987,
"step": 1307
},
{
"epoch": 0.9846617107368025,
"grad_norm": 3.1710248965049357,
"learning_rate": 1.1896299459658311e-08,
"loss": 0.9173,
"step": 1308
},
{
"epoch": 0.9854145102098428,
"grad_norm": 2.597055385822197,
"learning_rate": 1.0736617864117727e-08,
"loss": 0.9617,
"step": 1309
},
{
"epoch": 0.9861673096828832,
"grad_norm": 3.916236816005277,
"learning_rate": 9.636365657971215e-09,
"loss": 1.023,
"step": 1310
},
{
"epoch": 0.9869201091559235,
"grad_norm": 2.688668108409379,
"learning_rate": 8.595549386981595e-09,
"loss": 0.9199,
"step": 1311
},
{
"epoch": 0.987672908628964,
"grad_norm": 2.4623215137317254,
"learning_rate": 7.614175243301213e-09,
"loss": 0.9765,
"step": 1312
},
{
"epoch": 0.9884257081020044,
"grad_norm": 3.3372772901322794,
"learning_rate": 6.692249065447521e-09,
"loss": 0.8604,
"step": 1313
},
{
"epoch": 0.9891785075750447,
"grad_norm": 3.3515115025764595,
"learning_rate": 5.8297763382597625e-09,
"loss": 1.0037,
"step": 1314
},
{
"epoch": 0.9899313070480851,
"grad_norm": 2.604424720606584,
"learning_rate": 5.026762192870127e-09,
"loss": 0.9879,
"step": 1315
},
{
"epoch": 0.9906841065211255,
"grad_norm": 3.544062299090655,
"learning_rate": 4.283211406670429e-09,
"loss": 1.0411,
"step": 1316
},
{
"epoch": 0.9914369059941658,
"grad_norm": 3.267680350517861,
"learning_rate": 3.5991284032899087e-09,
"loss": 0.976,
"step": 1317
},
{
"epoch": 0.9921897054672062,
"grad_norm": 3.0886627412078256,
"learning_rate": 2.974517252558595e-09,
"loss": 0.8727,
"step": 1318
},
{
"epoch": 0.9929425049402465,
"grad_norm": 2.4720359733436275,
"learning_rate": 2.4093816704950924e-09,
"loss": 0.9837,
"step": 1319
},
{
"epoch": 0.9936953044132869,
"grad_norm": 2.5467003523165035,
"learning_rate": 1.9037250192732728e-09,
"loss": 0.9094,
"step": 1320
},
{
"epoch": 0.9944481038863273,
"grad_norm": 2.76588303003768,
"learning_rate": 1.4575503072100649e-09,
"loss": 1.019,
"step": 1321
},
{
"epoch": 0.9952009033593676,
"grad_norm": 2.894873158632333,
"learning_rate": 1.0708601887454706e-09,
"loss": 0.9081,
"step": 1322
},
{
"epoch": 0.995953702832408,
"grad_norm": 2.6572995358290408,
"learning_rate": 7.43656964423689e-10,
"loss": 0.9697,
"step": 1323
},
{
"epoch": 0.9967065023054484,
"grad_norm": 3.158217365660322,
"learning_rate": 4.759425808853468e-10,
"loss": 0.9996,
"step": 1324
},
{
"epoch": 0.9974593017784887,
"grad_norm": 4.329882531340572,
"learning_rate": 2.677186308497337e-10,
"loss": 0.9162,
"step": 1325
},
{
"epoch": 0.9982121012515291,
"grad_norm": 3.0070450446433963,
"learning_rate": 1.1898635310925167e-10,
"loss": 0.9543,
"step": 1326
},
{
"epoch": 0.9989649007245694,
"grad_norm": 2.7526840917387676,
"learning_rate": 2.9746632520533116e-11,
"loss": 0.9748,
"step": 1327
},
{
"epoch": 0.9997177001976099,
"grad_norm": 2.4986435103048477,
"learning_rate": 0.0,
"loss": 0.9028,
"step": 1328
},
{
"epoch": 0.9997177001976099,
"step": 1328,
"total_flos": 1676734831263744.0,
"train_loss": 1.0399419161300343,
"train_runtime": 48886.61,
"train_samples_per_second": 6.956,
"train_steps_per_second": 0.027
}
],
"logging_steps": 1.0,
"max_steps": 1328,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 133,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1676734831263744.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}