{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9798818457608176, "eval_steps": 500, "global_step": 24800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 3.703125, "learning_rate": 1.9999968642467102e-05, "loss": 4.2386, "step": 20 }, { "epoch": 0.0, "grad_norm": 1.8515625, "learning_rate": 1.999987438156715e-05, "loss": 3.1965, "step": 40 }, { "epoch": 0.0, "grad_norm": 1.4921875, "learning_rate": 1.9999717217822316e-05, "loss": 2.7844, "step": 60 }, { "epoch": 0.01, "grad_norm": 1.25, "learning_rate": 1.999949715222121e-05, "loss": 2.6013, "step": 80 }, { "epoch": 0.01, "grad_norm": 2.28125, "learning_rate": 1.9999214186148133e-05, "loss": 2.5417, "step": 100 }, { "epoch": 0.01, "grad_norm": 1.1953125, "learning_rate": 1.9998868321383038e-05, "loss": 2.4376, "step": 120 }, { "epoch": 0.01, "grad_norm": 1.25, "learning_rate": 1.9998459560101546e-05, "loss": 2.3875, "step": 140 }, { "epoch": 0.01, "grad_norm": 1.3359375, "learning_rate": 1.9997987904874905e-05, "loss": 2.3568, "step": 160 }, { "epoch": 0.01, "grad_norm": 1.359375, "learning_rate": 1.9997453358670004e-05, "loss": 2.3034, "step": 180 }, { "epoch": 0.02, "grad_norm": 1.9296875, "learning_rate": 1.9996855924849337e-05, "loss": 2.2779, "step": 200 }, { "epoch": 0.02, "grad_norm": 1.6015625, "learning_rate": 1.999619560717097e-05, "loss": 2.2728, "step": 220 }, { "epoch": 0.02, "grad_norm": 1.4921875, "learning_rate": 1.9995472409788548e-05, "loss": 2.2436, "step": 240 }, { "epoch": 0.02, "grad_norm": 1.625, "learning_rate": 1.999468633725125e-05, "loss": 2.2062, "step": 260 }, { "epoch": 0.02, "grad_norm": 1.6875, "learning_rate": 1.9993837394503745e-05, "loss": 2.1873, "step": 280 }, { "epoch": 0.02, "grad_norm": 2.328125, "learning_rate": 1.99929255868862e-05, "loss": 2.1973, "step": 300 }, { "epoch": 0.03, "grad_norm": 1.5546875, "learning_rate": 1.999195092013422e-05, "loss": 2.1891, "step": 320 }, { "epoch": 0.03, "grad_norm": 2.8125, "learning_rate": 1.99909134003788e-05, "loss": 2.1813, "step": 340 }, { "epoch": 0.03, "grad_norm": 2.1875, "learning_rate": 1.998981303414633e-05, "loss": 2.1609, "step": 360 }, { "epoch": 0.03, "grad_norm": 2.234375, "learning_rate": 1.9988649828358504e-05, "loss": 2.1693, "step": 380 }, { "epoch": 0.03, "grad_norm": 1.8359375, "learning_rate": 1.9987423790332315e-05, "loss": 2.1465, "step": 400 }, { "epoch": 0.03, "grad_norm": 1.71875, "learning_rate": 1.9986134927779986e-05, "loss": 2.1387, "step": 420 }, { "epoch": 0.04, "grad_norm": 1.734375, "learning_rate": 1.998478324880893e-05, "loss": 2.1236, "step": 440 }, { "epoch": 0.04, "grad_norm": 1.7265625, "learning_rate": 1.9983368761921703e-05, "loss": 2.1144, "step": 460 }, { "epoch": 0.04, "grad_norm": 1.640625, "learning_rate": 1.9981891476015936e-05, "loss": 2.1164, "step": 480 }, { "epoch": 0.04, "grad_norm": 1.65625, "learning_rate": 1.99803514003843e-05, "loss": 2.1083, "step": 500 }, { "epoch": 0.04, "grad_norm": 1.8515625, "learning_rate": 1.9978748544714427e-05, "loss": 2.0906, "step": 520 }, { "epoch": 0.04, "grad_norm": 1.890625, "learning_rate": 1.997708291908886e-05, "loss": 2.1043, "step": 540 }, { "epoch": 0.04, "grad_norm": 2.0, "learning_rate": 1.9975354533984995e-05, "loss": 2.1028, "step": 560 }, { "epoch": 0.05, "grad_norm": 1.8984375, "learning_rate": 1.9973563400274994e-05, "loss": 2.082, "step": 580 }, { "epoch": 0.05, "grad_norm": 2.1875, "learning_rate": 1.9971709529225754e-05, "loss": 2.0806, "step": 600 }, { "epoch": 0.05, "grad_norm": 2.375, "learning_rate": 1.9969792932498783e-05, "loss": 2.0803, "step": 620 }, { "epoch": 0.05, "grad_norm": 1.84375, "learning_rate": 1.9967813622150177e-05, "loss": 2.0731, "step": 640 }, { "epoch": 0.05, "grad_norm": 2.015625, "learning_rate": 1.996577161063052e-05, "loss": 2.0662, "step": 660 }, { "epoch": 0.05, "grad_norm": 1.859375, "learning_rate": 1.99636669107848e-05, "loss": 2.0472, "step": 680 }, { "epoch": 0.06, "grad_norm": 1.9609375, "learning_rate": 1.996149953585235e-05, "loss": 2.0562, "step": 700 }, { "epoch": 0.06, "grad_norm": 1.84375, "learning_rate": 1.9959269499466746e-05, "loss": 2.0587, "step": 720 }, { "epoch": 0.06, "grad_norm": 1.890625, "learning_rate": 1.9956976815655723e-05, "loss": 2.0576, "step": 740 }, { "epoch": 0.06, "grad_norm": 1.9609375, "learning_rate": 1.99546214988411e-05, "loss": 2.0508, "step": 760 }, { "epoch": 0.06, "grad_norm": 1.953125, "learning_rate": 1.9952203563838676e-05, "loss": 2.034, "step": 780 }, { "epoch": 0.06, "grad_norm": 1.8046875, "learning_rate": 1.9949723025858136e-05, "loss": 2.0259, "step": 800 }, { "epoch": 0.07, "grad_norm": 2.078125, "learning_rate": 1.994717990050297e-05, "loss": 2.0439, "step": 820 }, { "epoch": 0.07, "grad_norm": 2.875, "learning_rate": 1.9944574203770365e-05, "loss": 2.0371, "step": 840 }, { "epoch": 0.07, "grad_norm": 2.515625, "learning_rate": 1.994190595205109e-05, "loss": 2.0375, "step": 860 }, { "epoch": 0.07, "grad_norm": 2.0625, "learning_rate": 1.9939175162129427e-05, "loss": 2.0227, "step": 880 }, { "epoch": 0.07, "grad_norm": 2.078125, "learning_rate": 1.9936381851183032e-05, "loss": 2.0182, "step": 900 }, { "epoch": 0.07, "grad_norm": 1.8203125, "learning_rate": 1.9933526036782852e-05, "loss": 2.0208, "step": 920 }, { "epoch": 0.08, "grad_norm": 1.9921875, "learning_rate": 1.993060773689299e-05, "loss": 2.0177, "step": 940 }, { "epoch": 0.08, "grad_norm": 1.8125, "learning_rate": 1.992762696987062e-05, "loss": 2.0208, "step": 960 }, { "epoch": 0.08, "grad_norm": 2.0, "learning_rate": 1.9924583754465842e-05, "loss": 1.9938, "step": 980 }, { "epoch": 0.08, "grad_norm": 2.109375, "learning_rate": 1.9921478109821598e-05, "loss": 2.0132, "step": 1000 }, { "epoch": 0.08, "grad_norm": 1.8046875, "learning_rate": 1.9918310055473515e-05, "loss": 2.0062, "step": 1020 }, { "epoch": 0.08, "grad_norm": 1.8828125, "learning_rate": 1.991507961134981e-05, "loss": 2.0074, "step": 1040 }, { "epoch": 0.08, "grad_norm": 2.046875, "learning_rate": 1.9911786797771144e-05, "loss": 2.0153, "step": 1060 }, { "epoch": 0.09, "grad_norm": 2.078125, "learning_rate": 1.990843163545052e-05, "loss": 1.996, "step": 1080 }, { "epoch": 0.09, "grad_norm": 2.21875, "learning_rate": 1.990501414549312e-05, "loss": 2.0067, "step": 1100 }, { "epoch": 0.09, "grad_norm": 1.8359375, "learning_rate": 1.9901534349396204e-05, "loss": 1.9922, "step": 1120 }, { "epoch": 0.09, "grad_norm": 2.25, "learning_rate": 1.9897992269048953e-05, "loss": 1.9953, "step": 1140 }, { "epoch": 0.09, "grad_norm": 2.109375, "learning_rate": 1.9894387926732342e-05, "loss": 1.9968, "step": 1160 }, { "epoch": 0.09, "grad_norm": 2.390625, "learning_rate": 1.9890721345118987e-05, "loss": 1.9851, "step": 1180 }, { "epoch": 0.1, "grad_norm": 2.171875, "learning_rate": 1.988699254727303e-05, "loss": 1.9749, "step": 1200 }, { "epoch": 0.1, "grad_norm": 2.25, "learning_rate": 1.988320155664996e-05, "loss": 2.003, "step": 1220 }, { "epoch": 0.1, "grad_norm": 2.140625, "learning_rate": 1.9879348397096482e-05, "loss": 1.9779, "step": 1240 }, { "epoch": 0.1, "grad_norm": 2.09375, "learning_rate": 1.9875433092850376e-05, "loss": 1.9633, "step": 1260 }, { "epoch": 0.1, "grad_norm": 1.9921875, "learning_rate": 1.9871455668540325e-05, "loss": 1.9824, "step": 1280 }, { "epoch": 0.1, "grad_norm": 2.3125, "learning_rate": 1.9867416149185774e-05, "loss": 1.9785, "step": 1300 }, { "epoch": 0.11, "grad_norm": 2.265625, "learning_rate": 1.9863314560196775e-05, "loss": 1.9923, "step": 1320 }, { "epoch": 0.11, "grad_norm": 2.140625, "learning_rate": 1.9859150927373803e-05, "loss": 1.9839, "step": 1340 }, { "epoch": 0.11, "grad_norm": 2.4375, "learning_rate": 1.9854925276907627e-05, "loss": 1.985, "step": 1360 }, { "epoch": 0.11, "grad_norm": 1.953125, "learning_rate": 1.985063763537913e-05, "loss": 1.974, "step": 1380 }, { "epoch": 0.11, "grad_norm": 1.9453125, "learning_rate": 1.9846288029759124e-05, "loss": 1.9801, "step": 1400 }, { "epoch": 0.11, "grad_norm": 2.03125, "learning_rate": 1.984187648740822e-05, "loss": 1.9733, "step": 1420 }, { "epoch": 0.11, "grad_norm": 2.046875, "learning_rate": 1.983740303607662e-05, "loss": 1.9653, "step": 1440 }, { "epoch": 0.12, "grad_norm": 2.4375, "learning_rate": 1.9832867703903953e-05, "loss": 1.9672, "step": 1460 }, { "epoch": 0.12, "grad_norm": 2.15625, "learning_rate": 1.9828270519419115e-05, "loss": 1.9625, "step": 1480 }, { "epoch": 0.12, "grad_norm": 2.0625, "learning_rate": 1.9823611511540064e-05, "loss": 1.9542, "step": 1500 }, { "epoch": 0.12, "grad_norm": 2.328125, "learning_rate": 1.9818890709573652e-05, "loss": 1.9475, "step": 1520 }, { "epoch": 0.12, "grad_norm": 2.234375, "learning_rate": 1.9814108143215446e-05, "loss": 1.9642, "step": 1540 }, { "epoch": 0.12, "grad_norm": 2.015625, "learning_rate": 1.9809263842549516e-05, "loss": 1.9541, "step": 1560 }, { "epoch": 0.13, "grad_norm": 1.9921875, "learning_rate": 1.980435783804828e-05, "loss": 1.956, "step": 1580 }, { "epoch": 0.13, "grad_norm": 1.96875, "learning_rate": 1.9799390160572295e-05, "loss": 1.9812, "step": 1600 }, { "epoch": 0.13, "grad_norm": 1.953125, "learning_rate": 1.979436084137005e-05, "loss": 1.9617, "step": 1620 }, { "epoch": 0.13, "grad_norm": 2.125, "learning_rate": 1.9789269912077792e-05, "loss": 1.9534, "step": 1640 }, { "epoch": 0.13, "grad_norm": 2.140625, "learning_rate": 1.9784117404719324e-05, "loss": 1.9519, "step": 1660 }, { "epoch": 0.13, "grad_norm": 2.046875, "learning_rate": 1.977890335170578e-05, "loss": 1.9448, "step": 1680 }, { "epoch": 0.14, "grad_norm": 2.21875, "learning_rate": 1.9773627785835454e-05, "loss": 1.9361, "step": 1700 }, { "epoch": 0.14, "grad_norm": 2.265625, "learning_rate": 1.9768290740293573e-05, "loss": 1.9485, "step": 1720 }, { "epoch": 0.14, "grad_norm": 2.046875, "learning_rate": 1.9762892248652093e-05, "loss": 1.9356, "step": 1740 }, { "epoch": 0.14, "grad_norm": 2.453125, "learning_rate": 1.975743234486949e-05, "loss": 1.9484, "step": 1760 }, { "epoch": 0.14, "grad_norm": 2.28125, "learning_rate": 1.9751911063290542e-05, "loss": 1.9358, "step": 1780 }, { "epoch": 0.14, "grad_norm": 2.203125, "learning_rate": 1.974632843864612e-05, "loss": 1.9453, "step": 1800 }, { "epoch": 0.15, "grad_norm": 2.359375, "learning_rate": 1.9740684506052958e-05, "loss": 1.9217, "step": 1820 }, { "epoch": 0.15, "grad_norm": 2.171875, "learning_rate": 1.9734979301013445e-05, "loss": 1.9243, "step": 1840 }, { "epoch": 0.15, "grad_norm": 2.09375, "learning_rate": 1.9729212859415397e-05, "loss": 1.9421, "step": 1860 }, { "epoch": 0.15, "grad_norm": 2.625, "learning_rate": 1.9723385217531824e-05, "loss": 1.9311, "step": 1880 }, { "epoch": 0.15, "grad_norm": 1.9609375, "learning_rate": 1.9717496412020717e-05, "loss": 1.9402, "step": 1900 }, { "epoch": 0.15, "grad_norm": 2.140625, "learning_rate": 1.9711546479924797e-05, "loss": 1.9433, "step": 1920 }, { "epoch": 0.15, "grad_norm": 2.203125, "learning_rate": 1.9705535458671304e-05, "loss": 1.9181, "step": 1940 }, { "epoch": 0.16, "grad_norm": 2.375, "learning_rate": 1.9699463386071748e-05, "loss": 1.929, "step": 1960 }, { "epoch": 0.16, "grad_norm": 2.25, "learning_rate": 1.9693330300321666e-05, "loss": 1.941, "step": 1980 }, { "epoch": 0.16, "grad_norm": 2.3125, "learning_rate": 1.96871362400004e-05, "loss": 1.9172, "step": 2000 }, { "epoch": 0.16, "grad_norm": 2.375, "learning_rate": 1.9680881244070848e-05, "loss": 1.9103, "step": 2020 }, { "epoch": 0.16, "grad_norm": 1.9921875, "learning_rate": 1.96745653518792e-05, "loss": 1.9323, "step": 2040 }, { "epoch": 0.16, "grad_norm": 2.015625, "learning_rate": 1.9668188603154716e-05, "loss": 1.9333, "step": 2060 }, { "epoch": 0.17, "grad_norm": 2.640625, "learning_rate": 1.9661751038009463e-05, "loss": 1.9243, "step": 2080 }, { "epoch": 0.17, "grad_norm": 2.03125, "learning_rate": 1.965525269693807e-05, "loss": 1.9386, "step": 2100 }, { "epoch": 0.17, "grad_norm": 2.8125, "learning_rate": 1.9648693620817455e-05, "loss": 1.9293, "step": 2120 }, { "epoch": 0.17, "grad_norm": 2.125, "learning_rate": 1.96420738509066e-05, "loss": 1.9175, "step": 2140 }, { "epoch": 0.17, "grad_norm": 1.90625, "learning_rate": 1.963539342884626e-05, "loss": 1.9176, "step": 2160 }, { "epoch": 0.17, "grad_norm": 2.15625, "learning_rate": 1.9628652396658725e-05, "loss": 1.9182, "step": 2180 }, { "epoch": 0.18, "grad_norm": 2.015625, "learning_rate": 1.9621850796747528e-05, "loss": 1.9048, "step": 2200 }, { "epoch": 0.18, "grad_norm": 2.4375, "learning_rate": 1.9614988671897208e-05, "loss": 1.9209, "step": 2220 }, { "epoch": 0.18, "grad_norm": 2.1875, "learning_rate": 1.960806606527303e-05, "loss": 1.9064, "step": 2240 }, { "epoch": 0.18, "grad_norm": 2.515625, "learning_rate": 1.96010830204207e-05, "loss": 1.9192, "step": 2260 }, { "epoch": 0.18, "grad_norm": 2.203125, "learning_rate": 1.9594039581266107e-05, "loss": 1.9326, "step": 2280 }, { "epoch": 0.18, "grad_norm": 2.0, "learning_rate": 1.958693579211505e-05, "loss": 1.9194, "step": 2300 }, { "epoch": 0.19, "grad_norm": 2.0625, "learning_rate": 1.957977169765294e-05, "loss": 1.8903, "step": 2320 }, { "epoch": 0.19, "grad_norm": 2.703125, "learning_rate": 1.957254734294454e-05, "loss": 1.9135, "step": 2340 }, { "epoch": 0.19, "grad_norm": 2.125, "learning_rate": 1.956526277343366e-05, "loss": 1.9228, "step": 2360 }, { "epoch": 0.19, "grad_norm": 2.28125, "learning_rate": 1.95579180349429e-05, "loss": 1.9094, "step": 2380 }, { "epoch": 0.19, "grad_norm": 2.015625, "learning_rate": 1.955051317367333e-05, "loss": 1.9102, "step": 2400 }, { "epoch": 0.19, "grad_norm": 2.109375, "learning_rate": 1.9543048236204215e-05, "loss": 1.8987, "step": 2420 }, { "epoch": 0.19, "grad_norm": 2.328125, "learning_rate": 1.9535523269492733e-05, "loss": 1.9124, "step": 2440 }, { "epoch": 0.2, "grad_norm": 2.171875, "learning_rate": 1.9527938320873652e-05, "loss": 1.9137, "step": 2460 }, { "epoch": 0.2, "grad_norm": 2.078125, "learning_rate": 1.9520293438059065e-05, "loss": 1.9078, "step": 2480 }, { "epoch": 0.2, "grad_norm": 1.9765625, "learning_rate": 1.9512588669138055e-05, "loss": 1.9092, "step": 2500 }, { "epoch": 0.2, "grad_norm": 2.09375, "learning_rate": 1.9504824062576425e-05, "loss": 1.9114, "step": 2520 }, { "epoch": 0.2, "grad_norm": 2.046875, "learning_rate": 1.949699966721637e-05, "loss": 1.9121, "step": 2540 }, { "epoch": 0.2, "grad_norm": 2.171875, "learning_rate": 1.9489115532276182e-05, "loss": 1.9139, "step": 2560 }, { "epoch": 0.21, "grad_norm": 2.0625, "learning_rate": 1.9481171707349936e-05, "loss": 1.8889, "step": 2580 }, { "epoch": 0.21, "grad_norm": 2.03125, "learning_rate": 1.9473168242407183e-05, "loss": 1.9233, "step": 2600 }, { "epoch": 0.21, "grad_norm": 2.203125, "learning_rate": 1.9465105187792617e-05, "loss": 1.8928, "step": 2620 }, { "epoch": 0.21, "grad_norm": 2.15625, "learning_rate": 1.9456982594225787e-05, "loss": 1.9101, "step": 2640 }, { "epoch": 0.21, "grad_norm": 2.234375, "learning_rate": 1.9448800512800762e-05, "loss": 1.8862, "step": 2660 }, { "epoch": 0.21, "grad_norm": 2.203125, "learning_rate": 1.9440558994985805e-05, "loss": 1.8912, "step": 2680 }, { "epoch": 0.22, "grad_norm": 2.3125, "learning_rate": 1.943225809262306e-05, "loss": 1.8983, "step": 2700 }, { "epoch": 0.22, "grad_norm": 2.28125, "learning_rate": 1.942389785792822e-05, "loss": 1.9031, "step": 2720 }, { "epoch": 0.22, "grad_norm": 2.0625, "learning_rate": 1.94154783434902e-05, "loss": 1.9023, "step": 2740 }, { "epoch": 0.22, "grad_norm": 2.46875, "learning_rate": 1.940699960227081e-05, "loss": 1.8974, "step": 2760 }, { "epoch": 0.22, "grad_norm": 2.0625, "learning_rate": 1.939846168760441e-05, "loss": 1.9007, "step": 2780 }, { "epoch": 0.22, "grad_norm": 2.15625, "learning_rate": 1.938986465319759e-05, "loss": 1.8949, "step": 2800 }, { "epoch": 0.23, "grad_norm": 2.375, "learning_rate": 1.9381208553128813e-05, "loss": 1.8864, "step": 2820 }, { "epoch": 0.23, "grad_norm": 2.359375, "learning_rate": 1.9372493441848105e-05, "loss": 1.9024, "step": 2840 }, { "epoch": 0.23, "grad_norm": 2.109375, "learning_rate": 1.9363719374176683e-05, "loss": 1.8891, "step": 2860 }, { "epoch": 0.23, "grad_norm": 2.28125, "learning_rate": 1.935488640530662e-05, "loss": 1.8849, "step": 2880 }, { "epoch": 0.23, "grad_norm": 2.234375, "learning_rate": 1.9345994590800498e-05, "loss": 1.8939, "step": 2900 }, { "epoch": 0.23, "grad_norm": 2.453125, "learning_rate": 1.9337043986591064e-05, "loss": 1.8903, "step": 2920 }, { "epoch": 0.23, "grad_norm": 2.265625, "learning_rate": 1.9328034648980874e-05, "loss": 1.8731, "step": 2940 }, { "epoch": 0.24, "grad_norm": 2.203125, "learning_rate": 1.9318966634641936e-05, "loss": 1.8781, "step": 2960 }, { "epoch": 0.24, "grad_norm": 2.109375, "learning_rate": 1.9309840000615358e-05, "loss": 1.8855, "step": 2980 }, { "epoch": 0.24, "grad_norm": 2.03125, "learning_rate": 1.930065480431098e-05, "loss": 1.89, "step": 3000 }, { "epoch": 0.24, "grad_norm": 2.53125, "learning_rate": 1.9291411103507033e-05, "loss": 1.878, "step": 3020 }, { "epoch": 0.24, "grad_norm": 2.453125, "learning_rate": 1.9282108956349754e-05, "loss": 1.8896, "step": 3040 }, { "epoch": 0.24, "grad_norm": 2.140625, "learning_rate": 1.9272748421353023e-05, "loss": 1.8763, "step": 3060 }, { "epoch": 0.25, "grad_norm": 2.015625, "learning_rate": 1.9263329557398012e-05, "loss": 1.8741, "step": 3080 }, { "epoch": 0.25, "grad_norm": 2.28125, "learning_rate": 1.9253852423732803e-05, "loss": 1.8664, "step": 3100 }, { "epoch": 0.25, "grad_norm": 2.25, "learning_rate": 1.9244317079972007e-05, "loss": 1.8706, "step": 3120 }, { "epoch": 0.25, "grad_norm": 2.3125, "learning_rate": 1.92347235860964e-05, "loss": 1.8791, "step": 3140 }, { "epoch": 0.25, "grad_norm": 2.34375, "learning_rate": 1.9225072002452557e-05, "loss": 1.8834, "step": 3160 }, { "epoch": 0.25, "grad_norm": 2.359375, "learning_rate": 1.9215362389752434e-05, "loss": 1.8849, "step": 3180 }, { "epoch": 0.26, "grad_norm": 2.3125, "learning_rate": 1.9205594809073035e-05, "loss": 1.8804, "step": 3200 }, { "epoch": 0.26, "grad_norm": 2.3125, "learning_rate": 1.9195769321855984e-05, "loss": 1.8717, "step": 3220 }, { "epoch": 0.26, "grad_norm": 2.1875, "learning_rate": 1.9185885989907173e-05, "loss": 1.8701, "step": 3240 }, { "epoch": 0.26, "grad_norm": 2.0625, "learning_rate": 1.917594487539635e-05, "loss": 1.8764, "step": 3260 }, { "epoch": 0.26, "grad_norm": 2.234375, "learning_rate": 1.9165946040856747e-05, "loss": 1.8695, "step": 3280 }, { "epoch": 0.26, "grad_norm": 2.59375, "learning_rate": 1.9155889549184657e-05, "loss": 1.8747, "step": 3300 }, { "epoch": 0.27, "grad_norm": 3.3125, "learning_rate": 1.9145775463639073e-05, "loss": 1.858, "step": 3320 }, { "epoch": 0.27, "grad_norm": 2.4375, "learning_rate": 1.9135603847841266e-05, "loss": 1.8668, "step": 3340 }, { "epoch": 0.27, "grad_norm": 2.28125, "learning_rate": 1.9125374765774404e-05, "loss": 1.8479, "step": 3360 }, { "epoch": 0.27, "grad_norm": 2.1875, "learning_rate": 1.911508828178312e-05, "loss": 1.8627, "step": 3380 }, { "epoch": 0.27, "grad_norm": 2.40625, "learning_rate": 1.9104744460573156e-05, "loss": 1.8924, "step": 3400 }, { "epoch": 0.27, "grad_norm": 2.15625, "learning_rate": 1.909434336721089e-05, "loss": 1.8739, "step": 3420 }, { "epoch": 0.27, "grad_norm": 2.1875, "learning_rate": 1.9083885067122985e-05, "loss": 1.8762, "step": 3440 }, { "epoch": 0.28, "grad_norm": 2.5, "learning_rate": 1.9073369626095958e-05, "loss": 1.8711, "step": 3460 }, { "epoch": 0.28, "grad_norm": 2.046875, "learning_rate": 1.9062797110275743e-05, "loss": 1.8768, "step": 3480 }, { "epoch": 0.28, "grad_norm": 2.65625, "learning_rate": 1.9052167586167315e-05, "loss": 1.8683, "step": 3500 }, { "epoch": 0.28, "grad_norm": 2.234375, "learning_rate": 1.9041481120634248e-05, "loss": 1.8697, "step": 3520 }, { "epoch": 0.28, "grad_norm": 2.375, "learning_rate": 1.9030737780898284e-05, "loss": 1.863, "step": 3540 }, { "epoch": 0.28, "grad_norm": 2.25, "learning_rate": 1.9019937634538946e-05, "loss": 1.8664, "step": 3560 }, { "epoch": 0.29, "grad_norm": 2.203125, "learning_rate": 1.900908074949307e-05, "loss": 1.8684, "step": 3580 }, { "epoch": 0.29, "grad_norm": 2.1875, "learning_rate": 1.8998167194054425e-05, "loss": 1.8525, "step": 3600 }, { "epoch": 0.29, "grad_norm": 2.5625, "learning_rate": 1.8987197036873227e-05, "loss": 1.8582, "step": 3620 }, { "epoch": 0.29, "grad_norm": 2.796875, "learning_rate": 1.897617034695576e-05, "loss": 1.8664, "step": 3640 }, { "epoch": 0.29, "grad_norm": 2.1875, "learning_rate": 1.8965087193663906e-05, "loss": 1.8692, "step": 3660 }, { "epoch": 0.29, "grad_norm": 2.21875, "learning_rate": 1.895394764671473e-05, "loss": 1.8534, "step": 3680 }, { "epoch": 0.3, "grad_norm": 2.359375, "learning_rate": 1.894275177618004e-05, "loss": 1.852, "step": 3700 }, { "epoch": 0.3, "grad_norm": 2.4375, "learning_rate": 1.893149965248592e-05, "loss": 1.8699, "step": 3720 }, { "epoch": 0.3, "grad_norm": 2.203125, "learning_rate": 1.8920191346412326e-05, "loss": 1.8649, "step": 3740 }, { "epoch": 0.3, "grad_norm": 2.15625, "learning_rate": 1.8908826929092607e-05, "loss": 1.857, "step": 3760 }, { "epoch": 0.3, "grad_norm": 2.28125, "learning_rate": 1.8897406472013084e-05, "loss": 1.8404, "step": 3780 }, { "epoch": 0.3, "grad_norm": 2.65625, "learning_rate": 1.8885930047012585e-05, "loss": 1.864, "step": 3800 }, { "epoch": 0.3, "grad_norm": 2.640625, "learning_rate": 1.887439772628199e-05, "loss": 1.8578, "step": 3820 }, { "epoch": 0.31, "grad_norm": 1.9765625, "learning_rate": 1.886280958236379e-05, "loss": 1.8603, "step": 3840 }, { "epoch": 0.31, "grad_norm": 2.484375, "learning_rate": 1.8851165688151627e-05, "loss": 1.8603, "step": 3860 }, { "epoch": 0.31, "grad_norm": 2.65625, "learning_rate": 1.8839466116889823e-05, "loss": 1.8752, "step": 3880 }, { "epoch": 0.31, "grad_norm": 2.140625, "learning_rate": 1.882771094217293e-05, "loss": 1.8628, "step": 3900 }, { "epoch": 0.31, "grad_norm": 2.28125, "learning_rate": 1.8815900237945284e-05, "loss": 1.8575, "step": 3920 }, { "epoch": 0.31, "grad_norm": 2.28125, "learning_rate": 1.8804034078500497e-05, "loss": 1.85, "step": 3940 }, { "epoch": 0.32, "grad_norm": 2.140625, "learning_rate": 1.8792112538481025e-05, "loss": 1.8687, "step": 3960 }, { "epoch": 0.32, "grad_norm": 2.21875, "learning_rate": 1.8780135692877693e-05, "loss": 1.8465, "step": 3980 }, { "epoch": 0.32, "grad_norm": 2.3125, "learning_rate": 1.8768103617029213e-05, "loss": 1.8569, "step": 4000 }, { "epoch": 0.32, "grad_norm": 2.328125, "learning_rate": 1.8756016386621712e-05, "loss": 1.8401, "step": 4020 }, { "epoch": 0.32, "grad_norm": 2.21875, "learning_rate": 1.874387407768827e-05, "loss": 1.8356, "step": 4040 }, { "epoch": 0.32, "grad_norm": 2.21875, "learning_rate": 1.873167676660842e-05, "loss": 1.8605, "step": 4060 }, { "epoch": 0.33, "grad_norm": 2.125, "learning_rate": 1.8719424530107674e-05, "loss": 1.8598, "step": 4080 }, { "epoch": 0.33, "grad_norm": 2.890625, "learning_rate": 1.8707117445257067e-05, "loss": 1.8512, "step": 4100 }, { "epoch": 0.33, "grad_norm": 2.59375, "learning_rate": 1.8694755589472633e-05, "loss": 1.8482, "step": 4120 }, { "epoch": 0.33, "grad_norm": 2.359375, "learning_rate": 1.8682339040514933e-05, "loss": 1.8479, "step": 4140 }, { "epoch": 0.33, "grad_norm": 2.140625, "learning_rate": 1.8669867876488578e-05, "loss": 1.8397, "step": 4160 }, { "epoch": 0.33, "grad_norm": 2.0625, "learning_rate": 1.8657342175841722e-05, "loss": 1.8579, "step": 4180 }, { "epoch": 0.34, "grad_norm": 2.78125, "learning_rate": 1.8644762017365576e-05, "loss": 1.8508, "step": 4200 }, { "epoch": 0.34, "grad_norm": 2.34375, "learning_rate": 1.863212748019391e-05, "loss": 1.8335, "step": 4220 }, { "epoch": 0.34, "grad_norm": 3.03125, "learning_rate": 1.861943864380255e-05, "loss": 1.8415, "step": 4240 }, { "epoch": 0.34, "grad_norm": 2.375, "learning_rate": 1.86066955880089e-05, "loss": 1.8543, "step": 4260 }, { "epoch": 0.34, "grad_norm": 2.625, "learning_rate": 1.85938983929714e-05, "loss": 1.861, "step": 4280 }, { "epoch": 0.34, "grad_norm": 2.4375, "learning_rate": 1.858104713918907e-05, "loss": 1.8387, "step": 4300 }, { "epoch": 0.34, "grad_norm": 2.21875, "learning_rate": 1.8568141907500964e-05, "loss": 1.8561, "step": 4320 }, { "epoch": 0.35, "grad_norm": 2.484375, "learning_rate": 1.8555182779085678e-05, "loss": 1.8442, "step": 4340 }, { "epoch": 0.35, "grad_norm": 2.34375, "learning_rate": 1.8542169835460846e-05, "loss": 1.8582, "step": 4360 }, { "epoch": 0.35, "grad_norm": 2.265625, "learning_rate": 1.8529103158482605e-05, "loss": 1.8319, "step": 4380 }, { "epoch": 0.35, "grad_norm": 2.875, "learning_rate": 1.8515982830345115e-05, "loss": 1.8388, "step": 4400 }, { "epoch": 0.35, "grad_norm": 2.28125, "learning_rate": 1.850280893358e-05, "loss": 1.8552, "step": 4420 }, { "epoch": 0.35, "grad_norm": 2.296875, "learning_rate": 1.848958155105586e-05, "loss": 1.8317, "step": 4440 }, { "epoch": 0.36, "grad_norm": 2.203125, "learning_rate": 1.847630076597774e-05, "loss": 1.8413, "step": 4460 }, { "epoch": 0.36, "grad_norm": 2.5, "learning_rate": 1.846296666188661e-05, "loss": 1.8251, "step": 4480 }, { "epoch": 0.36, "grad_norm": 2.15625, "learning_rate": 1.8449579322658827e-05, "loss": 1.8445, "step": 4500 }, { "epoch": 0.36, "grad_norm": 2.40625, "learning_rate": 1.8436138832505623e-05, "loss": 1.8672, "step": 4520 }, { "epoch": 0.36, "grad_norm": 2.109375, "learning_rate": 1.842264527597257e-05, "loss": 1.8343, "step": 4540 }, { "epoch": 0.36, "grad_norm": 2.390625, "learning_rate": 1.8409098737939038e-05, "loss": 1.8272, "step": 4560 }, { "epoch": 0.37, "grad_norm": 2.296875, "learning_rate": 1.8395499303617677e-05, "loss": 1.8448, "step": 4580 }, { "epoch": 0.37, "grad_norm": 2.421875, "learning_rate": 1.8381847058553872e-05, "loss": 1.835, "step": 4600 }, { "epoch": 0.37, "grad_norm": 2.171875, "learning_rate": 1.8368142088625213e-05, "loss": 1.8356, "step": 4620 }, { "epoch": 0.37, "grad_norm": 2.40625, "learning_rate": 1.8354384480040935e-05, "loss": 1.8175, "step": 4640 }, { "epoch": 0.37, "grad_norm": 2.703125, "learning_rate": 1.83405743193414e-05, "loss": 1.8218, "step": 4660 }, { "epoch": 0.37, "grad_norm": 2.265625, "learning_rate": 1.8326711693397537e-05, "loss": 1.8409, "step": 4680 }, { "epoch": 0.38, "grad_norm": 2.25, "learning_rate": 1.831279668941031e-05, "loss": 1.8471, "step": 4700 }, { "epoch": 0.38, "grad_norm": 2.109375, "learning_rate": 1.8298829394910146e-05, "loss": 1.8708, "step": 4720 }, { "epoch": 0.38, "grad_norm": 2.28125, "learning_rate": 1.82848098977564e-05, "loss": 1.8397, "step": 4740 }, { "epoch": 0.38, "grad_norm": 2.25, "learning_rate": 1.8270738286136815e-05, "loss": 1.8166, "step": 4760 }, { "epoch": 0.38, "grad_norm": 2.1875, "learning_rate": 1.8256614648566937e-05, "loss": 1.8257, "step": 4780 }, { "epoch": 0.38, "grad_norm": 2.40625, "learning_rate": 1.824243907388958e-05, "loss": 1.8483, "step": 4800 }, { "epoch": 0.38, "grad_norm": 2.59375, "learning_rate": 1.8228211651274264e-05, "loss": 1.8235, "step": 4820 }, { "epoch": 0.39, "grad_norm": 2.640625, "learning_rate": 1.8213932470216652e-05, "loss": 1.8561, "step": 4840 }, { "epoch": 0.39, "grad_norm": 2.828125, "learning_rate": 1.8199601620537977e-05, "loss": 1.8324, "step": 4860 }, { "epoch": 0.39, "grad_norm": 2.171875, "learning_rate": 1.81852191923845e-05, "loss": 1.8389, "step": 4880 }, { "epoch": 0.39, "grad_norm": 2.171875, "learning_rate": 1.8170785276226915e-05, "loss": 1.8372, "step": 4900 }, { "epoch": 0.39, "grad_norm": 2.484375, "learning_rate": 1.8156299962859805e-05, "loss": 1.8367, "step": 4920 }, { "epoch": 0.39, "grad_norm": 2.21875, "learning_rate": 1.8141763343401057e-05, "loss": 1.8078, "step": 4940 }, { "epoch": 0.4, "grad_norm": 2.21875, "learning_rate": 1.8127175509291292e-05, "loss": 1.8181, "step": 4960 }, { "epoch": 0.4, "grad_norm": 2.359375, "learning_rate": 1.8112536552293286e-05, "loss": 1.8273, "step": 4980 }, { "epoch": 0.4, "grad_norm": 3.140625, "learning_rate": 1.80978465644914e-05, "loss": 1.8302, "step": 5000 }, { "epoch": 0.4, "grad_norm": 2.125, "learning_rate": 1.8083105638291e-05, "loss": 1.8469, "step": 5020 }, { "epoch": 0.4, "grad_norm": 2.46875, "learning_rate": 1.8068313866417876e-05, "loss": 1.8235, "step": 5040 }, { "epoch": 0.4, "grad_norm": 2.328125, "learning_rate": 1.8053471341917636e-05, "loss": 1.8302, "step": 5060 }, { "epoch": 0.41, "grad_norm": 2.625, "learning_rate": 1.8038578158155163e-05, "loss": 1.8218, "step": 5080 }, { "epoch": 0.41, "grad_norm": 2.234375, "learning_rate": 1.8023634408814e-05, "loss": 1.8322, "step": 5100 }, { "epoch": 0.41, "grad_norm": 2.4375, "learning_rate": 1.8008640187895755e-05, "loss": 1.8091, "step": 5120 }, { "epoch": 0.41, "grad_norm": 2.53125, "learning_rate": 1.7993595589719533e-05, "loss": 1.828, "step": 5140 }, { "epoch": 0.41, "grad_norm": 2.40625, "learning_rate": 1.797850070892132e-05, "loss": 1.8188, "step": 5160 }, { "epoch": 0.41, "grad_norm": 2.84375, "learning_rate": 1.7963355640453407e-05, "loss": 1.8106, "step": 5180 }, { "epoch": 0.42, "grad_norm": 2.15625, "learning_rate": 1.7948160479583783e-05, "loss": 1.8172, "step": 5200 }, { "epoch": 0.42, "grad_norm": 2.375, "learning_rate": 1.793291532189553e-05, "loss": 1.8324, "step": 5220 }, { "epoch": 0.42, "grad_norm": 2.359375, "learning_rate": 1.791762026328623e-05, "loss": 1.8202, "step": 5240 }, { "epoch": 0.42, "grad_norm": 2.171875, "learning_rate": 1.7902275399967363e-05, "loss": 1.8183, "step": 5260 }, { "epoch": 0.42, "grad_norm": 2.5625, "learning_rate": 1.78868808284637e-05, "loss": 1.8347, "step": 5280 }, { "epoch": 0.42, "grad_norm": 2.5625, "learning_rate": 1.7871436645612685e-05, "loss": 1.831, "step": 5300 }, { "epoch": 0.42, "grad_norm": 2.796875, "learning_rate": 1.785594294856385e-05, "loss": 1.8263, "step": 5320 }, { "epoch": 0.43, "grad_norm": 2.265625, "learning_rate": 1.7840399834778176e-05, "loss": 1.847, "step": 5340 }, { "epoch": 0.43, "grad_norm": 2.375, "learning_rate": 1.7824807402027504e-05, "loss": 1.8249, "step": 5360 }, { "epoch": 0.43, "grad_norm": 2.34375, "learning_rate": 1.78091657483939e-05, "loss": 1.8206, "step": 5380 }, { "epoch": 0.43, "grad_norm": 2.328125, "learning_rate": 1.779347497226905e-05, "loss": 1.8251, "step": 5400 }, { "epoch": 0.43, "grad_norm": 2.5, "learning_rate": 1.777773517235364e-05, "loss": 1.8226, "step": 5420 }, { "epoch": 0.43, "grad_norm": 2.1875, "learning_rate": 1.7761946447656736e-05, "loss": 1.8309, "step": 5440 }, { "epoch": 0.44, "grad_norm": 2.234375, "learning_rate": 1.7746108897495157e-05, "loss": 1.8283, "step": 5460 }, { "epoch": 0.44, "grad_norm": 2.796875, "learning_rate": 1.7730222621492846e-05, "loss": 1.8275, "step": 5480 }, { "epoch": 0.44, "grad_norm": 2.390625, "learning_rate": 1.7714287719580254e-05, "loss": 1.8059, "step": 5500 }, { "epoch": 0.44, "grad_norm": 2.28125, "learning_rate": 1.769830429199371e-05, "loss": 1.8235, "step": 5520 }, { "epoch": 0.44, "grad_norm": 2.40625, "learning_rate": 1.7682272439274778e-05, "loss": 1.8104, "step": 5540 }, { "epoch": 0.44, "grad_norm": 2.609375, "learning_rate": 1.766619226226965e-05, "loss": 1.8212, "step": 5560 }, { "epoch": 0.45, "grad_norm": 2.140625, "learning_rate": 1.765006386212847e-05, "loss": 1.8269, "step": 5580 }, { "epoch": 0.45, "grad_norm": 2.21875, "learning_rate": 1.763388734030475e-05, "loss": 1.8212, "step": 5600 }, { "epoch": 0.45, "grad_norm": 2.390625, "learning_rate": 1.7617662798554685e-05, "loss": 1.8447, "step": 5620 }, { "epoch": 0.45, "grad_norm": 2.53125, "learning_rate": 1.7601390338936547e-05, "loss": 1.8244, "step": 5640 }, { "epoch": 0.45, "grad_norm": 2.375, "learning_rate": 1.7585070063810014e-05, "loss": 1.8125, "step": 5660 }, { "epoch": 0.45, "grad_norm": 2.53125, "learning_rate": 1.7568702075835557e-05, "loss": 1.8114, "step": 5680 }, { "epoch": 0.46, "grad_norm": 2.34375, "learning_rate": 1.7552286477973766e-05, "loss": 1.8136, "step": 5700 }, { "epoch": 0.46, "grad_norm": 2.359375, "learning_rate": 1.7535823373484716e-05, "loss": 1.8261, "step": 5720 }, { "epoch": 0.46, "grad_norm": 2.3125, "learning_rate": 1.751931286592732e-05, "loss": 1.8085, "step": 5740 }, { "epoch": 0.46, "grad_norm": 2.34375, "learning_rate": 1.7502755059158683e-05, "loss": 1.8297, "step": 5760 }, { "epoch": 0.46, "grad_norm": 2.296875, "learning_rate": 1.7486150057333416e-05, "loss": 1.7937, "step": 5780 }, { "epoch": 0.46, "grad_norm": 2.15625, "learning_rate": 1.7469497964903018e-05, "loss": 1.8052, "step": 5800 }, { "epoch": 0.46, "grad_norm": 2.328125, "learning_rate": 1.7452798886615205e-05, "loss": 1.8216, "step": 5820 }, { "epoch": 0.47, "grad_norm": 2.171875, "learning_rate": 1.7436052927513254e-05, "loss": 1.8322, "step": 5840 }, { "epoch": 0.47, "grad_norm": 2.484375, "learning_rate": 1.741926019293533e-05, "loss": 1.8182, "step": 5860 }, { "epoch": 0.47, "grad_norm": 2.578125, "learning_rate": 1.740242078851384e-05, "loss": 1.8262, "step": 5880 }, { "epoch": 0.47, "grad_norm": 2.734375, "learning_rate": 1.7385534820174757e-05, "loss": 1.7948, "step": 5900 }, { "epoch": 0.47, "grad_norm": 3.0, "learning_rate": 1.7368602394136964e-05, "loss": 1.8332, "step": 5920 }, { "epoch": 0.47, "grad_norm": 2.234375, "learning_rate": 1.735162361691157e-05, "loss": 1.8016, "step": 5940 }, { "epoch": 0.48, "grad_norm": 2.25, "learning_rate": 1.7334598595301257e-05, "loss": 1.8103, "step": 5960 }, { "epoch": 0.48, "grad_norm": 2.953125, "learning_rate": 1.7317527436399603e-05, "loss": 1.8014, "step": 5980 }, { "epoch": 0.48, "grad_norm": 2.25, "learning_rate": 1.7300410247590402e-05, "loss": 1.8071, "step": 6000 }, { "epoch": 0.48, "grad_norm": 2.375, "learning_rate": 1.7283247136546996e-05, "loss": 1.809, "step": 6020 }, { "epoch": 0.48, "grad_norm": 2.46875, "learning_rate": 1.7266038211231583e-05, "loss": 1.8236, "step": 6040 }, { "epoch": 0.48, "grad_norm": 2.234375, "learning_rate": 1.724878357989457e-05, "loss": 1.8306, "step": 6060 }, { "epoch": 0.49, "grad_norm": 2.265625, "learning_rate": 1.7231483351073858e-05, "loss": 1.8165, "step": 6080 }, { "epoch": 0.49, "grad_norm": 2.1875, "learning_rate": 1.721413763359417e-05, "loss": 1.8162, "step": 6100 }, { "epoch": 0.49, "grad_norm": 2.25, "learning_rate": 1.7196746536566376e-05, "loss": 1.8346, "step": 6120 }, { "epoch": 0.49, "grad_norm": 2.265625, "learning_rate": 1.71793101693868e-05, "loss": 1.8082, "step": 6140 }, { "epoch": 0.49, "grad_norm": 2.15625, "learning_rate": 1.7161828641736527e-05, "loss": 1.8105, "step": 6160 }, { "epoch": 0.49, "grad_norm": 2.4375, "learning_rate": 1.7144302063580726e-05, "loss": 1.8105, "step": 6180 }, { "epoch": 0.49, "grad_norm": 2.46875, "learning_rate": 1.712673054516794e-05, "loss": 1.8232, "step": 6200 }, { "epoch": 0.5, "grad_norm": 2.75, "learning_rate": 1.7109114197029408e-05, "loss": 1.8227, "step": 6220 }, { "epoch": 0.5, "grad_norm": 2.265625, "learning_rate": 1.7091453129978363e-05, "loss": 1.8181, "step": 6240 }, { "epoch": 0.5, "grad_norm": 2.59375, "learning_rate": 1.7073747455109336e-05, "loss": 1.8006, "step": 6260 }, { "epoch": 0.5, "grad_norm": 2.71875, "learning_rate": 1.7055997283797463e-05, "loss": 1.7975, "step": 6280 }, { "epoch": 0.5, "grad_norm": 2.28125, "learning_rate": 1.7038202727697766e-05, "loss": 1.8105, "step": 6300 }, { "epoch": 0.5, "grad_norm": 2.59375, "learning_rate": 1.7020363898744477e-05, "loss": 1.7994, "step": 6320 }, { "epoch": 0.51, "grad_norm": 2.59375, "learning_rate": 1.7002480909150316e-05, "loss": 1.8193, "step": 6340 }, { "epoch": 0.51, "grad_norm": 2.40625, "learning_rate": 1.6984553871405783e-05, "loss": 1.8347, "step": 6360 }, { "epoch": 0.51, "grad_norm": 2.234375, "learning_rate": 1.6966582898278466e-05, "loss": 1.8159, "step": 6380 }, { "epoch": 0.51, "grad_norm": 2.3125, "learning_rate": 1.694856810281232e-05, "loss": 1.8053, "step": 6400 }, { "epoch": 0.51, "grad_norm": 2.34375, "learning_rate": 1.6930509598326948e-05, "loss": 1.828, "step": 6420 }, { "epoch": 0.51, "grad_norm": 2.1875, "learning_rate": 1.6912407498416914e-05, "loss": 1.8186, "step": 6440 }, { "epoch": 0.52, "grad_norm": 2.296875, "learning_rate": 1.689426191695101e-05, "loss": 1.8027, "step": 6460 }, { "epoch": 0.52, "grad_norm": 2.375, "learning_rate": 1.6876072968071532e-05, "loss": 1.8098, "step": 6480 }, { "epoch": 0.52, "grad_norm": 2.46875, "learning_rate": 1.6857840766193586e-05, "loss": 1.8129, "step": 6500 }, { "epoch": 0.52, "grad_norm": 2.328125, "learning_rate": 1.6839565426004346e-05, "loss": 1.8054, "step": 6520 }, { "epoch": 0.52, "grad_norm": 2.390625, "learning_rate": 1.6821247062462347e-05, "loss": 1.8123, "step": 6540 }, { "epoch": 0.52, "grad_norm": 2.328125, "learning_rate": 1.6802885790796753e-05, "loss": 1.8074, "step": 6560 }, { "epoch": 0.53, "grad_norm": 2.65625, "learning_rate": 1.678448172650664e-05, "loss": 1.7996, "step": 6580 }, { "epoch": 0.53, "grad_norm": 2.609375, "learning_rate": 1.676603498536026e-05, "loss": 1.8098, "step": 6600 }, { "epoch": 0.53, "grad_norm": 2.265625, "learning_rate": 1.6747545683394322e-05, "loss": 1.8016, "step": 6620 }, { "epoch": 0.53, "grad_norm": 2.6875, "learning_rate": 1.672901393691325e-05, "loss": 1.8093, "step": 6640 }, { "epoch": 0.53, "grad_norm": 2.484375, "learning_rate": 1.6710439862488478e-05, "loss": 1.8023, "step": 6660 }, { "epoch": 0.53, "grad_norm": 2.234375, "learning_rate": 1.6691823576957676e-05, "loss": 1.8075, "step": 6680 }, { "epoch": 0.53, "grad_norm": 2.6875, "learning_rate": 1.667316519742405e-05, "loss": 1.8052, "step": 6700 }, { "epoch": 0.54, "grad_norm": 2.34375, "learning_rate": 1.6654464841255586e-05, "loss": 1.8011, "step": 6720 }, { "epoch": 0.54, "grad_norm": 2.25, "learning_rate": 1.663572262608433e-05, "loss": 1.8075, "step": 6740 }, { "epoch": 0.54, "grad_norm": 2.296875, "learning_rate": 1.6616938669805622e-05, "loss": 1.7911, "step": 6760 }, { "epoch": 0.54, "grad_norm": 2.203125, "learning_rate": 1.659811309057738e-05, "loss": 1.8026, "step": 6780 }, { "epoch": 0.54, "grad_norm": 2.1875, "learning_rate": 1.6579246006819335e-05, "loss": 1.8088, "step": 6800 }, { "epoch": 0.54, "grad_norm": 2.328125, "learning_rate": 1.6560337537212306e-05, "loss": 1.8155, "step": 6820 }, { "epoch": 0.55, "grad_norm": 2.21875, "learning_rate": 1.6541387800697438e-05, "loss": 1.7997, "step": 6840 }, { "epoch": 0.55, "grad_norm": 2.25, "learning_rate": 1.6522396916475468e-05, "loss": 1.8253, "step": 6860 }, { "epoch": 0.55, "grad_norm": 2.265625, "learning_rate": 1.650336500400595e-05, "loss": 1.8037, "step": 6880 }, { "epoch": 0.55, "grad_norm": 2.421875, "learning_rate": 1.6484292183006542e-05, "loss": 1.8154, "step": 6900 }, { "epoch": 0.55, "grad_norm": 2.515625, "learning_rate": 1.6465178573452214e-05, "loss": 1.8169, "step": 6920 }, { "epoch": 0.55, "grad_norm": 2.359375, "learning_rate": 1.6446024295574522e-05, "loss": 1.8002, "step": 6940 }, { "epoch": 0.56, "grad_norm": 2.28125, "learning_rate": 1.6426829469860837e-05, "loss": 1.7999, "step": 6960 }, { "epoch": 0.56, "grad_norm": 2.375, "learning_rate": 1.6407594217053587e-05, "loss": 1.7973, "step": 6980 }, { "epoch": 0.56, "grad_norm": 2.296875, "learning_rate": 1.638831865814951e-05, "loss": 1.8073, "step": 7000 }, { "epoch": 0.56, "grad_norm": 2.265625, "learning_rate": 1.6369002914398874e-05, "loss": 1.795, "step": 7020 }, { "epoch": 0.56, "grad_norm": 2.34375, "learning_rate": 1.6349647107304724e-05, "loss": 1.7985, "step": 7040 }, { "epoch": 0.56, "grad_norm": 2.125, "learning_rate": 1.633025135862213e-05, "loss": 1.7936, "step": 7060 }, { "epoch": 0.57, "grad_norm": 2.890625, "learning_rate": 1.6310815790357404e-05, "loss": 1.8036, "step": 7080 }, { "epoch": 0.57, "grad_norm": 2.4375, "learning_rate": 1.6291340524767327e-05, "loss": 1.8046, "step": 7100 }, { "epoch": 0.57, "grad_norm": 2.6875, "learning_rate": 1.6271825684358404e-05, "loss": 1.8052, "step": 7120 }, { "epoch": 0.57, "grad_norm": 2.484375, "learning_rate": 1.625227139188607e-05, "loss": 1.8105, "step": 7140 }, { "epoch": 0.57, "grad_norm": 2.375, "learning_rate": 1.6232677770353936e-05, "loss": 1.7952, "step": 7160 }, { "epoch": 0.57, "grad_norm": 2.625, "learning_rate": 1.621304494301301e-05, "loss": 1.8102, "step": 7180 }, { "epoch": 0.57, "grad_norm": 2.609375, "learning_rate": 1.6193373033360904e-05, "loss": 1.7962, "step": 7200 }, { "epoch": 0.58, "grad_norm": 2.328125, "learning_rate": 1.6173662165141084e-05, "loss": 1.8078, "step": 7220 }, { "epoch": 0.58, "grad_norm": 2.5, "learning_rate": 1.6153912462342073e-05, "loss": 1.8051, "step": 7240 }, { "epoch": 0.58, "grad_norm": 2.15625, "learning_rate": 1.6134124049196688e-05, "loss": 1.8057, "step": 7260 }, { "epoch": 0.58, "grad_norm": 2.78125, "learning_rate": 1.6114297050181235e-05, "loss": 1.8153, "step": 7280 }, { "epoch": 0.58, "grad_norm": 2.171875, "learning_rate": 1.6094431590014746e-05, "loss": 1.8047, "step": 7300 }, { "epoch": 0.58, "grad_norm": 2.359375, "learning_rate": 1.6074527793658186e-05, "loss": 1.8069, "step": 7320 }, { "epoch": 0.59, "grad_norm": 2.4375, "learning_rate": 1.605458578631367e-05, "loss": 1.7919, "step": 7340 }, { "epoch": 0.59, "grad_norm": 2.390625, "learning_rate": 1.6034605693423676e-05, "loss": 1.8104, "step": 7360 }, { "epoch": 0.59, "grad_norm": 2.703125, "learning_rate": 1.6014587640670244e-05, "loss": 1.7971, "step": 7380 }, { "epoch": 0.59, "grad_norm": 2.703125, "learning_rate": 1.599453175397421e-05, "loss": 1.7987, "step": 7400 }, { "epoch": 0.59, "grad_norm": 2.46875, "learning_rate": 1.597443815949439e-05, "loss": 1.8057, "step": 7420 }, { "epoch": 0.59, "grad_norm": 2.359375, "learning_rate": 1.59543069836268e-05, "loss": 1.7817, "step": 7440 }, { "epoch": 0.6, "grad_norm": 2.515625, "learning_rate": 1.5934138353003845e-05, "loss": 1.8009, "step": 7460 }, { "epoch": 0.6, "grad_norm": 2.5, "learning_rate": 1.5913932394493548e-05, "loss": 1.7939, "step": 7480 }, { "epoch": 0.6, "grad_norm": 2.40625, "learning_rate": 1.589368923519874e-05, "loss": 1.8014, "step": 7500 }, { "epoch": 0.6, "grad_norm": 2.328125, "learning_rate": 1.587340900245624e-05, "loss": 1.7879, "step": 7520 }, { "epoch": 0.6, "grad_norm": 2.25, "learning_rate": 1.5853091823836087e-05, "loss": 1.8, "step": 7540 }, { "epoch": 0.6, "grad_norm": 2.140625, "learning_rate": 1.5832737827140727e-05, "loss": 1.7894, "step": 7560 }, { "epoch": 0.61, "grad_norm": 2.6875, "learning_rate": 1.581234714040419e-05, "loss": 1.7845, "step": 7580 }, { "epoch": 0.61, "grad_norm": 2.359375, "learning_rate": 1.5791919891891313e-05, "loss": 1.7841, "step": 7600 }, { "epoch": 0.61, "grad_norm": 2.3125, "learning_rate": 1.5771456210096913e-05, "loss": 1.8057, "step": 7620 }, { "epoch": 0.61, "grad_norm": 2.84375, "learning_rate": 1.5750956223744985e-05, "loss": 1.7961, "step": 7640 }, { "epoch": 0.61, "grad_norm": 2.28125, "learning_rate": 1.5730420061787898e-05, "loss": 1.7908, "step": 7660 }, { "epoch": 0.61, "grad_norm": 2.578125, "learning_rate": 1.5709847853405574e-05, "loss": 1.7888, "step": 7680 }, { "epoch": 0.61, "grad_norm": 2.328125, "learning_rate": 1.568923972800468e-05, "loss": 1.7742, "step": 7700 }, { "epoch": 0.62, "grad_norm": 2.484375, "learning_rate": 1.566859581521782e-05, "loss": 1.7902, "step": 7720 }, { "epoch": 0.62, "grad_norm": 2.46875, "learning_rate": 1.5647916244902707e-05, "loss": 1.7918, "step": 7740 }, { "epoch": 0.62, "grad_norm": 2.390625, "learning_rate": 1.5627201147141357e-05, "loss": 1.806, "step": 7760 }, { "epoch": 0.62, "grad_norm": 2.25, "learning_rate": 1.5606450652239263e-05, "loss": 1.7925, "step": 7780 }, { "epoch": 0.62, "grad_norm": 2.578125, "learning_rate": 1.5585664890724584e-05, "loss": 1.7921, "step": 7800 }, { "epoch": 0.62, "grad_norm": 2.390625, "learning_rate": 1.5564843993347313e-05, "loss": 1.7901, "step": 7820 }, { "epoch": 0.63, "grad_norm": 2.390625, "learning_rate": 1.5543988091078467e-05, "loss": 1.7881, "step": 7840 }, { "epoch": 0.63, "grad_norm": 2.6875, "learning_rate": 1.5523097315109245e-05, "loss": 1.7948, "step": 7860 }, { "epoch": 0.63, "grad_norm": 2.484375, "learning_rate": 1.5502171796850226e-05, "loss": 1.7958, "step": 7880 }, { "epoch": 0.63, "grad_norm": 2.40625, "learning_rate": 1.5481211667930528e-05, "loss": 1.7911, "step": 7900 }, { "epoch": 0.63, "grad_norm": 2.453125, "learning_rate": 1.5460217060196986e-05, "loss": 1.7709, "step": 7920 }, { "epoch": 0.63, "grad_norm": 2.40625, "learning_rate": 1.54391881057133e-05, "loss": 1.7914, "step": 7940 }, { "epoch": 0.64, "grad_norm": 2.328125, "learning_rate": 1.541812493675925e-05, "loss": 1.8062, "step": 7960 }, { "epoch": 0.64, "grad_norm": 2.34375, "learning_rate": 1.539702768582982e-05, "loss": 1.8074, "step": 7980 }, { "epoch": 0.64, "grad_norm": 2.75, "learning_rate": 1.5375896485634386e-05, "loss": 1.7788, "step": 8000 }, { "epoch": 0.64, "grad_norm": 2.453125, "learning_rate": 1.5354731469095884e-05, "loss": 1.7814, "step": 8020 }, { "epoch": 0.64, "grad_norm": 2.421875, "learning_rate": 1.5333532769349955e-05, "loss": 1.7854, "step": 8040 }, { "epoch": 0.64, "grad_norm": 2.65625, "learning_rate": 1.5312300519744135e-05, "loss": 1.7869, "step": 8060 }, { "epoch": 0.65, "grad_norm": 2.625, "learning_rate": 1.529103485383699e-05, "loss": 1.7736, "step": 8080 }, { "epoch": 0.65, "grad_norm": 2.28125, "learning_rate": 1.5269735905397278e-05, "loss": 1.7966, "step": 8100 }, { "epoch": 0.65, "grad_norm": 2.59375, "learning_rate": 1.524840380840314e-05, "loss": 1.7907, "step": 8120 }, { "epoch": 0.65, "grad_norm": 2.671875, "learning_rate": 1.5227038697041216e-05, "loss": 1.7767, "step": 8140 }, { "epoch": 0.65, "grad_norm": 2.234375, "learning_rate": 1.520564070570582e-05, "loss": 1.7963, "step": 8160 }, { "epoch": 0.65, "grad_norm": 2.421875, "learning_rate": 1.5184209968998098e-05, "loss": 1.7822, "step": 8180 }, { "epoch": 0.65, "grad_norm": 2.390625, "learning_rate": 1.5162746621725176e-05, "loss": 1.7806, "step": 8200 }, { "epoch": 0.66, "grad_norm": 2.359375, "learning_rate": 1.5141250798899307e-05, "loss": 1.7836, "step": 8220 }, { "epoch": 0.66, "grad_norm": 2.640625, "learning_rate": 1.5119722635737035e-05, "loss": 1.7825, "step": 8240 }, { "epoch": 0.66, "grad_norm": 2.953125, "learning_rate": 1.5098162267658323e-05, "loss": 1.7877, "step": 8260 }, { "epoch": 0.66, "grad_norm": 2.3125, "learning_rate": 1.5076569830285736e-05, "loss": 1.791, "step": 8280 }, { "epoch": 0.66, "grad_norm": 2.4375, "learning_rate": 1.5054945459443544e-05, "loss": 1.781, "step": 8300 }, { "epoch": 0.66, "grad_norm": 2.609375, "learning_rate": 1.5033289291156905e-05, "loss": 1.7873, "step": 8320 }, { "epoch": 0.67, "grad_norm": 2.296875, "learning_rate": 1.501160146165099e-05, "loss": 1.7963, "step": 8340 }, { "epoch": 0.67, "grad_norm": 2.515625, "learning_rate": 1.498988210735013e-05, "loss": 1.794, "step": 8360 }, { "epoch": 0.67, "grad_norm": 2.265625, "learning_rate": 1.4968131364876952e-05, "loss": 1.8001, "step": 8380 }, { "epoch": 0.67, "grad_norm": 2.46875, "learning_rate": 1.4946349371051541e-05, "loss": 1.7728, "step": 8400 }, { "epoch": 0.67, "grad_norm": 2.59375, "learning_rate": 1.4924536262890557e-05, "loss": 1.7732, "step": 8420 }, { "epoch": 0.67, "grad_norm": 2.671875, "learning_rate": 1.4902692177606368e-05, "loss": 1.7822, "step": 8440 }, { "epoch": 0.68, "grad_norm": 2.609375, "learning_rate": 1.4880817252606226e-05, "loss": 1.7862, "step": 8460 }, { "epoch": 0.68, "grad_norm": 2.421875, "learning_rate": 1.4858911625491352e-05, "loss": 1.801, "step": 8480 }, { "epoch": 0.68, "grad_norm": 2.40625, "learning_rate": 1.4836975434056102e-05, "loss": 1.8229, "step": 8500 }, { "epoch": 0.68, "grad_norm": 2.578125, "learning_rate": 1.48150088162871e-05, "loss": 1.7954, "step": 8520 }, { "epoch": 0.68, "grad_norm": 2.46875, "learning_rate": 1.4793011910362352e-05, "loss": 1.7996, "step": 8540 }, { "epoch": 0.68, "grad_norm": 2.40625, "learning_rate": 1.4770984854650397e-05, "loss": 1.8033, "step": 8560 }, { "epoch": 0.68, "grad_norm": 2.140625, "learning_rate": 1.4748927787709417e-05, "loss": 1.7883, "step": 8580 }, { "epoch": 0.69, "grad_norm": 2.265625, "learning_rate": 1.4726840848286385e-05, "loss": 1.7939, "step": 8600 }, { "epoch": 0.69, "grad_norm": 2.421875, "learning_rate": 1.4704724175316181e-05, "loss": 1.7975, "step": 8620 }, { "epoch": 0.69, "grad_norm": 2.46875, "learning_rate": 1.4682577907920707e-05, "loss": 1.8029, "step": 8640 }, { "epoch": 0.69, "grad_norm": 2.359375, "learning_rate": 1.4660402185408046e-05, "loss": 1.7807, "step": 8660 }, { "epoch": 0.69, "grad_norm": 2.5, "learning_rate": 1.4638197147271548e-05, "loss": 1.7953, "step": 8680 }, { "epoch": 0.69, "grad_norm": 2.4375, "learning_rate": 1.4615962933188981e-05, "loss": 1.7902, "step": 8700 }, { "epoch": 0.7, "grad_norm": 2.5625, "learning_rate": 1.4593699683021625e-05, "loss": 1.7849, "step": 8720 }, { "epoch": 0.7, "grad_norm": 2.765625, "learning_rate": 1.4571407536813422e-05, "loss": 1.7814, "step": 8740 }, { "epoch": 0.7, "grad_norm": 2.375, "learning_rate": 1.4549086634790075e-05, "loss": 1.7932, "step": 8760 }, { "epoch": 0.7, "grad_norm": 2.4375, "learning_rate": 1.4526737117358167e-05, "loss": 1.789, "step": 8780 }, { "epoch": 0.7, "grad_norm": 2.546875, "learning_rate": 1.4504359125104292e-05, "loss": 1.7828, "step": 8800 }, { "epoch": 0.7, "grad_norm": 2.34375, "learning_rate": 1.4481952798794152e-05, "loss": 1.7876, "step": 8820 }, { "epoch": 0.71, "grad_norm": 2.328125, "learning_rate": 1.4459518279371692e-05, "loss": 1.794, "step": 8840 }, { "epoch": 0.71, "grad_norm": 2.734375, "learning_rate": 1.4437055707958184e-05, "loss": 1.7919, "step": 8860 }, { "epoch": 0.71, "grad_norm": 2.421875, "learning_rate": 1.4414565225851371e-05, "loss": 1.7846, "step": 8880 }, { "epoch": 0.71, "grad_norm": 2.453125, "learning_rate": 1.4392046974524565e-05, "loss": 1.7843, "step": 8900 }, { "epoch": 0.71, "grad_norm": 2.40625, "learning_rate": 1.4369501095625747e-05, "loss": 1.7726, "step": 8920 }, { "epoch": 0.71, "grad_norm": 2.390625, "learning_rate": 1.4346927730976691e-05, "loss": 1.7836, "step": 8940 }, { "epoch": 0.72, "grad_norm": 2.8125, "learning_rate": 1.4324327022572073e-05, "loss": 1.776, "step": 8960 }, { "epoch": 0.72, "grad_norm": 2.296875, "learning_rate": 1.4301699112578557e-05, "loss": 1.7903, "step": 8980 }, { "epoch": 0.72, "grad_norm": 2.5, "learning_rate": 1.4279044143333926e-05, "loss": 1.7757, "step": 9000 }, { "epoch": 0.72, "grad_norm": 2.3125, "learning_rate": 1.425636225734617e-05, "loss": 1.7705, "step": 9020 }, { "epoch": 0.72, "grad_norm": 2.5, "learning_rate": 1.42336535972926e-05, "loss": 1.8011, "step": 9040 }, { "epoch": 0.72, "grad_norm": 2.375, "learning_rate": 1.4210918306018937e-05, "loss": 1.7795, "step": 9060 }, { "epoch": 0.72, "grad_norm": 2.78125, "learning_rate": 1.4188156526538435e-05, "loss": 1.7965, "step": 9080 }, { "epoch": 0.73, "grad_norm": 2.796875, "learning_rate": 1.4165368402030952e-05, "loss": 1.7631, "step": 9100 }, { "epoch": 0.73, "grad_norm": 2.453125, "learning_rate": 1.4142554075842083e-05, "loss": 1.7949, "step": 9120 }, { "epoch": 0.73, "grad_norm": 2.625, "learning_rate": 1.4119713691482228e-05, "loss": 1.785, "step": 9140 }, { "epoch": 0.73, "grad_norm": 2.46875, "learning_rate": 1.4096847392625708e-05, "loss": 1.777, "step": 9160 }, { "epoch": 0.73, "grad_norm": 2.546875, "learning_rate": 1.4073955323109859e-05, "loss": 1.779, "step": 9180 }, { "epoch": 0.73, "grad_norm": 2.5, "learning_rate": 1.4051037626934112e-05, "loss": 1.7815, "step": 9200 }, { "epoch": 0.74, "grad_norm": 2.828125, "learning_rate": 1.4028094448259113e-05, "loss": 1.7852, "step": 9220 }, { "epoch": 0.74, "grad_norm": 2.375, "learning_rate": 1.4005125931405792e-05, "loss": 1.7999, "step": 9240 }, { "epoch": 0.74, "grad_norm": 2.390625, "learning_rate": 1.3982132220854472e-05, "loss": 1.791, "step": 9260 }, { "epoch": 0.74, "grad_norm": 2.484375, "learning_rate": 1.3959113461243952e-05, "loss": 1.7836, "step": 9280 }, { "epoch": 0.74, "grad_norm": 2.6875, "learning_rate": 1.3936069797370591e-05, "loss": 1.778, "step": 9300 }, { "epoch": 0.74, "grad_norm": 2.234375, "learning_rate": 1.3913001374187421e-05, "loss": 1.8065, "step": 9320 }, { "epoch": 0.75, "grad_norm": 2.671875, "learning_rate": 1.3889908336803198e-05, "loss": 1.8035, "step": 9340 }, { "epoch": 0.75, "grad_norm": 2.4375, "learning_rate": 1.3866790830481529e-05, "loss": 1.7789, "step": 9360 }, { "epoch": 0.75, "grad_norm": 2.65625, "learning_rate": 1.3843649000639933e-05, "loss": 1.7706, "step": 9380 }, { "epoch": 0.75, "grad_norm": 2.546875, "learning_rate": 1.3820482992848929e-05, "loss": 1.7685, "step": 9400 }, { "epoch": 0.75, "grad_norm": 2.46875, "learning_rate": 1.3797292952831127e-05, "loss": 1.7687, "step": 9420 }, { "epoch": 0.75, "grad_norm": 2.375, "learning_rate": 1.3774079026460308e-05, "loss": 1.7768, "step": 9440 }, { "epoch": 0.76, "grad_norm": 2.5, "learning_rate": 1.3750841359760511e-05, "loss": 1.7878, "step": 9460 }, { "epoch": 0.76, "grad_norm": 2.40625, "learning_rate": 1.37275800989051e-05, "loss": 1.792, "step": 9480 }, { "epoch": 0.76, "grad_norm": 2.75, "learning_rate": 1.3704295390215868e-05, "loss": 1.7822, "step": 9500 }, { "epoch": 0.76, "grad_norm": 2.4375, "learning_rate": 1.3680987380162095e-05, "loss": 1.7831, "step": 9520 }, { "epoch": 0.76, "grad_norm": 2.453125, "learning_rate": 1.3657656215359634e-05, "loss": 1.7819, "step": 9540 }, { "epoch": 0.76, "grad_norm": 2.546875, "learning_rate": 1.3634302042569995e-05, "loss": 1.7839, "step": 9560 }, { "epoch": 0.76, "grad_norm": 2.3125, "learning_rate": 1.3610925008699413e-05, "loss": 1.7905, "step": 9580 }, { "epoch": 0.77, "grad_norm": 2.484375, "learning_rate": 1.3587525260797934e-05, "loss": 1.7785, "step": 9600 }, { "epoch": 0.77, "grad_norm": 2.328125, "learning_rate": 1.3564102946058468e-05, "loss": 1.7846, "step": 9620 }, { "epoch": 0.77, "grad_norm": 2.53125, "learning_rate": 1.3540658211815898e-05, "loss": 1.7841, "step": 9640 }, { "epoch": 0.77, "grad_norm": 2.453125, "learning_rate": 1.3517191205546121e-05, "loss": 1.774, "step": 9660 }, { "epoch": 0.77, "grad_norm": 2.578125, "learning_rate": 1.3493702074865139e-05, "loss": 1.7947, "step": 9680 }, { "epoch": 0.77, "grad_norm": 2.390625, "learning_rate": 1.3470190967528118e-05, "loss": 1.7843, "step": 9700 }, { "epoch": 0.78, "grad_norm": 2.296875, "learning_rate": 1.3446658031428474e-05, "loss": 1.7796, "step": 9720 }, { "epoch": 0.78, "grad_norm": 2.5, "learning_rate": 1.3423103414596929e-05, "loss": 1.7713, "step": 9740 }, { "epoch": 0.78, "grad_norm": 2.734375, "learning_rate": 1.3399527265200581e-05, "loss": 1.7769, "step": 9760 }, { "epoch": 0.78, "grad_norm": 2.546875, "learning_rate": 1.3375929731541986e-05, "loss": 1.7823, "step": 9780 }, { "epoch": 0.78, "grad_norm": 2.59375, "learning_rate": 1.3352310962058202e-05, "loss": 1.7642, "step": 9800 }, { "epoch": 0.78, "grad_norm": 2.296875, "learning_rate": 1.332867110531988e-05, "loss": 1.7841, "step": 9820 }, { "epoch": 0.79, "grad_norm": 2.609375, "learning_rate": 1.3305010310030311e-05, "loss": 1.7897, "step": 9840 }, { "epoch": 0.79, "grad_norm": 2.421875, "learning_rate": 1.3281328725024496e-05, "loss": 1.7813, "step": 9860 }, { "epoch": 0.79, "grad_norm": 2.359375, "learning_rate": 1.3257626499268217e-05, "loss": 1.7828, "step": 9880 }, { "epoch": 0.79, "grad_norm": 2.390625, "learning_rate": 1.3233903781857084e-05, "loss": 1.7809, "step": 9900 }, { "epoch": 0.79, "grad_norm": 2.453125, "learning_rate": 1.3210160722015619e-05, "loss": 1.7768, "step": 9920 }, { "epoch": 0.79, "grad_norm": 2.46875, "learning_rate": 1.3186397469096295e-05, "loss": 1.7816, "step": 9940 }, { "epoch": 0.8, "grad_norm": 2.703125, "learning_rate": 1.3162614172578614e-05, "loss": 1.7741, "step": 9960 }, { "epoch": 0.8, "grad_norm": 2.484375, "learning_rate": 1.3138810982068154e-05, "loss": 1.7801, "step": 9980 }, { "epoch": 0.8, "grad_norm": 2.46875, "learning_rate": 1.3114988047295638e-05, "loss": 1.7711, "step": 10000 }, { "epoch": 0.8, "grad_norm": 2.3125, "learning_rate": 1.3091145518115982e-05, "loss": 1.7807, "step": 10020 }, { "epoch": 0.8, "grad_norm": 2.578125, "learning_rate": 1.3067283544507366e-05, "loss": 1.7835, "step": 10040 }, { "epoch": 0.8, "grad_norm": 2.609375, "learning_rate": 1.3043402276570276e-05, "loss": 1.7746, "step": 10060 }, { "epoch": 0.8, "grad_norm": 2.53125, "learning_rate": 1.3019501864526565e-05, "loss": 1.7742, "step": 10080 }, { "epoch": 0.81, "grad_norm": 2.25, "learning_rate": 1.2995582458718518e-05, "loss": 1.7811, "step": 10100 }, { "epoch": 0.81, "grad_norm": 2.3125, "learning_rate": 1.2971644209607893e-05, "loss": 1.7684, "step": 10120 }, { "epoch": 0.81, "grad_norm": 2.703125, "learning_rate": 1.2947687267774973e-05, "loss": 1.7778, "step": 10140 }, { "epoch": 0.81, "grad_norm": 2.46875, "learning_rate": 1.2923711783917637e-05, "loss": 1.7587, "step": 10160 }, { "epoch": 0.81, "grad_norm": 2.296875, "learning_rate": 1.2899717908850385e-05, "loss": 1.784, "step": 10180 }, { "epoch": 0.81, "grad_norm": 2.515625, "learning_rate": 1.2875705793503424e-05, "loss": 1.773, "step": 10200 }, { "epoch": 0.82, "grad_norm": 2.6875, "learning_rate": 1.2851675588921677e-05, "loss": 1.7721, "step": 10220 }, { "epoch": 0.82, "grad_norm": 2.75, "learning_rate": 1.2827627446263877e-05, "loss": 1.7781, "step": 10240 }, { "epoch": 0.82, "grad_norm": 2.734375, "learning_rate": 1.2803561516801575e-05, "loss": 1.7935, "step": 10260 }, { "epoch": 0.82, "grad_norm": 2.34375, "learning_rate": 1.2779477951918217e-05, "loss": 1.7746, "step": 10280 }, { "epoch": 0.82, "grad_norm": 2.453125, "learning_rate": 1.2755376903108183e-05, "loss": 1.7783, "step": 10300 }, { "epoch": 0.82, "grad_norm": 2.421875, "learning_rate": 1.2731258521975829e-05, "loss": 1.7812, "step": 10320 }, { "epoch": 0.83, "grad_norm": 2.46875, "learning_rate": 1.2707122960234544e-05, "loss": 1.7742, "step": 10340 }, { "epoch": 0.83, "grad_norm": 2.671875, "learning_rate": 1.2682970369705773e-05, "loss": 1.7585, "step": 10360 }, { "epoch": 0.83, "grad_norm": 2.671875, "learning_rate": 1.2658800902318103e-05, "loss": 1.7848, "step": 10380 }, { "epoch": 0.83, "grad_norm": 2.609375, "learning_rate": 1.2634614710106266e-05, "loss": 1.7784, "step": 10400 }, { "epoch": 0.83, "grad_norm": 2.421875, "learning_rate": 1.2610411945210199e-05, "loss": 1.7762, "step": 10420 }, { "epoch": 0.83, "grad_norm": 2.359375, "learning_rate": 1.2586192759874094e-05, "loss": 1.7686, "step": 10440 }, { "epoch": 0.84, "grad_norm": 2.734375, "learning_rate": 1.2561957306445428e-05, "loss": 1.7861, "step": 10460 }, { "epoch": 0.84, "grad_norm": 2.53125, "learning_rate": 1.253770573737402e-05, "loss": 1.7744, "step": 10480 }, { "epoch": 0.84, "grad_norm": 2.40625, "learning_rate": 1.2513438205211048e-05, "loss": 1.7703, "step": 10500 }, { "epoch": 0.84, "grad_norm": 2.65625, "learning_rate": 1.2489154862608111e-05, "loss": 1.7785, "step": 10520 }, { "epoch": 0.84, "grad_norm": 2.453125, "learning_rate": 1.2464855862316263e-05, "loss": 1.7789, "step": 10540 }, { "epoch": 0.84, "grad_norm": 2.484375, "learning_rate": 1.244054135718505e-05, "loss": 1.7766, "step": 10560 }, { "epoch": 0.84, "grad_norm": 2.484375, "learning_rate": 1.2416211500161546e-05, "loss": 1.7805, "step": 10580 }, { "epoch": 0.85, "grad_norm": 2.5, "learning_rate": 1.2391866444289394e-05, "loss": 1.7769, "step": 10600 }, { "epoch": 0.85, "grad_norm": 2.5, "learning_rate": 1.2367506342707851e-05, "loss": 1.7727, "step": 10620 }, { "epoch": 0.85, "grad_norm": 2.640625, "learning_rate": 1.2343131348650806e-05, "loss": 1.7603, "step": 10640 }, { "epoch": 0.85, "grad_norm": 2.375, "learning_rate": 1.231874161544583e-05, "loss": 1.7681, "step": 10660 }, { "epoch": 0.85, "grad_norm": 2.515625, "learning_rate": 1.2294337296513219e-05, "loss": 1.7705, "step": 10680 }, { "epoch": 0.85, "grad_norm": 2.375, "learning_rate": 1.2269918545365e-05, "loss": 1.7692, "step": 10700 }, { "epoch": 0.86, "grad_norm": 2.53125, "learning_rate": 1.2245485515604004e-05, "loss": 1.7685, "step": 10720 }, { "epoch": 0.86, "grad_norm": 2.84375, "learning_rate": 1.2221038360922863e-05, "loss": 1.7873, "step": 10740 }, { "epoch": 0.86, "grad_norm": 2.5, "learning_rate": 1.219657723510307e-05, "loss": 1.779, "step": 10760 }, { "epoch": 0.86, "grad_norm": 2.65625, "learning_rate": 1.2172102292013994e-05, "loss": 1.7963, "step": 10780 }, { "epoch": 0.86, "grad_norm": 2.4375, "learning_rate": 1.2147613685611928e-05, "loss": 1.7737, "step": 10800 }, { "epoch": 0.86, "grad_norm": 2.671875, "learning_rate": 1.212311156993911e-05, "loss": 1.7578, "step": 10820 }, { "epoch": 0.87, "grad_norm": 2.875, "learning_rate": 1.2098596099122745e-05, "loss": 1.7649, "step": 10840 }, { "epoch": 0.87, "grad_norm": 2.734375, "learning_rate": 1.2074067427374068e-05, "loss": 1.782, "step": 10860 }, { "epoch": 0.87, "grad_norm": 2.640625, "learning_rate": 1.2049525708987331e-05, "loss": 1.7729, "step": 10880 }, { "epoch": 0.87, "grad_norm": 2.546875, "learning_rate": 1.2024971098338868e-05, "loss": 1.7769, "step": 10900 }, { "epoch": 0.87, "grad_norm": 2.40625, "learning_rate": 1.2000403749886108e-05, "loss": 1.7761, "step": 10920 }, { "epoch": 0.87, "grad_norm": 2.640625, "learning_rate": 1.1975823818166596e-05, "loss": 1.7476, "step": 10940 }, { "epoch": 0.87, "grad_norm": 2.609375, "learning_rate": 1.1951231457797047e-05, "loss": 1.7814, "step": 10960 }, { "epoch": 0.88, "grad_norm": 2.703125, "learning_rate": 1.1926626823472338e-05, "loss": 1.7691, "step": 10980 }, { "epoch": 0.88, "grad_norm": 2.65625, "learning_rate": 1.1902010069964569e-05, "loss": 1.7756, "step": 11000 }, { "epoch": 0.88, "grad_norm": 2.5625, "learning_rate": 1.1877381352122064e-05, "loss": 1.7833, "step": 11020 }, { "epoch": 0.88, "grad_norm": 2.4375, "learning_rate": 1.1852740824868416e-05, "loss": 1.7659, "step": 11040 }, { "epoch": 0.88, "grad_norm": 2.46875, "learning_rate": 1.1828088643201492e-05, "loss": 1.772, "step": 11060 }, { "epoch": 0.88, "grad_norm": 2.546875, "learning_rate": 1.180342496219248e-05, "loss": 1.7516, "step": 11080 }, { "epoch": 0.89, "grad_norm": 2.609375, "learning_rate": 1.17787499369849e-05, "loss": 1.7647, "step": 11100 }, { "epoch": 0.89, "grad_norm": 2.515625, "learning_rate": 1.1754063722793624e-05, "loss": 1.769, "step": 11120 }, { "epoch": 0.89, "grad_norm": 2.671875, "learning_rate": 1.1729366474903923e-05, "loss": 1.7813, "step": 11140 }, { "epoch": 0.89, "grad_norm": 2.734375, "learning_rate": 1.1704658348670455e-05, "loss": 1.7669, "step": 11160 }, { "epoch": 0.89, "grad_norm": 2.328125, "learning_rate": 1.1679939499516317e-05, "loss": 1.7846, "step": 11180 }, { "epoch": 0.89, "grad_norm": 2.375, "learning_rate": 1.165521008293206e-05, "loss": 1.7719, "step": 11200 }, { "epoch": 0.9, "grad_norm": 2.65625, "learning_rate": 1.1630470254474697e-05, "loss": 1.7625, "step": 11220 }, { "epoch": 0.9, "grad_norm": 2.578125, "learning_rate": 1.1605720169766752e-05, "loss": 1.7721, "step": 11240 }, { "epoch": 0.9, "grad_norm": 2.53125, "learning_rate": 1.1580959984495243e-05, "loss": 1.7558, "step": 11260 }, { "epoch": 0.9, "grad_norm": 2.609375, "learning_rate": 1.1556189854410744e-05, "loss": 1.7633, "step": 11280 }, { "epoch": 0.9, "grad_norm": 2.4375, "learning_rate": 1.1531409935326377e-05, "loss": 1.7632, "step": 11300 }, { "epoch": 0.9, "grad_norm": 2.5, "learning_rate": 1.1506620383116835e-05, "loss": 1.7925, "step": 11320 }, { "epoch": 0.91, "grad_norm": 2.453125, "learning_rate": 1.1481821353717418e-05, "loss": 1.7667, "step": 11340 }, { "epoch": 0.91, "grad_norm": 2.484375, "learning_rate": 1.145701300312303e-05, "loss": 1.7733, "step": 11360 }, { "epoch": 0.91, "grad_norm": 2.703125, "learning_rate": 1.1432195487387223e-05, "loss": 1.7772, "step": 11380 }, { "epoch": 0.91, "grad_norm": 2.640625, "learning_rate": 1.1407368962621184e-05, "loss": 1.7459, "step": 11400 }, { "epoch": 0.91, "grad_norm": 2.59375, "learning_rate": 1.1382533584992783e-05, "loss": 1.7608, "step": 11420 }, { "epoch": 0.91, "grad_norm": 2.4375, "learning_rate": 1.1357689510725571e-05, "loss": 1.749, "step": 11440 }, { "epoch": 0.91, "grad_norm": 2.5625, "learning_rate": 1.1332836896097808e-05, "loss": 1.77, "step": 11460 }, { "epoch": 0.92, "grad_norm": 2.5, "learning_rate": 1.1307975897441473e-05, "loss": 1.7676, "step": 11480 }, { "epoch": 0.92, "grad_norm": 2.546875, "learning_rate": 1.1283106671141282e-05, "loss": 1.7755, "step": 11500 }, { "epoch": 0.92, "grad_norm": 2.5625, "learning_rate": 1.1258229373633713e-05, "loss": 1.7742, "step": 11520 }, { "epoch": 0.92, "grad_norm": 2.828125, "learning_rate": 1.1233344161406008e-05, "loss": 1.7606, "step": 11540 }, { "epoch": 0.92, "grad_norm": 2.640625, "learning_rate": 1.12084511909952e-05, "loss": 1.7749, "step": 11560 }, { "epoch": 0.92, "grad_norm": 2.640625, "learning_rate": 1.1183550618987118e-05, "loss": 1.7868, "step": 11580 }, { "epoch": 0.93, "grad_norm": 2.640625, "learning_rate": 1.1158642602015415e-05, "loss": 1.7712, "step": 11600 }, { "epoch": 0.93, "grad_norm": 2.46875, "learning_rate": 1.1133727296760572e-05, "loss": 1.7732, "step": 11620 }, { "epoch": 0.93, "grad_norm": 2.546875, "learning_rate": 1.110880485994891e-05, "loss": 1.7672, "step": 11640 }, { "epoch": 0.93, "grad_norm": 2.375, "learning_rate": 1.1083875448351626e-05, "loss": 1.7858, "step": 11660 }, { "epoch": 0.93, "grad_norm": 2.40625, "learning_rate": 1.1058939218783772e-05, "loss": 1.7683, "step": 11680 }, { "epoch": 0.93, "grad_norm": 2.59375, "learning_rate": 1.10339963281033e-05, "loss": 1.7813, "step": 11700 }, { "epoch": 0.94, "grad_norm": 2.578125, "learning_rate": 1.100904693321006e-05, "loss": 1.7745, "step": 11720 }, { "epoch": 0.94, "grad_norm": 2.484375, "learning_rate": 1.0984091191044816e-05, "loss": 1.7848, "step": 11740 }, { "epoch": 0.94, "grad_norm": 2.5, "learning_rate": 1.0959129258588257e-05, "loss": 1.7518, "step": 11760 }, { "epoch": 0.94, "grad_norm": 2.53125, "learning_rate": 1.0934161292860008e-05, "loss": 1.7768, "step": 11780 }, { "epoch": 0.94, "grad_norm": 2.5625, "learning_rate": 1.0909187450917656e-05, "loss": 1.7602, "step": 11800 }, { "epoch": 0.94, "grad_norm": 2.515625, "learning_rate": 1.0884207889855735e-05, "loss": 1.758, "step": 11820 }, { "epoch": 0.95, "grad_norm": 2.5625, "learning_rate": 1.0859222766804778e-05, "loss": 1.7761, "step": 11840 }, { "epoch": 0.95, "grad_norm": 2.609375, "learning_rate": 1.0834232238930283e-05, "loss": 1.7606, "step": 11860 }, { "epoch": 0.95, "grad_norm": 2.59375, "learning_rate": 1.0809236463431754e-05, "loss": 1.779, "step": 11880 }, { "epoch": 0.95, "grad_norm": 2.765625, "learning_rate": 1.0784235597541708e-05, "loss": 1.771, "step": 11900 }, { "epoch": 0.95, "grad_norm": 2.5625, "learning_rate": 1.075922979852468e-05, "loss": 1.7654, "step": 11920 }, { "epoch": 0.95, "grad_norm": 2.703125, "learning_rate": 1.073421922367623e-05, "loss": 1.7758, "step": 11940 }, { "epoch": 0.95, "grad_norm": 2.453125, "learning_rate": 1.0709204030321972e-05, "loss": 1.7592, "step": 11960 }, { "epoch": 0.96, "grad_norm": 2.546875, "learning_rate": 1.068418437581656e-05, "loss": 1.7741, "step": 11980 }, { "epoch": 0.96, "grad_norm": 2.46875, "learning_rate": 1.0659160417542721e-05, "loss": 1.759, "step": 12000 }, { "epoch": 0.96, "grad_norm": 2.6875, "learning_rate": 1.0634132312910245e-05, "loss": 1.7809, "step": 12020 }, { "epoch": 0.96, "grad_norm": 2.65625, "learning_rate": 1.060910021935501e-05, "loss": 1.7811, "step": 12040 }, { "epoch": 0.96, "grad_norm": 2.59375, "learning_rate": 1.0584064294337983e-05, "loss": 1.761, "step": 12060 }, { "epoch": 0.96, "grad_norm": 2.40625, "learning_rate": 1.0559024695344233e-05, "loss": 1.7515, "step": 12080 }, { "epoch": 0.97, "grad_norm": 2.359375, "learning_rate": 1.0533981579881938e-05, "loss": 1.7861, "step": 12100 }, { "epoch": 0.97, "grad_norm": 2.546875, "learning_rate": 1.0508935105481402e-05, "loss": 1.7643, "step": 12120 }, { "epoch": 0.97, "grad_norm": 2.546875, "learning_rate": 1.0483885429694051e-05, "loss": 1.7745, "step": 12140 }, { "epoch": 0.97, "grad_norm": 2.46875, "learning_rate": 1.0458832710091448e-05, "loss": 1.7539, "step": 12160 }, { "epoch": 0.97, "grad_norm": 2.421875, "learning_rate": 1.0433777104264313e-05, "loss": 1.7546, "step": 12180 }, { "epoch": 0.97, "grad_norm": 2.53125, "learning_rate": 1.0408718769821512e-05, "loss": 1.7606, "step": 12200 }, { "epoch": 0.98, "grad_norm": 2.765625, "learning_rate": 1.0383657864389077e-05, "loss": 1.7583, "step": 12220 }, { "epoch": 0.98, "grad_norm": 2.828125, "learning_rate": 1.0358594545609207e-05, "loss": 1.7659, "step": 12240 }, { "epoch": 0.98, "grad_norm": 2.53125, "learning_rate": 1.0333528971139297e-05, "loss": 1.7601, "step": 12260 }, { "epoch": 0.98, "grad_norm": 2.421875, "learning_rate": 1.0308461298650923e-05, "loss": 1.7612, "step": 12280 }, { "epoch": 0.98, "grad_norm": 2.484375, "learning_rate": 1.0283391685828844e-05, "loss": 1.7646, "step": 12300 }, { "epoch": 0.98, "grad_norm": 2.765625, "learning_rate": 1.0258320290370051e-05, "loss": 1.7741, "step": 12320 }, { "epoch": 0.99, "grad_norm": 2.359375, "learning_rate": 1.0233247269982732e-05, "loss": 1.7616, "step": 12340 }, { "epoch": 0.99, "grad_norm": 2.625, "learning_rate": 1.0208172782385295e-05, "loss": 1.7502, "step": 12360 }, { "epoch": 0.99, "grad_norm": 2.65625, "learning_rate": 1.0183096985305385e-05, "loss": 1.7806, "step": 12380 }, { "epoch": 0.99, "grad_norm": 2.578125, "learning_rate": 1.0158020036478881e-05, "loss": 1.7728, "step": 12400 }, { "epoch": 0.99, "grad_norm": 2.484375, "learning_rate": 1.0132942093648905e-05, "loss": 1.7748, "step": 12420 }, { "epoch": 0.99, "grad_norm": 2.515625, "learning_rate": 1.0107863314564834e-05, "loss": 1.7669, "step": 12440 }, { "epoch": 0.99, "grad_norm": 2.46875, "learning_rate": 1.0082783856981306e-05, "loss": 1.765, "step": 12460 }, { "epoch": 1.0, "grad_norm": 2.53125, "learning_rate": 1.0057703878657227e-05, "loss": 1.7704, "step": 12480 }, { "epoch": 1.0, "grad_norm": 2.484375, "learning_rate": 1.0032623537354775e-05, "loss": 1.7509, "step": 12500 }, { "epoch": 1.0, "grad_norm": 2.484375, "learning_rate": 1.0007542990838413e-05, "loss": 1.7584, "step": 12520 }, { "epoch": 1.0, "grad_norm": 2.609375, "learning_rate": 9.982462396873895e-06, "loss": 1.7479, "step": 12540 }, { "epoch": 1.0, "grad_norm": 2.484375, "learning_rate": 9.95738191322728e-06, "loss": 1.7456, "step": 12560 }, { "epoch": 1.0, "grad_norm": 2.625, "learning_rate": 9.93230169766392e-06, "loss": 1.7532, "step": 12580 }, { "epoch": 1.01, "grad_norm": 2.421875, "learning_rate": 9.907221907947489e-06, "loss": 1.7547, "step": 12600 }, { "epoch": 1.01, "grad_norm": 2.484375, "learning_rate": 9.882142701838986e-06, "loss": 1.7646, "step": 12620 }, { "epoch": 1.01, "grad_norm": 2.625, "learning_rate": 9.85706423709573e-06, "loss": 1.7707, "step": 12640 }, { "epoch": 1.01, "grad_norm": 2.625, "learning_rate": 9.83198667147038e-06, "loss": 1.772, "step": 12660 }, { "epoch": 1.01, "grad_norm": 2.546875, "learning_rate": 9.80691016270994e-06, "loss": 1.7501, "step": 12680 }, { "epoch": 1.01, "grad_norm": 2.484375, "learning_rate": 9.781834868554763e-06, "loss": 1.7525, "step": 12700 }, { "epoch": 1.02, "grad_norm": 2.578125, "learning_rate": 9.756760946737572e-06, "loss": 1.7504, "step": 12720 }, { "epoch": 1.02, "grad_norm": 2.5625, "learning_rate": 9.731688554982446e-06, "loss": 1.7504, "step": 12740 }, { "epoch": 1.02, "grad_norm": 2.71875, "learning_rate": 9.706617851003837e-06, "loss": 1.75, "step": 12760 }, { "epoch": 1.02, "grad_norm": 2.484375, "learning_rate": 9.681548992505594e-06, "loss": 1.7557, "step": 12780 }, { "epoch": 1.02, "grad_norm": 2.546875, "learning_rate": 9.65648213717995e-06, "loss": 1.7724, "step": 12800 }, { "epoch": 1.02, "grad_norm": 2.890625, "learning_rate": 9.63141744270653e-06, "loss": 1.7697, "step": 12820 }, { "epoch": 1.03, "grad_norm": 2.359375, "learning_rate": 9.606355066751382e-06, "loss": 1.7447, "step": 12840 }, { "epoch": 1.03, "grad_norm": 2.59375, "learning_rate": 9.581295166965956e-06, "loss": 1.7531, "step": 12860 }, { "epoch": 1.03, "grad_norm": 2.484375, "learning_rate": 9.556237900986128e-06, "loss": 1.776, "step": 12880 }, { "epoch": 1.03, "grad_norm": 2.515625, "learning_rate": 9.531183426431217e-06, "loss": 1.758, "step": 12900 }, { "epoch": 1.03, "grad_norm": 2.390625, "learning_rate": 9.506131900902972e-06, "loss": 1.7682, "step": 12920 }, { "epoch": 1.03, "grad_norm": 2.453125, "learning_rate": 9.481083481984593e-06, "loss": 1.7616, "step": 12940 }, { "epoch": 1.03, "grad_norm": 2.6875, "learning_rate": 9.456038327239744e-06, "loss": 1.7584, "step": 12960 }, { "epoch": 1.04, "grad_norm": 2.578125, "learning_rate": 9.430996594211547e-06, "loss": 1.7752, "step": 12980 }, { "epoch": 1.04, "grad_norm": 2.75, "learning_rate": 9.405958440421613e-06, "loss": 1.7765, "step": 13000 }, { "epoch": 1.04, "grad_norm": 2.390625, "learning_rate": 9.380924023369027e-06, "loss": 1.7521, "step": 13020 }, { "epoch": 1.04, "grad_norm": 2.5, "learning_rate": 9.355893500529369e-06, "loss": 1.7598, "step": 13040 }, { "epoch": 1.04, "grad_norm": 2.484375, "learning_rate": 9.330867029353732e-06, "loss": 1.7406, "step": 13060 }, { "epoch": 1.04, "grad_norm": 2.40625, "learning_rate": 9.305844767267716e-06, "loss": 1.7665, "step": 13080 }, { "epoch": 1.05, "grad_norm": 2.640625, "learning_rate": 9.280826871670441e-06, "loss": 1.7629, "step": 13100 }, { "epoch": 1.05, "grad_norm": 2.5, "learning_rate": 9.255813499933573e-06, "loss": 1.7617, "step": 13120 }, { "epoch": 1.05, "grad_norm": 2.734375, "learning_rate": 9.230804809400304e-06, "loss": 1.7504, "step": 13140 }, { "epoch": 1.05, "grad_norm": 2.5, "learning_rate": 9.20580095738439e-06, "loss": 1.7582, "step": 13160 }, { "epoch": 1.05, "grad_norm": 2.53125, "learning_rate": 9.180802101169153e-06, "loss": 1.759, "step": 13180 }, { "epoch": 1.05, "grad_norm": 2.4375, "learning_rate": 9.155808398006487e-06, "loss": 1.7688, "step": 13200 }, { "epoch": 1.06, "grad_norm": 2.640625, "learning_rate": 9.130820005115863e-06, "loss": 1.7675, "step": 13220 }, { "epoch": 1.06, "grad_norm": 2.578125, "learning_rate": 9.10583707968336e-06, "loss": 1.7683, "step": 13240 }, { "epoch": 1.06, "grad_norm": 2.71875, "learning_rate": 9.080859778860662e-06, "loss": 1.7856, "step": 13260 }, { "epoch": 1.06, "grad_norm": 2.578125, "learning_rate": 9.055888259764066e-06, "loss": 1.7773, "step": 13280 }, { "epoch": 1.06, "grad_norm": 2.640625, "learning_rate": 9.030922679473512e-06, "loss": 1.7512, "step": 13300 }, { "epoch": 1.06, "grad_norm": 2.484375, "learning_rate": 9.005963195031566e-06, "loss": 1.7544, "step": 13320 }, { "epoch": 1.06, "grad_norm": 2.703125, "learning_rate": 8.981009963442464e-06, "loss": 1.7731, "step": 13340 }, { "epoch": 1.07, "grad_norm": 2.6875, "learning_rate": 8.956063141671103e-06, "loss": 1.7636, "step": 13360 }, { "epoch": 1.07, "grad_norm": 2.546875, "learning_rate": 8.931122886642058e-06, "loss": 1.7437, "step": 13380 }, { "epoch": 1.07, "grad_norm": 2.453125, "learning_rate": 8.906189355238602e-06, "loss": 1.7513, "step": 13400 }, { "epoch": 1.07, "grad_norm": 2.5, "learning_rate": 8.881262704301709e-06, "loss": 1.7474, "step": 13420 }, { "epoch": 1.07, "grad_norm": 2.71875, "learning_rate": 8.856343090629074e-06, "loss": 1.7673, "step": 13440 }, { "epoch": 1.07, "grad_norm": 2.65625, "learning_rate": 8.831430670974126e-06, "loss": 1.7671, "step": 13460 }, { "epoch": 1.08, "grad_norm": 2.734375, "learning_rate": 8.806525602045043e-06, "loss": 1.7584, "step": 13480 }, { "epoch": 1.08, "grad_norm": 2.8125, "learning_rate": 8.781628040503758e-06, "loss": 1.7684, "step": 13500 }, { "epoch": 1.08, "grad_norm": 2.59375, "learning_rate": 8.756738142964985e-06, "loss": 1.7607, "step": 13520 }, { "epoch": 1.08, "grad_norm": 2.65625, "learning_rate": 8.731856065995229e-06, "loss": 1.7683, "step": 13540 }, { "epoch": 1.08, "grad_norm": 2.5625, "learning_rate": 8.706981966111791e-06, "loss": 1.7517, "step": 13560 }, { "epoch": 1.08, "grad_norm": 2.640625, "learning_rate": 8.682115999781814e-06, "loss": 1.7554, "step": 13580 }, { "epoch": 1.09, "grad_norm": 2.59375, "learning_rate": 8.657258323421253e-06, "loss": 1.7452, "step": 13600 }, { "epoch": 1.09, "grad_norm": 2.515625, "learning_rate": 8.632409093393938e-06, "loss": 1.7385, "step": 13620 }, { "epoch": 1.09, "grad_norm": 2.5625, "learning_rate": 8.607568466010556e-06, "loss": 1.7716, "step": 13640 }, { "epoch": 1.09, "grad_norm": 2.515625, "learning_rate": 8.582736597527673e-06, "loss": 1.7491, "step": 13660 }, { "epoch": 1.09, "grad_norm": 2.453125, "learning_rate": 8.557913644146785e-06, "loss": 1.7569, "step": 13680 }, { "epoch": 1.09, "grad_norm": 2.46875, "learning_rate": 8.533099762013281e-06, "loss": 1.7498, "step": 13700 }, { "epoch": 1.1, "grad_norm": 2.5, "learning_rate": 8.5082951072155e-06, "loss": 1.7629, "step": 13720 }, { "epoch": 1.1, "grad_norm": 2.359375, "learning_rate": 8.483499835783743e-06, "loss": 1.7664, "step": 13740 }, { "epoch": 1.1, "grad_norm": 2.625, "learning_rate": 8.45871410368928e-06, "loss": 1.7748, "step": 13760 }, { "epoch": 1.1, "grad_norm": 2.46875, "learning_rate": 8.433938066843367e-06, "loss": 1.7368, "step": 13780 }, { "epoch": 1.1, "grad_norm": 2.515625, "learning_rate": 8.409171881096292e-06, "loss": 1.7764, "step": 13800 }, { "epoch": 1.1, "grad_norm": 2.390625, "learning_rate": 8.384415702236363e-06, "loss": 1.7538, "step": 13820 }, { "epoch": 1.1, "grad_norm": 2.71875, "learning_rate": 8.359669685988939e-06, "loss": 1.7678, "step": 13840 }, { "epoch": 1.11, "grad_norm": 2.6875, "learning_rate": 8.334933988015465e-06, "loss": 1.7644, "step": 13860 }, { "epoch": 1.11, "grad_norm": 2.5, "learning_rate": 8.31020876391247e-06, "loss": 1.7725, "step": 13880 }, { "epoch": 1.11, "grad_norm": 2.5625, "learning_rate": 8.285494169210597e-06, "loss": 1.7637, "step": 13900 }, { "epoch": 1.11, "grad_norm": 2.578125, "learning_rate": 8.26079035937364e-06, "loss": 1.7472, "step": 13920 }, { "epoch": 1.11, "grad_norm": 2.578125, "learning_rate": 8.23609748979753e-06, "loss": 1.7796, "step": 13940 }, { "epoch": 1.11, "grad_norm": 2.609375, "learning_rate": 8.211415715809407e-06, "loss": 1.7416, "step": 13960 }, { "epoch": 1.12, "grad_norm": 2.546875, "learning_rate": 8.186745192666592e-06, "loss": 1.7395, "step": 13980 }, { "epoch": 1.12, "grad_norm": 2.421875, "learning_rate": 8.162086075555645e-06, "loss": 1.7577, "step": 14000 }, { "epoch": 1.12, "grad_norm": 2.921875, "learning_rate": 8.13743851959138e-06, "loss": 1.7681, "step": 14020 }, { "epoch": 1.12, "grad_norm": 2.59375, "learning_rate": 8.11280267981588e-06, "loss": 1.7549, "step": 14040 }, { "epoch": 1.12, "grad_norm": 2.59375, "learning_rate": 8.088178711197533e-06, "loss": 1.7603, "step": 14060 }, { "epoch": 1.12, "grad_norm": 2.78125, "learning_rate": 8.063566768630052e-06, "loss": 1.7712, "step": 14080 }, { "epoch": 1.13, "grad_norm": 2.640625, "learning_rate": 8.038967006931508e-06, "loss": 1.7618, "step": 14100 }, { "epoch": 1.13, "grad_norm": 2.5, "learning_rate": 8.014379580843333e-06, "loss": 1.7589, "step": 14120 }, { "epoch": 1.13, "grad_norm": 2.515625, "learning_rate": 7.989804645029386e-06, "loss": 1.7535, "step": 14140 }, { "epoch": 1.13, "grad_norm": 2.734375, "learning_rate": 7.96524235407494e-06, "loss": 1.7566, "step": 14160 }, { "epoch": 1.13, "grad_norm": 2.5625, "learning_rate": 7.940692862485735e-06, "loss": 1.75, "step": 14180 }, { "epoch": 1.13, "grad_norm": 2.484375, "learning_rate": 7.916156324687e-06, "loss": 1.7601, "step": 14200 }, { "epoch": 1.14, "grad_norm": 2.5625, "learning_rate": 7.89163289502247e-06, "loss": 1.7576, "step": 14220 }, { "epoch": 1.14, "grad_norm": 2.546875, "learning_rate": 7.867122727753442e-06, "loss": 1.7725, "step": 14240 }, { "epoch": 1.14, "grad_norm": 2.609375, "learning_rate": 7.84262597705777e-06, "loss": 1.7552, "step": 14260 }, { "epoch": 1.14, "grad_norm": 2.546875, "learning_rate": 7.818142797028922e-06, "loss": 1.765, "step": 14280 }, { "epoch": 1.14, "grad_norm": 2.515625, "learning_rate": 7.793673341675004e-06, "loss": 1.7469, "step": 14300 }, { "epoch": 1.14, "grad_norm": 2.59375, "learning_rate": 7.769217764917782e-06, "loss": 1.776, "step": 14320 }, { "epoch": 1.14, "grad_norm": 2.578125, "learning_rate": 7.744776220591718e-06, "loss": 1.7531, "step": 14340 }, { "epoch": 1.15, "grad_norm": 2.40625, "learning_rate": 7.720348862443022e-06, "loss": 1.7494, "step": 14360 }, { "epoch": 1.15, "grad_norm": 2.625, "learning_rate": 7.69593584412865e-06, "loss": 1.7567, "step": 14380 }, { "epoch": 1.15, "grad_norm": 2.5625, "learning_rate": 7.671537319215358e-06, "loss": 1.7583, "step": 14400 }, { "epoch": 1.15, "grad_norm": 2.625, "learning_rate": 7.647153441178745e-06, "loss": 1.7573, "step": 14420 }, { "epoch": 1.15, "grad_norm": 2.5625, "learning_rate": 7.622784363402261e-06, "loss": 1.7606, "step": 14440 }, { "epoch": 1.15, "grad_norm": 2.5, "learning_rate": 7.598430239176264e-06, "loss": 1.7359, "step": 14460 }, { "epoch": 1.16, "grad_norm": 2.515625, "learning_rate": 7.574091221697055e-06, "loss": 1.7503, "step": 14480 }, { "epoch": 1.16, "grad_norm": 2.53125, "learning_rate": 7.549767464065895e-06, "loss": 1.7486, "step": 14500 }, { "epoch": 1.16, "grad_norm": 2.546875, "learning_rate": 7.525459119288059e-06, "loss": 1.7441, "step": 14520 }, { "epoch": 1.16, "grad_norm": 2.46875, "learning_rate": 7.501166340271878e-06, "loss": 1.7643, "step": 14540 }, { "epoch": 1.16, "grad_norm": 2.46875, "learning_rate": 7.476889279827759e-06, "loss": 1.7584, "step": 14560 }, { "epoch": 1.16, "grad_norm": 2.40625, "learning_rate": 7.452628090667242e-06, "loss": 1.7749, "step": 14580 }, { "epoch": 1.17, "grad_norm": 2.4375, "learning_rate": 7.42838292540202e-06, "loss": 1.7659, "step": 14600 }, { "epoch": 1.17, "grad_norm": 2.65625, "learning_rate": 7.404153936542997e-06, "loss": 1.739, "step": 14620 }, { "epoch": 1.17, "grad_norm": 2.734375, "learning_rate": 7.379941276499323e-06, "loss": 1.7443, "step": 14640 }, { "epoch": 1.17, "grad_norm": 2.359375, "learning_rate": 7.355745097577431e-06, "loss": 1.7476, "step": 14660 }, { "epoch": 1.17, "grad_norm": 2.59375, "learning_rate": 7.331565551980078e-06, "loss": 1.7513, "step": 14680 }, { "epoch": 1.17, "grad_norm": 2.65625, "learning_rate": 7.307402791805398e-06, "loss": 1.7508, "step": 14700 }, { "epoch": 1.18, "grad_norm": 2.3125, "learning_rate": 7.283256969045937e-06, "loss": 1.7515, "step": 14720 }, { "epoch": 1.18, "grad_norm": 2.53125, "learning_rate": 7.259128235587692e-06, "loss": 1.7577, "step": 14740 }, { "epoch": 1.18, "grad_norm": 2.484375, "learning_rate": 7.235016743209178e-06, "loss": 1.7483, "step": 14760 }, { "epoch": 1.18, "grad_norm": 2.625, "learning_rate": 7.210922643580436e-06, "loss": 1.7603, "step": 14780 }, { "epoch": 1.18, "grad_norm": 2.609375, "learning_rate": 7.186846088262114e-06, "loss": 1.7553, "step": 14800 }, { "epoch": 1.18, "grad_norm": 2.453125, "learning_rate": 7.162787228704499e-06, "loss": 1.744, "step": 14820 }, { "epoch": 1.18, "grad_norm": 2.671875, "learning_rate": 7.138746216246565e-06, "loss": 1.7731, "step": 14840 }, { "epoch": 1.19, "grad_norm": 2.40625, "learning_rate": 7.114723202115013e-06, "loss": 1.7563, "step": 14860 }, { "epoch": 1.19, "grad_norm": 2.546875, "learning_rate": 7.090718337423339e-06, "loss": 1.7579, "step": 14880 }, { "epoch": 1.19, "grad_norm": 2.75, "learning_rate": 7.066731773170865e-06, "loss": 1.7448, "step": 14900 }, { "epoch": 1.19, "grad_norm": 2.484375, "learning_rate": 7.042763660241805e-06, "loss": 1.7659, "step": 14920 }, { "epoch": 1.19, "grad_norm": 2.53125, "learning_rate": 7.018814149404298e-06, "loss": 1.753, "step": 14940 }, { "epoch": 1.19, "grad_norm": 2.640625, "learning_rate": 6.99488339130947e-06, "loss": 1.7511, "step": 14960 }, { "epoch": 1.2, "grad_norm": 2.484375, "learning_rate": 6.970971536490496e-06, "loss": 1.7691, "step": 14980 }, { "epoch": 1.2, "grad_norm": 2.34375, "learning_rate": 6.947078735361628e-06, "loss": 1.7498, "step": 15000 }, { "epoch": 1.2, "grad_norm": 2.421875, "learning_rate": 6.923205138217271e-06, "loss": 1.7428, "step": 15020 }, { "epoch": 1.2, "grad_norm": 2.53125, "learning_rate": 6.8993508952310365e-06, "loss": 1.7465, "step": 15040 }, { "epoch": 1.2, "grad_norm": 2.40625, "learning_rate": 6.875516156454776e-06, "loss": 1.7692, "step": 15060 }, { "epoch": 1.2, "grad_norm": 2.6875, "learning_rate": 6.851701071817662e-06, "loss": 1.7492, "step": 15080 }, { "epoch": 1.21, "grad_norm": 2.484375, "learning_rate": 6.827905791125235e-06, "loss": 1.7673, "step": 15100 }, { "epoch": 1.21, "grad_norm": 2.59375, "learning_rate": 6.804130464058465e-06, "loss": 1.75, "step": 15120 }, { "epoch": 1.21, "grad_norm": 2.375, "learning_rate": 6.780375240172792e-06, "loss": 1.7489, "step": 15140 }, { "epoch": 1.21, "grad_norm": 2.71875, "learning_rate": 6.756640268897217e-06, "loss": 1.7649, "step": 15160 }, { "epoch": 1.21, "grad_norm": 2.515625, "learning_rate": 6.732925699533331e-06, "loss": 1.7566, "step": 15180 }, { "epoch": 1.21, "grad_norm": 2.515625, "learning_rate": 6.709231681254402e-06, "loss": 1.7516, "step": 15200 }, { "epoch": 1.22, "grad_norm": 2.484375, "learning_rate": 6.685558363104419e-06, "loss": 1.7658, "step": 15220 }, { "epoch": 1.22, "grad_norm": 2.421875, "learning_rate": 6.661905893997149e-06, "loss": 1.774, "step": 15240 }, { "epoch": 1.22, "grad_norm": 2.5625, "learning_rate": 6.63827442271523e-06, "loss": 1.7388, "step": 15260 }, { "epoch": 1.22, "grad_norm": 2.46875, "learning_rate": 6.6146640979092035e-06, "loss": 1.7533, "step": 15280 }, { "epoch": 1.22, "grad_norm": 2.484375, "learning_rate": 6.591075068096588e-06, "loss": 1.7614, "step": 15300 }, { "epoch": 1.22, "grad_norm": 2.625, "learning_rate": 6.567507481660971e-06, "loss": 1.7339, "step": 15320 }, { "epoch": 1.22, "grad_norm": 2.46875, "learning_rate": 6.543961486851026e-06, "loss": 1.7646, "step": 15340 }, { "epoch": 1.23, "grad_norm": 2.53125, "learning_rate": 6.520437231779621e-06, "loss": 1.7514, "step": 15360 }, { "epoch": 1.23, "grad_norm": 2.53125, "learning_rate": 6.496934864422876e-06, "loss": 1.7665, "step": 15380 }, { "epoch": 1.23, "grad_norm": 2.515625, "learning_rate": 6.473454532619223e-06, "loss": 1.754, "step": 15400 }, { "epoch": 1.23, "grad_norm": 2.609375, "learning_rate": 6.449996384068482e-06, "loss": 1.7539, "step": 15420 }, { "epoch": 1.23, "grad_norm": 2.640625, "learning_rate": 6.426560566330937e-06, "loss": 1.7595, "step": 15440 }, { "epoch": 1.23, "grad_norm": 2.59375, "learning_rate": 6.403147226826403e-06, "loss": 1.7466, "step": 15460 }, { "epoch": 1.24, "grad_norm": 2.484375, "learning_rate": 6.379756512833288e-06, "loss": 1.7639, "step": 15480 }, { "epoch": 1.24, "grad_norm": 2.4375, "learning_rate": 6.356388571487696e-06, "loss": 1.7477, "step": 15500 }, { "epoch": 1.24, "grad_norm": 2.359375, "learning_rate": 6.333043549782465e-06, "loss": 1.7563, "step": 15520 }, { "epoch": 1.24, "grad_norm": 2.640625, "learning_rate": 6.309721594566271e-06, "loss": 1.7366, "step": 15540 }, { "epoch": 1.24, "grad_norm": 2.546875, "learning_rate": 6.2864228525426914e-06, "loss": 1.7691, "step": 15560 }, { "epoch": 1.24, "grad_norm": 2.59375, "learning_rate": 6.263147470269275e-06, "loss": 1.7686, "step": 15580 }, { "epoch": 1.25, "grad_norm": 2.59375, "learning_rate": 6.239895594156649e-06, "loss": 1.7474, "step": 15600 }, { "epoch": 1.25, "grad_norm": 2.734375, "learning_rate": 6.216667370467558e-06, "loss": 1.7609, "step": 15620 }, { "epoch": 1.25, "grad_norm": 2.59375, "learning_rate": 6.193462945315974e-06, "loss": 1.7503, "step": 15640 }, { "epoch": 1.25, "grad_norm": 2.4375, "learning_rate": 6.1702824646661685e-06, "loss": 1.7608, "step": 15660 }, { "epoch": 1.25, "grad_norm": 2.515625, "learning_rate": 6.147126074331788e-06, "loss": 1.7577, "step": 15680 }, { "epoch": 1.25, "grad_norm": 2.59375, "learning_rate": 6.123993919974947e-06, "loss": 1.7551, "step": 15700 }, { "epoch": 1.25, "grad_norm": 2.5625, "learning_rate": 6.100886147105305e-06, "loss": 1.7545, "step": 15720 }, { "epoch": 1.26, "grad_norm": 2.6875, "learning_rate": 6.077802901079155e-06, "loss": 1.7639, "step": 15740 }, { "epoch": 1.26, "grad_norm": 2.53125, "learning_rate": 6.054744327098498e-06, "loss": 1.7582, "step": 15760 }, { "epoch": 1.26, "grad_norm": 2.59375, "learning_rate": 6.031710570210157e-06, "loss": 1.7632, "step": 15780 }, { "epoch": 1.26, "grad_norm": 2.515625, "learning_rate": 6.008701775304827e-06, "loss": 1.7465, "step": 15800 }, { "epoch": 1.26, "grad_norm": 2.703125, "learning_rate": 5.985718087116197e-06, "loss": 1.7537, "step": 15820 }, { "epoch": 1.26, "grad_norm": 2.453125, "learning_rate": 5.96275965022002e-06, "loss": 1.757, "step": 15840 }, { "epoch": 1.27, "grad_norm": 2.5625, "learning_rate": 5.939826609033203e-06, "loss": 1.759, "step": 15860 }, { "epoch": 1.27, "grad_norm": 2.53125, "learning_rate": 5.916919107812924e-06, "loss": 1.7539, "step": 15880 }, { "epoch": 1.27, "grad_norm": 2.59375, "learning_rate": 5.894037290655683e-06, "loss": 1.7741, "step": 15900 }, { "epoch": 1.27, "grad_norm": 2.515625, "learning_rate": 5.871181301496427e-06, "loss": 1.7582, "step": 15920 }, { "epoch": 1.27, "grad_norm": 2.5625, "learning_rate": 5.848351284107644e-06, "loss": 1.7656, "step": 15940 }, { "epoch": 1.27, "grad_norm": 2.484375, "learning_rate": 5.825547382098435e-06, "loss": 1.7577, "step": 15960 }, { "epoch": 1.28, "grad_norm": 2.5625, "learning_rate": 5.802769738913632e-06, "loss": 1.7544, "step": 15980 }, { "epoch": 1.28, "grad_norm": 2.578125, "learning_rate": 5.780018497832901e-06, "loss": 1.7667, "step": 16000 }, { "epoch": 1.28, "grad_norm": 2.546875, "learning_rate": 5.757293801969808e-06, "loss": 1.7478, "step": 16020 }, { "epoch": 1.28, "grad_norm": 2.578125, "learning_rate": 5.7345957942709505e-06, "loss": 1.7502, "step": 16040 }, { "epoch": 1.28, "grad_norm": 2.5625, "learning_rate": 5.7119246175150555e-06, "loss": 1.7631, "step": 16060 }, { "epoch": 1.28, "grad_norm": 2.515625, "learning_rate": 5.689280414312066e-06, "loss": 1.741, "step": 16080 }, { "epoch": 1.29, "grad_norm": 2.546875, "learning_rate": 5.666663327102238e-06, "loss": 1.7412, "step": 16100 }, { "epoch": 1.29, "grad_norm": 2.578125, "learning_rate": 5.644073498155287e-06, "loss": 1.7629, "step": 16120 }, { "epoch": 1.29, "grad_norm": 2.375, "learning_rate": 5.6215110695694405e-06, "loss": 1.7418, "step": 16140 }, { "epoch": 1.29, "grad_norm": 2.734375, "learning_rate": 5.598976183270579e-06, "loss": 1.7543, "step": 16160 }, { "epoch": 1.29, "grad_norm": 2.609375, "learning_rate": 5.576468981011327e-06, "loss": 1.7464, "step": 16180 }, { "epoch": 1.29, "grad_norm": 2.53125, "learning_rate": 5.553989604370169e-06, "loss": 1.7567, "step": 16200 }, { "epoch": 1.29, "grad_norm": 2.4375, "learning_rate": 5.5315381947505565e-06, "loss": 1.7528, "step": 16220 }, { "epoch": 1.3, "grad_norm": 2.71875, "learning_rate": 5.509114893380016e-06, "loss": 1.7407, "step": 16240 }, { "epoch": 1.3, "grad_norm": 2.625, "learning_rate": 5.48671984130926e-06, "loss": 1.7652, "step": 16260 }, { "epoch": 1.3, "grad_norm": 2.53125, "learning_rate": 5.46435317941132e-06, "loss": 1.7695, "step": 16280 }, { "epoch": 1.3, "grad_norm": 2.484375, "learning_rate": 5.442015048380617e-06, "loss": 1.7443, "step": 16300 }, { "epoch": 1.3, "grad_norm": 2.484375, "learning_rate": 5.419705588732115e-06, "loss": 1.7552, "step": 16320 }, { "epoch": 1.3, "grad_norm": 2.46875, "learning_rate": 5.3974249408004364e-06, "loss": 1.7586, "step": 16340 }, { "epoch": 1.31, "grad_norm": 2.515625, "learning_rate": 5.3751732447389445e-06, "loss": 1.785, "step": 16360 }, { "epoch": 1.31, "grad_norm": 2.59375, "learning_rate": 5.3529506405188965e-06, "loss": 1.7338, "step": 16380 }, { "epoch": 1.31, "grad_norm": 2.5625, "learning_rate": 5.33075726792856e-06, "loss": 1.7437, "step": 16400 }, { "epoch": 1.31, "grad_norm": 2.5625, "learning_rate": 5.308593266572309e-06, "loss": 1.7614, "step": 16420 }, { "epoch": 1.31, "grad_norm": 2.578125, "learning_rate": 5.286458775869768e-06, "loss": 1.7477, "step": 16440 }, { "epoch": 1.31, "grad_norm": 2.515625, "learning_rate": 5.264353935054935e-06, "loss": 1.7534, "step": 16460 }, { "epoch": 1.32, "grad_norm": 2.515625, "learning_rate": 5.2422788831752955e-06, "loss": 1.7556, "step": 16480 }, { "epoch": 1.32, "grad_norm": 2.5625, "learning_rate": 5.220233759090939e-06, "loss": 1.7572, "step": 16500 }, { "epoch": 1.32, "grad_norm": 2.75, "learning_rate": 5.19821870147372e-06, "loss": 1.7439, "step": 16520 }, { "epoch": 1.32, "grad_norm": 2.609375, "learning_rate": 5.176233848806349e-06, "loss": 1.7529, "step": 16540 }, { "epoch": 1.32, "grad_norm": 2.59375, "learning_rate": 5.154279339381543e-06, "loss": 1.7456, "step": 16560 }, { "epoch": 1.32, "grad_norm": 2.609375, "learning_rate": 5.132355311301145e-06, "loss": 1.7487, "step": 16580 }, { "epoch": 1.33, "grad_norm": 2.59375, "learning_rate": 5.110461902475261e-06, "loss": 1.7541, "step": 16600 }, { "epoch": 1.33, "grad_norm": 2.578125, "learning_rate": 5.088599250621393e-06, "loss": 1.7484, "step": 16620 }, { "epoch": 1.33, "grad_norm": 2.546875, "learning_rate": 5.066767493263568e-06, "loss": 1.7639, "step": 16640 }, { "epoch": 1.33, "grad_norm": 2.53125, "learning_rate": 5.044966767731474e-06, "loss": 1.7654, "step": 16660 }, { "epoch": 1.33, "grad_norm": 2.578125, "learning_rate": 5.0231972111596e-06, "loss": 1.7514, "step": 16680 }, { "epoch": 1.33, "grad_norm": 2.703125, "learning_rate": 5.001458960486372e-06, "loss": 1.756, "step": 16700 }, { "epoch": 1.33, "grad_norm": 2.6875, "learning_rate": 4.979752152453287e-06, "loss": 1.756, "step": 16720 }, { "epoch": 1.34, "grad_norm": 2.8125, "learning_rate": 4.958076923604055e-06, "loss": 1.7605, "step": 16740 }, { "epoch": 1.34, "grad_norm": 2.609375, "learning_rate": 4.936433410283754e-06, "loss": 1.7346, "step": 16760 }, { "epoch": 1.34, "grad_norm": 2.453125, "learning_rate": 4.914821748637938e-06, "loss": 1.7629, "step": 16780 }, { "epoch": 1.34, "grad_norm": 2.578125, "learning_rate": 4.8932420746118246e-06, "loss": 1.7526, "step": 16800 }, { "epoch": 1.34, "grad_norm": 2.34375, "learning_rate": 4.871694523949404e-06, "loss": 1.7639, "step": 16820 }, { "epoch": 1.34, "grad_norm": 2.578125, "learning_rate": 4.850179232192603e-06, "loss": 1.7388, "step": 16840 }, { "epoch": 1.35, "grad_norm": 2.65625, "learning_rate": 4.828696334680428e-06, "loss": 1.7452, "step": 16860 }, { "epoch": 1.35, "grad_norm": 2.515625, "learning_rate": 4.807245966548113e-06, "loss": 1.767, "step": 16880 }, { "epoch": 1.35, "grad_norm": 2.515625, "learning_rate": 4.785828262726271e-06, "loss": 1.7547, "step": 16900 }, { "epoch": 1.35, "grad_norm": 2.703125, "learning_rate": 4.764443357940044e-06, "loss": 1.7462, "step": 16920 }, { "epoch": 1.35, "grad_norm": 2.5625, "learning_rate": 4.743091386708257e-06, "loss": 1.7567, "step": 16940 }, { "epoch": 1.35, "grad_norm": 2.53125, "learning_rate": 4.721772483342573e-06, "loss": 1.7352, "step": 16960 }, { "epoch": 1.36, "grad_norm": 2.625, "learning_rate": 4.700486781946639e-06, "loss": 1.754, "step": 16980 }, { "epoch": 1.36, "grad_norm": 2.46875, "learning_rate": 4.679234416415258e-06, "loss": 1.735, "step": 17000 }, { "epoch": 1.36, "grad_norm": 2.453125, "learning_rate": 4.65801552043353e-06, "loss": 1.7673, "step": 17020 }, { "epoch": 1.36, "grad_norm": 2.421875, "learning_rate": 4.636830227476033e-06, "loss": 1.743, "step": 17040 }, { "epoch": 1.36, "grad_norm": 2.578125, "learning_rate": 4.61567867080595e-06, "loss": 1.7607, "step": 17060 }, { "epoch": 1.36, "grad_norm": 2.484375, "learning_rate": 4.594560983474269e-06, "loss": 1.7553, "step": 17080 }, { "epoch": 1.37, "grad_norm": 2.625, "learning_rate": 4.5734772983189206e-06, "loss": 1.7538, "step": 17100 }, { "epoch": 1.37, "grad_norm": 2.390625, "learning_rate": 4.552427747963937e-06, "loss": 1.7566, "step": 17120 }, { "epoch": 1.37, "grad_norm": 2.5, "learning_rate": 4.531412464818654e-06, "loss": 1.7531, "step": 17140 }, { "epoch": 1.37, "grad_norm": 2.4375, "learning_rate": 4.510431581076837e-06, "loss": 1.7236, "step": 17160 }, { "epoch": 1.37, "grad_norm": 2.375, "learning_rate": 4.489485228715872e-06, "loss": 1.7459, "step": 17180 }, { "epoch": 1.37, "grad_norm": 2.546875, "learning_rate": 4.468573539495928e-06, "loss": 1.766, "step": 17200 }, { "epoch": 1.37, "grad_norm": 2.5625, "learning_rate": 4.447696644959135e-06, "loss": 1.7471, "step": 17220 }, { "epoch": 1.38, "grad_norm": 2.484375, "learning_rate": 4.4268546764287455e-06, "loss": 1.7713, "step": 17240 }, { "epoch": 1.38, "grad_norm": 2.484375, "learning_rate": 4.406047765008319e-06, "loss": 1.7543, "step": 17260 }, { "epoch": 1.38, "grad_norm": 2.578125, "learning_rate": 4.385276041580892e-06, "loss": 1.7526, "step": 17280 }, { "epoch": 1.38, "grad_norm": 2.578125, "learning_rate": 4.3645396368081496e-06, "loss": 1.7675, "step": 17300 }, { "epoch": 1.38, "grad_norm": 2.4375, "learning_rate": 4.34383868112963e-06, "loss": 1.7483, "step": 17320 }, { "epoch": 1.38, "grad_norm": 2.65625, "learning_rate": 4.323173304761856e-06, "loss": 1.7331, "step": 17340 }, { "epoch": 1.39, "grad_norm": 2.5, "learning_rate": 4.302543637697558e-06, "loss": 1.7604, "step": 17360 }, { "epoch": 1.39, "grad_norm": 2.5, "learning_rate": 4.281949809704852e-06, "loss": 1.7468, "step": 17380 }, { "epoch": 1.39, "grad_norm": 2.546875, "learning_rate": 4.2613919503263866e-06, "loss": 1.761, "step": 17400 }, { "epoch": 1.39, "grad_norm": 2.59375, "learning_rate": 4.240870188878582e-06, "loss": 1.7562, "step": 17420 }, { "epoch": 1.39, "grad_norm": 2.546875, "learning_rate": 4.220384654450774e-06, "loss": 1.7714, "step": 17440 }, { "epoch": 1.39, "grad_norm": 2.640625, "learning_rate": 4.19993547590442e-06, "loss": 1.7442, "step": 17460 }, { "epoch": 1.4, "grad_norm": 2.5, "learning_rate": 4.179522781872286e-06, "loss": 1.7561, "step": 17480 }, { "epoch": 1.4, "grad_norm": 2.421875, "learning_rate": 4.159146700757639e-06, "loss": 1.762, "step": 17500 }, { "epoch": 1.4, "grad_norm": 2.59375, "learning_rate": 4.138807360733435e-06, "loss": 1.7431, "step": 17520 }, { "epoch": 1.4, "grad_norm": 2.375, "learning_rate": 4.118504889741518e-06, "loss": 1.7519, "step": 17540 }, { "epoch": 1.4, "grad_norm": 2.484375, "learning_rate": 4.098239415491808e-06, "loss": 1.7469, "step": 17560 }, { "epoch": 1.4, "grad_norm": 2.4375, "learning_rate": 4.078011065461507e-06, "loss": 1.7454, "step": 17580 }, { "epoch": 1.41, "grad_norm": 2.5, "learning_rate": 4.057819966894288e-06, "loss": 1.7447, "step": 17600 }, { "epoch": 1.41, "grad_norm": 2.640625, "learning_rate": 4.037666246799502e-06, "loss": 1.7611, "step": 17620 }, { "epoch": 1.41, "grad_norm": 2.453125, "learning_rate": 4.0175500319513704e-06, "loss": 1.749, "step": 17640 }, { "epoch": 1.41, "grad_norm": 2.59375, "learning_rate": 3.997471448888207e-06, "loss": 1.7534, "step": 17660 }, { "epoch": 1.41, "grad_norm": 2.53125, "learning_rate": 3.977430623911588e-06, "loss": 1.7399, "step": 17680 }, { "epoch": 1.41, "grad_norm": 2.40625, "learning_rate": 3.957427683085588e-06, "loss": 1.7551, "step": 17700 }, { "epoch": 1.41, "grad_norm": 2.484375, "learning_rate": 3.937462752235981e-06, "loss": 1.7442, "step": 17720 }, { "epoch": 1.42, "grad_norm": 2.546875, "learning_rate": 3.917535956949439e-06, "loss": 1.7553, "step": 17740 }, { "epoch": 1.42, "grad_norm": 2.40625, "learning_rate": 3.897647422572744e-06, "loss": 1.7473, "step": 17760 }, { "epoch": 1.42, "grad_norm": 2.578125, "learning_rate": 3.877797274212012e-06, "loss": 1.7446, "step": 17780 }, { "epoch": 1.42, "grad_norm": 2.640625, "learning_rate": 3.857985636731887e-06, "loss": 1.7519, "step": 17800 }, { "epoch": 1.42, "grad_norm": 2.484375, "learning_rate": 3.838212634754772e-06, "loss": 1.755, "step": 17820 }, { "epoch": 1.42, "grad_norm": 2.515625, "learning_rate": 3.818478392660039e-06, "loss": 1.7453, "step": 17840 }, { "epoch": 1.43, "grad_norm": 2.421875, "learning_rate": 3.798783034583241e-06, "loss": 1.7587, "step": 17860 }, { "epoch": 1.43, "grad_norm": 2.625, "learning_rate": 3.779126684415343e-06, "loss": 1.7592, "step": 17880 }, { "epoch": 1.43, "grad_norm": 2.546875, "learning_rate": 3.7595094658019302e-06, "loss": 1.7417, "step": 17900 }, { "epoch": 1.43, "grad_norm": 2.46875, "learning_rate": 3.7399315021424363e-06, "loss": 1.76, "step": 17920 }, { "epoch": 1.43, "grad_norm": 2.625, "learning_rate": 3.7203929165893805e-06, "loss": 1.7637, "step": 17940 }, { "epoch": 1.43, "grad_norm": 2.515625, "learning_rate": 3.7008938320475563e-06, "loss": 1.7419, "step": 17960 }, { "epoch": 1.44, "grad_norm": 2.671875, "learning_rate": 3.6814343711732948e-06, "loss": 1.7552, "step": 17980 }, { "epoch": 1.44, "grad_norm": 2.453125, "learning_rate": 3.6620146563736847e-06, "loss": 1.7463, "step": 18000 }, { "epoch": 1.44, "grad_norm": 2.453125, "learning_rate": 3.6426348098057897e-06, "loss": 1.7503, "step": 18020 }, { "epoch": 1.44, "grad_norm": 2.53125, "learning_rate": 3.6232949533758864e-06, "loss": 1.7493, "step": 18040 }, { "epoch": 1.44, "grad_norm": 2.609375, "learning_rate": 3.6039952087387043e-06, "loss": 1.7564, "step": 18060 }, { "epoch": 1.44, "grad_norm": 2.625, "learning_rate": 3.584735697296651e-06, "loss": 1.7506, "step": 18080 }, { "epoch": 1.44, "grad_norm": 2.421875, "learning_rate": 3.5655165401990564e-06, "loss": 1.7479, "step": 18100 }, { "epoch": 1.45, "grad_norm": 2.5, "learning_rate": 3.546337858341403e-06, "loss": 1.75, "step": 18120 }, { "epoch": 1.45, "grad_norm": 2.640625, "learning_rate": 3.527199772364572e-06, "loss": 1.7611, "step": 18140 }, { "epoch": 1.45, "grad_norm": 2.46875, "learning_rate": 3.508102402654082e-06, "loss": 1.7647, "step": 18160 }, { "epoch": 1.45, "grad_norm": 2.375, "learning_rate": 3.4890458693393305e-06, "loss": 1.7709, "step": 18180 }, { "epoch": 1.45, "grad_norm": 2.5625, "learning_rate": 3.470030292292834e-06, "loss": 1.7557, "step": 18200 }, { "epoch": 1.45, "grad_norm": 2.40625, "learning_rate": 3.451055791129495e-06, "loss": 1.7505, "step": 18220 }, { "epoch": 1.46, "grad_norm": 2.546875, "learning_rate": 3.4321224852058145e-06, "loss": 1.767, "step": 18240 }, { "epoch": 1.46, "grad_norm": 2.546875, "learning_rate": 3.4132304936191686e-06, "loss": 1.7415, "step": 18260 }, { "epoch": 1.46, "grad_norm": 2.59375, "learning_rate": 3.3943799352070574e-06, "loss": 1.7687, "step": 18280 }, { "epoch": 1.46, "grad_norm": 2.53125, "learning_rate": 3.3755709285463468e-06, "loss": 1.754, "step": 18300 }, { "epoch": 1.46, "grad_norm": 2.5, "learning_rate": 3.3568035919525154e-06, "loss": 1.7422, "step": 18320 }, { "epoch": 1.46, "grad_norm": 2.71875, "learning_rate": 3.338078043478943e-06, "loss": 1.7568, "step": 18340 }, { "epoch": 1.47, "grad_norm": 2.484375, "learning_rate": 3.3193944009161326e-06, "loss": 1.7355, "step": 18360 }, { "epoch": 1.47, "grad_norm": 2.421875, "learning_rate": 3.300752781790987e-06, "loss": 1.7707, "step": 18380 }, { "epoch": 1.47, "grad_norm": 2.53125, "learning_rate": 3.282153303366068e-06, "loss": 1.7555, "step": 18400 }, { "epoch": 1.47, "grad_norm": 2.609375, "learning_rate": 3.2635960826388546e-06, "loss": 1.7574, "step": 18420 }, { "epoch": 1.47, "grad_norm": 2.484375, "learning_rate": 3.245081236341011e-06, "loss": 1.7646, "step": 18440 }, { "epoch": 1.47, "grad_norm": 2.59375, "learning_rate": 3.226608880937653e-06, "loss": 1.7342, "step": 18460 }, { "epoch": 1.48, "grad_norm": 2.5, "learning_rate": 3.2081791326266042e-06, "loss": 1.7688, "step": 18480 }, { "epoch": 1.48, "grad_norm": 2.546875, "learning_rate": 3.1897921073376936e-06, "loss": 1.7527, "step": 18500 }, { "epoch": 1.48, "grad_norm": 2.421875, "learning_rate": 3.1714479207319826e-06, "loss": 1.7483, "step": 18520 }, { "epoch": 1.48, "grad_norm": 2.53125, "learning_rate": 3.1531466882010732e-06, "loss": 1.7643, "step": 18540 }, { "epoch": 1.48, "grad_norm": 2.515625, "learning_rate": 3.1348885248663785e-06, "loss": 1.7604, "step": 18560 }, { "epoch": 1.48, "grad_norm": 2.375, "learning_rate": 3.1166735455783814e-06, "loss": 1.7573, "step": 18580 }, { "epoch": 1.48, "grad_norm": 2.578125, "learning_rate": 3.0985018649159137e-06, "loss": 1.7594, "step": 18600 }, { "epoch": 1.49, "grad_norm": 2.484375, "learning_rate": 3.080373597185462e-06, "loss": 1.7518, "step": 18620 }, { "epoch": 1.49, "grad_norm": 2.578125, "learning_rate": 3.062288856420417e-06, "loss": 1.7487, "step": 18640 }, { "epoch": 1.49, "grad_norm": 2.59375, "learning_rate": 3.0442477563803708e-06, "loss": 1.7511, "step": 18660 }, { "epoch": 1.49, "grad_norm": 2.515625, "learning_rate": 3.0262504105504033e-06, "loss": 1.7604, "step": 18680 }, { "epoch": 1.49, "grad_norm": 2.28125, "learning_rate": 3.008296932140359e-06, "loss": 1.7601, "step": 18700 }, { "epoch": 1.49, "grad_norm": 2.53125, "learning_rate": 2.9903874340841452e-06, "loss": 1.7708, "step": 18720 }, { "epoch": 1.5, "grad_norm": 2.59375, "learning_rate": 2.9725220290390157e-06, "loss": 1.7589, "step": 18740 }, { "epoch": 1.5, "grad_norm": 2.515625, "learning_rate": 2.954700829384857e-06, "loss": 1.7628, "step": 18760 }, { "epoch": 1.5, "grad_norm": 2.5625, "learning_rate": 2.9369239472235036e-06, "loss": 1.7582, "step": 18780 }, { "epoch": 1.5, "grad_norm": 2.5625, "learning_rate": 2.9191914943779963e-06, "loss": 1.7581, "step": 18800 }, { "epoch": 1.5, "grad_norm": 2.4375, "learning_rate": 2.90150358239191e-06, "loss": 1.7553, "step": 18820 }, { "epoch": 1.5, "grad_norm": 2.328125, "learning_rate": 2.883860322528651e-06, "loss": 1.7518, "step": 18840 }, { "epoch": 1.51, "grad_norm": 2.578125, "learning_rate": 2.8662618257707266e-06, "loss": 1.7404, "step": 18860 }, { "epoch": 1.51, "grad_norm": 2.546875, "learning_rate": 2.848708202819078e-06, "loss": 1.7474, "step": 18880 }, { "epoch": 1.51, "grad_norm": 2.390625, "learning_rate": 2.8311995640923827e-06, "loss": 1.7523, "step": 18900 }, { "epoch": 1.51, "grad_norm": 2.640625, "learning_rate": 2.813736019726342e-06, "loss": 1.7517, "step": 18920 }, { "epoch": 1.51, "grad_norm": 2.5, "learning_rate": 2.7963176795729874e-06, "loss": 1.7367, "step": 18940 }, { "epoch": 1.51, "grad_norm": 2.484375, "learning_rate": 2.7789446532000208e-06, "loss": 1.7792, "step": 18960 }, { "epoch": 1.52, "grad_norm": 2.65625, "learning_rate": 2.761617049890091e-06, "loss": 1.7401, "step": 18980 }, { "epoch": 1.52, "grad_norm": 2.578125, "learning_rate": 2.7443349786401186e-06, "loss": 1.7603, "step": 19000 }, { "epoch": 1.52, "grad_norm": 2.453125, "learning_rate": 2.7270985481606173e-06, "loss": 1.7672, "step": 19020 }, { "epoch": 1.52, "grad_norm": 2.421875, "learning_rate": 2.7099078668749957e-06, "loss": 1.7617, "step": 19040 }, { "epoch": 1.52, "grad_norm": 2.359375, "learning_rate": 2.6927630429188968e-06, "loss": 1.7595, "step": 19060 }, { "epoch": 1.52, "grad_norm": 2.546875, "learning_rate": 2.675664184139487e-06, "loss": 1.7514, "step": 19080 }, { "epoch": 1.52, "grad_norm": 2.515625, "learning_rate": 2.6586113980948024e-06, "loss": 1.7675, "step": 19100 }, { "epoch": 1.53, "grad_norm": 2.515625, "learning_rate": 2.6416047920530775e-06, "loss": 1.7594, "step": 19120 }, { "epoch": 1.53, "grad_norm": 2.578125, "learning_rate": 2.6246444729920363e-06, "loss": 1.7382, "step": 19140 }, { "epoch": 1.53, "grad_norm": 2.546875, "learning_rate": 2.6077305475982496e-06, "loss": 1.7528, "step": 19160 }, { "epoch": 1.53, "grad_norm": 2.515625, "learning_rate": 2.5908631222664638e-06, "loss": 1.7441, "step": 19180 }, { "epoch": 1.53, "grad_norm": 2.59375, "learning_rate": 2.574042303098915e-06, "loss": 1.7557, "step": 19200 }, { "epoch": 1.53, "grad_norm": 2.546875, "learning_rate": 2.557268195904662e-06, "loss": 1.7525, "step": 19220 }, { "epoch": 1.54, "grad_norm": 2.546875, "learning_rate": 2.540540906198945e-06, "loss": 1.7521, "step": 19240 }, { "epoch": 1.54, "grad_norm": 2.453125, "learning_rate": 2.5238605392024927e-06, "loss": 1.7364, "step": 19260 }, { "epoch": 1.54, "grad_norm": 2.625, "learning_rate": 2.5072271998408792e-06, "loss": 1.745, "step": 19280 }, { "epoch": 1.54, "grad_norm": 2.484375, "learning_rate": 2.4906409927438546e-06, "loss": 1.7552, "step": 19300 }, { "epoch": 1.54, "grad_norm": 2.390625, "learning_rate": 2.4741020222446867e-06, "loss": 1.7602, "step": 19320 }, { "epoch": 1.54, "grad_norm": 2.40625, "learning_rate": 2.4576103923795224e-06, "loss": 1.7483, "step": 19340 }, { "epoch": 1.55, "grad_norm": 2.46875, "learning_rate": 2.4411662068866983e-06, "loss": 1.7493, "step": 19360 }, { "epoch": 1.55, "grad_norm": 2.578125, "learning_rate": 2.424769569206118e-06, "loss": 1.7549, "step": 19380 }, { "epoch": 1.55, "grad_norm": 2.453125, "learning_rate": 2.4084205824786045e-06, "loss": 1.7592, "step": 19400 }, { "epoch": 1.55, "grad_norm": 2.609375, "learning_rate": 2.3921193495452153e-06, "loss": 1.77, "step": 19420 }, { "epoch": 1.55, "grad_norm": 2.375, "learning_rate": 2.3758659729466337e-06, "loss": 1.7447, "step": 19440 }, { "epoch": 1.55, "grad_norm": 2.5625, "learning_rate": 2.3596605549225115e-06, "loss": 1.7566, "step": 19460 }, { "epoch": 1.56, "grad_norm": 2.609375, "learning_rate": 2.343503197410818e-06, "loss": 1.7408, "step": 19480 }, { "epoch": 1.56, "grad_norm": 2.390625, "learning_rate": 2.3273940020471984e-06, "loss": 1.752, "step": 19500 }, { "epoch": 1.56, "grad_norm": 2.40625, "learning_rate": 2.3113330701643546e-06, "loss": 1.7639, "step": 19520 }, { "epoch": 1.56, "grad_norm": 2.421875, "learning_rate": 2.2953205027913828e-06, "loss": 1.7446, "step": 19540 }, { "epoch": 1.56, "grad_norm": 2.4375, "learning_rate": 2.2793564006531555e-06, "loss": 1.748, "step": 19560 }, { "epoch": 1.56, "grad_norm": 2.515625, "learning_rate": 2.263440864169675e-06, "loss": 1.7488, "step": 19580 }, { "epoch": 1.56, "grad_norm": 2.59375, "learning_rate": 2.247573993455453e-06, "loss": 1.7564, "step": 19600 }, { "epoch": 1.57, "grad_norm": 2.40625, "learning_rate": 2.2317558883188728e-06, "loss": 1.7446, "step": 19620 }, { "epoch": 1.57, "grad_norm": 2.59375, "learning_rate": 2.215986648261568e-06, "loss": 1.7711, "step": 19640 }, { "epoch": 1.57, "grad_norm": 2.5625, "learning_rate": 2.200266372477785e-06, "loss": 1.7519, "step": 19660 }, { "epoch": 1.57, "grad_norm": 2.453125, "learning_rate": 2.184595159853783e-06, "loss": 1.7383, "step": 19680 }, { "epoch": 1.57, "grad_norm": 2.5625, "learning_rate": 2.168973108967177e-06, "loss": 1.7579, "step": 19700 }, { "epoch": 1.57, "grad_norm": 2.609375, "learning_rate": 2.153400318086347e-06, "loss": 1.7631, "step": 19720 }, { "epoch": 1.58, "grad_norm": 2.609375, "learning_rate": 2.137876885169813e-06, "loss": 1.7736, "step": 19740 }, { "epoch": 1.58, "grad_norm": 2.59375, "learning_rate": 2.1224029078656103e-06, "loss": 1.7371, "step": 19760 }, { "epoch": 1.58, "grad_norm": 2.40625, "learning_rate": 2.1069784835106744e-06, "loss": 1.7575, "step": 19780 }, { "epoch": 1.58, "grad_norm": 2.46875, "learning_rate": 2.0916037091302476e-06, "loss": 1.7436, "step": 19800 }, { "epoch": 1.58, "grad_norm": 2.6875, "learning_rate": 2.0762786814372494e-06, "loss": 1.7645, "step": 19820 }, { "epoch": 1.58, "grad_norm": 2.4375, "learning_rate": 2.0610034968316727e-06, "loss": 1.761, "step": 19840 }, { "epoch": 1.59, "grad_norm": 2.453125, "learning_rate": 2.045778251399981e-06, "loss": 1.7396, "step": 19860 }, { "epoch": 1.59, "grad_norm": 2.40625, "learning_rate": 2.030603040914505e-06, "loss": 1.7514, "step": 19880 }, { "epoch": 1.59, "grad_norm": 2.421875, "learning_rate": 2.0154779608328334e-06, "loss": 1.7587, "step": 19900 }, { "epoch": 1.59, "grad_norm": 2.578125, "learning_rate": 2.0004031062972175e-06, "loss": 1.7274, "step": 19920 }, { "epoch": 1.59, "grad_norm": 2.515625, "learning_rate": 1.9853785721339704e-06, "loss": 1.7504, "step": 19940 }, { "epoch": 1.59, "grad_norm": 2.578125, "learning_rate": 1.97040445285288e-06, "loss": 1.7723, "step": 19960 }, { "epoch": 1.6, "grad_norm": 2.4375, "learning_rate": 1.9554808426465944e-06, "loss": 1.7469, "step": 19980 }, { "epoch": 1.6, "grad_norm": 2.390625, "learning_rate": 1.9406078353900437e-06, "loss": 1.7564, "step": 20000 }, { "epoch": 1.6, "grad_norm": 2.34375, "learning_rate": 1.9257855246398583e-06, "loss": 1.7386, "step": 20020 }, { "epoch": 1.6, "grad_norm": 2.375, "learning_rate": 1.9110140036337578e-06, "loss": 1.7423, "step": 20040 }, { "epoch": 1.6, "grad_norm": 2.546875, "learning_rate": 1.896293365289973e-06, "loss": 1.7474, "step": 20060 }, { "epoch": 1.6, "grad_norm": 2.359375, "learning_rate": 1.8816237022066774e-06, "loss": 1.7591, "step": 20080 }, { "epoch": 1.6, "grad_norm": 2.609375, "learning_rate": 1.8670051066613826e-06, "loss": 1.767, "step": 20100 }, { "epoch": 1.61, "grad_norm": 2.53125, "learning_rate": 1.8524376706103676e-06, "loss": 1.7405, "step": 20120 }, { "epoch": 1.61, "grad_norm": 2.484375, "learning_rate": 1.837921485688099e-06, "loss": 1.767, "step": 20140 }, { "epoch": 1.61, "grad_norm": 2.421875, "learning_rate": 1.8234566432066603e-06, "loss": 1.7601, "step": 20160 }, { "epoch": 1.61, "grad_norm": 2.515625, "learning_rate": 1.8090432341551655e-06, "loss": 1.7558, "step": 20180 }, { "epoch": 1.61, "grad_norm": 2.46875, "learning_rate": 1.7946813491991988e-06, "loss": 1.7534, "step": 20200 }, { "epoch": 1.61, "grad_norm": 2.53125, "learning_rate": 1.7803710786802342e-06, "loss": 1.7425, "step": 20220 }, { "epoch": 1.62, "grad_norm": 2.484375, "learning_rate": 1.7661125126150825e-06, "loss": 1.7585, "step": 20240 }, { "epoch": 1.62, "grad_norm": 2.421875, "learning_rate": 1.7519057406952988e-06, "loss": 1.7622, "step": 20260 }, { "epoch": 1.62, "grad_norm": 2.609375, "learning_rate": 1.7377508522866448e-06, "loss": 1.7498, "step": 20280 }, { "epoch": 1.62, "grad_norm": 2.5625, "learning_rate": 1.7236479364285186e-06, "loss": 1.7554, "step": 20300 }, { "epoch": 1.62, "grad_norm": 2.578125, "learning_rate": 1.7095970818333862e-06, "loss": 1.7445, "step": 20320 }, { "epoch": 1.62, "grad_norm": 2.34375, "learning_rate": 1.6955983768862238e-06, "loss": 1.7481, "step": 20340 }, { "epoch": 1.63, "grad_norm": 2.390625, "learning_rate": 1.681651909643982e-06, "loss": 1.7431, "step": 20360 }, { "epoch": 1.63, "grad_norm": 2.71875, "learning_rate": 1.6677577678350088e-06, "loss": 1.7245, "step": 20380 }, { "epoch": 1.63, "grad_norm": 2.453125, "learning_rate": 1.6539160388584996e-06, "loss": 1.7596, "step": 20400 }, { "epoch": 1.63, "grad_norm": 2.484375, "learning_rate": 1.6401268097839696e-06, "loss": 1.7495, "step": 20420 }, { "epoch": 1.63, "grad_norm": 2.5, "learning_rate": 1.6263901673506776e-06, "loss": 1.7529, "step": 20440 }, { "epoch": 1.63, "grad_norm": 2.59375, "learning_rate": 1.6127061979670988e-06, "loss": 1.7601, "step": 20460 }, { "epoch": 1.63, "grad_norm": 2.609375, "learning_rate": 1.599074987710375e-06, "loss": 1.7599, "step": 20480 }, { "epoch": 1.64, "grad_norm": 2.46875, "learning_rate": 1.5854966223257751e-06, "loss": 1.7434, "step": 20500 }, { "epoch": 1.64, "grad_norm": 2.5625, "learning_rate": 1.57197118722615e-06, "loss": 1.748, "step": 20520 }, { "epoch": 1.64, "grad_norm": 2.5625, "learning_rate": 1.5584987674914064e-06, "loss": 1.773, "step": 20540 }, { "epoch": 1.64, "grad_norm": 2.5, "learning_rate": 1.5450794478679575e-06, "loss": 1.7542, "step": 20560 }, { "epoch": 1.64, "grad_norm": 2.4375, "learning_rate": 1.531713312768207e-06, "loss": 1.7607, "step": 20580 }, { "epoch": 1.64, "grad_norm": 2.421875, "learning_rate": 1.518400446270003e-06, "loss": 1.7506, "step": 20600 }, { "epoch": 1.65, "grad_norm": 2.5, "learning_rate": 1.5051409321161081e-06, "loss": 1.7602, "step": 20620 }, { "epoch": 1.65, "grad_norm": 2.5625, "learning_rate": 1.4919348537136947e-06, "loss": 1.739, "step": 20640 }, { "epoch": 1.65, "grad_norm": 2.671875, "learning_rate": 1.4787822941337938e-06, "loss": 1.7506, "step": 20660 }, { "epoch": 1.65, "grad_norm": 2.453125, "learning_rate": 1.4656833361107814e-06, "loss": 1.7391, "step": 20680 }, { "epoch": 1.65, "grad_norm": 2.453125, "learning_rate": 1.4526380620418712e-06, "loss": 1.7495, "step": 20700 }, { "epoch": 1.65, "grad_norm": 2.40625, "learning_rate": 1.4396465539865767e-06, "loss": 1.7558, "step": 20720 }, { "epoch": 1.66, "grad_norm": 2.515625, "learning_rate": 1.4267088936662067e-06, "loss": 1.7498, "step": 20740 }, { "epoch": 1.66, "grad_norm": 2.46875, "learning_rate": 1.413825162463347e-06, "loss": 1.7571, "step": 20760 }, { "epoch": 1.66, "grad_norm": 2.421875, "learning_rate": 1.40099544142135e-06, "loss": 1.7423, "step": 20780 }, { "epoch": 1.66, "grad_norm": 2.40625, "learning_rate": 1.3882198112438261e-06, "loss": 1.7465, "step": 20800 }, { "epoch": 1.66, "grad_norm": 2.53125, "learning_rate": 1.3754983522941313e-06, "loss": 1.7513, "step": 20820 }, { "epoch": 1.66, "grad_norm": 2.546875, "learning_rate": 1.3628311445948649e-06, "loss": 1.7469, "step": 20840 }, { "epoch": 1.67, "grad_norm": 2.375, "learning_rate": 1.3502182678273757e-06, "loss": 1.7463, "step": 20860 }, { "epoch": 1.67, "grad_norm": 2.390625, "learning_rate": 1.3376598013312347e-06, "loss": 1.7517, "step": 20880 }, { "epoch": 1.67, "grad_norm": 2.53125, "learning_rate": 1.3251558241037644e-06, "loss": 1.7621, "step": 20900 }, { "epoch": 1.67, "grad_norm": 2.484375, "learning_rate": 1.3127064147995283e-06, "loss": 1.7647, "step": 20920 }, { "epoch": 1.67, "grad_norm": 2.515625, "learning_rate": 1.3003116517298386e-06, "loss": 1.7427, "step": 20940 }, { "epoch": 1.67, "grad_norm": 2.484375, "learning_rate": 1.2879716128622522e-06, "loss": 1.7471, "step": 20960 }, { "epoch": 1.67, "grad_norm": 2.453125, "learning_rate": 1.2756863758201076e-06, "loss": 1.7534, "step": 20980 }, { "epoch": 1.68, "grad_norm": 2.578125, "learning_rate": 1.2634560178820076e-06, "loss": 1.7494, "step": 21000 }, { "epoch": 1.68, "grad_norm": 2.640625, "learning_rate": 1.2512806159813506e-06, "loss": 1.7686, "step": 21020 }, { "epoch": 1.68, "grad_norm": 2.609375, "learning_rate": 1.2391602467058395e-06, "loss": 1.7479, "step": 21040 }, { "epoch": 1.68, "grad_norm": 2.5, "learning_rate": 1.227094986297004e-06, "loss": 1.7466, "step": 21060 }, { "epoch": 1.68, "grad_norm": 2.40625, "learning_rate": 1.2150849106497176e-06, "loss": 1.743, "step": 21080 }, { "epoch": 1.68, "grad_norm": 2.453125, "learning_rate": 1.2031300953117208e-06, "loss": 1.7572, "step": 21100 }, { "epoch": 1.69, "grad_norm": 2.65625, "learning_rate": 1.1912306154831488e-06, "loss": 1.7402, "step": 21120 }, { "epoch": 1.69, "grad_norm": 2.421875, "learning_rate": 1.1793865460160547e-06, "loss": 1.7405, "step": 21140 }, { "epoch": 1.69, "grad_norm": 2.625, "learning_rate": 1.1675979614139422e-06, "loss": 1.7389, "step": 21160 }, { "epoch": 1.69, "grad_norm": 2.484375, "learning_rate": 1.1558649358312902e-06, "loss": 1.747, "step": 21180 }, { "epoch": 1.69, "grad_norm": 2.546875, "learning_rate": 1.1441875430730987e-06, "loss": 1.7453, "step": 21200 }, { "epoch": 1.69, "grad_norm": 2.4375, "learning_rate": 1.1325658565944132e-06, "loss": 1.7474, "step": 21220 }, { "epoch": 1.7, "grad_norm": 2.546875, "learning_rate": 1.1209999494998603e-06, "loss": 1.7591, "step": 21240 }, { "epoch": 1.7, "grad_norm": 2.640625, "learning_rate": 1.1094898945432064e-06, "loss": 1.7588, "step": 21260 }, { "epoch": 1.7, "grad_norm": 2.640625, "learning_rate": 1.098035764126879e-06, "loss": 1.767, "step": 21280 }, { "epoch": 1.7, "grad_norm": 2.484375, "learning_rate": 1.0866376303015213e-06, "loss": 1.7465, "step": 21300 }, { "epoch": 1.7, "grad_norm": 2.59375, "learning_rate": 1.0752955647655394e-06, "loss": 1.7664, "step": 21320 }, { "epoch": 1.7, "grad_norm": 2.5625, "learning_rate": 1.0640096388646471e-06, "loss": 1.7541, "step": 21340 }, { "epoch": 1.71, "grad_norm": 2.59375, "learning_rate": 1.0527799235914215e-06, "loss": 1.7368, "step": 21360 }, { "epoch": 1.71, "grad_norm": 2.53125, "learning_rate": 1.0416064895848555e-06, "loss": 1.7355, "step": 21380 }, { "epoch": 1.71, "grad_norm": 2.34375, "learning_rate": 1.0304894071299077e-06, "loss": 1.7654, "step": 21400 }, { "epoch": 1.71, "grad_norm": 2.5625, "learning_rate": 1.0194287461570696e-06, "loss": 1.764, "step": 21420 }, { "epoch": 1.71, "grad_norm": 2.4375, "learning_rate": 1.0084245762419187e-06, "loss": 1.7576, "step": 21440 }, { "epoch": 1.71, "grad_norm": 2.484375, "learning_rate": 9.974769666046825e-07, "loss": 1.7699, "step": 21460 }, { "epoch": 1.71, "grad_norm": 2.625, "learning_rate": 9.86585986109808e-07, "loss": 1.7636, "step": 21480 }, { "epoch": 1.72, "grad_norm": 2.46875, "learning_rate": 9.757517032655229e-07, "loss": 1.7467, "step": 21500 }, { "epoch": 1.72, "grad_norm": 2.53125, "learning_rate": 9.649741862233974e-07, "loss": 1.7481, "step": 21520 }, { "epoch": 1.72, "grad_norm": 2.609375, "learning_rate": 9.542535027779388e-07, "loss": 1.7594, "step": 21540 }, { "epoch": 1.72, "grad_norm": 2.6875, "learning_rate": 9.435897203661392e-07, "loss": 1.7374, "step": 21560 }, { "epoch": 1.72, "grad_norm": 2.40625, "learning_rate": 9.329829060670681e-07, "loss": 1.7448, "step": 21580 }, { "epoch": 1.72, "grad_norm": 2.4375, "learning_rate": 9.224331266014419e-07, "loss": 1.758, "step": 21600 }, { "epoch": 1.73, "grad_norm": 2.453125, "learning_rate": 9.119404483312089e-07, "loss": 1.7526, "step": 21620 }, { "epoch": 1.73, "grad_norm": 2.484375, "learning_rate": 9.01504937259129e-07, "loss": 1.7583, "step": 21640 }, { "epoch": 1.73, "grad_norm": 2.390625, "learning_rate": 8.911266590283607e-07, "loss": 1.7607, "step": 21660 }, { "epoch": 1.73, "grad_norm": 2.515625, "learning_rate": 8.808056789220465e-07, "loss": 1.7445, "step": 21680 }, { "epoch": 1.73, "grad_norm": 2.453125, "learning_rate": 8.705420618629035e-07, "loss": 1.7389, "step": 21700 }, { "epoch": 1.73, "grad_norm": 2.75, "learning_rate": 8.60335872412813e-07, "loss": 1.744, "step": 21720 }, { "epoch": 1.74, "grad_norm": 2.5625, "learning_rate": 8.501871747724177e-07, "loss": 1.7643, "step": 21740 }, { "epoch": 1.74, "grad_norm": 2.65625, "learning_rate": 8.400960327807128e-07, "loss": 1.7617, "step": 21760 }, { "epoch": 1.74, "grad_norm": 2.515625, "learning_rate": 8.300625099146542e-07, "loss": 1.7346, "step": 21780 }, { "epoch": 1.74, "grad_norm": 2.484375, "learning_rate": 8.200866692887421e-07, "loss": 1.7582, "step": 21800 }, { "epoch": 1.74, "grad_norm": 2.515625, "learning_rate": 8.101685736546438e-07, "loss": 1.7452, "step": 21820 }, { "epoch": 1.74, "grad_norm": 2.375, "learning_rate": 8.00308285400786e-07, "loss": 1.7564, "step": 21840 }, { "epoch": 1.75, "grad_norm": 2.546875, "learning_rate": 7.90505866551966e-07, "loss": 1.7482, "step": 21860 }, { "epoch": 1.75, "grad_norm": 2.5, "learning_rate": 7.807613787689616e-07, "loss": 1.7561, "step": 21880 }, { "epoch": 1.75, "grad_norm": 2.46875, "learning_rate": 7.710748833481418e-07, "loss": 1.7453, "step": 21900 }, { "epoch": 1.75, "grad_norm": 2.578125, "learning_rate": 7.614464412210854e-07, "loss": 1.7423, "step": 21920 }, { "epoch": 1.75, "grad_norm": 2.640625, "learning_rate": 7.518761129541929e-07, "loss": 1.7607, "step": 21940 }, { "epoch": 1.75, "grad_norm": 2.5625, "learning_rate": 7.423639587483078e-07, "loss": 1.7561, "step": 21960 }, { "epoch": 1.75, "grad_norm": 2.546875, "learning_rate": 7.329100384383381e-07, "loss": 1.7595, "step": 21980 }, { "epoch": 1.76, "grad_norm": 2.4375, "learning_rate": 7.235144114928782e-07, "loss": 1.7509, "step": 22000 }, { "epoch": 1.76, "grad_norm": 2.53125, "learning_rate": 7.141771370138383e-07, "loss": 1.7335, "step": 22020 }, { "epoch": 1.76, "grad_norm": 2.5, "learning_rate": 7.048982737360677e-07, "loss": 1.7396, "step": 22040 }, { "epoch": 1.76, "grad_norm": 2.484375, "learning_rate": 6.956778800269914e-07, "loss": 1.7418, "step": 22060 }, { "epoch": 1.76, "grad_norm": 2.484375, "learning_rate": 6.865160138862348e-07, "loss": 1.7491, "step": 22080 }, { "epoch": 1.76, "grad_norm": 2.640625, "learning_rate": 6.774127329452684e-07, "loss": 1.7625, "step": 22100 }, { "epoch": 1.77, "grad_norm": 2.65625, "learning_rate": 6.683680944670401e-07, "loss": 1.7401, "step": 22120 }, { "epoch": 1.77, "grad_norm": 2.65625, "learning_rate": 6.59382155345607e-07, "loss": 1.7504, "step": 22140 }, { "epoch": 1.77, "grad_norm": 2.453125, "learning_rate": 6.504549721057996e-07, "loss": 1.7243, "step": 22160 }, { "epoch": 1.77, "grad_norm": 2.578125, "learning_rate": 6.415866009028426e-07, "loss": 1.7606, "step": 22180 }, { "epoch": 1.77, "grad_norm": 2.703125, "learning_rate": 6.327770975220149e-07, "loss": 1.7543, "step": 22200 }, { "epoch": 1.77, "grad_norm": 2.640625, "learning_rate": 6.240265173782955e-07, "loss": 1.7501, "step": 22220 }, { "epoch": 1.78, "grad_norm": 2.734375, "learning_rate": 6.153349155160137e-07, "loss": 1.7418, "step": 22240 }, { "epoch": 1.78, "grad_norm": 2.484375, "learning_rate": 6.067023466085054e-07, "loss": 1.7562, "step": 22260 }, { "epoch": 1.78, "grad_norm": 2.625, "learning_rate": 5.981288649577665e-07, "loss": 1.7529, "step": 22280 }, { "epoch": 1.78, "grad_norm": 2.515625, "learning_rate": 5.896145244941132e-07, "loss": 1.7343, "step": 22300 }, { "epoch": 1.78, "grad_norm": 2.578125, "learning_rate": 5.811593787758407e-07, "loss": 1.7588, "step": 22320 }, { "epoch": 1.78, "grad_norm": 2.640625, "learning_rate": 5.727634809888937e-07, "loss": 1.7598, "step": 22340 }, { "epoch": 1.79, "grad_norm": 2.46875, "learning_rate": 5.644268839465162e-07, "loss": 1.7461, "step": 22360 }, { "epoch": 1.79, "grad_norm": 2.5, "learning_rate": 5.561496400889344e-07, "loss": 1.7623, "step": 22380 }, { "epoch": 1.79, "grad_norm": 2.5625, "learning_rate": 5.479318014830248e-07, "loss": 1.7425, "step": 22400 }, { "epoch": 1.79, "grad_norm": 2.484375, "learning_rate": 5.397734198219751e-07, "loss": 1.7585, "step": 22420 }, { "epoch": 1.79, "grad_norm": 2.640625, "learning_rate": 5.316745464249739e-07, "loss": 1.7674, "step": 22440 }, { "epoch": 1.79, "grad_norm": 2.53125, "learning_rate": 5.236352322368798e-07, "loss": 1.7477, "step": 22460 }, { "epoch": 1.79, "grad_norm": 2.421875, "learning_rate": 5.156555278278997e-07, "loss": 1.7478, "step": 22480 }, { "epoch": 1.8, "grad_norm": 2.453125, "learning_rate": 5.077354833932746e-07, "loss": 1.7396, "step": 22500 }, { "epoch": 1.8, "grad_norm": 2.53125, "learning_rate": 4.99875148752964e-07, "loss": 1.7386, "step": 22520 }, { "epoch": 1.8, "grad_norm": 2.546875, "learning_rate": 4.920745733513311e-07, "loss": 1.7596, "step": 22540 }, { "epoch": 1.8, "grad_norm": 2.484375, "learning_rate": 4.843338062568293e-07, "loss": 1.7637, "step": 22560 }, { "epoch": 1.8, "grad_norm": 2.421875, "learning_rate": 4.7665289616169673e-07, "loss": 1.7412, "step": 22580 }, { "epoch": 1.8, "grad_norm": 2.40625, "learning_rate": 4.690318913816505e-07, "loss": 1.7503, "step": 22600 }, { "epoch": 1.81, "grad_norm": 2.5, "learning_rate": 4.6147083985558336e-07, "loss": 1.7382, "step": 22620 }, { "epoch": 1.81, "grad_norm": 2.546875, "learning_rate": 4.5396978914525436e-07, "loss": 1.7509, "step": 22640 }, { "epoch": 1.81, "grad_norm": 2.40625, "learning_rate": 4.4652878643499986e-07, "loss": 1.7474, "step": 22660 }, { "epoch": 1.81, "grad_norm": 2.5, "learning_rate": 4.3914787853143513e-07, "loss": 1.7648, "step": 22680 }, { "epoch": 1.81, "grad_norm": 2.515625, "learning_rate": 4.3182711186314894e-07, "loss": 1.7397, "step": 22700 }, { "epoch": 1.81, "grad_norm": 2.34375, "learning_rate": 4.245665324804282e-07, "loss": 1.7452, "step": 22720 }, { "epoch": 1.82, "grad_norm": 2.5, "learning_rate": 4.1736618605495605e-07, "loss": 1.7494, "step": 22740 }, { "epoch": 1.82, "grad_norm": 2.59375, "learning_rate": 4.102261178795286e-07, "loss": 1.7637, "step": 22760 }, { "epoch": 1.82, "grad_norm": 2.5625, "learning_rate": 4.031463728677687e-07, "loss": 1.7536, "step": 22780 }, { "epoch": 1.82, "grad_norm": 2.375, "learning_rate": 3.9612699555384826e-07, "loss": 1.7383, "step": 22800 }, { "epoch": 1.82, "grad_norm": 2.546875, "learning_rate": 3.8916803009220074e-07, "loss": 1.744, "step": 22820 }, { "epoch": 1.82, "grad_norm": 2.5, "learning_rate": 3.8226952025724904e-07, "loss": 1.761, "step": 22840 }, { "epoch": 1.83, "grad_norm": 2.453125, "learning_rate": 3.7543150944312713e-07, "loss": 1.7548, "step": 22860 }, { "epoch": 1.83, "grad_norm": 2.5, "learning_rate": 3.686540406634098e-07, "loss": 1.7583, "step": 22880 }, { "epoch": 1.83, "grad_norm": 2.46875, "learning_rate": 3.6193715655083784e-07, "loss": 1.7409, "step": 22900 }, { "epoch": 1.83, "grad_norm": 2.515625, "learning_rate": 3.552808993570556e-07, "loss": 1.7385, "step": 22920 }, { "epoch": 1.83, "grad_norm": 2.40625, "learning_rate": 3.4868531095233806e-07, "loss": 1.7496, "step": 22940 }, { "epoch": 1.83, "grad_norm": 2.515625, "learning_rate": 3.421504328253378e-07, "loss": 1.7553, "step": 22960 }, { "epoch": 1.83, "grad_norm": 2.5625, "learning_rate": 3.3567630608280943e-07, "loss": 1.7509, "step": 22980 }, { "epoch": 1.84, "grad_norm": 2.4375, "learning_rate": 3.292629714493645e-07, "loss": 1.7299, "step": 23000 }, { "epoch": 1.84, "grad_norm": 2.359375, "learning_rate": 3.2291046926721134e-07, "loss": 1.7684, "step": 23020 }, { "epoch": 1.84, "grad_norm": 2.46875, "learning_rate": 3.166188394958969e-07, "loss": 1.7697, "step": 23040 }, { "epoch": 1.84, "grad_norm": 2.53125, "learning_rate": 3.1038812171205965e-07, "loss": 1.7504, "step": 23060 }, { "epoch": 1.84, "grad_norm": 2.578125, "learning_rate": 3.0421835510917706e-07, "loss": 1.7534, "step": 23080 }, { "epoch": 1.84, "grad_norm": 2.609375, "learning_rate": 2.981095784973276e-07, "loss": 1.7422, "step": 23100 }, { "epoch": 1.85, "grad_norm": 2.484375, "learning_rate": 2.9206183030293324e-07, "loss": 1.7543, "step": 23120 }, { "epoch": 1.85, "grad_norm": 2.46875, "learning_rate": 2.860751485685309e-07, "loss": 1.744, "step": 23140 }, { "epoch": 1.85, "grad_norm": 2.453125, "learning_rate": 2.801495709525237e-07, "loss": 1.7581, "step": 23160 }, { "epoch": 1.85, "grad_norm": 2.53125, "learning_rate": 2.742851347289488e-07, "loss": 1.7486, "step": 23180 }, { "epoch": 1.85, "grad_norm": 2.53125, "learning_rate": 2.684818767872421e-07, "loss": 1.7472, "step": 23200 }, { "epoch": 1.85, "grad_norm": 2.609375, "learning_rate": 2.627398336320053e-07, "loss": 1.743, "step": 23220 }, { "epoch": 1.86, "grad_norm": 2.53125, "learning_rate": 2.570590413827789e-07, "loss": 1.7496, "step": 23240 }, { "epoch": 1.86, "grad_norm": 2.65625, "learning_rate": 2.5143953577380974e-07, "loss": 1.739, "step": 23260 }, { "epoch": 1.86, "grad_norm": 2.515625, "learning_rate": 2.4588135215382834e-07, "loss": 1.752, "step": 23280 }, { "epoch": 1.86, "grad_norm": 2.703125, "learning_rate": 2.40384525485835e-07, "loss": 1.7487, "step": 23300 }, { "epoch": 1.86, "grad_norm": 2.46875, "learning_rate": 2.3494909034686542e-07, "loss": 1.7627, "step": 23320 }, { "epoch": 1.86, "grad_norm": 2.578125, "learning_rate": 2.2957508092777969e-07, "loss": 1.7621, "step": 23340 }, { "epoch": 1.86, "grad_norm": 2.546875, "learning_rate": 2.2426253103305485e-07, "loss": 1.7497, "step": 23360 }, { "epoch": 1.87, "grad_norm": 2.609375, "learning_rate": 2.1901147408055935e-07, "loss": 1.7488, "step": 23380 }, { "epoch": 1.87, "grad_norm": 2.59375, "learning_rate": 2.1382194310134884e-07, "loss": 1.7582, "step": 23400 }, { "epoch": 1.87, "grad_norm": 2.703125, "learning_rate": 2.0869397073946196e-07, "loss": 1.7587, "step": 23420 }, { "epoch": 1.87, "grad_norm": 2.4375, "learning_rate": 2.03627589251707e-07, "loss": 1.7508, "step": 23440 }, { "epoch": 1.87, "grad_norm": 2.421875, "learning_rate": 1.9862283050746555e-07, "loss": 1.7634, "step": 23460 }, { "epoch": 1.87, "grad_norm": 2.40625, "learning_rate": 1.936797259884904e-07, "loss": 1.7472, "step": 23480 }, { "epoch": 1.88, "grad_norm": 2.421875, "learning_rate": 1.8879830678870448e-07, "loss": 1.7404, "step": 23500 }, { "epoch": 1.88, "grad_norm": 2.515625, "learning_rate": 1.839786036140101e-07, "loss": 1.7505, "step": 23520 }, { "epoch": 1.88, "grad_norm": 2.46875, "learning_rate": 1.7922064678209228e-07, "loss": 1.7502, "step": 23540 }, { "epoch": 1.88, "grad_norm": 2.515625, "learning_rate": 1.7452446622222675e-07, "loss": 1.7488, "step": 23560 }, { "epoch": 1.88, "grad_norm": 2.5, "learning_rate": 1.6989009147509893e-07, "loss": 1.7317, "step": 23580 }, { "epoch": 1.88, "grad_norm": 2.53125, "learning_rate": 1.6531755169261088e-07, "loss": 1.7575, "step": 23600 }, { "epoch": 1.89, "grad_norm": 2.515625, "learning_rate": 1.6080687563769793e-07, "loss": 1.7472, "step": 23620 }, { "epoch": 1.89, "grad_norm": 2.515625, "learning_rate": 1.563580916841534e-07, "loss": 1.746, "step": 23640 }, { "epoch": 1.89, "grad_norm": 2.390625, "learning_rate": 1.5197122781644424e-07, "loss": 1.7473, "step": 23660 }, { "epoch": 1.89, "grad_norm": 2.5625, "learning_rate": 1.4764631162954124e-07, "loss": 1.7494, "step": 23680 }, { "epoch": 1.89, "grad_norm": 2.546875, "learning_rate": 1.4338337032873685e-07, "loss": 1.722, "step": 23700 }, { "epoch": 1.89, "grad_norm": 2.5, "learning_rate": 1.3918243072948312e-07, "loss": 1.7425, "step": 23720 }, { "epoch": 1.9, "grad_norm": 2.765625, "learning_rate": 1.3504351925721638e-07, "loss": 1.7323, "step": 23740 }, { "epoch": 1.9, "grad_norm": 2.515625, "learning_rate": 1.3096666194719388e-07, "loss": 1.7556, "step": 23760 }, { "epoch": 1.9, "grad_norm": 2.546875, "learning_rate": 1.269518844443307e-07, "loss": 1.7377, "step": 23780 }, { "epoch": 1.9, "grad_norm": 2.390625, "learning_rate": 1.2299921200303876e-07, "loss": 1.7489, "step": 23800 }, { "epoch": 1.9, "grad_norm": 2.59375, "learning_rate": 1.1910866948706024e-07, "loss": 1.75, "step": 23820 }, { "epoch": 1.9, "grad_norm": 2.53125, "learning_rate": 1.1528028136932435e-07, "loss": 1.7533, "step": 23840 }, { "epoch": 1.9, "grad_norm": 2.453125, "learning_rate": 1.115140717317853e-07, "loss": 1.739, "step": 23860 }, { "epoch": 1.91, "grad_norm": 2.515625, "learning_rate": 1.0781006426526797e-07, "loss": 1.736, "step": 23880 }, { "epoch": 1.91, "grad_norm": 2.46875, "learning_rate": 1.0416828226932684e-07, "loss": 1.7565, "step": 23900 }, { "epoch": 1.91, "grad_norm": 2.71875, "learning_rate": 1.0058874865209512e-07, "loss": 1.7586, "step": 23920 }, { "epoch": 1.91, "grad_norm": 2.640625, "learning_rate": 9.707148593014027e-08, "loss": 1.7549, "step": 23940 }, { "epoch": 1.91, "grad_norm": 2.5625, "learning_rate": 9.361651622832202e-08, "loss": 1.7485, "step": 23960 }, { "epoch": 1.91, "grad_norm": 2.453125, "learning_rate": 9.022386127965799e-08, "loss": 1.7582, "step": 23980 }, { "epoch": 1.92, "grad_norm": 2.515625, "learning_rate": 8.689354242517933e-08, "loss": 1.748, "step": 24000 }, { "epoch": 1.92, "grad_norm": 2.5, "learning_rate": 8.3625580613802e-08, "loss": 1.7587, "step": 24020 }, { "epoch": 1.92, "grad_norm": 2.546875, "learning_rate": 8.041999640219566e-08, "loss": 1.7427, "step": 24040 }, { "epoch": 1.92, "grad_norm": 2.421875, "learning_rate": 7.727680995464726e-08, "loss": 1.7628, "step": 24060 }, { "epoch": 1.92, "grad_norm": 2.5, "learning_rate": 7.419604104294542e-08, "loss": 1.7293, "step": 24080 }, { "epoch": 1.92, "grad_norm": 2.390625, "learning_rate": 7.1177709046244e-08, "loss": 1.7393, "step": 24100 }, { "epoch": 1.93, "grad_norm": 2.546875, "learning_rate": 6.822183295094986e-08, "loss": 1.7567, "step": 24120 }, { "epoch": 1.93, "grad_norm": 2.59375, "learning_rate": 6.532843135059751e-08, "loss": 1.7445, "step": 24140 }, { "epoch": 1.93, "grad_norm": 2.515625, "learning_rate": 6.24975224457347e-08, "loss": 1.7648, "step": 24160 }, { "epoch": 1.93, "grad_norm": 2.53125, "learning_rate": 5.972912404380805e-08, "loss": 1.7479, "step": 24180 }, { "epoch": 1.93, "grad_norm": 2.59375, "learning_rate": 5.70232535590487e-08, "loss": 1.7474, "step": 24200 }, { "epoch": 1.93, "grad_norm": 2.4375, "learning_rate": 5.437992801236802e-08, "loss": 1.7522, "step": 24220 }, { "epoch": 1.94, "grad_norm": 2.640625, "learning_rate": 5.179916403124097e-08, "loss": 1.7673, "step": 24240 }, { "epoch": 1.94, "grad_norm": 2.671875, "learning_rate": 4.928097784961394e-08, "loss": 1.7651, "step": 24260 }, { "epoch": 1.94, "grad_norm": 2.5, "learning_rate": 4.682538530779268e-08, "loss": 1.7342, "step": 24280 }, { "epoch": 1.94, "grad_norm": 2.4375, "learning_rate": 4.4432401852346765e-08, "loss": 1.7453, "step": 24300 }, { "epoch": 1.94, "grad_norm": 2.546875, "learning_rate": 4.2102042536011914e-08, "loss": 1.7627, "step": 24320 }, { "epoch": 1.94, "grad_norm": 2.453125, "learning_rate": 3.983432201759563e-08, "loss": 1.7569, "step": 24340 }, { "epoch": 1.94, "grad_norm": 2.546875, "learning_rate": 3.762925456188393e-08, "loss": 1.738, "step": 24360 }, { "epoch": 1.95, "grad_norm": 2.546875, "learning_rate": 3.5486854039552546e-08, "loss": 1.7552, "step": 24380 }, { "epoch": 1.95, "grad_norm": 2.546875, "learning_rate": 3.340713392708028e-08, "loss": 1.7822, "step": 24400 }, { "epoch": 1.95, "grad_norm": 2.390625, "learning_rate": 3.139010730666248e-08, "loss": 1.7597, "step": 24420 }, { "epoch": 1.95, "grad_norm": 2.5, "learning_rate": 2.9435786866128803e-08, "loss": 1.7347, "step": 24440 }, { "epoch": 1.95, "grad_norm": 2.5625, "learning_rate": 2.7544184898865568e-08, "loss": 1.7423, "step": 24460 }, { "epoch": 1.95, "grad_norm": 2.515625, "learning_rate": 2.5715313303737997e-08, "loss": 1.7643, "step": 24480 }, { "epoch": 1.96, "grad_norm": 2.53125, "learning_rate": 2.3949183585011415e-08, "loss": 1.7526, "step": 24500 }, { "epoch": 1.96, "grad_norm": 2.640625, "learning_rate": 2.2245806852285723e-08, "loss": 1.7543, "step": 24520 }, { "epoch": 1.96, "grad_norm": 2.5625, "learning_rate": 2.0605193820417703e-08, "loss": 1.7374, "step": 24540 }, { "epoch": 1.96, "grad_norm": 2.4375, "learning_rate": 1.9027354809461053e-08, "loss": 1.7546, "step": 24560 }, { "epoch": 1.96, "grad_norm": 2.34375, "learning_rate": 1.751229974459645e-08, "loss": 1.7331, "step": 24580 }, { "epoch": 1.96, "grad_norm": 2.640625, "learning_rate": 1.60600381560716e-08, "loss": 1.7638, "step": 24600 }, { "epoch": 1.97, "grad_norm": 2.390625, "learning_rate": 1.4670579179137945e-08, "loss": 1.7468, "step": 24620 }, { "epoch": 1.97, "grad_norm": 2.46875, "learning_rate": 1.3343931553999601e-08, "loss": 1.7356, "step": 24640 }, { "epoch": 1.97, "grad_norm": 2.6875, "learning_rate": 1.2080103625751183e-08, "loss": 1.7594, "step": 24660 }, { "epoch": 1.97, "grad_norm": 2.40625, "learning_rate": 1.0879103344328956e-08, "loss": 1.746, "step": 24680 }, { "epoch": 1.97, "grad_norm": 2.453125, "learning_rate": 9.740938264463096e-09, "loss": 1.739, "step": 24700 }, { "epoch": 1.97, "grad_norm": 2.515625, "learning_rate": 8.665615545625505e-09, "loss": 1.7597, "step": 24720 }, { "epoch": 1.98, "grad_norm": 2.390625, "learning_rate": 7.65314195198541e-09, "loss": 1.7667, "step": 24740 }, { "epoch": 1.98, "grad_norm": 2.578125, "learning_rate": 6.7035238523716075e-09, "loss": 1.7564, "step": 24760 }, { "epoch": 1.98, "grad_norm": 2.578125, "learning_rate": 5.8167672202269486e-09, "loss": 1.7443, "step": 24780 }, { "epoch": 1.98, "grad_norm": 2.578125, "learning_rate": 4.992877633570592e-09, "loss": 1.7579, "step": 24800 } ], "logging_steps": 20, "max_steps": 25052, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 200, "total_flos": 3.850482409609259e+19, "train_batch_size": 32, "trial_name": null, "trial_params": null }