{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9997999866657777, "eval_steps": 500, "global_step": 3749, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 1.3359375, "learning_rate": 5.333333333333335e-07, "loss": 2.017, "step": 10 }, { "epoch": 0.01, "grad_norm": 1.734375, "learning_rate": 1.066666666666667e-06, "loss": 2.3713, "step": 20 }, { "epoch": 0.01, "grad_norm": 1.234375, "learning_rate": 1.6000000000000001e-06, "loss": 2.1764, "step": 30 }, { "epoch": 0.01, "grad_norm": 0.90234375, "learning_rate": 2.133333333333334e-06, "loss": 2.1757, "step": 40 }, { "epoch": 0.01, "grad_norm": 1.4453125, "learning_rate": 2.666666666666667e-06, "loss": 2.1736, "step": 50 }, { "epoch": 0.02, "grad_norm": 1.125, "learning_rate": 3.2000000000000003e-06, "loss": 2.1865, "step": 60 }, { "epoch": 0.02, "grad_norm": 1.1328125, "learning_rate": 3.7333333333333337e-06, "loss": 2.0263, "step": 70 }, { "epoch": 0.02, "grad_norm": 1.015625, "learning_rate": 4.266666666666668e-06, "loss": 1.9312, "step": 80 }, { "epoch": 0.02, "grad_norm": 2.171875, "learning_rate": 4.800000000000001e-06, "loss": 1.8754, "step": 90 }, { "epoch": 0.03, "grad_norm": 0.70703125, "learning_rate": 5.333333333333334e-06, "loss": 1.7123, "step": 100 }, { "epoch": 0.03, "grad_norm": 0.90625, "learning_rate": 5.8666666666666675e-06, "loss": 1.8266, "step": 110 }, { "epoch": 0.03, "grad_norm": 0.97265625, "learning_rate": 6.4000000000000006e-06, "loss": 1.6079, "step": 120 }, { "epoch": 0.03, "grad_norm": 0.7578125, "learning_rate": 6.9333333333333344e-06, "loss": 1.5006, "step": 130 }, { "epoch": 0.04, "grad_norm": 0.3359375, "learning_rate": 7.4666666666666675e-06, "loss": 1.4158, "step": 140 }, { "epoch": 0.04, "grad_norm": 0.068359375, "learning_rate": 8.000000000000001e-06, "loss": 1.3677, "step": 150 }, { "epoch": 0.04, "grad_norm": 0.384765625, "learning_rate": 8.533333333333335e-06, "loss": 1.374, "step": 160 }, { "epoch": 0.05, "grad_norm": 0.296875, "learning_rate": 9.066666666666667e-06, "loss": 1.3052, "step": 170 }, { "epoch": 0.05, "grad_norm": 0.0927734375, "learning_rate": 9.600000000000001e-06, "loss": 1.275, "step": 180 }, { "epoch": 0.05, "grad_norm": 0.10498046875, "learning_rate": 1.0133333333333335e-05, "loss": 1.2574, "step": 190 }, { "epoch": 0.05, "grad_norm": 0.09765625, "learning_rate": 1.0666666666666667e-05, "loss": 1.2239, "step": 200 }, { "epoch": 0.06, "grad_norm": 0.2109375, "learning_rate": 1.1200000000000001e-05, "loss": 1.1565, "step": 210 }, { "epoch": 0.06, "grad_norm": 0.392578125, "learning_rate": 1.1733333333333335e-05, "loss": 1.1758, "step": 220 }, { "epoch": 0.06, "grad_norm": 0.138671875, "learning_rate": 1.2266666666666667e-05, "loss": 1.1843, "step": 230 }, { "epoch": 0.06, "grad_norm": 0.22265625, "learning_rate": 1.2800000000000001e-05, "loss": 1.1936, "step": 240 }, { "epoch": 0.07, "grad_norm": 0.07861328125, "learning_rate": 1.3333333333333333e-05, "loss": 1.1426, "step": 250 }, { "epoch": 0.07, "grad_norm": 0.051513671875, "learning_rate": 1.3866666666666669e-05, "loss": 1.114, "step": 260 }, { "epoch": 0.07, "grad_norm": 0.08154296875, "learning_rate": 1.4400000000000001e-05, "loss": 1.1195, "step": 270 }, { "epoch": 0.07, "grad_norm": 0.043701171875, "learning_rate": 1.4933333333333335e-05, "loss": 1.1256, "step": 280 }, { "epoch": 0.08, "grad_norm": 0.041015625, "learning_rate": 1.546666666666667e-05, "loss": 1.102, "step": 290 }, { "epoch": 0.08, "grad_norm": 0.1103515625, "learning_rate": 1.6000000000000003e-05, "loss": 1.0762, "step": 300 }, { "epoch": 0.08, "grad_norm": 0.0419921875, "learning_rate": 1.6533333333333333e-05, "loss": 1.0761, "step": 310 }, { "epoch": 0.09, "grad_norm": 0.09765625, "learning_rate": 1.706666666666667e-05, "loss": 1.0501, "step": 320 }, { "epoch": 0.09, "grad_norm": 0.041015625, "learning_rate": 1.76e-05, "loss": 1.0105, "step": 330 }, { "epoch": 0.09, "grad_norm": 0.10302734375, "learning_rate": 1.8133333333333335e-05, "loss": 1.0232, "step": 340 }, { "epoch": 0.09, "grad_norm": 0.035888671875, "learning_rate": 1.866666666666667e-05, "loss": 0.989, "step": 350 }, { "epoch": 0.1, "grad_norm": 0.034423828125, "learning_rate": 1.9200000000000003e-05, "loss": 0.9894, "step": 360 }, { "epoch": 0.1, "grad_norm": 0.0830078125, "learning_rate": 1.9733333333333336e-05, "loss": 0.9824, "step": 370 }, { "epoch": 0.1, "grad_norm": 0.0439453125, "learning_rate": 1.999989162756852e-05, "loss": 0.9569, "step": 380 }, { "epoch": 0.1, "grad_norm": 0.034912109375, "learning_rate": 1.999902466221011e-05, "loss": 0.928, "step": 390 }, { "epoch": 0.11, "grad_norm": 0.061767578125, "learning_rate": 1.9997290806656996e-05, "loss": 0.9349, "step": 400 }, { "epoch": 0.11, "grad_norm": 0.03369140625, "learning_rate": 1.9994690211230084e-05, "loss": 0.9489, "step": 410 }, { "epoch": 0.11, "grad_norm": 0.0289306640625, "learning_rate": 1.999122310139442e-05, "loss": 0.916, "step": 420 }, { "epoch": 0.11, "grad_norm": 0.034912109375, "learning_rate": 1.9986889777739686e-05, "loss": 0.9541, "step": 430 }, { "epoch": 0.12, "grad_norm": 0.02978515625, "learning_rate": 1.9981690615954097e-05, "loss": 0.9155, "step": 440 }, { "epoch": 0.12, "grad_norm": 0.0263671875, "learning_rate": 1.9975626066791855e-05, "loss": 0.9015, "step": 450 }, { "epoch": 0.12, "grad_norm": 0.0269775390625, "learning_rate": 1.996869665603406e-05, "loss": 0.9115, "step": 460 }, { "epoch": 0.13, "grad_norm": 0.047119140625, "learning_rate": 1.996090298444313e-05, "loss": 0.896, "step": 470 }, { "epoch": 0.13, "grad_norm": 0.0291748046875, "learning_rate": 1.9952245727710723e-05, "loss": 0.8855, "step": 480 }, { "epoch": 0.13, "grad_norm": 0.193359375, "learning_rate": 1.9942725636399136e-05, "loss": 0.9084, "step": 490 }, { "epoch": 0.13, "grad_norm": 0.02783203125, "learning_rate": 1.9932343535876255e-05, "loss": 0.8899, "step": 500 }, { "epoch": 0.14, "grad_norm": 0.392578125, "learning_rate": 1.9921100326243977e-05, "loss": 0.8952, "step": 510 }, { "epoch": 0.14, "grad_norm": 0.038330078125, "learning_rate": 1.9908996982260196e-05, "loss": 0.8857, "step": 520 }, { "epoch": 0.14, "grad_norm": 0.09521484375, "learning_rate": 1.9896034553254284e-05, "loss": 0.8677, "step": 530 }, { "epoch": 0.14, "grad_norm": 0.03076171875, "learning_rate": 1.988221416303611e-05, "loss": 0.8637, "step": 540 }, { "epoch": 0.15, "grad_norm": 0.041748046875, "learning_rate": 1.986753700979861e-05, "loss": 0.8384, "step": 550 }, { "epoch": 0.15, "grad_norm": 0.026611328125, "learning_rate": 1.985200436601392e-05, "loss": 0.9015, "step": 560 }, { "epoch": 0.15, "grad_norm": 0.05419921875, "learning_rate": 1.9835617578323038e-05, "loss": 0.8564, "step": 570 }, { "epoch": 0.15, "grad_norm": 0.0281982421875, "learning_rate": 1.9818378067419092e-05, "loss": 0.8183, "step": 580 }, { "epoch": 0.16, "grad_norm": 0.0252685546875, "learning_rate": 1.9800287327924152e-05, "loss": 0.8358, "step": 590 }, { "epoch": 0.16, "grad_norm": 0.059814453125, "learning_rate": 1.9781346928259662e-05, "loss": 0.8564, "step": 600 }, { "epoch": 0.16, "grad_norm": 0.0235595703125, "learning_rate": 1.9761558510510453e-05, "loss": 0.8295, "step": 610 }, { "epoch": 0.17, "grad_norm": 0.0255126953125, "learning_rate": 1.974092379028239e-05, "loss": 0.8403, "step": 620 }, { "epoch": 0.17, "grad_norm": 0.064453125, "learning_rate": 1.9719444556553616e-05, "loss": 0.8582, "step": 630 }, { "epoch": 0.17, "grad_norm": 0.0267333984375, "learning_rate": 1.969712267151948e-05, "loss": 0.8328, "step": 640 }, { "epoch": 0.17, "grad_norm": 0.03564453125, "learning_rate": 1.9673960070431043e-05, "loss": 0.8571, "step": 650 }, { "epoch": 0.18, "grad_norm": 0.031494140625, "learning_rate": 1.9649958761427364e-05, "loss": 0.824, "step": 660 }, { "epoch": 0.18, "grad_norm": 0.026123046875, "learning_rate": 1.9625120825361326e-05, "loss": 0.8418, "step": 670 }, { "epoch": 0.18, "grad_norm": 0.0247802734375, "learning_rate": 1.9599448415619283e-05, "loss": 0.8267, "step": 680 }, { "epoch": 0.18, "grad_norm": 0.026123046875, "learning_rate": 1.957294375793435e-05, "loss": 0.8677, "step": 690 }, { "epoch": 0.19, "grad_norm": 0.031005859375, "learning_rate": 1.954560915019343e-05, "loss": 0.827, "step": 700 }, { "epoch": 0.19, "grad_norm": 0.031494140625, "learning_rate": 1.951744696223801e-05, "loss": 0.8303, "step": 710 }, { "epoch": 0.19, "grad_norm": 0.024169921875, "learning_rate": 1.9488459635658687e-05, "loss": 0.8455, "step": 720 }, { "epoch": 0.19, "grad_norm": 0.041259765625, "learning_rate": 1.945864968358349e-05, "loss": 0.8297, "step": 730 }, { "epoch": 0.2, "grad_norm": 0.026611328125, "learning_rate": 1.9428019690460008e-05, "loss": 0.8516, "step": 740 }, { "epoch": 0.2, "grad_norm": 0.0269775390625, "learning_rate": 1.939657231183132e-05, "loss": 0.8538, "step": 750 }, { "epoch": 0.2, "grad_norm": 0.037109375, "learning_rate": 1.9364310274105758e-05, "loss": 0.8045, "step": 760 }, { "epoch": 0.21, "grad_norm": 0.034423828125, "learning_rate": 1.933123637432054e-05, "loss": 0.8436, "step": 770 }, { "epoch": 0.21, "grad_norm": 0.023681640625, "learning_rate": 1.929735347989929e-05, "loss": 0.8123, "step": 780 }, { "epoch": 0.21, "grad_norm": 0.0238037109375, "learning_rate": 1.92626645284034e-05, "loss": 0.8184, "step": 790 }, { "epoch": 0.21, "grad_norm": 0.047119140625, "learning_rate": 1.92271725272774e-05, "loss": 0.8222, "step": 800 }, { "epoch": 0.22, "grad_norm": 0.0277099609375, "learning_rate": 1.919088055358818e-05, "loss": 0.8434, "step": 810 }, { "epoch": 0.22, "grad_norm": 0.224609375, "learning_rate": 1.9153791753758236e-05, "loss": 0.8413, "step": 820 }, { "epoch": 0.22, "grad_norm": 0.05517578125, "learning_rate": 1.911590934329288e-05, "loss": 0.8323, "step": 830 }, { "epoch": 0.22, "grad_norm": 0.1201171875, "learning_rate": 1.9077236606501465e-05, "loss": 0.7935, "step": 840 }, { "epoch": 0.23, "grad_norm": 0.022705078125, "learning_rate": 1.903777689621263e-05, "loss": 0.8186, "step": 850 }, { "epoch": 0.23, "grad_norm": 0.0242919921875, "learning_rate": 1.899753363348364e-05, "loss": 0.8094, "step": 860 }, { "epoch": 0.23, "grad_norm": 0.0267333984375, "learning_rate": 1.895651030730378e-05, "loss": 0.8265, "step": 870 }, { "epoch": 0.23, "grad_norm": 0.031494140625, "learning_rate": 1.891471047429186e-05, "loss": 0.9105, "step": 880 }, { "epoch": 0.24, "grad_norm": 0.030517578125, "learning_rate": 1.8872137758387873e-05, "loss": 0.8028, "step": 890 }, { "epoch": 0.24, "grad_norm": 0.0263671875, "learning_rate": 1.8828795850538804e-05, "loss": 0.8322, "step": 900 }, { "epoch": 0.24, "grad_norm": 0.0240478515625, "learning_rate": 1.8784688508378655e-05, "loss": 0.798, "step": 910 }, { "epoch": 0.25, "grad_norm": 0.0277099609375, "learning_rate": 1.8739819555902626e-05, "loss": 0.8202, "step": 920 }, { "epoch": 0.25, "grad_norm": 0.025146484375, "learning_rate": 1.8694192883135632e-05, "loss": 0.8819, "step": 930 }, { "epoch": 0.25, "grad_norm": 0.027587890625, "learning_rate": 1.8647812445795003e-05, "loss": 0.8273, "step": 940 }, { "epoch": 0.25, "grad_norm": 0.0291748046875, "learning_rate": 1.8600682264947566e-05, "loss": 0.8294, "step": 950 }, { "epoch": 0.26, "grad_norm": 0.033447265625, "learning_rate": 1.8552806426661022e-05, "loss": 0.8222, "step": 960 }, { "epoch": 0.26, "grad_norm": 0.185546875, "learning_rate": 1.8504189081649678e-05, "loss": 0.8109, "step": 970 }, { "epoch": 0.26, "grad_norm": 0.0284423828125, "learning_rate": 1.8454834444914607e-05, "loss": 0.7972, "step": 980 }, { "epoch": 0.26, "grad_norm": 0.0225830078125, "learning_rate": 1.8404746795378218e-05, "loss": 0.8107, "step": 990 }, { "epoch": 0.27, "grad_norm": 0.087890625, "learning_rate": 1.8353930475513268e-05, "loss": 0.8208, "step": 1000 }, { "epoch": 0.27, "grad_norm": 0.0272216796875, "learning_rate": 1.8302389890966404e-05, "loss": 0.7933, "step": 1010 }, { "epoch": 0.27, "grad_norm": 0.049072265625, "learning_rate": 1.8250129510176183e-05, "loss": 0.814, "step": 1020 }, { "epoch": 0.27, "grad_norm": 0.0225830078125, "learning_rate": 1.8197153863985686e-05, "loss": 0.8251, "step": 1030 }, { "epoch": 0.28, "grad_norm": 0.024658203125, "learning_rate": 1.8143467545249694e-05, "loss": 0.7962, "step": 1040 }, { "epoch": 0.28, "grad_norm": 0.0263671875, "learning_rate": 1.8089075208436507e-05, "loss": 0.8164, "step": 1050 }, { "epoch": 0.28, "grad_norm": 0.0223388671875, "learning_rate": 1.8033981569224404e-05, "loss": 0.8081, "step": 1060 }, { "epoch": 0.29, "grad_norm": 0.026123046875, "learning_rate": 1.797819140409282e-05, "loss": 0.8211, "step": 1070 }, { "epoch": 0.29, "grad_norm": 0.0252685546875, "learning_rate": 1.7921709549908222e-05, "loss": 0.8526, "step": 1080 }, { "epoch": 0.29, "grad_norm": 0.030517578125, "learning_rate": 1.7864540903504777e-05, "loss": 0.8195, "step": 1090 }, { "epoch": 0.29, "grad_norm": 0.0230712890625, "learning_rate": 1.7806690421259794e-05, "loss": 0.7951, "step": 1100 }, { "epoch": 0.3, "grad_norm": 0.04443359375, "learning_rate": 1.774816311866404e-05, "loss": 0.8389, "step": 1110 }, { "epoch": 0.3, "grad_norm": 0.023681640625, "learning_rate": 1.768896406988689e-05, "loss": 0.8031, "step": 1120 }, { "epoch": 0.3, "grad_norm": 0.0281982421875, "learning_rate": 1.7629098407336415e-05, "loss": 0.8188, "step": 1130 }, { "epoch": 0.3, "grad_norm": 0.0242919921875, "learning_rate": 1.756857132121443e-05, "loss": 0.8232, "step": 1140 }, { "epoch": 0.31, "grad_norm": 0.043701171875, "learning_rate": 1.7507388059066492e-05, "loss": 0.8126, "step": 1150 }, { "epoch": 0.31, "grad_norm": 0.06982421875, "learning_rate": 1.7445553925326963e-05, "loss": 0.7775, "step": 1160 }, { "epoch": 0.31, "grad_norm": 0.049560546875, "learning_rate": 1.7383074280859132e-05, "loss": 0.8205, "step": 1170 }, { "epoch": 0.31, "grad_norm": 0.0240478515625, "learning_rate": 1.7319954542490448e-05, "loss": 0.8158, "step": 1180 }, { "epoch": 0.32, "grad_norm": 0.041259765625, "learning_rate": 1.725620018254286e-05, "loss": 0.8146, "step": 1190 }, { "epoch": 0.32, "grad_norm": 0.271484375, "learning_rate": 1.7191816728358435e-05, "loss": 0.8259, "step": 1200 }, { "epoch": 0.32, "grad_norm": 0.021240234375, "learning_rate": 1.71268097618201e-05, "loss": 0.8213, "step": 1210 }, { "epoch": 0.33, "grad_norm": 0.02490234375, "learning_rate": 1.706118491886774e-05, "loss": 0.7947, "step": 1220 }, { "epoch": 0.33, "grad_norm": 0.0296630859375, "learning_rate": 1.6994947889009563e-05, "loss": 0.8364, "step": 1230 }, { "epoch": 0.33, "grad_norm": 0.0247802734375, "learning_rate": 1.692810441482884e-05, "loss": 0.8268, "step": 1240 }, { "epoch": 0.33, "grad_norm": 0.0238037109375, "learning_rate": 1.6860660291486023e-05, "loss": 0.7868, "step": 1250 }, { "epoch": 0.34, "grad_norm": 0.02392578125, "learning_rate": 1.6792621366216338e-05, "loss": 0.8402, "step": 1260 }, { "epoch": 0.34, "grad_norm": 0.036865234375, "learning_rate": 1.6723993537822837e-05, "loss": 0.81, "step": 1270 }, { "epoch": 0.34, "grad_norm": 0.06298828125, "learning_rate": 1.6654782756164983e-05, "loss": 0.8151, "step": 1280 }, { "epoch": 0.34, "grad_norm": 0.0235595703125, "learning_rate": 1.6584995021642814e-05, "loss": 0.7716, "step": 1290 }, { "epoch": 0.35, "grad_norm": 0.0252685546875, "learning_rate": 1.651463638467673e-05, "loss": 0.8236, "step": 1300 }, { "epoch": 0.35, "grad_norm": 0.031005859375, "learning_rate": 1.6443712945182933e-05, "loss": 0.7794, "step": 1310 }, { "epoch": 0.35, "grad_norm": 0.03955078125, "learning_rate": 1.637223085204457e-05, "loss": 0.8222, "step": 1320 }, { "epoch": 0.35, "grad_norm": 0.049560546875, "learning_rate": 1.630019630257865e-05, "loss": 0.8134, "step": 1330 }, { "epoch": 0.36, "grad_norm": 0.0244140625, "learning_rate": 1.6227615541998756e-05, "loss": 0.8209, "step": 1340 }, { "epoch": 0.36, "grad_norm": 0.0301513671875, "learning_rate": 1.6154494862873588e-05, "loss": 0.79, "step": 1350 }, { "epoch": 0.36, "grad_norm": 0.0240478515625, "learning_rate": 1.6080840604581435e-05, "loss": 0.812, "step": 1360 }, { "epoch": 0.37, "grad_norm": 0.0260009765625, "learning_rate": 1.600665915276054e-05, "loss": 0.8119, "step": 1370 }, { "epoch": 0.37, "grad_norm": 0.0732421875, "learning_rate": 1.5931956938755494e-05, "loss": 0.8073, "step": 1380 }, { "epoch": 0.37, "grad_norm": 0.0220947265625, "learning_rate": 1.585674043905966e-05, "loss": 0.8205, "step": 1390 }, { "epoch": 0.37, "grad_norm": 0.0255126953125, "learning_rate": 1.5781016174753675e-05, "loss": 0.8222, "step": 1400 }, { "epoch": 0.38, "grad_norm": 0.0302734375, "learning_rate": 1.5704790710940074e-05, "loss": 0.8307, "step": 1410 }, { "epoch": 0.38, "grad_norm": 0.028076171875, "learning_rate": 1.5628070656174135e-05, "loss": 0.8038, "step": 1420 }, { "epoch": 0.38, "grad_norm": 0.0233154296875, "learning_rate": 1.5550862661890918e-05, "loss": 0.7824, "step": 1430 }, { "epoch": 0.38, "grad_norm": 0.0252685546875, "learning_rate": 1.547317342182861e-05, "loss": 0.7891, "step": 1440 }, { "epoch": 0.39, "grad_norm": 0.058837890625, "learning_rate": 1.5395009671448186e-05, "loss": 0.8051, "step": 1450 }, { "epoch": 0.39, "grad_norm": 0.0284423828125, "learning_rate": 1.5316378187349476e-05, "loss": 0.7899, "step": 1460 }, { "epoch": 0.39, "grad_norm": 0.02734375, "learning_rate": 1.5237285786683638e-05, "loss": 0.827, "step": 1470 }, { "epoch": 0.39, "grad_norm": 0.034912109375, "learning_rate": 1.515773932656213e-05, "loss": 0.8107, "step": 1480 }, { "epoch": 0.4, "grad_norm": 0.0213623046875, "learning_rate": 1.5077745703462228e-05, "loss": 0.7785, "step": 1490 }, { "epoch": 0.4, "grad_norm": 0.0274658203125, "learning_rate": 1.4997311852629097e-05, "loss": 0.8053, "step": 1500 }, { "epoch": 0.4, "grad_norm": 0.029296875, "learning_rate": 1.4916444747474542e-05, "loss": 0.8034, "step": 1510 }, { "epoch": 0.41, "grad_norm": 0.023681640625, "learning_rate": 1.4835151398972424e-05, "loss": 0.8117, "step": 1520 }, { "epoch": 0.41, "grad_norm": 0.0303955078125, "learning_rate": 1.475343885505083e-05, "loss": 0.7811, "step": 1530 }, { "epoch": 0.41, "grad_norm": 0.166015625, "learning_rate": 1.4671314199981019e-05, "loss": 0.8069, "step": 1540 }, { "epoch": 0.41, "grad_norm": 0.024658203125, "learning_rate": 1.4588784553763262e-05, "loss": 0.8312, "step": 1550 }, { "epoch": 0.42, "grad_norm": 0.035888671875, "learning_rate": 1.4505857071509523e-05, "loss": 0.8132, "step": 1560 }, { "epoch": 0.42, "grad_norm": 0.04736328125, "learning_rate": 1.4422538942823158e-05, "loss": 0.812, "step": 1570 }, { "epoch": 0.42, "grad_norm": 0.024658203125, "learning_rate": 1.4338837391175582e-05, "loss": 0.822, "step": 1580 }, { "epoch": 0.42, "grad_norm": 0.025146484375, "learning_rate": 1.425475967328001e-05, "loss": 0.8269, "step": 1590 }, { "epoch": 0.43, "grad_norm": 0.046630859375, "learning_rate": 1.4170313078462318e-05, "loss": 0.8027, "step": 1600 }, { "epoch": 0.43, "grad_norm": 0.115234375, "learning_rate": 1.4085504928029086e-05, "loss": 0.8123, "step": 1610 }, { "epoch": 0.43, "grad_norm": 0.0269775390625, "learning_rate": 1.4000342574632846e-05, "loss": 0.8028, "step": 1620 }, { "epoch": 0.43, "grad_norm": 0.0284423828125, "learning_rate": 1.3914833401634642e-05, "loss": 0.7812, "step": 1630 }, { "epoch": 0.44, "grad_norm": 0.036865234375, "learning_rate": 1.3828984822463895e-05, "loss": 0.8294, "step": 1640 }, { "epoch": 0.44, "grad_norm": 0.03369140625, "learning_rate": 1.3742804279975686e-05, "loss": 0.8118, "step": 1650 }, { "epoch": 0.44, "grad_norm": 0.0235595703125, "learning_rate": 1.3656299245805476e-05, "loss": 0.8086, "step": 1660 }, { "epoch": 0.45, "grad_norm": 0.03955078125, "learning_rate": 1.3569477219721336e-05, "loss": 0.8155, "step": 1670 }, { "epoch": 0.45, "grad_norm": 0.0262451171875, "learning_rate": 1.3482345728973742e-05, "loss": 0.8394, "step": 1680 }, { "epoch": 0.45, "grad_norm": 0.0252685546875, "learning_rate": 1.3394912327642966e-05, "loss": 0.8172, "step": 1690 }, { "epoch": 0.45, "grad_norm": 0.0291748046875, "learning_rate": 1.330718459598417e-05, "loss": 0.7993, "step": 1700 }, { "epoch": 0.46, "grad_norm": 0.0311279296875, "learning_rate": 1.3219170139770213e-05, "loss": 0.7824, "step": 1710 }, { "epoch": 0.46, "grad_norm": 0.0380859375, "learning_rate": 1.3130876589632243e-05, "loss": 0.7982, "step": 1720 }, { "epoch": 0.46, "grad_norm": 0.037109375, "learning_rate": 1.3042311600398157e-05, "loss": 0.8093, "step": 1730 }, { "epoch": 0.46, "grad_norm": 0.03271484375, "learning_rate": 1.2953482850428927e-05, "loss": 0.7864, "step": 1740 }, { "epoch": 0.47, "grad_norm": 0.0225830078125, "learning_rate": 1.2864398040952921e-05, "loss": 0.778, "step": 1750 }, { "epoch": 0.47, "grad_norm": 0.025634765625, "learning_rate": 1.2775064895398217e-05, "loss": 0.7869, "step": 1760 }, { "epoch": 0.47, "grad_norm": 0.0274658203125, "learning_rate": 1.2685491158723003e-05, "loss": 0.8012, "step": 1770 }, { "epoch": 0.47, "grad_norm": 0.0289306640625, "learning_rate": 1.2595684596744112e-05, "loss": 0.8385, "step": 1780 }, { "epoch": 0.48, "grad_norm": 0.045166015625, "learning_rate": 1.250565299546374e-05, "loss": 0.8062, "step": 1790 }, { "epoch": 0.48, "grad_norm": 0.025146484375, "learning_rate": 1.2415404160394429e-05, "loss": 0.8076, "step": 1800 }, { "epoch": 0.48, "grad_norm": 0.0240478515625, "learning_rate": 1.2324945915882334e-05, "loss": 0.7864, "step": 1810 }, { "epoch": 0.49, "grad_norm": 0.03125, "learning_rate": 1.2234286104428884e-05, "loss": 0.833, "step": 1820 }, { "epoch": 0.49, "grad_norm": 0.07275390625, "learning_rate": 1.2143432586010851e-05, "loss": 0.7848, "step": 1830 }, { "epoch": 0.49, "grad_norm": 0.037109375, "learning_rate": 1.2052393237398916e-05, "loss": 0.7838, "step": 1840 }, { "epoch": 0.49, "grad_norm": 0.0771484375, "learning_rate": 1.1961175951474766e-05, "loss": 0.808, "step": 1850 }, { "epoch": 0.5, "grad_norm": 0.0238037109375, "learning_rate": 1.1869788636546801e-05, "loss": 0.8076, "step": 1860 }, { "epoch": 0.5, "grad_norm": 0.028076171875, "learning_rate": 1.1778239215664512e-05, "loss": 0.8196, "step": 1870 }, { "epoch": 0.5, "grad_norm": 0.0322265625, "learning_rate": 1.1686535625931566e-05, "loss": 0.7873, "step": 1880 }, { "epoch": 0.5, "grad_norm": 0.046630859375, "learning_rate": 1.1594685817817673e-05, "loss": 0.8126, "step": 1890 }, { "epoch": 0.51, "grad_norm": 0.02490234375, "learning_rate": 1.1502697754469315e-05, "loss": 0.8462, "step": 1900 }, { "epoch": 0.51, "grad_norm": 0.029296875, "learning_rate": 1.141057941101935e-05, "loss": 0.8273, "step": 1910 }, { "epoch": 0.51, "grad_norm": 0.02294921875, "learning_rate": 1.1318338773895596e-05, "loss": 0.8222, "step": 1920 }, { "epoch": 0.51, "grad_norm": 0.026611328125, "learning_rate": 1.1225983840128418e-05, "loss": 0.7724, "step": 1930 }, { "epoch": 0.52, "grad_norm": 0.0400390625, "learning_rate": 1.1133522616657417e-05, "loss": 0.8072, "step": 1940 }, { "epoch": 0.52, "grad_norm": 0.0286865234375, "learning_rate": 1.104096311963724e-05, "loss": 0.8306, "step": 1950 }, { "epoch": 0.52, "grad_norm": 0.026611328125, "learning_rate": 1.0948313373742606e-05, "loss": 0.7973, "step": 1960 }, { "epoch": 0.53, "grad_norm": 0.037841796875, "learning_rate": 1.0855581411472576e-05, "loss": 0.8385, "step": 1970 }, { "epoch": 0.53, "grad_norm": 0.02197265625, "learning_rate": 1.076277527245417e-05, "loss": 0.8238, "step": 1980 }, { "epoch": 0.53, "grad_norm": 0.0284423828125, "learning_rate": 1.0669903002745343e-05, "loss": 0.8102, "step": 1990 }, { "epoch": 0.53, "grad_norm": 0.1826171875, "learning_rate": 1.0576972654137411e-05, "loss": 0.8189, "step": 2000 }, { "epoch": 0.54, "grad_norm": 0.0247802734375, "learning_rate": 1.0483992283456992e-05, "loss": 0.7938, "step": 2010 }, { "epoch": 0.54, "grad_norm": 0.0218505859375, "learning_rate": 1.0390969951867482e-05, "loss": 0.8171, "step": 2020 }, { "epoch": 0.54, "grad_norm": 0.0311279296875, "learning_rate": 1.0297913724170187e-05, "loss": 0.8082, "step": 2030 }, { "epoch": 0.54, "grad_norm": 0.0238037109375, "learning_rate": 1.0204831668105117e-05, "loss": 0.7972, "step": 2040 }, { "epoch": 0.55, "grad_norm": 0.0218505859375, "learning_rate": 1.011173185365154e-05, "loss": 0.7743, "step": 2050 }, { "epoch": 0.55, "grad_norm": 0.03515625, "learning_rate": 1.0018622352328331e-05, "loss": 0.8095, "step": 2060 }, { "epoch": 0.55, "grad_norm": 0.03564453125, "learning_rate": 9.92551123649419e-06, "loss": 0.796, "step": 2070 }, { "epoch": 0.55, "grad_norm": 0.02197265625, "learning_rate": 9.832406578647789e-06, "loss": 0.7923, "step": 2080 }, { "epoch": 0.56, "grad_norm": 0.0230712890625, "learning_rate": 9.739316450727914e-06, "loss": 0.8106, "step": 2090 }, { "epoch": 0.56, "grad_norm": 0.0244140625, "learning_rate": 9.646248923413639e-06, "loss": 0.7871, "step": 2100 }, { "epoch": 0.56, "grad_norm": 0.027099609375, "learning_rate": 9.553212065424625e-06, "loss": 0.798, "step": 2110 }, { "epoch": 0.57, "grad_norm": 0.0341796875, "learning_rate": 9.460213942821578e-06, "loss": 0.795, "step": 2120 }, { "epoch": 0.57, "grad_norm": 0.023681640625, "learning_rate": 9.367262618306947e-06, "loss": 0.8001, "step": 2130 }, { "epoch": 0.57, "grad_norm": 0.02294921875, "learning_rate": 9.274366150525902e-06, "loss": 0.766, "step": 2140 }, { "epoch": 0.57, "grad_norm": 0.02587890625, "learning_rate": 9.181532593367675e-06, "loss": 0.7999, "step": 2150 }, { "epoch": 0.58, "grad_norm": 0.0224609375, "learning_rate": 9.0887699952673e-06, "loss": 0.813, "step": 2160 }, { "epoch": 0.58, "grad_norm": 0.035400390625, "learning_rate": 8.996086398507848e-06, "loss": 0.8108, "step": 2170 }, { "epoch": 0.58, "grad_norm": 0.0267333984375, "learning_rate": 8.903489838523167e-06, "loss": 0.841, "step": 2180 }, { "epoch": 0.58, "grad_norm": 0.02783203125, "learning_rate": 8.81098834320124e-06, "loss": 0.8204, "step": 2190 }, { "epoch": 0.59, "grad_norm": 0.03759765625, "learning_rate": 8.71858993218818e-06, "loss": 0.7937, "step": 2200 }, { "epoch": 0.59, "grad_norm": 0.04931640625, "learning_rate": 8.626302616192955e-06, "loss": 0.799, "step": 2210 }, { "epoch": 0.59, "grad_norm": 0.025634765625, "learning_rate": 8.534134396292875e-06, "loss": 0.8033, "step": 2220 }, { "epoch": 0.59, "grad_norm": 0.0277099609375, "learning_rate": 8.442093263239913e-06, "loss": 0.8089, "step": 2230 }, { "epoch": 0.6, "grad_norm": 0.0228271484375, "learning_rate": 8.350187196767942e-06, "loss": 0.792, "step": 2240 }, { "epoch": 0.6, "grad_norm": 0.047607421875, "learning_rate": 8.258424164900899e-06, "loss": 0.8242, "step": 2250 }, { "epoch": 0.6, "grad_norm": 0.0250244140625, "learning_rate": 8.166812123261982e-06, "loss": 0.8058, "step": 2260 }, { "epoch": 0.61, "grad_norm": 0.0283203125, "learning_rate": 8.075359014383914e-06, "loss": 0.8084, "step": 2270 }, { "epoch": 0.61, "grad_norm": 0.0228271484375, "learning_rate": 7.984072767020359e-06, "loss": 0.7894, "step": 2280 }, { "epoch": 0.61, "grad_norm": 0.0419921875, "learning_rate": 7.892961295458496e-06, "loss": 0.7993, "step": 2290 }, { "epoch": 0.61, "grad_norm": 0.0260009765625, "learning_rate": 7.802032498832895e-06, "loss": 0.8036, "step": 2300 }, { "epoch": 0.62, "grad_norm": 0.0235595703125, "learning_rate": 7.71129426044066e-06, "loss": 0.7973, "step": 2310 }, { "epoch": 0.62, "grad_norm": 0.0242919921875, "learning_rate": 7.620754447057985e-06, "loss": 0.7964, "step": 2320 }, { "epoch": 0.62, "grad_norm": 0.036376953125, "learning_rate": 7.530420908258111e-06, "loss": 0.777, "step": 2330 }, { "epoch": 0.62, "grad_norm": 0.029541015625, "learning_rate": 7.4403014757308e-06, "loss": 0.8116, "step": 2340 }, { "epoch": 0.63, "grad_norm": 0.0263671875, "learning_rate": 7.350403962603335e-06, "loss": 0.8267, "step": 2350 }, { "epoch": 0.63, "grad_norm": 0.031982421875, "learning_rate": 7.260736162763149e-06, "loss": 0.8153, "step": 2360 }, { "epoch": 0.63, "grad_norm": 0.1923828125, "learning_rate": 7.171305850182113e-06, "loss": 0.8026, "step": 2370 }, { "epoch": 0.63, "grad_norm": 0.03466796875, "learning_rate": 7.082120778242554e-06, "loss": 0.8198, "step": 2380 }, { "epoch": 0.64, "grad_norm": 0.034423828125, "learning_rate": 6.993188679065048e-06, "loss": 0.7804, "step": 2390 }, { "epoch": 0.64, "grad_norm": 0.0245361328125, "learning_rate": 6.904517262838082e-06, "loss": 0.8251, "step": 2400 }, { "epoch": 0.64, "grad_norm": 0.047607421875, "learning_rate": 6.8161142171495785e-06, "loss": 0.849, "step": 2410 }, { "epoch": 0.65, "grad_norm": 0.02880859375, "learning_rate": 6.72798720632042e-06, "loss": 0.7912, "step": 2420 }, { "epoch": 0.65, "grad_norm": 0.0245361328125, "learning_rate": 6.640143870739956e-06, "loss": 0.7864, "step": 2430 }, { "epoch": 0.65, "grad_norm": 0.02392578125, "learning_rate": 6.552591826203616e-06, "loss": 0.7713, "step": 2440 }, { "epoch": 0.65, "grad_norm": 0.023681640625, "learning_rate": 6.4653386632526275e-06, "loss": 0.7925, "step": 2450 }, { "epoch": 0.66, "grad_norm": 0.045166015625, "learning_rate": 6.378391946515937e-06, "loss": 0.8402, "step": 2460 }, { "epoch": 0.66, "grad_norm": 0.023681640625, "learning_rate": 6.291759214054383e-06, "loss": 0.8032, "step": 2470 }, { "epoch": 0.66, "grad_norm": 0.0238037109375, "learning_rate": 6.205447976707154e-06, "loss": 0.7891, "step": 2480 }, { "epoch": 0.66, "grad_norm": 0.08935546875, "learning_rate": 6.119465717440629e-06, "loss": 0.819, "step": 2490 }, { "epoch": 0.67, "grad_norm": 0.0283203125, "learning_rate": 6.033819890699616e-06, "loss": 0.8318, "step": 2500 }, { "epoch": 0.67, "grad_norm": 0.0240478515625, "learning_rate": 5.94851792176107e-06, "loss": 0.8241, "step": 2510 }, { "epoch": 0.67, "grad_norm": 0.0311279296875, "learning_rate": 5.863567206090348e-06, "loss": 0.7844, "step": 2520 }, { "epoch": 0.67, "grad_norm": 0.025634765625, "learning_rate": 5.778975108700031e-06, "loss": 0.8273, "step": 2530 }, { "epoch": 0.68, "grad_norm": 0.03759765625, "learning_rate": 5.694748963511396e-06, "loss": 0.8117, "step": 2540 }, { "epoch": 0.68, "grad_norm": 0.0238037109375, "learning_rate": 5.610896072718603e-06, "loss": 0.8067, "step": 2550 }, { "epoch": 0.68, "grad_norm": 0.0213623046875, "learning_rate": 5.527423706155586e-06, "loss": 0.7988, "step": 2560 }, { "epoch": 0.69, "grad_norm": 0.0230712890625, "learning_rate": 5.4443391006657896e-06, "loss": 0.8286, "step": 2570 }, { "epoch": 0.69, "grad_norm": 0.033447265625, "learning_rate": 5.361649459474756e-06, "loss": 0.8054, "step": 2580 }, { "epoch": 0.69, "grad_norm": 0.0233154296875, "learning_rate": 5.279361951565618e-06, "loss": 0.7801, "step": 2590 }, { "epoch": 0.69, "grad_norm": 0.027587890625, "learning_rate": 5.197483711057569e-06, "loss": 0.8107, "step": 2600 }, { "epoch": 0.7, "grad_norm": 0.0247802734375, "learning_rate": 5.116021836587353e-06, "loss": 0.8273, "step": 2610 }, { "epoch": 0.7, "grad_norm": 0.0235595703125, "learning_rate": 5.0349833906938235e-06, "loss": 0.7919, "step": 2620 }, { "epoch": 0.7, "grad_norm": 0.0240478515625, "learning_rate": 4.954375399205655e-06, "loss": 0.8371, "step": 2630 }, { "epoch": 0.7, "grad_norm": 0.0257568359375, "learning_rate": 4.8742048506322045e-06, "loss": 0.7882, "step": 2640 }, { "epoch": 0.71, "grad_norm": 0.027099609375, "learning_rate": 4.794478695557631e-06, "loss": 0.8149, "step": 2650 }, { "epoch": 0.71, "grad_norm": 0.02685546875, "learning_rate": 4.715203846038312e-06, "loss": 0.8254, "step": 2660 }, { "epoch": 0.71, "grad_norm": 0.08447265625, "learning_rate": 4.636387175003558e-06, "loss": 0.8249, "step": 2670 }, { "epoch": 0.71, "grad_norm": 0.0264892578125, "learning_rate": 4.558035515659768e-06, "loss": 0.8004, "step": 2680 }, { "epoch": 0.72, "grad_norm": 0.0230712890625, "learning_rate": 4.480155660898001e-06, "loss": 0.7959, "step": 2690 }, { "epoch": 0.72, "grad_norm": 0.0341796875, "learning_rate": 4.402754362705051e-06, "loss": 0.786, "step": 2700 }, { "epoch": 0.72, "grad_norm": 0.033203125, "learning_rate": 4.325838331578061e-06, "loss": 0.7948, "step": 2710 }, { "epoch": 0.73, "grad_norm": 0.0269775390625, "learning_rate": 4.249414235942755e-06, "loss": 0.7959, "step": 2720 }, { "epoch": 0.73, "grad_norm": 0.045654296875, "learning_rate": 4.173488701575274e-06, "loss": 0.775, "step": 2730 }, { "epoch": 0.73, "grad_norm": 0.0341796875, "learning_rate": 4.098068311027772e-06, "loss": 0.7992, "step": 2740 }, { "epoch": 0.73, "grad_norm": 0.02783203125, "learning_rate": 4.023159603057698e-06, "loss": 0.8045, "step": 2750 }, { "epoch": 0.74, "grad_norm": 0.0244140625, "learning_rate": 3.948769072060927e-06, "loss": 0.7903, "step": 2760 }, { "epoch": 0.74, "grad_norm": 0.0458984375, "learning_rate": 3.874903167508688e-06, "loss": 0.7974, "step": 2770 }, { "epoch": 0.74, "grad_norm": 0.026123046875, "learning_rate": 3.801568293388421e-06, "loss": 0.7766, "step": 2780 }, { "epoch": 0.74, "grad_norm": 0.0400390625, "learning_rate": 3.728770807648574e-06, "loss": 0.8202, "step": 2790 }, { "epoch": 0.75, "grad_norm": 0.0250244140625, "learning_rate": 3.6565170216473744e-06, "loss": 0.8047, "step": 2800 }, { "epoch": 0.75, "grad_norm": 0.04931640625, "learning_rate": 3.584813199605658e-06, "loss": 0.8324, "step": 2810 }, { "epoch": 0.75, "grad_norm": 0.0255126953125, "learning_rate": 3.513665558063771e-06, "loss": 0.8203, "step": 2820 }, { "epoch": 0.75, "grad_norm": 0.034912109375, "learning_rate": 3.4430802653426176e-06, "loss": 0.7997, "step": 2830 }, { "epoch": 0.76, "grad_norm": 0.06298828125, "learning_rate": 3.373063441008877e-06, "loss": 0.7783, "step": 2840 }, { "epoch": 0.76, "grad_norm": 0.03857421875, "learning_rate": 3.303621155344453e-06, "loss": 0.8088, "step": 2850 }, { "epoch": 0.76, "grad_norm": 0.0250244140625, "learning_rate": 3.234759428820198e-06, "loss": 0.8231, "step": 2860 }, { "epoch": 0.77, "grad_norm": 0.0235595703125, "learning_rate": 3.1664842315739586e-06, "loss": 0.8005, "step": 2870 }, { "epoch": 0.77, "grad_norm": 0.03076171875, "learning_rate": 3.098801482892966e-06, "loss": 0.7926, "step": 2880 }, { "epoch": 0.77, "grad_norm": 0.11962890625, "learning_rate": 3.031717050700659e-06, "loss": 0.7926, "step": 2890 }, { "epoch": 0.77, "grad_norm": 0.045166015625, "learning_rate": 2.9652367510479476e-06, "loss": 0.8127, "step": 2900 }, { "epoch": 0.78, "grad_norm": 0.021728515625, "learning_rate": 2.899366347608974e-06, "loss": 0.8046, "step": 2910 }, { "epoch": 0.78, "grad_norm": 0.036865234375, "learning_rate": 2.834111551181423e-06, "loss": 0.7647, "step": 2920 }, { "epoch": 0.78, "grad_norm": 0.0260009765625, "learning_rate": 2.7694780191914005e-06, "loss": 0.7985, "step": 2930 }, { "epoch": 0.78, "grad_norm": 0.04052734375, "learning_rate": 2.7054713552029577e-06, "loss": 0.841, "step": 2940 }, { "epoch": 0.79, "grad_norm": 0.041748046875, "learning_rate": 2.6420971084322745e-06, "loss": 0.7775, "step": 2950 }, { "epoch": 0.79, "grad_norm": 0.0244140625, "learning_rate": 2.5793607732665402e-06, "loss": 0.7954, "step": 2960 }, { "epoch": 0.79, "grad_norm": 0.03125, "learning_rate": 2.5172677887876416e-06, "loss": 0.7875, "step": 2970 }, { "epoch": 0.79, "grad_norm": 0.02490234375, "learning_rate": 2.455823538300569e-06, "loss": 0.8065, "step": 2980 }, { "epoch": 0.8, "grad_norm": 0.0224609375, "learning_rate": 2.3950333488667178e-06, "loss": 0.8201, "step": 2990 }, { "epoch": 0.8, "grad_norm": 0.06298828125, "learning_rate": 2.3349024908420403e-06, "loss": 0.813, "step": 3000 }, { "epoch": 0.8, "grad_norm": 0.03076171875, "learning_rate": 2.2754361774201217e-06, "loss": 0.8076, "step": 3010 }, { "epoch": 0.81, "grad_norm": 0.0341796875, "learning_rate": 2.2166395641802076e-06, "loss": 0.8135, "step": 3020 }, { "epoch": 0.81, "grad_norm": 0.0673828125, "learning_rate": 2.1585177486402275e-06, "loss": 0.8095, "step": 3030 }, { "epoch": 0.81, "grad_norm": 0.0341796875, "learning_rate": 2.101075769814855e-06, "loss": 0.7985, "step": 3040 }, { "epoch": 0.81, "grad_norm": 0.04150390625, "learning_rate": 2.0443186077786358e-06, "loss": 0.7976, "step": 3050 }, { "epoch": 0.82, "grad_norm": 0.04736328125, "learning_rate": 1.9882511832342297e-06, "loss": 0.7868, "step": 3060 }, { "epoch": 0.82, "grad_norm": 0.0262451171875, "learning_rate": 1.9328783570857954e-06, "loss": 0.8072, "step": 3070 }, { "epoch": 0.82, "grad_norm": 0.04296875, "learning_rate": 1.8782049300175698e-06, "loss": 0.7931, "step": 3080 }, { "epoch": 0.82, "grad_norm": 0.0269775390625, "learning_rate": 1.8242356420776485e-06, "loss": 0.8185, "step": 3090 }, { "epoch": 0.83, "grad_norm": 0.033447265625, "learning_rate": 1.7709751722670466e-06, "loss": 0.8391, "step": 3100 }, { "epoch": 0.83, "grad_norm": 0.0274658203125, "learning_rate": 1.718428138134034e-06, "loss": 0.8502, "step": 3110 }, { "epoch": 0.83, "grad_norm": 0.02783203125, "learning_rate": 1.666599095373811e-06, "loss": 0.8185, "step": 3120 }, { "epoch": 0.83, "grad_norm": 0.025390625, "learning_rate": 1.6154925374335362e-06, "loss": 0.806, "step": 3130 }, { "epoch": 0.84, "grad_norm": 0.0225830078125, "learning_rate": 1.5651128951227613e-06, "loss": 0.783, "step": 3140 }, { "epoch": 0.84, "grad_norm": 0.033935546875, "learning_rate": 1.5154645362292853e-06, "loss": 0.8159, "step": 3150 }, { "epoch": 0.84, "grad_norm": 0.025390625, "learning_rate": 1.4665517651404814e-06, "loss": 0.7793, "step": 3160 }, { "epoch": 0.85, "grad_norm": 0.0220947265625, "learning_rate": 1.4183788224701201e-06, "loss": 0.7754, "step": 3170 }, { "epoch": 0.85, "grad_norm": 0.0269775390625, "learning_rate": 1.370949884690711e-06, "loss": 0.847, "step": 3180 }, { "epoch": 0.85, "grad_norm": 0.035888671875, "learning_rate": 1.3242690637714228e-06, "loss": 0.8014, "step": 3190 }, { "epoch": 0.85, "grad_norm": 0.0238037109375, "learning_rate": 1.2783404068215776e-06, "loss": 0.8235, "step": 3200 }, { "epoch": 0.86, "grad_norm": 0.033447265625, "learning_rate": 1.2331678957397819e-06, "loss": 0.7997, "step": 3210 }, { "epoch": 0.86, "grad_norm": 0.033203125, "learning_rate": 1.1887554468687046e-06, "loss": 0.8159, "step": 3220 }, { "epoch": 0.86, "grad_norm": 0.033935546875, "learning_rate": 1.145106910655538e-06, "loss": 0.8392, "step": 3230 }, { "epoch": 0.86, "grad_norm": 0.023193359375, "learning_rate": 1.1022260713181786e-06, "loss": 0.8266, "step": 3240 }, { "epoch": 0.87, "grad_norm": 0.03662109375, "learning_rate": 1.0601166465171387e-06, "loss": 0.8209, "step": 3250 }, { "epoch": 0.87, "grad_norm": 0.0294189453125, "learning_rate": 1.0187822870332398e-06, "loss": 0.8021, "step": 3260 }, { "epoch": 0.87, "grad_norm": 0.051025390625, "learning_rate": 9.782265764510968e-07, "loss": 0.7938, "step": 3270 }, { "epoch": 0.87, "grad_norm": 0.0291748046875, "learning_rate": 9.384530308484275e-07, "loss": 0.8007, "step": 3280 }, { "epoch": 0.88, "grad_norm": 0.0279541015625, "learning_rate": 8.99465098491229e-07, "loss": 0.8097, "step": 3290 }, { "epoch": 0.88, "grad_norm": 0.032470703125, "learning_rate": 8.612661595348038e-07, "loss": 0.8029, "step": 3300 }, { "epoch": 0.88, "grad_norm": 0.0234375, "learning_rate": 8.238595257307225e-07, "loss": 0.8079, "step": 3310 }, { "epoch": 0.89, "grad_norm": 0.03125, "learning_rate": 7.872484401397018e-07, "loss": 0.8074, "step": 3320 }, { "epoch": 0.89, "grad_norm": 0.024658203125, "learning_rate": 7.514360768504314e-07, "loss": 0.7961, "step": 3330 }, { "epoch": 0.89, "grad_norm": 0.03125, "learning_rate": 7.164255407043986e-07, "loss": 0.7701, "step": 3340 }, { "epoch": 0.89, "grad_norm": 0.0269775390625, "learning_rate": 6.822198670266989e-07, "loss": 0.8165, "step": 3350 }, { "epoch": 0.9, "grad_norm": 0.0277099609375, "learning_rate": 6.488220213628837e-07, "loss": 0.8097, "step": 3360 }, { "epoch": 0.9, "grad_norm": 0.0322265625, "learning_rate": 6.16234899221858e-07, "loss": 0.8073, "step": 3370 }, { "epoch": 0.9, "grad_norm": 0.0234375, "learning_rate": 5.844613258248411e-07, "loss": 0.7865, "step": 3380 }, { "epoch": 0.9, "grad_norm": 0.130859375, "learning_rate": 5.535040558604299e-07, "loss": 0.8182, "step": 3390 }, { "epoch": 0.91, "grad_norm": 0.0233154296875, "learning_rate": 5.233657732457775e-07, "loss": 0.78, "step": 3400 }, { "epoch": 0.91, "grad_norm": 0.0284423828125, "learning_rate": 4.940490908938977e-07, "loss": 0.7961, "step": 3410 }, { "epoch": 0.91, "grad_norm": 0.0478515625, "learning_rate": 4.6555655048713953e-07, "loss": 0.7946, "step": 3420 }, { "epoch": 0.91, "grad_norm": 0.023193359375, "learning_rate": 4.3789062225682356e-07, "loss": 0.8022, "step": 3430 }, { "epoch": 0.92, "grad_norm": 0.060302734375, "learning_rate": 4.1105370476908104e-07, "loss": 0.8201, "step": 3440 }, { "epoch": 0.92, "grad_norm": 0.042236328125, "learning_rate": 3.8504812471690687e-07, "loss": 0.8014, "step": 3450 }, { "epoch": 0.92, "grad_norm": 0.02392578125, "learning_rate": 3.598761367184367e-07, "loss": 0.7995, "step": 3460 }, { "epoch": 0.93, "grad_norm": 0.02880859375, "learning_rate": 3.3553992312148177e-07, "loss": 0.8139, "step": 3470 }, { "epoch": 0.93, "grad_norm": 0.029052734375, "learning_rate": 3.1204159381432174e-07, "loss": 0.8127, "step": 3480 }, { "epoch": 0.93, "grad_norm": 0.028564453125, "learning_rate": 2.8938318604278314e-07, "loss": 0.8009, "step": 3490 }, { "epoch": 0.93, "grad_norm": 0.03173828125, "learning_rate": 2.675666642336172e-07, "loss": 0.8319, "step": 3500 }, { "epoch": 0.94, "grad_norm": 0.031494140625, "learning_rate": 2.4659391982418626e-07, "loss": 0.8413, "step": 3510 }, { "epoch": 0.94, "grad_norm": 0.08642578125, "learning_rate": 2.264667710984836e-07, "loss": 0.8194, "step": 3520 }, { "epoch": 0.94, "grad_norm": 0.0289306640625, "learning_rate": 2.0718696302949092e-07, "loss": 0.8128, "step": 3530 }, { "epoch": 0.94, "grad_norm": 0.0771484375, "learning_rate": 1.8875616712789257e-07, "loss": 0.8008, "step": 3540 }, { "epoch": 0.95, "grad_norm": 0.024658203125, "learning_rate": 1.7117598129716362e-07, "loss": 0.7753, "step": 3550 }, { "epoch": 0.95, "grad_norm": 0.0260009765625, "learning_rate": 1.544479296950341e-07, "loss": 0.802, "step": 3560 }, { "epoch": 0.95, "grad_norm": 0.025634765625, "learning_rate": 1.385734626013435e-07, "loss": 0.7936, "step": 3570 }, { "epoch": 0.95, "grad_norm": 0.0240478515625, "learning_rate": 1.2355395629231493e-07, "loss": 0.7956, "step": 3580 }, { "epoch": 0.96, "grad_norm": 0.08251953125, "learning_rate": 1.0939071292122572e-07, "loss": 0.7769, "step": 3590 }, { "epoch": 0.96, "grad_norm": 0.0279541015625, "learning_rate": 9.608496040551918e-08, "loss": 0.7957, "step": 3600 }, { "epoch": 0.96, "grad_norm": 0.0277099609375, "learning_rate": 8.363785232034849e-08, "loss": 0.814, "step": 3610 }, { "epoch": 0.97, "grad_norm": 0.03173828125, "learning_rate": 7.205046779856007e-08, "loss": 0.8155, "step": 3620 }, { "epoch": 0.97, "grad_norm": 0.03173828125, "learning_rate": 6.132381143713728e-08, "loss": 0.8364, "step": 3630 }, { "epoch": 0.97, "grad_norm": 0.0264892578125, "learning_rate": 5.1458813210106815e-08, "loss": 0.8173, "step": 3640 }, { "epoch": 0.97, "grad_norm": 0.038818359375, "learning_rate": 4.245632838791092e-08, "loss": 0.8085, "step": 3650 }, { "epoch": 0.98, "grad_norm": 0.0225830078125, "learning_rate": 3.431713746325449e-08, "loss": 0.8195, "step": 3660 }, { "epoch": 0.98, "grad_norm": 0.0751953125, "learning_rate": 2.7041946083442573e-08, "loss": 0.8029, "step": 3670 }, { "epoch": 0.98, "grad_norm": 0.06494140625, "learning_rate": 2.0631384989202585e-08, "loss": 0.8359, "step": 3680 }, { "epoch": 0.98, "grad_norm": 0.030517578125, "learning_rate": 1.5086009959995875e-08, "loss": 0.8233, "step": 3690 }, { "epoch": 0.99, "grad_norm": 0.0228271484375, "learning_rate": 1.0406301765837346e-08, "loss": 0.7743, "step": 3700 }, { "epoch": 0.99, "grad_norm": 0.024658203125, "learning_rate": 6.592666125614377e-09, "loss": 0.7764, "step": 3710 }, { "epoch": 0.99, "grad_norm": 0.023193359375, "learning_rate": 3.645433671908283e-09, "loss": 0.798, "step": 3720 }, { "epoch": 0.99, "grad_norm": 0.0260009765625, "learning_rate": 1.5648599223316852e-09, "loss": 0.8119, "step": 3730 }, { "epoch": 1.0, "grad_norm": 0.0247802734375, "learning_rate": 3.5112525737734935e-10, "loss": 0.8295, "step": 3740 } ], "logging_steps": 10, "max_steps": 3749, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "total_flos": 1.2690756062895145e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }