{ "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 16845, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0029682398337785693, "grad_norm": 0.48303863406181335, "learning_rate": 4.997031760166221e-05, "loss": 1.3426, "step": 10 }, { "epoch": 0.005936479667557139, "grad_norm": 0.5679779052734375, "learning_rate": 4.9940635203324435e-05, "loss": 1.2914, "step": 20 }, { "epoch": 0.008904719501335707, "grad_norm": 0.565687358379364, "learning_rate": 4.9910952804986644e-05, "loss": 1.2217, "step": 30 }, { "epoch": 0.011872959335114277, "grad_norm": 0.5573933720588684, "learning_rate": 4.988127040664886e-05, "loss": 1.1249, "step": 40 }, { "epoch": 0.014841199168892847, "grad_norm": 0.6410918831825256, "learning_rate": 4.9851588008311077e-05, "loss": 1.1554, "step": 50 }, { "epoch": 0.017809439002671415, "grad_norm": 0.6899677515029907, "learning_rate": 4.982190560997329e-05, "loss": 1.1577, "step": 60 }, { "epoch": 0.020777678836449986, "grad_norm": 0.6809150576591492, "learning_rate": 4.97922232116355e-05, "loss": 1.107, "step": 70 }, { "epoch": 0.023745918670228554, "grad_norm": 0.7853492498397827, "learning_rate": 4.976254081329771e-05, "loss": 1.1337, "step": 80 }, { "epoch": 0.026714158504007122, "grad_norm": 0.6834965944290161, "learning_rate": 4.9732858414959934e-05, "loss": 1.1123, "step": 90 }, { "epoch": 0.029682398337785694, "grad_norm": 0.6894986629486084, "learning_rate": 4.9703176016622144e-05, "loss": 1.1234, "step": 100 }, { "epoch": 0.032650638171564265, "grad_norm": 0.662402868270874, "learning_rate": 4.967349361828436e-05, "loss": 1.1082, "step": 110 }, { "epoch": 0.03561887800534283, "grad_norm": 0.7717655897140503, "learning_rate": 4.9643811219946576e-05, "loss": 1.1124, "step": 120 }, { "epoch": 0.0385871178391214, "grad_norm": 0.8598708510398865, "learning_rate": 4.961412882160879e-05, "loss": 1.0867, "step": 130 }, { "epoch": 0.04155535767289997, "grad_norm": 0.7680957317352295, "learning_rate": 4.9584446423271e-05, "loss": 1.096, "step": 140 }, { "epoch": 0.04452359750667854, "grad_norm": 0.6783134937286377, "learning_rate": 4.955476402493322e-05, "loss": 1.0742, "step": 150 }, { "epoch": 0.04749183734045711, "grad_norm": 0.8696405291557312, "learning_rate": 4.9525081626595434e-05, "loss": 1.0774, "step": 160 }, { "epoch": 0.05046007717423568, "grad_norm": 0.8478761315345764, "learning_rate": 4.9495399228257644e-05, "loss": 1.1037, "step": 170 }, { "epoch": 0.053428317008014245, "grad_norm": 0.7295342087745667, "learning_rate": 4.946571682991986e-05, "loss": 1.0566, "step": 180 }, { "epoch": 0.056396556841792816, "grad_norm": 0.7539967894554138, "learning_rate": 4.9436034431582076e-05, "loss": 1.1014, "step": 190 }, { "epoch": 0.05936479667557139, "grad_norm": 0.7601701021194458, "learning_rate": 4.940635203324429e-05, "loss": 1.0948, "step": 200 }, { "epoch": 0.06233303650934995, "grad_norm": 0.7360059022903442, "learning_rate": 4.93766696349065e-05, "loss": 1.0612, "step": 210 }, { "epoch": 0.06530127634312853, "grad_norm": 0.758794367313385, "learning_rate": 4.934698723656872e-05, "loss": 1.0671, "step": 220 }, { "epoch": 0.06826951617690709, "grad_norm": 0.7825369834899902, "learning_rate": 4.9317304838230934e-05, "loss": 1.1034, "step": 230 }, { "epoch": 0.07123775601068566, "grad_norm": 0.858249306678772, "learning_rate": 4.928762243989314e-05, "loss": 1.0904, "step": 240 }, { "epoch": 0.07420599584446423, "grad_norm": 0.8293972015380859, "learning_rate": 4.925794004155536e-05, "loss": 1.0787, "step": 250 }, { "epoch": 0.0771742356782428, "grad_norm": 0.8243985176086426, "learning_rate": 4.9228257643217576e-05, "loss": 1.0855, "step": 260 }, { "epoch": 0.08014247551202137, "grad_norm": 0.7798173427581787, "learning_rate": 4.919857524487979e-05, "loss": 1.0975, "step": 270 }, { "epoch": 0.08311071534579995, "grad_norm": 0.7953142523765564, "learning_rate": 4.9168892846542e-05, "loss": 1.0904, "step": 280 }, { "epoch": 0.0860789551795785, "grad_norm": 0.7199780941009521, "learning_rate": 4.913921044820422e-05, "loss": 1.0577, "step": 290 }, { "epoch": 0.08904719501335707, "grad_norm": 0.7431743741035461, "learning_rate": 4.9109528049866433e-05, "loss": 1.1162, "step": 300 }, { "epoch": 0.09201543484713565, "grad_norm": 0.7822330594062805, "learning_rate": 4.907984565152864e-05, "loss": 1.0389, "step": 310 }, { "epoch": 0.09498367468091422, "grad_norm": 0.8207771182060242, "learning_rate": 4.905016325319086e-05, "loss": 1.0779, "step": 320 }, { "epoch": 0.09795191451469279, "grad_norm": 0.7167501449584961, "learning_rate": 4.9020480854853075e-05, "loss": 1.0698, "step": 330 }, { "epoch": 0.10092015434847136, "grad_norm": 0.7531447410583496, "learning_rate": 4.899079845651529e-05, "loss": 1.0787, "step": 340 }, { "epoch": 0.10388839418224993, "grad_norm": 0.8100646734237671, "learning_rate": 4.89611160581775e-05, "loss": 1.0529, "step": 350 }, { "epoch": 0.10685663401602849, "grad_norm": 0.7299244403839111, "learning_rate": 4.893143365983972e-05, "loss": 1.0262, "step": 360 }, { "epoch": 0.10982487384980706, "grad_norm": 0.845340371131897, "learning_rate": 4.890175126150193e-05, "loss": 1.1234, "step": 370 }, { "epoch": 0.11279311368358563, "grad_norm": 0.7792521119117737, "learning_rate": 4.887206886316414e-05, "loss": 1.0919, "step": 380 }, { "epoch": 0.1157613535173642, "grad_norm": 0.6663903594017029, "learning_rate": 4.884238646482636e-05, "loss": 1.0471, "step": 390 }, { "epoch": 0.11872959335114278, "grad_norm": 0.822321355342865, "learning_rate": 4.8812704066488575e-05, "loss": 1.0181, "step": 400 }, { "epoch": 0.12169783318492135, "grad_norm": 0.837409257888794, "learning_rate": 4.878302166815079e-05, "loss": 1.102, "step": 410 }, { "epoch": 0.1246660730186999, "grad_norm": 0.7892317771911621, "learning_rate": 4.8753339269813e-05, "loss": 1.0608, "step": 420 }, { "epoch": 0.12763431285247848, "grad_norm": 0.7766367197036743, "learning_rate": 4.872365687147522e-05, "loss": 1.0525, "step": 430 }, { "epoch": 0.13060255268625706, "grad_norm": 0.6542902588844299, "learning_rate": 4.869397447313743e-05, "loss": 1.0306, "step": 440 }, { "epoch": 0.13357079252003562, "grad_norm": 0.8308093547821045, "learning_rate": 4.866429207479964e-05, "loss": 1.0462, "step": 450 }, { "epoch": 0.13653903235381418, "grad_norm": 0.7098076343536377, "learning_rate": 4.8634609676461865e-05, "loss": 1.0403, "step": 460 }, { "epoch": 0.13950727218759276, "grad_norm": 0.6637741923332214, "learning_rate": 4.8604927278124075e-05, "loss": 1.0298, "step": 470 }, { "epoch": 0.14247551202137132, "grad_norm": 0.7593216896057129, "learning_rate": 4.857524487978629e-05, "loss": 1.006, "step": 480 }, { "epoch": 0.1454437518551499, "grad_norm": 0.7959597706794739, "learning_rate": 4.85455624814485e-05, "loss": 1.0231, "step": 490 }, { "epoch": 0.14841199168892846, "grad_norm": 0.7332357168197632, "learning_rate": 4.8515880083110716e-05, "loss": 1.0455, "step": 500 }, { "epoch": 0.15138023152270705, "grad_norm": 0.6890411972999573, "learning_rate": 4.848619768477293e-05, "loss": 1.0561, "step": 510 }, { "epoch": 0.1543484713564856, "grad_norm": 0.7440224885940552, "learning_rate": 4.845651528643514e-05, "loss": 1.0515, "step": 520 }, { "epoch": 0.15731671119026416, "grad_norm": 0.832664966583252, "learning_rate": 4.8426832888097365e-05, "loss": 1.0584, "step": 530 }, { "epoch": 0.16028495102404275, "grad_norm": 0.6851795315742493, "learning_rate": 4.8397150489759574e-05, "loss": 1.0327, "step": 540 }, { "epoch": 0.1632531908578213, "grad_norm": 0.9479386806488037, "learning_rate": 4.836746809142179e-05, "loss": 1.0745, "step": 550 }, { "epoch": 0.1662214306915999, "grad_norm": 0.7720438241958618, "learning_rate": 4.8337785693084007e-05, "loss": 1.0472, "step": 560 }, { "epoch": 0.16918967052537845, "grad_norm": 0.7193672060966492, "learning_rate": 4.8308103294746216e-05, "loss": 1.0452, "step": 570 }, { "epoch": 0.172157910359157, "grad_norm": 0.7027491331100464, "learning_rate": 4.827842089640843e-05, "loss": 1.0444, "step": 580 }, { "epoch": 0.1751261501929356, "grad_norm": 0.7128656506538391, "learning_rate": 4.824873849807064e-05, "loss": 1.0367, "step": 590 }, { "epoch": 0.17809439002671415, "grad_norm": 0.7210800051689148, "learning_rate": 4.8219056099732865e-05, "loss": 1.0319, "step": 600 }, { "epoch": 0.18106262986049274, "grad_norm": 0.7800968885421753, "learning_rate": 4.8189373701395074e-05, "loss": 1.0222, "step": 610 }, { "epoch": 0.1840308696942713, "grad_norm": 0.7994186878204346, "learning_rate": 4.815969130305729e-05, "loss": 1.0582, "step": 620 }, { "epoch": 0.18699910952804988, "grad_norm": 0.723895788192749, "learning_rate": 4.8130008904719506e-05, "loss": 1.0431, "step": 630 }, { "epoch": 0.18996734936182844, "grad_norm": 0.6798288226127625, "learning_rate": 4.8100326506381716e-05, "loss": 1.0365, "step": 640 }, { "epoch": 0.192935589195607, "grad_norm": 0.7264121174812317, "learning_rate": 4.807064410804393e-05, "loss": 1.0839, "step": 650 }, { "epoch": 0.19590382902938558, "grad_norm": 0.7775474786758423, "learning_rate": 4.804096170970615e-05, "loss": 1.0839, "step": 660 }, { "epoch": 0.19887206886316414, "grad_norm": 0.8966870903968811, "learning_rate": 4.8011279311368364e-05, "loss": 1.075, "step": 670 }, { "epoch": 0.20184030869694272, "grad_norm": 0.7206711173057556, "learning_rate": 4.7981596913030574e-05, "loss": 1.0616, "step": 680 }, { "epoch": 0.20480854853072128, "grad_norm": 0.6880177855491638, "learning_rate": 4.795191451469279e-05, "loss": 1.0684, "step": 690 }, { "epoch": 0.20777678836449986, "grad_norm": 0.6823096871376038, "learning_rate": 4.7922232116355006e-05, "loss": 1.0385, "step": 700 }, { "epoch": 0.21074502819827842, "grad_norm": 0.7464755773544312, "learning_rate": 4.7892549718017215e-05, "loss": 1.052, "step": 710 }, { "epoch": 0.21371326803205698, "grad_norm": 0.7194072604179382, "learning_rate": 4.786286731967943e-05, "loss": 1.0424, "step": 720 }, { "epoch": 0.21668150786583557, "grad_norm": 0.6561169028282166, "learning_rate": 4.783318492134165e-05, "loss": 1.0605, "step": 730 }, { "epoch": 0.21964974769961412, "grad_norm": 0.7468820214271545, "learning_rate": 4.7803502523003864e-05, "loss": 1.0656, "step": 740 }, { "epoch": 0.2226179875333927, "grad_norm": 0.784570038318634, "learning_rate": 4.777382012466607e-05, "loss": 1.0684, "step": 750 }, { "epoch": 0.22558622736717127, "grad_norm": 0.8690618276596069, "learning_rate": 4.774413772632829e-05, "loss": 1.0058, "step": 760 }, { "epoch": 0.22855446720094982, "grad_norm": 0.7325943112373352, "learning_rate": 4.7714455327990506e-05, "loss": 1.0016, "step": 770 }, { "epoch": 0.2315227070347284, "grad_norm": 0.8618629574775696, "learning_rate": 4.7684772929652715e-05, "loss": 1.0219, "step": 780 }, { "epoch": 0.23449094686850697, "grad_norm": 0.6887000203132629, "learning_rate": 4.765509053131493e-05, "loss": 1.054, "step": 790 }, { "epoch": 0.23745918670228555, "grad_norm": 0.6989481449127197, "learning_rate": 4.762540813297715e-05, "loss": 1.0484, "step": 800 }, { "epoch": 0.2404274265360641, "grad_norm": 0.7582924365997314, "learning_rate": 4.7595725734639364e-05, "loss": 1.054, "step": 810 }, { "epoch": 0.2433956663698427, "grad_norm": 0.6527506709098816, "learning_rate": 4.756604333630157e-05, "loss": 1.0076, "step": 820 }, { "epoch": 0.24636390620362125, "grad_norm": 0.6977763175964355, "learning_rate": 4.753636093796379e-05, "loss": 1.0274, "step": 830 }, { "epoch": 0.2493321460373998, "grad_norm": 0.6364858746528625, "learning_rate": 4.7506678539626005e-05, "loss": 1.0389, "step": 840 }, { "epoch": 0.25230038587117837, "grad_norm": 0.7233136296272278, "learning_rate": 4.7476996141288215e-05, "loss": 1.0177, "step": 850 }, { "epoch": 0.25526862570495695, "grad_norm": 0.759456217288971, "learning_rate": 4.744731374295043e-05, "loss": 1.0332, "step": 860 }, { "epoch": 0.25823686553873554, "grad_norm": 0.7150506973266602, "learning_rate": 4.741763134461265e-05, "loss": 1.0146, "step": 870 }, { "epoch": 0.2612051053725141, "grad_norm": 0.7771095633506775, "learning_rate": 4.738794894627486e-05, "loss": 1.0138, "step": 880 }, { "epoch": 0.26417334520629265, "grad_norm": 0.6592842936515808, "learning_rate": 4.735826654793707e-05, "loss": 1.0729, "step": 890 }, { "epoch": 0.26714158504007124, "grad_norm": 0.7347294092178345, "learning_rate": 4.732858414959929e-05, "loss": 1.073, "step": 900 }, { "epoch": 0.2701098248738498, "grad_norm": 0.7552283406257629, "learning_rate": 4.7298901751261505e-05, "loss": 1.0434, "step": 910 }, { "epoch": 0.27307806470762835, "grad_norm": 0.7497857213020325, "learning_rate": 4.7269219352923714e-05, "loss": 1.0236, "step": 920 }, { "epoch": 0.27604630454140694, "grad_norm": 0.6930972933769226, "learning_rate": 4.723953695458593e-05, "loss": 1.0303, "step": 930 }, { "epoch": 0.2790145443751855, "grad_norm": 0.674025297164917, "learning_rate": 4.720985455624815e-05, "loss": 1.0226, "step": 940 }, { "epoch": 0.2819827842089641, "grad_norm": 0.7063450217247009, "learning_rate": 4.718017215791036e-05, "loss": 1.0147, "step": 950 }, { "epoch": 0.28495102404274264, "grad_norm": 0.7937106490135193, "learning_rate": 4.715048975957257e-05, "loss": 1.0197, "step": 960 }, { "epoch": 0.2879192638765212, "grad_norm": 0.7385783195495605, "learning_rate": 4.7120807361234795e-05, "loss": 1.0337, "step": 970 }, { "epoch": 0.2908875037102998, "grad_norm": 0.7633090019226074, "learning_rate": 4.7091124962897005e-05, "loss": 1.0096, "step": 980 }, { "epoch": 0.29385574354407834, "grad_norm": 0.8703083992004395, "learning_rate": 4.7061442564559214e-05, "loss": 1.0368, "step": 990 }, { "epoch": 0.2968239833778569, "grad_norm": 0.7346276640892029, "learning_rate": 4.703176016622143e-05, "loss": 1.0115, "step": 1000 }, { "epoch": 0.2997922232116355, "grad_norm": 0.7063167691230774, "learning_rate": 4.7002077767883646e-05, "loss": 1.0252, "step": 1010 }, { "epoch": 0.3027604630454141, "grad_norm": 0.7117500305175781, "learning_rate": 4.697239536954586e-05, "loss": 1.0244, "step": 1020 }, { "epoch": 0.3057287028791926, "grad_norm": 0.6929411292076111, "learning_rate": 4.694271297120807e-05, "loss": 1.003, "step": 1030 }, { "epoch": 0.3086969427129712, "grad_norm": 0.6956213712692261, "learning_rate": 4.6913030572870295e-05, "loss": 1.0508, "step": 1040 }, { "epoch": 0.3116651825467498, "grad_norm": 0.6457957029342651, "learning_rate": 4.6883348174532504e-05, "loss": 1.0415, "step": 1050 }, { "epoch": 0.3146334223805283, "grad_norm": 0.7128310203552246, "learning_rate": 4.6853665776194714e-05, "loss": 1.0391, "step": 1060 }, { "epoch": 0.3176016622143069, "grad_norm": 0.8085856437683105, "learning_rate": 4.682398337785694e-05, "loss": 1.0181, "step": 1070 }, { "epoch": 0.3205699020480855, "grad_norm": 0.6627590656280518, "learning_rate": 4.6794300979519146e-05, "loss": 0.9902, "step": 1080 }, { "epoch": 0.323538141881864, "grad_norm": 0.7251258492469788, "learning_rate": 4.676461858118136e-05, "loss": 0.9954, "step": 1090 }, { "epoch": 0.3265063817156426, "grad_norm": 0.6958507299423218, "learning_rate": 4.673493618284357e-05, "loss": 1.0152, "step": 1100 }, { "epoch": 0.3294746215494212, "grad_norm": 0.714284360408783, "learning_rate": 4.6705253784505795e-05, "loss": 1.0322, "step": 1110 }, { "epoch": 0.3324428613831998, "grad_norm": 0.6643791198730469, "learning_rate": 4.6675571386168004e-05, "loss": 0.9957, "step": 1120 }, { "epoch": 0.3354111012169783, "grad_norm": 0.7135879397392273, "learning_rate": 4.6645888987830213e-05, "loss": 1.0158, "step": 1130 }, { "epoch": 0.3383793410507569, "grad_norm": 0.7087671160697937, "learning_rate": 4.6616206589492436e-05, "loss": 1.0146, "step": 1140 }, { "epoch": 0.3413475808845355, "grad_norm": 0.6507649421691895, "learning_rate": 4.6586524191154646e-05, "loss": 1.0216, "step": 1150 }, { "epoch": 0.344315820718314, "grad_norm": 0.7352689504623413, "learning_rate": 4.655684179281686e-05, "loss": 1.064, "step": 1160 }, { "epoch": 0.3472840605520926, "grad_norm": 0.7079006433486938, "learning_rate": 4.652715939447908e-05, "loss": 1.0028, "step": 1170 }, { "epoch": 0.3502523003858712, "grad_norm": 0.6333305239677429, "learning_rate": 4.6497476996141294e-05, "loss": 1.0121, "step": 1180 }, { "epoch": 0.35322054021964977, "grad_norm": 0.6222066879272461, "learning_rate": 4.6467794597803504e-05, "loss": 1.0151, "step": 1190 }, { "epoch": 0.3561887800534283, "grad_norm": 0.6540125608444214, "learning_rate": 4.643811219946571e-05, "loss": 0.9984, "step": 1200 }, { "epoch": 0.3591570198872069, "grad_norm": 0.7811010479927063, "learning_rate": 4.6408429801127936e-05, "loss": 1.035, "step": 1210 }, { "epoch": 0.36212525972098547, "grad_norm": 0.7098474502563477, "learning_rate": 4.6378747402790145e-05, "loss": 1.0029, "step": 1220 }, { "epoch": 0.365093499554764, "grad_norm": 0.7310671210289001, "learning_rate": 4.634906500445236e-05, "loss": 0.9954, "step": 1230 }, { "epoch": 0.3680617393885426, "grad_norm": 0.8500426411628723, "learning_rate": 4.631938260611458e-05, "loss": 0.9771, "step": 1240 }, { "epoch": 0.37102997922232117, "grad_norm": 0.6685720682144165, "learning_rate": 4.6289700207776794e-05, "loss": 1.007, "step": 1250 }, { "epoch": 0.37399821905609976, "grad_norm": 0.6676350831985474, "learning_rate": 4.6260017809439003e-05, "loss": 1.0382, "step": 1260 }, { "epoch": 0.3769664588898783, "grad_norm": 0.6712237000465393, "learning_rate": 4.623033541110122e-05, "loss": 1.0302, "step": 1270 }, { "epoch": 0.37993469872365687, "grad_norm": 0.6046270132064819, "learning_rate": 4.6200653012763436e-05, "loss": 1.0235, "step": 1280 }, { "epoch": 0.38290293855743546, "grad_norm": 0.6848681569099426, "learning_rate": 4.6170970614425645e-05, "loss": 1.0059, "step": 1290 }, { "epoch": 0.385871178391214, "grad_norm": 0.7414882183074951, "learning_rate": 4.614128821608786e-05, "loss": 1.0209, "step": 1300 }, { "epoch": 0.3888394182249926, "grad_norm": 0.7111567258834839, "learning_rate": 4.611160581775008e-05, "loss": 1.0614, "step": 1310 }, { "epoch": 0.39180765805877116, "grad_norm": 0.6949933767318726, "learning_rate": 4.6081923419412294e-05, "loss": 1.0055, "step": 1320 }, { "epoch": 0.39477589789254974, "grad_norm": 0.7095695734024048, "learning_rate": 4.60522410210745e-05, "loss": 0.9842, "step": 1330 }, { "epoch": 0.3977441377263283, "grad_norm": 0.8123998045921326, "learning_rate": 4.602255862273672e-05, "loss": 1.0609, "step": 1340 }, { "epoch": 0.40071237756010686, "grad_norm": 0.7480998039245605, "learning_rate": 4.5992876224398935e-05, "loss": 1.0, "step": 1350 }, { "epoch": 0.40368061739388544, "grad_norm": 0.7112509608268738, "learning_rate": 4.5963193826061145e-05, "loss": 1.005, "step": 1360 }, { "epoch": 0.406648857227664, "grad_norm": 0.7034320831298828, "learning_rate": 4.593351142772336e-05, "loss": 0.9875, "step": 1370 }, { "epoch": 0.40961709706144256, "grad_norm": 0.6642884612083435, "learning_rate": 4.590382902938558e-05, "loss": 1.0013, "step": 1380 }, { "epoch": 0.41258533689522114, "grad_norm": 0.6435369849205017, "learning_rate": 4.587414663104779e-05, "loss": 1.0212, "step": 1390 }, { "epoch": 0.41555357672899973, "grad_norm": 0.7438673377037048, "learning_rate": 4.584446423271e-05, "loss": 1.0401, "step": 1400 }, { "epoch": 0.41852181656277826, "grad_norm": 0.6593348979949951, "learning_rate": 4.581478183437222e-05, "loss": 1.0217, "step": 1410 }, { "epoch": 0.42149005639655684, "grad_norm": 0.7108020782470703, "learning_rate": 4.5785099436034435e-05, "loss": 0.9836, "step": 1420 }, { "epoch": 0.42445829623033543, "grad_norm": 0.6754259467124939, "learning_rate": 4.5755417037696645e-05, "loss": 1.0715, "step": 1430 }, { "epoch": 0.42742653606411396, "grad_norm": 0.6980791091918945, "learning_rate": 4.572573463935886e-05, "loss": 1.0257, "step": 1440 }, { "epoch": 0.43039477589789255, "grad_norm": 0.7558574080467224, "learning_rate": 4.569605224102108e-05, "loss": 0.9879, "step": 1450 }, { "epoch": 0.43336301573167113, "grad_norm": 0.6725367307662964, "learning_rate": 4.566636984268329e-05, "loss": 1.0153, "step": 1460 }, { "epoch": 0.4363312555654497, "grad_norm": 0.8197546005249023, "learning_rate": 4.56366874443455e-05, "loss": 0.977, "step": 1470 }, { "epoch": 0.43929949539922825, "grad_norm": 0.6687082052230835, "learning_rate": 4.560700504600772e-05, "loss": 1.0105, "step": 1480 }, { "epoch": 0.44226773523300683, "grad_norm": 0.7228406667709351, "learning_rate": 4.5577322647669935e-05, "loss": 0.9827, "step": 1490 }, { "epoch": 0.4452359750667854, "grad_norm": 0.7459806799888611, "learning_rate": 4.5547640249332144e-05, "loss": 1.0305, "step": 1500 }, { "epoch": 0.44820421490056395, "grad_norm": 0.6408952474594116, "learning_rate": 4.551795785099437e-05, "loss": 0.9861, "step": 1510 }, { "epoch": 0.45117245473434253, "grad_norm": 0.7314732074737549, "learning_rate": 4.5488275452656577e-05, "loss": 0.9961, "step": 1520 }, { "epoch": 0.4541406945681211, "grad_norm": 0.7654784917831421, "learning_rate": 4.545859305431879e-05, "loss": 1.019, "step": 1530 }, { "epoch": 0.45710893440189965, "grad_norm": 0.7305381298065186, "learning_rate": 4.5428910655981e-05, "loss": 0.9938, "step": 1540 }, { "epoch": 0.46007717423567823, "grad_norm": 0.6413453221321106, "learning_rate": 4.539922825764322e-05, "loss": 0.994, "step": 1550 }, { "epoch": 0.4630454140694568, "grad_norm": 0.6420827507972717, "learning_rate": 4.5369545859305434e-05, "loss": 1.0237, "step": 1560 }, { "epoch": 0.4660136539032354, "grad_norm": 0.6597052216529846, "learning_rate": 4.5339863460967644e-05, "loss": 1.0142, "step": 1570 }, { "epoch": 0.46898189373701393, "grad_norm": 0.727504312992096, "learning_rate": 4.531018106262987e-05, "loss": 1.0349, "step": 1580 }, { "epoch": 0.4719501335707925, "grad_norm": 0.6038479208946228, "learning_rate": 4.5280498664292076e-05, "loss": 0.9878, "step": 1590 }, { "epoch": 0.4749183734045711, "grad_norm": 0.6358582973480225, "learning_rate": 4.525081626595429e-05, "loss": 0.9822, "step": 1600 }, { "epoch": 0.47788661323834963, "grad_norm": 0.6879693865776062, "learning_rate": 4.522113386761651e-05, "loss": 1.022, "step": 1610 }, { "epoch": 0.4808548530721282, "grad_norm": 0.6969800591468811, "learning_rate": 4.519145146927872e-05, "loss": 0.999, "step": 1620 }, { "epoch": 0.4838230929059068, "grad_norm": 0.632183849811554, "learning_rate": 4.5161769070940934e-05, "loss": 0.9845, "step": 1630 }, { "epoch": 0.4867913327396854, "grad_norm": 0.6474229097366333, "learning_rate": 4.5132086672603144e-05, "loss": 0.9895, "step": 1640 }, { "epoch": 0.4897595725734639, "grad_norm": 0.7708784937858582, "learning_rate": 4.5102404274265367e-05, "loss": 1.0074, "step": 1650 }, { "epoch": 0.4927278124072425, "grad_norm": 0.7153667211532593, "learning_rate": 4.5072721875927576e-05, "loss": 0.9792, "step": 1660 }, { "epoch": 0.4956960522410211, "grad_norm": 0.7407136559486389, "learning_rate": 4.504303947758979e-05, "loss": 1.008, "step": 1670 }, { "epoch": 0.4986642920747996, "grad_norm": 0.7044761776924133, "learning_rate": 4.501335707925201e-05, "loss": 0.9908, "step": 1680 }, { "epoch": 0.5016325319085783, "grad_norm": 0.7155528664588928, "learning_rate": 4.498367468091422e-05, "loss": 0.9659, "step": 1690 }, { "epoch": 0.5046007717423567, "grad_norm": 0.7560014128684998, "learning_rate": 4.4953992282576434e-05, "loss": 1.0177, "step": 1700 }, { "epoch": 0.5075690115761353, "grad_norm": 0.6918985247612, "learning_rate": 4.492430988423865e-05, "loss": 0.9929, "step": 1710 }, { "epoch": 0.5105372514099139, "grad_norm": 0.6356712579727173, "learning_rate": 4.4894627485900866e-05, "loss": 0.9851, "step": 1720 }, { "epoch": 0.5135054912436925, "grad_norm": 0.6340317130088806, "learning_rate": 4.4864945087563076e-05, "loss": 0.9821, "step": 1730 }, { "epoch": 0.5164737310774711, "grad_norm": 0.654170572757721, "learning_rate": 4.483526268922529e-05, "loss": 1.0106, "step": 1740 }, { "epoch": 0.5194419709112497, "grad_norm": 0.6271511912345886, "learning_rate": 4.480558029088751e-05, "loss": 0.9998, "step": 1750 }, { "epoch": 0.5224102107450282, "grad_norm": 0.6410935521125793, "learning_rate": 4.477589789254972e-05, "loss": 1.0079, "step": 1760 }, { "epoch": 0.5253784505788067, "grad_norm": 0.6426156759262085, "learning_rate": 4.4746215494211934e-05, "loss": 0.9827, "step": 1770 }, { "epoch": 0.5283466904125853, "grad_norm": 0.6262437701225281, "learning_rate": 4.471653309587415e-05, "loss": 1.0245, "step": 1780 }, { "epoch": 0.5313149302463639, "grad_norm": 0.6619899868965149, "learning_rate": 4.4686850697536366e-05, "loss": 0.9612, "step": 1790 }, { "epoch": 0.5342831700801425, "grad_norm": 0.7522478699684143, "learning_rate": 4.4657168299198575e-05, "loss": 0.9784, "step": 1800 }, { "epoch": 0.5372514099139211, "grad_norm": 0.69633948802948, "learning_rate": 4.462748590086079e-05, "loss": 0.9925, "step": 1810 }, { "epoch": 0.5402196497476996, "grad_norm": 0.6923314929008484, "learning_rate": 4.459780350252301e-05, "loss": 1.0409, "step": 1820 }, { "epoch": 0.5431878895814782, "grad_norm": 0.6425520181655884, "learning_rate": 4.456812110418522e-05, "loss": 1.0117, "step": 1830 }, { "epoch": 0.5461561294152567, "grad_norm": 0.6298788189888, "learning_rate": 4.453843870584743e-05, "loss": 1.0165, "step": 1840 }, { "epoch": 0.5491243692490353, "grad_norm": 0.6069309711456299, "learning_rate": 4.450875630750965e-05, "loss": 0.9799, "step": 1850 }, { "epoch": 0.5520926090828139, "grad_norm": 0.6480023860931396, "learning_rate": 4.4479073909171866e-05, "loss": 0.988, "step": 1860 }, { "epoch": 0.5550608489165925, "grad_norm": 0.607940673828125, "learning_rate": 4.4449391510834075e-05, "loss": 0.9802, "step": 1870 }, { "epoch": 0.558029088750371, "grad_norm": 0.6470855474472046, "learning_rate": 4.441970911249629e-05, "loss": 1.0095, "step": 1880 }, { "epoch": 0.5609973285841496, "grad_norm": 0.6894873976707458, "learning_rate": 4.439002671415851e-05, "loss": 0.977, "step": 1890 }, { "epoch": 0.5639655684179282, "grad_norm": 0.6529555320739746, "learning_rate": 4.436034431582072e-05, "loss": 1.0071, "step": 1900 }, { "epoch": 0.5669338082517067, "grad_norm": 0.743811845779419, "learning_rate": 4.433066191748293e-05, "loss": 0.9989, "step": 1910 }, { "epoch": 0.5699020480854853, "grad_norm": 0.8114213347434998, "learning_rate": 4.430097951914515e-05, "loss": 0.9627, "step": 1920 }, { "epoch": 0.5728702879192639, "grad_norm": 0.6733983159065247, "learning_rate": 4.4271297120807365e-05, "loss": 0.9975, "step": 1930 }, { "epoch": 0.5758385277530425, "grad_norm": 0.692699134349823, "learning_rate": 4.4241614722469575e-05, "loss": 0.9982, "step": 1940 }, { "epoch": 0.578806767586821, "grad_norm": 0.6752103567123413, "learning_rate": 4.421193232413179e-05, "loss": 0.9963, "step": 1950 }, { "epoch": 0.5817750074205996, "grad_norm": 0.7230833768844604, "learning_rate": 4.418224992579401e-05, "loss": 0.9941, "step": 1960 }, { "epoch": 0.5847432472543782, "grad_norm": 0.7616879343986511, "learning_rate": 4.4152567527456216e-05, "loss": 0.9763, "step": 1970 }, { "epoch": 0.5877114870881567, "grad_norm": 0.6616672873497009, "learning_rate": 4.412288512911843e-05, "loss": 0.9777, "step": 1980 }, { "epoch": 0.5906797269219353, "grad_norm": 0.7364579439163208, "learning_rate": 4.409320273078065e-05, "loss": 0.9786, "step": 1990 }, { "epoch": 0.5936479667557139, "grad_norm": 0.6607961654663086, "learning_rate": 4.4063520332442865e-05, "loss": 1.0312, "step": 2000 }, { "epoch": 0.5966162065894924, "grad_norm": 0.6173883080482483, "learning_rate": 4.4033837934105074e-05, "loss": 0.9894, "step": 2010 }, { "epoch": 0.599584446423271, "grad_norm": 0.8147071599960327, "learning_rate": 4.40041555357673e-05, "loss": 0.9706, "step": 2020 }, { "epoch": 0.6025526862570496, "grad_norm": 0.6534470319747925, "learning_rate": 4.397447313742951e-05, "loss": 0.9689, "step": 2030 }, { "epoch": 0.6055209260908282, "grad_norm": 0.6868370771408081, "learning_rate": 4.3944790739091716e-05, "loss": 0.9884, "step": 2040 }, { "epoch": 0.6084891659246067, "grad_norm": 0.6805328130722046, "learning_rate": 4.391510834075393e-05, "loss": 0.9819, "step": 2050 }, { "epoch": 0.6114574057583853, "grad_norm": 0.6883294582366943, "learning_rate": 4.388542594241615e-05, "loss": 1.0165, "step": 2060 }, { "epoch": 0.6144256455921638, "grad_norm": 0.635663628578186, "learning_rate": 4.3855743544078365e-05, "loss": 0.9839, "step": 2070 }, { "epoch": 0.6173938854259424, "grad_norm": 0.6705318689346313, "learning_rate": 4.3826061145740574e-05, "loss": 0.9674, "step": 2080 }, { "epoch": 0.620362125259721, "grad_norm": 0.7084488868713379, "learning_rate": 4.37963787474028e-05, "loss": 0.9787, "step": 2090 }, { "epoch": 0.6233303650934996, "grad_norm": 0.6482323408126831, "learning_rate": 4.3766696349065006e-05, "loss": 0.998, "step": 2100 }, { "epoch": 0.6262986049272781, "grad_norm": 0.6266573071479797, "learning_rate": 4.373701395072722e-05, "loss": 0.9931, "step": 2110 }, { "epoch": 0.6292668447610567, "grad_norm": 0.7121156454086304, "learning_rate": 4.370733155238944e-05, "loss": 1.0117, "step": 2120 }, { "epoch": 0.6322350845948352, "grad_norm": 0.6650971174240112, "learning_rate": 4.367764915405165e-05, "loss": 0.9827, "step": 2130 }, { "epoch": 0.6352033244286138, "grad_norm": 0.6248673796653748, "learning_rate": 4.3647966755713864e-05, "loss": 0.9938, "step": 2140 }, { "epoch": 0.6381715642623924, "grad_norm": 0.5886913537979126, "learning_rate": 4.3618284357376074e-05, "loss": 0.9722, "step": 2150 }, { "epoch": 0.641139804096171, "grad_norm": 0.6873770356178284, "learning_rate": 4.35886019590383e-05, "loss": 1.0117, "step": 2160 }, { "epoch": 0.6441080439299496, "grad_norm": 0.5833357572555542, "learning_rate": 4.3558919560700506e-05, "loss": 0.9733, "step": 2170 }, { "epoch": 0.647076283763728, "grad_norm": 0.6192591190338135, "learning_rate": 4.352923716236272e-05, "loss": 1.0046, "step": 2180 }, { "epoch": 0.6500445235975066, "grad_norm": 0.657365083694458, "learning_rate": 4.349955476402494e-05, "loss": 0.9808, "step": 2190 }, { "epoch": 0.6530127634312852, "grad_norm": 0.6927903890609741, "learning_rate": 4.346987236568715e-05, "loss": 1.0062, "step": 2200 }, { "epoch": 0.6559810032650638, "grad_norm": 0.6259236931800842, "learning_rate": 4.3440189967349364e-05, "loss": 0.99, "step": 2210 }, { "epoch": 0.6589492430988424, "grad_norm": 0.659789502620697, "learning_rate": 4.341050756901158e-05, "loss": 0.952, "step": 2220 }, { "epoch": 0.661917482932621, "grad_norm": 0.6317498087882996, "learning_rate": 4.3380825170673796e-05, "loss": 0.9938, "step": 2230 }, { "epoch": 0.6648857227663996, "grad_norm": 0.6758004426956177, "learning_rate": 4.3351142772336006e-05, "loss": 0.9761, "step": 2240 }, { "epoch": 0.667853962600178, "grad_norm": 0.707929790019989, "learning_rate": 4.332146037399822e-05, "loss": 0.9839, "step": 2250 }, { "epoch": 0.6708222024339566, "grad_norm": 0.6813105344772339, "learning_rate": 4.329177797566044e-05, "loss": 0.9933, "step": 2260 }, { "epoch": 0.6737904422677352, "grad_norm": 0.7361823320388794, "learning_rate": 4.326209557732265e-05, "loss": 1.0212, "step": 2270 }, { "epoch": 0.6767586821015138, "grad_norm": 0.670718252658844, "learning_rate": 4.3232413178984864e-05, "loss": 0.9638, "step": 2280 }, { "epoch": 0.6797269219352924, "grad_norm": 0.6154988408088684, "learning_rate": 4.320273078064708e-05, "loss": 0.9738, "step": 2290 }, { "epoch": 0.682695161769071, "grad_norm": 0.6230899095535278, "learning_rate": 4.3173048382309296e-05, "loss": 0.9931, "step": 2300 }, { "epoch": 0.6856634016028496, "grad_norm": 0.6064435839653015, "learning_rate": 4.3143365983971505e-05, "loss": 1.0204, "step": 2310 }, { "epoch": 0.688631641436628, "grad_norm": 0.635154128074646, "learning_rate": 4.311368358563372e-05, "loss": 0.9353, "step": 2320 }, { "epoch": 0.6915998812704066, "grad_norm": 0.6036397814750671, "learning_rate": 4.308400118729594e-05, "loss": 1.0105, "step": 2330 }, { "epoch": 0.6945681211041852, "grad_norm": 0.6203991770744324, "learning_rate": 4.305431878895815e-05, "loss": 0.9804, "step": 2340 }, { "epoch": 0.6975363609379638, "grad_norm": 0.6208158135414124, "learning_rate": 4.302463639062036e-05, "loss": 0.9271, "step": 2350 }, { "epoch": 0.7005046007717424, "grad_norm": 0.681175708770752, "learning_rate": 4.299495399228258e-05, "loss": 0.9734, "step": 2360 }, { "epoch": 0.703472840605521, "grad_norm": 0.6127511262893677, "learning_rate": 4.2965271593944796e-05, "loss": 0.9544, "step": 2370 }, { "epoch": 0.7064410804392995, "grad_norm": 0.6363290548324585, "learning_rate": 4.2935589195607005e-05, "loss": 0.9932, "step": 2380 }, { "epoch": 0.709409320273078, "grad_norm": 0.656937837600708, "learning_rate": 4.290590679726922e-05, "loss": 0.9682, "step": 2390 }, { "epoch": 0.7123775601068566, "grad_norm": 0.6660462021827698, "learning_rate": 4.287622439893144e-05, "loss": 0.983, "step": 2400 }, { "epoch": 0.7153457999406352, "grad_norm": 0.6178123354911804, "learning_rate": 4.284654200059365e-05, "loss": 0.9757, "step": 2410 }, { "epoch": 0.7183140397744138, "grad_norm": 0.7197669744491577, "learning_rate": 4.281685960225586e-05, "loss": 0.9877, "step": 2420 }, { "epoch": 0.7212822796081924, "grad_norm": 0.6619410514831543, "learning_rate": 4.278717720391808e-05, "loss": 1.0249, "step": 2430 }, { "epoch": 0.7242505194419709, "grad_norm": 0.5938183665275574, "learning_rate": 4.2757494805580295e-05, "loss": 0.9593, "step": 2440 }, { "epoch": 0.7272187592757495, "grad_norm": 0.6519368886947632, "learning_rate": 4.2727812407242505e-05, "loss": 0.9552, "step": 2450 }, { "epoch": 0.730186999109528, "grad_norm": 0.6326847672462463, "learning_rate": 4.269813000890472e-05, "loss": 0.9283, "step": 2460 }, { "epoch": 0.7331552389433066, "grad_norm": 0.6411892175674438, "learning_rate": 4.266844761056694e-05, "loss": 1.004, "step": 2470 }, { "epoch": 0.7361234787770852, "grad_norm": 0.7936878204345703, "learning_rate": 4.2638765212229146e-05, "loss": 0.9515, "step": 2480 }, { "epoch": 0.7390917186108638, "grad_norm": 0.7002474069595337, "learning_rate": 4.260908281389136e-05, "loss": 0.9904, "step": 2490 }, { "epoch": 0.7420599584446423, "grad_norm": 0.5938469171524048, "learning_rate": 4.257940041555358e-05, "loss": 0.9709, "step": 2500 }, { "epoch": 0.7450281982784209, "grad_norm": 0.6611453890800476, "learning_rate": 4.2549718017215795e-05, "loss": 0.9774, "step": 2510 }, { "epoch": 0.7479964381121995, "grad_norm": 0.6640607714653015, "learning_rate": 4.2520035618878004e-05, "loss": 0.972, "step": 2520 }, { "epoch": 0.750964677945978, "grad_norm": 0.6462842226028442, "learning_rate": 4.249035322054023e-05, "loss": 0.9776, "step": 2530 }, { "epoch": 0.7539329177797566, "grad_norm": 0.5849109292030334, "learning_rate": 4.246067082220244e-05, "loss": 1.0179, "step": 2540 }, { "epoch": 0.7569011576135352, "grad_norm": 0.6768197417259216, "learning_rate": 4.2430988423864646e-05, "loss": 0.9714, "step": 2550 }, { "epoch": 0.7598693974473137, "grad_norm": 0.646824300289154, "learning_rate": 4.240130602552686e-05, "loss": 1.023, "step": 2560 }, { "epoch": 0.7628376372810923, "grad_norm": 0.7211414575576782, "learning_rate": 4.237162362718908e-05, "loss": 0.972, "step": 2570 }, { "epoch": 0.7658058771148709, "grad_norm": 0.6269819736480713, "learning_rate": 4.2341941228851295e-05, "loss": 0.9738, "step": 2580 }, { "epoch": 0.7687741169486495, "grad_norm": 0.6858998537063599, "learning_rate": 4.2312258830513504e-05, "loss": 1.0041, "step": 2590 }, { "epoch": 0.771742356782428, "grad_norm": 0.5189050436019897, "learning_rate": 4.228257643217573e-05, "loss": 0.9643, "step": 2600 }, { "epoch": 0.7747105966162066, "grad_norm": 0.6885724067687988, "learning_rate": 4.2252894033837936e-05, "loss": 0.943, "step": 2610 }, { "epoch": 0.7776788364499851, "grad_norm": 0.6750602126121521, "learning_rate": 4.2223211635500146e-05, "loss": 0.9982, "step": 2620 }, { "epoch": 0.7806470762837637, "grad_norm": 0.659366250038147, "learning_rate": 4.219352923716237e-05, "loss": 1.0041, "step": 2630 }, { "epoch": 0.7836153161175423, "grad_norm": 0.6575495004653931, "learning_rate": 4.216384683882458e-05, "loss": 0.9499, "step": 2640 }, { "epoch": 0.7865835559513209, "grad_norm": 0.6087556481361389, "learning_rate": 4.2134164440486794e-05, "loss": 1.0186, "step": 2650 }, { "epoch": 0.7895517957850995, "grad_norm": 0.5324801206588745, "learning_rate": 4.2104482042149004e-05, "loss": 1.0128, "step": 2660 }, { "epoch": 0.792520035618878, "grad_norm": 0.6805785298347473, "learning_rate": 4.207479964381123e-05, "loss": 0.9591, "step": 2670 }, { "epoch": 0.7954882754526565, "grad_norm": 0.624666690826416, "learning_rate": 4.2045117245473436e-05, "loss": 0.9764, "step": 2680 }, { "epoch": 0.7984565152864351, "grad_norm": 0.6150093078613281, "learning_rate": 4.2015434847135646e-05, "loss": 1.0, "step": 2690 }, { "epoch": 0.8014247551202137, "grad_norm": 0.6825082898139954, "learning_rate": 4.198575244879787e-05, "loss": 0.98, "step": 2700 }, { "epoch": 0.8043929949539923, "grad_norm": 0.745524525642395, "learning_rate": 4.195607005046008e-05, "loss": 0.9782, "step": 2710 }, { "epoch": 0.8073612347877709, "grad_norm": 0.6018673777580261, "learning_rate": 4.1926387652122294e-05, "loss": 1.0179, "step": 2720 }, { "epoch": 0.8103294746215495, "grad_norm": 0.5541884303092957, "learning_rate": 4.189670525378451e-05, "loss": 0.9905, "step": 2730 }, { "epoch": 0.813297714455328, "grad_norm": 0.6213927865028381, "learning_rate": 4.1867022855446726e-05, "loss": 1.0215, "step": 2740 }, { "epoch": 0.8162659542891065, "grad_norm": 0.6007673740386963, "learning_rate": 4.1837340457108936e-05, "loss": 0.9846, "step": 2750 }, { "epoch": 0.8192341941228851, "grad_norm": 0.678551197052002, "learning_rate": 4.180765805877115e-05, "loss": 0.9933, "step": 2760 }, { "epoch": 0.8222024339566637, "grad_norm": 0.5489323735237122, "learning_rate": 4.177797566043337e-05, "loss": 0.9805, "step": 2770 }, { "epoch": 0.8251706737904423, "grad_norm": 0.6029487252235413, "learning_rate": 4.174829326209558e-05, "loss": 0.9983, "step": 2780 }, { "epoch": 0.8281389136242209, "grad_norm": 0.5949262976646423, "learning_rate": 4.1718610863757794e-05, "loss": 0.9883, "step": 2790 }, { "epoch": 0.8311071534579995, "grad_norm": 0.5981915593147278, "learning_rate": 4.168892846542001e-05, "loss": 0.9652, "step": 2800 }, { "epoch": 0.8340753932917779, "grad_norm": 0.6043820977210999, "learning_rate": 4.1659246067082226e-05, "loss": 0.9893, "step": 2810 }, { "epoch": 0.8370436331255565, "grad_norm": 0.6542451977729797, "learning_rate": 4.1629563668744435e-05, "loss": 1.0184, "step": 2820 }, { "epoch": 0.8400118729593351, "grad_norm": 0.5959357619285583, "learning_rate": 4.159988127040665e-05, "loss": 1.008, "step": 2830 }, { "epoch": 0.8429801127931137, "grad_norm": 0.658277153968811, "learning_rate": 4.157019887206887e-05, "loss": 0.9579, "step": 2840 }, { "epoch": 0.8459483526268923, "grad_norm": 0.6425391435623169, "learning_rate": 4.154051647373108e-05, "loss": 0.9705, "step": 2850 }, { "epoch": 0.8489165924606709, "grad_norm": 0.49258121848106384, "learning_rate": 4.1510834075393293e-05, "loss": 0.9488, "step": 2860 }, { "epoch": 0.8518848322944494, "grad_norm": 0.6020271182060242, "learning_rate": 4.148115167705551e-05, "loss": 0.9847, "step": 2870 }, { "epoch": 0.8548530721282279, "grad_norm": 0.659537136554718, "learning_rate": 4.1451469278717726e-05, "loss": 0.9962, "step": 2880 }, { "epoch": 0.8578213119620065, "grad_norm": 0.6259986758232117, "learning_rate": 4.1421786880379935e-05, "loss": 0.9601, "step": 2890 }, { "epoch": 0.8607895517957851, "grad_norm": 0.6433762907981873, "learning_rate": 4.139210448204215e-05, "loss": 1.0173, "step": 2900 }, { "epoch": 0.8637577916295637, "grad_norm": 0.6561344861984253, "learning_rate": 4.136242208370437e-05, "loss": 0.987, "step": 2910 }, { "epoch": 0.8667260314633423, "grad_norm": 0.6170705556869507, "learning_rate": 4.133273968536658e-05, "loss": 0.9751, "step": 2920 }, { "epoch": 0.8696942712971208, "grad_norm": 0.6285633444786072, "learning_rate": 4.130305728702879e-05, "loss": 1.0202, "step": 2930 }, { "epoch": 0.8726625111308994, "grad_norm": 0.5784567594528198, "learning_rate": 4.127337488869101e-05, "loss": 0.984, "step": 2940 }, { "epoch": 0.8756307509646779, "grad_norm": 0.6021987795829773, "learning_rate": 4.1243692490353225e-05, "loss": 1.0028, "step": 2950 }, { "epoch": 0.8785989907984565, "grad_norm": 0.7186152338981628, "learning_rate": 4.1214010092015435e-05, "loss": 0.9872, "step": 2960 }, { "epoch": 0.8815672306322351, "grad_norm": 0.6320556402206421, "learning_rate": 4.118432769367765e-05, "loss": 0.958, "step": 2970 }, { "epoch": 0.8845354704660137, "grad_norm": 0.6496281623840332, "learning_rate": 4.115464529533987e-05, "loss": 0.9957, "step": 2980 }, { "epoch": 0.8875037102997922, "grad_norm": 0.7143193483352661, "learning_rate": 4.1124962897002077e-05, "loss": 0.9968, "step": 2990 }, { "epoch": 0.8904719501335708, "grad_norm": 0.5800313949584961, "learning_rate": 4.109528049866429e-05, "loss": 0.9762, "step": 3000 }, { "epoch": 0.8934401899673493, "grad_norm": 0.6602805852890015, "learning_rate": 4.106559810032651e-05, "loss": 1.0245, "step": 3010 }, { "epoch": 0.8964084298011279, "grad_norm": 0.6473674774169922, "learning_rate": 4.1035915701988725e-05, "loss": 1.0021, "step": 3020 }, { "epoch": 0.8993766696349065, "grad_norm": 0.6156338453292847, "learning_rate": 4.1006233303650935e-05, "loss": 0.9532, "step": 3030 }, { "epoch": 0.9023449094686851, "grad_norm": 0.5925867557525635, "learning_rate": 4.097655090531315e-05, "loss": 0.9177, "step": 3040 }, { "epoch": 0.9053131493024636, "grad_norm": 0.6322952508926392, "learning_rate": 4.094686850697537e-05, "loss": 0.9607, "step": 3050 }, { "epoch": 0.9082813891362422, "grad_norm": 0.6165744662284851, "learning_rate": 4.0917186108637576e-05, "loss": 0.9635, "step": 3060 }, { "epoch": 0.9112496289700208, "grad_norm": 0.58901447057724, "learning_rate": 4.08875037102998e-05, "loss": 0.9799, "step": 3070 }, { "epoch": 0.9142178688037993, "grad_norm": 0.6456793546676636, "learning_rate": 4.085782131196201e-05, "loss": 0.9751, "step": 3080 }, { "epoch": 0.9171861086375779, "grad_norm": 0.5987860560417175, "learning_rate": 4.0828138913624225e-05, "loss": 0.9327, "step": 3090 }, { "epoch": 0.9201543484713565, "grad_norm": 0.5760492086410522, "learning_rate": 4.0798456515286434e-05, "loss": 0.982, "step": 3100 }, { "epoch": 0.923122588305135, "grad_norm": 0.5955607295036316, "learning_rate": 4.076877411694865e-05, "loss": 0.9669, "step": 3110 }, { "epoch": 0.9260908281389136, "grad_norm": 0.6458853483200073, "learning_rate": 4.0739091718610867e-05, "loss": 0.9424, "step": 3120 }, { "epoch": 0.9290590679726922, "grad_norm": 0.6266711354255676, "learning_rate": 4.0709409320273076e-05, "loss": 0.9545, "step": 3130 }, { "epoch": 0.9320273078064708, "grad_norm": 0.5927236080169678, "learning_rate": 4.06797269219353e-05, "loss": 0.9936, "step": 3140 }, { "epoch": 0.9349955476402493, "grad_norm": 0.5582984685897827, "learning_rate": 4.065004452359751e-05, "loss": 0.9653, "step": 3150 }, { "epoch": 0.9379637874740279, "grad_norm": 0.6802617907524109, "learning_rate": 4.0620362125259724e-05, "loss": 0.9792, "step": 3160 }, { "epoch": 0.9409320273078065, "grad_norm": 0.6594977974891663, "learning_rate": 4.059067972692194e-05, "loss": 0.9381, "step": 3170 }, { "epoch": 0.943900267141585, "grad_norm": 0.7159130573272705, "learning_rate": 4.056099732858415e-05, "loss": 0.9884, "step": 3180 }, { "epoch": 0.9468685069753636, "grad_norm": 0.7334505915641785, "learning_rate": 4.0531314930246366e-05, "loss": 0.9893, "step": 3190 }, { "epoch": 0.9498367468091422, "grad_norm": 0.6196255683898926, "learning_rate": 4.0501632531908576e-05, "loss": 1.0248, "step": 3200 }, { "epoch": 0.9528049866429208, "grad_norm": 0.5950149297714233, "learning_rate": 4.04719501335708e-05, "loss": 0.9723, "step": 3210 }, { "epoch": 0.9557732264766993, "grad_norm": 0.6434257626533508, "learning_rate": 4.044226773523301e-05, "loss": 0.979, "step": 3220 }, { "epoch": 0.9587414663104779, "grad_norm": 0.6103981137275696, "learning_rate": 4.0412585336895224e-05, "loss": 0.9667, "step": 3230 }, { "epoch": 0.9617097061442564, "grad_norm": 0.5639011859893799, "learning_rate": 4.038290293855744e-05, "loss": 0.9826, "step": 3240 }, { "epoch": 0.964677945978035, "grad_norm": 0.723637044429779, "learning_rate": 4.035322054021965e-05, "loss": 0.951, "step": 3250 }, { "epoch": 0.9676461858118136, "grad_norm": 0.6385572552680969, "learning_rate": 4.0323538141881866e-05, "loss": 0.9773, "step": 3260 }, { "epoch": 0.9706144256455922, "grad_norm": 0.5733060836791992, "learning_rate": 4.029385574354408e-05, "loss": 0.9407, "step": 3270 }, { "epoch": 0.9735826654793708, "grad_norm": 0.6948421001434326, "learning_rate": 4.02641733452063e-05, "loss": 0.9977, "step": 3280 }, { "epoch": 0.9765509053131493, "grad_norm": 0.6377859711647034, "learning_rate": 4.023449094686851e-05, "loss": 0.9441, "step": 3290 }, { "epoch": 0.9795191451469278, "grad_norm": 0.704069972038269, "learning_rate": 4.0204808548530724e-05, "loss": 1.0352, "step": 3300 }, { "epoch": 0.9824873849807064, "grad_norm": 0.6204991340637207, "learning_rate": 4.017512615019294e-05, "loss": 0.9821, "step": 3310 }, { "epoch": 0.985455624814485, "grad_norm": 0.6215436458587646, "learning_rate": 4.014544375185515e-05, "loss": 0.9572, "step": 3320 }, { "epoch": 0.9884238646482636, "grad_norm": 0.6409376859664917, "learning_rate": 4.0115761353517366e-05, "loss": 0.9914, "step": 3330 }, { "epoch": 0.9913921044820422, "grad_norm": 0.6437116861343384, "learning_rate": 4.008607895517958e-05, "loss": 0.9499, "step": 3340 }, { "epoch": 0.9943603443158208, "grad_norm": 0.5800116658210754, "learning_rate": 4.00563965568418e-05, "loss": 0.974, "step": 3350 }, { "epoch": 0.9973285841495992, "grad_norm": 0.6123467683792114, "learning_rate": 4.002671415850401e-05, "loss": 0.9675, "step": 3360 }, { "epoch": 1.0002968239833778, "grad_norm": 0.6109050512313843, "learning_rate": 3.9997031760166224e-05, "loss": 1.0152, "step": 3370 }, { "epoch": 1.0032650638171565, "grad_norm": 0.6630458235740662, "learning_rate": 3.996734936182844e-05, "loss": 0.9341, "step": 3380 }, { "epoch": 1.006233303650935, "grad_norm": 0.6218925714492798, "learning_rate": 3.993766696349065e-05, "loss": 0.9263, "step": 3390 }, { "epoch": 1.0092015434847135, "grad_norm": 0.6371595859527588, "learning_rate": 3.9907984565152865e-05, "loss": 0.8966, "step": 3400 }, { "epoch": 1.0121697833184922, "grad_norm": 0.6136346459388733, "learning_rate": 3.987830216681508e-05, "loss": 0.9506, "step": 3410 }, { "epoch": 1.0151380231522706, "grad_norm": 0.647017776966095, "learning_rate": 3.98486197684773e-05, "loss": 0.9079, "step": 3420 }, { "epoch": 1.0181062629860493, "grad_norm": 0.6583853960037231, "learning_rate": 3.981893737013951e-05, "loss": 0.9518, "step": 3430 }, { "epoch": 1.0210745028198278, "grad_norm": 0.6366351842880249, "learning_rate": 3.978925497180172e-05, "loss": 0.8948, "step": 3440 }, { "epoch": 1.0240427426536065, "grad_norm": 0.6189222931861877, "learning_rate": 3.975957257346394e-05, "loss": 0.8937, "step": 3450 }, { "epoch": 1.027010982487385, "grad_norm": 0.677386999130249, "learning_rate": 3.972989017512615e-05, "loss": 0.9326, "step": 3460 }, { "epoch": 1.0299792223211635, "grad_norm": 0.7184809446334839, "learning_rate": 3.9700207776788365e-05, "loss": 0.9174, "step": 3470 }, { "epoch": 1.0329474621549422, "grad_norm": 0.7060807943344116, "learning_rate": 3.967052537845058e-05, "loss": 0.9, "step": 3480 }, { "epoch": 1.0359157019887206, "grad_norm": 0.6315202713012695, "learning_rate": 3.96408429801128e-05, "loss": 0.925, "step": 3490 }, { "epoch": 1.0388839418224993, "grad_norm": 0.6721168756484985, "learning_rate": 3.961116058177501e-05, "loss": 0.911, "step": 3500 }, { "epoch": 1.0418521816562778, "grad_norm": 0.7183712720870972, "learning_rate": 3.958147818343722e-05, "loss": 0.9068, "step": 3510 }, { "epoch": 1.0448204214900565, "grad_norm": 0.6344295144081116, "learning_rate": 3.955179578509944e-05, "loss": 0.9098, "step": 3520 }, { "epoch": 1.047788661323835, "grad_norm": 0.6115924119949341, "learning_rate": 3.952211338676165e-05, "loss": 0.9236, "step": 3530 }, { "epoch": 1.0507569011576134, "grad_norm": 0.6145617961883545, "learning_rate": 3.9492430988423865e-05, "loss": 0.9374, "step": 3540 }, { "epoch": 1.0537251409913921, "grad_norm": 0.7246147990226746, "learning_rate": 3.946274859008608e-05, "loss": 0.9252, "step": 3550 }, { "epoch": 1.0566933808251706, "grad_norm": 0.7344697117805481, "learning_rate": 3.94330661917483e-05, "loss": 0.9314, "step": 3560 }, { "epoch": 1.0596616206589493, "grad_norm": 0.677960216999054, "learning_rate": 3.9403383793410506e-05, "loss": 0.9225, "step": 3570 }, { "epoch": 1.0626298604927278, "grad_norm": 0.6257244944572449, "learning_rate": 3.937370139507273e-05, "loss": 0.8904, "step": 3580 }, { "epoch": 1.0655981003265065, "grad_norm": 0.6566025614738464, "learning_rate": 3.934401899673494e-05, "loss": 0.9171, "step": 3590 }, { "epoch": 1.068566340160285, "grad_norm": 0.6468666791915894, "learning_rate": 3.931433659839715e-05, "loss": 0.9198, "step": 3600 }, { "epoch": 1.0715345799940634, "grad_norm": 0.602260410785675, "learning_rate": 3.9284654200059364e-05, "loss": 0.9406, "step": 3610 }, { "epoch": 1.0745028198278421, "grad_norm": 0.6441718339920044, "learning_rate": 3.925497180172158e-05, "loss": 0.9291, "step": 3620 }, { "epoch": 1.0774710596616206, "grad_norm": 0.6680988073348999, "learning_rate": 3.92252894033838e-05, "loss": 0.8933, "step": 3630 }, { "epoch": 1.0804392994953993, "grad_norm": 0.7151992321014404, "learning_rate": 3.9195607005046006e-05, "loss": 0.9326, "step": 3640 }, { "epoch": 1.0834075393291778, "grad_norm": 0.599533200263977, "learning_rate": 3.916592460670823e-05, "loss": 0.9201, "step": 3650 }, { "epoch": 1.0863757791629565, "grad_norm": 0.6293023228645325, "learning_rate": 3.913624220837044e-05, "loss": 0.9241, "step": 3660 }, { "epoch": 1.089344018996735, "grad_norm": 0.6524146795272827, "learning_rate": 3.910655981003265e-05, "loss": 0.9724, "step": 3670 }, { "epoch": 1.0923122588305134, "grad_norm": 0.7468433976173401, "learning_rate": 3.907687741169487e-05, "loss": 0.9371, "step": 3680 }, { "epoch": 1.0952804986642921, "grad_norm": 0.640473484992981, "learning_rate": 3.904719501335708e-05, "loss": 0.9229, "step": 3690 }, { "epoch": 1.0982487384980706, "grad_norm": 0.6132630109786987, "learning_rate": 3.9017512615019296e-05, "loss": 0.9392, "step": 3700 }, { "epoch": 1.1012169783318493, "grad_norm": 0.7036377787590027, "learning_rate": 3.8987830216681506e-05, "loss": 0.9015, "step": 3710 }, { "epoch": 1.1041852181656278, "grad_norm": 0.7262855768203735, "learning_rate": 3.895814781834373e-05, "loss": 0.9351, "step": 3720 }, { "epoch": 1.1071534579994065, "grad_norm": 0.665530264377594, "learning_rate": 3.892846542000594e-05, "loss": 0.9195, "step": 3730 }, { "epoch": 1.110121697833185, "grad_norm": 0.6587583422660828, "learning_rate": 3.889878302166815e-05, "loss": 0.898, "step": 3740 }, { "epoch": 1.1130899376669634, "grad_norm": 0.6382483243942261, "learning_rate": 3.886910062333037e-05, "loss": 0.9042, "step": 3750 }, { "epoch": 1.116058177500742, "grad_norm": 0.678569495677948, "learning_rate": 3.883941822499258e-05, "loss": 0.9695, "step": 3760 }, { "epoch": 1.1190264173345206, "grad_norm": 0.6646463871002197, "learning_rate": 3.8809735826654796e-05, "loss": 0.8859, "step": 3770 }, { "epoch": 1.1219946571682993, "grad_norm": 0.6731215119361877, "learning_rate": 3.878005342831701e-05, "loss": 0.9431, "step": 3780 }, { "epoch": 1.1249628970020777, "grad_norm": 0.6394199728965759, "learning_rate": 3.875037102997923e-05, "loss": 0.8829, "step": 3790 }, { "epoch": 1.1279311368358562, "grad_norm": 0.6833072304725647, "learning_rate": 3.872068863164144e-05, "loss": 0.8973, "step": 3800 }, { "epoch": 1.130899376669635, "grad_norm": 0.7632230520248413, "learning_rate": 3.869100623330365e-05, "loss": 0.8842, "step": 3810 }, { "epoch": 1.1338676165034134, "grad_norm": 0.684288740158081, "learning_rate": 3.866132383496587e-05, "loss": 0.9413, "step": 3820 }, { "epoch": 1.136835856337192, "grad_norm": 0.712907075881958, "learning_rate": 3.863164143662808e-05, "loss": 0.9003, "step": 3830 }, { "epoch": 1.1398040961709706, "grad_norm": 0.7701553702354431, "learning_rate": 3.8601959038290296e-05, "loss": 0.9691, "step": 3840 }, { "epoch": 1.1427723360047493, "grad_norm": 0.715511679649353, "learning_rate": 3.857227663995251e-05, "loss": 0.9501, "step": 3850 }, { "epoch": 1.1457405758385277, "grad_norm": 0.7037255167961121, "learning_rate": 3.854259424161473e-05, "loss": 0.9306, "step": 3860 }, { "epoch": 1.1487088156723062, "grad_norm": 0.7638218998908997, "learning_rate": 3.851291184327694e-05, "loss": 0.9524, "step": 3870 }, { "epoch": 1.151677055506085, "grad_norm": 0.6505693793296814, "learning_rate": 3.8483229444939154e-05, "loss": 0.9191, "step": 3880 }, { "epoch": 1.1546452953398634, "grad_norm": 0.6183308959007263, "learning_rate": 3.845354704660137e-05, "loss": 0.8744, "step": 3890 }, { "epoch": 1.157613535173642, "grad_norm": 0.6301867961883545, "learning_rate": 3.842386464826358e-05, "loss": 0.9137, "step": 3900 }, { "epoch": 1.1605817750074205, "grad_norm": 0.7025741934776306, "learning_rate": 3.8394182249925795e-05, "loss": 0.9305, "step": 3910 }, { "epoch": 1.1635500148411992, "grad_norm": 0.6408293843269348, "learning_rate": 3.836449985158801e-05, "loss": 0.9075, "step": 3920 }, { "epoch": 1.1665182546749777, "grad_norm": 0.6138466000556946, "learning_rate": 3.833481745325023e-05, "loss": 0.9357, "step": 3930 }, { "epoch": 1.1694864945087562, "grad_norm": 0.6550092101097107, "learning_rate": 3.830513505491244e-05, "loss": 0.9528, "step": 3940 }, { "epoch": 1.1724547343425349, "grad_norm": 0.6522756218910217, "learning_rate": 3.827545265657465e-05, "loss": 0.9317, "step": 3950 }, { "epoch": 1.1754229741763134, "grad_norm": 0.7218632102012634, "learning_rate": 3.824577025823687e-05, "loss": 0.9292, "step": 3960 }, { "epoch": 1.178391214010092, "grad_norm": 0.6621205806732178, "learning_rate": 3.821608785989908e-05, "loss": 0.9204, "step": 3970 }, { "epoch": 1.1813594538438705, "grad_norm": 0.6475970149040222, "learning_rate": 3.8186405461561295e-05, "loss": 0.9097, "step": 3980 }, { "epoch": 1.1843276936776492, "grad_norm": 0.6492740511894226, "learning_rate": 3.815672306322351e-05, "loss": 0.9261, "step": 3990 }, { "epoch": 1.1872959335114277, "grad_norm": 0.6680456399917603, "learning_rate": 3.812704066488573e-05, "loss": 0.929, "step": 4000 }, { "epoch": 1.1902641733452062, "grad_norm": 0.7199456095695496, "learning_rate": 3.809735826654794e-05, "loss": 0.921, "step": 4010 }, { "epoch": 1.1932324131789849, "grad_norm": 0.7005553245544434, "learning_rate": 3.806767586821015e-05, "loss": 0.9489, "step": 4020 }, { "epoch": 1.1962006530127633, "grad_norm": 0.735540509223938, "learning_rate": 3.803799346987237e-05, "loss": 0.9206, "step": 4030 }, { "epoch": 1.199168892846542, "grad_norm": 0.7136026620864868, "learning_rate": 3.800831107153458e-05, "loss": 0.9413, "step": 4040 }, { "epoch": 1.2021371326803205, "grad_norm": 0.6640150547027588, "learning_rate": 3.7978628673196795e-05, "loss": 0.9293, "step": 4050 }, { "epoch": 1.2051053725140992, "grad_norm": 0.676105797290802, "learning_rate": 3.794894627485901e-05, "loss": 0.9007, "step": 4060 }, { "epoch": 1.2080736123478777, "grad_norm": 0.6336154937744141, "learning_rate": 3.791926387652123e-05, "loss": 0.9198, "step": 4070 }, { "epoch": 1.2110418521816562, "grad_norm": 0.6762709021568298, "learning_rate": 3.7889581478183437e-05, "loss": 0.9211, "step": 4080 }, { "epoch": 1.2140100920154349, "grad_norm": 0.6613302826881409, "learning_rate": 3.785989907984565e-05, "loss": 0.8855, "step": 4090 }, { "epoch": 1.2169783318492133, "grad_norm": 0.7078067660331726, "learning_rate": 3.783021668150787e-05, "loss": 0.9235, "step": 4100 }, { "epoch": 1.219946571682992, "grad_norm": 0.7373542189598083, "learning_rate": 3.780053428317008e-05, "loss": 0.893, "step": 4110 }, { "epoch": 1.2229148115167705, "grad_norm": 0.6745258569717407, "learning_rate": 3.77708518848323e-05, "loss": 0.9423, "step": 4120 }, { "epoch": 1.2258830513505492, "grad_norm": 0.6126631498336792, "learning_rate": 3.774116948649451e-05, "loss": 0.9201, "step": 4130 }, { "epoch": 1.2288512911843277, "grad_norm": 0.6054243445396423, "learning_rate": 3.771148708815673e-05, "loss": 0.9148, "step": 4140 }, { "epoch": 1.2318195310181062, "grad_norm": 0.6797349452972412, "learning_rate": 3.7681804689818936e-05, "loss": 0.9171, "step": 4150 }, { "epoch": 1.2347877708518848, "grad_norm": 0.6671416163444519, "learning_rate": 3.765212229148115e-05, "loss": 0.9257, "step": 4160 }, { "epoch": 1.2377560106856633, "grad_norm": 0.7235540151596069, "learning_rate": 3.762243989314337e-05, "loss": 0.9254, "step": 4170 }, { "epoch": 1.240724250519442, "grad_norm": 0.7316147089004517, "learning_rate": 3.759275749480558e-05, "loss": 0.9349, "step": 4180 }, { "epoch": 1.2436924903532205, "grad_norm": 0.6851880550384521, "learning_rate": 3.75630750964678e-05, "loss": 0.9329, "step": 4190 }, { "epoch": 1.2466607301869992, "grad_norm": 0.6671837568283081, "learning_rate": 3.753339269813001e-05, "loss": 0.8949, "step": 4200 }, { "epoch": 1.2496289700207777, "grad_norm": 0.8188835978507996, "learning_rate": 3.7503710299792226e-05, "loss": 0.8751, "step": 4210 }, { "epoch": 1.2525972098545561, "grad_norm": 0.635193943977356, "learning_rate": 3.747402790145444e-05, "loss": 0.933, "step": 4220 }, { "epoch": 1.2555654496883348, "grad_norm": 0.7244166731834412, "learning_rate": 3.744434550311665e-05, "loss": 0.9009, "step": 4230 }, { "epoch": 1.2585336895221133, "grad_norm": 0.6825953722000122, "learning_rate": 3.741466310477887e-05, "loss": 0.9444, "step": 4240 }, { "epoch": 1.261501929355892, "grad_norm": 0.7705867290496826, "learning_rate": 3.738498070644108e-05, "loss": 0.9307, "step": 4250 }, { "epoch": 1.2644701691896705, "grad_norm": 0.6319135427474976, "learning_rate": 3.73552983081033e-05, "loss": 0.9332, "step": 4260 }, { "epoch": 1.2674384090234492, "grad_norm": 0.64055335521698, "learning_rate": 3.732561590976551e-05, "loss": 0.9062, "step": 4270 }, { "epoch": 1.2704066488572276, "grad_norm": 0.7409882545471191, "learning_rate": 3.7295933511427726e-05, "loss": 0.9713, "step": 4280 }, { "epoch": 1.2733748886910061, "grad_norm": 0.6357461214065552, "learning_rate": 3.726625111308994e-05, "loss": 0.9196, "step": 4290 }, { "epoch": 1.2763431285247848, "grad_norm": 0.6302955150604248, "learning_rate": 3.723656871475215e-05, "loss": 0.8913, "step": 4300 }, { "epoch": 1.2793113683585633, "grad_norm": 0.6616325378417969, "learning_rate": 3.720688631641437e-05, "loss": 0.9097, "step": 4310 }, { "epoch": 1.282279608192342, "grad_norm": 0.8014053106307983, "learning_rate": 3.7177203918076584e-05, "loss": 0.9446, "step": 4320 }, { "epoch": 1.2852478480261205, "grad_norm": 0.8350517153739929, "learning_rate": 3.71475215197388e-05, "loss": 0.9156, "step": 4330 }, { "epoch": 1.2882160878598992, "grad_norm": 0.6596609950065613, "learning_rate": 3.711783912140101e-05, "loss": 0.9923, "step": 4340 }, { "epoch": 1.2911843276936776, "grad_norm": 0.6925686001777649, "learning_rate": 3.7088156723063226e-05, "loss": 0.9401, "step": 4350 }, { "epoch": 1.294152567527456, "grad_norm": 0.6993871927261353, "learning_rate": 3.705847432472544e-05, "loss": 0.9515, "step": 4360 }, { "epoch": 1.2971208073612348, "grad_norm": 0.8117514848709106, "learning_rate": 3.702879192638765e-05, "loss": 0.8988, "step": 4370 }, { "epoch": 1.3000890471950133, "grad_norm": 0.6895804405212402, "learning_rate": 3.699910952804987e-05, "loss": 0.9153, "step": 4380 }, { "epoch": 1.303057287028792, "grad_norm": 0.6821000576019287, "learning_rate": 3.6969427129712084e-05, "loss": 0.8968, "step": 4390 }, { "epoch": 1.3060255268625705, "grad_norm": 0.6583243608474731, "learning_rate": 3.69397447313743e-05, "loss": 0.9378, "step": 4400 }, { "epoch": 1.3089937666963491, "grad_norm": 0.7524015307426453, "learning_rate": 3.691006233303651e-05, "loss": 0.9465, "step": 4410 }, { "epoch": 1.3119620065301276, "grad_norm": 0.6931015849113464, "learning_rate": 3.6880379934698726e-05, "loss": 0.901, "step": 4420 }, { "epoch": 1.314930246363906, "grad_norm": 0.7283458113670349, "learning_rate": 3.685069753636094e-05, "loss": 0.8931, "step": 4430 }, { "epoch": 1.3178984861976848, "grad_norm": 0.6719711422920227, "learning_rate": 3.682101513802315e-05, "loss": 0.9117, "step": 4440 }, { "epoch": 1.3208667260314633, "grad_norm": 0.794035017490387, "learning_rate": 3.679133273968537e-05, "loss": 0.9064, "step": 4450 }, { "epoch": 1.323834965865242, "grad_norm": 0.8168098330497742, "learning_rate": 3.6761650341347583e-05, "loss": 0.9362, "step": 4460 }, { "epoch": 1.3268032056990204, "grad_norm": 0.7201861143112183, "learning_rate": 3.67319679430098e-05, "loss": 0.9218, "step": 4470 }, { "epoch": 1.3297714455327991, "grad_norm": 0.7084200978279114, "learning_rate": 3.670228554467201e-05, "loss": 0.9243, "step": 4480 }, { "epoch": 1.3327396853665776, "grad_norm": 0.6903102993965149, "learning_rate": 3.6672603146334225e-05, "loss": 0.9403, "step": 4490 }, { "epoch": 1.335707925200356, "grad_norm": 0.6698452830314636, "learning_rate": 3.664292074799644e-05, "loss": 0.9108, "step": 4500 }, { "epoch": 1.3386761650341348, "grad_norm": 0.6291380524635315, "learning_rate": 3.661323834965865e-05, "loss": 0.9283, "step": 4510 }, { "epoch": 1.3416444048679133, "grad_norm": 0.7101170420646667, "learning_rate": 3.658355595132087e-05, "loss": 0.9198, "step": 4520 }, { "epoch": 1.344612644701692, "grad_norm": 0.6618587970733643, "learning_rate": 3.655387355298308e-05, "loss": 0.8878, "step": 4530 }, { "epoch": 1.3475808845354704, "grad_norm": 0.7191241383552551, "learning_rate": 3.65241911546453e-05, "loss": 0.882, "step": 4540 }, { "epoch": 1.3505491243692491, "grad_norm": 0.7250847220420837, "learning_rate": 3.649450875630751e-05, "loss": 0.8646, "step": 4550 }, { "epoch": 1.3535173642030276, "grad_norm": 0.7301775813102722, "learning_rate": 3.6464826357969725e-05, "loss": 0.8714, "step": 4560 }, { "epoch": 1.356485604036806, "grad_norm": 0.7399090528488159, "learning_rate": 3.643514395963194e-05, "loss": 0.9146, "step": 4570 }, { "epoch": 1.3594538438705848, "grad_norm": 0.7338899970054626, "learning_rate": 3.640546156129415e-05, "loss": 0.9218, "step": 4580 }, { "epoch": 1.3624220837043632, "grad_norm": 0.6969752311706543, "learning_rate": 3.637577916295637e-05, "loss": 0.9117, "step": 4590 }, { "epoch": 1.365390323538142, "grad_norm": 0.7742125391960144, "learning_rate": 3.634609676461858e-05, "loss": 0.9445, "step": 4600 }, { "epoch": 1.3683585633719204, "grad_norm": 0.837775468826294, "learning_rate": 3.63164143662808e-05, "loss": 0.9356, "step": 4610 }, { "epoch": 1.371326803205699, "grad_norm": 0.7069649696350098, "learning_rate": 3.628673196794301e-05, "loss": 0.8942, "step": 4620 }, { "epoch": 1.3742950430394776, "grad_norm": 0.6960747241973877, "learning_rate": 3.625704956960523e-05, "loss": 0.9484, "step": 4630 }, { "epoch": 1.377263282873256, "grad_norm": 0.6509305238723755, "learning_rate": 3.622736717126744e-05, "loss": 0.9079, "step": 4640 }, { "epoch": 1.3802315227070348, "grad_norm": 0.7066072821617126, "learning_rate": 3.619768477292965e-05, "loss": 0.9112, "step": 4650 }, { "epoch": 1.3831997625408132, "grad_norm": 0.7418903112411499, "learning_rate": 3.6168002374591866e-05, "loss": 0.9262, "step": 4660 }, { "epoch": 1.386168002374592, "grad_norm": 0.6625681519508362, "learning_rate": 3.613831997625408e-05, "loss": 0.9116, "step": 4670 }, { "epoch": 1.3891362422083704, "grad_norm": 0.766903281211853, "learning_rate": 3.61086375779163e-05, "loss": 0.9074, "step": 4680 }, { "epoch": 1.392104482042149, "grad_norm": 0.617678165435791, "learning_rate": 3.607895517957851e-05, "loss": 0.9324, "step": 4690 }, { "epoch": 1.3950727218759276, "grad_norm": 0.6659789085388184, "learning_rate": 3.604927278124073e-05, "loss": 0.9099, "step": 4700 }, { "epoch": 1.398040961709706, "grad_norm": 0.7493019700050354, "learning_rate": 3.601959038290294e-05, "loss": 0.9057, "step": 4710 }, { "epoch": 1.4010092015434847, "grad_norm": 0.6639522314071655, "learning_rate": 3.598990798456515e-05, "loss": 0.8924, "step": 4720 }, { "epoch": 1.4039774413772632, "grad_norm": 0.7125399708747864, "learning_rate": 3.596022558622737e-05, "loss": 0.9364, "step": 4730 }, { "epoch": 1.406945681211042, "grad_norm": 0.6880870461463928, "learning_rate": 3.593054318788958e-05, "loss": 0.94, "step": 4740 }, { "epoch": 1.4099139210448204, "grad_norm": 0.765393078327179, "learning_rate": 3.59008607895518e-05, "loss": 0.9592, "step": 4750 }, { "epoch": 1.412882160878599, "grad_norm": 0.7591105103492737, "learning_rate": 3.587117839121401e-05, "loss": 0.8971, "step": 4760 }, { "epoch": 1.4158504007123776, "grad_norm": 0.6672634482383728, "learning_rate": 3.584149599287623e-05, "loss": 0.8898, "step": 4770 }, { "epoch": 1.418818640546156, "grad_norm": 0.6270623803138733, "learning_rate": 3.581181359453844e-05, "loss": 0.8982, "step": 4780 }, { "epoch": 1.4217868803799347, "grad_norm": 0.7162853479385376, "learning_rate": 3.578213119620065e-05, "loss": 0.9368, "step": 4790 }, { "epoch": 1.4247551202137132, "grad_norm": 0.7325291037559509, "learning_rate": 3.575244879786287e-05, "loss": 0.904, "step": 4800 }, { "epoch": 1.427723360047492, "grad_norm": 0.7608901858329773, "learning_rate": 3.572276639952508e-05, "loss": 0.8736, "step": 4810 }, { "epoch": 1.4306915998812704, "grad_norm": 0.7135578989982605, "learning_rate": 3.56930840011873e-05, "loss": 0.926, "step": 4820 }, { "epoch": 1.433659839715049, "grad_norm": 0.7218388915061951, "learning_rate": 3.5663401602849514e-05, "loss": 0.961, "step": 4830 }, { "epoch": 1.4366280795488275, "grad_norm": 0.7087059020996094, "learning_rate": 3.563371920451173e-05, "loss": 0.9115, "step": 4840 }, { "epoch": 1.439596319382606, "grad_norm": 0.7503490447998047, "learning_rate": 3.560403680617394e-05, "loss": 0.9211, "step": 4850 }, { "epoch": 1.4425645592163847, "grad_norm": 0.7498166561126709, "learning_rate": 3.557435440783615e-05, "loss": 0.9059, "step": 4860 }, { "epoch": 1.4455327990501632, "grad_norm": 0.7072891592979431, "learning_rate": 3.554467200949837e-05, "loss": 0.9282, "step": 4870 }, { "epoch": 1.4485010388839419, "grad_norm": 0.6605234146118164, "learning_rate": 3.551498961116058e-05, "loss": 0.9261, "step": 4880 }, { "epoch": 1.4514692787177204, "grad_norm": 0.7122581005096436, "learning_rate": 3.54853072128228e-05, "loss": 0.9446, "step": 4890 }, { "epoch": 1.454437518551499, "grad_norm": 0.6746944189071655, "learning_rate": 3.5455624814485014e-05, "loss": 0.9515, "step": 4900 }, { "epoch": 1.4574057583852775, "grad_norm": 0.6817790865898132, "learning_rate": 3.542594241614723e-05, "loss": 0.9416, "step": 4910 }, { "epoch": 1.460373998219056, "grad_norm": 0.6961323022842407, "learning_rate": 3.539626001780944e-05, "loss": 0.9217, "step": 4920 }, { "epoch": 1.4633422380528347, "grad_norm": 0.6741704344749451, "learning_rate": 3.5366577619471656e-05, "loss": 0.9377, "step": 4930 }, { "epoch": 1.4663104778866132, "grad_norm": 0.6607795357704163, "learning_rate": 3.533689522113387e-05, "loss": 0.8585, "step": 4940 }, { "epoch": 1.4692787177203919, "grad_norm": 0.6922871470451355, "learning_rate": 3.530721282279608e-05, "loss": 0.9089, "step": 4950 }, { "epoch": 1.4722469575541703, "grad_norm": 0.6661325097084045, "learning_rate": 3.52775304244583e-05, "loss": 0.9016, "step": 4960 }, { "epoch": 1.475215197387949, "grad_norm": 0.7098068594932556, "learning_rate": 3.5247848026120514e-05, "loss": 0.9075, "step": 4970 }, { "epoch": 1.4781834372217275, "grad_norm": 0.7736371755599976, "learning_rate": 3.521816562778273e-05, "loss": 0.8992, "step": 4980 }, { "epoch": 1.481151677055506, "grad_norm": 0.6673690676689148, "learning_rate": 3.518848322944494e-05, "loss": 0.9198, "step": 4990 }, { "epoch": 1.4841199168892847, "grad_norm": 0.6655102372169495, "learning_rate": 3.5158800831107155e-05, "loss": 0.8794, "step": 5000 }, { "epoch": 1.4870881567230632, "grad_norm": 0.7615334987640381, "learning_rate": 3.512911843276937e-05, "loss": 0.9617, "step": 5010 }, { "epoch": 1.4900563965568419, "grad_norm": 0.7400135397911072, "learning_rate": 3.509943603443158e-05, "loss": 0.9142, "step": 5020 }, { "epoch": 1.4930246363906203, "grad_norm": 0.6787492632865906, "learning_rate": 3.50697536360938e-05, "loss": 0.9513, "step": 5030 }, { "epoch": 1.495992876224399, "grad_norm": 0.6557864546775818, "learning_rate": 3.504007123775601e-05, "loss": 0.9381, "step": 5040 }, { "epoch": 1.4989611160581775, "grad_norm": 0.6889845728874207, "learning_rate": 3.501038883941823e-05, "loss": 0.8919, "step": 5050 }, { "epoch": 1.501929355891956, "grad_norm": 0.6667259335517883, "learning_rate": 3.498070644108044e-05, "loss": 0.9388, "step": 5060 }, { "epoch": 1.5048975957257347, "grad_norm": 0.7543666958808899, "learning_rate": 3.4951024042742655e-05, "loss": 0.9045, "step": 5070 }, { "epoch": 1.5078658355595134, "grad_norm": 0.7088416218757629, "learning_rate": 3.492134164440487e-05, "loss": 0.9122, "step": 5080 }, { "epoch": 1.5108340753932916, "grad_norm": 0.6750200390815735, "learning_rate": 3.489165924606708e-05, "loss": 0.9358, "step": 5090 }, { "epoch": 1.5138023152270703, "grad_norm": 0.6742722988128662, "learning_rate": 3.48619768477293e-05, "loss": 0.9275, "step": 5100 }, { "epoch": 1.516770555060849, "grad_norm": 0.6562049984931946, "learning_rate": 3.483229444939151e-05, "loss": 0.9, "step": 5110 }, { "epoch": 1.5197387948946275, "grad_norm": 0.6847245097160339, "learning_rate": 3.480261205105373e-05, "loss": 0.897, "step": 5120 }, { "epoch": 1.522707034728406, "grad_norm": 0.7098760604858398, "learning_rate": 3.477292965271594e-05, "loss": 0.9489, "step": 5130 }, { "epoch": 1.5256752745621847, "grad_norm": 0.6374083161354065, "learning_rate": 3.4743247254378155e-05, "loss": 0.895, "step": 5140 }, { "epoch": 1.5286435143959634, "grad_norm": 0.7240116596221924, "learning_rate": 3.471356485604037e-05, "loss": 0.8889, "step": 5150 }, { "epoch": 1.5316117542297416, "grad_norm": 0.676428496837616, "learning_rate": 3.468388245770258e-05, "loss": 0.9342, "step": 5160 }, { "epoch": 1.5345799940635203, "grad_norm": 0.6966356635093689, "learning_rate": 3.4654200059364796e-05, "loss": 0.895, "step": 5170 }, { "epoch": 1.537548233897299, "grad_norm": 0.6685094237327576, "learning_rate": 3.462451766102701e-05, "loss": 0.8986, "step": 5180 }, { "epoch": 1.5405164737310775, "grad_norm": 0.7533665299415588, "learning_rate": 3.459483526268923e-05, "loss": 0.9388, "step": 5190 }, { "epoch": 1.543484713564856, "grad_norm": 0.680923581123352, "learning_rate": 3.456515286435144e-05, "loss": 0.9199, "step": 5200 }, { "epoch": 1.5464529533986346, "grad_norm": 0.7609050869941711, "learning_rate": 3.4535470466013654e-05, "loss": 0.8998, "step": 5210 }, { "epoch": 1.5494211932324133, "grad_norm": 0.6809772253036499, "learning_rate": 3.450578806767587e-05, "loss": 0.9037, "step": 5220 }, { "epoch": 1.5523894330661916, "grad_norm": 0.717606246471405, "learning_rate": 3.447610566933808e-05, "loss": 0.9231, "step": 5230 }, { "epoch": 1.5553576728999703, "grad_norm": 0.7689083218574524, "learning_rate": 3.44464232710003e-05, "loss": 0.9056, "step": 5240 }, { "epoch": 1.558325912733749, "grad_norm": 0.6759158372879028, "learning_rate": 3.441674087266251e-05, "loss": 0.9278, "step": 5250 }, { "epoch": 1.5612941525675275, "grad_norm": 0.6883628964424133, "learning_rate": 3.438705847432473e-05, "loss": 0.9289, "step": 5260 }, { "epoch": 1.564262392401306, "grad_norm": 0.7132298350334167, "learning_rate": 3.435737607598694e-05, "loss": 0.8762, "step": 5270 }, { "epoch": 1.5672306322350846, "grad_norm": 0.6003124713897705, "learning_rate": 3.4327693677649154e-05, "loss": 0.9226, "step": 5280 }, { "epoch": 1.5701988720688633, "grad_norm": 0.7471172213554382, "learning_rate": 3.429801127931137e-05, "loss": 0.9123, "step": 5290 }, { "epoch": 1.5731671119026416, "grad_norm": 0.7321667075157166, "learning_rate": 3.426832888097358e-05, "loss": 0.8938, "step": 5300 }, { "epoch": 1.5761353517364203, "grad_norm": 0.7397316098213196, "learning_rate": 3.42386464826358e-05, "loss": 0.8929, "step": 5310 }, { "epoch": 1.579103591570199, "grad_norm": 0.7189614176750183, "learning_rate": 3.420896408429801e-05, "loss": 0.9306, "step": 5320 }, { "epoch": 1.5820718314039774, "grad_norm": 0.6808568835258484, "learning_rate": 3.417928168596023e-05, "loss": 0.928, "step": 5330 }, { "epoch": 1.585040071237756, "grad_norm": 0.706499457359314, "learning_rate": 3.4149599287622444e-05, "loss": 0.9032, "step": 5340 }, { "epoch": 1.5880083110715346, "grad_norm": 0.6810677647590637, "learning_rate": 3.4119916889284654e-05, "loss": 0.9114, "step": 5350 }, { "epoch": 1.5909765509053133, "grad_norm": 0.8776708841323853, "learning_rate": 3.409023449094687e-05, "loss": 0.9452, "step": 5360 }, { "epoch": 1.5939447907390916, "grad_norm": 0.772865891456604, "learning_rate": 3.406055209260908e-05, "loss": 0.9165, "step": 5370 }, { "epoch": 1.5969130305728703, "grad_norm": 0.6921769976615906, "learning_rate": 3.40308696942713e-05, "loss": 0.9007, "step": 5380 }, { "epoch": 1.599881270406649, "grad_norm": 0.7539716362953186, "learning_rate": 3.400118729593351e-05, "loss": 0.9055, "step": 5390 }, { "epoch": 1.6028495102404274, "grad_norm": 0.7750148773193359, "learning_rate": 3.397150489759573e-05, "loss": 0.9189, "step": 5400 }, { "epoch": 1.605817750074206, "grad_norm": 0.7431687116622925, "learning_rate": 3.3941822499257944e-05, "loss": 0.9346, "step": 5410 }, { "epoch": 1.6087859899079846, "grad_norm": 0.6506355404853821, "learning_rate": 3.391214010092015e-05, "loss": 0.9065, "step": 5420 }, { "epoch": 1.6117542297417633, "grad_norm": 0.7202748656272888, "learning_rate": 3.388245770258237e-05, "loss": 0.914, "step": 5430 }, { "epoch": 1.6147224695755416, "grad_norm": 0.7080637216567993, "learning_rate": 3.3852775304244586e-05, "loss": 0.9121, "step": 5440 }, { "epoch": 1.6176907094093202, "grad_norm": 0.694549024105072, "learning_rate": 3.38230929059068e-05, "loss": 0.886, "step": 5450 }, { "epoch": 1.620658949243099, "grad_norm": 0.6965795755386353, "learning_rate": 3.379341050756901e-05, "loss": 0.9247, "step": 5460 }, { "epoch": 1.6236271890768774, "grad_norm": 0.7468060255050659, "learning_rate": 3.376372810923123e-05, "loss": 0.8834, "step": 5470 }, { "epoch": 1.626595428910656, "grad_norm": 0.6878003478050232, "learning_rate": 3.3734045710893444e-05, "loss": 0.9179, "step": 5480 }, { "epoch": 1.6295636687444346, "grad_norm": 0.7024316787719727, "learning_rate": 3.370436331255565e-05, "loss": 0.9037, "step": 5490 }, { "epoch": 1.632531908578213, "grad_norm": 0.6189635396003723, "learning_rate": 3.367468091421787e-05, "loss": 0.9578, "step": 5500 }, { "epoch": 1.6355001484119915, "grad_norm": 0.6469042897224426, "learning_rate": 3.3644998515880085e-05, "loss": 0.8899, "step": 5510 }, { "epoch": 1.6384683882457702, "grad_norm": 0.6131319403648376, "learning_rate": 3.36153161175423e-05, "loss": 0.8727, "step": 5520 }, { "epoch": 1.641436628079549, "grad_norm": 0.6959238052368164, "learning_rate": 3.358563371920451e-05, "loss": 0.9011, "step": 5530 }, { "epoch": 1.6444048679133274, "grad_norm": 0.6863197088241577, "learning_rate": 3.355595132086673e-05, "loss": 0.9627, "step": 5540 }, { "epoch": 1.6473731077471059, "grad_norm": 0.6819227933883667, "learning_rate": 3.352626892252894e-05, "loss": 0.9205, "step": 5550 }, { "epoch": 1.6503413475808846, "grad_norm": 0.7224652171134949, "learning_rate": 3.349658652419115e-05, "loss": 0.9183, "step": 5560 }, { "epoch": 1.653309587414663, "grad_norm": 0.7046286463737488, "learning_rate": 3.346690412585337e-05, "loss": 0.8784, "step": 5570 }, { "epoch": 1.6562778272484415, "grad_norm": 0.6479465961456299, "learning_rate": 3.3437221727515585e-05, "loss": 0.8717, "step": 5580 }, { "epoch": 1.6592460670822202, "grad_norm": 0.7227984070777893, "learning_rate": 3.34075393291778e-05, "loss": 0.9064, "step": 5590 }, { "epoch": 1.662214306915999, "grad_norm": 0.729101836681366, "learning_rate": 3.337785693084001e-05, "loss": 0.9287, "step": 5600 }, { "epoch": 1.6651825467497774, "grad_norm": 0.7223145961761475, "learning_rate": 3.334817453250223e-05, "loss": 0.9138, "step": 5610 }, { "epoch": 1.6681507865835559, "grad_norm": 0.8971875905990601, "learning_rate": 3.331849213416444e-05, "loss": 0.903, "step": 5620 }, { "epoch": 1.6711190264173346, "grad_norm": 0.6369121074676514, "learning_rate": 3.328880973582665e-05, "loss": 0.9082, "step": 5630 }, { "epoch": 1.674087266251113, "grad_norm": 0.689054548740387, "learning_rate": 3.325912733748887e-05, "loss": 0.9103, "step": 5640 }, { "epoch": 1.6770555060848915, "grad_norm": 0.7047765254974365, "learning_rate": 3.3229444939151085e-05, "loss": 0.9451, "step": 5650 }, { "epoch": 1.6800237459186702, "grad_norm": 0.7887316346168518, "learning_rate": 3.31997625408133e-05, "loss": 0.9225, "step": 5660 }, { "epoch": 1.682991985752449, "grad_norm": 0.7298426032066345, "learning_rate": 3.317008014247551e-05, "loss": 0.8988, "step": 5670 }, { "epoch": 1.6859602255862274, "grad_norm": 0.7398642897605896, "learning_rate": 3.314039774413773e-05, "loss": 0.9101, "step": 5680 }, { "epoch": 1.6889284654200059, "grad_norm": 0.6913554072380066, "learning_rate": 3.311071534579994e-05, "loss": 0.9213, "step": 5690 }, { "epoch": 1.6918967052537845, "grad_norm": 0.7257855534553528, "learning_rate": 3.308103294746215e-05, "loss": 0.9297, "step": 5700 }, { "epoch": 1.694864945087563, "grad_norm": 0.7400197982788086, "learning_rate": 3.305135054912437e-05, "loss": 0.8795, "step": 5710 }, { "epoch": 1.6978331849213415, "grad_norm": 0.7153133749961853, "learning_rate": 3.3021668150786584e-05, "loss": 0.8905, "step": 5720 }, { "epoch": 1.7008014247551202, "grad_norm": 0.6668074727058411, "learning_rate": 3.29919857524488e-05, "loss": 0.9542, "step": 5730 }, { "epoch": 1.703769664588899, "grad_norm": 0.7409269213676453, "learning_rate": 3.296230335411101e-05, "loss": 0.902, "step": 5740 }, { "epoch": 1.7067379044226774, "grad_norm": 0.6369599103927612, "learning_rate": 3.293262095577323e-05, "loss": 0.9278, "step": 5750 }, { "epoch": 1.7097061442564558, "grad_norm": 0.74519944190979, "learning_rate": 3.290293855743544e-05, "loss": 0.9486, "step": 5760 }, { "epoch": 1.7126743840902345, "grad_norm": 0.7439436912536621, "learning_rate": 3.287325615909765e-05, "loss": 0.9283, "step": 5770 }, { "epoch": 1.715642623924013, "grad_norm": 0.6844840049743652, "learning_rate": 3.2843573760759875e-05, "loss": 0.9643, "step": 5780 }, { "epoch": 1.7186108637577915, "grad_norm": 0.6939666271209717, "learning_rate": 3.2813891362422084e-05, "loss": 0.9393, "step": 5790 }, { "epoch": 1.7215791035915702, "grad_norm": 0.739400327205658, "learning_rate": 3.27842089640843e-05, "loss": 0.8803, "step": 5800 }, { "epoch": 1.7245473434253489, "grad_norm": 0.7088194489479065, "learning_rate": 3.275452656574651e-05, "loss": 0.8939, "step": 5810 }, { "epoch": 1.7275155832591274, "grad_norm": 0.7284060120582581, "learning_rate": 3.272484416740873e-05, "loss": 0.916, "step": 5820 }, { "epoch": 1.7304838230929058, "grad_norm": 0.6898395419120789, "learning_rate": 3.269516176907094e-05, "loss": 0.9158, "step": 5830 }, { "epoch": 1.7334520629266845, "grad_norm": 0.7644175887107849, "learning_rate": 3.266547937073315e-05, "loss": 0.8764, "step": 5840 }, { "epoch": 1.736420302760463, "grad_norm": 0.7546972036361694, "learning_rate": 3.2635796972395374e-05, "loss": 0.9211, "step": 5850 }, { "epoch": 1.7393885425942415, "grad_norm": 0.6879614591598511, "learning_rate": 3.2606114574057584e-05, "loss": 0.8859, "step": 5860 }, { "epoch": 1.7423567824280202, "grad_norm": 0.760108232498169, "learning_rate": 3.25764321757198e-05, "loss": 0.8904, "step": 5870 }, { "epoch": 1.7453250222617989, "grad_norm": 0.6350589990615845, "learning_rate": 3.2546749777382016e-05, "loss": 0.9096, "step": 5880 }, { "epoch": 1.7482932620955773, "grad_norm": 0.7435024976730347, "learning_rate": 3.251706737904423e-05, "loss": 0.891, "step": 5890 }, { "epoch": 1.7512615019293558, "grad_norm": 0.7279981970787048, "learning_rate": 3.248738498070644e-05, "loss": 0.8982, "step": 5900 }, { "epoch": 1.7542297417631345, "grad_norm": 0.7936573624610901, "learning_rate": 3.245770258236865e-05, "loss": 0.9058, "step": 5910 }, { "epoch": 1.757197981596913, "grad_norm": 0.6942318081855774, "learning_rate": 3.2428020184030874e-05, "loss": 0.9284, "step": 5920 }, { "epoch": 1.7601662214306915, "grad_norm": 0.7330809235572815, "learning_rate": 3.2398337785693083e-05, "loss": 0.9585, "step": 5930 }, { "epoch": 1.7631344612644702, "grad_norm": 0.7430150508880615, "learning_rate": 3.23686553873553e-05, "loss": 0.898, "step": 5940 }, { "epoch": 1.7661027010982489, "grad_norm": 0.6925823092460632, "learning_rate": 3.2338972989017516e-05, "loss": 0.9267, "step": 5950 }, { "epoch": 1.7690709409320273, "grad_norm": 0.6921890377998352, "learning_rate": 3.230929059067973e-05, "loss": 0.9102, "step": 5960 }, { "epoch": 1.7720391807658058, "grad_norm": 0.7293705344200134, "learning_rate": 3.227960819234194e-05, "loss": 0.9114, "step": 5970 }, { "epoch": 1.7750074205995845, "grad_norm": 0.6864549517631531, "learning_rate": 3.224992579400416e-05, "loss": 0.9243, "step": 5980 }, { "epoch": 1.777975660433363, "grad_norm": 0.7275557518005371, "learning_rate": 3.2220243395666374e-05, "loss": 0.8826, "step": 5990 }, { "epoch": 1.7809439002671414, "grad_norm": 0.7278900742530823, "learning_rate": 3.219056099732858e-05, "loss": 0.8902, "step": 6000 }, { "epoch": 1.7839121401009201, "grad_norm": 0.6495439410209656, "learning_rate": 3.21608785989908e-05, "loss": 0.9048, "step": 6010 }, { "epoch": 1.7868803799346988, "grad_norm": 0.6814368367195129, "learning_rate": 3.2131196200653016e-05, "loss": 0.9232, "step": 6020 }, { "epoch": 1.7898486197684773, "grad_norm": 0.756121039390564, "learning_rate": 3.210151380231523e-05, "loss": 0.9336, "step": 6030 }, { "epoch": 1.7928168596022558, "grad_norm": 0.7354190945625305, "learning_rate": 3.207183140397744e-05, "loss": 0.9535, "step": 6040 }, { "epoch": 1.7957850994360345, "grad_norm": 0.6970518827438354, "learning_rate": 3.204214900563966e-05, "loss": 0.9115, "step": 6050 }, { "epoch": 1.798753339269813, "grad_norm": 0.7166738510131836, "learning_rate": 3.2012466607301873e-05, "loss": 0.9179, "step": 6060 }, { "epoch": 1.8017215791035914, "grad_norm": 0.7010670304298401, "learning_rate": 3.198278420896408e-05, "loss": 0.9037, "step": 6070 }, { "epoch": 1.8046898189373701, "grad_norm": 0.7266472578048706, "learning_rate": 3.19531018106263e-05, "loss": 0.9261, "step": 6080 }, { "epoch": 1.8076580587711488, "grad_norm": 0.7845122814178467, "learning_rate": 3.1923419412288515e-05, "loss": 0.9096, "step": 6090 }, { "epoch": 1.8106262986049273, "grad_norm": 0.7793822288513184, "learning_rate": 3.189373701395073e-05, "loss": 0.9038, "step": 6100 }, { "epoch": 1.8135945384387058, "grad_norm": 0.6785141229629517, "learning_rate": 3.186405461561294e-05, "loss": 0.9126, "step": 6110 }, { "epoch": 1.8165627782724845, "grad_norm": 0.7554550766944885, "learning_rate": 3.183437221727516e-05, "loss": 0.9203, "step": 6120 }, { "epoch": 1.819531018106263, "grad_norm": 0.7003015279769897, "learning_rate": 3.180468981893737e-05, "loss": 0.9211, "step": 6130 }, { "epoch": 1.8224992579400414, "grad_norm": 0.6762093901634216, "learning_rate": 3.177500742059958e-05, "loss": 0.9299, "step": 6140 }, { "epoch": 1.8254674977738201, "grad_norm": 0.6585193872451782, "learning_rate": 3.17453250222618e-05, "loss": 0.9087, "step": 6150 }, { "epoch": 1.8284357376075988, "grad_norm": 0.7564946413040161, "learning_rate": 3.1715642623924015e-05, "loss": 0.9609, "step": 6160 }, { "epoch": 1.8314039774413773, "grad_norm": 0.7777302265167236, "learning_rate": 3.168596022558623e-05, "loss": 0.9081, "step": 6170 }, { "epoch": 1.8343722172751558, "grad_norm": 0.6798081994056702, "learning_rate": 3.165627782724844e-05, "loss": 0.9173, "step": 6180 }, { "epoch": 1.8373404571089345, "grad_norm": 0.7171394228935242, "learning_rate": 3.162659542891066e-05, "loss": 0.9549, "step": 6190 }, { "epoch": 1.840308696942713, "grad_norm": 0.621942937374115, "learning_rate": 3.159691303057287e-05, "loss": 0.938, "step": 6200 }, { "epoch": 1.8432769367764914, "grad_norm": 0.6758529543876648, "learning_rate": 3.156723063223508e-05, "loss": 0.9292, "step": 6210 }, { "epoch": 1.84624517661027, "grad_norm": 0.7157332897186279, "learning_rate": 3.15375482338973e-05, "loss": 0.9095, "step": 6220 }, { "epoch": 1.8492134164440488, "grad_norm": 0.6590803265571594, "learning_rate": 3.1507865835559515e-05, "loss": 0.9146, "step": 6230 }, { "epoch": 1.8521816562778273, "grad_norm": 0.6611948609352112, "learning_rate": 3.147818343722173e-05, "loss": 0.8839, "step": 6240 }, { "epoch": 1.8551498961116057, "grad_norm": 0.7113106846809387, "learning_rate": 3.144850103888394e-05, "loss": 0.9448, "step": 6250 }, { "epoch": 1.8581181359453844, "grad_norm": 0.6739339828491211, "learning_rate": 3.1418818640546156e-05, "loss": 0.8689, "step": 6260 }, { "epoch": 1.861086375779163, "grad_norm": 0.7268794178962708, "learning_rate": 3.138913624220837e-05, "loss": 0.9046, "step": 6270 }, { "epoch": 1.8640546156129414, "grad_norm": 0.6519601345062256, "learning_rate": 3.135945384387058e-05, "loss": 0.95, "step": 6280 }, { "epoch": 1.86702285544672, "grad_norm": 0.7493473291397095, "learning_rate": 3.1329771445532805e-05, "loss": 0.9424, "step": 6290 }, { "epoch": 1.8699910952804988, "grad_norm": 0.7745805382728577, "learning_rate": 3.1300089047195014e-05, "loss": 0.8366, "step": 6300 }, { "epoch": 1.8729593351142773, "grad_norm": 0.7086494565010071, "learning_rate": 3.127040664885723e-05, "loss": 0.9233, "step": 6310 }, { "epoch": 1.8759275749480557, "grad_norm": 0.6021324992179871, "learning_rate": 3.124072425051944e-05, "loss": 0.8949, "step": 6320 }, { "epoch": 1.8788958147818344, "grad_norm": 0.7947562336921692, "learning_rate": 3.121104185218166e-05, "loss": 0.8961, "step": 6330 }, { "epoch": 1.881864054615613, "grad_norm": 0.6500969529151917, "learning_rate": 3.118135945384387e-05, "loss": 0.9317, "step": 6340 }, { "epoch": 1.8848322944493914, "grad_norm": 0.7063040137290955, "learning_rate": 3.115167705550608e-05, "loss": 0.8962, "step": 6350 }, { "epoch": 1.88780053428317, "grad_norm": 0.7632299661636353, "learning_rate": 3.1121994657168305e-05, "loss": 0.891, "step": 6360 }, { "epoch": 1.8907687741169488, "grad_norm": 0.7408263087272644, "learning_rate": 3.1092312258830514e-05, "loss": 0.9047, "step": 6370 }, { "epoch": 1.8937370139507272, "grad_norm": 0.7138951420783997, "learning_rate": 3.106262986049273e-05, "loss": 0.8845, "step": 6380 }, { "epoch": 1.8967052537845057, "grad_norm": 0.7365957498550415, "learning_rate": 3.1032947462154946e-05, "loss": 0.8832, "step": 6390 }, { "epoch": 1.8996734936182844, "grad_norm": 0.6995649337768555, "learning_rate": 3.100326506381716e-05, "loss": 0.9068, "step": 6400 }, { "epoch": 1.9026417334520629, "grad_norm": 0.6828333735466003, "learning_rate": 3.097358266547937e-05, "loss": 0.9118, "step": 6410 }, { "epoch": 1.9056099732858414, "grad_norm": 0.6359698176383972, "learning_rate": 3.094390026714158e-05, "loss": 0.8868, "step": 6420 }, { "epoch": 1.90857821311962, "grad_norm": 0.6921895146369934, "learning_rate": 3.0914217868803804e-05, "loss": 0.8764, "step": 6430 }, { "epoch": 1.9115464529533988, "grad_norm": 0.7117785215377808, "learning_rate": 3.0884535470466014e-05, "loss": 0.923, "step": 6440 }, { "epoch": 1.9145146927871772, "grad_norm": 0.6908830404281616, "learning_rate": 3.085485307212823e-05, "loss": 0.8842, "step": 6450 }, { "epoch": 1.9174829326209557, "grad_norm": 0.651472806930542, "learning_rate": 3.0825170673790446e-05, "loss": 0.9351, "step": 6460 }, { "epoch": 1.9204511724547344, "grad_norm": 0.7545375823974609, "learning_rate": 3.079548827545266e-05, "loss": 0.9037, "step": 6470 }, { "epoch": 1.9234194122885129, "grad_norm": 0.6635625958442688, "learning_rate": 3.076580587711487e-05, "loss": 0.9218, "step": 6480 }, { "epoch": 1.9263876521222913, "grad_norm": 0.642776608467102, "learning_rate": 3.073612347877709e-05, "loss": 0.9211, "step": 6490 }, { "epoch": 1.92935589195607, "grad_norm": 0.605273425579071, "learning_rate": 3.0706441080439304e-05, "loss": 0.8955, "step": 6500 }, { "epoch": 1.9323241317898487, "grad_norm": 0.7083524465560913, "learning_rate": 3.067675868210151e-05, "loss": 0.9138, "step": 6510 }, { "epoch": 1.9352923716236272, "grad_norm": 0.6784158945083618, "learning_rate": 3.064707628376373e-05, "loss": 0.9245, "step": 6520 }, { "epoch": 1.9382606114574057, "grad_norm": 0.6602509021759033, "learning_rate": 3.0617393885425946e-05, "loss": 0.9042, "step": 6530 }, { "epoch": 1.9412288512911844, "grad_norm": 0.8108129501342773, "learning_rate": 3.058771148708816e-05, "loss": 0.873, "step": 6540 }, { "epoch": 1.9441970911249629, "grad_norm": 0.7119005918502808, "learning_rate": 3.055802908875037e-05, "loss": 0.8816, "step": 6550 }, { "epoch": 1.9471653309587413, "grad_norm": 0.6957710981369019, "learning_rate": 3.052834669041259e-05, "loss": 0.8922, "step": 6560 }, { "epoch": 1.95013357079252, "grad_norm": 0.6639032363891602, "learning_rate": 3.0498664292074804e-05, "loss": 0.952, "step": 6570 }, { "epoch": 1.9531018106262987, "grad_norm": 0.7830907106399536, "learning_rate": 3.0468981893737013e-05, "loss": 0.8939, "step": 6580 }, { "epoch": 1.9560700504600772, "grad_norm": 0.7688878774642944, "learning_rate": 3.0439299495399233e-05, "loss": 0.8802, "step": 6590 }, { "epoch": 1.9590382902938557, "grad_norm": 0.710884690284729, "learning_rate": 3.0409617097061442e-05, "loss": 0.9142, "step": 6600 }, { "epoch": 1.9620065301276344, "grad_norm": 0.7745290994644165, "learning_rate": 3.037993469872366e-05, "loss": 0.9117, "step": 6610 }, { "epoch": 1.9649747699614128, "grad_norm": 0.8341137766838074, "learning_rate": 3.0350252300385874e-05, "loss": 0.8686, "step": 6620 }, { "epoch": 1.9679430097951913, "grad_norm": 0.7225478887557983, "learning_rate": 3.0320569902048084e-05, "loss": 0.9018, "step": 6630 }, { "epoch": 1.97091124962897, "grad_norm": 0.7451010942459106, "learning_rate": 3.0290887503710303e-05, "loss": 0.8903, "step": 6640 }, { "epoch": 1.9738794894627487, "grad_norm": 0.6880558133125305, "learning_rate": 3.0261205105372513e-05, "loss": 0.9173, "step": 6650 }, { "epoch": 1.9768477292965272, "grad_norm": 0.6597363352775574, "learning_rate": 3.0231522707034732e-05, "loss": 0.9326, "step": 6660 }, { "epoch": 1.9798159691303057, "grad_norm": 0.6880154013633728, "learning_rate": 3.0201840308696945e-05, "loss": 0.8933, "step": 6670 }, { "epoch": 1.9827842089640844, "grad_norm": 0.755906343460083, "learning_rate": 3.017215791035916e-05, "loss": 0.8941, "step": 6680 }, { "epoch": 1.9857524487978628, "grad_norm": 0.7369247078895569, "learning_rate": 3.0142475512021374e-05, "loss": 0.8862, "step": 6690 }, { "epoch": 1.9887206886316413, "grad_norm": 0.7142388820648193, "learning_rate": 3.0112793113683583e-05, "loss": 0.8947, "step": 6700 }, { "epoch": 1.99168892846542, "grad_norm": 0.7839199900627136, "learning_rate": 3.0083110715345803e-05, "loss": 0.9156, "step": 6710 }, { "epoch": 1.9946571682991987, "grad_norm": 0.7071635723114014, "learning_rate": 3.0053428317008016e-05, "loss": 0.9129, "step": 6720 }, { "epoch": 1.9976254081329772, "grad_norm": 0.6864856481552124, "learning_rate": 3.0023745918670232e-05, "loss": 0.8744, "step": 6730 }, { "epoch": 2.0005936479667557, "grad_norm": 0.7013072371482849, "learning_rate": 2.9994063520332445e-05, "loss": 0.8763, "step": 6740 }, { "epoch": 2.0035618878005343, "grad_norm": 0.7171157002449036, "learning_rate": 2.996438112199466e-05, "loss": 0.8503, "step": 6750 }, { "epoch": 2.006530127634313, "grad_norm": 0.7891656756401062, "learning_rate": 2.9934698723656874e-05, "loss": 0.8224, "step": 6760 }, { "epoch": 2.0094983674680913, "grad_norm": 0.7087323069572449, "learning_rate": 2.9905016325319086e-05, "loss": 0.8945, "step": 6770 }, { "epoch": 2.01246660730187, "grad_norm": 0.750305712223053, "learning_rate": 2.9875333926981303e-05, "loss": 0.8414, "step": 6780 }, { "epoch": 2.0154348471356487, "grad_norm": 0.6855888366699219, "learning_rate": 2.9845651528643515e-05, "loss": 0.8437, "step": 6790 }, { "epoch": 2.018403086969427, "grad_norm": 0.7236269116401672, "learning_rate": 2.981596913030573e-05, "loss": 0.8373, "step": 6800 }, { "epoch": 2.0213713268032056, "grad_norm": 0.7629292011260986, "learning_rate": 2.9786286731967944e-05, "loss": 0.8659, "step": 6810 }, { "epoch": 2.0243395666369843, "grad_norm": 0.8071072697639465, "learning_rate": 2.975660433363016e-05, "loss": 0.9247, "step": 6820 }, { "epoch": 2.027307806470763, "grad_norm": 0.7631831765174866, "learning_rate": 2.9726921935292373e-05, "loss": 0.8482, "step": 6830 }, { "epoch": 2.0302760463045413, "grad_norm": 0.681796669960022, "learning_rate": 2.9697239536954586e-05, "loss": 0.8714, "step": 6840 }, { "epoch": 2.03324428613832, "grad_norm": 0.7427048087120056, "learning_rate": 2.9667557138616802e-05, "loss": 0.8349, "step": 6850 }, { "epoch": 2.0362125259720987, "grad_norm": 0.7295932769775391, "learning_rate": 2.9637874740279015e-05, "loss": 0.829, "step": 6860 }, { "epoch": 2.039180765805877, "grad_norm": 0.6897096633911133, "learning_rate": 2.960819234194123e-05, "loss": 0.8319, "step": 6870 }, { "epoch": 2.0421490056396556, "grad_norm": 0.7505813837051392, "learning_rate": 2.9578509943603444e-05, "loss": 0.869, "step": 6880 }, { "epoch": 2.0451172454734343, "grad_norm": 0.7533600330352783, "learning_rate": 2.954882754526566e-05, "loss": 0.8683, "step": 6890 }, { "epoch": 2.048085485307213, "grad_norm": 0.7831742167472839, "learning_rate": 2.9519145146927873e-05, "loss": 0.8161, "step": 6900 }, { "epoch": 2.0510537251409913, "grad_norm": 0.7394450306892395, "learning_rate": 2.9489462748590086e-05, "loss": 0.8834, "step": 6910 }, { "epoch": 2.05402196497477, "grad_norm": 0.7655673623085022, "learning_rate": 2.9459780350252302e-05, "loss": 0.8265, "step": 6920 }, { "epoch": 2.0569902048085487, "grad_norm": 0.7019949555397034, "learning_rate": 2.9430097951914515e-05, "loss": 0.8573, "step": 6930 }, { "epoch": 2.059958444642327, "grad_norm": 0.6387875080108643, "learning_rate": 2.940041555357673e-05, "loss": 0.8259, "step": 6940 }, { "epoch": 2.0629266844761056, "grad_norm": 0.7614507079124451, "learning_rate": 2.9370733155238944e-05, "loss": 0.8758, "step": 6950 }, { "epoch": 2.0658949243098843, "grad_norm": 0.7810057401657104, "learning_rate": 2.9341050756901163e-05, "loss": 0.824, "step": 6960 }, { "epoch": 2.068863164143663, "grad_norm": 0.7228024005889893, "learning_rate": 2.9311368358563373e-05, "loss": 0.8345, "step": 6970 }, { "epoch": 2.0718314039774413, "grad_norm": 0.6962882876396179, "learning_rate": 2.9281685960225585e-05, "loss": 0.8099, "step": 6980 }, { "epoch": 2.07479964381122, "grad_norm": 0.8062400221824646, "learning_rate": 2.92520035618878e-05, "loss": 0.8453, "step": 6990 }, { "epoch": 2.0777678836449986, "grad_norm": 0.787611186504364, "learning_rate": 2.9222321163550014e-05, "loss": 0.8787, "step": 7000 }, { "epoch": 2.080736123478777, "grad_norm": 0.7147652506828308, "learning_rate": 2.9192638765212234e-05, "loss": 0.8768, "step": 7010 }, { "epoch": 2.0837043633125556, "grad_norm": 0.6608722805976868, "learning_rate": 2.9162956366874443e-05, "loss": 0.8247, "step": 7020 }, { "epoch": 2.0866726031463343, "grad_norm": 0.752377450466156, "learning_rate": 2.9133273968536663e-05, "loss": 0.8037, "step": 7030 }, { "epoch": 2.089640842980113, "grad_norm": 0.8308156728744507, "learning_rate": 2.9103591570198872e-05, "loss": 0.8878, "step": 7040 }, { "epoch": 2.0926090828138912, "grad_norm": 0.6480581164360046, "learning_rate": 2.9073909171861085e-05, "loss": 0.8472, "step": 7050 }, { "epoch": 2.09557732264767, "grad_norm": 0.7647300362586975, "learning_rate": 2.9044226773523305e-05, "loss": 0.864, "step": 7060 }, { "epoch": 2.0985455624814486, "grad_norm": 0.7759458422660828, "learning_rate": 2.9014544375185514e-05, "loss": 0.8124, "step": 7070 }, { "epoch": 2.101513802315227, "grad_norm": 0.7516981363296509, "learning_rate": 2.8984861976847734e-05, "loss": 0.8517, "step": 7080 }, { "epoch": 2.1044820421490056, "grad_norm": 0.7401167750358582, "learning_rate": 2.8955179578509943e-05, "loss": 0.8302, "step": 7090 }, { "epoch": 2.1074502819827843, "grad_norm": 0.7449563145637512, "learning_rate": 2.8925497180172163e-05, "loss": 0.873, "step": 7100 }, { "epoch": 2.110418521816563, "grad_norm": 0.8492381572723389, "learning_rate": 2.8895814781834375e-05, "loss": 0.822, "step": 7110 }, { "epoch": 2.1133867616503412, "grad_norm": 0.7222036719322205, "learning_rate": 2.8866132383496585e-05, "loss": 0.818, "step": 7120 }, { "epoch": 2.11635500148412, "grad_norm": 0.7867766618728638, "learning_rate": 2.8836449985158804e-05, "loss": 0.8539, "step": 7130 }, { "epoch": 2.1193232413178986, "grad_norm": 0.7673478126525879, "learning_rate": 2.8806767586821014e-05, "loss": 0.8221, "step": 7140 }, { "epoch": 2.122291481151677, "grad_norm": 0.7407607436180115, "learning_rate": 2.8777085188483233e-05, "loss": 0.8626, "step": 7150 }, { "epoch": 2.1252597209854556, "grad_norm": 0.6958709955215454, "learning_rate": 2.8747402790145446e-05, "loss": 0.8333, "step": 7160 }, { "epoch": 2.1282279608192343, "grad_norm": 0.7231135964393616, "learning_rate": 2.8717720391807662e-05, "loss": 0.8548, "step": 7170 }, { "epoch": 2.131196200653013, "grad_norm": 0.9101832509040833, "learning_rate": 2.8688037993469875e-05, "loss": 0.8288, "step": 7180 }, { "epoch": 2.134164440486791, "grad_norm": 0.7785500884056091, "learning_rate": 2.8658355595132085e-05, "loss": 0.8675, "step": 7190 }, { "epoch": 2.13713268032057, "grad_norm": 0.7221528887748718, "learning_rate": 2.8628673196794304e-05, "loss": 0.8963, "step": 7200 }, { "epoch": 2.1401009201543486, "grad_norm": 0.7630613446235657, "learning_rate": 2.8598990798456517e-05, "loss": 0.8525, "step": 7210 }, { "epoch": 2.143069159988127, "grad_norm": 0.7581575512886047, "learning_rate": 2.8569308400118733e-05, "loss": 0.8182, "step": 7220 }, { "epoch": 2.1460373998219056, "grad_norm": 0.9540184140205383, "learning_rate": 2.8539626001780946e-05, "loss": 0.8806, "step": 7230 }, { "epoch": 2.1490056396556843, "grad_norm": 0.7357391715049744, "learning_rate": 2.8509943603443162e-05, "loss": 0.8415, "step": 7240 }, { "epoch": 2.151973879489463, "grad_norm": 0.9968686103820801, "learning_rate": 2.8480261205105375e-05, "loss": 0.8193, "step": 7250 }, { "epoch": 2.154942119323241, "grad_norm": 0.8082394003868103, "learning_rate": 2.8450578806767588e-05, "loss": 0.8351, "step": 7260 }, { "epoch": 2.15791035915702, "grad_norm": 0.757830023765564, "learning_rate": 2.8420896408429804e-05, "loss": 0.8291, "step": 7270 }, { "epoch": 2.1608785989907986, "grad_norm": 0.8347230553627014, "learning_rate": 2.8391214010092017e-05, "loss": 0.8315, "step": 7280 }, { "epoch": 2.163846838824577, "grad_norm": 0.6753961443901062, "learning_rate": 2.8361531611754233e-05, "loss": 0.8755, "step": 7290 }, { "epoch": 2.1668150786583555, "grad_norm": 0.8221672773361206, "learning_rate": 2.8331849213416445e-05, "loss": 0.8614, "step": 7300 }, { "epoch": 2.1697833184921342, "grad_norm": 0.7886311411857605, "learning_rate": 2.830216681507866e-05, "loss": 0.8719, "step": 7310 }, { "epoch": 2.172751558325913, "grad_norm": 0.7683786749839783, "learning_rate": 2.8272484416740874e-05, "loss": 0.863, "step": 7320 }, { "epoch": 2.175719798159691, "grad_norm": 0.8968241810798645, "learning_rate": 2.8242802018403087e-05, "loss": 0.873, "step": 7330 }, { "epoch": 2.17868803799347, "grad_norm": 0.819404125213623, "learning_rate": 2.8213119620065303e-05, "loss": 0.8907, "step": 7340 }, { "epoch": 2.1816562778272486, "grad_norm": 0.7417108416557312, "learning_rate": 2.8183437221727516e-05, "loss": 0.8809, "step": 7350 }, { "epoch": 2.184624517661027, "grad_norm": 0.8411499261856079, "learning_rate": 2.8153754823389732e-05, "loss": 0.9122, "step": 7360 }, { "epoch": 2.1875927574948055, "grad_norm": 0.776980996131897, "learning_rate": 2.8124072425051945e-05, "loss": 0.854, "step": 7370 }, { "epoch": 2.1905609973285842, "grad_norm": 0.7408774495124817, "learning_rate": 2.809439002671416e-05, "loss": 0.8633, "step": 7380 }, { "epoch": 2.193529237162363, "grad_norm": 0.8015629649162292, "learning_rate": 2.8064707628376374e-05, "loss": 0.8479, "step": 7390 }, { "epoch": 2.196497476996141, "grad_norm": 0.833080530166626, "learning_rate": 2.8035025230038587e-05, "loss": 0.8482, "step": 7400 }, { "epoch": 2.19946571682992, "grad_norm": 0.720993161201477, "learning_rate": 2.8005342831700803e-05, "loss": 0.8626, "step": 7410 }, { "epoch": 2.2024339566636986, "grad_norm": 0.7985531091690063, "learning_rate": 2.7975660433363016e-05, "loss": 0.8456, "step": 7420 }, { "epoch": 2.205402196497477, "grad_norm": 0.7086364030838013, "learning_rate": 2.7945978035025232e-05, "loss": 0.8585, "step": 7430 }, { "epoch": 2.2083704363312555, "grad_norm": 0.8406996130943298, "learning_rate": 2.7916295636687445e-05, "loss": 0.8614, "step": 7440 }, { "epoch": 2.211338676165034, "grad_norm": 0.8609254956245422, "learning_rate": 2.788661323834966e-05, "loss": 0.8563, "step": 7450 }, { "epoch": 2.214306915998813, "grad_norm": 0.7881911396980286, "learning_rate": 2.7856930840011874e-05, "loss": 0.8697, "step": 7460 }, { "epoch": 2.217275155832591, "grad_norm": 0.6949644088745117, "learning_rate": 2.7827248441674087e-05, "loss": 0.868, "step": 7470 }, { "epoch": 2.22024339566637, "grad_norm": 0.786999523639679, "learning_rate": 2.7797566043336303e-05, "loss": 0.8641, "step": 7480 }, { "epoch": 2.2232116355001486, "grad_norm": 0.7273027300834656, "learning_rate": 2.7767883644998516e-05, "loss": 0.8403, "step": 7490 }, { "epoch": 2.226179875333927, "grad_norm": 0.8558393120765686, "learning_rate": 2.7738201246660732e-05, "loss": 0.8597, "step": 7500 }, { "epoch": 2.2291481151677055, "grad_norm": 0.7229394912719727, "learning_rate": 2.7708518848322945e-05, "loss": 0.8706, "step": 7510 }, { "epoch": 2.232116355001484, "grad_norm": 0.825646698474884, "learning_rate": 2.7678836449985164e-05, "loss": 0.9021, "step": 7520 }, { "epoch": 2.235084594835263, "grad_norm": 0.8738429546356201, "learning_rate": 2.7649154051647374e-05, "loss": 0.8346, "step": 7530 }, { "epoch": 2.238052834669041, "grad_norm": 0.7595973014831543, "learning_rate": 2.7619471653309586e-05, "loss": 0.8598, "step": 7540 }, { "epoch": 2.24102107450282, "grad_norm": 0.8204748630523682, "learning_rate": 2.7589789254971802e-05, "loss": 0.8509, "step": 7550 }, { "epoch": 2.2439893143365985, "grad_norm": 0.7805768847465515, "learning_rate": 2.7560106856634015e-05, "loss": 0.8247, "step": 7560 }, { "epoch": 2.246957554170377, "grad_norm": 0.8872819542884827, "learning_rate": 2.7530424458296235e-05, "loss": 0.8543, "step": 7570 }, { "epoch": 2.2499257940041555, "grad_norm": 0.789537250995636, "learning_rate": 2.7500742059958444e-05, "loss": 0.8641, "step": 7580 }, { "epoch": 2.252894033837934, "grad_norm": 0.7404807806015015, "learning_rate": 2.7471059661620664e-05, "loss": 0.8518, "step": 7590 }, { "epoch": 2.2558622736717124, "grad_norm": 0.7324669361114502, "learning_rate": 2.7441377263282873e-05, "loss": 0.8457, "step": 7600 }, { "epoch": 2.258830513505491, "grad_norm": 0.7856626510620117, "learning_rate": 2.7411694864945086e-05, "loss": 0.8568, "step": 7610 }, { "epoch": 2.26179875333927, "grad_norm": 0.7492886781692505, "learning_rate": 2.7382012466607306e-05, "loss": 0.8075, "step": 7620 }, { "epoch": 2.2647669931730485, "grad_norm": 0.7941344976425171, "learning_rate": 2.7352330068269515e-05, "loss": 0.8431, "step": 7630 }, { "epoch": 2.267735233006827, "grad_norm": 0.7928834557533264, "learning_rate": 2.7322647669931734e-05, "loss": 0.8743, "step": 7640 }, { "epoch": 2.2707034728406055, "grad_norm": 0.7476490139961243, "learning_rate": 2.7292965271593944e-05, "loss": 0.8079, "step": 7650 }, { "epoch": 2.273671712674384, "grad_norm": 0.7902363538742065, "learning_rate": 2.7263282873256163e-05, "loss": 0.8351, "step": 7660 }, { "epoch": 2.2766399525081624, "grad_norm": 0.7967187166213989, "learning_rate": 2.7233600474918376e-05, "loss": 0.8835, "step": 7670 }, { "epoch": 2.279608192341941, "grad_norm": 0.8231721520423889, "learning_rate": 2.7203918076580586e-05, "loss": 0.8446, "step": 7680 }, { "epoch": 2.28257643217572, "grad_norm": 0.7556421160697937, "learning_rate": 2.7174235678242805e-05, "loss": 0.8458, "step": 7690 }, { "epoch": 2.2855446720094985, "grad_norm": 0.7829645872116089, "learning_rate": 2.7144553279905015e-05, "loss": 0.8547, "step": 7700 }, { "epoch": 2.2885129118432768, "grad_norm": 0.7063056230545044, "learning_rate": 2.7114870881567234e-05, "loss": 0.869, "step": 7710 }, { "epoch": 2.2914811516770555, "grad_norm": 0.7241949439048767, "learning_rate": 2.7085188483229447e-05, "loss": 0.8424, "step": 7720 }, { "epoch": 2.294449391510834, "grad_norm": 0.7578948140144348, "learning_rate": 2.7055506084891663e-05, "loss": 0.8515, "step": 7730 }, { "epoch": 2.2974176313446124, "grad_norm": 0.835233211517334, "learning_rate": 2.7025823686553876e-05, "loss": 0.8605, "step": 7740 }, { "epoch": 2.300385871178391, "grad_norm": 0.7147203087806702, "learning_rate": 2.6996141288216085e-05, "loss": 0.8272, "step": 7750 }, { "epoch": 2.30335411101217, "grad_norm": 0.7513293027877808, "learning_rate": 2.6966458889878305e-05, "loss": 0.8783, "step": 7760 }, { "epoch": 2.3063223508459485, "grad_norm": 0.8606356978416443, "learning_rate": 2.6936776491540518e-05, "loss": 0.8841, "step": 7770 }, { "epoch": 2.3092905906797268, "grad_norm": 0.8113495707511902, "learning_rate": 2.6907094093202734e-05, "loss": 0.8665, "step": 7780 }, { "epoch": 2.3122588305135054, "grad_norm": 0.8095558285713196, "learning_rate": 2.6877411694864947e-05, "loss": 0.8226, "step": 7790 }, { "epoch": 2.315227070347284, "grad_norm": 0.73026442527771, "learning_rate": 2.6847729296527163e-05, "loss": 0.8518, "step": 7800 }, { "epoch": 2.3181953101810624, "grad_norm": 0.8076465129852295, "learning_rate": 2.6818046898189376e-05, "loss": 0.8224, "step": 7810 }, { "epoch": 2.321163550014841, "grad_norm": 0.7851926684379578, "learning_rate": 2.678836449985159e-05, "loss": 0.8679, "step": 7820 }, { "epoch": 2.32413178984862, "grad_norm": 0.7606148719787598, "learning_rate": 2.6758682101513805e-05, "loss": 0.876, "step": 7830 }, { "epoch": 2.3271000296823985, "grad_norm": 0.79587322473526, "learning_rate": 2.6728999703176017e-05, "loss": 0.8056, "step": 7840 }, { "epoch": 2.3300682695161767, "grad_norm": 0.6922584176063538, "learning_rate": 2.6699317304838234e-05, "loss": 0.836, "step": 7850 }, { "epoch": 2.3330365093499554, "grad_norm": 0.7687097787857056, "learning_rate": 2.6669634906500446e-05, "loss": 0.8151, "step": 7860 }, { "epoch": 2.336004749183734, "grad_norm": 0.8245091438293457, "learning_rate": 2.6639952508162663e-05, "loss": 0.7964, "step": 7870 }, { "epoch": 2.3389729890175124, "grad_norm": 0.7584673762321472, "learning_rate": 2.6610270109824875e-05, "loss": 0.8807, "step": 7880 }, { "epoch": 2.341941228851291, "grad_norm": 0.8042759895324707, "learning_rate": 2.6580587711487088e-05, "loss": 0.8211, "step": 7890 }, { "epoch": 2.3449094686850698, "grad_norm": 0.7295954823493958, "learning_rate": 2.6550905313149304e-05, "loss": 0.8415, "step": 7900 }, { "epoch": 2.3478777085188485, "grad_norm": 0.688973069190979, "learning_rate": 2.6521222914811517e-05, "loss": 0.8405, "step": 7910 }, { "epoch": 2.3508459483526267, "grad_norm": 0.7817041873931885, "learning_rate": 2.6491540516473733e-05, "loss": 0.8789, "step": 7920 }, { "epoch": 2.3538141881864054, "grad_norm": 0.7820131182670593, "learning_rate": 2.6461858118135946e-05, "loss": 0.8383, "step": 7930 }, { "epoch": 2.356782428020184, "grad_norm": 0.728888750076294, "learning_rate": 2.6432175719798162e-05, "loss": 0.8294, "step": 7940 }, { "epoch": 2.3597506678539624, "grad_norm": 0.7914735078811646, "learning_rate": 2.6402493321460375e-05, "loss": 0.8282, "step": 7950 }, { "epoch": 2.362718907687741, "grad_norm": 0.7743697762489319, "learning_rate": 2.6372810923122588e-05, "loss": 0.8325, "step": 7960 }, { "epoch": 2.3656871475215198, "grad_norm": 0.7644380927085876, "learning_rate": 2.6343128524784804e-05, "loss": 0.8519, "step": 7970 }, { "epoch": 2.3686553873552985, "grad_norm": 0.7865247130393982, "learning_rate": 2.6313446126447017e-05, "loss": 0.8961, "step": 7980 }, { "epoch": 2.3716236271890767, "grad_norm": 0.7742238640785217, "learning_rate": 2.6283763728109233e-05, "loss": 0.8527, "step": 7990 }, { "epoch": 2.3745918670228554, "grad_norm": 0.7000154256820679, "learning_rate": 2.6254081329771446e-05, "loss": 0.8694, "step": 8000 }, { "epoch": 2.377560106856634, "grad_norm": 0.7216348648071289, "learning_rate": 2.6224398931433662e-05, "loss": 0.8619, "step": 8010 }, { "epoch": 2.3805283466904124, "grad_norm": 0.7548758387565613, "learning_rate": 2.6194716533095875e-05, "loss": 0.8852, "step": 8020 }, { "epoch": 2.383496586524191, "grad_norm": 0.7663046717643738, "learning_rate": 2.6165034134758087e-05, "loss": 0.8495, "step": 8030 }, { "epoch": 2.3864648263579697, "grad_norm": 0.8307507038116455, "learning_rate": 2.6135351736420304e-05, "loss": 0.8275, "step": 8040 }, { "epoch": 2.3894330661917484, "grad_norm": 0.8365349173545837, "learning_rate": 2.6105669338082516e-05, "loss": 0.873, "step": 8050 }, { "epoch": 2.3924013060255267, "grad_norm": 0.8038210272789001, "learning_rate": 2.6075986939744736e-05, "loss": 0.8658, "step": 8060 }, { "epoch": 2.3953695458593054, "grad_norm": 0.7396426200866699, "learning_rate": 2.6046304541406945e-05, "loss": 0.8331, "step": 8070 }, { "epoch": 2.398337785693084, "grad_norm": 0.8063576221466064, "learning_rate": 2.6016622143069165e-05, "loss": 0.8453, "step": 8080 }, { "epoch": 2.4013060255268623, "grad_norm": 0.8973240256309509, "learning_rate": 2.5986939744731374e-05, "loss": 0.8578, "step": 8090 }, { "epoch": 2.404274265360641, "grad_norm": 0.8081666827201843, "learning_rate": 2.5957257346393587e-05, "loss": 0.8546, "step": 8100 }, { "epoch": 2.4072425051944197, "grad_norm": 0.7552733421325684, "learning_rate": 2.5927574948055807e-05, "loss": 0.7966, "step": 8110 }, { "epoch": 2.4102107450281984, "grad_norm": 0.7936916947364807, "learning_rate": 2.5897892549718016e-05, "loss": 0.8343, "step": 8120 }, { "epoch": 2.4131789848619767, "grad_norm": 0.7276195287704468, "learning_rate": 2.5868210151380236e-05, "loss": 0.8703, "step": 8130 }, { "epoch": 2.4161472246957554, "grad_norm": 0.8015003204345703, "learning_rate": 2.5838527753042445e-05, "loss": 0.877, "step": 8140 }, { "epoch": 2.419115464529534, "grad_norm": 0.8002310991287231, "learning_rate": 2.5808845354704665e-05, "loss": 0.8319, "step": 8150 }, { "epoch": 2.4220837043633123, "grad_norm": 0.8283134698867798, "learning_rate": 2.5779162956366877e-05, "loss": 0.857, "step": 8160 }, { "epoch": 2.425051944197091, "grad_norm": 0.805871844291687, "learning_rate": 2.5749480558029087e-05, "loss": 0.8277, "step": 8170 }, { "epoch": 2.4280201840308697, "grad_norm": 0.7265508770942688, "learning_rate": 2.5719798159691306e-05, "loss": 0.8733, "step": 8180 }, { "epoch": 2.4309884238646484, "grad_norm": 0.850188672542572, "learning_rate": 2.5690115761353516e-05, "loss": 0.8698, "step": 8190 }, { "epoch": 2.4339566636984267, "grad_norm": 0.7770668864250183, "learning_rate": 2.5660433363015735e-05, "loss": 0.8177, "step": 8200 }, { "epoch": 2.4369249035322054, "grad_norm": 0.7985296249389648, "learning_rate": 2.5630750964677948e-05, "loss": 0.849, "step": 8210 }, { "epoch": 2.439893143365984, "grad_norm": 0.8427296876907349, "learning_rate": 2.5601068566340164e-05, "loss": 0.8884, "step": 8220 }, { "epoch": 2.4428613831997623, "grad_norm": 0.7303191423416138, "learning_rate": 2.5571386168002377e-05, "loss": 0.9021, "step": 8230 }, { "epoch": 2.445829623033541, "grad_norm": 0.9019272923469543, "learning_rate": 2.5541703769664586e-05, "loss": 0.8816, "step": 8240 }, { "epoch": 2.4487978628673197, "grad_norm": 0.7612438797950745, "learning_rate": 2.5512021371326806e-05, "loss": 0.87, "step": 8250 }, { "epoch": 2.4517661027010984, "grad_norm": 0.7570661902427673, "learning_rate": 2.548233897298902e-05, "loss": 0.8375, "step": 8260 }, { "epoch": 2.4547343425348767, "grad_norm": 0.7646659016609192, "learning_rate": 2.5452656574651235e-05, "loss": 0.8595, "step": 8270 }, { "epoch": 2.4577025823686554, "grad_norm": 0.7863264679908752, "learning_rate": 2.5422974176313448e-05, "loss": 0.8535, "step": 8280 }, { "epoch": 2.460670822202434, "grad_norm": 0.6919600963592529, "learning_rate": 2.5393291777975664e-05, "loss": 0.8125, "step": 8290 }, { "epoch": 2.4636390620362123, "grad_norm": 0.8025588393211365, "learning_rate": 2.5363609379637877e-05, "loss": 0.8589, "step": 8300 }, { "epoch": 2.466607301869991, "grad_norm": 0.7756131291389465, "learning_rate": 2.533392698130009e-05, "loss": 0.88, "step": 8310 }, { "epoch": 2.4695755417037697, "grad_norm": 0.6762662529945374, "learning_rate": 2.5304244582962306e-05, "loss": 0.8455, "step": 8320 }, { "epoch": 2.4725437815375484, "grad_norm": 0.8348343968391418, "learning_rate": 2.527456218462452e-05, "loss": 0.8593, "step": 8330 }, { "epoch": 2.4755120213713266, "grad_norm": 0.8249140381813049, "learning_rate": 2.5244879786286735e-05, "loss": 0.8238, "step": 8340 }, { "epoch": 2.4784802612051053, "grad_norm": 0.8179166316986084, "learning_rate": 2.5215197387948947e-05, "loss": 0.8501, "step": 8350 }, { "epoch": 2.481448501038884, "grad_norm": 0.8068183064460754, "learning_rate": 2.5185514989611164e-05, "loss": 0.8611, "step": 8360 }, { "epoch": 2.4844167408726623, "grad_norm": 0.8538073301315308, "learning_rate": 2.5155832591273376e-05, "loss": 0.8695, "step": 8370 }, { "epoch": 2.487384980706441, "grad_norm": 0.8070683479309082, "learning_rate": 2.512615019293559e-05, "loss": 0.8575, "step": 8380 }, { "epoch": 2.4903532205402197, "grad_norm": 0.7846097946166992, "learning_rate": 2.5096467794597805e-05, "loss": 0.8764, "step": 8390 }, { "epoch": 2.4933214603739984, "grad_norm": 0.794704020023346, "learning_rate": 2.5066785396260018e-05, "loss": 0.8097, "step": 8400 }, { "epoch": 2.4962897002077766, "grad_norm": 0.7580412030220032, "learning_rate": 2.5037102997922234e-05, "loss": 0.8092, "step": 8410 }, { "epoch": 2.4992579400415553, "grad_norm": 0.7811313271522522, "learning_rate": 2.5007420599584447e-05, "loss": 0.8548, "step": 8420 }, { "epoch": 2.502226179875334, "grad_norm": 0.7651128172874451, "learning_rate": 2.497773820124666e-05, "loss": 0.8582, "step": 8430 }, { "epoch": 2.5051944197091123, "grad_norm": 0.7565361857414246, "learning_rate": 2.4948055802908876e-05, "loss": 0.8397, "step": 8440 }, { "epoch": 2.508162659542891, "grad_norm": 0.8782424330711365, "learning_rate": 2.4918373404571092e-05, "loss": 0.8598, "step": 8450 }, { "epoch": 2.5111308993766697, "grad_norm": 0.8438358306884766, "learning_rate": 2.4888691006233305e-05, "loss": 0.8433, "step": 8460 }, { "epoch": 2.5140991392104484, "grad_norm": 0.839470386505127, "learning_rate": 2.485900860789552e-05, "loss": 0.8754, "step": 8470 }, { "epoch": 2.5170673790442266, "grad_norm": 0.7891715168952942, "learning_rate": 2.482932620955773e-05, "loss": 0.8744, "step": 8480 }, { "epoch": 2.5200356188780053, "grad_norm": 0.8311156034469604, "learning_rate": 2.4799643811219947e-05, "loss": 0.8741, "step": 8490 }, { "epoch": 2.523003858711784, "grad_norm": 0.7338982224464417, "learning_rate": 2.4769961412882163e-05, "loss": 0.8549, "step": 8500 }, { "epoch": 2.5259720985455623, "grad_norm": 0.739483118057251, "learning_rate": 2.4740279014544376e-05, "loss": 0.8458, "step": 8510 }, { "epoch": 2.528940338379341, "grad_norm": 0.7444939613342285, "learning_rate": 2.4710596616206592e-05, "loss": 0.8497, "step": 8520 }, { "epoch": 2.5319085782131197, "grad_norm": 0.8406269550323486, "learning_rate": 2.4680914217868805e-05, "loss": 0.8514, "step": 8530 }, { "epoch": 2.5348768180468984, "grad_norm": 0.7409178614616394, "learning_rate": 2.465123181953102e-05, "loss": 0.8532, "step": 8540 }, { "epoch": 2.5378450578806766, "grad_norm": 0.8377125263214111, "learning_rate": 2.4621549421193234e-05, "loss": 0.8559, "step": 8550 }, { "epoch": 2.5408132977144553, "grad_norm": 0.9167962670326233, "learning_rate": 2.4591867022855447e-05, "loss": 0.8972, "step": 8560 }, { "epoch": 2.543781537548234, "grad_norm": 0.6903221607208252, "learning_rate": 2.4562184624517663e-05, "loss": 0.8719, "step": 8570 }, { "epoch": 2.5467497773820122, "grad_norm": 0.7576964497566223, "learning_rate": 2.4532502226179875e-05, "loss": 0.8451, "step": 8580 }, { "epoch": 2.549718017215791, "grad_norm": 0.736854612827301, "learning_rate": 2.450281982784209e-05, "loss": 0.8311, "step": 8590 }, { "epoch": 2.5526862570495696, "grad_norm": 0.7180027365684509, "learning_rate": 2.4473137429504304e-05, "loss": 0.8302, "step": 8600 }, { "epoch": 2.5556544968833483, "grad_norm": 0.7868888974189758, "learning_rate": 2.444345503116652e-05, "loss": 0.8455, "step": 8610 }, { "epoch": 2.5586227367171266, "grad_norm": 0.8172736167907715, "learning_rate": 2.4413772632828733e-05, "loss": 0.8489, "step": 8620 }, { "epoch": 2.5615909765509053, "grad_norm": 0.8119463920593262, "learning_rate": 2.4384090234490946e-05, "loss": 0.8814, "step": 8630 }, { "epoch": 2.564559216384684, "grad_norm": 0.7911559343338013, "learning_rate": 2.4354407836153162e-05, "loss": 0.8612, "step": 8640 }, { "epoch": 2.5675274562184622, "grad_norm": 0.8564543128013611, "learning_rate": 2.4324725437815375e-05, "loss": 0.8515, "step": 8650 }, { "epoch": 2.570495696052241, "grad_norm": 0.7224608063697815, "learning_rate": 2.429504303947759e-05, "loss": 0.8362, "step": 8660 }, { "epoch": 2.5734639358860196, "grad_norm": 0.8463042974472046, "learning_rate": 2.4265360641139808e-05, "loss": 0.8347, "step": 8670 }, { "epoch": 2.5764321757197983, "grad_norm": 0.7602813243865967, "learning_rate": 2.423567824280202e-05, "loss": 0.8597, "step": 8680 }, { "epoch": 2.5794004155535766, "grad_norm": 0.7302565574645996, "learning_rate": 2.4205995844464233e-05, "loss": 0.8879, "step": 8690 }, { "epoch": 2.5823686553873553, "grad_norm": 0.7882288098335266, "learning_rate": 2.4176313446126446e-05, "loss": 0.8381, "step": 8700 }, { "epoch": 2.585336895221134, "grad_norm": 0.7467893362045288, "learning_rate": 2.4146631047788662e-05, "loss": 0.8754, "step": 8710 }, { "epoch": 2.588305135054912, "grad_norm": 0.8261969685554504, "learning_rate": 2.4116948649450878e-05, "loss": 0.8302, "step": 8720 }, { "epoch": 2.591273374888691, "grad_norm": 0.7031750082969666, "learning_rate": 2.408726625111309e-05, "loss": 0.88, "step": 8730 }, { "epoch": 2.5942416147224696, "grad_norm": 0.9632667303085327, "learning_rate": 2.4057583852775307e-05, "loss": 0.8479, "step": 8740 }, { "epoch": 2.5972098545562483, "grad_norm": 0.737890899181366, "learning_rate": 2.402790145443752e-05, "loss": 0.843, "step": 8750 }, { "epoch": 2.6001780943900266, "grad_norm": 0.7835996747016907, "learning_rate": 2.3998219056099733e-05, "loss": 0.8806, "step": 8760 }, { "epoch": 2.6031463342238053, "grad_norm": 0.7798622250556946, "learning_rate": 2.396853665776195e-05, "loss": 0.8305, "step": 8770 }, { "epoch": 2.606114574057584, "grad_norm": 0.7724559903144836, "learning_rate": 2.3938854259424162e-05, "loss": 0.8446, "step": 8780 }, { "epoch": 2.609082813891362, "grad_norm": 0.8460879325866699, "learning_rate": 2.3909171861086378e-05, "loss": 0.8879, "step": 8790 }, { "epoch": 2.612051053725141, "grad_norm": 0.8908317685127258, "learning_rate": 2.387948946274859e-05, "loss": 0.8368, "step": 8800 }, { "epoch": 2.6150192935589196, "grad_norm": 0.8732563853263855, "learning_rate": 2.3849807064410807e-05, "loss": 0.8699, "step": 8810 }, { "epoch": 2.6179875333926983, "grad_norm": 0.8612620830535889, "learning_rate": 2.382012466607302e-05, "loss": 0.8259, "step": 8820 }, { "epoch": 2.6209557732264765, "grad_norm": 0.7714670896530151, "learning_rate": 2.3790442267735232e-05, "loss": 0.8301, "step": 8830 }, { "epoch": 2.6239240130602552, "grad_norm": 0.7954014539718628, "learning_rate": 2.376075986939745e-05, "loss": 0.8464, "step": 8840 }, { "epoch": 2.626892252894034, "grad_norm": 0.7666088342666626, "learning_rate": 2.373107747105966e-05, "loss": 0.8164, "step": 8850 }, { "epoch": 2.629860492727812, "grad_norm": 0.7557575702667236, "learning_rate": 2.3701395072721878e-05, "loss": 0.8564, "step": 8860 }, { "epoch": 2.632828732561591, "grad_norm": 0.9145186543464661, "learning_rate": 2.367171267438409e-05, "loss": 0.8065, "step": 8870 }, { "epoch": 2.6357969723953696, "grad_norm": 0.8163389563560486, "learning_rate": 2.3642030276046307e-05, "loss": 0.8479, "step": 8880 }, { "epoch": 2.6387652122291483, "grad_norm": 0.8326930403709412, "learning_rate": 2.3612347877708523e-05, "loss": 0.8564, "step": 8890 }, { "epoch": 2.6417334520629265, "grad_norm": 0.824146568775177, "learning_rate": 2.3582665479370732e-05, "loss": 0.8134, "step": 8900 }, { "epoch": 2.6447016918967052, "grad_norm": 0.7646833658218384, "learning_rate": 2.3552983081032948e-05, "loss": 0.8632, "step": 8910 }, { "epoch": 2.647669931730484, "grad_norm": 0.7801716923713684, "learning_rate": 2.352330068269516e-05, "loss": 0.8241, "step": 8920 }, { "epoch": 2.650638171564262, "grad_norm": 0.817964494228363, "learning_rate": 2.3493618284357377e-05, "loss": 0.8479, "step": 8930 }, { "epoch": 2.653606411398041, "grad_norm": 0.7799892425537109, "learning_rate": 2.3463935886019593e-05, "loss": 0.8611, "step": 8940 }, { "epoch": 2.6565746512318196, "grad_norm": 0.9056032299995422, "learning_rate": 2.3434253487681806e-05, "loss": 0.8473, "step": 8950 }, { "epoch": 2.6595428910655983, "grad_norm": 0.8326584696769714, "learning_rate": 2.3404571089344022e-05, "loss": 0.8504, "step": 8960 }, { "epoch": 2.6625111308993765, "grad_norm": 0.9176719188690186, "learning_rate": 2.3374888691006232e-05, "loss": 0.8573, "step": 8970 }, { "epoch": 2.665479370733155, "grad_norm": 0.9030516147613525, "learning_rate": 2.3345206292668448e-05, "loss": 0.8399, "step": 8980 }, { "epoch": 2.668447610566934, "grad_norm": 0.7815122008323669, "learning_rate": 2.3315523894330664e-05, "loss": 0.814, "step": 8990 }, { "epoch": 2.671415850400712, "grad_norm": 0.8600183129310608, "learning_rate": 2.3285841495992877e-05, "loss": 0.8664, "step": 9000 }, { "epoch": 2.674384090234491, "grad_norm": 0.7203440070152283, "learning_rate": 2.3256159097655093e-05, "loss": 0.8681, "step": 9010 }, { "epoch": 2.6773523300682696, "grad_norm": 0.8177782893180847, "learning_rate": 2.3226476699317306e-05, "loss": 0.8515, "step": 9020 }, { "epoch": 2.6803205699020483, "grad_norm": 0.8409428596496582, "learning_rate": 2.3196794300979522e-05, "loss": 0.8566, "step": 9030 }, { "epoch": 2.6832888097358265, "grad_norm": 0.8034857511520386, "learning_rate": 2.3167111902641735e-05, "loss": 0.8454, "step": 9040 }, { "epoch": 2.686257049569605, "grad_norm": 0.773566484451294, "learning_rate": 2.3137429504303948e-05, "loss": 0.8239, "step": 9050 }, { "epoch": 2.689225289403384, "grad_norm": 0.8261438608169556, "learning_rate": 2.3107747105966164e-05, "loss": 0.839, "step": 9060 }, { "epoch": 2.692193529237162, "grad_norm": 0.8331723809242249, "learning_rate": 2.3078064707628377e-05, "loss": 0.8509, "step": 9070 }, { "epoch": 2.695161769070941, "grad_norm": 0.8286955952644348, "learning_rate": 2.3048382309290593e-05, "loss": 0.8733, "step": 9080 }, { "epoch": 2.6981300089047195, "grad_norm": 0.8668794631958008, "learning_rate": 2.3018699910952806e-05, "loss": 0.8813, "step": 9090 }, { "epoch": 2.7010982487384982, "grad_norm": 0.8242696523666382, "learning_rate": 2.2989017512615022e-05, "loss": 0.8119, "step": 9100 }, { "epoch": 2.7040664885722765, "grad_norm": 0.7457850575447083, "learning_rate": 2.2959335114277235e-05, "loss": 0.8753, "step": 9110 }, { "epoch": 2.707034728406055, "grad_norm": 0.8100054860115051, "learning_rate": 2.2929652715939447e-05, "loss": 0.8304, "step": 9120 }, { "epoch": 2.710002968239834, "grad_norm": 0.8167350888252258, "learning_rate": 2.2899970317601664e-05, "loss": 0.8306, "step": 9130 }, { "epoch": 2.712971208073612, "grad_norm": 0.810926079750061, "learning_rate": 2.2870287919263876e-05, "loss": 0.8541, "step": 9140 }, { "epoch": 2.715939447907391, "grad_norm": 0.8000773787498474, "learning_rate": 2.2840605520926092e-05, "loss": 0.8577, "step": 9150 }, { "epoch": 2.7189076877411695, "grad_norm": 0.9856551289558411, "learning_rate": 2.2810923122588305e-05, "loss": 0.8106, "step": 9160 }, { "epoch": 2.7218759275749482, "grad_norm": 0.7883819341659546, "learning_rate": 2.278124072425052e-05, "loss": 0.8269, "step": 9170 }, { "epoch": 2.7248441674087265, "grad_norm": 0.7424699068069458, "learning_rate": 2.2751558325912734e-05, "loss": 0.8377, "step": 9180 }, { "epoch": 2.727812407242505, "grad_norm": 0.8047650456428528, "learning_rate": 2.2721875927574947e-05, "loss": 0.8431, "step": 9190 }, { "epoch": 2.730780647076284, "grad_norm": 0.841957688331604, "learning_rate": 2.2692193529237163e-05, "loss": 0.9311, "step": 9200 }, { "epoch": 2.733748886910062, "grad_norm": 0.7910162806510925, "learning_rate": 2.2662511130899376e-05, "loss": 0.817, "step": 9210 }, { "epoch": 2.736717126743841, "grad_norm": 0.8855330944061279, "learning_rate": 2.2632828732561592e-05, "loss": 0.8375, "step": 9220 }, { "epoch": 2.7396853665776195, "grad_norm": 0.8575757145881653, "learning_rate": 2.260314633422381e-05, "loss": 0.8518, "step": 9230 }, { "epoch": 2.742653606411398, "grad_norm": 0.7841692566871643, "learning_rate": 2.257346393588602e-05, "loss": 0.8336, "step": 9240 }, { "epoch": 2.7456218462451765, "grad_norm": 0.807435154914856, "learning_rate": 2.2543781537548234e-05, "loss": 0.8165, "step": 9250 }, { "epoch": 2.748590086078955, "grad_norm": 0.8038501739501953, "learning_rate": 2.2514099139210447e-05, "loss": 0.8451, "step": 9260 }, { "epoch": 2.751558325912734, "grad_norm": 0.7744222283363342, "learning_rate": 2.2484416740872663e-05, "loss": 0.899, "step": 9270 }, { "epoch": 2.754526565746512, "grad_norm": 0.9123599529266357, "learning_rate": 2.245473434253488e-05, "loss": 0.8348, "step": 9280 }, { "epoch": 2.757494805580291, "grad_norm": 0.7251855134963989, "learning_rate": 2.2425051944197092e-05, "loss": 0.8503, "step": 9290 }, { "epoch": 2.7604630454140695, "grad_norm": 0.7562434673309326, "learning_rate": 2.2395369545859308e-05, "loss": 0.8369, "step": 9300 }, { "epoch": 2.763431285247848, "grad_norm": 0.853834867477417, "learning_rate": 2.236568714752152e-05, "loss": 0.8664, "step": 9310 }, { "epoch": 2.7663995250816265, "grad_norm": 0.7551343441009521, "learning_rate": 2.2336004749183734e-05, "loss": 0.8375, "step": 9320 }, { "epoch": 2.769367764915405, "grad_norm": 0.8466640710830688, "learning_rate": 2.2309290590679726e-05, "loss": 0.8273, "step": 9330 }, { "epoch": 2.772336004749184, "grad_norm": 0.7854313850402832, "learning_rate": 2.2279608192341942e-05, "loss": 0.8448, "step": 9340 }, { "epoch": 2.775304244582962, "grad_norm": 0.7493451237678528, "learning_rate": 2.2249925794004155e-05, "loss": 0.85, "step": 9350 }, { "epoch": 2.778272484416741, "grad_norm": 0.8265935778617859, "learning_rate": 2.222024339566637e-05, "loss": 0.8185, "step": 9360 }, { "epoch": 2.7812407242505195, "grad_norm": 0.8303396701812744, "learning_rate": 2.2190560997328584e-05, "loss": 0.8358, "step": 9370 }, { "epoch": 2.784208964084298, "grad_norm": 0.8008495569229126, "learning_rate": 2.21608785989908e-05, "loss": 0.8333, "step": 9380 }, { "epoch": 2.7871772039180764, "grad_norm": 0.7543458938598633, "learning_rate": 2.2131196200653016e-05, "loss": 0.84, "step": 9390 }, { "epoch": 2.790145443751855, "grad_norm": 0.7685660719871521, "learning_rate": 2.2101513802315226e-05, "loss": 0.8759, "step": 9400 }, { "epoch": 2.793113683585634, "grad_norm": 0.772219717502594, "learning_rate": 2.2071831403977442e-05, "loss": 0.8415, "step": 9410 }, { "epoch": 2.796081923419412, "grad_norm": 0.8281093239784241, "learning_rate": 2.2042149005639655e-05, "loss": 0.8479, "step": 9420 }, { "epoch": 2.799050163253191, "grad_norm": 0.8043393492698669, "learning_rate": 2.201246660730187e-05, "loss": 0.8703, "step": 9430 }, { "epoch": 2.8020184030869695, "grad_norm": 0.7823016047477722, "learning_rate": 2.1982784208964087e-05, "loss": 0.8415, "step": 9440 }, { "epoch": 2.804986642920748, "grad_norm": 0.9165988564491272, "learning_rate": 2.19531018106263e-05, "loss": 0.9038, "step": 9450 }, { "epoch": 2.8079548827545264, "grad_norm": 0.8192057609558105, "learning_rate": 2.1923419412288516e-05, "loss": 0.859, "step": 9460 }, { "epoch": 2.810923122588305, "grad_norm": 0.767427384853363, "learning_rate": 2.1893737013950725e-05, "loss": 0.8932, "step": 9470 }, { "epoch": 2.813891362422084, "grad_norm": 0.7586916089057922, "learning_rate": 2.186405461561294e-05, "loss": 0.8523, "step": 9480 }, { "epoch": 2.816859602255862, "grad_norm": 0.7692698836326599, "learning_rate": 2.1834372217275158e-05, "loss": 0.8701, "step": 9490 }, { "epoch": 2.8198278420896408, "grad_norm": 0.8118112087249756, "learning_rate": 2.180468981893737e-05, "loss": 0.8878, "step": 9500 }, { "epoch": 2.8227960819234195, "grad_norm": 0.8335282206535339, "learning_rate": 2.1775007420599587e-05, "loss": 0.8679, "step": 9510 }, { "epoch": 2.825764321757198, "grad_norm": 0.8098772168159485, "learning_rate": 2.17453250222618e-05, "loss": 0.8468, "step": 9520 }, { "epoch": 2.8287325615909764, "grad_norm": 0.7913337349891663, "learning_rate": 2.1715642623924016e-05, "loss": 0.8563, "step": 9530 }, { "epoch": 2.831700801424755, "grad_norm": 0.7871443033218384, "learning_rate": 2.168596022558623e-05, "loss": 0.8474, "step": 9540 }, { "epoch": 2.834669041258534, "grad_norm": 0.7683349251747131, "learning_rate": 2.165627782724844e-05, "loss": 0.8429, "step": 9550 }, { "epoch": 2.837637281092312, "grad_norm": 0.761978030204773, "learning_rate": 2.1626595428910658e-05, "loss": 0.8989, "step": 9560 }, { "epoch": 2.8406055209260908, "grad_norm": 0.7867503762245178, "learning_rate": 2.159691303057287e-05, "loss": 0.8187, "step": 9570 }, { "epoch": 2.8435737607598695, "grad_norm": 0.8149718046188354, "learning_rate": 2.1567230632235086e-05, "loss": 0.837, "step": 9580 }, { "epoch": 2.846542000593648, "grad_norm": 0.7846998572349548, "learning_rate": 2.15375482338973e-05, "loss": 0.8346, "step": 9590 }, { "epoch": 2.8495102404274264, "grad_norm": 0.7807599902153015, "learning_rate": 2.1507865835559515e-05, "loss": 0.8718, "step": 9600 }, { "epoch": 2.852478480261205, "grad_norm": 0.7701714634895325, "learning_rate": 2.1478183437221728e-05, "loss": 0.8343, "step": 9610 }, { "epoch": 2.855446720094984, "grad_norm": 0.9326925873756409, "learning_rate": 2.144850103888394e-05, "loss": 0.8588, "step": 9620 }, { "epoch": 2.858414959928762, "grad_norm": 0.8950006365776062, "learning_rate": 2.1418818640546157e-05, "loss": 0.8679, "step": 9630 }, { "epoch": 2.8613831997625407, "grad_norm": 0.8486356735229492, "learning_rate": 2.138913624220837e-05, "loss": 0.864, "step": 9640 }, { "epoch": 2.8643514395963194, "grad_norm": 0.8299232125282288, "learning_rate": 2.1359453843870586e-05, "loss": 0.8436, "step": 9650 }, { "epoch": 2.867319679430098, "grad_norm": 0.7777180671691895, "learning_rate": 2.1329771445532802e-05, "loss": 0.8644, "step": 9660 }, { "epoch": 2.8702879192638764, "grad_norm": 0.8722212910652161, "learning_rate": 2.1300089047195015e-05, "loss": 0.8849, "step": 9670 }, { "epoch": 2.873256159097655, "grad_norm": 0.7691166996955872, "learning_rate": 2.1270406648857228e-05, "loss": 0.9117, "step": 9680 }, { "epoch": 2.876224398931434, "grad_norm": 0.8059368133544922, "learning_rate": 2.124072425051944e-05, "loss": 0.8446, "step": 9690 }, { "epoch": 2.879192638765212, "grad_norm": 0.8385917544364929, "learning_rate": 2.1211041852181657e-05, "loss": 0.8455, "step": 9700 }, { "epoch": 2.8821608785989907, "grad_norm": 0.9432308077812195, "learning_rate": 2.1181359453843873e-05, "loss": 0.8609, "step": 9710 }, { "epoch": 2.8851291184327694, "grad_norm": 0.7637094259262085, "learning_rate": 2.1151677055506086e-05, "loss": 0.8369, "step": 9720 }, { "epoch": 2.888097358266548, "grad_norm": 0.8425596952438354, "learning_rate": 2.1121994657168302e-05, "loss": 0.8171, "step": 9730 }, { "epoch": 2.8910655981003264, "grad_norm": 0.7895944714546204, "learning_rate": 2.1092312258830515e-05, "loss": 0.8657, "step": 9740 }, { "epoch": 2.894033837934105, "grad_norm": 0.9404763579368591, "learning_rate": 2.1062629860492728e-05, "loss": 0.8219, "step": 9750 }, { "epoch": 2.8970020777678838, "grad_norm": 0.7366514801979065, "learning_rate": 2.1032947462154944e-05, "loss": 0.8677, "step": 9760 }, { "epoch": 2.899970317601662, "grad_norm": 0.8323311805725098, "learning_rate": 2.1003265063817157e-05, "loss": 0.841, "step": 9770 }, { "epoch": 2.9029385574354407, "grad_norm": 0.8057579398155212, "learning_rate": 2.0973582665479373e-05, "loss": 0.8571, "step": 9780 }, { "epoch": 2.9059067972692194, "grad_norm": 0.7631582617759705, "learning_rate": 2.0943900267141586e-05, "loss": 0.7837, "step": 9790 }, { "epoch": 2.908875037102998, "grad_norm": 0.95359206199646, "learning_rate": 2.09142178688038e-05, "loss": 0.8971, "step": 9800 }, { "epoch": 2.9118432769367764, "grad_norm": 0.746528685092926, "learning_rate": 2.0884535470466014e-05, "loss": 0.8293, "step": 9810 }, { "epoch": 2.914811516770555, "grad_norm": 0.787821888923645, "learning_rate": 2.0854853072128227e-05, "loss": 0.8613, "step": 9820 }, { "epoch": 2.9177797566043338, "grad_norm": 0.7103245854377747, "learning_rate": 2.0825170673790443e-05, "loss": 0.8385, "step": 9830 }, { "epoch": 2.920747996438112, "grad_norm": 0.8258326053619385, "learning_rate": 2.0795488275452656e-05, "loss": 0.8705, "step": 9840 }, { "epoch": 2.9237162362718907, "grad_norm": 0.8143662214279175, "learning_rate": 2.0765805877114872e-05, "loss": 0.8274, "step": 9850 }, { "epoch": 2.9266844761056694, "grad_norm": 0.8519349694252014, "learning_rate": 2.0736123478777085e-05, "loss": 0.8661, "step": 9860 }, { "epoch": 2.929652715939448, "grad_norm": 0.8383392095565796, "learning_rate": 2.07064410804393e-05, "loss": 0.8704, "step": 9870 }, { "epoch": 2.9326209557732263, "grad_norm": 0.8166011571884155, "learning_rate": 2.0676758682101514e-05, "loss": 0.8754, "step": 9880 }, { "epoch": 2.935589195607005, "grad_norm": 0.7948212623596191, "learning_rate": 2.0647076283763727e-05, "loss": 0.8269, "step": 9890 }, { "epoch": 2.9385574354407837, "grad_norm": 0.8796318769454956, "learning_rate": 2.0617393885425943e-05, "loss": 0.8424, "step": 9900 }, { "epoch": 2.941525675274562, "grad_norm": 0.8400830030441284, "learning_rate": 2.0587711487088156e-05, "loss": 0.8153, "step": 9910 }, { "epoch": 2.9444939151083407, "grad_norm": 0.8363904356956482, "learning_rate": 2.0558029088750372e-05, "loss": 0.8343, "step": 9920 }, { "epoch": 2.9474621549421194, "grad_norm": 0.7671076655387878, "learning_rate": 2.0528346690412585e-05, "loss": 0.8258, "step": 9930 }, { "epoch": 2.950430394775898, "grad_norm": 0.7828432321548462, "learning_rate": 2.04986642920748e-05, "loss": 0.8739, "step": 9940 }, { "epoch": 2.9533986346096763, "grad_norm": 0.8337661623954773, "learning_rate": 2.0468981893737017e-05, "loss": 0.8387, "step": 9950 }, { "epoch": 2.956366874443455, "grad_norm": 0.9032126665115356, "learning_rate": 2.043929949539923e-05, "loss": 0.8669, "step": 9960 }, { "epoch": 2.9593351142772337, "grad_norm": 0.813811719417572, "learning_rate": 2.0409617097061443e-05, "loss": 0.8469, "step": 9970 }, { "epoch": 2.962303354111012, "grad_norm": 0.816363513469696, "learning_rate": 2.0379934698723656e-05, "loss": 0.8226, "step": 9980 }, { "epoch": 2.9652715939447907, "grad_norm": 0.7742164134979248, "learning_rate": 2.0350252300385872e-05, "loss": 0.8795, "step": 9990 }, { "epoch": 2.9682398337785694, "grad_norm": 0.7590778470039368, "learning_rate": 2.0320569902048088e-05, "loss": 0.8313, "step": 10000 }, { "epoch": 2.971208073612348, "grad_norm": 0.8371145129203796, "learning_rate": 2.02908875037103e-05, "loss": 0.8143, "step": 10010 }, { "epoch": 2.9741763134461263, "grad_norm": 0.7885297536849976, "learning_rate": 2.0261205105372517e-05, "loss": 0.8539, "step": 10020 }, { "epoch": 2.977144553279905, "grad_norm": 0.8193269968032837, "learning_rate": 2.023152270703473e-05, "loss": 0.8781, "step": 10030 }, { "epoch": 2.9801127931136837, "grad_norm": 0.8111793398857117, "learning_rate": 2.0201840308696942e-05, "loss": 0.8442, "step": 10040 }, { "epoch": 2.983081032947462, "grad_norm": 0.820756196975708, "learning_rate": 2.017215791035916e-05, "loss": 0.8498, "step": 10050 }, { "epoch": 2.9860492727812407, "grad_norm": 0.8105862736701965, "learning_rate": 2.014247551202137e-05, "loss": 0.8619, "step": 10060 }, { "epoch": 2.9890175126150194, "grad_norm": 0.8605939149856567, "learning_rate": 2.0112793113683588e-05, "loss": 0.8739, "step": 10070 }, { "epoch": 2.991985752448798, "grad_norm": 0.7815289497375488, "learning_rate": 2.00831107153458e-05, "loss": 0.811, "step": 10080 }, { "epoch": 2.9949539922825763, "grad_norm": 0.8077415227890015, "learning_rate": 2.0053428317008017e-05, "loss": 0.8672, "step": 10090 }, { "epoch": 2.997922232116355, "grad_norm": 0.8030134439468384, "learning_rate": 2.002374591867023e-05, "loss": 0.8597, "step": 10100 }, { "epoch": 3.0008904719501337, "grad_norm": 0.748300313949585, "learning_rate": 1.9994063520332442e-05, "loss": 0.8273, "step": 10110 }, { "epoch": 3.003858711783912, "grad_norm": 0.8023651838302612, "learning_rate": 1.996438112199466e-05, "loss": 0.7977, "step": 10120 }, { "epoch": 3.0068269516176906, "grad_norm": 0.7826663851737976, "learning_rate": 1.993469872365687e-05, "loss": 0.7907, "step": 10130 }, { "epoch": 3.0097951914514693, "grad_norm": 0.8399567008018494, "learning_rate": 1.9905016325319087e-05, "loss": 0.8036, "step": 10140 }, { "epoch": 3.012763431285248, "grad_norm": 0.9962929487228394, "learning_rate": 1.98753339269813e-05, "loss": 0.7947, "step": 10150 }, { "epoch": 3.0157316711190263, "grad_norm": 0.7716733813285828, "learning_rate": 1.9845651528643516e-05, "loss": 0.8097, "step": 10160 }, { "epoch": 3.018699910952805, "grad_norm": 0.854088306427002, "learning_rate": 1.9815969130305732e-05, "loss": 0.8075, "step": 10170 }, { "epoch": 3.0216681507865837, "grad_norm": 0.8283464312553406, "learning_rate": 1.9786286731967942e-05, "loss": 0.7759, "step": 10180 }, { "epoch": 3.024636390620362, "grad_norm": 0.8063070774078369, "learning_rate": 1.9756604333630158e-05, "loss": 0.807, "step": 10190 }, { "epoch": 3.0276046304541406, "grad_norm": 0.8599423170089722, "learning_rate": 1.972692193529237e-05, "loss": 0.7849, "step": 10200 }, { "epoch": 3.0305728702879193, "grad_norm": 0.8788694739341736, "learning_rate": 1.9697239536954587e-05, "loss": 0.8065, "step": 10210 }, { "epoch": 3.033541110121698, "grad_norm": 0.7816437482833862, "learning_rate": 1.9667557138616803e-05, "loss": 0.8264, "step": 10220 }, { "epoch": 3.0365093499554763, "grad_norm": 0.9156720638275146, "learning_rate": 1.9637874740279016e-05, "loss": 0.8212, "step": 10230 }, { "epoch": 3.039477589789255, "grad_norm": 0.8273279070854187, "learning_rate": 1.9608192341941232e-05, "loss": 0.798, "step": 10240 }, { "epoch": 3.0424458296230337, "grad_norm": 0.8687625527381897, "learning_rate": 1.957850994360344e-05, "loss": 0.7973, "step": 10250 }, { "epoch": 3.045414069456812, "grad_norm": 0.8279813528060913, "learning_rate": 1.9548827545265658e-05, "loss": 0.7799, "step": 10260 }, { "epoch": 3.0483823092905906, "grad_norm": 0.8048006892204285, "learning_rate": 1.9519145146927874e-05, "loss": 0.8009, "step": 10270 }, { "epoch": 3.0513505491243693, "grad_norm": 0.8323532938957214, "learning_rate": 1.9489462748590087e-05, "loss": 0.8188, "step": 10280 }, { "epoch": 3.054318788958148, "grad_norm": 0.8161619901657104, "learning_rate": 1.9459780350252303e-05, "loss": 0.7741, "step": 10290 }, { "epoch": 3.0572870287919263, "grad_norm": 0.8610167503356934, "learning_rate": 1.9430097951914516e-05, "loss": 0.8311, "step": 10300 }, { "epoch": 3.060255268625705, "grad_norm": 0.7956073880195618, "learning_rate": 1.9400415553576732e-05, "loss": 0.7758, "step": 10310 }, { "epoch": 3.0632235084594837, "grad_norm": 0.8731258511543274, "learning_rate": 1.9370733155238945e-05, "loss": 0.8096, "step": 10320 }, { "epoch": 3.066191748293262, "grad_norm": 0.8595923781394958, "learning_rate": 1.9341050756901157e-05, "loss": 0.8198, "step": 10330 }, { "epoch": 3.0691599881270406, "grad_norm": 0.8547385931015015, "learning_rate": 1.9311368358563374e-05, "loss": 0.7835, "step": 10340 }, { "epoch": 3.0721282279608193, "grad_norm": 0.8384952545166016, "learning_rate": 1.9281685960225586e-05, "loss": 0.788, "step": 10350 }, { "epoch": 3.075096467794598, "grad_norm": 0.8666067123413086, "learning_rate": 1.9252003561887803e-05, "loss": 0.8134, "step": 10360 }, { "epoch": 3.0780647076283763, "grad_norm": 0.8557712435722351, "learning_rate": 1.9222321163550015e-05, "loss": 0.8016, "step": 10370 }, { "epoch": 3.081032947462155, "grad_norm": 0.889104425907135, "learning_rate": 1.919263876521223e-05, "loss": 0.8162, "step": 10380 }, { "epoch": 3.0840011872959336, "grad_norm": 0.9883944988250732, "learning_rate": 1.9162956366874444e-05, "loss": 0.7604, "step": 10390 }, { "epoch": 3.086969427129712, "grad_norm": 1.0874537229537964, "learning_rate": 1.9133273968536657e-05, "loss": 0.8122, "step": 10400 }, { "epoch": 3.0899376669634906, "grad_norm": 0.8347809910774231, "learning_rate": 1.9103591570198873e-05, "loss": 0.775, "step": 10410 }, { "epoch": 3.0929059067972693, "grad_norm": 0.8585425615310669, "learning_rate": 1.9073909171861086e-05, "loss": 0.8108, "step": 10420 }, { "epoch": 3.095874146631048, "grad_norm": 0.9726991057395935, "learning_rate": 1.9044226773523302e-05, "loss": 0.7969, "step": 10430 }, { "epoch": 3.0988423864648262, "grad_norm": 0.816733181476593, "learning_rate": 1.901454437518552e-05, "loss": 0.7843, "step": 10440 }, { "epoch": 3.101810626298605, "grad_norm": 0.9138243198394775, "learning_rate": 1.898486197684773e-05, "loss": 0.7987, "step": 10450 }, { "epoch": 3.1047788661323836, "grad_norm": 0.8893081545829773, "learning_rate": 1.8955179578509944e-05, "loss": 0.8214, "step": 10460 }, { "epoch": 3.107747105966162, "grad_norm": 0.87662672996521, "learning_rate": 1.8925497180172157e-05, "loss": 0.8086, "step": 10470 }, { "epoch": 3.1107153457999406, "grad_norm": 0.8176641464233398, "learning_rate": 1.8895814781834373e-05, "loss": 0.7727, "step": 10480 }, { "epoch": 3.1136835856337193, "grad_norm": 0.8557479977607727, "learning_rate": 1.886613238349659e-05, "loss": 0.7794, "step": 10490 }, { "epoch": 3.116651825467498, "grad_norm": 0.8954534530639648, "learning_rate": 1.8836449985158802e-05, "loss": 0.8434, "step": 10500 }, { "epoch": 3.1196200653012762, "grad_norm": 0.8101194500923157, "learning_rate": 1.8806767586821018e-05, "loss": 0.7627, "step": 10510 }, { "epoch": 3.122588305135055, "grad_norm": 0.8535807728767395, "learning_rate": 1.877708518848323e-05, "loss": 0.7877, "step": 10520 }, { "epoch": 3.1255565449688336, "grad_norm": 0.8712933659553528, "learning_rate": 1.8747402790145444e-05, "loss": 0.8046, "step": 10530 }, { "epoch": 3.128524784802612, "grad_norm": 0.825019121170044, "learning_rate": 1.871772039180766e-05, "loss": 0.7714, "step": 10540 }, { "epoch": 3.1314930246363906, "grad_norm": 0.9397444725036621, "learning_rate": 1.8688037993469873e-05, "loss": 0.8038, "step": 10550 }, { "epoch": 3.1344612644701693, "grad_norm": 0.8636755347251892, "learning_rate": 1.865835559513209e-05, "loss": 0.7902, "step": 10560 }, { "epoch": 3.137429504303948, "grad_norm": 0.8545663356781006, "learning_rate": 1.86286731967943e-05, "loss": 0.7908, "step": 10570 }, { "epoch": 3.140397744137726, "grad_norm": 0.9643442034721375, "learning_rate": 1.8598990798456518e-05, "loss": 0.8231, "step": 10580 }, { "epoch": 3.143365983971505, "grad_norm": 0.9168353080749512, "learning_rate": 1.856930840011873e-05, "loss": 0.7945, "step": 10590 }, { "epoch": 3.1463342238052836, "grad_norm": 0.8996456861495972, "learning_rate": 1.8539626001780943e-05, "loss": 0.8124, "step": 10600 }, { "epoch": 3.149302463639062, "grad_norm": 0.86201411485672, "learning_rate": 1.850994360344316e-05, "loss": 0.7869, "step": 10610 }, { "epoch": 3.1522707034728406, "grad_norm": 0.858235239982605, "learning_rate": 1.8480261205105372e-05, "loss": 0.7738, "step": 10620 }, { "epoch": 3.1552389433066192, "grad_norm": 0.8770175576210022, "learning_rate": 1.845057880676759e-05, "loss": 0.7841, "step": 10630 }, { "epoch": 3.158207183140398, "grad_norm": 0.80247563123703, "learning_rate": 1.84208964084298e-05, "loss": 0.8175, "step": 10640 }, { "epoch": 3.161175422974176, "grad_norm": 0.8442670702934265, "learning_rate": 1.8391214010092017e-05, "loss": 0.7908, "step": 10650 }, { "epoch": 3.164143662807955, "grad_norm": 0.9458386301994324, "learning_rate": 1.836153161175423e-05, "loss": 0.8087, "step": 10660 }, { "epoch": 3.1671119026417336, "grad_norm": 0.87587571144104, "learning_rate": 1.8331849213416443e-05, "loss": 0.8114, "step": 10670 }, { "epoch": 3.170080142475512, "grad_norm": 0.8657705187797546, "learning_rate": 1.830216681507866e-05, "loss": 0.7774, "step": 10680 }, { "epoch": 3.1730483823092905, "grad_norm": 0.8687613606452942, "learning_rate": 1.8272484416740872e-05, "loss": 0.7866, "step": 10690 }, { "epoch": 3.1760166221430692, "grad_norm": 0.8471196889877319, "learning_rate": 1.8242802018403088e-05, "loss": 0.7761, "step": 10700 }, { "epoch": 3.178984861976848, "grad_norm": 0.9914931654930115, "learning_rate": 1.82131196200653e-05, "loss": 0.7994, "step": 10710 }, { "epoch": 3.181953101810626, "grad_norm": 0.8881893157958984, "learning_rate": 1.8183437221727517e-05, "loss": 0.7688, "step": 10720 }, { "epoch": 3.184921341644405, "grad_norm": 0.934282124042511, "learning_rate": 1.8153754823389733e-05, "loss": 0.8089, "step": 10730 }, { "epoch": 3.1878895814781836, "grad_norm": 0.9608398079872131, "learning_rate": 1.8124072425051943e-05, "loss": 0.7777, "step": 10740 }, { "epoch": 3.190857821311962, "grad_norm": 0.8693539500236511, "learning_rate": 1.809439002671416e-05, "loss": 0.7865, "step": 10750 }, { "epoch": 3.1938260611457405, "grad_norm": 0.8089728355407715, "learning_rate": 1.806470762837637e-05, "loss": 0.8126, "step": 10760 }, { "epoch": 3.196794300979519, "grad_norm": 0.839136004447937, "learning_rate": 1.8035025230038588e-05, "loss": 0.8116, "step": 10770 }, { "epoch": 3.199762540813298, "grad_norm": 0.9516549706459045, "learning_rate": 1.8005342831700804e-05, "loss": 0.7901, "step": 10780 }, { "epoch": 3.202730780647076, "grad_norm": 0.9636576175689697, "learning_rate": 1.7975660433363017e-05, "loss": 0.7876, "step": 10790 }, { "epoch": 3.205699020480855, "grad_norm": 0.7846948504447937, "learning_rate": 1.7945978035025233e-05, "loss": 0.7793, "step": 10800 }, { "epoch": 3.2086672603146336, "grad_norm": 0.9290656447410583, "learning_rate": 1.7916295636687442e-05, "loss": 0.7998, "step": 10810 }, { "epoch": 3.211635500148412, "grad_norm": 0.8134689331054688, "learning_rate": 1.788661323834966e-05, "loss": 0.7932, "step": 10820 }, { "epoch": 3.2146037399821905, "grad_norm": 0.8373258709907532, "learning_rate": 1.7856930840011875e-05, "loss": 0.7989, "step": 10830 }, { "epoch": 3.217571979815969, "grad_norm": 0.8180834054946899, "learning_rate": 1.7827248441674087e-05, "loss": 0.8083, "step": 10840 }, { "epoch": 3.220540219649748, "grad_norm": 0.947187066078186, "learning_rate": 1.7797566043336304e-05, "loss": 0.7798, "step": 10850 }, { "epoch": 3.223508459483526, "grad_norm": 1.0117039680480957, "learning_rate": 1.7767883644998516e-05, "loss": 0.8054, "step": 10860 }, { "epoch": 3.226476699317305, "grad_norm": 0.9373120069503784, "learning_rate": 1.7738201246660733e-05, "loss": 0.8004, "step": 10870 }, { "epoch": 3.2294449391510835, "grad_norm": 0.8610514998435974, "learning_rate": 1.7708518848322945e-05, "loss": 0.7677, "step": 10880 }, { "epoch": 3.232413178984862, "grad_norm": 0.9003437161445618, "learning_rate": 1.7678836449985158e-05, "loss": 0.8095, "step": 10890 }, { "epoch": 3.2353814188186405, "grad_norm": 0.9555358290672302, "learning_rate": 1.7649154051647374e-05, "loss": 0.8184, "step": 10900 }, { "epoch": 3.238349658652419, "grad_norm": 0.9005022048950195, "learning_rate": 1.7619471653309587e-05, "loss": 0.7944, "step": 10910 }, { "epoch": 3.241317898486198, "grad_norm": 0.8886909484863281, "learning_rate": 1.7589789254971803e-05, "loss": 0.7745, "step": 10920 }, { "epoch": 3.244286138319976, "grad_norm": 0.8552870154380798, "learning_rate": 1.7560106856634016e-05, "loss": 0.7807, "step": 10930 }, { "epoch": 3.247254378153755, "grad_norm": 1.0251131057739258, "learning_rate": 1.7530424458296232e-05, "loss": 0.79, "step": 10940 }, { "epoch": 3.2502226179875335, "grad_norm": 0.8612068891525269, "learning_rate": 1.7500742059958445e-05, "loss": 0.8086, "step": 10950 }, { "epoch": 3.253190857821312, "grad_norm": 0.8735592365264893, "learning_rate": 1.7471059661620658e-05, "loss": 0.8288, "step": 10960 }, { "epoch": 3.2561590976550905, "grad_norm": 1.012428641319275, "learning_rate": 1.7441377263282874e-05, "loss": 0.8143, "step": 10970 }, { "epoch": 3.259127337488869, "grad_norm": 0.8430175185203552, "learning_rate": 1.7411694864945087e-05, "loss": 0.7853, "step": 10980 }, { "epoch": 3.262095577322648, "grad_norm": 0.9051916599273682, "learning_rate": 1.7382012466607303e-05, "loss": 0.7617, "step": 10990 }, { "epoch": 3.265063817156426, "grad_norm": 0.8995745778083801, "learning_rate": 1.735233006826952e-05, "loss": 0.8128, "step": 11000 }, { "epoch": 3.268032056990205, "grad_norm": 0.893775224685669, "learning_rate": 1.7322647669931732e-05, "loss": 0.8046, "step": 11010 }, { "epoch": 3.2710002968239835, "grad_norm": 0.8447341322898865, "learning_rate": 1.7292965271593945e-05, "loss": 0.7941, "step": 11020 }, { "epoch": 3.2739685366577618, "grad_norm": 0.8254780769348145, "learning_rate": 1.7263282873256158e-05, "loss": 0.7854, "step": 11030 }, { "epoch": 3.2769367764915405, "grad_norm": 0.8724144697189331, "learning_rate": 1.7233600474918374e-05, "loss": 0.8251, "step": 11040 }, { "epoch": 3.279905016325319, "grad_norm": 0.848600447177887, "learning_rate": 1.720391807658059e-05, "loss": 0.7821, "step": 11050 }, { "epoch": 3.282873256159098, "grad_norm": 0.9910843372344971, "learning_rate": 1.7174235678242803e-05, "loss": 0.7618, "step": 11060 }, { "epoch": 3.285841495992876, "grad_norm": 0.8709274530410767, "learning_rate": 1.714455327990502e-05, "loss": 0.7925, "step": 11070 }, { "epoch": 3.288809735826655, "grad_norm": 0.8491528630256653, "learning_rate": 1.711487088156723e-05, "loss": 0.7978, "step": 11080 }, { "epoch": 3.2917779756604335, "grad_norm": 0.9000929594039917, "learning_rate": 1.7085188483229444e-05, "loss": 0.8122, "step": 11090 }, { "epoch": 3.2947462154942118, "grad_norm": 0.8909440636634827, "learning_rate": 1.705550608489166e-05, "loss": 0.7464, "step": 11100 }, { "epoch": 3.2977144553279905, "grad_norm": 0.9142547249794006, "learning_rate": 1.7025823686553873e-05, "loss": 0.8009, "step": 11110 }, { "epoch": 3.300682695161769, "grad_norm": 0.8039944171905518, "learning_rate": 1.699614128821609e-05, "loss": 0.7625, "step": 11120 }, { "epoch": 3.303650934995548, "grad_norm": 0.9256371855735779, "learning_rate": 1.6966458889878302e-05, "loss": 0.8064, "step": 11130 }, { "epoch": 3.306619174829326, "grad_norm": 0.8562237024307251, "learning_rate": 1.693677649154052e-05, "loss": 0.8068, "step": 11140 }, { "epoch": 3.309587414663105, "grad_norm": 0.7977147102355957, "learning_rate": 1.690709409320273e-05, "loss": 0.7862, "step": 11150 }, { "epoch": 3.3125556544968835, "grad_norm": 0.8706709742546082, "learning_rate": 1.6877411694864944e-05, "loss": 0.7739, "step": 11160 }, { "epoch": 3.3155238943306617, "grad_norm": 0.8891844749450684, "learning_rate": 1.684772929652716e-05, "loss": 0.8359, "step": 11170 }, { "epoch": 3.3184921341644404, "grad_norm": 0.9938377141952515, "learning_rate": 1.6818046898189373e-05, "loss": 0.7695, "step": 11180 }, { "epoch": 3.321460373998219, "grad_norm": 0.888454794883728, "learning_rate": 1.678836449985159e-05, "loss": 0.7916, "step": 11190 }, { "epoch": 3.324428613831998, "grad_norm": 0.8678205013275146, "learning_rate": 1.6758682101513802e-05, "loss": 0.7789, "step": 11200 }, { "epoch": 3.327396853665776, "grad_norm": 0.842923641204834, "learning_rate": 1.6728999703176018e-05, "loss": 0.7936, "step": 11210 }, { "epoch": 3.330365093499555, "grad_norm": 0.923405647277832, "learning_rate": 1.6699317304838234e-05, "loss": 0.7737, "step": 11220 }, { "epoch": 3.3333333333333335, "grad_norm": 0.9673542380332947, "learning_rate": 1.6669634906500444e-05, "loss": 0.8157, "step": 11230 }, { "epoch": 3.3363015731671117, "grad_norm": 0.8497329354286194, "learning_rate": 1.663995250816266e-05, "loss": 0.8217, "step": 11240 }, { "epoch": 3.3392698130008904, "grad_norm": 0.8752831220626831, "learning_rate": 1.6610270109824873e-05, "loss": 0.8195, "step": 11250 }, { "epoch": 3.342238052834669, "grad_norm": 0.9098172783851624, "learning_rate": 1.658058771148709e-05, "loss": 0.7981, "step": 11260 }, { "epoch": 3.345206292668448, "grad_norm": 0.9954054951667786, "learning_rate": 1.6550905313149305e-05, "loss": 0.8293, "step": 11270 }, { "epoch": 3.348174532502226, "grad_norm": 0.912259578704834, "learning_rate": 1.6521222914811518e-05, "loss": 0.8394, "step": 11280 }, { "epoch": 3.3511427723360048, "grad_norm": 0.8457381725311279, "learning_rate": 1.6491540516473734e-05, "loss": 0.7682, "step": 11290 }, { "epoch": 3.3541110121697835, "grad_norm": 0.8962229490280151, "learning_rate": 1.6461858118135943e-05, "loss": 0.7953, "step": 11300 }, { "epoch": 3.3570792520035617, "grad_norm": 0.843609094619751, "learning_rate": 1.643217571979816e-05, "loss": 0.7959, "step": 11310 }, { "epoch": 3.3600474918373404, "grad_norm": 0.9469923377037048, "learning_rate": 1.6402493321460376e-05, "loss": 0.7879, "step": 11320 }, { "epoch": 3.363015731671119, "grad_norm": 0.897754430770874, "learning_rate": 1.637281092312259e-05, "loss": 0.8087, "step": 11330 }, { "epoch": 3.365983971504898, "grad_norm": 0.895817220211029, "learning_rate": 1.6343128524784805e-05, "loss": 0.801, "step": 11340 }, { "epoch": 3.368952211338676, "grad_norm": 0.8911547660827637, "learning_rate": 1.6313446126447018e-05, "loss": 0.7607, "step": 11350 }, { "epoch": 3.3719204511724548, "grad_norm": 0.9019851088523865, "learning_rate": 1.6283763728109234e-05, "loss": 0.8001, "step": 11360 }, { "epoch": 3.3748886910062335, "grad_norm": 0.9182178974151611, "learning_rate": 1.6254081329771447e-05, "loss": 0.7948, "step": 11370 }, { "epoch": 3.3778569308400117, "grad_norm": 0.8582599759101868, "learning_rate": 1.622439893143366e-05, "loss": 0.7925, "step": 11380 }, { "epoch": 3.3808251706737904, "grad_norm": 0.916682243347168, "learning_rate": 1.6194716533095876e-05, "loss": 0.7885, "step": 11390 }, { "epoch": 3.383793410507569, "grad_norm": 0.9534935355186462, "learning_rate": 1.6168002374591868e-05, "loss": 0.7961, "step": 11400 }, { "epoch": 3.386761650341348, "grad_norm": 0.9537315964698792, "learning_rate": 1.613831997625408e-05, "loss": 0.8117, "step": 11410 }, { "epoch": 3.389729890175126, "grad_norm": 0.8636172413825989, "learning_rate": 1.6108637577916297e-05, "loss": 0.7971, "step": 11420 }, { "epoch": 3.3926981300089047, "grad_norm": 0.8848598003387451, "learning_rate": 1.607895517957851e-05, "loss": 0.8086, "step": 11430 }, { "epoch": 3.3956663698426834, "grad_norm": 0.8908330202102661, "learning_rate": 1.6049272781240726e-05, "loss": 0.7773, "step": 11440 }, { "epoch": 3.3986346096764617, "grad_norm": 0.8260102272033691, "learning_rate": 1.601959038290294e-05, "loss": 0.8284, "step": 11450 }, { "epoch": 3.4016028495102404, "grad_norm": 0.9572765827178955, "learning_rate": 1.598990798456515e-05, "loss": 0.8103, "step": 11460 }, { "epoch": 3.404571089344019, "grad_norm": 0.8650007843971252, "learning_rate": 1.5960225586227368e-05, "loss": 0.8, "step": 11470 }, { "epoch": 3.407539329177798, "grad_norm": 0.8484728932380676, "learning_rate": 1.5930543187889584e-05, "loss": 0.825, "step": 11480 }, { "epoch": 3.410507569011576, "grad_norm": 1.0106016397476196, "learning_rate": 1.5900860789551797e-05, "loss": 0.8011, "step": 11490 }, { "epoch": 3.4134758088453547, "grad_norm": 0.8704240918159485, "learning_rate": 1.5871178391214013e-05, "loss": 0.8035, "step": 11500 }, { "epoch": 3.4164440486791334, "grad_norm": 0.9683484435081482, "learning_rate": 1.5841495992876226e-05, "loss": 0.8223, "step": 11510 }, { "epoch": 3.4194122885129117, "grad_norm": 0.9670411944389343, "learning_rate": 1.581181359453844e-05, "loss": 0.806, "step": 11520 }, { "epoch": 3.4223805283466904, "grad_norm": 0.8676766157150269, "learning_rate": 1.5782131196200655e-05, "loss": 0.8137, "step": 11530 }, { "epoch": 3.425348768180469, "grad_norm": 0.940537691116333, "learning_rate": 1.5752448797862867e-05, "loss": 0.7878, "step": 11540 }, { "epoch": 3.4283170080142478, "grad_norm": 0.9217203855514526, "learning_rate": 1.5722766399525084e-05, "loss": 0.8388, "step": 11550 }, { "epoch": 3.431285247848026, "grad_norm": 0.8722837567329407, "learning_rate": 1.5693084001187296e-05, "loss": 0.7985, "step": 11560 }, { "epoch": 3.4342534876818047, "grad_norm": 0.8811752200126648, "learning_rate": 1.5663401602849513e-05, "loss": 0.7873, "step": 11570 }, { "epoch": 3.4372217275155834, "grad_norm": 0.9148281812667847, "learning_rate": 1.5633719204511725e-05, "loss": 0.8035, "step": 11580 }, { "epoch": 3.4401899673493617, "grad_norm": 0.8932378888130188, "learning_rate": 1.5604036806173938e-05, "loss": 0.832, "step": 11590 }, { "epoch": 3.4431582071831404, "grad_norm": 0.9634125232696533, "learning_rate": 1.5574354407836154e-05, "loss": 0.7701, "step": 11600 }, { "epoch": 3.446126447016919, "grad_norm": 0.9190008044242859, "learning_rate": 1.5544672009498367e-05, "loss": 0.8277, "step": 11610 }, { "epoch": 3.4490946868506978, "grad_norm": 0.8262824416160583, "learning_rate": 1.5514989611160583e-05, "loss": 0.7593, "step": 11620 }, { "epoch": 3.452062926684476, "grad_norm": 0.9550215601921082, "learning_rate": 1.5485307212822796e-05, "loss": 0.7881, "step": 11630 }, { "epoch": 3.4550311665182547, "grad_norm": 0.8827412724494934, "learning_rate": 1.5455624814485012e-05, "loss": 0.7965, "step": 11640 }, { "epoch": 3.4579994063520334, "grad_norm": 0.8873876929283142, "learning_rate": 1.5425942416147225e-05, "loss": 0.8088, "step": 11650 }, { "epoch": 3.4609676461858117, "grad_norm": 0.9476723670959473, "learning_rate": 1.5396260017809438e-05, "loss": 0.8102, "step": 11660 }, { "epoch": 3.4639358860195903, "grad_norm": 0.9086580276489258, "learning_rate": 1.5366577619471654e-05, "loss": 0.8025, "step": 11670 }, { "epoch": 3.466904125853369, "grad_norm": 0.9368143081665039, "learning_rate": 1.5336895221133867e-05, "loss": 0.8186, "step": 11680 }, { "epoch": 3.4698723656871477, "grad_norm": 0.9438993334770203, "learning_rate": 1.5307212822796083e-05, "loss": 0.7736, "step": 11690 }, { "epoch": 3.472840605520926, "grad_norm": 0.9083358645439148, "learning_rate": 1.5277530424458296e-05, "loss": 0.8278, "step": 11700 }, { "epoch": 3.4758088453547047, "grad_norm": 0.8738546371459961, "learning_rate": 1.5247848026120512e-05, "loss": 0.7832, "step": 11710 }, { "epoch": 3.4787770851884834, "grad_norm": 0.9554048180580139, "learning_rate": 1.5218165627782726e-05, "loss": 0.7921, "step": 11720 }, { "epoch": 3.4817453250222616, "grad_norm": 0.8620092868804932, "learning_rate": 1.518848322944494e-05, "loss": 0.7969, "step": 11730 }, { "epoch": 3.4847135648560403, "grad_norm": 0.8661124110221863, "learning_rate": 1.5158800831107154e-05, "loss": 0.7884, "step": 11740 }, { "epoch": 3.487681804689819, "grad_norm": 0.8738637566566467, "learning_rate": 1.5129118432769368e-05, "loss": 0.7685, "step": 11750 }, { "epoch": 3.4906500445235977, "grad_norm": 0.8339844942092896, "learning_rate": 1.5099436034431583e-05, "loss": 0.7816, "step": 11760 }, { "epoch": 3.493618284357376, "grad_norm": 0.9325805902481079, "learning_rate": 1.5069753636093797e-05, "loss": 0.7474, "step": 11770 }, { "epoch": 3.4965865241911547, "grad_norm": 0.798915445804596, "learning_rate": 1.5040071237756012e-05, "loss": 0.8013, "step": 11780 }, { "epoch": 3.4995547640249334, "grad_norm": 0.9759164452552795, "learning_rate": 1.5010388839418226e-05, "loss": 0.7933, "step": 11790 }, { "epoch": 3.5025230038587116, "grad_norm": 0.9221082329750061, "learning_rate": 1.4980706441080439e-05, "loss": 0.7591, "step": 11800 }, { "epoch": 3.5054912436924903, "grad_norm": 0.9643858075141907, "learning_rate": 1.4951024042742653e-05, "loss": 0.8277, "step": 11810 }, { "epoch": 3.508459483526269, "grad_norm": 0.9068611860275269, "learning_rate": 1.4921341644404868e-05, "loss": 0.8064, "step": 11820 }, { "epoch": 3.5114277233600477, "grad_norm": 0.9101418852806091, "learning_rate": 1.4891659246067082e-05, "loss": 0.8088, "step": 11830 }, { "epoch": 3.514395963193826, "grad_norm": 0.832104504108429, "learning_rate": 1.4861976847729297e-05, "loss": 0.8312, "step": 11840 }, { "epoch": 3.5173642030276047, "grad_norm": 0.9002504944801331, "learning_rate": 1.4832294449391513e-05, "loss": 0.7986, "step": 11850 }, { "epoch": 3.5203324428613834, "grad_norm": 1.0496249198913574, "learning_rate": 1.4802612051053727e-05, "loss": 0.8074, "step": 11860 }, { "epoch": 3.5233006826951616, "grad_norm": 0.9258970022201538, "learning_rate": 1.4772929652715939e-05, "loss": 0.8434, "step": 11870 }, { "epoch": 3.5262689225289403, "grad_norm": 0.8295080661773682, "learning_rate": 1.4743247254378153e-05, "loss": 0.7798, "step": 11880 }, { "epoch": 3.529237162362719, "grad_norm": 0.990430474281311, "learning_rate": 1.4713564856040368e-05, "loss": 0.8251, "step": 11890 }, { "epoch": 3.5322054021964977, "grad_norm": 0.9134912490844727, "learning_rate": 1.4683882457702584e-05, "loss": 0.7745, "step": 11900 }, { "epoch": 3.535173642030276, "grad_norm": 0.9166617393493652, "learning_rate": 1.4654200059364798e-05, "loss": 0.8286, "step": 11910 }, { "epoch": 3.5381418818640546, "grad_norm": 0.9793247580528259, "learning_rate": 1.4624517661027013e-05, "loss": 0.7713, "step": 11920 }, { "epoch": 3.5411101216978333, "grad_norm": 0.982891857624054, "learning_rate": 1.4594835262689227e-05, "loss": 0.7955, "step": 11930 }, { "epoch": 3.5440783615316116, "grad_norm": 0.9041152596473694, "learning_rate": 1.4565152864351438e-05, "loss": 0.8356, "step": 11940 }, { "epoch": 3.5470466013653903, "grad_norm": 0.9441630840301514, "learning_rate": 1.4535470466013654e-05, "loss": 0.7628, "step": 11950 }, { "epoch": 3.550014841199169, "grad_norm": 0.8983331322669983, "learning_rate": 1.4505788067675869e-05, "loss": 0.7795, "step": 11960 }, { "epoch": 3.5529830810329477, "grad_norm": 0.9247541427612305, "learning_rate": 1.4476105669338083e-05, "loss": 0.7868, "step": 11970 }, { "epoch": 3.555951320866726, "grad_norm": 1.140688180923462, "learning_rate": 1.4446423271000298e-05, "loss": 0.7919, "step": 11980 }, { "epoch": 3.5589195607005046, "grad_norm": 0.8730014562606812, "learning_rate": 1.4416740872662512e-05, "loss": 0.8075, "step": 11990 }, { "epoch": 3.5618878005342833, "grad_norm": 0.9086686968803406, "learning_rate": 1.4387058474324727e-05, "loss": 0.7802, "step": 12000 }, { "epoch": 3.5648560403680616, "grad_norm": 0.8341102004051208, "learning_rate": 1.435737607598694e-05, "loss": 0.7868, "step": 12010 }, { "epoch": 3.5678242802018403, "grad_norm": 1.0596048831939697, "learning_rate": 1.4327693677649154e-05, "loss": 0.7901, "step": 12020 }, { "epoch": 3.570792520035619, "grad_norm": 0.8686931729316711, "learning_rate": 1.4298011279311369e-05, "loss": 0.8188, "step": 12030 }, { "epoch": 3.5737607598693977, "grad_norm": 0.8936414122581482, "learning_rate": 1.4268328880973583e-05, "loss": 0.7681, "step": 12040 }, { "epoch": 3.576728999703176, "grad_norm": 0.8683655261993408, "learning_rate": 1.4238646482635798e-05, "loss": 0.8043, "step": 12050 }, { "epoch": 3.5796972395369546, "grad_norm": 0.9319801330566406, "learning_rate": 1.4208964084298012e-05, "loss": 0.7831, "step": 12060 }, { "epoch": 3.5826654793707333, "grad_norm": 0.8970152735710144, "learning_rate": 1.4179281685960226e-05, "loss": 0.7626, "step": 12070 }, { "epoch": 3.5856337192045116, "grad_norm": 0.9043969511985779, "learning_rate": 1.414959928762244e-05, "loss": 0.7909, "step": 12080 }, { "epoch": 3.5886019590382903, "grad_norm": 0.9323931336402893, "learning_rate": 1.4119916889284654e-05, "loss": 0.7831, "step": 12090 }, { "epoch": 3.591570198872069, "grad_norm": 0.9297446608543396, "learning_rate": 1.4090234490946868e-05, "loss": 0.8151, "step": 12100 }, { "epoch": 3.5945384387058477, "grad_norm": 0.9543979167938232, "learning_rate": 1.4060552092609083e-05, "loss": 0.7986, "step": 12110 }, { "epoch": 3.597506678539626, "grad_norm": 0.951012134552002, "learning_rate": 1.4030869694271297e-05, "loss": 0.8287, "step": 12120 }, { "epoch": 3.6004749183734046, "grad_norm": 0.9943885207176208, "learning_rate": 1.4001187295933513e-05, "loss": 0.7854, "step": 12130 }, { "epoch": 3.6034431582071833, "grad_norm": 0.8945292234420776, "learning_rate": 1.3971504897595728e-05, "loss": 0.7545, "step": 12140 }, { "epoch": 3.6064113980409616, "grad_norm": 0.8595576286315918, "learning_rate": 1.3941822499257939e-05, "loss": 0.7712, "step": 12150 }, { "epoch": 3.6093796378747403, "grad_norm": 0.9131389260292053, "learning_rate": 1.3912140100920153e-05, "loss": 0.7732, "step": 12160 }, { "epoch": 3.612347877708519, "grad_norm": 0.8835858106613159, "learning_rate": 1.388245770258237e-05, "loss": 0.805, "step": 12170 }, { "epoch": 3.6153161175422976, "grad_norm": 0.8401873707771301, "learning_rate": 1.3852775304244584e-05, "loss": 0.7814, "step": 12180 }, { "epoch": 3.618284357376076, "grad_norm": 0.8316750526428223, "learning_rate": 1.3823092905906799e-05, "loss": 0.8215, "step": 12190 }, { "epoch": 3.6212525972098546, "grad_norm": 0.9151681661605835, "learning_rate": 1.3793410507569013e-05, "loss": 0.8424, "step": 12200 }, { "epoch": 3.6242208370436333, "grad_norm": 0.896335780620575, "learning_rate": 1.3763728109231228e-05, "loss": 0.788, "step": 12210 }, { "epoch": 3.6271890768774115, "grad_norm": 1.0074635744094849, "learning_rate": 1.373404571089344e-05, "loss": 0.8132, "step": 12220 }, { "epoch": 3.6301573167111902, "grad_norm": 0.9395803809165955, "learning_rate": 1.3704363312555655e-05, "loss": 0.823, "step": 12230 }, { "epoch": 3.633125556544969, "grad_norm": 0.8941648006439209, "learning_rate": 1.367468091421787e-05, "loss": 0.7965, "step": 12240 }, { "epoch": 3.6360937963787476, "grad_norm": 0.9906322956085205, "learning_rate": 1.3644998515880084e-05, "loss": 0.7901, "step": 12250 }, { "epoch": 3.639062036212526, "grad_norm": 0.9607106447219849, "learning_rate": 1.3615316117542298e-05, "loss": 0.7654, "step": 12260 }, { "epoch": 3.6420302760463046, "grad_norm": 0.9346127510070801, "learning_rate": 1.3585633719204513e-05, "loss": 0.785, "step": 12270 }, { "epoch": 3.644998515880083, "grad_norm": 0.9905751943588257, "learning_rate": 1.3555951320866727e-05, "loss": 0.7745, "step": 12280 }, { "epoch": 3.6479667557138615, "grad_norm": 0.951194703578949, "learning_rate": 1.352626892252894e-05, "loss": 0.8131, "step": 12290 }, { "epoch": 3.6509349955476402, "grad_norm": 0.9166809320449829, "learning_rate": 1.3496586524191154e-05, "loss": 0.7626, "step": 12300 }, { "epoch": 3.653903235381419, "grad_norm": 0.8549669981002808, "learning_rate": 1.3466904125853369e-05, "loss": 0.7834, "step": 12310 }, { "epoch": 3.6568714752151976, "grad_norm": 0.8752672672271729, "learning_rate": 1.3437221727515583e-05, "loss": 0.8363, "step": 12320 }, { "epoch": 3.659839715048976, "grad_norm": 0.8863996267318726, "learning_rate": 1.3407539329177798e-05, "loss": 0.7923, "step": 12330 }, { "epoch": 3.6628079548827546, "grad_norm": 0.9465095400810242, "learning_rate": 1.3377856930840012e-05, "loss": 0.8218, "step": 12340 }, { "epoch": 3.665776194716533, "grad_norm": 0.9206190705299377, "learning_rate": 1.3348174532502229e-05, "loss": 0.7686, "step": 12350 }, { "epoch": 3.6687444345503115, "grad_norm": 0.9813005924224854, "learning_rate": 1.331849213416444e-05, "loss": 0.7735, "step": 12360 }, { "epoch": 3.67171267438409, "grad_norm": 0.9265437722206116, "learning_rate": 1.3288809735826654e-05, "loss": 0.7836, "step": 12370 }, { "epoch": 3.674680914217869, "grad_norm": 1.0839077234268188, "learning_rate": 1.3259127337488869e-05, "loss": 0.7987, "step": 12380 }, { "epoch": 3.6776491540516476, "grad_norm": 1.095517873764038, "learning_rate": 1.3229444939151083e-05, "loss": 0.8085, "step": 12390 }, { "epoch": 3.680617393885426, "grad_norm": 1.037738561630249, "learning_rate": 1.31997625408133e-05, "loss": 0.7919, "step": 12400 }, { "epoch": 3.6835856337192046, "grad_norm": 0.8873872756958008, "learning_rate": 1.3170080142475514e-05, "loss": 0.819, "step": 12410 }, { "epoch": 3.686553873552983, "grad_norm": 0.8964414596557617, "learning_rate": 1.3140397744137728e-05, "loss": 0.7901, "step": 12420 }, { "epoch": 3.6895221133867615, "grad_norm": 0.8763006329536438, "learning_rate": 1.3110715345799943e-05, "loss": 0.8127, "step": 12430 }, { "epoch": 3.69249035322054, "grad_norm": 0.9101526141166687, "learning_rate": 1.3081032947462154e-05, "loss": 0.8038, "step": 12440 }, { "epoch": 3.695458593054319, "grad_norm": 0.8191701173782349, "learning_rate": 1.305135054912437e-05, "loss": 0.806, "step": 12450 }, { "epoch": 3.6984268328880976, "grad_norm": 0.8987050652503967, "learning_rate": 1.3021668150786585e-05, "loss": 0.7785, "step": 12460 }, { "epoch": 3.701395072721876, "grad_norm": 0.8950318694114685, "learning_rate": 1.2991985752448799e-05, "loss": 0.7813, "step": 12470 }, { "epoch": 3.7043633125556545, "grad_norm": 0.9262118339538574, "learning_rate": 1.2962303354111013e-05, "loss": 0.8066, "step": 12480 }, { "epoch": 3.707331552389433, "grad_norm": 0.9165562391281128, "learning_rate": 1.2932620955773228e-05, "loss": 0.7721, "step": 12490 }, { "epoch": 3.7102997922232115, "grad_norm": 0.8464228510856628, "learning_rate": 1.2902938557435442e-05, "loss": 0.8219, "step": 12500 }, { "epoch": 3.71326803205699, "grad_norm": 0.8785159587860107, "learning_rate": 1.2873256159097655e-05, "loss": 0.8082, "step": 12510 }, { "epoch": 3.716236271890769, "grad_norm": 0.8984147906303406, "learning_rate": 1.284357376075987e-05, "loss": 0.7928, "step": 12520 }, { "epoch": 3.7192045117245476, "grad_norm": 0.9415210485458374, "learning_rate": 1.2813891362422084e-05, "loss": 0.7921, "step": 12530 }, { "epoch": 3.722172751558326, "grad_norm": 1.1073685884475708, "learning_rate": 1.2784208964084299e-05, "loss": 0.8225, "step": 12540 }, { "epoch": 3.7251409913921045, "grad_norm": 1.000339388847351, "learning_rate": 1.2754526565746513e-05, "loss": 0.8255, "step": 12550 }, { "epoch": 3.728109231225883, "grad_norm": 0.9128279685974121, "learning_rate": 1.2724844167408728e-05, "loss": 0.7989, "step": 12560 }, { "epoch": 3.7310774710596615, "grad_norm": 0.8923486471176147, "learning_rate": 1.2695161769070942e-05, "loss": 0.812, "step": 12570 }, { "epoch": 3.73404571089344, "grad_norm": 1.0165687799453735, "learning_rate": 1.2665479370733155e-05, "loss": 0.8055, "step": 12580 }, { "epoch": 3.737013950727219, "grad_norm": 0.964148998260498, "learning_rate": 1.263579697239537e-05, "loss": 0.8016, "step": 12590 }, { "epoch": 3.7399821905609976, "grad_norm": 0.9706274271011353, "learning_rate": 1.2606114574057584e-05, "loss": 0.7763, "step": 12600 }, { "epoch": 3.742950430394776, "grad_norm": 1.0406314134597778, "learning_rate": 1.2576432175719798e-05, "loss": 0.7821, "step": 12610 }, { "epoch": 3.7459186702285545, "grad_norm": 1.0151352882385254, "learning_rate": 1.2546749777382013e-05, "loss": 0.7774, "step": 12620 }, { "epoch": 3.7488869100623328, "grad_norm": 0.9649484753608704, "learning_rate": 1.2517067379044229e-05, "loss": 0.762, "step": 12630 }, { "epoch": 3.7518551498961115, "grad_norm": 0.9047630429267883, "learning_rate": 1.2487384980706442e-05, "loss": 0.7759, "step": 12640 }, { "epoch": 3.75482338972989, "grad_norm": 0.9317800402641296, "learning_rate": 1.2457702582368656e-05, "loss": 0.8132, "step": 12650 }, { "epoch": 3.757791629563669, "grad_norm": 1.1614782810211182, "learning_rate": 1.2428020184030869e-05, "loss": 0.7627, "step": 12660 }, { "epoch": 3.7607598693974476, "grad_norm": 0.9428204298019409, "learning_rate": 1.2398337785693084e-05, "loss": 0.7913, "step": 12670 }, { "epoch": 3.763728109231226, "grad_norm": 0.8790725469589233, "learning_rate": 1.23686553873553e-05, "loss": 0.8273, "step": 12680 }, { "epoch": 3.7666963490650045, "grad_norm": 0.8796747922897339, "learning_rate": 1.2338972989017514e-05, "loss": 0.8031, "step": 12690 }, { "epoch": 3.7696645888987828, "grad_norm": 0.8477352857589722, "learning_rate": 1.2309290590679727e-05, "loss": 0.7771, "step": 12700 }, { "epoch": 3.7726328287325614, "grad_norm": 0.9006447196006775, "learning_rate": 1.2279608192341941e-05, "loss": 0.8053, "step": 12710 }, { "epoch": 3.77560106856634, "grad_norm": 0.8733387589454651, "learning_rate": 1.2249925794004156e-05, "loss": 0.8107, "step": 12720 }, { "epoch": 3.778569308400119, "grad_norm": 0.8696321845054626, "learning_rate": 1.222024339566637e-05, "loss": 0.8131, "step": 12730 }, { "epoch": 3.7815375482338975, "grad_norm": 0.8872879147529602, "learning_rate": 1.2190560997328585e-05, "loss": 0.7902, "step": 12740 }, { "epoch": 3.784505788067676, "grad_norm": 0.9091749787330627, "learning_rate": 1.21608785989908e-05, "loss": 0.796, "step": 12750 }, { "epoch": 3.7874740279014545, "grad_norm": 0.8892313838005066, "learning_rate": 1.2131196200653014e-05, "loss": 0.8292, "step": 12760 }, { "epoch": 3.7904422677352327, "grad_norm": 0.9339777827262878, "learning_rate": 1.2101513802315227e-05, "loss": 0.8062, "step": 12770 }, { "epoch": 3.7934105075690114, "grad_norm": 0.9412952065467834, "learning_rate": 1.2071831403977441e-05, "loss": 0.8165, "step": 12780 }, { "epoch": 3.79637874740279, "grad_norm": 0.9553532004356384, "learning_rate": 1.2042149005639657e-05, "loss": 0.8031, "step": 12790 }, { "epoch": 3.799346987236569, "grad_norm": 1.0988775491714478, "learning_rate": 1.201246660730187e-05, "loss": 0.7942, "step": 12800 }, { "epoch": 3.8023152270703475, "grad_norm": 0.9005224704742432, "learning_rate": 1.1982784208964085e-05, "loss": 0.8045, "step": 12810 }, { "epoch": 3.8052834669041258, "grad_norm": 0.833794116973877, "learning_rate": 1.1953101810626299e-05, "loss": 0.7853, "step": 12820 }, { "epoch": 3.8082517067379045, "grad_norm": 0.8762853145599365, "learning_rate": 1.1923419412288514e-05, "loss": 0.8235, "step": 12830 }, { "epoch": 3.8112199465716827, "grad_norm": 0.9304106831550598, "learning_rate": 1.1893737013950728e-05, "loss": 0.7997, "step": 12840 }, { "epoch": 3.8141881864054614, "grad_norm": 0.8896927237510681, "learning_rate": 1.1864054615612943e-05, "loss": 0.7598, "step": 12850 }, { "epoch": 3.81715642623924, "grad_norm": 0.9807896614074707, "learning_rate": 1.1834372217275157e-05, "loss": 0.8045, "step": 12860 }, { "epoch": 3.820124666073019, "grad_norm": 0.9360542297363281, "learning_rate": 1.180468981893737e-05, "loss": 0.8234, "step": 12870 }, { "epoch": 3.8230929059067975, "grad_norm": 1.0262748003005981, "learning_rate": 1.1775007420599584e-05, "loss": 0.8039, "step": 12880 }, { "epoch": 3.8260611457405758, "grad_norm": 0.862922191619873, "learning_rate": 1.1745325022261799e-05, "loss": 0.8063, "step": 12890 }, { "epoch": 3.8290293855743545, "grad_norm": 0.9350073337554932, "learning_rate": 1.1715642623924015e-05, "loss": 0.7975, "step": 12900 }, { "epoch": 3.8319976254081327, "grad_norm": 1.0467214584350586, "learning_rate": 1.1685960225586228e-05, "loss": 0.7598, "step": 12910 }, { "epoch": 3.8349658652419114, "grad_norm": 0.9889064431190491, "learning_rate": 1.1656277827248442e-05, "loss": 0.7603, "step": 12920 }, { "epoch": 3.83793410507569, "grad_norm": 0.9055907130241394, "learning_rate": 1.1626595428910657e-05, "loss": 0.8148, "step": 12930 }, { "epoch": 3.840902344909469, "grad_norm": 0.8573602437973022, "learning_rate": 1.159691303057287e-05, "loss": 0.8019, "step": 12940 }, { "epoch": 3.8438705847432475, "grad_norm": 0.9720578193664551, "learning_rate": 1.1567230632235086e-05, "loss": 0.8158, "step": 12950 }, { "epoch": 3.8468388245770258, "grad_norm": 0.9214889407157898, "learning_rate": 1.15375482338973e-05, "loss": 0.8281, "step": 12960 }, { "epoch": 3.8498070644108044, "grad_norm": 0.9132649302482605, "learning_rate": 1.1507865835559515e-05, "loss": 0.7465, "step": 12970 }, { "epoch": 3.8527753042445827, "grad_norm": 0.9985355734825134, "learning_rate": 1.1478183437221727e-05, "loss": 0.8046, "step": 12980 }, { "epoch": 3.8557435440783614, "grad_norm": 0.8660067915916443, "learning_rate": 1.1448501038883942e-05, "loss": 0.7853, "step": 12990 }, { "epoch": 3.85871178391214, "grad_norm": 0.9778634309768677, "learning_rate": 1.1418818640546156e-05, "loss": 0.761, "step": 13000 }, { "epoch": 3.861680023745919, "grad_norm": 1.0151288509368896, "learning_rate": 1.1389136242208371e-05, "loss": 0.7662, "step": 13010 }, { "epoch": 3.8646482635796975, "grad_norm": 0.975917398929596, "learning_rate": 1.1359453843870585e-05, "loss": 0.7647, "step": 13020 }, { "epoch": 3.8676165034134757, "grad_norm": 0.9875814914703369, "learning_rate": 1.13297714455328e-05, "loss": 0.8328, "step": 13030 }, { "epoch": 3.8705847432472544, "grad_norm": 1.0790032148361206, "learning_rate": 1.1300089047195014e-05, "loss": 0.7757, "step": 13040 }, { "epoch": 3.8735529830810327, "grad_norm": 0.8688693642616272, "learning_rate": 1.1270406648857227e-05, "loss": 0.8035, "step": 13050 }, { "epoch": 3.8765212229148114, "grad_norm": 0.8801321983337402, "learning_rate": 1.1240724250519442e-05, "loss": 0.8028, "step": 13060 }, { "epoch": 3.87948946274859, "grad_norm": 0.9764793515205383, "learning_rate": 1.1211041852181658e-05, "loss": 0.8054, "step": 13070 }, { "epoch": 3.8824577025823688, "grad_norm": 0.9757158160209656, "learning_rate": 1.1181359453843872e-05, "loss": 0.7634, "step": 13080 }, { "epoch": 3.8854259424161475, "grad_norm": 1.0081627368927002, "learning_rate": 1.1151677055506085e-05, "loss": 0.7735, "step": 13090 }, { "epoch": 3.8883941822499257, "grad_norm": 0.9482357501983643, "learning_rate": 1.11219946571683e-05, "loss": 0.8046, "step": 13100 }, { "epoch": 3.8913624220837044, "grad_norm": 0.9332176446914673, "learning_rate": 1.1092312258830514e-05, "loss": 0.7904, "step": 13110 }, { "epoch": 3.8943306619174827, "grad_norm": 0.8813284635543823, "learning_rate": 1.1062629860492728e-05, "loss": 0.8054, "step": 13120 }, { "epoch": 3.8972989017512614, "grad_norm": 1.1398965120315552, "learning_rate": 1.1032947462154943e-05, "loss": 0.7948, "step": 13130 }, { "epoch": 3.90026714158504, "grad_norm": 0.9105324745178223, "learning_rate": 1.1003265063817157e-05, "loss": 0.7871, "step": 13140 }, { "epoch": 3.9032353814188188, "grad_norm": 0.94728684425354, "learning_rate": 1.0973582665479372e-05, "loss": 0.8171, "step": 13150 }, { "epoch": 3.9062036212525975, "grad_norm": 0.9088504314422607, "learning_rate": 1.0943900267141585e-05, "loss": 0.7787, "step": 13160 }, { "epoch": 3.9091718610863757, "grad_norm": 0.9430853724479675, "learning_rate": 1.09142178688038e-05, "loss": 0.8105, "step": 13170 }, { "epoch": 3.9121401009201544, "grad_norm": 1.0075762271881104, "learning_rate": 1.0884535470466015e-05, "loss": 0.8273, "step": 13180 }, { "epoch": 3.9151083407539327, "grad_norm": 0.9297595024108887, "learning_rate": 1.0854853072128228e-05, "loss": 0.7725, "step": 13190 }, { "epoch": 3.9180765805877114, "grad_norm": 0.9361093640327454, "learning_rate": 1.0825170673790443e-05, "loss": 0.7916, "step": 13200 }, { "epoch": 3.92104482042149, "grad_norm": 0.8753396272659302, "learning_rate": 1.0795488275452657e-05, "loss": 0.8275, "step": 13210 }, { "epoch": 3.9240130602552687, "grad_norm": 0.9651039242744446, "learning_rate": 1.0765805877114872e-05, "loss": 0.815, "step": 13220 }, { "epoch": 3.9269813000890474, "grad_norm": 0.9417471289634705, "learning_rate": 1.0736123478777086e-05, "loss": 0.7957, "step": 13230 }, { "epoch": 3.9299495399228257, "grad_norm": 1.0615134239196777, "learning_rate": 1.07064410804393e-05, "loss": 0.7787, "step": 13240 }, { "epoch": 3.9329177797566044, "grad_norm": 0.969096839427948, "learning_rate": 1.0676758682101515e-05, "loss": 0.7804, "step": 13250 }, { "epoch": 3.9358860195903826, "grad_norm": 0.8581018447875977, "learning_rate": 1.0647076283763728e-05, "loss": 0.7726, "step": 13260 }, { "epoch": 3.9388542594241613, "grad_norm": 0.934791088104248, "learning_rate": 1.0617393885425942e-05, "loss": 0.8016, "step": 13270 }, { "epoch": 3.94182249925794, "grad_norm": 0.9198674559593201, "learning_rate": 1.0587711487088157e-05, "loss": 0.7846, "step": 13280 }, { "epoch": 3.9447907390917187, "grad_norm": 0.9727395176887512, "learning_rate": 1.0558029088750373e-05, "loss": 0.8088, "step": 13290 }, { "epoch": 3.9477589789254974, "grad_norm": 0.8742444515228271, "learning_rate": 1.0528346690412586e-05, "loss": 0.8278, "step": 13300 }, { "epoch": 3.9507272187592757, "grad_norm": 1.0352386236190796, "learning_rate": 1.04986642920748e-05, "loss": 0.8211, "step": 13310 }, { "epoch": 3.9536954585930544, "grad_norm": 0.9503917694091797, "learning_rate": 1.0468981893737015e-05, "loss": 0.8355, "step": 13320 }, { "epoch": 3.9566636984268326, "grad_norm": 1.0704954862594604, "learning_rate": 1.0439299495399228e-05, "loss": 0.803, "step": 13330 }, { "epoch": 3.9596319382606113, "grad_norm": 1.072367548942566, "learning_rate": 1.0409617097061444e-05, "loss": 0.8187, "step": 13340 }, { "epoch": 3.96260017809439, "grad_norm": 0.9495874643325806, "learning_rate": 1.0379934698723658e-05, "loss": 0.7955, "step": 13350 }, { "epoch": 3.9655684179281687, "grad_norm": 0.9812706112861633, "learning_rate": 1.0350252300385873e-05, "loss": 0.8062, "step": 13360 }, { "epoch": 3.9685366577619474, "grad_norm": 0.8435708284378052, "learning_rate": 1.0320569902048085e-05, "loss": 0.8347, "step": 13370 }, { "epoch": 3.9715048975957257, "grad_norm": 0.9395439624786377, "learning_rate": 1.02908875037103e-05, "loss": 0.7698, "step": 13380 }, { "epoch": 3.9744731374295044, "grad_norm": 1.0042177438735962, "learning_rate": 1.0261205105372514e-05, "loss": 0.8013, "step": 13390 }, { "epoch": 3.9774413772632826, "grad_norm": 0.8969469666481018, "learning_rate": 1.0231522707034729e-05, "loss": 0.7825, "step": 13400 }, { "epoch": 3.9804096170970613, "grad_norm": 0.8615506887435913, "learning_rate": 1.0201840308696943e-05, "loss": 0.8354, "step": 13410 }, { "epoch": 3.98337785693084, "grad_norm": 0.893427312374115, "learning_rate": 1.0172157910359158e-05, "loss": 0.7794, "step": 13420 }, { "epoch": 3.9863460967646187, "grad_norm": 0.9048050045967102, "learning_rate": 1.0142475512021372e-05, "loss": 0.784, "step": 13430 }, { "epoch": 3.9893143365983974, "grad_norm": 0.9938015937805176, "learning_rate": 1.0112793113683585e-05, "loss": 0.8001, "step": 13440 }, { "epoch": 3.9922825764321757, "grad_norm": 0.8949004411697388, "learning_rate": 1.00831107153458e-05, "loss": 0.7774, "step": 13450 }, { "epoch": 3.9952508162659544, "grad_norm": 0.9906232953071594, "learning_rate": 1.0056396556841794e-05, "loss": 0.7881, "step": 13460 }, { "epoch": 3.9982190560997326, "grad_norm": 0.8567777276039124, "learning_rate": 1.0026714158504008e-05, "loss": 0.803, "step": 13470 }, { "epoch": 4.001187295933511, "grad_norm": 0.8931456208229065, "learning_rate": 9.997031760166221e-06, "loss": 0.7788, "step": 13480 }, { "epoch": 4.00415553576729, "grad_norm": 0.9593549370765686, "learning_rate": 9.967349361828436e-06, "loss": 0.7651, "step": 13490 }, { "epoch": 4.007123775601069, "grad_norm": 0.9598982930183411, "learning_rate": 9.93766696349065e-06, "loss": 0.7464, "step": 13500 }, { "epoch": 4.010092015434847, "grad_norm": 0.9251787662506104, "learning_rate": 9.907984565152866e-06, "loss": 0.7754, "step": 13510 }, { "epoch": 4.013060255268626, "grad_norm": 0.9092532992362976, "learning_rate": 9.878302166815079e-06, "loss": 0.766, "step": 13520 }, { "epoch": 4.016028495102404, "grad_norm": 1.02048921585083, "learning_rate": 9.848619768477293e-06, "loss": 0.741, "step": 13530 }, { "epoch": 4.018996734936183, "grad_norm": 0.9862306118011475, "learning_rate": 9.818937370139508e-06, "loss": 0.686, "step": 13540 }, { "epoch": 4.021964974769961, "grad_norm": 0.9413052797317505, "learning_rate": 9.78925497180172e-06, "loss": 0.7775, "step": 13550 }, { "epoch": 4.02493321460374, "grad_norm": 0.9390004873275757, "learning_rate": 9.759572573463937e-06, "loss": 0.7452, "step": 13560 }, { "epoch": 4.027901454437519, "grad_norm": 0.9862014055252075, "learning_rate": 9.729890175126151e-06, "loss": 0.741, "step": 13570 }, { "epoch": 4.030869694271297, "grad_norm": 0.9327501654624939, "learning_rate": 9.700207776788366e-06, "loss": 0.778, "step": 13580 }, { "epoch": 4.033837934105076, "grad_norm": 0.9384015798568726, "learning_rate": 9.670525378450579e-06, "loss": 0.784, "step": 13590 }, { "epoch": 4.036806173938854, "grad_norm": 0.8979681134223938, "learning_rate": 9.640842980112793e-06, "loss": 0.7673, "step": 13600 }, { "epoch": 4.039774413772633, "grad_norm": 0.9125372171401978, "learning_rate": 9.611160581775008e-06, "loss": 0.7654, "step": 13610 }, { "epoch": 4.042742653606411, "grad_norm": 0.9703123569488525, "learning_rate": 9.581478183437222e-06, "loss": 0.7805, "step": 13620 }, { "epoch": 4.04571089344019, "grad_norm": 1.0698387622833252, "learning_rate": 9.551795785099437e-06, "loss": 0.7656, "step": 13630 }, { "epoch": 4.048679133273969, "grad_norm": 0.8759061098098755, "learning_rate": 9.522113386761651e-06, "loss": 0.7672, "step": 13640 }, { "epoch": 4.051647373107747, "grad_norm": 1.0125908851623535, "learning_rate": 9.492430988423866e-06, "loss": 0.773, "step": 13650 }, { "epoch": 4.054615612941526, "grad_norm": 1.0893434286117554, "learning_rate": 9.462748590086078e-06, "loss": 0.7741, "step": 13660 }, { "epoch": 4.057583852775304, "grad_norm": 0.9895984530448914, "learning_rate": 9.433066191748295e-06, "loss": 0.723, "step": 13670 }, { "epoch": 4.060552092609083, "grad_norm": 1.0161770582199097, "learning_rate": 9.403383793410509e-06, "loss": 0.7353, "step": 13680 }, { "epoch": 4.063520332442861, "grad_norm": 0.9664089679718018, "learning_rate": 9.373701395072722e-06, "loss": 0.7777, "step": 13690 }, { "epoch": 4.06648857227664, "grad_norm": 1.0124640464782715, "learning_rate": 9.344018996734936e-06, "loss": 0.7382, "step": 13700 }, { "epoch": 4.069456812110419, "grad_norm": 1.000227689743042, "learning_rate": 9.31433659839715e-06, "loss": 0.7753, "step": 13710 }, { "epoch": 4.072425051944197, "grad_norm": 1.014361023902893, "learning_rate": 9.284654200059365e-06, "loss": 0.7517, "step": 13720 }, { "epoch": 4.075393291777976, "grad_norm": 0.9377306699752808, "learning_rate": 9.25497180172158e-06, "loss": 0.7417, "step": 13730 }, { "epoch": 4.078361531611754, "grad_norm": 1.0277228355407715, "learning_rate": 9.225289403383794e-06, "loss": 0.7262, "step": 13740 }, { "epoch": 4.0813297714455326, "grad_norm": 0.9528384804725647, "learning_rate": 9.195607005046009e-06, "loss": 0.7734, "step": 13750 }, { "epoch": 4.084298011279311, "grad_norm": 1.0263906717300415, "learning_rate": 9.165924606708221e-06, "loss": 0.787, "step": 13760 }, { "epoch": 4.08726625111309, "grad_norm": 1.0394752025604248, "learning_rate": 9.136242208370436e-06, "loss": 0.7997, "step": 13770 }, { "epoch": 4.090234490946869, "grad_norm": 1.042912244796753, "learning_rate": 9.10655981003265e-06, "loss": 0.7714, "step": 13780 }, { "epoch": 4.093202730780647, "grad_norm": 1.1661458015441895, "learning_rate": 9.076877411694867e-06, "loss": 0.7696, "step": 13790 }, { "epoch": 4.096170970614426, "grad_norm": 0.9394704699516296, "learning_rate": 9.04719501335708e-06, "loss": 0.7691, "step": 13800 }, { "epoch": 4.099139210448204, "grad_norm": 0.9088842272758484, "learning_rate": 9.017512615019294e-06, "loss": 0.7794, "step": 13810 }, { "epoch": 4.1021074502819825, "grad_norm": 0.8951979279518127, "learning_rate": 8.987830216681508e-06, "loss": 0.74, "step": 13820 }, { "epoch": 4.105075690115761, "grad_norm": 0.9268091917037964, "learning_rate": 8.958147818343721e-06, "loss": 0.7488, "step": 13830 }, { "epoch": 4.10804392994954, "grad_norm": 0.9783716797828674, "learning_rate": 8.928465420005937e-06, "loss": 0.7325, "step": 13840 }, { "epoch": 4.111012169783319, "grad_norm": 1.072424292564392, "learning_rate": 8.898783021668152e-06, "loss": 0.7322, "step": 13850 }, { "epoch": 4.113980409617097, "grad_norm": 0.9002495408058167, "learning_rate": 8.869100623330366e-06, "loss": 0.7884, "step": 13860 }, { "epoch": 4.116948649450876, "grad_norm": 0.8875764012336731, "learning_rate": 8.839418224992579e-06, "loss": 0.7118, "step": 13870 }, { "epoch": 4.119916889284654, "grad_norm": 1.01841402053833, "learning_rate": 8.809735826654794e-06, "loss": 0.7226, "step": 13880 }, { "epoch": 4.1228851291184325, "grad_norm": 0.9352704286575317, "learning_rate": 8.780053428317008e-06, "loss": 0.7599, "step": 13890 }, { "epoch": 4.125853368952211, "grad_norm": 0.9383290410041809, "learning_rate": 8.750371029979223e-06, "loss": 0.7636, "step": 13900 }, { "epoch": 4.12882160878599, "grad_norm": 0.9522103071212769, "learning_rate": 8.720688631641437e-06, "loss": 0.7806, "step": 13910 }, { "epoch": 4.131789848619769, "grad_norm": 0.9566015601158142, "learning_rate": 8.691006233303652e-06, "loss": 0.729, "step": 13920 }, { "epoch": 4.134758088453547, "grad_norm": 0.9936237335205078, "learning_rate": 8.661323834965866e-06, "loss": 0.786, "step": 13930 }, { "epoch": 4.137726328287326, "grad_norm": 1.0021575689315796, "learning_rate": 8.631641436628079e-06, "loss": 0.7579, "step": 13940 }, { "epoch": 4.140694568121104, "grad_norm": 0.9321087002754211, "learning_rate": 8.601959038290295e-06, "loss": 0.7369, "step": 13950 }, { "epoch": 4.1436628079548825, "grad_norm": 0.9588301777839661, "learning_rate": 8.57227663995251e-06, "loss": 0.7365, "step": 13960 }, { "epoch": 4.146631047788661, "grad_norm": 1.0697441101074219, "learning_rate": 8.542594241614722e-06, "loss": 0.7313, "step": 13970 }, { "epoch": 4.14959928762244, "grad_norm": 0.9477381110191345, "learning_rate": 8.512911843276937e-06, "loss": 0.7815, "step": 13980 }, { "epoch": 4.152567527456219, "grad_norm": 0.915964663028717, "learning_rate": 8.483229444939151e-06, "loss": 0.7679, "step": 13990 }, { "epoch": 4.155535767289997, "grad_norm": 0.9938797354698181, "learning_rate": 8.453547046601366e-06, "loss": 0.7535, "step": 14000 }, { "epoch": 4.158504007123776, "grad_norm": 0.9842844009399414, "learning_rate": 8.42386464826358e-06, "loss": 0.732, "step": 14010 }, { "epoch": 4.161472246957554, "grad_norm": 0.9590873122215271, "learning_rate": 8.394182249925795e-06, "loss": 0.7565, "step": 14020 }, { "epoch": 4.1644404867913325, "grad_norm": 0.9819791316986084, "learning_rate": 8.364499851588009e-06, "loss": 0.7559, "step": 14030 }, { "epoch": 4.167408726625111, "grad_norm": 1.014815092086792, "learning_rate": 8.334817453250222e-06, "loss": 0.794, "step": 14040 }, { "epoch": 4.17037696645889, "grad_norm": 1.020995855331421, "learning_rate": 8.305135054912436e-06, "loss": 0.736, "step": 14050 }, { "epoch": 4.173345206292669, "grad_norm": 1.0287991762161255, "learning_rate": 8.275452656574653e-06, "loss": 0.7423, "step": 14060 }, { "epoch": 4.176313446126447, "grad_norm": 0.9976941347122192, "learning_rate": 8.245770258236867e-06, "loss": 0.7581, "step": 14070 }, { "epoch": 4.179281685960226, "grad_norm": 0.9983686804771423, "learning_rate": 8.21608785989908e-06, "loss": 0.7418, "step": 14080 }, { "epoch": 4.182249925794004, "grad_norm": 0.8948219418525696, "learning_rate": 8.186405461561294e-06, "loss": 0.7374, "step": 14090 }, { "epoch": 4.1852181656277825, "grad_norm": 0.9339085221290588, "learning_rate": 8.156723063223509e-06, "loss": 0.7389, "step": 14100 }, { "epoch": 4.188186405461561, "grad_norm": 0.9872913956642151, "learning_rate": 8.127040664885723e-06, "loss": 0.7815, "step": 14110 }, { "epoch": 4.19115464529534, "grad_norm": 1.003295660018921, "learning_rate": 8.097358266547938e-06, "loss": 0.7535, "step": 14120 }, { "epoch": 4.194122885129119, "grad_norm": 0.9385514259338379, "learning_rate": 8.067675868210152e-06, "loss": 0.7293, "step": 14130 }, { "epoch": 4.197091124962897, "grad_norm": 1.1677844524383545, "learning_rate": 8.037993469872367e-06, "loss": 0.7673, "step": 14140 }, { "epoch": 4.200059364796676, "grad_norm": 1.0024638175964355, "learning_rate": 8.00831107153458e-06, "loss": 0.7371, "step": 14150 }, { "epoch": 4.203027604630454, "grad_norm": 1.0117018222808838, "learning_rate": 7.978628673196794e-06, "loss": 0.7823, "step": 14160 }, { "epoch": 4.2059958444642325, "grad_norm": 0.9616550803184509, "learning_rate": 7.94894627485901e-06, "loss": 0.7307, "step": 14170 }, { "epoch": 4.208964084298011, "grad_norm": 0.9083321690559387, "learning_rate": 7.919263876521223e-06, "loss": 0.7666, "step": 14180 }, { "epoch": 4.21193232413179, "grad_norm": 1.014846682548523, "learning_rate": 7.889581478183437e-06, "loss": 0.7621, "step": 14190 }, { "epoch": 4.214900563965569, "grad_norm": 1.0015004873275757, "learning_rate": 7.859899079845652e-06, "loss": 0.7747, "step": 14200 }, { "epoch": 4.217868803799347, "grad_norm": 1.0885722637176514, "learning_rate": 7.830216681507866e-06, "loss": 0.7743, "step": 14210 }, { "epoch": 4.220837043633126, "grad_norm": 1.0359748601913452, "learning_rate": 7.800534283170081e-06, "loss": 0.7793, "step": 14220 }, { "epoch": 4.223805283466904, "grad_norm": 0.9756050109863281, "learning_rate": 7.770851884832295e-06, "loss": 0.7608, "step": 14230 }, { "epoch": 4.2267735233006825, "grad_norm": 1.0150599479675293, "learning_rate": 7.74116948649451e-06, "loss": 0.7263, "step": 14240 }, { "epoch": 4.229741763134461, "grad_norm": 0.9713482856750488, "learning_rate": 7.711487088156723e-06, "loss": 0.7623, "step": 14250 }, { "epoch": 4.23271000296824, "grad_norm": 0.9168558120727539, "learning_rate": 7.681804689818937e-06, "loss": 0.7639, "step": 14260 }, { "epoch": 4.2356782428020185, "grad_norm": 0.9601516723632812, "learning_rate": 7.652122291481152e-06, "loss": 0.7779, "step": 14270 }, { "epoch": 4.238646482635797, "grad_norm": 0.9309169054031372, "learning_rate": 7.622439893143367e-06, "loss": 0.763, "step": 14280 }, { "epoch": 4.241614722469576, "grad_norm": 1.023237943649292, "learning_rate": 7.5927574948055806e-06, "loss": 0.7467, "step": 14290 }, { "epoch": 4.244582962303354, "grad_norm": 1.0153001546859741, "learning_rate": 7.563075096467795e-06, "loss": 0.7316, "step": 14300 }, { "epoch": 4.247551202137132, "grad_norm": 1.017930269241333, "learning_rate": 7.5333926981300095e-06, "loss": 0.761, "step": 14310 }, { "epoch": 4.250519441970911, "grad_norm": 1.10524582862854, "learning_rate": 7.503710299792223e-06, "loss": 0.7484, "step": 14320 }, { "epoch": 4.25348768180469, "grad_norm": 1.083328366279602, "learning_rate": 7.474027901454438e-06, "loss": 0.7791, "step": 14330 }, { "epoch": 4.2564559216384685, "grad_norm": 0.9794231057167053, "learning_rate": 7.444345503116652e-06, "loss": 0.7829, "step": 14340 }, { "epoch": 4.259424161472247, "grad_norm": 1.0356334447860718, "learning_rate": 7.414663104778867e-06, "loss": 0.7279, "step": 14350 }, { "epoch": 4.262392401306026, "grad_norm": 0.8205145001411438, "learning_rate": 7.38498070644108e-06, "loss": 0.7455, "step": 14360 }, { "epoch": 4.265360641139804, "grad_norm": 1.0228333473205566, "learning_rate": 7.355298308103295e-06, "loss": 0.7565, "step": 14370 }, { "epoch": 4.268328880973582, "grad_norm": 1.025168538093567, "learning_rate": 7.32561590976551e-06, "loss": 0.7615, "step": 14380 }, { "epoch": 4.271297120807361, "grad_norm": 1.0346413850784302, "learning_rate": 7.295933511427723e-06, "loss": 0.7875, "step": 14390 }, { "epoch": 4.27426536064114, "grad_norm": 0.9277398586273193, "learning_rate": 7.266251113089937e-06, "loss": 0.7342, "step": 14400 }, { "epoch": 4.2772336004749185, "grad_norm": 1.0324021577835083, "learning_rate": 7.236568714752153e-06, "loss": 0.7162, "step": 14410 }, { "epoch": 4.280201840308697, "grad_norm": 0.9764350652694702, "learning_rate": 7.206886316414367e-06, "loss": 0.7526, "step": 14420 }, { "epoch": 4.283170080142476, "grad_norm": 0.994429886341095, "learning_rate": 7.177203918076581e-06, "loss": 0.7379, "step": 14430 }, { "epoch": 4.286138319976254, "grad_norm": 0.9952360987663269, "learning_rate": 7.147521519738795e-06, "loss": 0.7566, "step": 14440 }, { "epoch": 4.289106559810032, "grad_norm": 1.0014885663986206, "learning_rate": 7.11783912140101e-06, "loss": 0.7761, "step": 14450 }, { "epoch": 4.292074799643811, "grad_norm": 0.9838289618492126, "learning_rate": 7.088156723063224e-06, "loss": 0.7375, "step": 14460 }, { "epoch": 4.29504303947759, "grad_norm": 0.907821536064148, "learning_rate": 7.058474324725438e-06, "loss": 0.7259, "step": 14470 }, { "epoch": 4.2980112793113685, "grad_norm": 1.0343751907348633, "learning_rate": 7.028791926387652e-06, "loss": 0.7805, "step": 14480 }, { "epoch": 4.300979519145147, "grad_norm": 0.8860198259353638, "learning_rate": 6.999109528049868e-06, "loss": 0.7796, "step": 14490 }, { "epoch": 4.303947758978926, "grad_norm": 1.0006810426712036, "learning_rate": 6.9694271297120805e-06, "loss": 0.7631, "step": 14500 }, { "epoch": 4.306915998812704, "grad_norm": 0.976875364780426, "learning_rate": 6.939744731374295e-06, "loss": 0.7539, "step": 14510 }, { "epoch": 4.309884238646482, "grad_norm": 0.9557456374168396, "learning_rate": 6.91006233303651e-06, "loss": 0.7961, "step": 14520 }, { "epoch": 4.312852478480261, "grad_norm": 0.9540473818778992, "learning_rate": 6.880379934698725e-06, "loss": 0.7522, "step": 14530 }, { "epoch": 4.31582071831404, "grad_norm": 1.0581246614456177, "learning_rate": 6.850697536360938e-06, "loss": 0.7724, "step": 14540 }, { "epoch": 4.3187889581478185, "grad_norm": 1.0394785404205322, "learning_rate": 6.821015138023153e-06, "loss": 0.7827, "step": 14550 }, { "epoch": 4.321757197981597, "grad_norm": 0.9216113686561584, "learning_rate": 6.791332739685367e-06, "loss": 0.7613, "step": 14560 }, { "epoch": 4.324725437815376, "grad_norm": 0.980540931224823, "learning_rate": 6.761650341347581e-06, "loss": 0.7943, "step": 14570 }, { "epoch": 4.327693677649154, "grad_norm": 0.999685525894165, "learning_rate": 6.7319679430097955e-06, "loss": 0.7401, "step": 14580 }, { "epoch": 4.330661917482932, "grad_norm": 0.9493926167488098, "learning_rate": 6.70228554467201e-06, "loss": 0.7254, "step": 14590 }, { "epoch": 4.333630157316711, "grad_norm": 0.9986134767532349, "learning_rate": 6.6726031463342244e-06, "loss": 0.7624, "step": 14600 }, { "epoch": 4.33659839715049, "grad_norm": 0.9402568340301514, "learning_rate": 6.642920747996438e-06, "loss": 0.7523, "step": 14610 }, { "epoch": 4.3395666369842685, "grad_norm": 1.1779499053955078, "learning_rate": 6.6132383496586525e-06, "loss": 0.7676, "step": 14620 }, { "epoch": 4.342534876818047, "grad_norm": 0.9890404343605042, "learning_rate": 6.583555951320868e-06, "loss": 0.7462, "step": 14630 }, { "epoch": 4.345503116651826, "grad_norm": 1.0612539052963257, "learning_rate": 6.553873552983081e-06, "loss": 0.7368, "step": 14640 }, { "epoch": 4.348471356485604, "grad_norm": 0.9921562671661377, "learning_rate": 6.524191154645295e-06, "loss": 0.7644, "step": 14650 }, { "epoch": 4.351439596319382, "grad_norm": 0.9459686279296875, "learning_rate": 6.4945087563075105e-06, "loss": 0.7567, "step": 14660 }, { "epoch": 4.354407836153161, "grad_norm": 0.941131055355072, "learning_rate": 6.464826357969725e-06, "loss": 0.7603, "step": 14670 }, { "epoch": 4.35737607598694, "grad_norm": 0.8995440006256104, "learning_rate": 6.435143959631939e-06, "loss": 0.767, "step": 14680 }, { "epoch": 4.3603443158207185, "grad_norm": 0.9734925031661987, "learning_rate": 6.405461561294153e-06, "loss": 0.7548, "step": 14690 }, { "epoch": 4.363312555654497, "grad_norm": 0.9570586681365967, "learning_rate": 6.3757791629563676e-06, "loss": 0.7515, "step": 14700 }, { "epoch": 4.366280795488276, "grad_norm": 0.9729474782943726, "learning_rate": 6.346096764618581e-06, "loss": 0.7688, "step": 14710 }, { "epoch": 4.369249035322054, "grad_norm": 0.9383680820465088, "learning_rate": 6.316414366280796e-06, "loss": 0.7651, "step": 14720 }, { "epoch": 4.372217275155832, "grad_norm": 1.0096242427825928, "learning_rate": 6.28673196794301e-06, "loss": 0.7359, "step": 14730 }, { "epoch": 4.375185514989611, "grad_norm": 0.9830286502838135, "learning_rate": 6.257049569605225e-06, "loss": 0.7314, "step": 14740 }, { "epoch": 4.37815375482339, "grad_norm": 0.9673001170158386, "learning_rate": 6.227367171267439e-06, "loss": 0.7388, "step": 14750 }, { "epoch": 4.3811219946571685, "grad_norm": 1.0183544158935547, "learning_rate": 6.197684772929653e-06, "loss": 0.7747, "step": 14760 }, { "epoch": 4.384090234490947, "grad_norm": 1.0080556869506836, "learning_rate": 6.168002374591867e-06, "loss": 0.7772, "step": 14770 }, { "epoch": 4.387058474324726, "grad_norm": 1.1509473323822021, "learning_rate": 6.138319976254082e-06, "loss": 0.7222, "step": 14780 }, { "epoch": 4.390026714158504, "grad_norm": 0.9294554591178894, "learning_rate": 6.108637577916295e-06, "loss": 0.7475, "step": 14790 }, { "epoch": 4.392994953992282, "grad_norm": 0.8813176155090332, "learning_rate": 6.078955179578511e-06, "loss": 0.7427, "step": 14800 }, { "epoch": 4.395963193826061, "grad_norm": 1.0649441480636597, "learning_rate": 6.049272781240724e-06, "loss": 0.7343, "step": 14810 }, { "epoch": 4.39893143365984, "grad_norm": 1.050569772720337, "learning_rate": 6.019590382902939e-06, "loss": 0.7447, "step": 14820 }, { "epoch": 4.401899673493618, "grad_norm": 1.0193036794662476, "learning_rate": 5.989907984565153e-06, "loss": 0.7241, "step": 14830 }, { "epoch": 4.404867913327397, "grad_norm": 1.064293622970581, "learning_rate": 5.960225586227367e-06, "loss": 0.7298, "step": 14840 }, { "epoch": 4.407836153161176, "grad_norm": 0.9010776281356812, "learning_rate": 5.930543187889582e-06, "loss": 0.7411, "step": 14850 }, { "epoch": 4.410804392994954, "grad_norm": 0.9958186745643616, "learning_rate": 5.900860789551796e-06, "loss": 0.7527, "step": 14860 }, { "epoch": 4.413772632828732, "grad_norm": 1.0218920707702637, "learning_rate": 5.87117839121401e-06, "loss": 0.7765, "step": 14870 }, { "epoch": 4.416740872662511, "grad_norm": 0.9252408742904663, "learning_rate": 5.841495992876225e-06, "loss": 0.7227, "step": 14880 }, { "epoch": 4.41970911249629, "grad_norm": 1.0993282794952393, "learning_rate": 5.811813594538439e-06, "loss": 0.7219, "step": 14890 }, { "epoch": 4.422677352330068, "grad_norm": 0.9480684399604797, "learning_rate": 5.782131196200653e-06, "loss": 0.7533, "step": 14900 }, { "epoch": 4.425645592163847, "grad_norm": 1.0021878480911255, "learning_rate": 5.7524487978628674e-06, "loss": 0.7884, "step": 14910 }, { "epoch": 4.428613831997626, "grad_norm": 0.9122134447097778, "learning_rate": 5.722766399525082e-06, "loss": 0.741, "step": 14920 }, { "epoch": 4.431582071831404, "grad_norm": 0.956417441368103, "learning_rate": 5.693084001187296e-06, "loss": 0.7292, "step": 14930 }, { "epoch": 4.434550311665182, "grad_norm": 0.9715415835380554, "learning_rate": 5.663401602849511e-06, "loss": 0.7488, "step": 14940 }, { "epoch": 4.437518551498961, "grad_norm": 1.014623999595642, "learning_rate": 5.6337192045117245e-06, "loss": 0.7594, "step": 14950 }, { "epoch": 4.44048679133274, "grad_norm": 0.9471456408500671, "learning_rate": 5.604036806173939e-06, "loss": 0.7512, "step": 14960 }, { "epoch": 4.443455031166518, "grad_norm": 1.072137713432312, "learning_rate": 5.5743544078361535e-06, "loss": 0.7842, "step": 14970 }, { "epoch": 4.446423271000297, "grad_norm": 1.037337303161621, "learning_rate": 5.544672009498367e-06, "loss": 0.7904, "step": 14980 }, { "epoch": 4.449391510834076, "grad_norm": 0.995496928691864, "learning_rate": 5.5149896111605824e-06, "loss": 0.744, "step": 14990 }, { "epoch": 4.452359750667854, "grad_norm": 1.038312315940857, "learning_rate": 5.485307212822796e-06, "loss": 0.7474, "step": 15000 }, { "epoch": 4.455327990501632, "grad_norm": 1.0620917081832886, "learning_rate": 5.4556248144850106e-06, "loss": 0.753, "step": 15010 }, { "epoch": 4.458296230335411, "grad_norm": 1.2397912740707397, "learning_rate": 5.425942416147225e-06, "loss": 0.767, "step": 15020 }, { "epoch": 4.46126447016919, "grad_norm": 0.911263644695282, "learning_rate": 5.3962600178094395e-06, "loss": 0.717, "step": 15030 }, { "epoch": 4.464232710002968, "grad_norm": 1.0086246728897095, "learning_rate": 5.366577619471653e-06, "loss": 0.7645, "step": 15040 }, { "epoch": 4.467200949836747, "grad_norm": 1.0217297077178955, "learning_rate": 5.336895221133868e-06, "loss": 0.7424, "step": 15050 }, { "epoch": 4.470169189670526, "grad_norm": 1.0628533363342285, "learning_rate": 5.307212822796082e-06, "loss": 0.7538, "step": 15060 }, { "epoch": 4.473137429504304, "grad_norm": 0.9455434083938599, "learning_rate": 5.277530424458297e-06, "loss": 0.738, "step": 15070 }, { "epoch": 4.476105669338082, "grad_norm": 0.9059712290763855, "learning_rate": 5.247848026120511e-06, "loss": 0.7403, "step": 15080 }, { "epoch": 4.479073909171861, "grad_norm": 1.0143680572509766, "learning_rate": 5.218165627782725e-06, "loss": 0.7271, "step": 15090 }, { "epoch": 4.48204214900564, "grad_norm": 1.046194314956665, "learning_rate": 5.18848322944494e-06, "loss": 0.8082, "step": 15100 }, { "epoch": 4.485010388839418, "grad_norm": 0.9987038969993591, "learning_rate": 5.158800831107154e-06, "loss": 0.7505, "step": 15110 }, { "epoch": 4.487978628673197, "grad_norm": 0.983883798122406, "learning_rate": 5.129118432769367e-06, "loss": 0.7461, "step": 15120 }, { "epoch": 4.490946868506976, "grad_norm": 1.0110454559326172, "learning_rate": 5.099436034431583e-06, "loss": 0.7858, "step": 15130 }, { "epoch": 4.493915108340754, "grad_norm": 1.2014484405517578, "learning_rate": 5.069753636093796e-06, "loss": 0.7998, "step": 15140 }, { "epoch": 4.496883348174532, "grad_norm": 0.9484744071960449, "learning_rate": 5.040071237756011e-06, "loss": 0.7701, "step": 15150 }, { "epoch": 4.499851588008311, "grad_norm": 1.0080126523971558, "learning_rate": 5.010388839418225e-06, "loss": 0.7773, "step": 15160 }, { "epoch": 4.50281982784209, "grad_norm": 1.0008039474487305, "learning_rate": 4.98070644108044e-06, "loss": 0.7289, "step": 15170 }, { "epoch": 4.505788067675868, "grad_norm": 1.0211683511734009, "learning_rate": 4.951024042742654e-06, "loss": 0.7661, "step": 15180 }, { "epoch": 4.508756307509647, "grad_norm": 0.9978165626525879, "learning_rate": 4.921341644404868e-06, "loss": 0.7935, "step": 15190 }, { "epoch": 4.511724547343425, "grad_norm": 1.0135602951049805, "learning_rate": 4.891659246067082e-06, "loss": 0.7471, "step": 15200 }, { "epoch": 4.514692787177204, "grad_norm": 0.928688108921051, "learning_rate": 4.861976847729297e-06, "loss": 0.7576, "step": 15210 }, { "epoch": 4.517661027010982, "grad_norm": 0.9365946054458618, "learning_rate": 4.832294449391511e-06, "loss": 0.7482, "step": 15220 }, { "epoch": 4.520629266844761, "grad_norm": 0.9219352602958679, "learning_rate": 4.802612051053725e-06, "loss": 0.7675, "step": 15230 }, { "epoch": 4.52359750667854, "grad_norm": 1.0753960609436035, "learning_rate": 4.77292965271594e-06, "loss": 0.7668, "step": 15240 }, { "epoch": 4.526565746512318, "grad_norm": 0.9870726466178894, "learning_rate": 4.743247254378154e-06, "loss": 0.7696, "step": 15250 }, { "epoch": 4.529533986346097, "grad_norm": 0.9948970675468445, "learning_rate": 4.713564856040368e-06, "loss": 0.7636, "step": 15260 }, { "epoch": 4.532502226179876, "grad_norm": 0.9187266230583191, "learning_rate": 4.683882457702583e-06, "loss": 0.7398, "step": 15270 }, { "epoch": 4.535470466013654, "grad_norm": 0.8769642114639282, "learning_rate": 4.6542000593647965e-06, "loss": 0.7604, "step": 15280 }, { "epoch": 4.538438705847432, "grad_norm": 0.8811549544334412, "learning_rate": 4.624517661027011e-06, "loss": 0.7802, "step": 15290 }, { "epoch": 4.541406945681211, "grad_norm": 1.0554735660552979, "learning_rate": 4.5948352626892255e-06, "loss": 0.7527, "step": 15300 }, { "epoch": 4.54437518551499, "grad_norm": 1.1264728307724, "learning_rate": 4.56515286435144e-06, "loss": 0.7845, "step": 15310 }, { "epoch": 4.547343425348768, "grad_norm": 1.0017541646957397, "learning_rate": 4.5354704660136544e-06, "loss": 0.7548, "step": 15320 }, { "epoch": 4.550311665182547, "grad_norm": 0.9614369869232178, "learning_rate": 4.505788067675869e-06, "loss": 0.7319, "step": 15330 }, { "epoch": 4.553279905016325, "grad_norm": 1.0290606021881104, "learning_rate": 4.4761056693380825e-06, "loss": 0.7621, "step": 15340 }, { "epoch": 4.5562481448501035, "grad_norm": 1.0442272424697876, "learning_rate": 4.446423271000297e-06, "loss": 0.7277, "step": 15350 }, { "epoch": 4.559216384683882, "grad_norm": 1.0785348415374756, "learning_rate": 4.4167408726625115e-06, "loss": 0.7557, "step": 15360 }, { "epoch": 4.562184624517661, "grad_norm": 0.9761595129966736, "learning_rate": 4.387058474324725e-06, "loss": 0.7443, "step": 15370 }, { "epoch": 4.56515286435144, "grad_norm": 1.0198564529418945, "learning_rate": 4.3573760759869405e-06, "loss": 0.7454, "step": 15380 }, { "epoch": 4.568121104185218, "grad_norm": 0.9881998300552368, "learning_rate": 4.327693677649154e-06, "loss": 0.79, "step": 15390 }, { "epoch": 4.571089344018997, "grad_norm": 1.0645211935043335, "learning_rate": 4.298011279311369e-06, "loss": 0.759, "step": 15400 }, { "epoch": 4.574057583852776, "grad_norm": 0.9828948974609375, "learning_rate": 4.268328880973583e-06, "loss": 0.7436, "step": 15410 }, { "epoch": 4.5770258236865535, "grad_norm": 0.9696997404098511, "learning_rate": 4.238646482635797e-06, "loss": 0.7422, "step": 15420 }, { "epoch": 4.579994063520332, "grad_norm": 1.0432863235473633, "learning_rate": 4.208964084298011e-06, "loss": 0.7918, "step": 15430 }, { "epoch": 4.582962303354111, "grad_norm": 1.055452823638916, "learning_rate": 4.179281685960226e-06, "loss": 0.7642, "step": 15440 }, { "epoch": 4.58593054318789, "grad_norm": 0.9821627736091614, "learning_rate": 4.14959928762244e-06, "loss": 0.7921, "step": 15450 }, { "epoch": 4.588898783021668, "grad_norm": 1.0450557470321655, "learning_rate": 4.119916889284655e-06, "loss": 0.7618, "step": 15460 }, { "epoch": 4.591867022855447, "grad_norm": 1.0069783926010132, "learning_rate": 4.090234490946869e-06, "loss": 0.7691, "step": 15470 }, { "epoch": 4.594835262689225, "grad_norm": 1.1639186143875122, "learning_rate": 4.060552092609083e-06, "loss": 0.7418, "step": 15480 }, { "epoch": 4.5978035025230035, "grad_norm": 0.9840888381004333, "learning_rate": 4.030869694271297e-06, "loss": 0.7594, "step": 15490 }, { "epoch": 4.600771742356782, "grad_norm": 1.1109092235565186, "learning_rate": 4.001187295933512e-06, "loss": 0.7433, "step": 15500 }, { "epoch": 4.603739982190561, "grad_norm": 0.9764207601547241, "learning_rate": 3.971504897595725e-06, "loss": 0.7388, "step": 15510 }, { "epoch": 4.60670822202434, "grad_norm": 1.190716028213501, "learning_rate": 3.941822499257941e-06, "loss": 0.7682, "step": 15520 }, { "epoch": 4.609676461858118, "grad_norm": 1.028148889541626, "learning_rate": 3.912140100920154e-06, "loss": 0.7526, "step": 15530 }, { "epoch": 4.612644701691897, "grad_norm": 1.0454537868499756, "learning_rate": 3.882457702582369e-06, "loss": 0.7571, "step": 15540 }, { "epoch": 4.615612941525676, "grad_norm": 0.9647070169448853, "learning_rate": 3.852775304244583e-06, "loss": 0.7305, "step": 15550 }, { "epoch": 4.6185811813594535, "grad_norm": 1.0753306150436401, "learning_rate": 3.823092905906797e-06, "loss": 0.7503, "step": 15560 }, { "epoch": 4.621549421193232, "grad_norm": 0.9869060516357422, "learning_rate": 3.793410507569012e-06, "loss": 0.7693, "step": 15570 }, { "epoch": 4.624517661027011, "grad_norm": 0.9355076551437378, "learning_rate": 3.763728109231226e-06, "loss": 0.7838, "step": 15580 }, { "epoch": 4.62748590086079, "grad_norm": 1.0376447439193726, "learning_rate": 3.7340457108934404e-06, "loss": 0.7731, "step": 15590 }, { "epoch": 4.630454140694568, "grad_norm": 0.9946711659431458, "learning_rate": 3.7043633125556544e-06, "loss": 0.7275, "step": 15600 }, { "epoch": 4.633422380528347, "grad_norm": 1.0657856464385986, "learning_rate": 3.6746809142178693e-06, "loss": 0.7254, "step": 15610 }, { "epoch": 4.636390620362125, "grad_norm": 1.0746755599975586, "learning_rate": 3.6449985158800834e-06, "loss": 0.7705, "step": 15620 }, { "epoch": 4.6393588601959035, "grad_norm": 1.1331027746200562, "learning_rate": 3.6182843573760763e-06, "loss": 0.7782, "step": 15630 }, { "epoch": 4.642327100029682, "grad_norm": 0.9259887337684631, "learning_rate": 3.5886019590382904e-06, "loss": 0.7686, "step": 15640 }, { "epoch": 4.645295339863461, "grad_norm": 0.9392924904823303, "learning_rate": 3.558919560700505e-06, "loss": 0.7045, "step": 15650 }, { "epoch": 4.64826357969724, "grad_norm": 0.9543144702911377, "learning_rate": 3.529237162362719e-06, "loss": 0.7865, "step": 15660 }, { "epoch": 4.651231819531018, "grad_norm": 1.050999641418457, "learning_rate": 3.499554764024934e-06, "loss": 0.7581, "step": 15670 }, { "epoch": 4.654200059364797, "grad_norm": 0.9569733738899231, "learning_rate": 3.4698723656871475e-06, "loss": 0.7632, "step": 15680 }, { "epoch": 4.657168299198576, "grad_norm": 1.073243498802185, "learning_rate": 3.4401899673493624e-06, "loss": 0.7074, "step": 15690 }, { "epoch": 4.6601365390323535, "grad_norm": 1.131837010383606, "learning_rate": 3.4105075690115764e-06, "loss": 0.7503, "step": 15700 }, { "epoch": 4.663104778866132, "grad_norm": 1.0623327493667603, "learning_rate": 3.3808251706737905e-06, "loss": 0.7705, "step": 15710 }, { "epoch": 4.666073018699911, "grad_norm": 1.076809048652649, "learning_rate": 3.351142772336005e-06, "loss": 0.7448, "step": 15720 }, { "epoch": 4.66904125853369, "grad_norm": 0.9963298439979553, "learning_rate": 3.321460373998219e-06, "loss": 0.7125, "step": 15730 }, { "epoch": 4.672009498367468, "grad_norm": 1.0451970100402832, "learning_rate": 3.291777975660434e-06, "loss": 0.79, "step": 15740 }, { "epoch": 4.674977738201247, "grad_norm": 0.9352405071258545, "learning_rate": 3.2620955773226476e-06, "loss": 0.7646, "step": 15750 }, { "epoch": 4.677945978035025, "grad_norm": 1.0493252277374268, "learning_rate": 3.2324131789848625e-06, "loss": 0.7355, "step": 15760 }, { "epoch": 4.6809142178688035, "grad_norm": 0.9438860416412354, "learning_rate": 3.2027307806470765e-06, "loss": 0.7492, "step": 15770 }, { "epoch": 4.683882457702582, "grad_norm": 0.8936295509338379, "learning_rate": 3.1730483823092906e-06, "loss": 0.745, "step": 15780 }, { "epoch": 4.686850697536361, "grad_norm": 1.0204068422317505, "learning_rate": 3.143365983971505e-06, "loss": 0.7576, "step": 15790 }, { "epoch": 4.6898189373701396, "grad_norm": 0.9555333256721497, "learning_rate": 3.1136835856337196e-06, "loss": 0.7595, "step": 15800 }, { "epoch": 4.692787177203918, "grad_norm": 0.9514308571815491, "learning_rate": 3.0840011872959336e-06, "loss": 0.7565, "step": 15810 }, { "epoch": 4.695755417037697, "grad_norm": 1.0169340372085571, "learning_rate": 3.0543187889581477e-06, "loss": 0.7104, "step": 15820 }, { "epoch": 4.698723656871476, "grad_norm": 0.9857751131057739, "learning_rate": 3.024636390620362e-06, "loss": 0.7475, "step": 15830 }, { "epoch": 4.7016918967052534, "grad_norm": 0.9653582572937012, "learning_rate": 2.9949539922825766e-06, "loss": 0.76, "step": 15840 }, { "epoch": 4.704660136539032, "grad_norm": 1.0300312042236328, "learning_rate": 2.965271593944791e-06, "loss": 0.7327, "step": 15850 }, { "epoch": 4.707628376372811, "grad_norm": 1.1477298736572266, "learning_rate": 2.935589195607005e-06, "loss": 0.773, "step": 15860 }, { "epoch": 4.7105966162065895, "grad_norm": 0.9286091327667236, "learning_rate": 2.9059067972692197e-06, "loss": 0.7152, "step": 15870 }, { "epoch": 4.713564856040368, "grad_norm": 1.0865973234176636, "learning_rate": 2.8762243989314337e-06, "loss": 0.7692, "step": 15880 }, { "epoch": 4.716533095874147, "grad_norm": 1.0828102827072144, "learning_rate": 2.846542000593648e-06, "loss": 0.7591, "step": 15890 }, { "epoch": 4.719501335707925, "grad_norm": 0.9569112062454224, "learning_rate": 2.8168596022558623e-06, "loss": 0.7642, "step": 15900 }, { "epoch": 4.722469575541703, "grad_norm": 0.9393323659896851, "learning_rate": 2.7871772039180767e-06, "loss": 0.7994, "step": 15910 }, { "epoch": 4.725437815375482, "grad_norm": 0.9811632633209229, "learning_rate": 2.7574948055802912e-06, "loss": 0.7991, "step": 15920 }, { "epoch": 4.728406055209261, "grad_norm": 1.021985411643982, "learning_rate": 2.7278124072425053e-06, "loss": 0.7422, "step": 15930 }, { "epoch": 4.7313742950430395, "grad_norm": 1.0036324262619019, "learning_rate": 2.6981300089047198e-06, "loss": 0.726, "step": 15940 }, { "epoch": 4.734342534876818, "grad_norm": 1.1031821966171265, "learning_rate": 2.668447610566934e-06, "loss": 0.752, "step": 15950 }, { "epoch": 4.737310774710597, "grad_norm": 1.0434112548828125, "learning_rate": 2.6387652122291483e-06, "loss": 0.7227, "step": 15960 }, { "epoch": 4.740279014544376, "grad_norm": 1.106284737586975, "learning_rate": 2.6090828138913624e-06, "loss": 0.7613, "step": 15970 }, { "epoch": 4.743247254378153, "grad_norm": 1.0312436819076538, "learning_rate": 2.579400415553577e-06, "loss": 0.7511, "step": 15980 }, { "epoch": 4.746215494211932, "grad_norm": 1.014859914779663, "learning_rate": 2.5497180172157913e-06, "loss": 0.8043, "step": 15990 }, { "epoch": 4.749183734045711, "grad_norm": 0.9922273755073547, "learning_rate": 2.5200356188780054e-06, "loss": 0.7598, "step": 16000 }, { "epoch": 4.7521519738794895, "grad_norm": 0.9891121983528137, "learning_rate": 2.49035322054022e-06, "loss": 0.762, "step": 16010 }, { "epoch": 4.755120213713268, "grad_norm": 1.12299382686615, "learning_rate": 2.460670822202434e-06, "loss": 0.8067, "step": 16020 }, { "epoch": 4.758088453547047, "grad_norm": 1.0930094718933105, "learning_rate": 2.4309884238646484e-06, "loss": 0.7606, "step": 16030 }, { "epoch": 4.761056693380825, "grad_norm": 1.0453695058822632, "learning_rate": 2.4013060255268625e-06, "loss": 0.7362, "step": 16040 }, { "epoch": 4.764024933214603, "grad_norm": 1.031219720840454, "learning_rate": 2.371623627189077e-06, "loss": 0.7693, "step": 16050 }, { "epoch": 4.766993173048382, "grad_norm": 1.1497999429702759, "learning_rate": 2.3419412288512914e-06, "loss": 0.7343, "step": 16060 }, { "epoch": 4.769961412882161, "grad_norm": 0.9976547360420227, "learning_rate": 2.3122588305135055e-06, "loss": 0.7893, "step": 16070 }, { "epoch": 4.7729296527159395, "grad_norm": 0.9629161357879639, "learning_rate": 2.28257643217572e-06, "loss": 0.7583, "step": 16080 }, { "epoch": 4.775897892549718, "grad_norm": 1.0763802528381348, "learning_rate": 2.2528940338379345e-06, "loss": 0.7597, "step": 16090 }, { "epoch": 4.778866132383497, "grad_norm": 1.108970046043396, "learning_rate": 2.2232116355001485e-06, "loss": 0.7516, "step": 16100 }, { "epoch": 4.781834372217276, "grad_norm": 0.9933566451072693, "learning_rate": 2.1935292371623626e-06, "loss": 0.7305, "step": 16110 }, { "epoch": 4.784802612051053, "grad_norm": 1.0121699571609497, "learning_rate": 2.163846838824577e-06, "loss": 0.7883, "step": 16120 }, { "epoch": 4.787770851884832, "grad_norm": 1.1284277439117432, "learning_rate": 2.1341644404867915e-06, "loss": 0.7378, "step": 16130 }, { "epoch": 4.790739091718611, "grad_norm": 1.1266158819198608, "learning_rate": 2.1044820421490056e-06, "loss": 0.7737, "step": 16140 }, { "epoch": 4.7937073315523895, "grad_norm": 0.989107072353363, "learning_rate": 2.07479964381122e-06, "loss": 0.7542, "step": 16150 }, { "epoch": 4.796675571386168, "grad_norm": 1.0217342376708984, "learning_rate": 2.0451172454734346e-06, "loss": 0.7697, "step": 16160 }, { "epoch": 4.799643811219947, "grad_norm": 0.9917885661125183, "learning_rate": 2.0154348471356486e-06, "loss": 0.7467, "step": 16170 }, { "epoch": 4.802612051053725, "grad_norm": 0.9344885349273682, "learning_rate": 1.9857524487978627e-06, "loss": 0.7926, "step": 16180 }, { "epoch": 4.805580290887503, "grad_norm": 1.0406873226165771, "learning_rate": 1.956070050460077e-06, "loss": 0.7564, "step": 16190 }, { "epoch": 4.808548530721282, "grad_norm": 0.9802284836769104, "learning_rate": 1.9263876521222916e-06, "loss": 0.7383, "step": 16200 }, { "epoch": 4.811516770555061, "grad_norm": 1.1220468282699585, "learning_rate": 1.896705253784506e-06, "loss": 0.7338, "step": 16210 }, { "epoch": 4.8144850103888395, "grad_norm": 1.0869120359420776, "learning_rate": 1.8670228554467202e-06, "loss": 0.7635, "step": 16220 }, { "epoch": 4.817453250222618, "grad_norm": 0.9541686177253723, "learning_rate": 1.8373404571089347e-06, "loss": 0.7626, "step": 16230 }, { "epoch": 4.820421490056397, "grad_norm": 1.0052292346954346, "learning_rate": 1.8076580587711487e-06, "loss": 0.7526, "step": 16240 }, { "epoch": 4.823389729890176, "grad_norm": 1.0642937421798706, "learning_rate": 1.777975660433363e-06, "loss": 0.731, "step": 16250 }, { "epoch": 4.826357969723953, "grad_norm": 1.0232363939285278, "learning_rate": 1.7482932620955773e-06, "loss": 0.8001, "step": 16260 }, { "epoch": 4.829326209557732, "grad_norm": 0.9778085947036743, "learning_rate": 1.7186108637577917e-06, "loss": 0.7323, "step": 16270 }, { "epoch": 4.832294449391511, "grad_norm": 1.0876951217651367, "learning_rate": 1.688928465420006e-06, "loss": 0.7394, "step": 16280 }, { "epoch": 4.8352626892252895, "grad_norm": 1.0854240655899048, "learning_rate": 1.6592460670822205e-06, "loss": 0.7493, "step": 16290 }, { "epoch": 4.838230929059068, "grad_norm": 1.0767631530761719, "learning_rate": 1.6295636687444348e-06, "loss": 0.7355, "step": 16300 }, { "epoch": 4.841199168892847, "grad_norm": 0.9216200113296509, "learning_rate": 1.599881270406649e-06, "loss": 0.7816, "step": 16310 }, { "epoch": 4.844167408726625, "grad_norm": 0.9959633350372314, "learning_rate": 1.570198872068863e-06, "loss": 0.7825, "step": 16320 }, { "epoch": 4.847135648560403, "grad_norm": 1.0585358142852783, "learning_rate": 1.5405164737310774e-06, "loss": 0.7299, "step": 16330 }, { "epoch": 4.850103888394182, "grad_norm": 0.9767472743988037, "learning_rate": 1.5108340753932918e-06, "loss": 0.7335, "step": 16340 }, { "epoch": 4.853072128227961, "grad_norm": 0.9296145439147949, "learning_rate": 1.4811516770555061e-06, "loss": 0.7488, "step": 16350 }, { "epoch": 4.856040368061739, "grad_norm": 0.9834486842155457, "learning_rate": 1.4514692787177206e-06, "loss": 0.7393, "step": 16360 }, { "epoch": 4.859008607895518, "grad_norm": 1.0462241172790527, "learning_rate": 1.4217868803799347e-06, "loss": 0.7789, "step": 16370 }, { "epoch": 4.861976847729297, "grad_norm": 1.0443044900894165, "learning_rate": 1.3921044820421491e-06, "loss": 0.7459, "step": 16380 }, { "epoch": 4.8649450875630755, "grad_norm": 1.0716089010238647, "learning_rate": 1.3624220837043634e-06, "loss": 0.7614, "step": 16390 }, { "epoch": 4.867913327396853, "grad_norm": 0.9359921216964722, "learning_rate": 1.3327396853665777e-06, "loss": 0.751, "step": 16400 }, { "epoch": 4.870881567230632, "grad_norm": 0.9718852043151855, "learning_rate": 1.303057287028792e-06, "loss": 0.7661, "step": 16410 }, { "epoch": 4.873849807064411, "grad_norm": 1.0819077491760254, "learning_rate": 1.2733748886910062e-06, "loss": 0.7634, "step": 16420 }, { "epoch": 4.876818046898189, "grad_norm": 1.0779821872711182, "learning_rate": 1.2436924903532207e-06, "loss": 0.7171, "step": 16430 }, { "epoch": 4.879786286731968, "grad_norm": 1.0170753002166748, "learning_rate": 1.2140100920154348e-06, "loss": 0.7971, "step": 16440 }, { "epoch": 4.882754526565747, "grad_norm": 0.9914931058883667, "learning_rate": 1.1843276936776492e-06, "loss": 0.7852, "step": 16450 }, { "epoch": 4.885722766399525, "grad_norm": 1.0433253049850464, "learning_rate": 1.1546452953398635e-06, "loss": 0.7115, "step": 16460 }, { "epoch": 4.888691006233303, "grad_norm": 1.0355966091156006, "learning_rate": 1.124962897002078e-06, "loss": 0.7221, "step": 16470 }, { "epoch": 4.891659246067082, "grad_norm": 1.1262221336364746, "learning_rate": 1.095280498664292e-06, "loss": 0.7751, "step": 16480 }, { "epoch": 4.894627485900861, "grad_norm": 1.0155718326568604, "learning_rate": 1.0655981003265063e-06, "loss": 0.7433, "step": 16490 }, { "epoch": 4.897595725734639, "grad_norm": 1.0135166645050049, "learning_rate": 1.0359157019887208e-06, "loss": 0.7146, "step": 16500 }, { "epoch": 4.900563965568418, "grad_norm": 1.0288039445877075, "learning_rate": 1.006233303650935e-06, "loss": 0.7492, "step": 16510 }, { "epoch": 4.903532205402197, "grad_norm": 1.0542793273925781, "learning_rate": 9.765509053131493e-07, "loss": 0.7519, "step": 16520 }, { "epoch": 4.9065004452359755, "grad_norm": 1.0649762153625488, "learning_rate": 9.468685069753636e-07, "loss": 0.736, "step": 16530 }, { "epoch": 4.909468685069753, "grad_norm": 0.9523289203643799, "learning_rate": 9.17186108637578e-07, "loss": 0.7419, "step": 16540 }, { "epoch": 4.912436924903532, "grad_norm": 1.090806007385254, "learning_rate": 8.875037102997922e-07, "loss": 0.759, "step": 16550 }, { "epoch": 4.915405164737311, "grad_norm": 1.0011529922485352, "learning_rate": 8.578213119620065e-07, "loss": 0.7418, "step": 16560 }, { "epoch": 4.918373404571089, "grad_norm": 1.0001546144485474, "learning_rate": 8.281389136242209e-07, "loss": 0.756, "step": 16570 }, { "epoch": 4.921341644404868, "grad_norm": 0.924248218536377, "learning_rate": 7.984565152864353e-07, "loss": 0.737, "step": 16580 }, { "epoch": 4.924309884238647, "grad_norm": 0.961621105670929, "learning_rate": 7.687741169486496e-07, "loss": 0.7688, "step": 16590 }, { "epoch": 4.927278124072425, "grad_norm": 0.9049778580665588, "learning_rate": 7.390917186108638e-07, "loss": 0.7132, "step": 16600 }, { "epoch": 4.930246363906203, "grad_norm": 1.124666690826416, "learning_rate": 7.094093202730781e-07, "loss": 0.7389, "step": 16610 }, { "epoch": 4.933214603739982, "grad_norm": 1.035509467124939, "learning_rate": 6.797269219352924e-07, "loss": 0.7779, "step": 16620 }, { "epoch": 4.936182843573761, "grad_norm": 1.0520305633544922, "learning_rate": 6.500445235975066e-07, "loss": 0.7665, "step": 16630 }, { "epoch": 4.939151083407539, "grad_norm": 0.9276071786880493, "learning_rate": 6.20362125259721e-07, "loss": 0.7571, "step": 16640 }, { "epoch": 4.942119323241318, "grad_norm": 1.0482933521270752, "learning_rate": 5.906797269219353e-07, "loss": 0.7781, "step": 16650 }, { "epoch": 4.945087563075097, "grad_norm": 0.9943667650222778, "learning_rate": 5.609973285841497e-07, "loss": 0.7629, "step": 16660 }, { "epoch": 4.9480558029088755, "grad_norm": 1.0370135307312012, "learning_rate": 5.313149302463639e-07, "loss": 0.7767, "step": 16670 }, { "epoch": 4.951024042742653, "grad_norm": 0.9482097625732422, "learning_rate": 5.016325319085783e-07, "loss": 0.7694, "step": 16680 }, { "epoch": 4.953992282576432, "grad_norm": 0.9172430634498596, "learning_rate": 4.719501335707925e-07, "loss": 0.7461, "step": 16690 }, { "epoch": 4.956960522410211, "grad_norm": 1.109588623046875, "learning_rate": 4.422677352330069e-07, "loss": 0.7463, "step": 16700 }, { "epoch": 4.959928762243989, "grad_norm": 0.920391857624054, "learning_rate": 4.1258533689522117e-07, "loss": 0.7494, "step": 16710 }, { "epoch": 4.962897002077768, "grad_norm": 1.0499415397644043, "learning_rate": 3.8290293855743544e-07, "loss": 0.7568, "step": 16720 }, { "epoch": 4.965865241911547, "grad_norm": 0.9439771771430969, "learning_rate": 3.5322054021964976e-07, "loss": 0.7348, "step": 16730 }, { "epoch": 4.968833481745325, "grad_norm": 1.0074434280395508, "learning_rate": 3.235381418818641e-07, "loss": 0.7173, "step": 16740 }, { "epoch": 4.971801721579103, "grad_norm": 0.9283884763717651, "learning_rate": 2.938557435440784e-07, "loss": 0.7524, "step": 16750 }, { "epoch": 4.974769961412882, "grad_norm": 1.03079354763031, "learning_rate": 2.641733452062927e-07, "loss": 0.7605, "step": 16760 }, { "epoch": 4.977738201246661, "grad_norm": 1.0735660791397095, "learning_rate": 2.3449094686850697e-07, "loss": 0.7535, "step": 16770 }, { "epoch": 4.980706441080439, "grad_norm": 0.9746400117874146, "learning_rate": 2.0480854853072127e-07, "loss": 0.7934, "step": 16780 }, { "epoch": 4.983674680914218, "grad_norm": 0.9652214646339417, "learning_rate": 1.751261501929356e-07, "loss": 0.7396, "step": 16790 }, { "epoch": 4.986642920747997, "grad_norm": 0.9589136242866516, "learning_rate": 1.454437518551499e-07, "loss": 0.7564, "step": 16800 }, { "epoch": 4.9896111605817755, "grad_norm": 1.137319803237915, "learning_rate": 1.157613535173642e-07, "loss": 0.742, "step": 16810 }, { "epoch": 4.992579400415553, "grad_norm": 1.0568934679031372, "learning_rate": 8.607895517957851e-08, "loss": 0.7621, "step": 16820 }, { "epoch": 4.995547640249332, "grad_norm": 0.9926442503929138, "learning_rate": 5.639655684179283e-08, "loss": 0.7375, "step": 16830 }, { "epoch": 4.998515880083111, "grad_norm": 0.9872409105300903, "learning_rate": 2.6714158504007124e-08, "loss": 0.74, "step": 16840 } ], "logging_steps": 10, "max_steps": 16845, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.960972002956411e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }