diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,26486 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.9999404631963325, + "eval_steps": 500, + "global_step": 37791, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0007938240488995614, + "grad_norm": 0.44355273246765137, + "learning_rate": 4.998676933661454e-05, + "loss": 1.5273, + "step": 10 + }, + { + "epoch": 0.0015876480977991228, + "grad_norm": 0.27582135796546936, + "learning_rate": 4.9973538673229073e-05, + "loss": 1.3011, + "step": 20 + }, + { + "epoch": 0.0023814721466986844, + "grad_norm": 0.3760926425457001, + "learning_rate": 4.996030800984361e-05, + "loss": 1.2377, + "step": 30 + }, + { + "epoch": 0.0031752961955982457, + "grad_norm": 0.2921426594257355, + "learning_rate": 4.994707734645815e-05, + "loss": 1.2327, + "step": 40 + }, + { + "epoch": 0.003969120244497807, + "grad_norm": 0.5147169828414917, + "learning_rate": 4.99338466830727e-05, + "loss": 1.1691, + "step": 50 + }, + { + "epoch": 0.004762944293397369, + "grad_norm": 0.38984984159469604, + "learning_rate": 4.992061601968723e-05, + "loss": 1.1292, + "step": 60 + }, + { + "epoch": 0.00555676834229693, + "grad_norm": 0.4045616090297699, + "learning_rate": 4.990738535630177e-05, + "loss": 1.0903, + "step": 70 + }, + { + "epoch": 0.006350592391196491, + "grad_norm": 0.4870094656944275, + "learning_rate": 4.989415469291631e-05, + "loss": 1.1421, + "step": 80 + }, + { + "epoch": 0.007144416440096053, + "grad_norm": 0.6016878485679626, + "learning_rate": 4.9880924029530846e-05, + "loss": 1.1459, + "step": 90 + }, + { + "epoch": 0.007938240488995614, + "grad_norm": 0.5868497490882874, + "learning_rate": 4.986769336614538e-05, + "loss": 1.0909, + "step": 100 + }, + { + "epoch": 0.008732064537895176, + "grad_norm": 0.6827491521835327, + "learning_rate": 4.985446270275992e-05, + "loss": 1.065, + "step": 110 + }, + { + "epoch": 0.009525888586794738, + "grad_norm": 0.7056289315223694, + "learning_rate": 4.9841232039374456e-05, + "loss": 1.0764, + "step": 120 + }, + { + "epoch": 0.010319712635694298, + "grad_norm": 0.5899627208709717, + "learning_rate": 4.9828001375988995e-05, + "loss": 1.0369, + "step": 130 + }, + { + "epoch": 0.01111353668459386, + "grad_norm": 0.6351290345191956, + "learning_rate": 4.9814770712603534e-05, + "loss": 1.0332, + "step": 140 + }, + { + "epoch": 0.011907360733493421, + "grad_norm": 0.7403615117073059, + "learning_rate": 4.980154004921807e-05, + "loss": 1.0837, + "step": 150 + }, + { + "epoch": 0.012701184782392983, + "grad_norm": 0.7318956255912781, + "learning_rate": 4.978830938583261e-05, + "loss": 1.1196, + "step": 160 + }, + { + "epoch": 0.013495008831292545, + "grad_norm": 0.8113905787467957, + "learning_rate": 4.9775078722447144e-05, + "loss": 1.0629, + "step": 170 + }, + { + "epoch": 0.014288832880192106, + "grad_norm": 0.7525221109390259, + "learning_rate": 4.976184805906168e-05, + "loss": 1.0937, + "step": 180 + }, + { + "epoch": 0.015082656929091666, + "grad_norm": 0.7905648350715637, + "learning_rate": 4.974861739567622e-05, + "loss": 1.0263, + "step": 190 + }, + { + "epoch": 0.015876480977991228, + "grad_norm": 0.47737765312194824, + "learning_rate": 4.973538673229076e-05, + "loss": 1.1193, + "step": 200 + }, + { + "epoch": 0.016670305026890788, + "grad_norm": 0.6782676577568054, + "learning_rate": 4.972215606890529e-05, + "loss": 1.0208, + "step": 210 + }, + { + "epoch": 0.01746412907579035, + "grad_norm": 0.7456789016723633, + "learning_rate": 4.970892540551983e-05, + "loss": 1.039, + "step": 220 + }, + { + "epoch": 0.01825795312468991, + "grad_norm": 0.8023837804794312, + "learning_rate": 4.969569474213438e-05, + "loss": 1.0291, + "step": 230 + }, + { + "epoch": 0.019051777173589475, + "grad_norm": 0.6566641926765442, + "learning_rate": 4.968246407874891e-05, + "loss": 1.071, + "step": 240 + }, + { + "epoch": 0.019845601222489035, + "grad_norm": 0.8695980906486511, + "learning_rate": 4.966923341536345e-05, + "loss": 0.9391, + "step": 250 + }, + { + "epoch": 0.020639425271388595, + "grad_norm": 0.7018496990203857, + "learning_rate": 4.965600275197799e-05, + "loss": 1.0663, + "step": 260 + }, + { + "epoch": 0.02143324932028816, + "grad_norm": 0.7785579562187195, + "learning_rate": 4.9642772088592526e-05, + "loss": 1.0047, + "step": 270 + }, + { + "epoch": 0.02222707336918772, + "grad_norm": 0.6475448608398438, + "learning_rate": 4.962954142520706e-05, + "loss": 1.0443, + "step": 280 + }, + { + "epoch": 0.023020897418087282, + "grad_norm": 0.8628908395767212, + "learning_rate": 4.96163107618216e-05, + "loss": 0.9987, + "step": 290 + }, + { + "epoch": 0.023814721466986842, + "grad_norm": 0.8528658747673035, + "learning_rate": 4.9603080098436136e-05, + "loss": 1.0199, + "step": 300 + }, + { + "epoch": 0.024608545515886402, + "grad_norm": 0.8548170328140259, + "learning_rate": 4.9589849435050675e-05, + "loss": 1.0916, + "step": 310 + }, + { + "epoch": 0.025402369564785966, + "grad_norm": 0.8923755884170532, + "learning_rate": 4.9576618771665214e-05, + "loss": 0.996, + "step": 320 + }, + { + "epoch": 0.026196193613685526, + "grad_norm": 0.8691526651382446, + "learning_rate": 4.956338810827975e-05, + "loss": 1.0162, + "step": 330 + }, + { + "epoch": 0.02699001766258509, + "grad_norm": 0.8156205415725708, + "learning_rate": 4.955015744489429e-05, + "loss": 1.0359, + "step": 340 + }, + { + "epoch": 0.02778384171148465, + "grad_norm": 0.8026768565177917, + "learning_rate": 4.9536926781508824e-05, + "loss": 1.0605, + "step": 350 + }, + { + "epoch": 0.028577665760384213, + "grad_norm": 0.9509242177009583, + "learning_rate": 4.952369611812336e-05, + "loss": 1.004, + "step": 360 + }, + { + "epoch": 0.029371489809283773, + "grad_norm": 0.8537798523902893, + "learning_rate": 4.95104654547379e-05, + "loss": 0.9256, + "step": 370 + }, + { + "epoch": 0.030165313858183333, + "grad_norm": 0.7341266870498657, + "learning_rate": 4.949723479135244e-05, + "loss": 0.95, + "step": 380 + }, + { + "epoch": 0.030959137907082896, + "grad_norm": 0.9486088752746582, + "learning_rate": 4.948400412796697e-05, + "loss": 0.9779, + "step": 390 + }, + { + "epoch": 0.031752961955982456, + "grad_norm": 0.5967279672622681, + "learning_rate": 4.947077346458152e-05, + "loss": 1.0431, + "step": 400 + }, + { + "epoch": 0.03254678600488202, + "grad_norm": 0.6559339165687561, + "learning_rate": 4.945754280119606e-05, + "loss": 1.0192, + "step": 410 + }, + { + "epoch": 0.033340610053781576, + "grad_norm": 0.8226445317268372, + "learning_rate": 4.94443121378106e-05, + "loss": 1.0024, + "step": 420 + }, + { + "epoch": 0.03413443410268114, + "grad_norm": 1.0217857360839844, + "learning_rate": 4.943108147442513e-05, + "loss": 1.0036, + "step": 430 + }, + { + "epoch": 0.0349282581515807, + "grad_norm": 0.9626277089118958, + "learning_rate": 4.941785081103967e-05, + "loss": 0.9955, + "step": 440 + }, + { + "epoch": 0.03572208220048027, + "grad_norm": 0.6774649620056152, + "learning_rate": 4.940462014765421e-05, + "loss": 0.9775, + "step": 450 + }, + { + "epoch": 0.03651590624937982, + "grad_norm": 0.8078123927116394, + "learning_rate": 4.9391389484268746e-05, + "loss": 0.9618, + "step": 460 + }, + { + "epoch": 0.03730973029827939, + "grad_norm": 1.0022040605545044, + "learning_rate": 4.937815882088328e-05, + "loss": 0.9787, + "step": 470 + }, + { + "epoch": 0.03810355434717895, + "grad_norm": 0.7025789618492126, + "learning_rate": 4.936492815749782e-05, + "loss": 0.9842, + "step": 480 + }, + { + "epoch": 0.03889737839607851, + "grad_norm": 0.6538853049278259, + "learning_rate": 4.9351697494112356e-05, + "loss": 0.988, + "step": 490 + }, + { + "epoch": 0.03969120244497807, + "grad_norm": 0.6774317026138306, + "learning_rate": 4.9338466830726895e-05, + "loss": 0.939, + "step": 500 + }, + { + "epoch": 0.040485026493877634, + "grad_norm": 0.9341694116592407, + "learning_rate": 4.9325236167341433e-05, + "loss": 1.0455, + "step": 510 + }, + { + "epoch": 0.04127885054277719, + "grad_norm": 0.8977088332176208, + "learning_rate": 4.931200550395597e-05, + "loss": 1.0065, + "step": 520 + }, + { + "epoch": 0.042072674591676754, + "grad_norm": 0.6621760725975037, + "learning_rate": 4.929877484057051e-05, + "loss": 1.0243, + "step": 530 + }, + { + "epoch": 0.04286649864057632, + "grad_norm": 0.8938838243484497, + "learning_rate": 4.9285544177185043e-05, + "loss": 1.0233, + "step": 540 + }, + { + "epoch": 0.04366032268947588, + "grad_norm": 0.7396982908248901, + "learning_rate": 4.927231351379958e-05, + "loss": 0.9628, + "step": 550 + }, + { + "epoch": 0.04445414673837544, + "grad_norm": 0.8066884279251099, + "learning_rate": 4.925908285041412e-05, + "loss": 1.0092, + "step": 560 + }, + { + "epoch": 0.045247970787275, + "grad_norm": 0.8251396417617798, + "learning_rate": 4.924585218702866e-05, + "loss": 1.0094, + "step": 570 + }, + { + "epoch": 0.046041794836174564, + "grad_norm": 0.8999793529510498, + "learning_rate": 4.92326215236432e-05, + "loss": 0.9971, + "step": 580 + }, + { + "epoch": 0.04683561888507412, + "grad_norm": 0.7802185416221619, + "learning_rate": 4.921939086025774e-05, + "loss": 0.9441, + "step": 590 + }, + { + "epoch": 0.047629442933973684, + "grad_norm": 0.9204962849617004, + "learning_rate": 4.920616019687228e-05, + "loss": 1.0266, + "step": 600 + }, + { + "epoch": 0.04842326698287325, + "grad_norm": 0.8698873519897461, + "learning_rate": 4.919292953348681e-05, + "loss": 0.9666, + "step": 610 + }, + { + "epoch": 0.049217091031772804, + "grad_norm": 0.7144138813018799, + "learning_rate": 4.917969887010135e-05, + "loss": 1.0201, + "step": 620 + }, + { + "epoch": 0.05001091508067237, + "grad_norm": 0.9176733493804932, + "learning_rate": 4.916646820671589e-05, + "loss": 0.9937, + "step": 630 + }, + { + "epoch": 0.05080473912957193, + "grad_norm": 0.8604631423950195, + "learning_rate": 4.9153237543330426e-05, + "loss": 0.9882, + "step": 640 + }, + { + "epoch": 0.051598563178471495, + "grad_norm": 1.0573097467422485, + "learning_rate": 4.914000687994496e-05, + "loss": 0.9997, + "step": 650 + }, + { + "epoch": 0.05239238722737105, + "grad_norm": 0.9698209166526794, + "learning_rate": 4.91267762165595e-05, + "loss": 0.9492, + "step": 660 + }, + { + "epoch": 0.053186211276270615, + "grad_norm": 0.8090052604675293, + "learning_rate": 4.911354555317404e-05, + "loss": 0.9408, + "step": 670 + }, + { + "epoch": 0.05398003532517018, + "grad_norm": 0.9131582975387573, + "learning_rate": 4.910031488978858e-05, + "loss": 0.92, + "step": 680 + }, + { + "epoch": 0.054773859374069735, + "grad_norm": 0.8041892051696777, + "learning_rate": 4.9087084226403114e-05, + "loss": 1.021, + "step": 690 + }, + { + "epoch": 0.0555676834229693, + "grad_norm": 1.0868831872940063, + "learning_rate": 4.907385356301765e-05, + "loss": 1.0078, + "step": 700 + }, + { + "epoch": 0.05636150747186886, + "grad_norm": 0.7588531374931335, + "learning_rate": 4.906062289963219e-05, + "loss": 0.9926, + "step": 710 + }, + { + "epoch": 0.057155331520768425, + "grad_norm": 0.7499911189079285, + "learning_rate": 4.9047392236246724e-05, + "loss": 1.0117, + "step": 720 + }, + { + "epoch": 0.05794915556966798, + "grad_norm": 0.8259711265563965, + "learning_rate": 4.903416157286126e-05, + "loss": 0.9394, + "step": 730 + }, + { + "epoch": 0.058742979618567545, + "grad_norm": 0.7790316939353943, + "learning_rate": 4.90209309094758e-05, + "loss": 0.9598, + "step": 740 + }, + { + "epoch": 0.05953680366746711, + "grad_norm": 0.8743465542793274, + "learning_rate": 4.900770024609034e-05, + "loss": 0.9337, + "step": 750 + }, + { + "epoch": 0.060330627716366665, + "grad_norm": 0.6910851001739502, + "learning_rate": 4.899446958270488e-05, + "loss": 1.0239, + "step": 760 + }, + { + "epoch": 0.06112445176526623, + "grad_norm": 0.8479220271110535, + "learning_rate": 4.898123891931942e-05, + "loss": 0.94, + "step": 770 + }, + { + "epoch": 0.06191827581416579, + "grad_norm": 0.9144072532653809, + "learning_rate": 4.896800825593396e-05, + "loss": 0.9813, + "step": 780 + }, + { + "epoch": 0.06271209986306535, + "grad_norm": 0.7847743034362793, + "learning_rate": 4.8954777592548496e-05, + "loss": 1.0148, + "step": 790 + }, + { + "epoch": 0.06350592391196491, + "grad_norm": 0.6444619297981262, + "learning_rate": 4.894154692916303e-05, + "loss": 1.0174, + "step": 800 + }, + { + "epoch": 0.06429974796086448, + "grad_norm": 0.8129200339317322, + "learning_rate": 4.892831626577757e-05, + "loss": 0.9764, + "step": 810 + }, + { + "epoch": 0.06509357200976404, + "grad_norm": 0.8406952619552612, + "learning_rate": 4.8915085602392106e-05, + "loss": 0.9332, + "step": 820 + }, + { + "epoch": 0.0658873960586636, + "grad_norm": 0.778441846370697, + "learning_rate": 4.890185493900664e-05, + "loss": 0.9439, + "step": 830 + }, + { + "epoch": 0.06668122010756315, + "grad_norm": 0.9417208433151245, + "learning_rate": 4.8888624275621184e-05, + "loss": 0.9337, + "step": 840 + }, + { + "epoch": 0.06747504415646272, + "grad_norm": 0.8156752586364746, + "learning_rate": 4.887539361223572e-05, + "loss": 0.9909, + "step": 850 + }, + { + "epoch": 0.06826886820536228, + "grad_norm": 0.8329022526741028, + "learning_rate": 4.886216294885026e-05, + "loss": 0.9374, + "step": 860 + }, + { + "epoch": 0.06906269225426184, + "grad_norm": 0.816589891910553, + "learning_rate": 4.8848932285464794e-05, + "loss": 1.0349, + "step": 870 + }, + { + "epoch": 0.0698565163031614, + "grad_norm": 0.7630716562271118, + "learning_rate": 4.883570162207933e-05, + "loss": 0.9716, + "step": 880 + }, + { + "epoch": 0.07065034035206097, + "grad_norm": 0.978765606880188, + "learning_rate": 4.882247095869387e-05, + "loss": 1.0316, + "step": 890 + }, + { + "epoch": 0.07144416440096053, + "grad_norm": 0.7739888429641724, + "learning_rate": 4.880924029530841e-05, + "loss": 0.9596, + "step": 900 + }, + { + "epoch": 0.07223798844986008, + "grad_norm": 0.9134480357170105, + "learning_rate": 4.879600963192294e-05, + "loss": 0.9758, + "step": 910 + }, + { + "epoch": 0.07303181249875965, + "grad_norm": 0.8623645305633545, + "learning_rate": 4.878277896853748e-05, + "loss": 1.001, + "step": 920 + }, + { + "epoch": 0.07382563654765921, + "grad_norm": 0.8770903944969177, + "learning_rate": 4.876954830515202e-05, + "loss": 0.9698, + "step": 930 + }, + { + "epoch": 0.07461946059655877, + "grad_norm": 0.6688238382339478, + "learning_rate": 4.875631764176656e-05, + "loss": 1.0244, + "step": 940 + }, + { + "epoch": 0.07541328464545834, + "grad_norm": 0.7384278178215027, + "learning_rate": 4.87430869783811e-05, + "loss": 0.9876, + "step": 950 + }, + { + "epoch": 0.0762071086943579, + "grad_norm": 0.8874367475509644, + "learning_rate": 4.872985631499564e-05, + "loss": 0.9847, + "step": 960 + }, + { + "epoch": 0.07700093274325746, + "grad_norm": 0.6085987687110901, + "learning_rate": 4.871662565161018e-05, + "loss": 0.9255, + "step": 970 + }, + { + "epoch": 0.07779475679215701, + "grad_norm": 0.7393110394477844, + "learning_rate": 4.870339498822471e-05, + "loss": 0.9081, + "step": 980 + }, + { + "epoch": 0.07858858084105658, + "grad_norm": 0.9497765302658081, + "learning_rate": 4.869016432483925e-05, + "loss": 0.9948, + "step": 990 + }, + { + "epoch": 0.07938240488995614, + "grad_norm": 0.7365654110908508, + "learning_rate": 4.867693366145379e-05, + "loss": 1.0194, + "step": 1000 + }, + { + "epoch": 0.0801762289388557, + "grad_norm": 0.9192026257514954, + "learning_rate": 4.8663702998068326e-05, + "loss": 0.9777, + "step": 1010 + }, + { + "epoch": 0.08097005298775527, + "grad_norm": 0.9804342985153198, + "learning_rate": 4.8650472334682865e-05, + "loss": 0.9756, + "step": 1020 + }, + { + "epoch": 0.08176387703665483, + "grad_norm": 0.976823091506958, + "learning_rate": 4.8637241671297403e-05, + "loss": 0.9507, + "step": 1030 + }, + { + "epoch": 0.08255770108555438, + "grad_norm": 0.6893647909164429, + "learning_rate": 4.862401100791194e-05, + "loss": 1.003, + "step": 1040 + }, + { + "epoch": 0.08335152513445394, + "grad_norm": 0.8077487945556641, + "learning_rate": 4.8610780344526475e-05, + "loss": 0.944, + "step": 1050 + }, + { + "epoch": 0.08414534918335351, + "grad_norm": 0.6170672178268433, + "learning_rate": 4.8597549681141013e-05, + "loss": 0.9685, + "step": 1060 + }, + { + "epoch": 0.08493917323225307, + "grad_norm": 0.7847235798835754, + "learning_rate": 4.858431901775555e-05, + "loss": 0.9448, + "step": 1070 + }, + { + "epoch": 0.08573299728115263, + "grad_norm": 0.7903896570205688, + "learning_rate": 4.857108835437009e-05, + "loss": 1.0231, + "step": 1080 + }, + { + "epoch": 0.0865268213300522, + "grad_norm": 0.8650990724563599, + "learning_rate": 4.8557857690984623e-05, + "loss": 0.9676, + "step": 1090 + }, + { + "epoch": 0.08732064537895176, + "grad_norm": 0.9188217520713806, + "learning_rate": 4.854462702759916e-05, + "loss": 0.9851, + "step": 1100 + }, + { + "epoch": 0.08811446942785131, + "grad_norm": 0.9663834571838379, + "learning_rate": 4.853139636421371e-05, + "loss": 0.9442, + "step": 1110 + }, + { + "epoch": 0.08890829347675087, + "grad_norm": 0.8045158386230469, + "learning_rate": 4.851816570082825e-05, + "loss": 0.9411, + "step": 1120 + }, + { + "epoch": 0.08970211752565044, + "grad_norm": 0.9113909006118774, + "learning_rate": 4.850493503744278e-05, + "loss": 0.8866, + "step": 1130 + }, + { + "epoch": 0.09049594157455, + "grad_norm": 0.8947747349739075, + "learning_rate": 4.849170437405732e-05, + "loss": 0.9206, + "step": 1140 + }, + { + "epoch": 0.09128976562344956, + "grad_norm": 0.7741749882698059, + "learning_rate": 4.847847371067186e-05, + "loss": 1.0066, + "step": 1150 + }, + { + "epoch": 0.09208358967234913, + "grad_norm": 0.9636614918708801, + "learning_rate": 4.846524304728639e-05, + "loss": 0.9518, + "step": 1160 + }, + { + "epoch": 0.09287741372124869, + "grad_norm": 0.7008812427520752, + "learning_rate": 4.845201238390093e-05, + "loss": 0.9543, + "step": 1170 + }, + { + "epoch": 0.09367123777014824, + "grad_norm": 0.9602216482162476, + "learning_rate": 4.843878172051547e-05, + "loss": 0.915, + "step": 1180 + }, + { + "epoch": 0.0944650618190478, + "grad_norm": 0.9824376702308655, + "learning_rate": 4.8425551057130006e-05, + "loss": 0.9073, + "step": 1190 + }, + { + "epoch": 0.09525888586794737, + "grad_norm": 0.7355358600616455, + "learning_rate": 4.8412320393744545e-05, + "loss": 0.9487, + "step": 1200 + }, + { + "epoch": 0.09605270991684693, + "grad_norm": 1.0072027444839478, + "learning_rate": 4.8399089730359084e-05, + "loss": 0.9659, + "step": 1210 + }, + { + "epoch": 0.0968465339657465, + "grad_norm": 0.78026282787323, + "learning_rate": 4.838585906697362e-05, + "loss": 0.9716, + "step": 1220 + }, + { + "epoch": 0.09764035801464606, + "grad_norm": 0.8772688508033752, + "learning_rate": 4.837262840358816e-05, + "loss": 0.9621, + "step": 1230 + }, + { + "epoch": 0.09843418206354561, + "grad_norm": 0.8083699345588684, + "learning_rate": 4.8359397740202694e-05, + "loss": 0.8598, + "step": 1240 + }, + { + "epoch": 0.09922800611244517, + "grad_norm": 0.8948017954826355, + "learning_rate": 4.834616707681723e-05, + "loss": 0.9425, + "step": 1250 + }, + { + "epoch": 0.10002183016134474, + "grad_norm": 0.7335965633392334, + "learning_rate": 4.833293641343177e-05, + "loss": 0.9441, + "step": 1260 + }, + { + "epoch": 0.1008156542102443, + "grad_norm": 0.7533779740333557, + "learning_rate": 4.831970575004631e-05, + "loss": 0.9918, + "step": 1270 + }, + { + "epoch": 0.10160947825914386, + "grad_norm": 0.8201155066490173, + "learning_rate": 4.830647508666085e-05, + "loss": 1.014, + "step": 1280 + }, + { + "epoch": 0.10240330230804343, + "grad_norm": 0.6296638250350952, + "learning_rate": 4.829324442327539e-05, + "loss": 0.962, + "step": 1290 + }, + { + "epoch": 0.10319712635694299, + "grad_norm": 0.8968091011047363, + "learning_rate": 4.828001375988993e-05, + "loss": 0.9825, + "step": 1300 + }, + { + "epoch": 0.10399095040584254, + "grad_norm": 0.9169098138809204, + "learning_rate": 4.826678309650446e-05, + "loss": 0.8705, + "step": 1310 + }, + { + "epoch": 0.1047847744547421, + "grad_norm": 0.7152407169342041, + "learning_rate": 4.8253552433119e-05, + "loss": 0.9902, + "step": 1320 + }, + { + "epoch": 0.10557859850364167, + "grad_norm": 0.9684945940971375, + "learning_rate": 4.824032176973354e-05, + "loss": 1.0179, + "step": 1330 + }, + { + "epoch": 0.10637242255254123, + "grad_norm": 1.0346221923828125, + "learning_rate": 4.8227091106348076e-05, + "loss": 0.9245, + "step": 1340 + }, + { + "epoch": 0.10716624660144079, + "grad_norm": 0.9577662348747253, + "learning_rate": 4.821386044296261e-05, + "loss": 1.0263, + "step": 1350 + }, + { + "epoch": 0.10796007065034036, + "grad_norm": 0.756993293762207, + "learning_rate": 4.820062977957715e-05, + "loss": 0.9375, + "step": 1360 + }, + { + "epoch": 0.10875389469923992, + "grad_norm": 0.6969035267829895, + "learning_rate": 4.8187399116191686e-05, + "loss": 0.9871, + "step": 1370 + }, + { + "epoch": 0.10954771874813947, + "grad_norm": 1.1062333583831787, + "learning_rate": 4.817416845280623e-05, + "loss": 0.9659, + "step": 1380 + }, + { + "epoch": 0.11034154279703903, + "grad_norm": 0.592980146408081, + "learning_rate": 4.8160937789420764e-05, + "loss": 1.0116, + "step": 1390 + }, + { + "epoch": 0.1111353668459386, + "grad_norm": 0.8485913276672363, + "learning_rate": 4.81477071260353e-05, + "loss": 0.963, + "step": 1400 + }, + { + "epoch": 0.11192919089483816, + "grad_norm": 0.8063706159591675, + "learning_rate": 4.813447646264984e-05, + "loss": 0.9862, + "step": 1410 + }, + { + "epoch": 0.11272301494373772, + "grad_norm": 0.705464243888855, + "learning_rate": 4.8121245799264374e-05, + "loss": 0.9472, + "step": 1420 + }, + { + "epoch": 0.11351683899263729, + "grad_norm": 0.8876354098320007, + "learning_rate": 4.810801513587891e-05, + "loss": 0.924, + "step": 1430 + }, + { + "epoch": 0.11431066304153685, + "grad_norm": 1.0832111835479736, + "learning_rate": 4.809478447249345e-05, + "loss": 0.9387, + "step": 1440 + }, + { + "epoch": 0.1151044870904364, + "grad_norm": 0.8490377068519592, + "learning_rate": 4.808155380910799e-05, + "loss": 1.0323, + "step": 1450 + }, + { + "epoch": 0.11589831113933596, + "grad_norm": 0.8763223886489868, + "learning_rate": 4.806832314572253e-05, + "loss": 0.9555, + "step": 1460 + }, + { + "epoch": 0.11669213518823553, + "grad_norm": 0.9210900664329529, + "learning_rate": 4.805509248233707e-05, + "loss": 1.0096, + "step": 1470 + }, + { + "epoch": 0.11748595923713509, + "grad_norm": 0.7176244258880615, + "learning_rate": 4.804186181895161e-05, + "loss": 0.9415, + "step": 1480 + }, + { + "epoch": 0.11827978328603465, + "grad_norm": 0.9028609991073608, + "learning_rate": 4.802863115556615e-05, + "loss": 0.9449, + "step": 1490 + }, + { + "epoch": 0.11907360733493422, + "grad_norm": 0.779123067855835, + "learning_rate": 4.801540049218068e-05, + "loss": 0.899, + "step": 1500 + }, + { + "epoch": 0.11986743138383377, + "grad_norm": 0.7735058665275574, + "learning_rate": 4.800216982879522e-05, + "loss": 0.9138, + "step": 1510 + }, + { + "epoch": 0.12066125543273333, + "grad_norm": 0.851739764213562, + "learning_rate": 4.798893916540976e-05, + "loss": 0.9476, + "step": 1520 + }, + { + "epoch": 0.1214550794816329, + "grad_norm": 0.8166431188583374, + "learning_rate": 4.797570850202429e-05, + "loss": 0.9746, + "step": 1530 + }, + { + "epoch": 0.12224890353053246, + "grad_norm": 0.8283756971359253, + "learning_rate": 4.796247783863883e-05, + "loss": 0.9072, + "step": 1540 + }, + { + "epoch": 0.12304272757943202, + "grad_norm": 0.8357489705085754, + "learning_rate": 4.794924717525337e-05, + "loss": 0.9563, + "step": 1550 + }, + { + "epoch": 0.12383655162833158, + "grad_norm": 0.9437370896339417, + "learning_rate": 4.793601651186791e-05, + "loss": 0.9709, + "step": 1560 + }, + { + "epoch": 0.12463037567723115, + "grad_norm": 0.8020033836364746, + "learning_rate": 4.7922785848482445e-05, + "loss": 0.9925, + "step": 1570 + }, + { + "epoch": 0.1254241997261307, + "grad_norm": 0.7915470600128174, + "learning_rate": 4.7909555185096983e-05, + "loss": 0.9991, + "step": 1580 + }, + { + "epoch": 0.12621802377503027, + "grad_norm": 0.6978237628936768, + "learning_rate": 4.789632452171152e-05, + "loss": 0.9994, + "step": 1590 + }, + { + "epoch": 0.12701184782392982, + "grad_norm": 0.9151979088783264, + "learning_rate": 4.788309385832606e-05, + "loss": 0.9341, + "step": 1600 + }, + { + "epoch": 0.12780567187282937, + "grad_norm": 0.8755201697349548, + "learning_rate": 4.7869863194940593e-05, + "loss": 0.9303, + "step": 1610 + }, + { + "epoch": 0.12859949592172895, + "grad_norm": 0.9662395715713501, + "learning_rate": 4.785663253155513e-05, + "loss": 0.9088, + "step": 1620 + }, + { + "epoch": 0.1293933199706285, + "grad_norm": 0.7612505555152893, + "learning_rate": 4.784340186816967e-05, + "loss": 1.0068, + "step": 1630 + }, + { + "epoch": 0.13018714401952808, + "grad_norm": 0.8853713274002075, + "learning_rate": 4.783017120478421e-05, + "loss": 0.9877, + "step": 1640 + }, + { + "epoch": 0.13098096806842763, + "grad_norm": 0.6972758173942566, + "learning_rate": 4.781694054139875e-05, + "loss": 0.9068, + "step": 1650 + }, + { + "epoch": 0.1317747921173272, + "grad_norm": 0.8050327301025391, + "learning_rate": 4.780370987801329e-05, + "loss": 0.8972, + "step": 1660 + }, + { + "epoch": 0.13256861616622675, + "grad_norm": 0.8144531846046448, + "learning_rate": 4.779047921462783e-05, + "loss": 0.889, + "step": 1670 + }, + { + "epoch": 0.1333624402151263, + "grad_norm": 0.8536010980606079, + "learning_rate": 4.777724855124236e-05, + "loss": 0.9718, + "step": 1680 + }, + { + "epoch": 0.13415626426402588, + "grad_norm": 0.7940757274627686, + "learning_rate": 4.77640178878569e-05, + "loss": 0.9628, + "step": 1690 + }, + { + "epoch": 0.13495008831292543, + "grad_norm": 0.6885228157043457, + "learning_rate": 4.775078722447144e-05, + "loss": 0.9482, + "step": 1700 + }, + { + "epoch": 0.135743912361825, + "grad_norm": 0.7801288366317749, + "learning_rate": 4.7737556561085976e-05, + "loss": 0.9084, + "step": 1710 + }, + { + "epoch": 0.13653773641072456, + "grad_norm": 0.7502837777137756, + "learning_rate": 4.772432589770051e-05, + "loss": 0.869, + "step": 1720 + }, + { + "epoch": 0.13733156045962414, + "grad_norm": 0.9921239614486694, + "learning_rate": 4.7711095234315054e-05, + "loss": 0.9065, + "step": 1730 + }, + { + "epoch": 0.13812538450852369, + "grad_norm": 0.8346391916275024, + "learning_rate": 4.769786457092959e-05, + "loss": 0.9831, + "step": 1740 + }, + { + "epoch": 0.13891920855742323, + "grad_norm": 0.8118425607681274, + "learning_rate": 4.7684633907544125e-05, + "loss": 0.9673, + "step": 1750 + }, + { + "epoch": 0.1397130326063228, + "grad_norm": 0.8836124539375305, + "learning_rate": 4.7671403244158664e-05, + "loss": 0.9506, + "step": 1760 + }, + { + "epoch": 0.14050685665522236, + "grad_norm": 0.8998447060585022, + "learning_rate": 4.76581725807732e-05, + "loss": 0.9443, + "step": 1770 + }, + { + "epoch": 0.14130068070412194, + "grad_norm": 0.9058572053909302, + "learning_rate": 4.764494191738774e-05, + "loss": 0.8735, + "step": 1780 + }, + { + "epoch": 0.1420945047530215, + "grad_norm": 0.9656883478164673, + "learning_rate": 4.7631711254002274e-05, + "loss": 0.9379, + "step": 1790 + }, + { + "epoch": 0.14288832880192107, + "grad_norm": 0.8430660367012024, + "learning_rate": 4.761848059061681e-05, + "loss": 0.9405, + "step": 1800 + }, + { + "epoch": 0.14368215285082062, + "grad_norm": 0.963220477104187, + "learning_rate": 4.760524992723135e-05, + "loss": 0.9306, + "step": 1810 + }, + { + "epoch": 0.14447597689972017, + "grad_norm": 0.806779146194458, + "learning_rate": 4.759201926384589e-05, + "loss": 0.964, + "step": 1820 + }, + { + "epoch": 0.14526980094861974, + "grad_norm": 0.6742879152297974, + "learning_rate": 4.757878860046043e-05, + "loss": 0.9398, + "step": 1830 + }, + { + "epoch": 0.1460636249975193, + "grad_norm": 0.8305578827857971, + "learning_rate": 4.756555793707497e-05, + "loss": 0.9661, + "step": 1840 + }, + { + "epoch": 0.14685744904641887, + "grad_norm": 0.9429900646209717, + "learning_rate": 4.755232727368951e-05, + "loss": 0.9151, + "step": 1850 + }, + { + "epoch": 0.14765127309531842, + "grad_norm": 0.7211086750030518, + "learning_rate": 4.753909661030404e-05, + "loss": 0.9367, + "step": 1860 + }, + { + "epoch": 0.148445097144218, + "grad_norm": 0.932462751865387, + "learning_rate": 4.752586594691858e-05, + "loss": 0.9465, + "step": 1870 + }, + { + "epoch": 0.14923892119311755, + "grad_norm": 0.8956672549247742, + "learning_rate": 4.751263528353312e-05, + "loss": 0.9948, + "step": 1880 + }, + { + "epoch": 0.1500327452420171, + "grad_norm": 0.78652423620224, + "learning_rate": 4.7499404620147656e-05, + "loss": 0.9323, + "step": 1890 + }, + { + "epoch": 0.15082656929091667, + "grad_norm": 0.8287889361381531, + "learning_rate": 4.7486173956762195e-05, + "loss": 0.944, + "step": 1900 + }, + { + "epoch": 0.15162039333981622, + "grad_norm": 1.0075963735580444, + "learning_rate": 4.7472943293376734e-05, + "loss": 0.9595, + "step": 1910 + }, + { + "epoch": 0.1524142173887158, + "grad_norm": 1.075189471244812, + "learning_rate": 4.745971262999127e-05, + "loss": 0.9636, + "step": 1920 + }, + { + "epoch": 0.15320804143761535, + "grad_norm": 0.9045613408088684, + "learning_rate": 4.744648196660581e-05, + "loss": 0.9544, + "step": 1930 + }, + { + "epoch": 0.15400186548651493, + "grad_norm": 0.8848656415939331, + "learning_rate": 4.7433251303220344e-05, + "loss": 0.9707, + "step": 1940 + }, + { + "epoch": 0.15479568953541448, + "grad_norm": 1.1052658557891846, + "learning_rate": 4.742002063983488e-05, + "loss": 0.9881, + "step": 1950 + }, + { + "epoch": 0.15558951358431403, + "grad_norm": 0.8946327567100525, + "learning_rate": 4.740678997644942e-05, + "loss": 1.0025, + "step": 1960 + }, + { + "epoch": 0.1563833376332136, + "grad_norm": 0.8792600035667419, + "learning_rate": 4.7393559313063954e-05, + "loss": 0.9993, + "step": 1970 + }, + { + "epoch": 0.15717716168211315, + "grad_norm": 0.9298778176307678, + "learning_rate": 4.738032864967849e-05, + "loss": 0.9169, + "step": 1980 + }, + { + "epoch": 0.15797098573101273, + "grad_norm": 0.7488551735877991, + "learning_rate": 4.736709798629303e-05, + "loss": 0.9047, + "step": 1990 + }, + { + "epoch": 0.15876480977991228, + "grad_norm": 0.7787390351295471, + "learning_rate": 4.735386732290758e-05, + "loss": 0.936, + "step": 2000 + }, + { + "epoch": 0.15955863382881183, + "grad_norm": 0.707550048828125, + "learning_rate": 4.734063665952211e-05, + "loss": 0.9948, + "step": 2010 + }, + { + "epoch": 0.1603524578777114, + "grad_norm": 0.8335928916931152, + "learning_rate": 4.732740599613665e-05, + "loss": 0.9602, + "step": 2020 + }, + { + "epoch": 0.16114628192661096, + "grad_norm": 1.0111210346221924, + "learning_rate": 4.731417533275119e-05, + "loss": 0.9417, + "step": 2030 + }, + { + "epoch": 0.16194010597551053, + "grad_norm": 0.6672728657722473, + "learning_rate": 4.730094466936573e-05, + "loss": 0.9389, + "step": 2040 + }, + { + "epoch": 0.16273393002441008, + "grad_norm": 0.8862355947494507, + "learning_rate": 4.728771400598026e-05, + "loss": 0.9071, + "step": 2050 + }, + { + "epoch": 0.16352775407330966, + "grad_norm": 0.8846628069877625, + "learning_rate": 4.72744833425948e-05, + "loss": 0.9351, + "step": 2060 + }, + { + "epoch": 0.1643215781222092, + "grad_norm": 0.833533763885498, + "learning_rate": 4.726125267920934e-05, + "loss": 0.9713, + "step": 2070 + }, + { + "epoch": 0.16511540217110876, + "grad_norm": 0.724327564239502, + "learning_rate": 4.7248022015823876e-05, + "loss": 0.9349, + "step": 2080 + }, + { + "epoch": 0.16590922622000834, + "grad_norm": 0.9286043643951416, + "learning_rate": 4.7234791352438415e-05, + "loss": 0.9144, + "step": 2090 + }, + { + "epoch": 0.1667030502689079, + "grad_norm": 0.8603926301002502, + "learning_rate": 4.7221560689052953e-05, + "loss": 0.9156, + "step": 2100 + }, + { + "epoch": 0.16749687431780746, + "grad_norm": 0.9300916194915771, + "learning_rate": 4.720833002566749e-05, + "loss": 0.8894, + "step": 2110 + }, + { + "epoch": 0.16829069836670701, + "grad_norm": 0.7799938917160034, + "learning_rate": 4.7195099362282025e-05, + "loss": 0.8955, + "step": 2120 + }, + { + "epoch": 0.1690845224156066, + "grad_norm": 0.9887612462043762, + "learning_rate": 4.7181868698896563e-05, + "loss": 0.9712, + "step": 2130 + }, + { + "epoch": 0.16987834646450614, + "grad_norm": 0.9083105325698853, + "learning_rate": 4.71686380355111e-05, + "loss": 0.9434, + "step": 2140 + }, + { + "epoch": 0.1706721705134057, + "grad_norm": 0.868770182132721, + "learning_rate": 4.715540737212564e-05, + "loss": 0.9522, + "step": 2150 + }, + { + "epoch": 0.17146599456230527, + "grad_norm": 0.819227933883667, + "learning_rate": 4.7142176708740173e-05, + "loss": 0.9812, + "step": 2160 + }, + { + "epoch": 0.17225981861120482, + "grad_norm": 0.9821423292160034, + "learning_rate": 4.712894604535472e-05, + "loss": 0.9075, + "step": 2170 + }, + { + "epoch": 0.1730536426601044, + "grad_norm": 0.6770303845405579, + "learning_rate": 4.711571538196926e-05, + "loss": 0.9006, + "step": 2180 + }, + { + "epoch": 0.17384746670900394, + "grad_norm": 1.0133216381072998, + "learning_rate": 4.71024847185838e-05, + "loss": 0.9851, + "step": 2190 + }, + { + "epoch": 0.17464129075790352, + "grad_norm": 0.6719434261322021, + "learning_rate": 4.708925405519833e-05, + "loss": 0.9876, + "step": 2200 + }, + { + "epoch": 0.17543511480680307, + "grad_norm": 0.9074175357818604, + "learning_rate": 4.707602339181287e-05, + "loss": 0.8679, + "step": 2210 + }, + { + "epoch": 0.17622893885570262, + "grad_norm": 0.7234490513801575, + "learning_rate": 4.706279272842741e-05, + "loss": 0.9469, + "step": 2220 + }, + { + "epoch": 0.1770227629046022, + "grad_norm": 0.8408550024032593, + "learning_rate": 4.704956206504194e-05, + "loss": 0.8871, + "step": 2230 + }, + { + "epoch": 0.17781658695350175, + "grad_norm": 0.9565715789794922, + "learning_rate": 4.703633140165648e-05, + "loss": 0.9597, + "step": 2240 + }, + { + "epoch": 0.17861041100240133, + "grad_norm": 0.7657901644706726, + "learning_rate": 4.702310073827102e-05, + "loss": 0.912, + "step": 2250 + }, + { + "epoch": 0.17940423505130088, + "grad_norm": 0.7222371101379395, + "learning_rate": 4.7009870074885556e-05, + "loss": 0.9369, + "step": 2260 + }, + { + "epoch": 0.18019805910020045, + "grad_norm": 0.8453822731971741, + "learning_rate": 4.6996639411500095e-05, + "loss": 0.94, + "step": 2270 + }, + { + "epoch": 0.1809918831491, + "grad_norm": 0.6532242298126221, + "learning_rate": 4.6983408748114634e-05, + "loss": 0.8897, + "step": 2280 + }, + { + "epoch": 0.18178570719799955, + "grad_norm": 0.8275949358940125, + "learning_rate": 4.697017808472917e-05, + "loss": 0.9235, + "step": 2290 + }, + { + "epoch": 0.18257953124689913, + "grad_norm": 0.7926145792007446, + "learning_rate": 4.695694742134371e-05, + "loss": 0.9327, + "step": 2300 + }, + { + "epoch": 0.18337335529579868, + "grad_norm": 0.8629859089851379, + "learning_rate": 4.6943716757958244e-05, + "loss": 0.9714, + "step": 2310 + }, + { + "epoch": 0.18416717934469826, + "grad_norm": 0.8121680021286011, + "learning_rate": 4.693048609457278e-05, + "loss": 0.9545, + "step": 2320 + }, + { + "epoch": 0.1849610033935978, + "grad_norm": 0.9470918774604797, + "learning_rate": 4.691725543118732e-05, + "loss": 0.8992, + "step": 2330 + }, + { + "epoch": 0.18575482744249738, + "grad_norm": 0.8054450750350952, + "learning_rate": 4.690402476780186e-05, + "loss": 0.9984, + "step": 2340 + }, + { + "epoch": 0.18654865149139693, + "grad_norm": 0.6857488751411438, + "learning_rate": 4.68907941044164e-05, + "loss": 0.9411, + "step": 2350 + }, + { + "epoch": 0.18734247554029648, + "grad_norm": 0.8462006449699402, + "learning_rate": 4.687756344103094e-05, + "loss": 0.9527, + "step": 2360 + }, + { + "epoch": 0.18813629958919606, + "grad_norm": 0.8726380467414856, + "learning_rate": 4.686433277764548e-05, + "loss": 0.8983, + "step": 2370 + }, + { + "epoch": 0.1889301236380956, + "grad_norm": 0.883798360824585, + "learning_rate": 4.685110211426001e-05, + "loss": 0.9382, + "step": 2380 + }, + { + "epoch": 0.1897239476869952, + "grad_norm": 0.8085561394691467, + "learning_rate": 4.683787145087455e-05, + "loss": 0.9371, + "step": 2390 + }, + { + "epoch": 0.19051777173589474, + "grad_norm": 0.816972017288208, + "learning_rate": 4.682464078748909e-05, + "loss": 0.9322, + "step": 2400 + }, + { + "epoch": 0.1913115957847943, + "grad_norm": 0.8701701164245605, + "learning_rate": 4.6811410124103626e-05, + "loss": 0.9277, + "step": 2410 + }, + { + "epoch": 0.19210541983369386, + "grad_norm": 0.9203164577484131, + "learning_rate": 4.679817946071816e-05, + "loss": 0.9365, + "step": 2420 + }, + { + "epoch": 0.1928992438825934, + "grad_norm": 0.9870917797088623, + "learning_rate": 4.67849487973327e-05, + "loss": 0.973, + "step": 2430 + }, + { + "epoch": 0.193693067931493, + "grad_norm": 0.8381191492080688, + "learning_rate": 4.677171813394724e-05, + "loss": 0.9344, + "step": 2440 + }, + { + "epoch": 0.19448689198039254, + "grad_norm": 0.9467155933380127, + "learning_rate": 4.6758487470561775e-05, + "loss": 0.9162, + "step": 2450 + }, + { + "epoch": 0.19528071602929212, + "grad_norm": 0.8836456537246704, + "learning_rate": 4.6745256807176314e-05, + "loss": 0.9548, + "step": 2460 + }, + { + "epoch": 0.19607454007819167, + "grad_norm": 0.806165337562561, + "learning_rate": 4.673202614379085e-05, + "loss": 0.9615, + "step": 2470 + }, + { + "epoch": 0.19686836412709122, + "grad_norm": 0.8333777785301208, + "learning_rate": 4.671879548040539e-05, + "loss": 0.9331, + "step": 2480 + }, + { + "epoch": 0.1976621881759908, + "grad_norm": 0.9667910933494568, + "learning_rate": 4.6705564817019924e-05, + "loss": 0.9031, + "step": 2490 + }, + { + "epoch": 0.19845601222489034, + "grad_norm": 0.8661583065986633, + "learning_rate": 4.669233415363446e-05, + "loss": 0.9208, + "step": 2500 + }, + { + "epoch": 0.19924983627378992, + "grad_norm": 1.0444538593292236, + "learning_rate": 4.6679103490249e-05, + "loss": 0.992, + "step": 2510 + }, + { + "epoch": 0.20004366032268947, + "grad_norm": 0.6848312616348267, + "learning_rate": 4.666587282686354e-05, + "loss": 0.9215, + "step": 2520 + }, + { + "epoch": 0.20083748437158905, + "grad_norm": 0.789969801902771, + "learning_rate": 4.665264216347808e-05, + "loss": 0.9371, + "step": 2530 + }, + { + "epoch": 0.2016313084204886, + "grad_norm": 0.7210158109664917, + "learning_rate": 4.663941150009262e-05, + "loss": 0.9561, + "step": 2540 + }, + { + "epoch": 0.20242513246938815, + "grad_norm": 0.9324051737785339, + "learning_rate": 4.662618083670716e-05, + "loss": 0.9632, + "step": 2550 + }, + { + "epoch": 0.20321895651828772, + "grad_norm": 0.6545863151550293, + "learning_rate": 4.661295017332169e-05, + "loss": 0.9027, + "step": 2560 + }, + { + "epoch": 0.20401278056718727, + "grad_norm": 0.9494184255599976, + "learning_rate": 4.659971950993623e-05, + "loss": 0.8897, + "step": 2570 + }, + { + "epoch": 0.20480660461608685, + "grad_norm": 0.670927107334137, + "learning_rate": 4.658648884655077e-05, + "loss": 0.9166, + "step": 2580 + }, + { + "epoch": 0.2056004286649864, + "grad_norm": 0.8145579695701599, + "learning_rate": 4.657325818316531e-05, + "loss": 0.9364, + "step": 2590 + }, + { + "epoch": 0.20639425271388598, + "grad_norm": 0.9413288831710815, + "learning_rate": 4.656002751977984e-05, + "loss": 0.8997, + "step": 2600 + }, + { + "epoch": 0.20718807676278553, + "grad_norm": 0.8547295331954956, + "learning_rate": 4.6546796856394385e-05, + "loss": 0.9594, + "step": 2610 + }, + { + "epoch": 0.20798190081168508, + "grad_norm": 0.711956799030304, + "learning_rate": 4.6533566193008923e-05, + "loss": 0.9728, + "step": 2620 + }, + { + "epoch": 0.20877572486058465, + "grad_norm": 0.7029785513877869, + "learning_rate": 4.652033552962346e-05, + "loss": 0.9561, + "step": 2630 + }, + { + "epoch": 0.2095695489094842, + "grad_norm": 0.7861992120742798, + "learning_rate": 4.6507104866237995e-05, + "loss": 0.9288, + "step": 2640 + }, + { + "epoch": 0.21036337295838378, + "grad_norm": 0.9786797761917114, + "learning_rate": 4.6493874202852533e-05, + "loss": 0.9492, + "step": 2650 + }, + { + "epoch": 0.21115719700728333, + "grad_norm": 0.8280521631240845, + "learning_rate": 4.648064353946707e-05, + "loss": 0.96, + "step": 2660 + }, + { + "epoch": 0.2119510210561829, + "grad_norm": 0.5959718227386475, + "learning_rate": 4.6467412876081605e-05, + "loss": 0.989, + "step": 2670 + }, + { + "epoch": 0.21274484510508246, + "grad_norm": 0.7416049838066101, + "learning_rate": 4.6454182212696143e-05, + "loss": 1.0148, + "step": 2680 + }, + { + "epoch": 0.213538669153982, + "grad_norm": 0.915602445602417, + "learning_rate": 4.644095154931068e-05, + "loss": 0.9789, + "step": 2690 + }, + { + "epoch": 0.21433249320288159, + "grad_norm": 0.616190493106842, + "learning_rate": 4.642772088592522e-05, + "loss": 0.9163, + "step": 2700 + }, + { + "epoch": 0.21512631725178114, + "grad_norm": 0.7318758368492126, + "learning_rate": 4.641449022253976e-05, + "loss": 0.9084, + "step": 2710 + }, + { + "epoch": 0.2159201413006807, + "grad_norm": 0.7663441896438599, + "learning_rate": 4.64012595591543e-05, + "loss": 0.921, + "step": 2720 + }, + { + "epoch": 0.21671396534958026, + "grad_norm": 0.7115159034729004, + "learning_rate": 4.638802889576884e-05, + "loss": 0.9214, + "step": 2730 + }, + { + "epoch": 0.21750778939847984, + "grad_norm": 0.569379985332489, + "learning_rate": 4.637479823238338e-05, + "loss": 0.9658, + "step": 2740 + }, + { + "epoch": 0.2183016134473794, + "grad_norm": 0.9785438776016235, + "learning_rate": 4.636156756899791e-05, + "loss": 0.8818, + "step": 2750 + }, + { + "epoch": 0.21909543749627894, + "grad_norm": 0.7893972396850586, + "learning_rate": 4.634833690561245e-05, + "loss": 0.9254, + "step": 2760 + }, + { + "epoch": 0.21988926154517852, + "grad_norm": 0.948264479637146, + "learning_rate": 4.633510624222699e-05, + "loss": 0.8619, + "step": 2770 + }, + { + "epoch": 0.22068308559407807, + "grad_norm": 0.8142668008804321, + "learning_rate": 4.6321875578841526e-05, + "loss": 0.9289, + "step": 2780 + }, + { + "epoch": 0.22147690964297764, + "grad_norm": 0.7517206072807312, + "learning_rate": 4.6308644915456065e-05, + "loss": 0.8759, + "step": 2790 + }, + { + "epoch": 0.2222707336918772, + "grad_norm": 0.6064891219139099, + "learning_rate": 4.6295414252070604e-05, + "loss": 0.8558, + "step": 2800 + }, + { + "epoch": 0.22306455774077677, + "grad_norm": 0.7425729036331177, + "learning_rate": 4.628218358868514e-05, + "loss": 0.9281, + "step": 2810 + }, + { + "epoch": 0.22385838178967632, + "grad_norm": 0.7883281707763672, + "learning_rate": 4.6268952925299675e-05, + "loss": 0.8969, + "step": 2820 + }, + { + "epoch": 0.22465220583857587, + "grad_norm": 0.8569539189338684, + "learning_rate": 4.6255722261914214e-05, + "loss": 0.9111, + "step": 2830 + }, + { + "epoch": 0.22544602988747545, + "grad_norm": 0.8346198797225952, + "learning_rate": 4.624249159852875e-05, + "loss": 0.8691, + "step": 2840 + }, + { + "epoch": 0.226239853936375, + "grad_norm": 0.8474957346916199, + "learning_rate": 4.622926093514329e-05, + "loss": 0.8502, + "step": 2850 + }, + { + "epoch": 0.22703367798527457, + "grad_norm": 0.9480683207511902, + "learning_rate": 4.6216030271757824e-05, + "loss": 0.8832, + "step": 2860 + }, + { + "epoch": 0.22782750203417412, + "grad_norm": 0.9572345018386841, + "learning_rate": 4.620279960837236e-05, + "loss": 0.9159, + "step": 2870 + }, + { + "epoch": 0.2286213260830737, + "grad_norm": 0.9414117932319641, + "learning_rate": 4.618956894498691e-05, + "loss": 0.9427, + "step": 2880 + }, + { + "epoch": 0.22941515013197325, + "grad_norm": 0.7920238375663757, + "learning_rate": 4.617633828160145e-05, + "loss": 0.8879, + "step": 2890 + }, + { + "epoch": 0.2302089741808728, + "grad_norm": 0.736736536026001, + "learning_rate": 4.616310761821598e-05, + "loss": 0.9429, + "step": 2900 + }, + { + "epoch": 0.23100279822977238, + "grad_norm": 0.7859109044075012, + "learning_rate": 4.614987695483052e-05, + "loss": 0.8835, + "step": 2910 + }, + { + "epoch": 0.23179662227867193, + "grad_norm": 0.9163410663604736, + "learning_rate": 4.613664629144506e-05, + "loss": 0.9017, + "step": 2920 + }, + { + "epoch": 0.2325904463275715, + "grad_norm": 0.9528617262840271, + "learning_rate": 4.612341562805959e-05, + "loss": 0.9572, + "step": 2930 + }, + { + "epoch": 0.23338427037647105, + "grad_norm": 0.9378358125686646, + "learning_rate": 4.611018496467413e-05, + "loss": 0.9002, + "step": 2940 + }, + { + "epoch": 0.2341780944253706, + "grad_norm": 0.754435122013092, + "learning_rate": 4.609695430128867e-05, + "loss": 0.9026, + "step": 2950 + }, + { + "epoch": 0.23497191847427018, + "grad_norm": 0.906570315361023, + "learning_rate": 4.6083723637903206e-05, + "loss": 0.9109, + "step": 2960 + }, + { + "epoch": 0.23576574252316973, + "grad_norm": 0.9783695936203003, + "learning_rate": 4.6070492974517745e-05, + "loss": 0.9366, + "step": 2970 + }, + { + "epoch": 0.2365595665720693, + "grad_norm": 0.8152298331260681, + "learning_rate": 4.6057262311132284e-05, + "loss": 0.9555, + "step": 2980 + }, + { + "epoch": 0.23735339062096886, + "grad_norm": 0.8119134902954102, + "learning_rate": 4.604403164774682e-05, + "loss": 0.9418, + "step": 2990 + }, + { + "epoch": 0.23814721466986843, + "grad_norm": 0.5931934714317322, + "learning_rate": 4.603080098436136e-05, + "loss": 0.8831, + "step": 3000 + }, + { + "epoch": 0.23894103871876798, + "grad_norm": 1.0472118854522705, + "learning_rate": 4.6017570320975894e-05, + "loss": 0.9529, + "step": 3010 + }, + { + "epoch": 0.23973486276766753, + "grad_norm": 0.8576637506484985, + "learning_rate": 4.600433965759043e-05, + "loss": 0.9101, + "step": 3020 + }, + { + "epoch": 0.2405286868165671, + "grad_norm": 0.7458338141441345, + "learning_rate": 4.599110899420497e-05, + "loss": 0.8409, + "step": 3030 + }, + { + "epoch": 0.24132251086546666, + "grad_norm": 0.8986132740974426, + "learning_rate": 4.5977878330819504e-05, + "loss": 0.9417, + "step": 3040 + }, + { + "epoch": 0.24211633491436624, + "grad_norm": 0.9694287180900574, + "learning_rate": 4.596464766743404e-05, + "loss": 0.9255, + "step": 3050 + }, + { + "epoch": 0.2429101589632658, + "grad_norm": 0.9465477466583252, + "learning_rate": 4.595141700404859e-05, + "loss": 0.9307, + "step": 3060 + }, + { + "epoch": 0.24370398301216536, + "grad_norm": 0.6529215574264526, + "learning_rate": 4.593818634066313e-05, + "loss": 0.8807, + "step": 3070 + }, + { + "epoch": 0.24449780706106491, + "grad_norm": 0.9342437982559204, + "learning_rate": 4.592495567727766e-05, + "loss": 0.9448, + "step": 3080 + }, + { + "epoch": 0.24529163110996446, + "grad_norm": 0.7587674260139465, + "learning_rate": 4.59117250138922e-05, + "loss": 0.9655, + "step": 3090 + }, + { + "epoch": 0.24608545515886404, + "grad_norm": 0.7746047377586365, + "learning_rate": 4.589849435050674e-05, + "loss": 0.9213, + "step": 3100 + }, + { + "epoch": 0.2468792792077636, + "grad_norm": 0.9817701578140259, + "learning_rate": 4.588526368712128e-05, + "loss": 0.9621, + "step": 3110 + }, + { + "epoch": 0.24767310325666317, + "grad_norm": 0.9328246116638184, + "learning_rate": 4.587203302373581e-05, + "loss": 0.8562, + "step": 3120 + }, + { + "epoch": 0.24846692730556272, + "grad_norm": 0.8459776043891907, + "learning_rate": 4.585880236035035e-05, + "loss": 0.9237, + "step": 3130 + }, + { + "epoch": 0.2492607513544623, + "grad_norm": 0.8480767607688904, + "learning_rate": 4.584557169696489e-05, + "loss": 0.8931, + "step": 3140 + }, + { + "epoch": 0.25005457540336185, + "grad_norm": 0.8643662333488464, + "learning_rate": 4.5832341033579426e-05, + "loss": 0.9657, + "step": 3150 + }, + { + "epoch": 0.2508483994522614, + "grad_norm": 0.7610037326812744, + "learning_rate": 4.5819110370193965e-05, + "loss": 0.9678, + "step": 3160 + }, + { + "epoch": 0.25164222350116094, + "grad_norm": 0.8325300812721252, + "learning_rate": 4.5805879706808503e-05, + "loss": 0.9242, + "step": 3170 + }, + { + "epoch": 0.25243604755006055, + "grad_norm": 0.8497787117958069, + "learning_rate": 4.579264904342304e-05, + "loss": 0.9659, + "step": 3180 + }, + { + "epoch": 0.2532298715989601, + "grad_norm": 0.9831972122192383, + "learning_rate": 4.5779418380037575e-05, + "loss": 0.927, + "step": 3190 + }, + { + "epoch": 0.25402369564785965, + "grad_norm": 0.829680323600769, + "learning_rate": 4.5766187716652113e-05, + "loss": 0.8841, + "step": 3200 + }, + { + "epoch": 0.2548175196967592, + "grad_norm": 0.8839125633239746, + "learning_rate": 4.575295705326665e-05, + "loss": 0.8945, + "step": 3210 + }, + { + "epoch": 0.25561134374565875, + "grad_norm": 1.0367532968521118, + "learning_rate": 4.573972638988119e-05, + "loss": 0.8785, + "step": 3220 + }, + { + "epoch": 0.25640516779455835, + "grad_norm": 0.9415130615234375, + "learning_rate": 4.572649572649573e-05, + "loss": 0.9348, + "step": 3230 + }, + { + "epoch": 0.2571989918434579, + "grad_norm": 0.7434957027435303, + "learning_rate": 4.571326506311027e-05, + "loss": 0.8587, + "step": 3240 + }, + { + "epoch": 0.25799281589235745, + "grad_norm": 0.6429414749145508, + "learning_rate": 4.570003439972481e-05, + "loss": 0.9606, + "step": 3250 + }, + { + "epoch": 0.258786639941257, + "grad_norm": 1.4009513854980469, + "learning_rate": 4.568680373633934e-05, + "loss": 0.8492, + "step": 3260 + }, + { + "epoch": 0.2595804639901566, + "grad_norm": 0.6804202198982239, + "learning_rate": 4.567357307295388e-05, + "loss": 0.9661, + "step": 3270 + }, + { + "epoch": 0.26037428803905616, + "grad_norm": 0.7768397331237793, + "learning_rate": 4.566034240956842e-05, + "loss": 0.8915, + "step": 3280 + }, + { + "epoch": 0.2611681120879557, + "grad_norm": 0.8182624578475952, + "learning_rate": 4.564711174618296e-05, + "loss": 0.914, + "step": 3290 + }, + { + "epoch": 0.26196193613685526, + "grad_norm": 0.8247461915016174, + "learning_rate": 4.563388108279749e-05, + "loss": 0.9191, + "step": 3300 + }, + { + "epoch": 0.2627557601857548, + "grad_norm": 0.8486924171447754, + "learning_rate": 4.562065041941203e-05, + "loss": 0.9561, + "step": 3310 + }, + { + "epoch": 0.2635495842346544, + "grad_norm": 0.7965856790542603, + "learning_rate": 4.560741975602657e-05, + "loss": 0.9409, + "step": 3320 + }, + { + "epoch": 0.26434340828355396, + "grad_norm": 1.0266188383102417, + "learning_rate": 4.559418909264111e-05, + "loss": 0.8715, + "step": 3330 + }, + { + "epoch": 0.2651372323324535, + "grad_norm": 0.795364499092102, + "learning_rate": 4.5580958429255645e-05, + "loss": 0.8615, + "step": 3340 + }, + { + "epoch": 0.26593105638135306, + "grad_norm": 0.7666260600090027, + "learning_rate": 4.5567727765870184e-05, + "loss": 0.8847, + "step": 3350 + }, + { + "epoch": 0.2667248804302526, + "grad_norm": 0.741375207901001, + "learning_rate": 4.555449710248472e-05, + "loss": 0.977, + "step": 3360 + }, + { + "epoch": 0.2675187044791522, + "grad_norm": 0.8026778697967529, + "learning_rate": 4.5541266439099255e-05, + "loss": 1.012, + "step": 3370 + }, + { + "epoch": 0.26831252852805176, + "grad_norm": 0.828702986240387, + "learning_rate": 4.5528035775713794e-05, + "loss": 0.9397, + "step": 3380 + }, + { + "epoch": 0.2691063525769513, + "grad_norm": 0.7557156085968018, + "learning_rate": 4.551480511232833e-05, + "loss": 0.9116, + "step": 3390 + }, + { + "epoch": 0.26990017662585086, + "grad_norm": 0.8590690493583679, + "learning_rate": 4.550157444894287e-05, + "loss": 0.899, + "step": 3400 + }, + { + "epoch": 0.27069400067475047, + "grad_norm": 1.0395570993423462, + "learning_rate": 4.548834378555741e-05, + "loss": 0.8633, + "step": 3410 + }, + { + "epoch": 0.27148782472365, + "grad_norm": 0.756775438785553, + "learning_rate": 4.547511312217195e-05, + "loss": 0.9149, + "step": 3420 + }, + { + "epoch": 0.27228164877254957, + "grad_norm": 1.0174977779388428, + "learning_rate": 4.546188245878649e-05, + "loss": 0.8964, + "step": 3430 + }, + { + "epoch": 0.2730754728214491, + "grad_norm": 0.9375600218772888, + "learning_rate": 4.544865179540103e-05, + "loss": 1.0064, + "step": 3440 + }, + { + "epoch": 0.27386929687034867, + "grad_norm": 0.7545995116233826, + "learning_rate": 4.543542113201556e-05, + "loss": 0.8792, + "step": 3450 + }, + { + "epoch": 0.27466312091924827, + "grad_norm": 0.7980530261993408, + "learning_rate": 4.54221904686301e-05, + "loss": 0.9207, + "step": 3460 + }, + { + "epoch": 0.2754569449681478, + "grad_norm": 0.8304005265235901, + "learning_rate": 4.540895980524464e-05, + "loss": 0.867, + "step": 3470 + }, + { + "epoch": 0.27625076901704737, + "grad_norm": 0.9552587270736694, + "learning_rate": 4.539572914185917e-05, + "loss": 0.9498, + "step": 3480 + }, + { + "epoch": 0.2770445930659469, + "grad_norm": 0.8936342000961304, + "learning_rate": 4.538249847847371e-05, + "loss": 0.9253, + "step": 3490 + }, + { + "epoch": 0.27783841711484647, + "grad_norm": 0.8894282579421997, + "learning_rate": 4.5369267815088254e-05, + "loss": 0.8884, + "step": 3500 + }, + { + "epoch": 0.2786322411637461, + "grad_norm": 0.7489833831787109, + "learning_rate": 4.535603715170279e-05, + "loss": 0.9472, + "step": 3510 + }, + { + "epoch": 0.2794260652126456, + "grad_norm": 0.7078631520271301, + "learning_rate": 4.5342806488317325e-05, + "loss": 0.9027, + "step": 3520 + }, + { + "epoch": 0.2802198892615452, + "grad_norm": 0.7065866589546204, + "learning_rate": 4.5329575824931864e-05, + "loss": 0.9334, + "step": 3530 + }, + { + "epoch": 0.2810137133104447, + "grad_norm": 0.5849603414535522, + "learning_rate": 4.53163451615464e-05, + "loss": 0.9532, + "step": 3540 + }, + { + "epoch": 0.28180753735934433, + "grad_norm": 0.7648496627807617, + "learning_rate": 4.530311449816094e-05, + "loss": 0.9062, + "step": 3550 + }, + { + "epoch": 0.2826013614082439, + "grad_norm": 0.7615086436271667, + "learning_rate": 4.5289883834775474e-05, + "loss": 0.999, + "step": 3560 + }, + { + "epoch": 0.28339518545714343, + "grad_norm": 0.7692292928695679, + "learning_rate": 4.527665317139001e-05, + "loss": 0.9794, + "step": 3570 + }, + { + "epoch": 0.284189009506043, + "grad_norm": 0.6985743045806885, + "learning_rate": 4.526342250800455e-05, + "loss": 0.922, + "step": 3580 + }, + { + "epoch": 0.2849828335549425, + "grad_norm": 0.8894872069358826, + "learning_rate": 4.525019184461909e-05, + "loss": 0.9348, + "step": 3590 + }, + { + "epoch": 0.28577665760384213, + "grad_norm": 1.1146018505096436, + "learning_rate": 4.523696118123363e-05, + "loss": 0.9343, + "step": 3600 + }, + { + "epoch": 0.2865704816527417, + "grad_norm": 0.7811164855957031, + "learning_rate": 4.522373051784817e-05, + "loss": 0.9495, + "step": 3610 + }, + { + "epoch": 0.28736430570164123, + "grad_norm": 0.9659305810928345, + "learning_rate": 4.521049985446271e-05, + "loss": 0.9037, + "step": 3620 + }, + { + "epoch": 0.2881581297505408, + "grad_norm": 0.8798300623893738, + "learning_rate": 4.519726919107724e-05, + "loss": 0.8077, + "step": 3630 + }, + { + "epoch": 0.28895195379944033, + "grad_norm": 0.9190603494644165, + "learning_rate": 4.518403852769178e-05, + "loss": 0.9834, + "step": 3640 + }, + { + "epoch": 0.28974577784833994, + "grad_norm": 1.0563730001449585, + "learning_rate": 4.517080786430632e-05, + "loss": 0.903, + "step": 3650 + }, + { + "epoch": 0.2905396018972395, + "grad_norm": 0.9505723714828491, + "learning_rate": 4.515757720092086e-05, + "loss": 0.9106, + "step": 3660 + }, + { + "epoch": 0.29133342594613904, + "grad_norm": 0.7892711758613586, + "learning_rate": 4.5144346537535396e-05, + "loss": 0.9156, + "step": 3670 + }, + { + "epoch": 0.2921272499950386, + "grad_norm": 0.8885009288787842, + "learning_rate": 4.5131115874149935e-05, + "loss": 0.9297, + "step": 3680 + }, + { + "epoch": 0.29292107404393813, + "grad_norm": 0.9264837503433228, + "learning_rate": 4.5117885210764473e-05, + "loss": 0.8791, + "step": 3690 + }, + { + "epoch": 0.29371489809283774, + "grad_norm": 0.8464304208755493, + "learning_rate": 4.5104654547379006e-05, + "loss": 0.9119, + "step": 3700 + }, + { + "epoch": 0.2945087221417373, + "grad_norm": 0.8379583358764648, + "learning_rate": 4.5091423883993545e-05, + "loss": 0.9339, + "step": 3710 + }, + { + "epoch": 0.29530254619063684, + "grad_norm": 0.7604143619537354, + "learning_rate": 4.5078193220608083e-05, + "loss": 0.8827, + "step": 3720 + }, + { + "epoch": 0.2960963702395364, + "grad_norm": 1.2368942499160767, + "learning_rate": 4.506496255722262e-05, + "loss": 0.8412, + "step": 3730 + }, + { + "epoch": 0.296890194288436, + "grad_norm": 0.8182504773139954, + "learning_rate": 4.5051731893837155e-05, + "loss": 0.9742, + "step": 3740 + }, + { + "epoch": 0.29768401833733554, + "grad_norm": 0.6445258259773254, + "learning_rate": 4.5038501230451693e-05, + "loss": 0.9485, + "step": 3750 + }, + { + "epoch": 0.2984778423862351, + "grad_norm": 0.535454511642456, + "learning_rate": 4.502527056706623e-05, + "loss": 0.8326, + "step": 3760 + }, + { + "epoch": 0.29927166643513464, + "grad_norm": 0.9186378121376038, + "learning_rate": 4.501203990368078e-05, + "loss": 0.8988, + "step": 3770 + }, + { + "epoch": 0.3000654904840342, + "grad_norm": 0.8731489181518555, + "learning_rate": 4.499880924029531e-05, + "loss": 0.8857, + "step": 3780 + }, + { + "epoch": 0.3008593145329338, + "grad_norm": 0.9101518392562866, + "learning_rate": 4.498557857690985e-05, + "loss": 0.8697, + "step": 3790 + }, + { + "epoch": 0.30165313858183335, + "grad_norm": 1.0505417585372925, + "learning_rate": 4.497234791352439e-05, + "loss": 0.9239, + "step": 3800 + }, + { + "epoch": 0.3024469626307329, + "grad_norm": 0.9743961095809937, + "learning_rate": 4.495911725013893e-05, + "loss": 0.9206, + "step": 3810 + }, + { + "epoch": 0.30324078667963245, + "grad_norm": 0.7029954195022583, + "learning_rate": 4.494588658675346e-05, + "loss": 0.9278, + "step": 3820 + }, + { + "epoch": 0.304034610728532, + "grad_norm": 1.1363028287887573, + "learning_rate": 4.4932655923368e-05, + "loss": 0.8596, + "step": 3830 + }, + { + "epoch": 0.3048284347774316, + "grad_norm": 0.6260054707527161, + "learning_rate": 4.491942525998254e-05, + "loss": 0.8998, + "step": 3840 + }, + { + "epoch": 0.30562225882633115, + "grad_norm": 0.9502087831497192, + "learning_rate": 4.4906194596597076e-05, + "loss": 0.9423, + "step": 3850 + }, + { + "epoch": 0.3064160828752307, + "grad_norm": 0.6927551627159119, + "learning_rate": 4.4892963933211615e-05, + "loss": 0.9297, + "step": 3860 + }, + { + "epoch": 0.30720990692413025, + "grad_norm": 0.9120237231254578, + "learning_rate": 4.4879733269826154e-05, + "loss": 0.916, + "step": 3870 + }, + { + "epoch": 0.30800373097302985, + "grad_norm": 0.7541207075119019, + "learning_rate": 4.486650260644069e-05, + "loss": 0.8848, + "step": 3880 + }, + { + "epoch": 0.3087975550219294, + "grad_norm": 0.8690943717956543, + "learning_rate": 4.4853271943055225e-05, + "loss": 0.8941, + "step": 3890 + }, + { + "epoch": 0.30959137907082895, + "grad_norm": 0.9503733515739441, + "learning_rate": 4.4840041279669764e-05, + "loss": 0.9158, + "step": 3900 + }, + { + "epoch": 0.3103852031197285, + "grad_norm": 0.8072488903999329, + "learning_rate": 4.48268106162843e-05, + "loss": 0.9325, + "step": 3910 + }, + { + "epoch": 0.31117902716862805, + "grad_norm": 0.9710533618927002, + "learning_rate": 4.481357995289884e-05, + "loss": 0.9011, + "step": 3920 + }, + { + "epoch": 0.31197285121752766, + "grad_norm": 0.8638473749160767, + "learning_rate": 4.4800349289513374e-05, + "loss": 0.9435, + "step": 3930 + }, + { + "epoch": 0.3127666752664272, + "grad_norm": 1.0046641826629639, + "learning_rate": 4.478711862612792e-05, + "loss": 0.9496, + "step": 3940 + }, + { + "epoch": 0.31356049931532676, + "grad_norm": 0.7358148097991943, + "learning_rate": 4.477388796274246e-05, + "loss": 0.9531, + "step": 3950 + }, + { + "epoch": 0.3143543233642263, + "grad_norm": 0.9239563941955566, + "learning_rate": 4.476065729935699e-05, + "loss": 0.9164, + "step": 3960 + }, + { + "epoch": 0.31514814741312586, + "grad_norm": 0.919695258140564, + "learning_rate": 4.474742663597153e-05, + "loss": 0.891, + "step": 3970 + }, + { + "epoch": 0.31594197146202546, + "grad_norm": 0.6320668458938599, + "learning_rate": 4.473419597258607e-05, + "loss": 0.9372, + "step": 3980 + }, + { + "epoch": 0.316735795510925, + "grad_norm": 0.8896647691726685, + "learning_rate": 4.472096530920061e-05, + "loss": 0.9236, + "step": 3990 + }, + { + "epoch": 0.31752961955982456, + "grad_norm": 0.8284244537353516, + "learning_rate": 4.470773464581514e-05, + "loss": 0.8625, + "step": 4000 + }, + { + "epoch": 0.3183234436087241, + "grad_norm": 0.760328471660614, + "learning_rate": 4.469450398242968e-05, + "loss": 0.9433, + "step": 4010 + }, + { + "epoch": 0.31911726765762366, + "grad_norm": 0.7320681810379028, + "learning_rate": 4.468127331904422e-05, + "loss": 0.8655, + "step": 4020 + }, + { + "epoch": 0.31991109170652327, + "grad_norm": 0.9130124449729919, + "learning_rate": 4.4668042655658756e-05, + "loss": 0.8828, + "step": 4030 + }, + { + "epoch": 0.3207049157554228, + "grad_norm": 0.8663350343704224, + "learning_rate": 4.4654811992273295e-05, + "loss": 0.9523, + "step": 4040 + }, + { + "epoch": 0.32149873980432236, + "grad_norm": 0.8587914705276489, + "learning_rate": 4.4641581328887834e-05, + "loss": 0.8242, + "step": 4050 + }, + { + "epoch": 0.3222925638532219, + "grad_norm": 0.7651414275169373, + "learning_rate": 4.462835066550237e-05, + "loss": 0.8989, + "step": 4060 + }, + { + "epoch": 0.3230863879021215, + "grad_norm": 0.834144651889801, + "learning_rate": 4.4615120002116905e-05, + "loss": 0.9105, + "step": 4070 + }, + { + "epoch": 0.32388021195102107, + "grad_norm": 0.691362202167511, + "learning_rate": 4.4601889338731444e-05, + "loss": 0.8757, + "step": 4080 + }, + { + "epoch": 0.3246740359999206, + "grad_norm": 0.9557787179946899, + "learning_rate": 4.458865867534598e-05, + "loss": 0.9276, + "step": 4090 + }, + { + "epoch": 0.32546786004882017, + "grad_norm": 0.8461413383483887, + "learning_rate": 4.457542801196052e-05, + "loss": 0.9158, + "step": 4100 + }, + { + "epoch": 0.3262616840977197, + "grad_norm": 0.8489325046539307, + "learning_rate": 4.456219734857506e-05, + "loss": 0.9529, + "step": 4110 + }, + { + "epoch": 0.3270555081466193, + "grad_norm": 0.8827877044677734, + "learning_rate": 4.45489666851896e-05, + "loss": 0.9152, + "step": 4120 + }, + { + "epoch": 0.32784933219551887, + "grad_norm": 0.69962078332901, + "learning_rate": 4.453573602180414e-05, + "loss": 0.8927, + "step": 4130 + }, + { + "epoch": 0.3286431562444184, + "grad_norm": 0.6991488337516785, + "learning_rate": 4.452250535841868e-05, + "loss": 0.9344, + "step": 4140 + }, + { + "epoch": 0.32943698029331797, + "grad_norm": 0.9323925375938416, + "learning_rate": 4.450927469503321e-05, + "loss": 0.945, + "step": 4150 + }, + { + "epoch": 0.3302308043422175, + "grad_norm": 0.8266938328742981, + "learning_rate": 4.449604403164775e-05, + "loss": 0.9045, + "step": 4160 + }, + { + "epoch": 0.3310246283911171, + "grad_norm": 0.8383837342262268, + "learning_rate": 4.448281336826229e-05, + "loss": 0.8516, + "step": 4170 + }, + { + "epoch": 0.3318184524400167, + "grad_norm": 0.72791987657547, + "learning_rate": 4.446958270487682e-05, + "loss": 0.8718, + "step": 4180 + }, + { + "epoch": 0.3326122764889162, + "grad_norm": 0.7107085585594177, + "learning_rate": 4.445635204149136e-05, + "loss": 0.9216, + "step": 4190 + }, + { + "epoch": 0.3334061005378158, + "grad_norm": 0.7751030325889587, + "learning_rate": 4.44431213781059e-05, + "loss": 0.8512, + "step": 4200 + }, + { + "epoch": 0.3341999245867154, + "grad_norm": 0.764103889465332, + "learning_rate": 4.4429890714720443e-05, + "loss": 0.9594, + "step": 4210 + }, + { + "epoch": 0.33499374863561493, + "grad_norm": 0.7244476079940796, + "learning_rate": 4.4416660051334976e-05, + "loss": 0.9194, + "step": 4220 + }, + { + "epoch": 0.3357875726845145, + "grad_norm": 0.8353381156921387, + "learning_rate": 4.4403429387949515e-05, + "loss": 0.8394, + "step": 4230 + }, + { + "epoch": 0.33658139673341403, + "grad_norm": 0.6536258459091187, + "learning_rate": 4.4390198724564053e-05, + "loss": 0.9125, + "step": 4240 + }, + { + "epoch": 0.3373752207823136, + "grad_norm": 0.8908824920654297, + "learning_rate": 4.437696806117859e-05, + "loss": 0.9419, + "step": 4250 + }, + { + "epoch": 0.3381690448312132, + "grad_norm": 0.6466046571731567, + "learning_rate": 4.4363737397793125e-05, + "loss": 0.946, + "step": 4260 + }, + { + "epoch": 0.33896286888011273, + "grad_norm": 0.8315856456756592, + "learning_rate": 4.4350506734407663e-05, + "loss": 0.9282, + "step": 4270 + }, + { + "epoch": 0.3397566929290123, + "grad_norm": 0.7369117736816406, + "learning_rate": 4.43372760710222e-05, + "loss": 0.9398, + "step": 4280 + }, + { + "epoch": 0.34055051697791183, + "grad_norm": 0.841690719127655, + "learning_rate": 4.432404540763674e-05, + "loss": 0.8411, + "step": 4290 + }, + { + "epoch": 0.3413443410268114, + "grad_norm": 0.8048957586288452, + "learning_rate": 4.431081474425128e-05, + "loss": 0.9204, + "step": 4300 + }, + { + "epoch": 0.342138165075711, + "grad_norm": 0.6272882223129272, + "learning_rate": 4.429758408086582e-05, + "loss": 0.8819, + "step": 4310 + }, + { + "epoch": 0.34293198912461054, + "grad_norm": 0.8420210480690002, + "learning_rate": 4.428435341748036e-05, + "loss": 0.9187, + "step": 4320 + }, + { + "epoch": 0.3437258131735101, + "grad_norm": 0.873335063457489, + "learning_rate": 4.427112275409489e-05, + "loss": 0.8807, + "step": 4330 + }, + { + "epoch": 0.34451963722240964, + "grad_norm": 0.6721493005752563, + "learning_rate": 4.425789209070943e-05, + "loss": 0.9068, + "step": 4340 + }, + { + "epoch": 0.34531346127130924, + "grad_norm": 0.8227477669715881, + "learning_rate": 4.424466142732397e-05, + "loss": 0.9372, + "step": 4350 + }, + { + "epoch": 0.3461072853202088, + "grad_norm": 0.8997884392738342, + "learning_rate": 4.423143076393851e-05, + "loss": 0.8916, + "step": 4360 + }, + { + "epoch": 0.34690110936910834, + "grad_norm": 0.9526328444480896, + "learning_rate": 4.421820010055304e-05, + "loss": 0.857, + "step": 4370 + }, + { + "epoch": 0.3476949334180079, + "grad_norm": 0.8911690711975098, + "learning_rate": 4.4204969437167585e-05, + "loss": 0.9323, + "step": 4380 + }, + { + "epoch": 0.34848875746690744, + "grad_norm": 0.9047393798828125, + "learning_rate": 4.4191738773782124e-05, + "loss": 0.8946, + "step": 4390 + }, + { + "epoch": 0.34928258151580704, + "grad_norm": 0.8549637794494629, + "learning_rate": 4.4178508110396656e-05, + "loss": 0.8755, + "step": 4400 + }, + { + "epoch": 0.3500764055647066, + "grad_norm": 0.9609139561653137, + "learning_rate": 4.4165277447011195e-05, + "loss": 0.8262, + "step": 4410 + }, + { + "epoch": 0.35087022961360614, + "grad_norm": 0.5445454120635986, + "learning_rate": 4.4152046783625734e-05, + "loss": 0.8849, + "step": 4420 + }, + { + "epoch": 0.3516640536625057, + "grad_norm": 0.9729016423225403, + "learning_rate": 4.413881612024027e-05, + "loss": 0.8396, + "step": 4430 + }, + { + "epoch": 0.35245787771140524, + "grad_norm": 1.071895718574524, + "learning_rate": 4.4125585456854805e-05, + "loss": 0.9218, + "step": 4440 + }, + { + "epoch": 0.35325170176030485, + "grad_norm": 0.6580398082733154, + "learning_rate": 4.4112354793469344e-05, + "loss": 1.0065, + "step": 4450 + }, + { + "epoch": 0.3540455258092044, + "grad_norm": 0.6029400825500488, + "learning_rate": 4.409912413008388e-05, + "loss": 0.8869, + "step": 4460 + }, + { + "epoch": 0.35483934985810395, + "grad_norm": 1.124426007270813, + "learning_rate": 4.408589346669842e-05, + "loss": 0.8604, + "step": 4470 + }, + { + "epoch": 0.3556331739070035, + "grad_norm": 0.905108630657196, + "learning_rate": 4.407266280331296e-05, + "loss": 0.8818, + "step": 4480 + }, + { + "epoch": 0.35642699795590305, + "grad_norm": 0.7963013052940369, + "learning_rate": 4.40594321399275e-05, + "loss": 0.9271, + "step": 4490 + }, + { + "epoch": 0.35722082200480265, + "grad_norm": 0.8501315712928772, + "learning_rate": 4.404620147654204e-05, + "loss": 0.9084, + "step": 4500 + }, + { + "epoch": 0.3580146460537022, + "grad_norm": 0.7018426656723022, + "learning_rate": 4.403297081315657e-05, + "loss": 0.9059, + "step": 4510 + }, + { + "epoch": 0.35880847010260175, + "grad_norm": 0.9202073216438293, + "learning_rate": 4.401974014977111e-05, + "loss": 0.846, + "step": 4520 + }, + { + "epoch": 0.3596022941515013, + "grad_norm": 1.0749320983886719, + "learning_rate": 4.400650948638565e-05, + "loss": 0.9123, + "step": 4530 + }, + { + "epoch": 0.3603961182004009, + "grad_norm": 0.7524193525314331, + "learning_rate": 4.399327882300019e-05, + "loss": 0.9018, + "step": 4540 + }, + { + "epoch": 0.36118994224930046, + "grad_norm": 0.7798032164573669, + "learning_rate": 4.398004815961472e-05, + "loss": 0.9221, + "step": 4550 + }, + { + "epoch": 0.3619837662982, + "grad_norm": 0.8113389015197754, + "learning_rate": 4.3966817496229265e-05, + "loss": 0.9005, + "step": 4560 + }, + { + "epoch": 0.36277759034709955, + "grad_norm": 0.7713738083839417, + "learning_rate": 4.3953586832843804e-05, + "loss": 0.9608, + "step": 4570 + }, + { + "epoch": 0.3635714143959991, + "grad_norm": 0.7880844473838806, + "learning_rate": 4.394035616945834e-05, + "loss": 0.9077, + "step": 4580 + }, + { + "epoch": 0.3643652384448987, + "grad_norm": 0.8188297748565674, + "learning_rate": 4.3927125506072875e-05, + "loss": 0.9276, + "step": 4590 + }, + { + "epoch": 0.36515906249379826, + "grad_norm": 0.7484593987464905, + "learning_rate": 4.3913894842687414e-05, + "loss": 0.926, + "step": 4600 + }, + { + "epoch": 0.3659528865426978, + "grad_norm": 0.7410342693328857, + "learning_rate": 4.390066417930195e-05, + "loss": 0.9059, + "step": 4610 + }, + { + "epoch": 0.36674671059159736, + "grad_norm": 0.897441565990448, + "learning_rate": 4.388743351591649e-05, + "loss": 0.8394, + "step": 4620 + }, + { + "epoch": 0.3675405346404969, + "grad_norm": 0.7051084637641907, + "learning_rate": 4.3874202852531024e-05, + "loss": 0.8888, + "step": 4630 + }, + { + "epoch": 0.3683343586893965, + "grad_norm": 0.8222568035125732, + "learning_rate": 4.386097218914556e-05, + "loss": 0.8424, + "step": 4640 + }, + { + "epoch": 0.36912818273829606, + "grad_norm": 0.9209256768226624, + "learning_rate": 4.38477415257601e-05, + "loss": 0.8892, + "step": 4650 + }, + { + "epoch": 0.3699220067871956, + "grad_norm": 0.9021577835083008, + "learning_rate": 4.383451086237464e-05, + "loss": 0.831, + "step": 4660 + }, + { + "epoch": 0.37071583083609516, + "grad_norm": 0.7472784519195557, + "learning_rate": 4.382128019898918e-05, + "loss": 0.8574, + "step": 4670 + }, + { + "epoch": 0.37150965488499477, + "grad_norm": 0.7072018384933472, + "learning_rate": 4.380804953560372e-05, + "loss": 0.9618, + "step": 4680 + }, + { + "epoch": 0.3723034789338943, + "grad_norm": 0.9218171238899231, + "learning_rate": 4.379481887221826e-05, + "loss": 0.8916, + "step": 4690 + }, + { + "epoch": 0.37309730298279387, + "grad_norm": 0.8794512152671814, + "learning_rate": 4.378158820883279e-05, + "loss": 0.9359, + "step": 4700 + }, + { + "epoch": 0.3738911270316934, + "grad_norm": 0.8208445906639099, + "learning_rate": 4.376835754544733e-05, + "loss": 0.8878, + "step": 4710 + }, + { + "epoch": 0.37468495108059297, + "grad_norm": 0.5891085863113403, + "learning_rate": 4.375512688206187e-05, + "loss": 0.9369, + "step": 4720 + }, + { + "epoch": 0.37547877512949257, + "grad_norm": 0.8617219924926758, + "learning_rate": 4.374189621867641e-05, + "loss": 0.8791, + "step": 4730 + }, + { + "epoch": 0.3762725991783921, + "grad_norm": 1.0510433912277222, + "learning_rate": 4.3728665555290946e-05, + "loss": 0.987, + "step": 4740 + }, + { + "epoch": 0.37706642322729167, + "grad_norm": 0.9196320176124573, + "learning_rate": 4.3715434891905485e-05, + "loss": 0.8332, + "step": 4750 + }, + { + "epoch": 0.3778602472761912, + "grad_norm": 0.9651428461074829, + "learning_rate": 4.3702204228520023e-05, + "loss": 0.9301, + "step": 4760 + }, + { + "epoch": 0.37865407132509077, + "grad_norm": 0.7794479131698608, + "learning_rate": 4.3688973565134556e-05, + "loss": 0.8898, + "step": 4770 + }, + { + "epoch": 0.3794478953739904, + "grad_norm": 0.708845853805542, + "learning_rate": 4.3675742901749095e-05, + "loss": 0.8781, + "step": 4780 + }, + { + "epoch": 0.3802417194228899, + "grad_norm": 1.07254159450531, + "learning_rate": 4.3662512238363633e-05, + "loss": 0.8329, + "step": 4790 + }, + { + "epoch": 0.3810355434717895, + "grad_norm": 0.6615691184997559, + "learning_rate": 4.364928157497817e-05, + "loss": 0.9107, + "step": 4800 + }, + { + "epoch": 0.381829367520689, + "grad_norm": 0.7933416962623596, + "learning_rate": 4.3636050911592705e-05, + "loss": 0.8986, + "step": 4810 + }, + { + "epoch": 0.3826231915695886, + "grad_norm": 0.7650178074836731, + "learning_rate": 4.3622820248207243e-05, + "loss": 0.9212, + "step": 4820 + }, + { + "epoch": 0.3834170156184882, + "grad_norm": 0.8654028177261353, + "learning_rate": 4.360958958482179e-05, + "loss": 0.8686, + "step": 4830 + }, + { + "epoch": 0.3842108396673877, + "grad_norm": 0.8981553912162781, + "learning_rate": 4.359635892143633e-05, + "loss": 0.843, + "step": 4840 + }, + { + "epoch": 0.3850046637162873, + "grad_norm": 0.8608548641204834, + "learning_rate": 4.358312825805086e-05, + "loss": 0.8958, + "step": 4850 + }, + { + "epoch": 0.3857984877651868, + "grad_norm": 0.8663391470909119, + "learning_rate": 4.35698975946654e-05, + "loss": 0.8992, + "step": 4860 + }, + { + "epoch": 0.38659231181408643, + "grad_norm": 0.7639971971511841, + "learning_rate": 4.355666693127994e-05, + "loss": 0.8592, + "step": 4870 + }, + { + "epoch": 0.387386135862986, + "grad_norm": 0.5844544768333435, + "learning_rate": 4.354343626789447e-05, + "loss": 0.8638, + "step": 4880 + }, + { + "epoch": 0.38817995991188553, + "grad_norm": 0.8091756701469421, + "learning_rate": 4.353020560450901e-05, + "loss": 0.934, + "step": 4890 + }, + { + "epoch": 0.3889737839607851, + "grad_norm": 0.8555530905723572, + "learning_rate": 4.351697494112355e-05, + "loss": 0.8541, + "step": 4900 + }, + { + "epoch": 0.38976760800968463, + "grad_norm": 0.7854059934616089, + "learning_rate": 4.350374427773809e-05, + "loss": 0.9043, + "step": 4910 + }, + { + "epoch": 0.39056143205858423, + "grad_norm": 1.0440559387207031, + "learning_rate": 4.3490513614352626e-05, + "loss": 0.8941, + "step": 4920 + }, + { + "epoch": 0.3913552561074838, + "grad_norm": 0.7478209733963013, + "learning_rate": 4.3477282950967165e-05, + "loss": 0.8565, + "step": 4930 + }, + { + "epoch": 0.39214908015638333, + "grad_norm": 0.7952112555503845, + "learning_rate": 4.3464052287581704e-05, + "loss": 0.8953, + "step": 4940 + }, + { + "epoch": 0.3929429042052829, + "grad_norm": 0.8431933522224426, + "learning_rate": 4.345082162419624e-05, + "loss": 0.8433, + "step": 4950 + }, + { + "epoch": 0.39373672825418243, + "grad_norm": 0.7418366074562073, + "learning_rate": 4.3437590960810775e-05, + "loss": 0.8713, + "step": 4960 + }, + { + "epoch": 0.39453055230308204, + "grad_norm": 0.8180972933769226, + "learning_rate": 4.3424360297425314e-05, + "loss": 0.9263, + "step": 4970 + }, + { + "epoch": 0.3953243763519816, + "grad_norm": 0.681885302066803, + "learning_rate": 4.341112963403985e-05, + "loss": 0.8812, + "step": 4980 + }, + { + "epoch": 0.39611820040088114, + "grad_norm": 0.6465054750442505, + "learning_rate": 4.3397898970654385e-05, + "loss": 0.8943, + "step": 4990 + }, + { + "epoch": 0.3969120244497807, + "grad_norm": 0.8901807069778442, + "learning_rate": 4.338466830726893e-05, + "loss": 0.9004, + "step": 5000 + }, + { + "epoch": 0.3977058484986803, + "grad_norm": 0.937714695930481, + "learning_rate": 4.337143764388347e-05, + "loss": 0.8341, + "step": 5010 + }, + { + "epoch": 0.39849967254757984, + "grad_norm": 0.6198152303695679, + "learning_rate": 4.335820698049801e-05, + "loss": 0.9793, + "step": 5020 + }, + { + "epoch": 0.3992934965964794, + "grad_norm": 1.0210473537445068, + "learning_rate": 4.334497631711254e-05, + "loss": 0.8939, + "step": 5030 + }, + { + "epoch": 0.40008732064537894, + "grad_norm": 0.7850756645202637, + "learning_rate": 4.333174565372708e-05, + "loss": 0.8545, + "step": 5040 + }, + { + "epoch": 0.4008811446942785, + "grad_norm": 0.9468925595283508, + "learning_rate": 4.331851499034162e-05, + "loss": 0.833, + "step": 5050 + }, + { + "epoch": 0.4016749687431781, + "grad_norm": 0.6961553692817688, + "learning_rate": 4.330528432695616e-05, + "loss": 0.887, + "step": 5060 + }, + { + "epoch": 0.40246879279207765, + "grad_norm": 0.7382469177246094, + "learning_rate": 4.329205366357069e-05, + "loss": 0.8148, + "step": 5070 + }, + { + "epoch": 0.4032626168409772, + "grad_norm": 0.8620008230209351, + "learning_rate": 4.327882300018523e-05, + "loss": 0.8493, + "step": 5080 + }, + { + "epoch": 0.40405644088987674, + "grad_norm": 0.6228137612342834, + "learning_rate": 4.326559233679977e-05, + "loss": 0.961, + "step": 5090 + }, + { + "epoch": 0.4048502649387763, + "grad_norm": 0.8809449672698975, + "learning_rate": 4.3252361673414306e-05, + "loss": 0.8982, + "step": 5100 + }, + { + "epoch": 0.4056440889876759, + "grad_norm": 0.7898284196853638, + "learning_rate": 4.3239131010028845e-05, + "loss": 0.8696, + "step": 5110 + }, + { + "epoch": 0.40643791303657545, + "grad_norm": 0.5993707776069641, + "learning_rate": 4.3225900346643384e-05, + "loss": 0.9455, + "step": 5120 + }, + { + "epoch": 0.407231737085475, + "grad_norm": 0.5548946261405945, + "learning_rate": 4.321266968325792e-05, + "loss": 0.9056, + "step": 5130 + }, + { + "epoch": 0.40802556113437455, + "grad_norm": 0.5974055528640747, + "learning_rate": 4.3199439019872455e-05, + "loss": 0.933, + "step": 5140 + }, + { + "epoch": 0.40881938518327415, + "grad_norm": 0.77692711353302, + "learning_rate": 4.3186208356486994e-05, + "loss": 0.9088, + "step": 5150 + }, + { + "epoch": 0.4096132092321737, + "grad_norm": 0.6619086265563965, + "learning_rate": 4.317297769310153e-05, + "loss": 0.9095, + "step": 5160 + }, + { + "epoch": 0.41040703328107325, + "grad_norm": 0.8819933533668518, + "learning_rate": 4.315974702971607e-05, + "loss": 0.8546, + "step": 5170 + }, + { + "epoch": 0.4112008573299728, + "grad_norm": 0.8302019834518433, + "learning_rate": 4.314651636633061e-05, + "loss": 0.883, + "step": 5180 + }, + { + "epoch": 0.41199468137887235, + "grad_norm": 0.9397719502449036, + "learning_rate": 4.313328570294515e-05, + "loss": 0.8768, + "step": 5190 + }, + { + "epoch": 0.41278850542777196, + "grad_norm": 1.071415901184082, + "learning_rate": 4.312005503955969e-05, + "loss": 0.9198, + "step": 5200 + }, + { + "epoch": 0.4135823294766715, + "grad_norm": 0.7499683499336243, + "learning_rate": 4.310682437617422e-05, + "loss": 0.875, + "step": 5210 + }, + { + "epoch": 0.41437615352557106, + "grad_norm": 0.7693167328834534, + "learning_rate": 4.309359371278876e-05, + "loss": 0.898, + "step": 5220 + }, + { + "epoch": 0.4151699775744706, + "grad_norm": 1.0537440776824951, + "learning_rate": 4.30803630494033e-05, + "loss": 0.9159, + "step": 5230 + }, + { + "epoch": 0.41596380162337016, + "grad_norm": 0.7521883249282837, + "learning_rate": 4.306713238601784e-05, + "loss": 0.9153, + "step": 5240 + }, + { + "epoch": 0.41675762567226976, + "grad_norm": 1.0054055452346802, + "learning_rate": 4.305390172263237e-05, + "loss": 0.8956, + "step": 5250 + }, + { + "epoch": 0.4175514497211693, + "grad_norm": 0.7442034482955933, + "learning_rate": 4.304067105924691e-05, + "loss": 0.891, + "step": 5260 + }, + { + "epoch": 0.41834527377006886, + "grad_norm": 1.084252953529358, + "learning_rate": 4.3027440395861455e-05, + "loss": 0.9027, + "step": 5270 + }, + { + "epoch": 0.4191390978189684, + "grad_norm": 0.8177867531776428, + "learning_rate": 4.3014209732475993e-05, + "loss": 0.9018, + "step": 5280 + }, + { + "epoch": 0.419932921867868, + "grad_norm": 0.48281505703926086, + "learning_rate": 4.3000979069090526e-05, + "loss": 0.9254, + "step": 5290 + }, + { + "epoch": 0.42072674591676756, + "grad_norm": 0.7245819568634033, + "learning_rate": 4.2987748405705065e-05, + "loss": 0.9109, + "step": 5300 + }, + { + "epoch": 0.4215205699656671, + "grad_norm": 0.7288056015968323, + "learning_rate": 4.2974517742319603e-05, + "loss": 0.8432, + "step": 5310 + }, + { + "epoch": 0.42231439401456666, + "grad_norm": 0.7283946871757507, + "learning_rate": 4.2961287078934136e-05, + "loss": 0.8788, + "step": 5320 + }, + { + "epoch": 0.4231082180634662, + "grad_norm": 0.7446802258491516, + "learning_rate": 4.2948056415548675e-05, + "loss": 0.9142, + "step": 5330 + }, + { + "epoch": 0.4239020421123658, + "grad_norm": 1.0653091669082642, + "learning_rate": 4.2934825752163213e-05, + "loss": 0.8688, + "step": 5340 + }, + { + "epoch": 0.42469586616126537, + "grad_norm": 0.702083170413971, + "learning_rate": 4.292159508877775e-05, + "loss": 0.9083, + "step": 5350 + }, + { + "epoch": 0.4254896902101649, + "grad_norm": 0.8971485495567322, + "learning_rate": 4.290836442539229e-05, + "loss": 0.8935, + "step": 5360 + }, + { + "epoch": 0.42628351425906447, + "grad_norm": 0.7836154103279114, + "learning_rate": 4.289513376200683e-05, + "loss": 0.814, + "step": 5370 + }, + { + "epoch": 0.427077338307964, + "grad_norm": 0.964614987373352, + "learning_rate": 4.288190309862137e-05, + "loss": 0.8964, + "step": 5380 + }, + { + "epoch": 0.4278711623568636, + "grad_norm": 0.8366977572441101, + "learning_rate": 4.286867243523591e-05, + "loss": 0.9089, + "step": 5390 + }, + { + "epoch": 0.42866498640576317, + "grad_norm": 0.720564603805542, + "learning_rate": 4.285544177185044e-05, + "loss": 0.905, + "step": 5400 + }, + { + "epoch": 0.4294588104546627, + "grad_norm": 0.7407063841819763, + "learning_rate": 4.284221110846498e-05, + "loss": 0.9191, + "step": 5410 + }, + { + "epoch": 0.43025263450356227, + "grad_norm": 0.7772669792175293, + "learning_rate": 4.282898044507952e-05, + "loss": 0.8633, + "step": 5420 + }, + { + "epoch": 0.4310464585524618, + "grad_norm": 0.7156360149383545, + "learning_rate": 4.281574978169406e-05, + "loss": 0.9515, + "step": 5430 + }, + { + "epoch": 0.4318402826013614, + "grad_norm": 0.7044875621795654, + "learning_rate": 4.2802519118308596e-05, + "loss": 0.9199, + "step": 5440 + }, + { + "epoch": 0.432634106650261, + "grad_norm": 0.7540838122367859, + "learning_rate": 4.2789288454923135e-05, + "loss": 0.9229, + "step": 5450 + }, + { + "epoch": 0.4334279306991605, + "grad_norm": 0.6620701551437378, + "learning_rate": 4.2776057791537674e-05, + "loss": 0.8556, + "step": 5460 + }, + { + "epoch": 0.4342217547480601, + "grad_norm": 0.8868663907051086, + "learning_rate": 4.2762827128152206e-05, + "loss": 0.8666, + "step": 5470 + }, + { + "epoch": 0.4350155787969597, + "grad_norm": 0.7081343531608582, + "learning_rate": 4.2749596464766745e-05, + "loss": 0.8888, + "step": 5480 + }, + { + "epoch": 0.43580940284585923, + "grad_norm": 0.7983565926551819, + "learning_rate": 4.2736365801381284e-05, + "loss": 0.9041, + "step": 5490 + }, + { + "epoch": 0.4366032268947588, + "grad_norm": 0.7562633156776428, + "learning_rate": 4.272313513799582e-05, + "loss": 0.908, + "step": 5500 + }, + { + "epoch": 0.4373970509436583, + "grad_norm": 0.8235949873924255, + "learning_rate": 4.2709904474610355e-05, + "loss": 0.8907, + "step": 5510 + }, + { + "epoch": 0.4381908749925579, + "grad_norm": 0.6528277397155762, + "learning_rate": 4.2696673811224894e-05, + "loss": 0.8403, + "step": 5520 + }, + { + "epoch": 0.4389846990414575, + "grad_norm": 0.7162795662879944, + "learning_rate": 4.268344314783943e-05, + "loss": 0.9001, + "step": 5530 + }, + { + "epoch": 0.43977852309035703, + "grad_norm": 0.8113960027694702, + "learning_rate": 4.267021248445398e-05, + "loss": 0.8785, + "step": 5540 + }, + { + "epoch": 0.4405723471392566, + "grad_norm": 0.6791692972183228, + "learning_rate": 4.265698182106851e-05, + "loss": 0.8744, + "step": 5550 + }, + { + "epoch": 0.44136617118815613, + "grad_norm": 0.7448859214782715, + "learning_rate": 4.264375115768305e-05, + "loss": 0.9191, + "step": 5560 + }, + { + "epoch": 0.4421599952370557, + "grad_norm": 0.8218660950660706, + "learning_rate": 4.263052049429759e-05, + "loss": 0.8693, + "step": 5570 + }, + { + "epoch": 0.4429538192859553, + "grad_norm": 0.8838844299316406, + "learning_rate": 4.261728983091212e-05, + "loss": 0.8701, + "step": 5580 + }, + { + "epoch": 0.44374764333485484, + "grad_norm": 0.9519463181495667, + "learning_rate": 4.260405916752666e-05, + "loss": 0.8621, + "step": 5590 + }, + { + "epoch": 0.4445414673837544, + "grad_norm": 0.9154829382896423, + "learning_rate": 4.25908285041412e-05, + "loss": 0.8917, + "step": 5600 + }, + { + "epoch": 0.44533529143265393, + "grad_norm": 1.03704833984375, + "learning_rate": 4.257759784075574e-05, + "loss": 0.9156, + "step": 5610 + }, + { + "epoch": 0.44612911548155354, + "grad_norm": 0.7216574549674988, + "learning_rate": 4.2564367177370276e-05, + "loss": 0.8858, + "step": 5620 + }, + { + "epoch": 0.4469229395304531, + "grad_norm": 0.6714047789573669, + "learning_rate": 4.2551136513984815e-05, + "loss": 0.8379, + "step": 5630 + }, + { + "epoch": 0.44771676357935264, + "grad_norm": 0.9155183434486389, + "learning_rate": 4.2537905850599354e-05, + "loss": 0.871, + "step": 5640 + }, + { + "epoch": 0.4485105876282522, + "grad_norm": 0.7604344487190247, + "learning_rate": 4.252467518721389e-05, + "loss": 0.9569, + "step": 5650 + }, + { + "epoch": 0.44930441167715174, + "grad_norm": 0.6832132935523987, + "learning_rate": 4.2511444523828425e-05, + "loss": 0.9031, + "step": 5660 + }, + { + "epoch": 0.45009823572605134, + "grad_norm": 0.5372611880302429, + "learning_rate": 4.2498213860442964e-05, + "loss": 0.9364, + "step": 5670 + }, + { + "epoch": 0.4508920597749509, + "grad_norm": 0.8363433480262756, + "learning_rate": 4.24849831970575e-05, + "loss": 0.8708, + "step": 5680 + }, + { + "epoch": 0.45168588382385044, + "grad_norm": 0.7453399896621704, + "learning_rate": 4.2471752533672035e-05, + "loss": 0.8366, + "step": 5690 + }, + { + "epoch": 0.45247970787275, + "grad_norm": 0.9587568044662476, + "learning_rate": 4.2458521870286574e-05, + "loss": 0.884, + "step": 5700 + }, + { + "epoch": 0.45327353192164954, + "grad_norm": 0.9268891215324402, + "learning_rate": 4.244529120690112e-05, + "loss": 0.8884, + "step": 5710 + }, + { + "epoch": 0.45406735597054915, + "grad_norm": 0.7946332693099976, + "learning_rate": 4.243206054351566e-05, + "loss": 0.8686, + "step": 5720 + }, + { + "epoch": 0.4548611800194487, + "grad_norm": 0.7091253995895386, + "learning_rate": 4.241882988013019e-05, + "loss": 0.8849, + "step": 5730 + }, + { + "epoch": 0.45565500406834825, + "grad_norm": 0.7621028423309326, + "learning_rate": 4.240559921674473e-05, + "loss": 0.9082, + "step": 5740 + }, + { + "epoch": 0.4564488281172478, + "grad_norm": 0.8027592301368713, + "learning_rate": 4.239236855335927e-05, + "loss": 0.8613, + "step": 5750 + }, + { + "epoch": 0.4572426521661474, + "grad_norm": 0.7330886721611023, + "learning_rate": 4.237913788997381e-05, + "loss": 0.9076, + "step": 5760 + }, + { + "epoch": 0.45803647621504695, + "grad_norm": 0.9339755177497864, + "learning_rate": 4.236590722658834e-05, + "loss": 0.8422, + "step": 5770 + }, + { + "epoch": 0.4588303002639465, + "grad_norm": 0.810807466506958, + "learning_rate": 4.235267656320288e-05, + "loss": 1.0052, + "step": 5780 + }, + { + "epoch": 0.45962412431284605, + "grad_norm": 0.6460042595863342, + "learning_rate": 4.233944589981742e-05, + "loss": 0.9634, + "step": 5790 + }, + { + "epoch": 0.4604179483617456, + "grad_norm": 0.9872604012489319, + "learning_rate": 4.232621523643196e-05, + "loss": 0.8762, + "step": 5800 + }, + { + "epoch": 0.4612117724106452, + "grad_norm": 0.6482599973678589, + "learning_rate": 4.2312984573046496e-05, + "loss": 0.8439, + "step": 5810 + }, + { + "epoch": 0.46200559645954475, + "grad_norm": 0.7754231691360474, + "learning_rate": 4.2299753909661035e-05, + "loss": 0.9037, + "step": 5820 + }, + { + "epoch": 0.4627994205084443, + "grad_norm": 0.7624156475067139, + "learning_rate": 4.2286523246275573e-05, + "loss": 0.8971, + "step": 5830 + }, + { + "epoch": 0.46359324455734385, + "grad_norm": 0.8388280272483826, + "learning_rate": 4.2273292582890106e-05, + "loss": 0.8161, + "step": 5840 + }, + { + "epoch": 0.4643870686062434, + "grad_norm": 0.9448870420455933, + "learning_rate": 4.2260061919504645e-05, + "loss": 0.8949, + "step": 5850 + }, + { + "epoch": 0.465180892655143, + "grad_norm": 0.9757259488105774, + "learning_rate": 4.2246831256119183e-05, + "loss": 0.8765, + "step": 5860 + }, + { + "epoch": 0.46597471670404256, + "grad_norm": 0.7201075553894043, + "learning_rate": 4.223360059273372e-05, + "loss": 0.9155, + "step": 5870 + }, + { + "epoch": 0.4667685407529421, + "grad_norm": 0.9070501923561096, + "learning_rate": 4.222036992934826e-05, + "loss": 0.9414, + "step": 5880 + }, + { + "epoch": 0.46756236480184166, + "grad_norm": 0.6421061754226685, + "learning_rate": 4.22071392659628e-05, + "loss": 0.8891, + "step": 5890 + }, + { + "epoch": 0.4683561888507412, + "grad_norm": 0.957310140132904, + "learning_rate": 4.219390860257734e-05, + "loss": 0.9296, + "step": 5900 + }, + { + "epoch": 0.4691500128996408, + "grad_norm": 0.9540489315986633, + "learning_rate": 4.218067793919187e-05, + "loss": 0.8695, + "step": 5910 + }, + { + "epoch": 0.46994383694854036, + "grad_norm": 0.7519145011901855, + "learning_rate": 4.216744727580641e-05, + "loss": 0.874, + "step": 5920 + }, + { + "epoch": 0.4707376609974399, + "grad_norm": 0.6650322675704956, + "learning_rate": 4.215421661242095e-05, + "loss": 0.9093, + "step": 5930 + }, + { + "epoch": 0.47153148504633946, + "grad_norm": 0.6322752833366394, + "learning_rate": 4.214098594903549e-05, + "loss": 0.8887, + "step": 5940 + }, + { + "epoch": 0.47232530909523907, + "grad_norm": 0.8493393063545227, + "learning_rate": 4.212775528565002e-05, + "loss": 0.858, + "step": 5950 + }, + { + "epoch": 0.4731191331441386, + "grad_norm": 0.8725616931915283, + "learning_rate": 4.211452462226456e-05, + "loss": 0.8437, + "step": 5960 + }, + { + "epoch": 0.47391295719303816, + "grad_norm": 1.01144540309906, + "learning_rate": 4.21012939588791e-05, + "loss": 0.8824, + "step": 5970 + }, + { + "epoch": 0.4747067812419377, + "grad_norm": 0.8524407744407654, + "learning_rate": 4.2088063295493644e-05, + "loss": 0.8566, + "step": 5980 + }, + { + "epoch": 0.47550060529083726, + "grad_norm": 1.0122267007827759, + "learning_rate": 4.2074832632108176e-05, + "loss": 0.8802, + "step": 5990 + }, + { + "epoch": 0.47629442933973687, + "grad_norm": 0.930794894695282, + "learning_rate": 4.2061601968722715e-05, + "loss": 0.8751, + "step": 6000 + }, + { + "epoch": 0.4770882533886364, + "grad_norm": 1.0592617988586426, + "learning_rate": 4.2048371305337254e-05, + "loss": 0.8464, + "step": 6010 + }, + { + "epoch": 0.47788207743753597, + "grad_norm": 0.7701418995857239, + "learning_rate": 4.2035140641951786e-05, + "loss": 0.8491, + "step": 6020 + }, + { + "epoch": 0.4786759014864355, + "grad_norm": 0.7711431384086609, + "learning_rate": 4.2021909978566325e-05, + "loss": 0.8827, + "step": 6030 + }, + { + "epoch": 0.47946972553533507, + "grad_norm": 0.5670927166938782, + "learning_rate": 4.2008679315180864e-05, + "loss": 0.911, + "step": 6040 + }, + { + "epoch": 0.4802635495842347, + "grad_norm": 0.8850795030593872, + "learning_rate": 4.19954486517954e-05, + "loss": 0.8703, + "step": 6050 + }, + { + "epoch": 0.4810573736331342, + "grad_norm": 0.7688451409339905, + "learning_rate": 4.198221798840994e-05, + "loss": 0.8591, + "step": 6060 + }, + { + "epoch": 0.48185119768203377, + "grad_norm": 0.7370510101318359, + "learning_rate": 4.196898732502448e-05, + "loss": 0.8814, + "step": 6070 + }, + { + "epoch": 0.4826450217309333, + "grad_norm": 0.8961367607116699, + "learning_rate": 4.195575666163902e-05, + "loss": 0.9584, + "step": 6080 + }, + { + "epoch": 0.4834388457798329, + "grad_norm": 0.547537624835968, + "learning_rate": 4.194252599825356e-05, + "loss": 0.8956, + "step": 6090 + }, + { + "epoch": 0.4842326698287325, + "grad_norm": 0.7516084909439087, + "learning_rate": 4.192929533486809e-05, + "loss": 0.8988, + "step": 6100 + }, + { + "epoch": 0.485026493877632, + "grad_norm": 0.856939971446991, + "learning_rate": 4.191606467148263e-05, + "loss": 0.9231, + "step": 6110 + }, + { + "epoch": 0.4858203179265316, + "grad_norm": 0.8620836734771729, + "learning_rate": 4.190283400809717e-05, + "loss": 0.8834, + "step": 6120 + }, + { + "epoch": 0.4866141419754311, + "grad_norm": 0.7581316828727722, + "learning_rate": 4.18896033447117e-05, + "loss": 0.8127, + "step": 6130 + }, + { + "epoch": 0.48740796602433073, + "grad_norm": 0.6257145404815674, + "learning_rate": 4.187637268132624e-05, + "loss": 0.8563, + "step": 6140 + }, + { + "epoch": 0.4882017900732303, + "grad_norm": 0.8719417452812195, + "learning_rate": 4.186314201794078e-05, + "loss": 0.9042, + "step": 6150 + }, + { + "epoch": 0.48899561412212983, + "grad_norm": 0.8335233926773071, + "learning_rate": 4.1849911354555324e-05, + "loss": 0.9324, + "step": 6160 + }, + { + "epoch": 0.4897894381710294, + "grad_norm": 0.8433477878570557, + "learning_rate": 4.1836680691169856e-05, + "loss": 0.8667, + "step": 6170 + }, + { + "epoch": 0.49058326221992893, + "grad_norm": 0.5534406900405884, + "learning_rate": 4.1823450027784395e-05, + "loss": 0.8776, + "step": 6180 + }, + { + "epoch": 0.49137708626882853, + "grad_norm": 0.6916033029556274, + "learning_rate": 4.1810219364398934e-05, + "loss": 0.8879, + "step": 6190 + }, + { + "epoch": 0.4921709103177281, + "grad_norm": 1.0233070850372314, + "learning_rate": 4.179698870101347e-05, + "loss": 0.915, + "step": 6200 + }, + { + "epoch": 0.49296473436662763, + "grad_norm": 0.8879387974739075, + "learning_rate": 4.1783758037628005e-05, + "loss": 0.8883, + "step": 6210 + }, + { + "epoch": 0.4937585584155272, + "grad_norm": 0.8336936235427856, + "learning_rate": 4.1770527374242544e-05, + "loss": 0.8417, + "step": 6220 + }, + { + "epoch": 0.4945523824644268, + "grad_norm": 0.8911527395248413, + "learning_rate": 4.175729671085708e-05, + "loss": 0.7989, + "step": 6230 + }, + { + "epoch": 0.49534620651332634, + "grad_norm": 0.5336394906044006, + "learning_rate": 4.174406604747162e-05, + "loss": 0.9055, + "step": 6240 + }, + { + "epoch": 0.4961400305622259, + "grad_norm": 0.8444200158119202, + "learning_rate": 4.173083538408616e-05, + "loss": 0.8582, + "step": 6250 + }, + { + "epoch": 0.49693385461112544, + "grad_norm": 0.9549778699874878, + "learning_rate": 4.17176047207007e-05, + "loss": 0.9263, + "step": 6260 + }, + { + "epoch": 0.497727678660025, + "grad_norm": 0.8041070699691772, + "learning_rate": 4.170437405731524e-05, + "loss": 0.8669, + "step": 6270 + }, + { + "epoch": 0.4985215027089246, + "grad_norm": 0.7747896313667297, + "learning_rate": 4.169114339392977e-05, + "loss": 0.8525, + "step": 6280 + }, + { + "epoch": 0.49931532675782414, + "grad_norm": 0.6687043309211731, + "learning_rate": 4.167791273054431e-05, + "loss": 0.9159, + "step": 6290 + }, + { + "epoch": 0.5001091508067237, + "grad_norm": 0.725878119468689, + "learning_rate": 4.166468206715885e-05, + "loss": 0.9252, + "step": 6300 + }, + { + "epoch": 0.5009029748556233, + "grad_norm": 0.9067284464836121, + "learning_rate": 4.165145140377339e-05, + "loss": 0.8905, + "step": 6310 + }, + { + "epoch": 0.5016967989045228, + "grad_norm": 0.8240019083023071, + "learning_rate": 4.163822074038792e-05, + "loss": 0.8641, + "step": 6320 + }, + { + "epoch": 0.5024906229534224, + "grad_norm": 0.7333963513374329, + "learning_rate": 4.1624990077002466e-05, + "loss": 0.8823, + "step": 6330 + }, + { + "epoch": 0.5032844470023219, + "grad_norm": 1.1056770086288452, + "learning_rate": 4.1611759413617005e-05, + "loss": 0.8817, + "step": 6340 + }, + { + "epoch": 0.5040782710512215, + "grad_norm": 0.8191234469413757, + "learning_rate": 4.1598528750231543e-05, + "loss": 0.8986, + "step": 6350 + }, + { + "epoch": 0.5048720951001211, + "grad_norm": 0.6650758981704712, + "learning_rate": 4.1585298086846076e-05, + "loss": 0.885, + "step": 6360 + }, + { + "epoch": 0.5056659191490206, + "grad_norm": 0.8753699064254761, + "learning_rate": 4.1572067423460615e-05, + "loss": 0.8402, + "step": 6370 + }, + { + "epoch": 0.5064597431979202, + "grad_norm": 0.8924400806427002, + "learning_rate": 4.1558836760075153e-05, + "loss": 0.9016, + "step": 6380 + }, + { + "epoch": 0.5072535672468197, + "grad_norm": 0.7312005758285522, + "learning_rate": 4.1545606096689686e-05, + "loss": 0.914, + "step": 6390 + }, + { + "epoch": 0.5080473912957193, + "grad_norm": 0.771732747554779, + "learning_rate": 4.1532375433304225e-05, + "loss": 0.8976, + "step": 6400 + }, + { + "epoch": 0.5088412153446189, + "grad_norm": 0.7958862781524658, + "learning_rate": 4.1519144769918763e-05, + "loss": 0.8749, + "step": 6410 + }, + { + "epoch": 0.5096350393935184, + "grad_norm": 0.751964271068573, + "learning_rate": 4.15059141065333e-05, + "loss": 0.852, + "step": 6420 + }, + { + "epoch": 0.510428863442418, + "grad_norm": 0.721443772315979, + "learning_rate": 4.149268344314784e-05, + "loss": 0.8887, + "step": 6430 + }, + { + "epoch": 0.5112226874913175, + "grad_norm": 0.6288412809371948, + "learning_rate": 4.147945277976238e-05, + "loss": 0.8488, + "step": 6440 + }, + { + "epoch": 0.5120165115402171, + "grad_norm": 0.7060637474060059, + "learning_rate": 4.146622211637692e-05, + "loss": 0.8506, + "step": 6450 + }, + { + "epoch": 0.5128103355891167, + "grad_norm": 0.5948452949523926, + "learning_rate": 4.145299145299146e-05, + "loss": 0.8759, + "step": 6460 + }, + { + "epoch": 0.5136041596380162, + "grad_norm": 0.7204840183258057, + "learning_rate": 4.143976078960599e-05, + "loss": 0.9107, + "step": 6470 + }, + { + "epoch": 0.5143979836869158, + "grad_norm": 0.9579210877418518, + "learning_rate": 4.142653012622053e-05, + "loss": 0.7921, + "step": 6480 + }, + { + "epoch": 0.5151918077358154, + "grad_norm": 0.8127973079681396, + "learning_rate": 4.141329946283507e-05, + "loss": 0.8468, + "step": 6490 + }, + { + "epoch": 0.5159856317847149, + "grad_norm": 0.6830429434776306, + "learning_rate": 4.140006879944961e-05, + "loss": 0.9339, + "step": 6500 + }, + { + "epoch": 0.5167794558336145, + "grad_norm": 0.8261887431144714, + "learning_rate": 4.1386838136064146e-05, + "loss": 0.9211, + "step": 6510 + }, + { + "epoch": 0.517573279882514, + "grad_norm": 0.803862988948822, + "learning_rate": 4.1373607472678685e-05, + "loss": 0.883, + "step": 6520 + }, + { + "epoch": 0.5183671039314136, + "grad_norm": 0.971078097820282, + "learning_rate": 4.1360376809293224e-05, + "loss": 0.8913, + "step": 6530 + }, + { + "epoch": 0.5191609279803132, + "grad_norm": 0.8098673224449158, + "learning_rate": 4.1347146145907756e-05, + "loss": 0.9281, + "step": 6540 + }, + { + "epoch": 0.5199547520292127, + "grad_norm": 0.9599949717521667, + "learning_rate": 4.1333915482522295e-05, + "loss": 0.8459, + "step": 6550 + }, + { + "epoch": 0.5207485760781123, + "grad_norm": 0.7749876379966736, + "learning_rate": 4.1320684819136834e-05, + "loss": 0.8717, + "step": 6560 + }, + { + "epoch": 0.5215424001270118, + "grad_norm": 0.9038444757461548, + "learning_rate": 4.130745415575137e-05, + "loss": 0.8716, + "step": 6570 + }, + { + "epoch": 0.5223362241759114, + "grad_norm": 0.9091252088546753, + "learning_rate": 4.1294223492365905e-05, + "loss": 0.8709, + "step": 6580 + }, + { + "epoch": 0.523130048224811, + "grad_norm": 0.8427479863166809, + "learning_rate": 4.1280992828980444e-05, + "loss": 0.905, + "step": 6590 + }, + { + "epoch": 0.5239238722737105, + "grad_norm": 0.6537529230117798, + "learning_rate": 4.126776216559499e-05, + "loss": 0.8252, + "step": 6600 + }, + { + "epoch": 0.5247176963226101, + "grad_norm": 0.7358630895614624, + "learning_rate": 4.125453150220952e-05, + "loss": 0.9075, + "step": 6610 + }, + { + "epoch": 0.5255115203715096, + "grad_norm": 0.9415682554244995, + "learning_rate": 4.124130083882406e-05, + "loss": 0.8233, + "step": 6620 + }, + { + "epoch": 0.5263053444204092, + "grad_norm": 0.790571928024292, + "learning_rate": 4.12280701754386e-05, + "loss": 0.9097, + "step": 6630 + }, + { + "epoch": 0.5270991684693088, + "grad_norm": 0.9315680861473083, + "learning_rate": 4.121483951205314e-05, + "loss": 0.8962, + "step": 6640 + }, + { + "epoch": 0.5278929925182083, + "grad_norm": 0.8426125645637512, + "learning_rate": 4.120160884866767e-05, + "loss": 0.8931, + "step": 6650 + }, + { + "epoch": 0.5286868165671079, + "grad_norm": 0.7593029141426086, + "learning_rate": 4.118837818528221e-05, + "loss": 0.8584, + "step": 6660 + }, + { + "epoch": 0.5294806406160074, + "grad_norm": 0.719986617565155, + "learning_rate": 4.117514752189675e-05, + "loss": 0.9337, + "step": 6670 + }, + { + "epoch": 0.530274464664907, + "grad_norm": 0.8458060026168823, + "learning_rate": 4.116191685851129e-05, + "loss": 0.8355, + "step": 6680 + }, + { + "epoch": 0.5310682887138066, + "grad_norm": 0.7757362127304077, + "learning_rate": 4.1148686195125826e-05, + "loss": 0.897, + "step": 6690 + }, + { + "epoch": 0.5318621127627061, + "grad_norm": 0.7549418807029724, + "learning_rate": 4.1135455531740365e-05, + "loss": 0.8864, + "step": 6700 + }, + { + "epoch": 0.5326559368116057, + "grad_norm": 0.8752275705337524, + "learning_rate": 4.1122224868354904e-05, + "loss": 0.902, + "step": 6710 + }, + { + "epoch": 0.5334497608605052, + "grad_norm": 0.901665985584259, + "learning_rate": 4.1108994204969436e-05, + "loss": 0.8892, + "step": 6720 + }, + { + "epoch": 0.5342435849094048, + "grad_norm": 0.8543225526809692, + "learning_rate": 4.1095763541583975e-05, + "loss": 0.8923, + "step": 6730 + }, + { + "epoch": 0.5350374089583044, + "grad_norm": 0.8486155867576599, + "learning_rate": 4.1082532878198514e-05, + "loss": 0.9043, + "step": 6740 + }, + { + "epoch": 0.5358312330072039, + "grad_norm": 0.869857668876648, + "learning_rate": 4.106930221481305e-05, + "loss": 0.8406, + "step": 6750 + }, + { + "epoch": 0.5366250570561035, + "grad_norm": 0.9113563299179077, + "learning_rate": 4.1056071551427585e-05, + "loss": 0.9667, + "step": 6760 + }, + { + "epoch": 0.537418881105003, + "grad_norm": 0.9034312963485718, + "learning_rate": 4.104284088804213e-05, + "loss": 0.8792, + "step": 6770 + }, + { + "epoch": 0.5382127051539026, + "grad_norm": 0.8775055408477783, + "learning_rate": 4.102961022465667e-05, + "loss": 0.8938, + "step": 6780 + }, + { + "epoch": 0.5390065292028022, + "grad_norm": 0.8911844491958618, + "learning_rate": 4.101637956127121e-05, + "loss": 0.8922, + "step": 6790 + }, + { + "epoch": 0.5398003532517017, + "grad_norm": 0.9125531315803528, + "learning_rate": 4.100314889788574e-05, + "loss": 0.9208, + "step": 6800 + }, + { + "epoch": 0.5405941773006013, + "grad_norm": 0.8507636189460754, + "learning_rate": 4.098991823450028e-05, + "loss": 0.8493, + "step": 6810 + }, + { + "epoch": 0.5413880013495009, + "grad_norm": 0.9445191621780396, + "learning_rate": 4.097668757111482e-05, + "loss": 0.8364, + "step": 6820 + }, + { + "epoch": 0.5421818253984004, + "grad_norm": 0.6830033659934998, + "learning_rate": 4.096345690772935e-05, + "loss": 0.8302, + "step": 6830 + }, + { + "epoch": 0.5429756494473, + "grad_norm": 0.9439083337783813, + "learning_rate": 4.095022624434389e-05, + "loss": 0.8545, + "step": 6840 + }, + { + "epoch": 0.5437694734961995, + "grad_norm": 0.8083987236022949, + "learning_rate": 4.093699558095843e-05, + "loss": 0.9273, + "step": 6850 + }, + { + "epoch": 0.5445632975450991, + "grad_norm": 0.7800227403640747, + "learning_rate": 4.092376491757297e-05, + "loss": 0.8564, + "step": 6860 + }, + { + "epoch": 0.5453571215939987, + "grad_norm": 0.7036982178688049, + "learning_rate": 4.091053425418751e-05, + "loss": 0.8236, + "step": 6870 + }, + { + "epoch": 0.5461509456428982, + "grad_norm": 0.6380200982093811, + "learning_rate": 4.0897303590802046e-05, + "loss": 0.8749, + "step": 6880 + }, + { + "epoch": 0.5469447696917978, + "grad_norm": 0.9935095906257629, + "learning_rate": 4.0884072927416585e-05, + "loss": 0.8536, + "step": 6890 + }, + { + "epoch": 0.5477385937406973, + "grad_norm": 0.9212964773178101, + "learning_rate": 4.0870842264031123e-05, + "loss": 0.88, + "step": 6900 + }, + { + "epoch": 0.5485324177895969, + "grad_norm": 0.7856978178024292, + "learning_rate": 4.0857611600645656e-05, + "loss": 0.9521, + "step": 6910 + }, + { + "epoch": 0.5493262418384965, + "grad_norm": 0.8367793560028076, + "learning_rate": 4.0844380937260195e-05, + "loss": 0.8705, + "step": 6920 + }, + { + "epoch": 0.550120065887396, + "grad_norm": 0.735126256942749, + "learning_rate": 4.0831150273874733e-05, + "loss": 0.9276, + "step": 6930 + }, + { + "epoch": 0.5509138899362956, + "grad_norm": 0.7849065065383911, + "learning_rate": 4.081791961048927e-05, + "loss": 0.8424, + "step": 6940 + }, + { + "epoch": 0.5517077139851951, + "grad_norm": 0.9133653044700623, + "learning_rate": 4.080468894710381e-05, + "loss": 0.8708, + "step": 6950 + }, + { + "epoch": 0.5525015380340947, + "grad_norm": 0.6829420924186707, + "learning_rate": 4.079145828371835e-05, + "loss": 0.8696, + "step": 6960 + }, + { + "epoch": 0.5532953620829943, + "grad_norm": 0.8633001446723938, + "learning_rate": 4.077822762033289e-05, + "loss": 0.9033, + "step": 6970 + }, + { + "epoch": 0.5540891861318938, + "grad_norm": 0.6927655935287476, + "learning_rate": 4.076499695694742e-05, + "loss": 0.8687, + "step": 6980 + }, + { + "epoch": 0.5548830101807934, + "grad_norm": 0.9020519852638245, + "learning_rate": 4.075176629356196e-05, + "loss": 0.8289, + "step": 6990 + }, + { + "epoch": 0.5556768342296929, + "grad_norm": 0.8436912894248962, + "learning_rate": 4.07385356301765e-05, + "loss": 0.8881, + "step": 7000 + }, + { + "epoch": 0.5564706582785925, + "grad_norm": 0.7371892333030701, + "learning_rate": 4.072530496679104e-05, + "loss": 0.8995, + "step": 7010 + }, + { + "epoch": 0.5572644823274921, + "grad_norm": 0.8149321675300598, + "learning_rate": 4.071207430340557e-05, + "loss": 0.9026, + "step": 7020 + }, + { + "epoch": 0.5580583063763916, + "grad_norm": 0.6326161026954651, + "learning_rate": 4.069884364002011e-05, + "loss": 0.8795, + "step": 7030 + }, + { + "epoch": 0.5588521304252912, + "grad_norm": 0.8339284062385559, + "learning_rate": 4.0685612976634655e-05, + "loss": 0.8199, + "step": 7040 + }, + { + "epoch": 0.5596459544741907, + "grad_norm": 0.7637563347816467, + "learning_rate": 4.0672382313249194e-05, + "loss": 0.8635, + "step": 7050 + }, + { + "epoch": 0.5604397785230903, + "grad_norm": 0.8212976455688477, + "learning_rate": 4.0659151649863726e-05, + "loss": 0.8681, + "step": 7060 + }, + { + "epoch": 0.56123360257199, + "grad_norm": 0.7294253706932068, + "learning_rate": 4.0645920986478265e-05, + "loss": 0.8807, + "step": 7070 + }, + { + "epoch": 0.5620274266208894, + "grad_norm": 0.7334606051445007, + "learning_rate": 4.0632690323092804e-05, + "loss": 0.8732, + "step": 7080 + }, + { + "epoch": 0.562821250669789, + "grad_norm": 0.8146048188209534, + "learning_rate": 4.0619459659707336e-05, + "loss": 0.8941, + "step": 7090 + }, + { + "epoch": 0.5636150747186887, + "grad_norm": 0.8506765365600586, + "learning_rate": 4.0606228996321875e-05, + "loss": 0.8885, + "step": 7100 + }, + { + "epoch": 0.5644088987675882, + "grad_norm": 1.0465277433395386, + "learning_rate": 4.0592998332936414e-05, + "loss": 0.877, + "step": 7110 + }, + { + "epoch": 0.5652027228164878, + "grad_norm": 1.0059021711349487, + "learning_rate": 4.057976766955095e-05, + "loss": 0.9511, + "step": 7120 + }, + { + "epoch": 0.5659965468653873, + "grad_norm": 0.7272640466690063, + "learning_rate": 4.056653700616549e-05, + "loss": 0.8562, + "step": 7130 + }, + { + "epoch": 0.5667903709142869, + "grad_norm": 0.6533452868461609, + "learning_rate": 4.055330634278003e-05, + "loss": 0.8402, + "step": 7140 + }, + { + "epoch": 0.5675841949631865, + "grad_norm": 0.8188508152961731, + "learning_rate": 4.054007567939457e-05, + "loss": 0.8589, + "step": 7150 + }, + { + "epoch": 0.568378019012086, + "grad_norm": 0.8505097031593323, + "learning_rate": 4.052684501600911e-05, + "loss": 0.8614, + "step": 7160 + }, + { + "epoch": 0.5691718430609856, + "grad_norm": 0.7959850430488586, + "learning_rate": 4.051361435262364e-05, + "loss": 0.8628, + "step": 7170 + }, + { + "epoch": 0.569965667109885, + "grad_norm": 0.9886569976806641, + "learning_rate": 4.050038368923818e-05, + "loss": 0.8947, + "step": 7180 + }, + { + "epoch": 0.5707594911587847, + "grad_norm": 0.8581879138946533, + "learning_rate": 4.048715302585272e-05, + "loss": 0.8619, + "step": 7190 + }, + { + "epoch": 0.5715533152076843, + "grad_norm": 0.9623154401779175, + "learning_rate": 4.047392236246725e-05, + "loss": 0.8252, + "step": 7200 + }, + { + "epoch": 0.5723471392565838, + "grad_norm": 0.8627947568893433, + "learning_rate": 4.0460691699081796e-05, + "loss": 0.8785, + "step": 7210 + }, + { + "epoch": 0.5731409633054834, + "grad_norm": 0.7813371419906616, + "learning_rate": 4.0447461035696335e-05, + "loss": 0.9171, + "step": 7220 + }, + { + "epoch": 0.5739347873543829, + "grad_norm": 1.0067614316940308, + "learning_rate": 4.0434230372310874e-05, + "loss": 0.836, + "step": 7230 + }, + { + "epoch": 0.5747286114032825, + "grad_norm": 0.6856113076210022, + "learning_rate": 4.0420999708925406e-05, + "loss": 0.9001, + "step": 7240 + }, + { + "epoch": 0.5755224354521821, + "grad_norm": 1.0112297534942627, + "learning_rate": 4.0407769045539945e-05, + "loss": 0.8726, + "step": 7250 + }, + { + "epoch": 0.5763162595010816, + "grad_norm": 0.6633104085922241, + "learning_rate": 4.0394538382154484e-05, + "loss": 0.8487, + "step": 7260 + }, + { + "epoch": 0.5771100835499812, + "grad_norm": 0.7768102884292603, + "learning_rate": 4.038130771876902e-05, + "loss": 0.8202, + "step": 7270 + }, + { + "epoch": 0.5779039075988807, + "grad_norm": 0.7367390394210815, + "learning_rate": 4.0368077055383555e-05, + "loss": 0.9207, + "step": 7280 + }, + { + "epoch": 0.5786977316477803, + "grad_norm": 0.9495264887809753, + "learning_rate": 4.0354846391998094e-05, + "loss": 0.9026, + "step": 7290 + }, + { + "epoch": 0.5794915556966799, + "grad_norm": 0.880790650844574, + "learning_rate": 4.034161572861263e-05, + "loss": 0.8974, + "step": 7300 + }, + { + "epoch": 0.5802853797455794, + "grad_norm": 0.8407362103462219, + "learning_rate": 4.032838506522717e-05, + "loss": 0.8189, + "step": 7310 + }, + { + "epoch": 0.581079203794479, + "grad_norm": 0.6734247207641602, + "learning_rate": 4.031515440184171e-05, + "loss": 0.9121, + "step": 7320 + }, + { + "epoch": 0.5818730278433785, + "grad_norm": 0.965093195438385, + "learning_rate": 4.030192373845625e-05, + "loss": 0.8464, + "step": 7330 + }, + { + "epoch": 0.5826668518922781, + "grad_norm": 0.7038446068763733, + "learning_rate": 4.028869307507079e-05, + "loss": 0.9442, + "step": 7340 + }, + { + "epoch": 0.5834606759411777, + "grad_norm": 0.6789405345916748, + "learning_rate": 4.027546241168532e-05, + "loss": 0.9112, + "step": 7350 + }, + { + "epoch": 0.5842544999900772, + "grad_norm": 0.8215457797050476, + "learning_rate": 4.026223174829986e-05, + "loss": 0.891, + "step": 7360 + }, + { + "epoch": 0.5850483240389768, + "grad_norm": 0.8473328948020935, + "learning_rate": 4.02490010849144e-05, + "loss": 0.8637, + "step": 7370 + }, + { + "epoch": 0.5858421480878763, + "grad_norm": 0.7100654244422913, + "learning_rate": 4.023577042152894e-05, + "loss": 0.9027, + "step": 7380 + }, + { + "epoch": 0.5866359721367759, + "grad_norm": 0.8264563083648682, + "learning_rate": 4.022253975814348e-05, + "loss": 0.8289, + "step": 7390 + }, + { + "epoch": 0.5874297961856755, + "grad_norm": 0.595227062702179, + "learning_rate": 4.0209309094758016e-05, + "loss": 0.8588, + "step": 7400 + }, + { + "epoch": 0.588223620234575, + "grad_norm": 0.8084173202514648, + "learning_rate": 4.0196078431372555e-05, + "loss": 0.9038, + "step": 7410 + }, + { + "epoch": 0.5890174442834746, + "grad_norm": 0.8075309991836548, + "learning_rate": 4.018284776798709e-05, + "loss": 0.9102, + "step": 7420 + }, + { + "epoch": 0.5898112683323742, + "grad_norm": 0.5747039318084717, + "learning_rate": 4.0169617104601626e-05, + "loss": 0.8329, + "step": 7430 + }, + { + "epoch": 0.5906050923812737, + "grad_norm": 0.943231463432312, + "learning_rate": 4.0156386441216165e-05, + "loss": 0.8205, + "step": 7440 + }, + { + "epoch": 0.5913989164301733, + "grad_norm": 0.8682481050491333, + "learning_rate": 4.0143155777830703e-05, + "loss": 0.862, + "step": 7450 + }, + { + "epoch": 0.5921927404790728, + "grad_norm": 0.6591200232505798, + "learning_rate": 4.0129925114445236e-05, + "loss": 0.8736, + "step": 7460 + }, + { + "epoch": 0.5929865645279724, + "grad_norm": 0.7325188517570496, + "learning_rate": 4.0116694451059775e-05, + "loss": 0.8911, + "step": 7470 + }, + { + "epoch": 0.593780388576872, + "grad_norm": 0.5145597457885742, + "learning_rate": 4.010346378767432e-05, + "loss": 0.8992, + "step": 7480 + }, + { + "epoch": 0.5945742126257715, + "grad_norm": 0.6899420022964478, + "learning_rate": 4.009023312428886e-05, + "loss": 0.8651, + "step": 7490 + }, + { + "epoch": 0.5953680366746711, + "grad_norm": 0.8163533806800842, + "learning_rate": 4.007700246090339e-05, + "loss": 0.8571, + "step": 7500 + }, + { + "epoch": 0.5961618607235706, + "grad_norm": 0.8414540886878967, + "learning_rate": 4.006377179751793e-05, + "loss": 0.8866, + "step": 7510 + }, + { + "epoch": 0.5969556847724702, + "grad_norm": 0.9524148106575012, + "learning_rate": 4.005054113413247e-05, + "loss": 0.871, + "step": 7520 + }, + { + "epoch": 0.5977495088213698, + "grad_norm": 0.8480884432792664, + "learning_rate": 4.0037310470747e-05, + "loss": 0.8807, + "step": 7530 + }, + { + "epoch": 0.5985433328702693, + "grad_norm": 0.9701437950134277, + "learning_rate": 4.002407980736154e-05, + "loss": 0.8092, + "step": 7540 + }, + { + "epoch": 0.5993371569191689, + "grad_norm": 0.9509230256080627, + "learning_rate": 4.001084914397608e-05, + "loss": 0.9041, + "step": 7550 + }, + { + "epoch": 0.6001309809680684, + "grad_norm": 0.6511445045471191, + "learning_rate": 3.999761848059062e-05, + "loss": 0.8408, + "step": 7560 + }, + { + "epoch": 0.600924805016968, + "grad_norm": 0.7083035111427307, + "learning_rate": 3.998438781720516e-05, + "loss": 0.8712, + "step": 7570 + }, + { + "epoch": 0.6017186290658676, + "grad_norm": 0.8894531726837158, + "learning_rate": 3.9971157153819696e-05, + "loss": 0.8662, + "step": 7580 + }, + { + "epoch": 0.6025124531147671, + "grad_norm": 0.7354892492294312, + "learning_rate": 3.9957926490434235e-05, + "loss": 0.7996, + "step": 7590 + }, + { + "epoch": 0.6033062771636667, + "grad_norm": 0.8339380025863647, + "learning_rate": 3.9944695827048774e-05, + "loss": 0.8971, + "step": 7600 + }, + { + "epoch": 0.6041001012125662, + "grad_norm": 0.7149907350540161, + "learning_rate": 3.9931465163663306e-05, + "loss": 0.8674, + "step": 7610 + }, + { + "epoch": 0.6048939252614658, + "grad_norm": 0.6684656739234924, + "learning_rate": 3.9918234500277845e-05, + "loss": 0.8449, + "step": 7620 + }, + { + "epoch": 0.6056877493103654, + "grad_norm": 0.803089439868927, + "learning_rate": 3.9905003836892384e-05, + "loss": 0.8952, + "step": 7630 + }, + { + "epoch": 0.6064815733592649, + "grad_norm": 0.7551273107528687, + "learning_rate": 3.9891773173506916e-05, + "loss": 0.8461, + "step": 7640 + }, + { + "epoch": 0.6072753974081645, + "grad_norm": 0.6433910131454468, + "learning_rate": 3.9878542510121455e-05, + "loss": 0.8795, + "step": 7650 + }, + { + "epoch": 0.608069221457064, + "grad_norm": 0.73484867811203, + "learning_rate": 3.9865311846736e-05, + "loss": 0.8916, + "step": 7660 + }, + { + "epoch": 0.6088630455059636, + "grad_norm": 0.8747826218605042, + "learning_rate": 3.985208118335054e-05, + "loss": 0.8803, + "step": 7670 + }, + { + "epoch": 0.6096568695548632, + "grad_norm": 0.9112239480018616, + "learning_rate": 3.983885051996507e-05, + "loss": 0.9397, + "step": 7680 + }, + { + "epoch": 0.6104506936037627, + "grad_norm": 0.8457674384117126, + "learning_rate": 3.982561985657961e-05, + "loss": 0.8451, + "step": 7690 + }, + { + "epoch": 0.6112445176526623, + "grad_norm": 0.7593239545822144, + "learning_rate": 3.981238919319415e-05, + "loss": 0.9272, + "step": 7700 + }, + { + "epoch": 0.6120383417015618, + "grad_norm": 0.7265938520431519, + "learning_rate": 3.979915852980869e-05, + "loss": 0.9268, + "step": 7710 + }, + { + "epoch": 0.6128321657504614, + "grad_norm": 0.7465494871139526, + "learning_rate": 3.978592786642322e-05, + "loss": 0.872, + "step": 7720 + }, + { + "epoch": 0.613625989799361, + "grad_norm": 0.7977067828178406, + "learning_rate": 3.977269720303776e-05, + "loss": 0.8586, + "step": 7730 + }, + { + "epoch": 0.6144198138482605, + "grad_norm": 0.76861572265625, + "learning_rate": 3.97594665396523e-05, + "loss": 0.9187, + "step": 7740 + }, + { + "epoch": 0.6152136378971601, + "grad_norm": 0.9571815133094788, + "learning_rate": 3.974623587626684e-05, + "loss": 0.835, + "step": 7750 + }, + { + "epoch": 0.6160074619460597, + "grad_norm": 0.7097697257995605, + "learning_rate": 3.9733005212881376e-05, + "loss": 0.8519, + "step": 7760 + }, + { + "epoch": 0.6168012859949592, + "grad_norm": 0.6047300696372986, + "learning_rate": 3.9719774549495915e-05, + "loss": 0.8864, + "step": 7770 + }, + { + "epoch": 0.6175951100438588, + "grad_norm": 0.7774935364723206, + "learning_rate": 3.9706543886110454e-05, + "loss": 0.8528, + "step": 7780 + }, + { + "epoch": 0.6183889340927583, + "grad_norm": 0.7351526021957397, + "learning_rate": 3.9693313222724986e-05, + "loss": 0.8548, + "step": 7790 + }, + { + "epoch": 0.6191827581416579, + "grad_norm": 0.8887933492660522, + "learning_rate": 3.9680082559339525e-05, + "loss": 0.9191, + "step": 7800 + }, + { + "epoch": 0.6199765821905575, + "grad_norm": 0.6193240284919739, + "learning_rate": 3.9666851895954064e-05, + "loss": 0.8906, + "step": 7810 + }, + { + "epoch": 0.620770406239457, + "grad_norm": 1.0305780172348022, + "learning_rate": 3.96536212325686e-05, + "loss": 0.9161, + "step": 7820 + }, + { + "epoch": 0.6215642302883566, + "grad_norm": 0.9303094744682312, + "learning_rate": 3.964039056918314e-05, + "loss": 0.923, + "step": 7830 + }, + { + "epoch": 0.6223580543372561, + "grad_norm": 0.7105975151062012, + "learning_rate": 3.962715990579768e-05, + "loss": 0.8918, + "step": 7840 + }, + { + "epoch": 0.6231518783861557, + "grad_norm": 0.7743216753005981, + "learning_rate": 3.961392924241222e-05, + "loss": 0.914, + "step": 7850 + }, + { + "epoch": 0.6239457024350553, + "grad_norm": 1.069062352180481, + "learning_rate": 3.960069857902676e-05, + "loss": 0.9209, + "step": 7860 + }, + { + "epoch": 0.6247395264839548, + "grad_norm": 0.8283140659332275, + "learning_rate": 3.958746791564129e-05, + "loss": 0.8587, + "step": 7870 + }, + { + "epoch": 0.6255333505328544, + "grad_norm": 0.6875911355018616, + "learning_rate": 3.957423725225583e-05, + "loss": 0.8845, + "step": 7880 + }, + { + "epoch": 0.6263271745817539, + "grad_norm": 0.8538176417350769, + "learning_rate": 3.956100658887037e-05, + "loss": 0.8435, + "step": 7890 + }, + { + "epoch": 0.6271209986306535, + "grad_norm": 0.6891659498214722, + "learning_rate": 3.95477759254849e-05, + "loss": 0.8122, + "step": 7900 + }, + { + "epoch": 0.6279148226795531, + "grad_norm": 0.6774616837501526, + "learning_rate": 3.953454526209944e-05, + "loss": 0.9223, + "step": 7910 + }, + { + "epoch": 0.6287086467284526, + "grad_norm": 0.6957716345787048, + "learning_rate": 3.952131459871398e-05, + "loss": 0.9055, + "step": 7920 + }, + { + "epoch": 0.6295024707773522, + "grad_norm": 0.7864802479743958, + "learning_rate": 3.9508083935328525e-05, + "loss": 0.8758, + "step": 7930 + }, + { + "epoch": 0.6302962948262517, + "grad_norm": 0.6542963981628418, + "learning_rate": 3.949485327194306e-05, + "loss": 0.8862, + "step": 7940 + }, + { + "epoch": 0.6310901188751513, + "grad_norm": 0.8261880278587341, + "learning_rate": 3.9481622608557596e-05, + "loss": 0.8559, + "step": 7950 + }, + { + "epoch": 0.6318839429240509, + "grad_norm": 0.7244775295257568, + "learning_rate": 3.9468391945172135e-05, + "loss": 0.896, + "step": 7960 + }, + { + "epoch": 0.6326777669729504, + "grad_norm": 0.8435840010643005, + "learning_rate": 3.9455161281786673e-05, + "loss": 0.8781, + "step": 7970 + }, + { + "epoch": 0.63347159102185, + "grad_norm": 0.8194109797477722, + "learning_rate": 3.9441930618401206e-05, + "loss": 0.9129, + "step": 7980 + }, + { + "epoch": 0.6342654150707495, + "grad_norm": 0.746361494064331, + "learning_rate": 3.9428699955015745e-05, + "loss": 0.9033, + "step": 7990 + }, + { + "epoch": 0.6350592391196491, + "grad_norm": 0.7431300282478333, + "learning_rate": 3.9415469291630283e-05, + "loss": 0.9065, + "step": 8000 + }, + { + "epoch": 0.6358530631685487, + "grad_norm": 0.7559617757797241, + "learning_rate": 3.940223862824482e-05, + "loss": 0.8851, + "step": 8010 + }, + { + "epoch": 0.6366468872174482, + "grad_norm": 0.7204374074935913, + "learning_rate": 3.938900796485936e-05, + "loss": 0.9143, + "step": 8020 + }, + { + "epoch": 0.6374407112663478, + "grad_norm": 0.9044206142425537, + "learning_rate": 3.93757773014739e-05, + "loss": 0.9308, + "step": 8030 + }, + { + "epoch": 0.6382345353152473, + "grad_norm": 0.6706372499465942, + "learning_rate": 3.936254663808844e-05, + "loss": 0.8871, + "step": 8040 + }, + { + "epoch": 0.6390283593641469, + "grad_norm": 0.8371633291244507, + "learning_rate": 3.934931597470297e-05, + "loss": 0.8375, + "step": 8050 + }, + { + "epoch": 0.6398221834130465, + "grad_norm": 0.823395311832428, + "learning_rate": 3.933608531131751e-05, + "loss": 0.8351, + "step": 8060 + }, + { + "epoch": 0.640616007461946, + "grad_norm": 0.7647190690040588, + "learning_rate": 3.932417771427059e-05, + "loss": 0.8564, + "step": 8070 + }, + { + "epoch": 0.6414098315108456, + "grad_norm": 0.7140836119651794, + "learning_rate": 3.931094705088513e-05, + "loss": 0.8248, + "step": 8080 + }, + { + "epoch": 0.6422036555597452, + "grad_norm": 0.7830526232719421, + "learning_rate": 3.929771638749967e-05, + "loss": 0.8325, + "step": 8090 + }, + { + "epoch": 0.6429974796086447, + "grad_norm": 0.8376036286354065, + "learning_rate": 3.9284485724114215e-05, + "loss": 0.8787, + "step": 8100 + }, + { + "epoch": 0.6437913036575443, + "grad_norm": 0.8224188685417175, + "learning_rate": 3.927125506072875e-05, + "loss": 0.8556, + "step": 8110 + }, + { + "epoch": 0.6445851277064438, + "grad_norm": 0.7094652056694031, + "learning_rate": 3.9258024397343286e-05, + "loss": 0.9008, + "step": 8120 + }, + { + "epoch": 0.6453789517553434, + "grad_norm": 0.9661831855773926, + "learning_rate": 3.9244793733957825e-05, + "loss": 0.9017, + "step": 8130 + }, + { + "epoch": 0.646172775804243, + "grad_norm": 0.5937822461128235, + "learning_rate": 3.923156307057236e-05, + "loss": 0.93, + "step": 8140 + }, + { + "epoch": 0.6469665998531425, + "grad_norm": 0.8832845091819763, + "learning_rate": 3.9218332407186896e-05, + "loss": 0.7535, + "step": 8150 + }, + { + "epoch": 0.6477604239020421, + "grad_norm": 0.8030399084091187, + "learning_rate": 3.9205101743801435e-05, + "loss": 0.8628, + "step": 8160 + }, + { + "epoch": 0.6485542479509416, + "grad_norm": 0.6828364729881287, + "learning_rate": 3.9191871080415974e-05, + "loss": 0.8172, + "step": 8170 + }, + { + "epoch": 0.6493480719998412, + "grad_norm": 0.7362493872642517, + "learning_rate": 3.917864041703051e-05, + "loss": 0.8649, + "step": 8180 + }, + { + "epoch": 0.6501418960487408, + "grad_norm": 0.711621105670929, + "learning_rate": 3.916540975364505e-05, + "loss": 0.8908, + "step": 8190 + }, + { + "epoch": 0.6509357200976403, + "grad_norm": 0.7424710392951965, + "learning_rate": 3.915217909025959e-05, + "loss": 0.8684, + "step": 8200 + }, + { + "epoch": 0.6517295441465399, + "grad_norm": 0.8908485770225525, + "learning_rate": 3.913894842687413e-05, + "loss": 0.867, + "step": 8210 + }, + { + "epoch": 0.6525233681954394, + "grad_norm": 0.8501769304275513, + "learning_rate": 3.912571776348866e-05, + "loss": 0.8688, + "step": 8220 + }, + { + "epoch": 0.653317192244339, + "grad_norm": 0.8504555225372314, + "learning_rate": 3.91124871001032e-05, + "loss": 0.8451, + "step": 8230 + }, + { + "epoch": 0.6541110162932386, + "grad_norm": 1.316743016242981, + "learning_rate": 3.909925643671774e-05, + "loss": 0.8375, + "step": 8240 + }, + { + "epoch": 0.6549048403421381, + "grad_norm": 0.7561972141265869, + "learning_rate": 3.908602577333227e-05, + "loss": 0.8597, + "step": 8250 + }, + { + "epoch": 0.6556986643910377, + "grad_norm": 0.7232686877250671, + "learning_rate": 3.907279510994681e-05, + "loss": 0.8808, + "step": 8260 + }, + { + "epoch": 0.6564924884399372, + "grad_norm": 0.6502814888954163, + "learning_rate": 3.9059564446561356e-05, + "loss": 0.939, + "step": 8270 + }, + { + "epoch": 0.6572863124888368, + "grad_norm": 0.9864717721939087, + "learning_rate": 3.9046333783175895e-05, + "loss": 0.8091, + "step": 8280 + }, + { + "epoch": 0.6580801365377364, + "grad_norm": 0.8748832941055298, + "learning_rate": 3.903310311979043e-05, + "loss": 0.8676, + "step": 8290 + }, + { + "epoch": 0.6588739605866359, + "grad_norm": 0.8619644641876221, + "learning_rate": 3.9019872456404966e-05, + "loss": 0.8134, + "step": 8300 + }, + { + "epoch": 0.6596677846355355, + "grad_norm": 0.7893159985542297, + "learning_rate": 3.9006641793019505e-05, + "loss": 0.8656, + "step": 8310 + }, + { + "epoch": 0.660461608684435, + "grad_norm": 0.8085225820541382, + "learning_rate": 3.8993411129634044e-05, + "loss": 0.9401, + "step": 8320 + }, + { + "epoch": 0.6612554327333346, + "grad_norm": 0.8929345011711121, + "learning_rate": 3.8980180466248576e-05, + "loss": 0.8665, + "step": 8330 + }, + { + "epoch": 0.6620492567822343, + "grad_norm": 0.9615473747253418, + "learning_rate": 3.8966949802863115e-05, + "loss": 0.8331, + "step": 8340 + }, + { + "epoch": 0.6628430808311337, + "grad_norm": 0.582528293132782, + "learning_rate": 3.8953719139477654e-05, + "loss": 0.8626, + "step": 8350 + }, + { + "epoch": 0.6636369048800334, + "grad_norm": 0.7773953676223755, + "learning_rate": 3.894048847609219e-05, + "loss": 0.8119, + "step": 8360 + }, + { + "epoch": 0.664430728928933, + "grad_norm": 0.8851808309555054, + "learning_rate": 3.892725781270673e-05, + "loss": 0.8806, + "step": 8370 + }, + { + "epoch": 0.6652245529778325, + "grad_norm": 0.8507623076438904, + "learning_rate": 3.891402714932127e-05, + "loss": 0.8734, + "step": 8380 + }, + { + "epoch": 0.6660183770267321, + "grad_norm": 0.8153007626533508, + "learning_rate": 3.890079648593581e-05, + "loss": 0.8501, + "step": 8390 + }, + { + "epoch": 0.6668122010756315, + "grad_norm": 0.8047279715538025, + "learning_rate": 3.888756582255034e-05, + "loss": 0.8574, + "step": 8400 + }, + { + "epoch": 0.6676060251245312, + "grad_norm": 0.933725118637085, + "learning_rate": 3.887433515916488e-05, + "loss": 0.8638, + "step": 8410 + }, + { + "epoch": 0.6683998491734308, + "grad_norm": 0.7633134126663208, + "learning_rate": 3.886110449577942e-05, + "loss": 0.9276, + "step": 8420 + }, + { + "epoch": 0.6691936732223303, + "grad_norm": 0.8847464323043823, + "learning_rate": 3.884787383239396e-05, + "loss": 0.9414, + "step": 8430 + }, + { + "epoch": 0.6699874972712299, + "grad_norm": 0.9716514945030212, + "learning_rate": 3.88346431690085e-05, + "loss": 0.8823, + "step": 8440 + }, + { + "epoch": 0.6707813213201294, + "grad_norm": 0.8320568203926086, + "learning_rate": 3.8821412505623037e-05, + "loss": 0.812, + "step": 8450 + }, + { + "epoch": 0.671575145369029, + "grad_norm": 0.7961634397506714, + "learning_rate": 3.8808181842237575e-05, + "loss": 0.816, + "step": 8460 + }, + { + "epoch": 0.6723689694179286, + "grad_norm": 0.7325245141983032, + "learning_rate": 3.879495117885211e-05, + "loss": 0.8224, + "step": 8470 + }, + { + "epoch": 0.6731627934668281, + "grad_norm": 0.7438898086547852, + "learning_rate": 3.8781720515466647e-05, + "loss": 0.8102, + "step": 8480 + }, + { + "epoch": 0.6739566175157277, + "grad_norm": 0.7299096584320068, + "learning_rate": 3.8768489852081186e-05, + "loss": 0.9632, + "step": 8490 + }, + { + "epoch": 0.6747504415646272, + "grad_norm": 0.895844578742981, + "learning_rate": 3.8755259188695724e-05, + "loss": 0.9497, + "step": 8500 + }, + { + "epoch": 0.6755442656135268, + "grad_norm": 0.8177179098129272, + "learning_rate": 3.8742028525310257e-05, + "loss": 0.853, + "step": 8510 + }, + { + "epoch": 0.6763380896624264, + "grad_norm": 0.6874784827232361, + "learning_rate": 3.8728797861924796e-05, + "loss": 0.8464, + "step": 8520 + }, + { + "epoch": 0.6771319137113259, + "grad_norm": 0.8291011452674866, + "learning_rate": 3.8715567198539334e-05, + "loss": 0.9136, + "step": 8530 + }, + { + "epoch": 0.6779257377602255, + "grad_norm": 0.6445680260658264, + "learning_rate": 3.870233653515388e-05, + "loss": 0.8429, + "step": 8540 + }, + { + "epoch": 0.678719561809125, + "grad_norm": 0.7403009533882141, + "learning_rate": 3.868910587176841e-05, + "loss": 0.8473, + "step": 8550 + }, + { + "epoch": 0.6795133858580246, + "grad_norm": 0.5939794778823853, + "learning_rate": 3.867587520838295e-05, + "loss": 0.9001, + "step": 8560 + }, + { + "epoch": 0.6803072099069242, + "grad_norm": 0.6442236304283142, + "learning_rate": 3.866264454499749e-05, + "loss": 0.8819, + "step": 8570 + }, + { + "epoch": 0.6811010339558237, + "grad_norm": 0.7586227655410767, + "learning_rate": 3.864941388161202e-05, + "loss": 0.8705, + "step": 8580 + }, + { + "epoch": 0.6818948580047233, + "grad_norm": 0.7336118817329407, + "learning_rate": 3.863618321822656e-05, + "loss": 0.9112, + "step": 8590 + }, + { + "epoch": 0.6826886820536228, + "grad_norm": 0.7981590032577515, + "learning_rate": 3.86229525548411e-05, + "loss": 0.9474, + "step": 8600 + }, + { + "epoch": 0.6834825061025224, + "grad_norm": 0.5665314197540283, + "learning_rate": 3.860972189145564e-05, + "loss": 0.8787, + "step": 8610 + }, + { + "epoch": 0.684276330151422, + "grad_norm": 0.6617142558097839, + "learning_rate": 3.859649122807018e-05, + "loss": 0.9087, + "step": 8620 + }, + { + "epoch": 0.6850701542003215, + "grad_norm": 0.7118547558784485, + "learning_rate": 3.858326056468472e-05, + "loss": 0.8094, + "step": 8630 + }, + { + "epoch": 0.6858639782492211, + "grad_norm": 0.6482293605804443, + "learning_rate": 3.8570029901299256e-05, + "loss": 0.861, + "step": 8640 + }, + { + "epoch": 0.6866578022981206, + "grad_norm": 0.8651313781738281, + "learning_rate": 3.8556799237913795e-05, + "loss": 0.7867, + "step": 8650 + }, + { + "epoch": 0.6874516263470202, + "grad_norm": 0.5534746646881104, + "learning_rate": 3.854356857452833e-05, + "loss": 0.9116, + "step": 8660 + }, + { + "epoch": 0.6882454503959198, + "grad_norm": 0.6434946060180664, + "learning_rate": 3.8530337911142866e-05, + "loss": 0.8464, + "step": 8670 + }, + { + "epoch": 0.6890392744448193, + "grad_norm": 0.7514382600784302, + "learning_rate": 3.8517107247757405e-05, + "loss": 0.8618, + "step": 8680 + }, + { + "epoch": 0.6898330984937189, + "grad_norm": 0.7134977579116821, + "learning_rate": 3.850387658437194e-05, + "loss": 0.8792, + "step": 8690 + }, + { + "epoch": 0.6906269225426185, + "grad_norm": 0.8967533707618713, + "learning_rate": 3.8490645920986476e-05, + "loss": 0.8489, + "step": 8700 + }, + { + "epoch": 0.691420746591518, + "grad_norm": 0.7297849655151367, + "learning_rate": 3.847741525760102e-05, + "loss": 0.8498, + "step": 8710 + }, + { + "epoch": 0.6922145706404176, + "grad_norm": 0.8720963001251221, + "learning_rate": 3.846418459421556e-05, + "loss": 0.8486, + "step": 8720 + }, + { + "epoch": 0.6930083946893171, + "grad_norm": 0.9449414610862732, + "learning_rate": 3.845095393083009e-05, + "loss": 0.8597, + "step": 8730 + }, + { + "epoch": 0.6938022187382167, + "grad_norm": 0.7103795409202576, + "learning_rate": 3.843772326744463e-05, + "loss": 0.8633, + "step": 8740 + }, + { + "epoch": 0.6945960427871163, + "grad_norm": 1.0056965351104736, + "learning_rate": 3.842449260405917e-05, + "loss": 0.8723, + "step": 8750 + }, + { + "epoch": 0.6953898668360158, + "grad_norm": 0.6581205725669861, + "learning_rate": 3.841126194067371e-05, + "loss": 0.8956, + "step": 8760 + }, + { + "epoch": 0.6961836908849154, + "grad_norm": 0.8664624691009521, + "learning_rate": 3.839803127728824e-05, + "loss": 0.8979, + "step": 8770 + }, + { + "epoch": 0.6969775149338149, + "grad_norm": 0.6672316789627075, + "learning_rate": 3.838480061390278e-05, + "loss": 0.896, + "step": 8780 + }, + { + "epoch": 0.6977713389827145, + "grad_norm": 0.7690501809120178, + "learning_rate": 3.837156995051732e-05, + "loss": 0.856, + "step": 8790 + }, + { + "epoch": 0.6985651630316141, + "grad_norm": 0.7820170521736145, + "learning_rate": 3.835833928713186e-05, + "loss": 0.8775, + "step": 8800 + }, + { + "epoch": 0.6993589870805136, + "grad_norm": 1.0023752450942993, + "learning_rate": 3.83451086237464e-05, + "loss": 0.8321, + "step": 8810 + }, + { + "epoch": 0.7001528111294132, + "grad_norm": 0.7955570816993713, + "learning_rate": 3.8331877960360936e-05, + "loss": 0.8198, + "step": 8820 + }, + { + "epoch": 0.7009466351783127, + "grad_norm": 0.7987425327301025, + "learning_rate": 3.8318647296975475e-05, + "loss": 0.8072, + "step": 8830 + }, + { + "epoch": 0.7017404592272123, + "grad_norm": 0.6776150465011597, + "learning_rate": 3.830541663359001e-05, + "loss": 0.8903, + "step": 8840 + }, + { + "epoch": 0.7025342832761119, + "grad_norm": 0.8060004711151123, + "learning_rate": 3.8292185970204546e-05, + "loss": 0.8328, + "step": 8850 + }, + { + "epoch": 0.7033281073250114, + "grad_norm": 0.6647984981536865, + "learning_rate": 3.8278955306819085e-05, + "loss": 0.9151, + "step": 8860 + }, + { + "epoch": 0.704121931373911, + "grad_norm": 0.7198352217674255, + "learning_rate": 3.8265724643433624e-05, + "loss": 0.8502, + "step": 8870 + }, + { + "epoch": 0.7049157554228105, + "grad_norm": 0.8274781107902527, + "learning_rate": 3.825249398004816e-05, + "loss": 0.8856, + "step": 8880 + }, + { + "epoch": 0.7057095794717101, + "grad_norm": 0.868508517742157, + "learning_rate": 3.82392633166627e-05, + "loss": 0.806, + "step": 8890 + }, + { + "epoch": 0.7065034035206097, + "grad_norm": 0.7774865627288818, + "learning_rate": 3.822603265327724e-05, + "loss": 0.844, + "step": 8900 + }, + { + "epoch": 0.7072972275695092, + "grad_norm": 0.8134011030197144, + "learning_rate": 3.821280198989178e-05, + "loss": 0.946, + "step": 8910 + }, + { + "epoch": 0.7080910516184088, + "grad_norm": 0.7954188585281372, + "learning_rate": 3.819957132650631e-05, + "loss": 0.8422, + "step": 8920 + }, + { + "epoch": 0.7088848756673083, + "grad_norm": 0.7246106863021851, + "learning_rate": 3.818634066312085e-05, + "loss": 0.8401, + "step": 8930 + }, + { + "epoch": 0.7096786997162079, + "grad_norm": 0.7810704112052917, + "learning_rate": 3.817310999973539e-05, + "loss": 0.8684, + "step": 8940 + }, + { + "epoch": 0.7104725237651075, + "grad_norm": 0.74953293800354, + "learning_rate": 3.815987933634992e-05, + "loss": 0.8565, + "step": 8950 + }, + { + "epoch": 0.711266347814007, + "grad_norm": 0.9855328798294067, + "learning_rate": 3.814664867296446e-05, + "loss": 0.9263, + "step": 8960 + }, + { + "epoch": 0.7120601718629066, + "grad_norm": 0.5810590386390686, + "learning_rate": 3.8133418009579e-05, + "loss": 0.9298, + "step": 8970 + }, + { + "epoch": 0.7128539959118061, + "grad_norm": 0.817237377166748, + "learning_rate": 3.8120187346193545e-05, + "loss": 0.8469, + "step": 8980 + }, + { + "epoch": 0.7136478199607057, + "grad_norm": 0.7904085516929626, + "learning_rate": 3.810695668280808e-05, + "loss": 0.8513, + "step": 8990 + }, + { + "epoch": 0.7144416440096053, + "grad_norm": 0.6733309626579285, + "learning_rate": 3.8093726019422617e-05, + "loss": 0.8477, + "step": 9000 + }, + { + "epoch": 0.7152354680585048, + "grad_norm": 0.7731225490570068, + "learning_rate": 3.8080495356037155e-05, + "loss": 0.9097, + "step": 9010 + }, + { + "epoch": 0.7160292921074044, + "grad_norm": 0.8163710236549377, + "learning_rate": 3.8067264692651694e-05, + "loss": 0.8878, + "step": 9020 + }, + { + "epoch": 0.716823116156304, + "grad_norm": 0.8576553463935852, + "learning_rate": 3.8054034029266227e-05, + "loss": 0.8706, + "step": 9030 + }, + { + "epoch": 0.7176169402052035, + "grad_norm": 0.7297602295875549, + "learning_rate": 3.8040803365880766e-05, + "loss": 0.9051, + "step": 9040 + }, + { + "epoch": 0.7184107642541031, + "grad_norm": 0.6882147192955017, + "learning_rate": 3.8027572702495304e-05, + "loss": 0.8919, + "step": 9050 + }, + { + "epoch": 0.7192045883030026, + "grad_norm": 0.8439272046089172, + "learning_rate": 3.801434203910984e-05, + "loss": 0.898, + "step": 9060 + }, + { + "epoch": 0.7199984123519022, + "grad_norm": 0.7839822173118591, + "learning_rate": 3.800111137572438e-05, + "loss": 0.9044, + "step": 9070 + }, + { + "epoch": 0.7207922364008018, + "grad_norm": 0.6823743581771851, + "learning_rate": 3.798788071233892e-05, + "loss": 0.8372, + "step": 9080 + }, + { + "epoch": 0.7215860604497013, + "grad_norm": 0.7644033432006836, + "learning_rate": 3.797465004895346e-05, + "loss": 0.9316, + "step": 9090 + }, + { + "epoch": 0.7223798844986009, + "grad_norm": 0.826184093952179, + "learning_rate": 3.796141938556799e-05, + "loss": 0.8519, + "step": 9100 + }, + { + "epoch": 0.7231737085475004, + "grad_norm": 0.8634784817695618, + "learning_rate": 3.794818872218253e-05, + "loss": 0.905, + "step": 9110 + }, + { + "epoch": 0.7239675325964, + "grad_norm": 0.6479964852333069, + "learning_rate": 3.793495805879707e-05, + "loss": 0.7854, + "step": 9120 + }, + { + "epoch": 0.7247613566452996, + "grad_norm": 0.7302672863006592, + "learning_rate": 3.792172739541161e-05, + "loss": 0.877, + "step": 9130 + }, + { + "epoch": 0.7255551806941991, + "grad_norm": 0.7126200199127197, + "learning_rate": 3.790849673202614e-05, + "loss": 0.9136, + "step": 9140 + }, + { + "epoch": 0.7263490047430987, + "grad_norm": 0.8808421492576599, + "learning_rate": 3.789526606864068e-05, + "loss": 0.8717, + "step": 9150 + }, + { + "epoch": 0.7271428287919982, + "grad_norm": 0.7039631605148315, + "learning_rate": 3.7882035405255226e-05, + "loss": 0.835, + "step": 9160 + }, + { + "epoch": 0.7279366528408978, + "grad_norm": 0.7323824167251587, + "learning_rate": 3.786880474186976e-05, + "loss": 0.8713, + "step": 9170 + }, + { + "epoch": 0.7287304768897974, + "grad_norm": 0.7466415762901306, + "learning_rate": 3.78555740784843e-05, + "loss": 0.874, + "step": 9180 + }, + { + "epoch": 0.7295243009386969, + "grad_norm": 0.6216104626655579, + "learning_rate": 3.7842343415098836e-05, + "loss": 0.8985, + "step": 9190 + }, + { + "epoch": 0.7303181249875965, + "grad_norm": 0.8570176362991333, + "learning_rate": 3.7829112751713375e-05, + "loss": 0.8755, + "step": 9200 + }, + { + "epoch": 0.731111949036496, + "grad_norm": 0.7818983197212219, + "learning_rate": 3.781588208832791e-05, + "loss": 0.8462, + "step": 9210 + }, + { + "epoch": 0.7319057730853956, + "grad_norm": 0.6952176094055176, + "learning_rate": 3.7802651424942446e-05, + "loss": 0.8286, + "step": 9220 + }, + { + "epoch": 0.7326995971342952, + "grad_norm": 0.6671289801597595, + "learning_rate": 3.7789420761556985e-05, + "loss": 0.8437, + "step": 9230 + }, + { + "epoch": 0.7334934211831947, + "grad_norm": 0.6920092701911926, + "learning_rate": 3.7776190098171524e-05, + "loss": 0.8691, + "step": 9240 + }, + { + "epoch": 0.7342872452320943, + "grad_norm": 0.5979323387145996, + "learning_rate": 3.776295943478606e-05, + "loss": 0.8728, + "step": 9250 + }, + { + "epoch": 0.7350810692809938, + "grad_norm": 0.8079019784927368, + "learning_rate": 3.77497287714006e-05, + "loss": 0.8504, + "step": 9260 + }, + { + "epoch": 0.7358748933298934, + "grad_norm": 0.6762669086456299, + "learning_rate": 3.773649810801514e-05, + "loss": 0.8608, + "step": 9270 + }, + { + "epoch": 0.736668717378793, + "grad_norm": 0.6366623044013977, + "learning_rate": 3.772326744462967e-05, + "loss": 0.873, + "step": 9280 + }, + { + "epoch": 0.7374625414276925, + "grad_norm": 0.8903300166130066, + "learning_rate": 3.771003678124421e-05, + "loss": 0.8963, + "step": 9290 + }, + { + "epoch": 0.7382563654765921, + "grad_norm": 0.8363783955574036, + "learning_rate": 3.769680611785875e-05, + "loss": 0.8462, + "step": 9300 + }, + { + "epoch": 0.7390501895254917, + "grad_norm": 0.716432511806488, + "learning_rate": 3.768357545447329e-05, + "loss": 0.8599, + "step": 9310 + }, + { + "epoch": 0.7398440135743912, + "grad_norm": 0.8705762624740601, + "learning_rate": 3.767034479108782e-05, + "loss": 0.8659, + "step": 9320 + }, + { + "epoch": 0.7406378376232908, + "grad_norm": 0.7024847865104675, + "learning_rate": 3.765711412770237e-05, + "loss": 0.8697, + "step": 9330 + }, + { + "epoch": 0.7414316616721903, + "grad_norm": 0.7398332357406616, + "learning_rate": 3.7643883464316906e-05, + "loss": 0.8322, + "step": 9340 + }, + { + "epoch": 0.7422254857210899, + "grad_norm": 0.7438483834266663, + "learning_rate": 3.7630652800931445e-05, + "loss": 0.8447, + "step": 9350 + }, + { + "epoch": 0.7430193097699895, + "grad_norm": 0.7145562767982483, + "learning_rate": 3.761742213754598e-05, + "loss": 0.8426, + "step": 9360 + }, + { + "epoch": 0.743813133818889, + "grad_norm": 0.7920240759849548, + "learning_rate": 3.7604191474160516e-05, + "loss": 0.8861, + "step": 9370 + }, + { + "epoch": 0.7446069578677886, + "grad_norm": 0.7747379541397095, + "learning_rate": 3.7590960810775055e-05, + "loss": 0.864, + "step": 9380 + }, + { + "epoch": 0.7454007819166881, + "grad_norm": 0.8978133201599121, + "learning_rate": 3.757773014738959e-05, + "loss": 0.7788, + "step": 9390 + }, + { + "epoch": 0.7461946059655877, + "grad_norm": 0.6355377435684204, + "learning_rate": 3.7564499484004126e-05, + "loss": 0.8701, + "step": 9400 + }, + { + "epoch": 0.7469884300144873, + "grad_norm": 0.6304174065589905, + "learning_rate": 3.7551268820618665e-05, + "loss": 0.8598, + "step": 9410 + }, + { + "epoch": 0.7477822540633868, + "grad_norm": 0.8747965693473816, + "learning_rate": 3.7538038157233204e-05, + "loss": 0.8457, + "step": 9420 + }, + { + "epoch": 0.7485760781122864, + "grad_norm": 0.7455962300300598, + "learning_rate": 3.752480749384774e-05, + "loss": 0.838, + "step": 9430 + }, + { + "epoch": 0.7493699021611859, + "grad_norm": 0.6928434371948242, + "learning_rate": 3.751289989680083e-05, + "loss": 0.8531, + "step": 9440 + }, + { + "epoch": 0.7501637262100855, + "grad_norm": 0.6415355801582336, + "learning_rate": 3.749966923341536e-05, + "loss": 0.836, + "step": 9450 + }, + { + "epoch": 0.7509575502589851, + "grad_norm": 0.7036964893341064, + "learning_rate": 3.74864385700299e-05, + "loss": 0.8683, + "step": 9460 + }, + { + "epoch": 0.7517513743078846, + "grad_norm": 0.546319305896759, + "learning_rate": 3.747320790664444e-05, + "loss": 0.864, + "step": 9470 + }, + { + "epoch": 0.7525451983567842, + "grad_norm": 0.7785167098045349, + "learning_rate": 3.745997724325898e-05, + "loss": 0.9178, + "step": 9480 + }, + { + "epoch": 0.7533390224056837, + "grad_norm": 0.6985113024711609, + "learning_rate": 3.744674657987351e-05, + "loss": 0.8726, + "step": 9490 + }, + { + "epoch": 0.7541328464545833, + "grad_norm": 0.7310826182365417, + "learning_rate": 3.743351591648806e-05, + "loss": 0.8742, + "step": 9500 + }, + { + "epoch": 0.7549266705034829, + "grad_norm": 0.8766258358955383, + "learning_rate": 3.7420285253102596e-05, + "loss": 0.8659, + "step": 9510 + }, + { + "epoch": 0.7557204945523824, + "grad_norm": 0.6016245484352112, + "learning_rate": 3.740705458971713e-05, + "loss": 0.8517, + "step": 9520 + }, + { + "epoch": 0.756514318601282, + "grad_norm": 0.6588014960289001, + "learning_rate": 3.739382392633167e-05, + "loss": 0.8621, + "step": 9530 + }, + { + "epoch": 0.7573081426501815, + "grad_norm": 0.8283513784408569, + "learning_rate": 3.7380593262946206e-05, + "loss": 0.8527, + "step": 9540 + }, + { + "epoch": 0.7581019666990811, + "grad_norm": 0.6860958337783813, + "learning_rate": 3.7367362599560745e-05, + "loss": 0.8549, + "step": 9550 + }, + { + "epoch": 0.7588957907479807, + "grad_norm": 0.8800874948501587, + "learning_rate": 3.735413193617528e-05, + "loss": 0.8435, + "step": 9560 + }, + { + "epoch": 0.7596896147968802, + "grad_norm": 0.6277894973754883, + "learning_rate": 3.7340901272789816e-05, + "loss": 0.845, + "step": 9570 + }, + { + "epoch": 0.7604834388457798, + "grad_norm": 0.8731912970542908, + "learning_rate": 3.7327670609404355e-05, + "loss": 0.8302, + "step": 9580 + }, + { + "epoch": 0.7612772628946793, + "grad_norm": 0.9646309018135071, + "learning_rate": 3.7314439946018894e-05, + "loss": 0.7982, + "step": 9590 + }, + { + "epoch": 0.762071086943579, + "grad_norm": 0.7538288831710815, + "learning_rate": 3.730120928263343e-05, + "loss": 0.8195, + "step": 9600 + }, + { + "epoch": 0.7628649109924786, + "grad_norm": 0.8189886212348938, + "learning_rate": 3.728797861924797e-05, + "loss": 0.8968, + "step": 9610 + }, + { + "epoch": 0.763658735041378, + "grad_norm": 0.9274044036865234, + "learning_rate": 3.727474795586251e-05, + "loss": 0.8673, + "step": 9620 + }, + { + "epoch": 0.7644525590902777, + "grad_norm": 0.7127663493156433, + "learning_rate": 3.726151729247704e-05, + "loss": 0.8359, + "step": 9630 + }, + { + "epoch": 0.7652463831391773, + "grad_norm": 0.8801278471946716, + "learning_rate": 3.724828662909158e-05, + "loss": 0.8546, + "step": 9640 + }, + { + "epoch": 0.7660402071880767, + "grad_norm": 0.8994466066360474, + "learning_rate": 3.723505596570612e-05, + "loss": 0.8434, + "step": 9650 + }, + { + "epoch": 0.7668340312369764, + "grad_norm": 0.7613593339920044, + "learning_rate": 3.722182530232066e-05, + "loss": 0.8683, + "step": 9660 + }, + { + "epoch": 0.7676278552858758, + "grad_norm": 0.6378811597824097, + "learning_rate": 3.72085946389352e-05, + "loss": 0.8729, + "step": 9670 + }, + { + "epoch": 0.7684216793347755, + "grad_norm": 0.6833720207214355, + "learning_rate": 3.719536397554974e-05, + "loss": 0.8427, + "step": 9680 + }, + { + "epoch": 0.7692155033836751, + "grad_norm": 0.6543610692024231, + "learning_rate": 3.718213331216428e-05, + "loss": 0.8309, + "step": 9690 + }, + { + "epoch": 0.7700093274325746, + "grad_norm": 0.7500269412994385, + "learning_rate": 3.7168902648778816e-05, + "loss": 0.8621, + "step": 9700 + }, + { + "epoch": 0.7708031514814742, + "grad_norm": 0.7766570448875427, + "learning_rate": 3.715567198539335e-05, + "loss": 0.8549, + "step": 9710 + }, + { + "epoch": 0.7715969755303737, + "grad_norm": 0.8536286950111389, + "learning_rate": 3.714244132200789e-05, + "loss": 0.8375, + "step": 9720 + }, + { + "epoch": 0.7723907995792733, + "grad_norm": 0.8572762608528137, + "learning_rate": 3.7129210658622426e-05, + "loss": 0.8188, + "step": 9730 + }, + { + "epoch": 0.7731846236281729, + "grad_norm": 1.0639240741729736, + "learning_rate": 3.711597999523696e-05, + "loss": 0.8433, + "step": 9740 + }, + { + "epoch": 0.7739784476770724, + "grad_norm": 0.7401818633079529, + "learning_rate": 3.71027493318515e-05, + "loss": 0.8178, + "step": 9750 + }, + { + "epoch": 0.774772271725972, + "grad_norm": 0.9968650341033936, + "learning_rate": 3.7089518668466036e-05, + "loss": 0.8486, + "step": 9760 + }, + { + "epoch": 0.7755660957748715, + "grad_norm": 0.9690655469894409, + "learning_rate": 3.707628800508058e-05, + "loss": 0.8331, + "step": 9770 + }, + { + "epoch": 0.7763599198237711, + "grad_norm": 1.0017821788787842, + "learning_rate": 3.7063057341695114e-05, + "loss": 0.8756, + "step": 9780 + }, + { + "epoch": 0.7771537438726707, + "grad_norm": 0.7528254389762878, + "learning_rate": 3.704982667830965e-05, + "loss": 0.9261, + "step": 9790 + }, + { + "epoch": 0.7779475679215702, + "grad_norm": 0.7826637625694275, + "learning_rate": 3.703659601492419e-05, + "loss": 0.8768, + "step": 9800 + }, + { + "epoch": 0.7787413919704698, + "grad_norm": 0.6399036049842834, + "learning_rate": 3.702336535153873e-05, + "loss": 0.8645, + "step": 9810 + }, + { + "epoch": 0.7795352160193693, + "grad_norm": 0.649523138999939, + "learning_rate": 3.701013468815326e-05, + "loss": 0.8536, + "step": 9820 + }, + { + "epoch": 0.7803290400682689, + "grad_norm": 0.955564558506012, + "learning_rate": 3.69969040247678e-05, + "loss": 0.8279, + "step": 9830 + }, + { + "epoch": 0.7811228641171685, + "grad_norm": 0.733024537563324, + "learning_rate": 3.698367336138234e-05, + "loss": 0.9158, + "step": 9840 + }, + { + "epoch": 0.781916688166068, + "grad_norm": 0.6793590784072876, + "learning_rate": 3.697044269799688e-05, + "loss": 0.8817, + "step": 9850 + }, + { + "epoch": 0.7827105122149676, + "grad_norm": 0.7847456932067871, + "learning_rate": 3.695721203461142e-05, + "loss": 0.8568, + "step": 9860 + }, + { + "epoch": 0.7835043362638671, + "grad_norm": 0.8460919260978699, + "learning_rate": 3.694398137122596e-05, + "loss": 0.8767, + "step": 9870 + }, + { + "epoch": 0.7842981603127667, + "grad_norm": 0.6977751851081848, + "learning_rate": 3.6930750707840496e-05, + "loss": 0.8671, + "step": 9880 + }, + { + "epoch": 0.7850919843616663, + "grad_norm": 0.7356807589530945, + "learning_rate": 3.691752004445503e-05, + "loss": 0.9341, + "step": 9890 + }, + { + "epoch": 0.7858858084105658, + "grad_norm": 0.7931004762649536, + "learning_rate": 3.690428938106957e-05, + "loss": 0.868, + "step": 9900 + }, + { + "epoch": 0.7866796324594654, + "grad_norm": 0.8790589570999146, + "learning_rate": 3.6891058717684106e-05, + "loss": 0.8516, + "step": 9910 + }, + { + "epoch": 0.7874734565083649, + "grad_norm": 0.7858555912971497, + "learning_rate": 3.6877828054298645e-05, + "loss": 0.8336, + "step": 9920 + }, + { + "epoch": 0.7882672805572645, + "grad_norm": 0.6692208647727966, + "learning_rate": 3.686459739091318e-05, + "loss": 0.8679, + "step": 9930 + }, + { + "epoch": 0.7890611046061641, + "grad_norm": 0.9050410389900208, + "learning_rate": 3.685136672752772e-05, + "loss": 0.7942, + "step": 9940 + }, + { + "epoch": 0.7898549286550636, + "grad_norm": 0.8038280010223389, + "learning_rate": 3.683813606414226e-05, + "loss": 0.8699, + "step": 9950 + }, + { + "epoch": 0.7906487527039632, + "grad_norm": 0.7280693054199219, + "learning_rate": 3.68249054007568e-05, + "loss": 0.8876, + "step": 9960 + }, + { + "epoch": 0.7914425767528628, + "grad_norm": 0.6296914219856262, + "learning_rate": 3.681167473737133e-05, + "loss": 0.8535, + "step": 9970 + }, + { + "epoch": 0.7922364008017623, + "grad_norm": 0.8135585784912109, + "learning_rate": 3.679844407398587e-05, + "loss": 0.8563, + "step": 9980 + }, + { + "epoch": 0.7930302248506619, + "grad_norm": 0.6743822693824768, + "learning_rate": 3.678521341060041e-05, + "loss": 0.9013, + "step": 9990 + }, + { + "epoch": 0.7938240488995614, + "grad_norm": 0.6657276153564453, + "learning_rate": 3.677198274721494e-05, + "loss": 0.8898, + "step": 10000 + }, + { + "epoch": 0.794617872948461, + "grad_norm": 0.7389101386070251, + "learning_rate": 3.675875208382948e-05, + "loss": 0.8925, + "step": 10010 + }, + { + "epoch": 0.7954116969973606, + "grad_norm": 0.802793562412262, + "learning_rate": 3.674552142044402e-05, + "loss": 0.8091, + "step": 10020 + }, + { + "epoch": 0.7962055210462601, + "grad_norm": 0.9091024398803711, + "learning_rate": 3.673229075705856e-05, + "loss": 0.8559, + "step": 10030 + }, + { + "epoch": 0.7969993450951597, + "grad_norm": 0.8592483997344971, + "learning_rate": 3.67190600936731e-05, + "loss": 0.8379, + "step": 10040 + }, + { + "epoch": 0.7977931691440592, + "grad_norm": 0.8377381563186646, + "learning_rate": 3.670582943028764e-05, + "loss": 0.9248, + "step": 10050 + }, + { + "epoch": 0.7985869931929588, + "grad_norm": 0.7962272763252258, + "learning_rate": 3.6692598766902176e-05, + "loss": 0.8846, + "step": 10060 + }, + { + "epoch": 0.7993808172418584, + "grad_norm": 0.5838531255722046, + "learning_rate": 3.6679368103516715e-05, + "loss": 0.8658, + "step": 10070 + }, + { + "epoch": 0.8001746412907579, + "grad_norm": 0.6025267839431763, + "learning_rate": 3.666613744013125e-05, + "loss": 0.8649, + "step": 10080 + }, + { + "epoch": 0.8009684653396575, + "grad_norm": 0.6443192362785339, + "learning_rate": 3.6652906776745786e-05, + "loss": 0.8555, + "step": 10090 + }, + { + "epoch": 0.801762289388557, + "grad_norm": 0.7511973977088928, + "learning_rate": 3.6639676113360325e-05, + "loss": 0.945, + "step": 10100 + }, + { + "epoch": 0.8025561134374566, + "grad_norm": 0.7938660979270935, + "learning_rate": 3.6626445449974864e-05, + "loss": 0.796, + "step": 10110 + }, + { + "epoch": 0.8033499374863562, + "grad_norm": 0.7330296039581299, + "learning_rate": 3.66132147865894e-05, + "loss": 0.7629, + "step": 10120 + }, + { + "epoch": 0.8041437615352557, + "grad_norm": 0.8548980355262756, + "learning_rate": 3.659998412320394e-05, + "loss": 0.8621, + "step": 10130 + }, + { + "epoch": 0.8049375855841553, + "grad_norm": 0.7704104781150818, + "learning_rate": 3.658675345981848e-05, + "loss": 0.8502, + "step": 10140 + }, + { + "epoch": 0.8057314096330548, + "grad_norm": 0.672539472579956, + "learning_rate": 3.657352279643301e-05, + "loss": 0.8981, + "step": 10150 + }, + { + "epoch": 0.8065252336819544, + "grad_norm": 0.8186033964157104, + "learning_rate": 3.656029213304755e-05, + "loss": 0.9071, + "step": 10160 + }, + { + "epoch": 0.807319057730854, + "grad_norm": 0.5315646529197693, + "learning_rate": 3.654706146966209e-05, + "loss": 0.8992, + "step": 10170 + }, + { + "epoch": 0.8081128817797535, + "grad_norm": 0.6407436728477478, + "learning_rate": 3.653383080627663e-05, + "loss": 0.8987, + "step": 10180 + }, + { + "epoch": 0.8089067058286531, + "grad_norm": 0.6754816770553589, + "learning_rate": 3.652060014289116e-05, + "loss": 0.8625, + "step": 10190 + }, + { + "epoch": 0.8097005298775526, + "grad_norm": 0.7109003663063049, + "learning_rate": 3.65073694795057e-05, + "loss": 0.8156, + "step": 10200 + }, + { + "epoch": 0.8104943539264522, + "grad_norm": 0.7097606658935547, + "learning_rate": 3.649413881612025e-05, + "loss": 0.8754, + "step": 10210 + }, + { + "epoch": 0.8112881779753518, + "grad_norm": 0.7361970543861389, + "learning_rate": 3.648090815273478e-05, + "loss": 0.8663, + "step": 10220 + }, + { + "epoch": 0.8120820020242513, + "grad_norm": 0.5668066740036011, + "learning_rate": 3.646767748934932e-05, + "loss": 0.8903, + "step": 10230 + }, + { + "epoch": 0.8128758260731509, + "grad_norm": 0.9411951899528503, + "learning_rate": 3.645444682596386e-05, + "loss": 0.8686, + "step": 10240 + }, + { + "epoch": 0.8136696501220505, + "grad_norm": 0.6737961769104004, + "learning_rate": 3.6441216162578396e-05, + "loss": 0.8497, + "step": 10250 + }, + { + "epoch": 0.81446347417095, + "grad_norm": 0.8757466673851013, + "learning_rate": 3.642798549919293e-05, + "loss": 0.8725, + "step": 10260 + }, + { + "epoch": 0.8152572982198496, + "grad_norm": 0.670192539691925, + "learning_rate": 3.641475483580747e-05, + "loss": 0.892, + "step": 10270 + }, + { + "epoch": 0.8160511222687491, + "grad_norm": 0.7439650893211365, + "learning_rate": 3.6401524172422006e-05, + "loss": 0.8589, + "step": 10280 + }, + { + "epoch": 0.8168449463176487, + "grad_norm": 0.795250654220581, + "learning_rate": 3.6388293509036545e-05, + "loss": 0.8543, + "step": 10290 + }, + { + "epoch": 0.8176387703665483, + "grad_norm": 0.7579794526100159, + "learning_rate": 3.6375062845651084e-05, + "loss": 0.8498, + "step": 10300 + }, + { + "epoch": 0.8184325944154478, + "grad_norm": 0.5933066606521606, + "learning_rate": 3.636183218226562e-05, + "loss": 0.8498, + "step": 10310 + }, + { + "epoch": 0.8192264184643474, + "grad_norm": 0.8285762667655945, + "learning_rate": 3.634860151888016e-05, + "loss": 0.9121, + "step": 10320 + }, + { + "epoch": 0.8200202425132469, + "grad_norm": 0.7152075171470642, + "learning_rate": 3.6335370855494694e-05, + "loss": 0.8678, + "step": 10330 + }, + { + "epoch": 0.8208140665621465, + "grad_norm": 0.933392345905304, + "learning_rate": 3.632214019210923e-05, + "loss": 0.8849, + "step": 10340 + }, + { + "epoch": 0.8216078906110461, + "grad_norm": 0.698727548122406, + "learning_rate": 3.630890952872377e-05, + "loss": 0.8407, + "step": 10350 + }, + { + "epoch": 0.8224017146599456, + "grad_norm": 0.6476231217384338, + "learning_rate": 3.629567886533831e-05, + "loss": 0.9283, + "step": 10360 + }, + { + "epoch": 0.8231955387088452, + "grad_norm": 0.6746705770492554, + "learning_rate": 3.628244820195284e-05, + "loss": 0.8968, + "step": 10370 + }, + { + "epoch": 0.8239893627577447, + "grad_norm": 0.654994547367096, + "learning_rate": 3.626921753856739e-05, + "loss": 0.871, + "step": 10380 + }, + { + "epoch": 0.8247831868066443, + "grad_norm": 0.9437777996063232, + "learning_rate": 3.625598687518193e-05, + "loss": 0.8428, + "step": 10390 + }, + { + "epoch": 0.8255770108555439, + "grad_norm": 0.6646280884742737, + "learning_rate": 3.6242756211796466e-05, + "loss": 0.9034, + "step": 10400 + }, + { + "epoch": 0.8263708349044434, + "grad_norm": 0.8529212474822998, + "learning_rate": 3.6229525548411e-05, + "loss": 0.8507, + "step": 10410 + }, + { + "epoch": 0.827164658953343, + "grad_norm": 0.7901684045791626, + "learning_rate": 3.621629488502554e-05, + "loss": 0.8191, + "step": 10420 + }, + { + "epoch": 0.8279584830022425, + "grad_norm": 0.6879044771194458, + "learning_rate": 3.6203064221640076e-05, + "loss": 0.8222, + "step": 10430 + }, + { + "epoch": 0.8287523070511421, + "grad_norm": 0.8270571827888489, + "learning_rate": 3.618983355825461e-05, + "loss": 0.8483, + "step": 10440 + }, + { + "epoch": 0.8295461311000417, + "grad_norm": 0.8302059769630432, + "learning_rate": 3.617660289486915e-05, + "loss": 0.8513, + "step": 10450 + }, + { + "epoch": 0.8303399551489412, + "grad_norm": 0.669792890548706, + "learning_rate": 3.6163372231483686e-05, + "loss": 0.8854, + "step": 10460 + }, + { + "epoch": 0.8311337791978408, + "grad_norm": 0.858910322189331, + "learning_rate": 3.6150141568098225e-05, + "loss": 0.8241, + "step": 10470 + }, + { + "epoch": 0.8319276032467403, + "grad_norm": 0.8056598901748657, + "learning_rate": 3.6136910904712764e-05, + "loss": 0.8923, + "step": 10480 + }, + { + "epoch": 0.8327214272956399, + "grad_norm": 0.733371913433075, + "learning_rate": 3.61236802413273e-05, + "loss": 0.8247, + "step": 10490 + }, + { + "epoch": 0.8335152513445395, + "grad_norm": 0.5836498141288757, + "learning_rate": 3.611044957794184e-05, + "loss": 0.8817, + "step": 10500 + }, + { + "epoch": 0.834309075393439, + "grad_norm": 0.8154313564300537, + "learning_rate": 3.609721891455638e-05, + "loss": 0.8183, + "step": 10510 + }, + { + "epoch": 0.8351028994423386, + "grad_norm": 0.7569496631622314, + "learning_rate": 3.608398825117091e-05, + "loss": 0.9073, + "step": 10520 + }, + { + "epoch": 0.8358967234912381, + "grad_norm": 0.6307011246681213, + "learning_rate": 3.607075758778545e-05, + "loss": 0.9599, + "step": 10530 + }, + { + "epoch": 0.8366905475401377, + "grad_norm": 0.7529721856117249, + "learning_rate": 3.605752692439999e-05, + "loss": 0.8456, + "step": 10540 + }, + { + "epoch": 0.8374843715890373, + "grad_norm": 0.8897116184234619, + "learning_rate": 3.604429626101452e-05, + "loss": 0.8309, + "step": 10550 + }, + { + "epoch": 0.8382781956379368, + "grad_norm": 0.8665919303894043, + "learning_rate": 3.603106559762907e-05, + "loss": 0.8252, + "step": 10560 + }, + { + "epoch": 0.8390720196868364, + "grad_norm": 0.7858225703239441, + "learning_rate": 3.601783493424361e-05, + "loss": 0.8592, + "step": 10570 + }, + { + "epoch": 0.839865843735736, + "grad_norm": 0.719430685043335, + "learning_rate": 3.6004604270858146e-05, + "loss": 0.8424, + "step": 10580 + }, + { + "epoch": 0.8406596677846355, + "grad_norm": 0.8680655360221863, + "learning_rate": 3.599137360747268e-05, + "loss": 0.8125, + "step": 10590 + }, + { + "epoch": 0.8414534918335351, + "grad_norm": 0.7084610462188721, + "learning_rate": 3.597814294408722e-05, + "loss": 0.878, + "step": 10600 + }, + { + "epoch": 0.8422473158824346, + "grad_norm": 0.9070943593978882, + "learning_rate": 3.5964912280701756e-05, + "loss": 0.8752, + "step": 10610 + }, + { + "epoch": 0.8430411399313342, + "grad_norm": 0.8004103302955627, + "learning_rate": 3.5951681617316295e-05, + "loss": 0.8462, + "step": 10620 + }, + { + "epoch": 0.8438349639802338, + "grad_norm": 0.7617958784103394, + "learning_rate": 3.593845095393083e-05, + "loss": 0.832, + "step": 10630 + }, + { + "epoch": 0.8446287880291333, + "grad_norm": 0.8618339896202087, + "learning_rate": 3.5925220290545366e-05, + "loss": 0.8751, + "step": 10640 + }, + { + "epoch": 0.8454226120780329, + "grad_norm": 0.6573039889335632, + "learning_rate": 3.5911989627159905e-05, + "loss": 0.8706, + "step": 10650 + }, + { + "epoch": 0.8462164361269324, + "grad_norm": 0.6365640163421631, + "learning_rate": 3.589875896377445e-05, + "loss": 0.8706, + "step": 10660 + }, + { + "epoch": 0.847010260175832, + "grad_norm": 0.712862491607666, + "learning_rate": 3.588552830038898e-05, + "loss": 0.7904, + "step": 10670 + }, + { + "epoch": 0.8478040842247316, + "grad_norm": 0.7353672981262207, + "learning_rate": 3.587229763700352e-05, + "loss": 0.8465, + "step": 10680 + }, + { + "epoch": 0.8485979082736311, + "grad_norm": 0.7351557612419128, + "learning_rate": 3.585906697361806e-05, + "loss": 0.8947, + "step": 10690 + }, + { + "epoch": 0.8493917323225307, + "grad_norm": 0.7744636535644531, + "learning_rate": 3.584583631023259e-05, + "loss": 0.8291, + "step": 10700 + }, + { + "epoch": 0.8501855563714302, + "grad_norm": 0.5523749589920044, + "learning_rate": 3.583260564684713e-05, + "loss": 0.9476, + "step": 10710 + }, + { + "epoch": 0.8509793804203298, + "grad_norm": 0.7999189496040344, + "learning_rate": 3.581937498346167e-05, + "loss": 0.7629, + "step": 10720 + }, + { + "epoch": 0.8517732044692294, + "grad_norm": 0.7351500988006592, + "learning_rate": 3.580614432007621e-05, + "loss": 0.8504, + "step": 10730 + }, + { + "epoch": 0.8525670285181289, + "grad_norm": 0.7090862393379211, + "learning_rate": 3.579291365669075e-05, + "loss": 0.9068, + "step": 10740 + }, + { + "epoch": 0.8533608525670285, + "grad_norm": 0.6834081411361694, + "learning_rate": 3.577968299330529e-05, + "loss": 0.8594, + "step": 10750 + }, + { + "epoch": 0.854154676615928, + "grad_norm": 0.7891108989715576, + "learning_rate": 3.576645232991983e-05, + "loss": 0.8918, + "step": 10760 + }, + { + "epoch": 0.8549485006648276, + "grad_norm": 0.7295839786529541, + "learning_rate": 3.5753221666534366e-05, + "loss": 0.8802, + "step": 10770 + }, + { + "epoch": 0.8557423247137272, + "grad_norm": 0.8786309361457825, + "learning_rate": 3.57399910031489e-05, + "loss": 0.8576, + "step": 10780 + }, + { + "epoch": 0.8565361487626267, + "grad_norm": 0.7351185083389282, + "learning_rate": 3.572676033976344e-05, + "loss": 0.8407, + "step": 10790 + }, + { + "epoch": 0.8573299728115263, + "grad_norm": 0.6069027185440063, + "learning_rate": 3.5713529676377976e-05, + "loss": 0.8566, + "step": 10800 + }, + { + "epoch": 0.8581237968604258, + "grad_norm": 0.8635075092315674, + "learning_rate": 3.570029901299251e-05, + "loss": 0.885, + "step": 10810 + }, + { + "epoch": 0.8589176209093254, + "grad_norm": 0.7878072261810303, + "learning_rate": 3.568706834960705e-05, + "loss": 0.8718, + "step": 10820 + }, + { + "epoch": 0.859711444958225, + "grad_norm": 0.7929555177688599, + "learning_rate": 3.567383768622159e-05, + "loss": 0.8686, + "step": 10830 + }, + { + "epoch": 0.8605052690071245, + "grad_norm": 0.8081250786781311, + "learning_rate": 3.566060702283613e-05, + "loss": 0.893, + "step": 10840 + }, + { + "epoch": 0.8612990930560241, + "grad_norm": 0.7977842688560486, + "learning_rate": 3.5647376359450664e-05, + "loss": 0.8753, + "step": 10850 + }, + { + "epoch": 0.8620929171049236, + "grad_norm": 0.7646862268447876, + "learning_rate": 3.56341456960652e-05, + "loss": 0.8658, + "step": 10860 + }, + { + "epoch": 0.8628867411538232, + "grad_norm": 0.7526537775993347, + "learning_rate": 3.562091503267974e-05, + "loss": 0.8557, + "step": 10870 + }, + { + "epoch": 0.8636805652027228, + "grad_norm": 0.7993239760398865, + "learning_rate": 3.560768436929428e-05, + "loss": 0.8521, + "step": 10880 + }, + { + "epoch": 0.8644743892516223, + "grad_norm": 0.9761634469032288, + "learning_rate": 3.559445370590881e-05, + "loss": 0.926, + "step": 10890 + }, + { + "epoch": 0.865268213300522, + "grad_norm": 0.7890929579734802, + "learning_rate": 3.558122304252335e-05, + "loss": 0.8633, + "step": 10900 + }, + { + "epoch": 0.8660620373494216, + "grad_norm": 0.7080404758453369, + "learning_rate": 3.556799237913789e-05, + "loss": 0.9538, + "step": 10910 + }, + { + "epoch": 0.866855861398321, + "grad_norm": 0.8383463621139526, + "learning_rate": 3.555476171575243e-05, + "loss": 0.8548, + "step": 10920 + }, + { + "epoch": 0.8676496854472207, + "grad_norm": 0.8602036237716675, + "learning_rate": 3.554153105236697e-05, + "loss": 0.8676, + "step": 10930 + }, + { + "epoch": 0.8684435094961201, + "grad_norm": 0.7755318880081177, + "learning_rate": 3.552830038898151e-05, + "loss": 0.8335, + "step": 10940 + }, + { + "epoch": 0.8692373335450198, + "grad_norm": 0.7934598922729492, + "learning_rate": 3.5515069725596046e-05, + "loss": 0.7969, + "step": 10950 + }, + { + "epoch": 0.8700311575939194, + "grad_norm": 0.9328092932701111, + "learning_rate": 3.550183906221058e-05, + "loss": 0.7799, + "step": 10960 + }, + { + "epoch": 0.8708249816428189, + "grad_norm": 0.665879487991333, + "learning_rate": 3.548860839882512e-05, + "loss": 0.8617, + "step": 10970 + }, + { + "epoch": 0.8716188056917185, + "grad_norm": 0.775994598865509, + "learning_rate": 3.5475377735439656e-05, + "loss": 0.8282, + "step": 10980 + }, + { + "epoch": 0.872412629740618, + "grad_norm": 0.8753870129585266, + "learning_rate": 3.5462147072054195e-05, + "loss": 0.786, + "step": 10990 + }, + { + "epoch": 0.8732064537895176, + "grad_norm": 0.8270352482795715, + "learning_rate": 3.5448916408668734e-05, + "loss": 0.8371, + "step": 11000 + }, + { + "epoch": 0.8740002778384172, + "grad_norm": 0.7965131402015686, + "learning_rate": 3.543568574528327e-05, + "loss": 0.885, + "step": 11010 + }, + { + "epoch": 0.8747941018873167, + "grad_norm": 0.7161972522735596, + "learning_rate": 3.542245508189781e-05, + "loss": 0.7859, + "step": 11020 + }, + { + "epoch": 0.8755879259362163, + "grad_norm": 0.7473718523979187, + "learning_rate": 3.5409224418512344e-05, + "loss": 0.8075, + "step": 11030 + }, + { + "epoch": 0.8763817499851158, + "grad_norm": 0.8067750334739685, + "learning_rate": 3.539599375512688e-05, + "loss": 0.7951, + "step": 11040 + }, + { + "epoch": 0.8771755740340154, + "grad_norm": 0.8966103196144104, + "learning_rate": 3.538276309174142e-05, + "loss": 0.8467, + "step": 11050 + }, + { + "epoch": 0.877969398082915, + "grad_norm": 0.7432875037193298, + "learning_rate": 3.536953242835596e-05, + "loss": 0.8957, + "step": 11060 + }, + { + "epoch": 0.8787632221318145, + "grad_norm": 0.7818832993507385, + "learning_rate": 3.535630176497049e-05, + "loss": 0.9194, + "step": 11070 + }, + { + "epoch": 0.8795570461807141, + "grad_norm": 0.8966363668441772, + "learning_rate": 3.534307110158503e-05, + "loss": 0.8665, + "step": 11080 + }, + { + "epoch": 0.8803508702296136, + "grad_norm": 0.5509730577468872, + "learning_rate": 3.532984043819957e-05, + "loss": 0.9304, + "step": 11090 + }, + { + "epoch": 0.8811446942785132, + "grad_norm": 0.7345530986785889, + "learning_rate": 3.5316609774814116e-05, + "loss": 0.8524, + "step": 11100 + }, + { + "epoch": 0.8819385183274128, + "grad_norm": 0.9241631031036377, + "learning_rate": 3.530337911142865e-05, + "loss": 0.8748, + "step": 11110 + }, + { + "epoch": 0.8827323423763123, + "grad_norm": 0.7990071177482605, + "learning_rate": 3.529014844804319e-05, + "loss": 0.8633, + "step": 11120 + }, + { + "epoch": 0.8835261664252119, + "grad_norm": 0.8361101746559143, + "learning_rate": 3.5276917784657726e-05, + "loss": 0.8708, + "step": 11130 + }, + { + "epoch": 0.8843199904741114, + "grad_norm": 0.6737117767333984, + "learning_rate": 3.526368712127226e-05, + "loss": 0.8489, + "step": 11140 + }, + { + "epoch": 0.885113814523011, + "grad_norm": 0.8274135589599609, + "learning_rate": 3.52504564578868e-05, + "loss": 0.9062, + "step": 11150 + }, + { + "epoch": 0.8859076385719106, + "grad_norm": 0.5696777701377869, + "learning_rate": 3.5237225794501336e-05, + "loss": 0.8464, + "step": 11160 + }, + { + "epoch": 0.8867014626208101, + "grad_norm": 0.8153122067451477, + "learning_rate": 3.5223995131115875e-05, + "loss": 0.8514, + "step": 11170 + }, + { + "epoch": 0.8874952866697097, + "grad_norm": 0.7769964933395386, + "learning_rate": 3.5210764467730414e-05, + "loss": 0.8245, + "step": 11180 + }, + { + "epoch": 0.8882891107186092, + "grad_norm": 0.6569917798042297, + "learning_rate": 3.519753380434495e-05, + "loss": 0.799, + "step": 11190 + }, + { + "epoch": 0.8890829347675088, + "grad_norm": 0.5966030359268188, + "learning_rate": 3.518430314095949e-05, + "loss": 0.8563, + "step": 11200 + }, + { + "epoch": 0.8898767588164084, + "grad_norm": 0.6492002010345459, + "learning_rate": 3.517107247757403e-05, + "loss": 0.861, + "step": 11210 + }, + { + "epoch": 0.8906705828653079, + "grad_norm": 0.7151293158531189, + "learning_rate": 3.515784181418856e-05, + "loss": 0.8536, + "step": 11220 + }, + { + "epoch": 0.8914644069142075, + "grad_norm": 0.6742676496505737, + "learning_rate": 3.51446111508031e-05, + "loss": 0.8548, + "step": 11230 + }, + { + "epoch": 0.8922582309631071, + "grad_norm": 0.8235064148902893, + "learning_rate": 3.513138048741764e-05, + "loss": 0.8937, + "step": 11240 + }, + { + "epoch": 0.8930520550120066, + "grad_norm": 0.6672957539558411, + "learning_rate": 3.511814982403217e-05, + "loss": 0.8116, + "step": 11250 + }, + { + "epoch": 0.8938458790609062, + "grad_norm": 0.745836079120636, + "learning_rate": 3.510491916064671e-05, + "loss": 0.8183, + "step": 11260 + }, + { + "epoch": 0.8946397031098057, + "grad_norm": 0.93377286195755, + "learning_rate": 3.509168849726126e-05, + "loss": 0.87, + "step": 11270 + }, + { + "epoch": 0.8954335271587053, + "grad_norm": 0.7982069253921509, + "learning_rate": 3.50784578338758e-05, + "loss": 0.8557, + "step": 11280 + }, + { + "epoch": 0.8962273512076049, + "grad_norm": 0.7779275178909302, + "learning_rate": 3.506522717049033e-05, + "loss": 0.7948, + "step": 11290 + }, + { + "epoch": 0.8970211752565044, + "grad_norm": 0.7200841307640076, + "learning_rate": 3.505199650710487e-05, + "loss": 0.8031, + "step": 11300 + }, + { + "epoch": 0.897814999305404, + "grad_norm": 0.7799506783485413, + "learning_rate": 3.503876584371941e-05, + "loss": 0.8692, + "step": 11310 + }, + { + "epoch": 0.8986088233543035, + "grad_norm": 0.6927218437194824, + "learning_rate": 3.5025535180333946e-05, + "loss": 0.8854, + "step": 11320 + }, + { + "epoch": 0.8994026474032031, + "grad_norm": 0.8087165951728821, + "learning_rate": 3.501230451694848e-05, + "loss": 0.8593, + "step": 11330 + }, + { + "epoch": 0.9001964714521027, + "grad_norm": 0.817240834236145, + "learning_rate": 3.499907385356302e-05, + "loss": 0.8312, + "step": 11340 + }, + { + "epoch": 0.9009902955010022, + "grad_norm": 0.6710502505302429, + "learning_rate": 3.4985843190177556e-05, + "loss": 0.883, + "step": 11350 + }, + { + "epoch": 0.9017841195499018, + "grad_norm": 0.6229230761528015, + "learning_rate": 3.4972612526792095e-05, + "loss": 0.885, + "step": 11360 + }, + { + "epoch": 0.9025779435988013, + "grad_norm": 0.7668790221214294, + "learning_rate": 3.4959381863406634e-05, + "loss": 0.8206, + "step": 11370 + }, + { + "epoch": 0.9033717676477009, + "grad_norm": 0.668046236038208, + "learning_rate": 3.494615120002117e-05, + "loss": 0.8866, + "step": 11380 + }, + { + "epoch": 0.9041655916966005, + "grad_norm": 0.7366786003112793, + "learning_rate": 3.493292053663571e-05, + "loss": 0.8091, + "step": 11390 + }, + { + "epoch": 0.9049594157455, + "grad_norm": 0.9512502551078796, + "learning_rate": 3.4919689873250244e-05, + "loss": 0.7877, + "step": 11400 + }, + { + "epoch": 0.9057532397943996, + "grad_norm": 0.8555247783660889, + "learning_rate": 3.490645920986478e-05, + "loss": 0.901, + "step": 11410 + }, + { + "epoch": 0.9065470638432991, + "grad_norm": 0.6285889148712158, + "learning_rate": 3.489322854647932e-05, + "loss": 0.8963, + "step": 11420 + }, + { + "epoch": 0.9073408878921987, + "grad_norm": 0.7358067631721497, + "learning_rate": 3.487999788309386e-05, + "loss": 0.8679, + "step": 11430 + }, + { + "epoch": 0.9081347119410983, + "grad_norm": 0.8041672110557556, + "learning_rate": 3.48667672197084e-05, + "loss": 0.8222, + "step": 11440 + }, + { + "epoch": 0.9089285359899978, + "grad_norm": 0.7826716899871826, + "learning_rate": 3.485353655632294e-05, + "loss": 0.8347, + "step": 11450 + }, + { + "epoch": 0.9097223600388974, + "grad_norm": 0.8296499252319336, + "learning_rate": 3.484030589293748e-05, + "loss": 0.8105, + "step": 11460 + }, + { + "epoch": 0.9105161840877969, + "grad_norm": 0.6100326776504517, + "learning_rate": 3.482707522955201e-05, + "loss": 0.8618, + "step": 11470 + }, + { + "epoch": 0.9113100081366965, + "grad_norm": 0.9201139807701111, + "learning_rate": 3.481384456616655e-05, + "loss": 0.905, + "step": 11480 + }, + { + "epoch": 0.9121038321855961, + "grad_norm": 0.8482776880264282, + "learning_rate": 3.480061390278109e-05, + "loss": 0.8644, + "step": 11490 + }, + { + "epoch": 0.9128976562344956, + "grad_norm": 1.077039122581482, + "learning_rate": 3.4787383239395626e-05, + "loss": 0.848, + "step": 11500 + }, + { + "epoch": 0.9136914802833952, + "grad_norm": 0.6531374454498291, + "learning_rate": 3.477415257601016e-05, + "loss": 0.8716, + "step": 11510 + }, + { + "epoch": 0.9144853043322948, + "grad_norm": 0.7523869276046753, + "learning_rate": 3.47609219126247e-05, + "loss": 0.9086, + "step": 11520 + }, + { + "epoch": 0.9152791283811943, + "grad_norm": 0.7146991491317749, + "learning_rate": 3.4747691249239236e-05, + "loss": 0.8156, + "step": 11530 + }, + { + "epoch": 0.9160729524300939, + "grad_norm": 0.916566789150238, + "learning_rate": 3.473446058585378e-05, + "loss": 0.9163, + "step": 11540 + }, + { + "epoch": 0.9168667764789934, + "grad_norm": 0.8944635987281799, + "learning_rate": 3.4721229922468314e-05, + "loss": 0.8811, + "step": 11550 + }, + { + "epoch": 0.917660600527893, + "grad_norm": 0.7929695248603821, + "learning_rate": 3.470799925908285e-05, + "loss": 0.8048, + "step": 11560 + }, + { + "epoch": 0.9184544245767926, + "grad_norm": 0.6163130402565002, + "learning_rate": 3.469476859569739e-05, + "loss": 0.904, + "step": 11570 + }, + { + "epoch": 0.9192482486256921, + "grad_norm": 0.7213409543037415, + "learning_rate": 3.468153793231193e-05, + "loss": 0.9098, + "step": 11580 + }, + { + "epoch": 0.9200420726745917, + "grad_norm": 0.8145052790641785, + "learning_rate": 3.466830726892646e-05, + "loss": 0.8596, + "step": 11590 + }, + { + "epoch": 0.9208358967234912, + "grad_norm": 0.7872694730758667, + "learning_rate": 3.4655076605541e-05, + "loss": 0.8214, + "step": 11600 + }, + { + "epoch": 0.9216297207723908, + "grad_norm": 0.745549201965332, + "learning_rate": 3.464184594215554e-05, + "loss": 0.885, + "step": 11610 + }, + { + "epoch": 0.9224235448212904, + "grad_norm": 0.8459563255310059, + "learning_rate": 3.462861527877008e-05, + "loss": 0.9197, + "step": 11620 + }, + { + "epoch": 0.9232173688701899, + "grad_norm": 0.9419437646865845, + "learning_rate": 3.461538461538462e-05, + "loss": 0.7556, + "step": 11630 + }, + { + "epoch": 0.9240111929190895, + "grad_norm": 0.7619801759719849, + "learning_rate": 3.460215395199916e-05, + "loss": 0.87, + "step": 11640 + }, + { + "epoch": 0.924805016967989, + "grad_norm": 0.5955981612205505, + "learning_rate": 3.4588923288613696e-05, + "loss": 0.922, + "step": 11650 + }, + { + "epoch": 0.9255988410168886, + "grad_norm": 0.7901360392570496, + "learning_rate": 3.457569262522823e-05, + "loss": 0.8939, + "step": 11660 + }, + { + "epoch": 0.9263926650657882, + "grad_norm": 0.6033757328987122, + "learning_rate": 3.456246196184277e-05, + "loss": 0.7984, + "step": 11670 + }, + { + "epoch": 0.9271864891146877, + "grad_norm": 0.7921754121780396, + "learning_rate": 3.4549231298457306e-05, + "loss": 0.8477, + "step": 11680 + }, + { + "epoch": 0.9279803131635873, + "grad_norm": 0.6191670298576355, + "learning_rate": 3.4536000635071845e-05, + "loss": 0.8468, + "step": 11690 + }, + { + "epoch": 0.9287741372124868, + "grad_norm": 0.6756204962730408, + "learning_rate": 3.452276997168638e-05, + "loss": 0.8113, + "step": 11700 + }, + { + "epoch": 0.9295679612613864, + "grad_norm": 0.8277655839920044, + "learning_rate": 3.450953930830092e-05, + "loss": 0.889, + "step": 11710 + }, + { + "epoch": 0.930361785310286, + "grad_norm": 0.6921846270561218, + "learning_rate": 3.449630864491546e-05, + "loss": 0.9316, + "step": 11720 + }, + { + "epoch": 0.9311556093591855, + "grad_norm": 0.6792823672294617, + "learning_rate": 3.4483077981529994e-05, + "loss": 0.8069, + "step": 11730 + }, + { + "epoch": 0.9319494334080851, + "grad_norm": 0.9177089929580688, + "learning_rate": 3.446984731814453e-05, + "loss": 0.8124, + "step": 11740 + }, + { + "epoch": 0.9327432574569846, + "grad_norm": 0.8078745603561401, + "learning_rate": 3.445661665475907e-05, + "loss": 0.8762, + "step": 11750 + }, + { + "epoch": 0.9335370815058842, + "grad_norm": 0.9108356833457947, + "learning_rate": 3.444338599137361e-05, + "loss": 0.7767, + "step": 11760 + }, + { + "epoch": 0.9343309055547838, + "grad_norm": 0.6663825511932373, + "learning_rate": 3.443015532798814e-05, + "loss": 0.8749, + "step": 11770 + }, + { + "epoch": 0.9351247296036833, + "grad_norm": 0.8398308753967285, + "learning_rate": 3.441692466460268e-05, + "loss": 0.8793, + "step": 11780 + }, + { + "epoch": 0.9359185536525829, + "grad_norm": 0.8784694075584412, + "learning_rate": 3.440369400121722e-05, + "loss": 0.8916, + "step": 11790 + }, + { + "epoch": 0.9367123777014824, + "grad_norm": 0.7957909107208252, + "learning_rate": 3.439046333783176e-05, + "loss": 0.8855, + "step": 11800 + }, + { + "epoch": 0.937506201750382, + "grad_norm": 0.6235615015029907, + "learning_rate": 3.43772326744463e-05, + "loss": 0.9407, + "step": 11810 + }, + { + "epoch": 0.9383000257992816, + "grad_norm": 0.912480354309082, + "learning_rate": 3.436400201106084e-05, + "loss": 0.8935, + "step": 11820 + }, + { + "epoch": 0.9390938498481811, + "grad_norm": 0.8489815592765808, + "learning_rate": 3.435077134767538e-05, + "loss": 0.8482, + "step": 11830 + }, + { + "epoch": 0.9398876738970807, + "grad_norm": 0.6857966780662537, + "learning_rate": 3.433754068428991e-05, + "loss": 0.8982, + "step": 11840 + }, + { + "epoch": 0.9406814979459803, + "grad_norm": 0.6412976384162903, + "learning_rate": 3.432431002090445e-05, + "loss": 0.8496, + "step": 11850 + }, + { + "epoch": 0.9414753219948798, + "grad_norm": 0.7496560215950012, + "learning_rate": 3.431107935751899e-05, + "loss": 0.8434, + "step": 11860 + }, + { + "epoch": 0.9422691460437794, + "grad_norm": 0.7269144058227539, + "learning_rate": 3.4297848694133526e-05, + "loss": 0.8561, + "step": 11870 + }, + { + "epoch": 0.9430629700926789, + "grad_norm": 0.8505829572677612, + "learning_rate": 3.4284618030748065e-05, + "loss": 0.8006, + "step": 11880 + }, + { + "epoch": 0.9438567941415785, + "grad_norm": 0.7966324090957642, + "learning_rate": 3.4271387367362604e-05, + "loss": 0.8968, + "step": 11890 + }, + { + "epoch": 0.9446506181904781, + "grad_norm": 0.7437507510185242, + "learning_rate": 3.425815670397714e-05, + "loss": 0.8193, + "step": 11900 + }, + { + "epoch": 0.9454444422393776, + "grad_norm": 0.8566707968711853, + "learning_rate": 3.424492604059168e-05, + "loss": 0.8084, + "step": 11910 + }, + { + "epoch": 0.9462382662882772, + "grad_norm": 0.6849654316902161, + "learning_rate": 3.4231695377206214e-05, + "loss": 0.8792, + "step": 11920 + }, + { + "epoch": 0.9470320903371767, + "grad_norm": 0.7506431341171265, + "learning_rate": 3.421846471382075e-05, + "loss": 0.8374, + "step": 11930 + }, + { + "epoch": 0.9478259143860763, + "grad_norm": 0.6591388583183289, + "learning_rate": 3.420523405043529e-05, + "loss": 0.9128, + "step": 11940 + }, + { + "epoch": 0.9486197384349759, + "grad_norm": 0.6423110365867615, + "learning_rate": 3.4192003387049824e-05, + "loss": 0.7918, + "step": 11950 + }, + { + "epoch": 0.9494135624838754, + "grad_norm": 0.6780298948287964, + "learning_rate": 3.417877272366436e-05, + "loss": 0.8715, + "step": 11960 + }, + { + "epoch": 0.950207386532775, + "grad_norm": 0.6950411200523376, + "learning_rate": 3.41655420602789e-05, + "loss": 0.8555, + "step": 11970 + }, + { + "epoch": 0.9510012105816745, + "grad_norm": 0.8524368405342102, + "learning_rate": 3.415231139689345e-05, + "loss": 0.8706, + "step": 11980 + }, + { + "epoch": 0.9517950346305741, + "grad_norm": 0.8601171374320984, + "learning_rate": 3.413908073350798e-05, + "loss": 0.8835, + "step": 11990 + }, + { + "epoch": 0.9525888586794737, + "grad_norm": 0.7460906505584717, + "learning_rate": 3.412585007012252e-05, + "loss": 0.8911, + "step": 12000 + }, + { + "epoch": 0.9533826827283732, + "grad_norm": 0.6877840757369995, + "learning_rate": 3.411261940673706e-05, + "loss": 0.8733, + "step": 12010 + }, + { + "epoch": 0.9541765067772728, + "grad_norm": 0.8841610550880432, + "learning_rate": 3.4099388743351596e-05, + "loss": 0.8988, + "step": 12020 + }, + { + "epoch": 0.9549703308261723, + "grad_norm": 0.7192667126655579, + "learning_rate": 3.408615807996613e-05, + "loss": 0.876, + "step": 12030 + }, + { + "epoch": 0.9557641548750719, + "grad_norm": 0.739669144153595, + "learning_rate": 3.407292741658067e-05, + "loss": 0.9147, + "step": 12040 + }, + { + "epoch": 0.9565579789239715, + "grad_norm": 0.9102823138237, + "learning_rate": 3.4059696753195206e-05, + "loss": 0.8335, + "step": 12050 + }, + { + "epoch": 0.957351802972871, + "grad_norm": 0.8623813986778259, + "learning_rate": 3.4046466089809745e-05, + "loss": 0.9067, + "step": 12060 + }, + { + "epoch": 0.9581456270217706, + "grad_norm": 0.6548823714256287, + "learning_rate": 3.4033235426424284e-05, + "loss": 0.8553, + "step": 12070 + }, + { + "epoch": 0.9589394510706701, + "grad_norm": 0.6950840950012207, + "learning_rate": 3.402000476303882e-05, + "loss": 0.8346, + "step": 12080 + }, + { + "epoch": 0.9597332751195697, + "grad_norm": 0.8395444750785828, + "learning_rate": 3.400677409965336e-05, + "loss": 0.8331, + "step": 12090 + }, + { + "epoch": 0.9605270991684693, + "grad_norm": 0.7684609889984131, + "learning_rate": 3.3993543436267894e-05, + "loss": 0.8309, + "step": 12100 + }, + { + "epoch": 0.9613209232173688, + "grad_norm": 0.6737335324287415, + "learning_rate": 3.398031277288243e-05, + "loss": 0.8495, + "step": 12110 + }, + { + "epoch": 0.9621147472662684, + "grad_norm": 0.8039416670799255, + "learning_rate": 3.396708210949697e-05, + "loss": 0.8201, + "step": 12120 + }, + { + "epoch": 0.9629085713151679, + "grad_norm": 0.7795635461807251, + "learning_rate": 3.395385144611151e-05, + "loss": 0.8921, + "step": 12130 + }, + { + "epoch": 0.9637023953640675, + "grad_norm": 0.8788042068481445, + "learning_rate": 3.394062078272604e-05, + "loss": 0.849, + "step": 12140 + }, + { + "epoch": 0.9644962194129671, + "grad_norm": 0.8527970910072327, + "learning_rate": 3.392739011934058e-05, + "loss": 0.8119, + "step": 12150 + }, + { + "epoch": 0.9652900434618666, + "grad_norm": 0.7557294964790344, + "learning_rate": 3.391415945595513e-05, + "loss": 0.8121, + "step": 12160 + }, + { + "epoch": 0.9660838675107662, + "grad_norm": 0.871924877166748, + "learning_rate": 3.390092879256966e-05, + "loss": 0.9318, + "step": 12170 + }, + { + "epoch": 0.9668776915596659, + "grad_norm": 0.6994668245315552, + "learning_rate": 3.38876981291842e-05, + "loss": 0.8465, + "step": 12180 + }, + { + "epoch": 0.9676715156085653, + "grad_norm": 0.7652361989021301, + "learning_rate": 3.387446746579874e-05, + "loss": 0.8967, + "step": 12190 + }, + { + "epoch": 0.968465339657465, + "grad_norm": 0.7152222990989685, + "learning_rate": 3.3861236802413276e-05, + "loss": 0.9297, + "step": 12200 + }, + { + "epoch": 0.9692591637063644, + "grad_norm": 0.8487961292266846, + "learning_rate": 3.384800613902781e-05, + "loss": 0.8412, + "step": 12210 + }, + { + "epoch": 0.970052987755264, + "grad_norm": 0.7079365849494934, + "learning_rate": 3.383477547564235e-05, + "loss": 0.8746, + "step": 12220 + }, + { + "epoch": 0.9708468118041637, + "grad_norm": 0.830172598361969, + "learning_rate": 3.3821544812256886e-05, + "loss": 0.864, + "step": 12230 + }, + { + "epoch": 0.9716406358530632, + "grad_norm": 0.9805923104286194, + "learning_rate": 3.3808314148871425e-05, + "loss": 0.9199, + "step": 12240 + }, + { + "epoch": 0.9724344599019628, + "grad_norm": 0.7402176856994629, + "learning_rate": 3.3795083485485964e-05, + "loss": 0.8274, + "step": 12250 + }, + { + "epoch": 0.9732282839508622, + "grad_norm": 0.7366051077842712, + "learning_rate": 3.37818528221005e-05, + "loss": 0.8966, + "step": 12260 + }, + { + "epoch": 0.9740221079997619, + "grad_norm": 0.8370699286460876, + "learning_rate": 3.376862215871504e-05, + "loss": 0.7642, + "step": 12270 + }, + { + "epoch": 0.9748159320486615, + "grad_norm": 0.8621345162391663, + "learning_rate": 3.3755391495329574e-05, + "loss": 0.8868, + "step": 12280 + }, + { + "epoch": 0.975609756097561, + "grad_norm": 0.6125873327255249, + "learning_rate": 3.374216083194411e-05, + "loss": 0.8572, + "step": 12290 + }, + { + "epoch": 0.9764035801464606, + "grad_norm": 0.708325207233429, + "learning_rate": 3.372893016855865e-05, + "loss": 0.811, + "step": 12300 + }, + { + "epoch": 0.97719740419536, + "grad_norm": 0.9190065264701843, + "learning_rate": 3.371569950517319e-05, + "loss": 0.7922, + "step": 12310 + }, + { + "epoch": 0.9779912282442597, + "grad_norm": 0.6722436547279358, + "learning_rate": 3.370246884178772e-05, + "loss": 0.8409, + "step": 12320 + }, + { + "epoch": 0.9787850522931593, + "grad_norm": 0.6370068788528442, + "learning_rate": 3.368923817840227e-05, + "loss": 0.8367, + "step": 12330 + }, + { + "epoch": 0.9795788763420588, + "grad_norm": 0.7187243103981018, + "learning_rate": 3.367600751501681e-05, + "loss": 0.8673, + "step": 12340 + }, + { + "epoch": 0.9803727003909584, + "grad_norm": 0.6579310297966003, + "learning_rate": 3.366277685163135e-05, + "loss": 0.8582, + "step": 12350 + }, + { + "epoch": 0.9811665244398579, + "grad_norm": 0.9233648777008057, + "learning_rate": 3.364954618824588e-05, + "loss": 0.8838, + "step": 12360 + }, + { + "epoch": 0.9819603484887575, + "grad_norm": 0.8820499181747437, + "learning_rate": 3.363631552486042e-05, + "loss": 0.8572, + "step": 12370 + }, + { + "epoch": 0.9827541725376571, + "grad_norm": 0.7715148329734802, + "learning_rate": 3.362308486147496e-05, + "loss": 0.8855, + "step": 12380 + }, + { + "epoch": 0.9835479965865566, + "grad_norm": 0.85701984167099, + "learning_rate": 3.3609854198089496e-05, + "loss": 0.8448, + "step": 12390 + }, + { + "epoch": 0.9843418206354562, + "grad_norm": 0.8934018015861511, + "learning_rate": 3.359662353470403e-05, + "loss": 0.7882, + "step": 12400 + }, + { + "epoch": 0.9851356446843557, + "grad_norm": 0.7508744597434998, + "learning_rate": 3.358339287131857e-05, + "loss": 0.8488, + "step": 12410 + }, + { + "epoch": 0.9859294687332553, + "grad_norm": 0.7345721125602722, + "learning_rate": 3.3570162207933106e-05, + "loss": 0.8662, + "step": 12420 + }, + { + "epoch": 0.9867232927821549, + "grad_norm": 0.7805215120315552, + "learning_rate": 3.3556931544547645e-05, + "loss": 0.7834, + "step": 12430 + }, + { + "epoch": 0.9875171168310544, + "grad_norm": 0.8609070181846619, + "learning_rate": 3.3543700881162184e-05, + "loss": 0.7946, + "step": 12440 + }, + { + "epoch": 0.988310940879954, + "grad_norm": 0.8504823446273804, + "learning_rate": 3.353047021777672e-05, + "loss": 0.8471, + "step": 12450 + }, + { + "epoch": 0.9891047649288536, + "grad_norm": 0.800946831703186, + "learning_rate": 3.351723955439126e-05, + "loss": 0.8686, + "step": 12460 + }, + { + "epoch": 0.9898985889777531, + "grad_norm": 0.770326554775238, + "learning_rate": 3.3504008891005794e-05, + "loss": 0.7954, + "step": 12470 + }, + { + "epoch": 0.9906924130266527, + "grad_norm": 0.7792863845825195, + "learning_rate": 3.349077822762033e-05, + "loss": 0.8636, + "step": 12480 + }, + { + "epoch": 0.9914862370755522, + "grad_norm": 0.7109197974205017, + "learning_rate": 3.347754756423487e-05, + "loss": 0.9198, + "step": 12490 + }, + { + "epoch": 0.9922800611244518, + "grad_norm": 0.6527384519577026, + "learning_rate": 3.346431690084941e-05, + "loss": 0.8457, + "step": 12500 + }, + { + "epoch": 0.9930738851733514, + "grad_norm": 0.9128293395042419, + "learning_rate": 3.345108623746395e-05, + "loss": 0.9064, + "step": 12510 + }, + { + "epoch": 0.9938677092222509, + "grad_norm": 0.7357638478279114, + "learning_rate": 3.343785557407849e-05, + "loss": 0.8497, + "step": 12520 + }, + { + "epoch": 0.9946615332711505, + "grad_norm": 0.8037011027336121, + "learning_rate": 3.342462491069303e-05, + "loss": 0.8659, + "step": 12530 + }, + { + "epoch": 0.99545535732005, + "grad_norm": 0.8320547938346863, + "learning_rate": 3.341139424730756e-05, + "loss": 0.8856, + "step": 12540 + }, + { + "epoch": 0.9962491813689496, + "grad_norm": 0.7249430418014526, + "learning_rate": 3.33981635839221e-05, + "loss": 0.8472, + "step": 12550 + }, + { + "epoch": 0.9970430054178492, + "grad_norm": 0.7746173143386841, + "learning_rate": 3.338493292053664e-05, + "loss": 0.8397, + "step": 12560 + }, + { + "epoch": 0.9978368294667487, + "grad_norm": 0.7750112414360046, + "learning_rate": 3.3371702257151176e-05, + "loss": 0.8164, + "step": 12570 + }, + { + "epoch": 0.9986306535156483, + "grad_norm": 0.9370802044868469, + "learning_rate": 3.335847159376571e-05, + "loss": 0.9081, + "step": 12580 + }, + { + "epoch": 0.9994244775645478, + "grad_norm": 0.7398301362991333, + "learning_rate": 3.334524093038025e-05, + "loss": 0.8786, + "step": 12590 + }, + { + "epoch": 1.0002183016134474, + "grad_norm": 0.811583399772644, + "learning_rate": 3.333201026699479e-05, + "loss": 0.7198, + "step": 12600 + }, + { + "epoch": 1.001012125662347, + "grad_norm": 0.8118659853935242, + "learning_rate": 3.331877960360933e-05, + "loss": 0.8307, + "step": 12610 + }, + { + "epoch": 1.0018059497112466, + "grad_norm": 0.6908761858940125, + "learning_rate": 3.3305548940223864e-05, + "loss": 0.8154, + "step": 12620 + }, + { + "epoch": 1.002599773760146, + "grad_norm": 0.5952050685882568, + "learning_rate": 3.32923182768384e-05, + "loss": 0.8398, + "step": 12630 + }, + { + "epoch": 1.0033935978090456, + "grad_norm": 0.7398274540901184, + "learning_rate": 3.327908761345294e-05, + "loss": 0.8172, + "step": 12640 + }, + { + "epoch": 1.0041874218579452, + "grad_norm": 0.6245648860931396, + "learning_rate": 3.3265856950067474e-05, + "loss": 0.8967, + "step": 12650 + }, + { + "epoch": 1.0049812459068448, + "grad_norm": 0.7542557716369629, + "learning_rate": 3.325262628668201e-05, + "loss": 0.763, + "step": 12660 + }, + { + "epoch": 1.0057750699557444, + "grad_norm": 0.6645131707191467, + "learning_rate": 3.323939562329655e-05, + "loss": 0.8822, + "step": 12670 + }, + { + "epoch": 1.0065688940046438, + "grad_norm": 0.8816360235214233, + "learning_rate": 3.322616495991109e-05, + "loss": 0.879, + "step": 12680 + }, + { + "epoch": 1.0073627180535434, + "grad_norm": 0.8549253344535828, + "learning_rate": 3.321293429652563e-05, + "loss": 0.836, + "step": 12690 + }, + { + "epoch": 1.008156542102443, + "grad_norm": 0.95768141746521, + "learning_rate": 3.319970363314017e-05, + "loss": 0.842, + "step": 12700 + }, + { + "epoch": 1.0089503661513426, + "grad_norm": 0.8039479851722717, + "learning_rate": 3.318647296975471e-05, + "loss": 0.7732, + "step": 12710 + }, + { + "epoch": 1.0097441902002422, + "grad_norm": 0.724469006061554, + "learning_rate": 3.3173242306369246e-05, + "loss": 0.797, + "step": 12720 + }, + { + "epoch": 1.0105380142491416, + "grad_norm": 0.7534793019294739, + "learning_rate": 3.316001164298378e-05, + "loss": 0.8058, + "step": 12730 + }, + { + "epoch": 1.0113318382980412, + "grad_norm": 0.9147580862045288, + "learning_rate": 3.314678097959832e-05, + "loss": 0.819, + "step": 12740 + }, + { + "epoch": 1.0121256623469408, + "grad_norm": 0.8486768007278442, + "learning_rate": 3.3133550316212856e-05, + "loss": 0.7997, + "step": 12750 + }, + { + "epoch": 1.0129194863958404, + "grad_norm": 0.7328081130981445, + "learning_rate": 3.312031965282739e-05, + "loss": 0.8433, + "step": 12760 + }, + { + "epoch": 1.01371331044474, + "grad_norm": 0.9267632961273193, + "learning_rate": 3.3107088989441934e-05, + "loss": 0.8013, + "step": 12770 + }, + { + "epoch": 1.0145071344936394, + "grad_norm": 0.7737709879875183, + "learning_rate": 3.309385832605647e-05, + "loss": 0.8186, + "step": 12780 + }, + { + "epoch": 1.015300958542539, + "grad_norm": 0.6647375822067261, + "learning_rate": 3.308062766267101e-05, + "loss": 0.8904, + "step": 12790 + }, + { + "epoch": 1.0160947825914386, + "grad_norm": 0.7600811123847961, + "learning_rate": 3.3067396999285544e-05, + "loss": 0.8189, + "step": 12800 + }, + { + "epoch": 1.0168886066403382, + "grad_norm": 0.7824926972389221, + "learning_rate": 3.305416633590008e-05, + "loss": 0.8189, + "step": 12810 + }, + { + "epoch": 1.0176824306892378, + "grad_norm": 0.5769202709197998, + "learning_rate": 3.304093567251462e-05, + "loss": 0.8361, + "step": 12820 + }, + { + "epoch": 1.0184762547381372, + "grad_norm": 0.8311686515808105, + "learning_rate": 3.302770500912916e-05, + "loss": 0.8081, + "step": 12830 + }, + { + "epoch": 1.0192700787870368, + "grad_norm": 0.7909318804740906, + "learning_rate": 3.301447434574369e-05, + "loss": 0.7855, + "step": 12840 + }, + { + "epoch": 1.0200639028359364, + "grad_norm": 0.8774951100349426, + "learning_rate": 3.300124368235823e-05, + "loss": 0.8729, + "step": 12850 + }, + { + "epoch": 1.020857726884836, + "grad_norm": 0.747606635093689, + "learning_rate": 3.298801301897277e-05, + "loss": 0.8321, + "step": 12860 + }, + { + "epoch": 1.0216515509337356, + "grad_norm": 0.8213605880737305, + "learning_rate": 3.297478235558731e-05, + "loss": 0.8039, + "step": 12870 + }, + { + "epoch": 1.022445374982635, + "grad_norm": 0.7250701189041138, + "learning_rate": 3.296155169220185e-05, + "loss": 0.7972, + "step": 12880 + }, + { + "epoch": 1.0232391990315346, + "grad_norm": 0.7011380195617676, + "learning_rate": 3.294832102881639e-05, + "loss": 0.8402, + "step": 12890 + }, + { + "epoch": 1.0240330230804342, + "grad_norm": 0.9204203486442566, + "learning_rate": 3.293509036543093e-05, + "loss": 0.8623, + "step": 12900 + }, + { + "epoch": 1.0248268471293338, + "grad_norm": 0.7364963293075562, + "learning_rate": 3.292185970204546e-05, + "loss": 0.8065, + "step": 12910 + }, + { + "epoch": 1.0256206711782334, + "grad_norm": 0.8210947513580322, + "learning_rate": 3.290862903866e-05, + "loss": 0.8116, + "step": 12920 + }, + { + "epoch": 1.026414495227133, + "grad_norm": 0.8079206347465515, + "learning_rate": 3.289539837527454e-05, + "loss": 0.8278, + "step": 12930 + }, + { + "epoch": 1.0272083192760324, + "grad_norm": 0.8758916258811951, + "learning_rate": 3.2882167711889076e-05, + "loss": 0.8202, + "step": 12940 + }, + { + "epoch": 1.028002143324932, + "grad_norm": 0.8359827399253845, + "learning_rate": 3.2868937048503615e-05, + "loss": 0.7738, + "step": 12950 + }, + { + "epoch": 1.0287959673738316, + "grad_norm": 0.7513866424560547, + "learning_rate": 3.2855706385118154e-05, + "loss": 0.8644, + "step": 12960 + }, + { + "epoch": 1.0295897914227312, + "grad_norm": 0.8743990063667297, + "learning_rate": 3.284247572173269e-05, + "loss": 0.8927, + "step": 12970 + }, + { + "epoch": 1.0303836154716308, + "grad_norm": 0.8003587126731873, + "learning_rate": 3.2829245058347225e-05, + "loss": 0.8206, + "step": 12980 + }, + { + "epoch": 1.0311774395205302, + "grad_norm": 0.7568554878234863, + "learning_rate": 3.2816014394961764e-05, + "loss": 0.8321, + "step": 12990 + }, + { + "epoch": 1.0319712635694298, + "grad_norm": 0.7184220552444458, + "learning_rate": 3.28027837315763e-05, + "loss": 0.8007, + "step": 13000 + }, + { + "epoch": 1.0327650876183294, + "grad_norm": 0.7958794236183167, + "learning_rate": 3.278955306819084e-05, + "loss": 0.886, + "step": 13010 + }, + { + "epoch": 1.033558911667229, + "grad_norm": 0.8230629563331604, + "learning_rate": 3.2776322404805374e-05, + "loss": 0.8296, + "step": 13020 + }, + { + "epoch": 1.0343527357161286, + "grad_norm": 0.8877654671669006, + "learning_rate": 3.276309174141991e-05, + "loss": 0.8461, + "step": 13030 + }, + { + "epoch": 1.035146559765028, + "grad_norm": 0.7817438244819641, + "learning_rate": 3.274986107803446e-05, + "loss": 0.8301, + "step": 13040 + }, + { + "epoch": 1.0359403838139276, + "grad_norm": 0.9634678959846497, + "learning_rate": 3.2736630414649e-05, + "loss": 0.8447, + "step": 13050 + }, + { + "epoch": 1.0367342078628272, + "grad_norm": 0.8039039373397827, + "learning_rate": 3.272339975126353e-05, + "loss": 0.8416, + "step": 13060 + }, + { + "epoch": 1.0375280319117268, + "grad_norm": 0.813490092754364, + "learning_rate": 3.271016908787807e-05, + "loss": 0.8281, + "step": 13070 + }, + { + "epoch": 1.0383218559606264, + "grad_norm": 0.8581125736236572, + "learning_rate": 3.269693842449261e-05, + "loss": 0.8527, + "step": 13080 + }, + { + "epoch": 1.0391156800095258, + "grad_norm": 0.7691791653633118, + "learning_rate": 3.268370776110714e-05, + "loss": 0.8385, + "step": 13090 + }, + { + "epoch": 1.0399095040584254, + "grad_norm": 0.788347601890564, + "learning_rate": 3.267047709772168e-05, + "loss": 0.8372, + "step": 13100 + }, + { + "epoch": 1.040703328107325, + "grad_norm": 0.8846760392189026, + "learning_rate": 3.265724643433622e-05, + "loss": 0.8794, + "step": 13110 + }, + { + "epoch": 1.0414971521562246, + "grad_norm": 0.8345226645469666, + "learning_rate": 3.2644015770950756e-05, + "loss": 0.848, + "step": 13120 + }, + { + "epoch": 1.0422909762051242, + "grad_norm": 0.7519625425338745, + "learning_rate": 3.2630785107565295e-05, + "loss": 0.8409, + "step": 13130 + }, + { + "epoch": 1.0430848002540236, + "grad_norm": 0.7269890904426575, + "learning_rate": 3.2617554444179834e-05, + "loss": 0.8638, + "step": 13140 + }, + { + "epoch": 1.0438786243029232, + "grad_norm": 0.8326950669288635, + "learning_rate": 3.260432378079437e-05, + "loss": 0.7581, + "step": 13150 + }, + { + "epoch": 1.0446724483518228, + "grad_norm": 0.8327310681343079, + "learning_rate": 3.259109311740891e-05, + "loss": 0.834, + "step": 13160 + }, + { + "epoch": 1.0454662724007224, + "grad_norm": 0.9904863238334656, + "learning_rate": 3.2577862454023444e-05, + "loss": 0.7524, + "step": 13170 + }, + { + "epoch": 1.046260096449622, + "grad_norm": 0.9039106965065002, + "learning_rate": 3.256463179063798e-05, + "loss": 0.7784, + "step": 13180 + }, + { + "epoch": 1.0470539204985214, + "grad_norm": 0.6900113821029663, + "learning_rate": 3.255140112725252e-05, + "loss": 0.8065, + "step": 13190 + }, + { + "epoch": 1.047847744547421, + "grad_norm": 0.8624986410140991, + "learning_rate": 3.253817046386706e-05, + "loss": 0.8827, + "step": 13200 + }, + { + "epoch": 1.0486415685963206, + "grad_norm": 0.7626116275787354, + "learning_rate": 3.25249398004816e-05, + "loss": 0.8464, + "step": 13210 + }, + { + "epoch": 1.0494353926452202, + "grad_norm": 0.7918316721916199, + "learning_rate": 3.251170913709614e-05, + "loss": 0.8258, + "step": 13220 + }, + { + "epoch": 1.0502292166941198, + "grad_norm": 0.7877132892608643, + "learning_rate": 3.249847847371068e-05, + "loss": 0.8688, + "step": 13230 + }, + { + "epoch": 1.0510230407430192, + "grad_norm": 0.7505654096603394, + "learning_rate": 3.248524781032521e-05, + "loss": 0.8173, + "step": 13240 + }, + { + "epoch": 1.0518168647919188, + "grad_norm": 0.8359394073486328, + "learning_rate": 3.247201714693975e-05, + "loss": 0.8227, + "step": 13250 + }, + { + "epoch": 1.0526106888408184, + "grad_norm": 0.9003983736038208, + "learning_rate": 3.245878648355429e-05, + "loss": 0.7917, + "step": 13260 + }, + { + "epoch": 1.053404512889718, + "grad_norm": 0.7913084030151367, + "learning_rate": 3.2445555820168826e-05, + "loss": 0.8602, + "step": 13270 + }, + { + "epoch": 1.0541983369386176, + "grad_norm": 0.7529349327087402, + "learning_rate": 3.243232515678336e-05, + "loss": 0.7734, + "step": 13280 + }, + { + "epoch": 1.054992160987517, + "grad_norm": 0.7655823230743408, + "learning_rate": 3.24190944933979e-05, + "loss": 0.8708, + "step": 13290 + }, + { + "epoch": 1.0557859850364166, + "grad_norm": 0.7675358057022095, + "learning_rate": 3.2405863830012436e-05, + "loss": 0.7815, + "step": 13300 + }, + { + "epoch": 1.0565798090853162, + "grad_norm": 0.8172122240066528, + "learning_rate": 3.239263316662698e-05, + "loss": 0.8038, + "step": 13310 + }, + { + "epoch": 1.0573736331342158, + "grad_norm": 0.8098911643028259, + "learning_rate": 3.2379402503241514e-05, + "loss": 0.8107, + "step": 13320 + }, + { + "epoch": 1.0581674571831154, + "grad_norm": 0.7456363439559937, + "learning_rate": 3.236617183985605e-05, + "loss": 0.8484, + "step": 13330 + }, + { + "epoch": 1.0589612812320148, + "grad_norm": 0.8800321221351624, + "learning_rate": 3.235294117647059e-05, + "loss": 0.7985, + "step": 13340 + }, + { + "epoch": 1.0597551052809144, + "grad_norm": 0.7727877497673035, + "learning_rate": 3.2339710513085124e-05, + "loss": 0.8827, + "step": 13350 + }, + { + "epoch": 1.060548929329814, + "grad_norm": 0.8835548162460327, + "learning_rate": 3.232647984969966e-05, + "loss": 0.7698, + "step": 13360 + }, + { + "epoch": 1.0613427533787136, + "grad_norm": 0.7106403708457947, + "learning_rate": 3.23132491863142e-05, + "loss": 0.863, + "step": 13370 + }, + { + "epoch": 1.0621365774276132, + "grad_norm": 0.6507337689399719, + "learning_rate": 3.230001852292874e-05, + "loss": 0.7971, + "step": 13380 + }, + { + "epoch": 1.0629304014765126, + "grad_norm": 0.7235475778579712, + "learning_rate": 3.228678785954328e-05, + "loss": 0.7664, + "step": 13390 + }, + { + "epoch": 1.0637242255254122, + "grad_norm": 0.7101955413818359, + "learning_rate": 3.227355719615782e-05, + "loss": 0.7829, + "step": 13400 + }, + { + "epoch": 1.0645180495743118, + "grad_norm": 0.7536014318466187, + "learning_rate": 3.226032653277236e-05, + "loss": 0.8732, + "step": 13410 + }, + { + "epoch": 1.0653118736232114, + "grad_norm": 0.7618986368179321, + "learning_rate": 3.22470958693869e-05, + "loss": 0.8343, + "step": 13420 + }, + { + "epoch": 1.066105697672111, + "grad_norm": 0.8681114315986633, + "learning_rate": 3.223386520600143e-05, + "loss": 0.8696, + "step": 13430 + }, + { + "epoch": 1.0668995217210104, + "grad_norm": 0.9769225120544434, + "learning_rate": 3.222063454261597e-05, + "loss": 0.8216, + "step": 13440 + }, + { + "epoch": 1.06769334576991, + "grad_norm": 0.775951087474823, + "learning_rate": 3.220872694556905e-05, + "loss": 0.8409, + "step": 13450 + }, + { + "epoch": 1.0684871698188096, + "grad_norm": 0.7240128517150879, + "learning_rate": 3.219549628218359e-05, + "loss": 0.881, + "step": 13460 + }, + { + "epoch": 1.0692809938677093, + "grad_norm": 0.7799546122550964, + "learning_rate": 3.218226561879813e-05, + "loss": 0.8269, + "step": 13470 + }, + { + "epoch": 1.0700748179166089, + "grad_norm": 0.7071607112884521, + "learning_rate": 3.2169034955412666e-05, + "loss": 0.8752, + "step": 13480 + }, + { + "epoch": 1.0708686419655082, + "grad_norm": 0.8684601783752441, + "learning_rate": 3.2155804292027205e-05, + "loss": 0.8156, + "step": 13490 + }, + { + "epoch": 1.0716624660144078, + "grad_norm": 0.7755899429321289, + "learning_rate": 3.2142573628641743e-05, + "loss": 0.8075, + "step": 13500 + }, + { + "epoch": 1.0724562900633074, + "grad_norm": 0.7392835021018982, + "learning_rate": 3.212934296525628e-05, + "loss": 0.8711, + "step": 13510 + }, + { + "epoch": 1.073250114112207, + "grad_norm": 0.8979812860488892, + "learning_rate": 3.2116112301870815e-05, + "loss": 0.8296, + "step": 13520 + }, + { + "epoch": 1.0740439381611067, + "grad_norm": 0.9251773357391357, + "learning_rate": 3.2102881638485354e-05, + "loss": 0.8392, + "step": 13530 + }, + { + "epoch": 1.0748377622100063, + "grad_norm": 0.6050375699996948, + "learning_rate": 3.208965097509989e-05, + "loss": 0.8432, + "step": 13540 + }, + { + "epoch": 1.0756315862589056, + "grad_norm": 0.8619400858879089, + "learning_rate": 3.207642031171443e-05, + "loss": 0.8651, + "step": 13550 + }, + { + "epoch": 1.0764254103078053, + "grad_norm": 0.9240981340408325, + "learning_rate": 3.206318964832897e-05, + "loss": 0.8467, + "step": 13560 + }, + { + "epoch": 1.0772192343567049, + "grad_norm": 0.9178832173347473, + "learning_rate": 3.204995898494351e-05, + "loss": 0.8079, + "step": 13570 + }, + { + "epoch": 1.0780130584056045, + "grad_norm": 0.7604396343231201, + "learning_rate": 3.203672832155805e-05, + "loss": 0.7611, + "step": 13580 + }, + { + "epoch": 1.0788068824545038, + "grad_norm": 0.5721964836120605, + "learning_rate": 3.202349765817258e-05, + "loss": 0.7863, + "step": 13590 + }, + { + "epoch": 1.0796007065034035, + "grad_norm": 0.5710390210151672, + "learning_rate": 3.201026699478712e-05, + "loss": 0.873, + "step": 13600 + }, + { + "epoch": 1.080394530552303, + "grad_norm": 0.72386234998703, + "learning_rate": 3.199703633140166e-05, + "loss": 0.866, + "step": 13610 + }, + { + "epoch": 1.0811883546012027, + "grad_norm": 0.6764307618141174, + "learning_rate": 3.19838056680162e-05, + "loss": 0.8335, + "step": 13620 + }, + { + "epoch": 1.0819821786501023, + "grad_norm": 0.796247124671936, + "learning_rate": 3.197057500463073e-05, + "loss": 0.8253, + "step": 13630 + }, + { + "epoch": 1.0827760026990019, + "grad_norm": 0.8703547120094299, + "learning_rate": 3.195734434124527e-05, + "loss": 0.8459, + "step": 13640 + }, + { + "epoch": 1.0835698267479013, + "grad_norm": 0.6727367639541626, + "learning_rate": 3.194411367785981e-05, + "loss": 0.8283, + "step": 13650 + }, + { + "epoch": 1.0843636507968009, + "grad_norm": 0.8048222064971924, + "learning_rate": 3.193088301447435e-05, + "loss": 0.8383, + "step": 13660 + }, + { + "epoch": 1.0851574748457005, + "grad_norm": 0.8705745339393616, + "learning_rate": 3.1917652351088885e-05, + "loss": 0.8146, + "step": 13670 + }, + { + "epoch": 1.0859512988946, + "grad_norm": 0.7636762261390686, + "learning_rate": 3.1904421687703424e-05, + "loss": 0.9398, + "step": 13680 + }, + { + "epoch": 1.0867451229434997, + "grad_norm": 0.806253969669342, + "learning_rate": 3.189119102431796e-05, + "loss": 0.8356, + "step": 13690 + }, + { + "epoch": 1.087538946992399, + "grad_norm": 0.8001126646995544, + "learning_rate": 3.1877960360932495e-05, + "loss": 0.8216, + "step": 13700 + }, + { + "epoch": 1.0883327710412987, + "grad_norm": 0.8212706446647644, + "learning_rate": 3.1864729697547034e-05, + "loss": 0.8063, + "step": 13710 + }, + { + "epoch": 1.0891265950901983, + "grad_norm": 0.8403292894363403, + "learning_rate": 3.185149903416157e-05, + "loss": 0.7797, + "step": 13720 + }, + { + "epoch": 1.0899204191390979, + "grad_norm": 0.876654326915741, + "learning_rate": 3.183826837077611e-05, + "loss": 0.8569, + "step": 13730 + }, + { + "epoch": 1.0907142431879975, + "grad_norm": 0.69620680809021, + "learning_rate": 3.182503770739065e-05, + "loss": 0.8604, + "step": 13740 + }, + { + "epoch": 1.0915080672368969, + "grad_norm": 0.6936290860176086, + "learning_rate": 3.181180704400519e-05, + "loss": 0.8722, + "step": 13750 + }, + { + "epoch": 1.0923018912857965, + "grad_norm": 0.848301112651825, + "learning_rate": 3.179857638061973e-05, + "loss": 0.8742, + "step": 13760 + }, + { + "epoch": 1.093095715334696, + "grad_norm": 0.9208377003669739, + "learning_rate": 3.178534571723427e-05, + "loss": 0.8333, + "step": 13770 + }, + { + "epoch": 1.0938895393835957, + "grad_norm": 0.8173259496688843, + "learning_rate": 3.17721150538488e-05, + "loss": 0.8459, + "step": 13780 + }, + { + "epoch": 1.0946833634324953, + "grad_norm": 0.8618404865264893, + "learning_rate": 3.175888439046334e-05, + "loss": 0.8343, + "step": 13790 + }, + { + "epoch": 1.0954771874813947, + "grad_norm": 0.7455660104751587, + "learning_rate": 3.174565372707788e-05, + "loss": 0.7877, + "step": 13800 + }, + { + "epoch": 1.0962710115302943, + "grad_norm": 0.7906826734542847, + "learning_rate": 3.173242306369241e-05, + "loss": 0.8711, + "step": 13810 + }, + { + "epoch": 1.0970648355791939, + "grad_norm": 0.8172614574432373, + "learning_rate": 3.171919240030695e-05, + "loss": 0.8373, + "step": 13820 + }, + { + "epoch": 1.0978586596280935, + "grad_norm": 0.7989473938941956, + "learning_rate": 3.1705961736921494e-05, + "loss": 0.8162, + "step": 13830 + }, + { + "epoch": 1.098652483676993, + "grad_norm": 0.8977952003479004, + "learning_rate": 3.169273107353603e-05, + "loss": 0.8057, + "step": 13840 + }, + { + "epoch": 1.0994463077258925, + "grad_norm": 0.6931014060974121, + "learning_rate": 3.1679500410150565e-05, + "loss": 0.8232, + "step": 13850 + }, + { + "epoch": 1.100240131774792, + "grad_norm": 0.8068207502365112, + "learning_rate": 3.1666269746765104e-05, + "loss": 0.8804, + "step": 13860 + }, + { + "epoch": 1.1010339558236917, + "grad_norm": 0.7539369463920593, + "learning_rate": 3.165303908337964e-05, + "loss": 0.8351, + "step": 13870 + }, + { + "epoch": 1.1018277798725913, + "grad_norm": 0.7581447958946228, + "learning_rate": 3.163980841999418e-05, + "loss": 0.846, + "step": 13880 + }, + { + "epoch": 1.102621603921491, + "grad_norm": 0.9124611616134644, + "learning_rate": 3.1626577756608714e-05, + "loss": 0.8507, + "step": 13890 + }, + { + "epoch": 1.1034154279703903, + "grad_norm": 0.7561845779418945, + "learning_rate": 3.161334709322325e-05, + "loss": 0.7908, + "step": 13900 + }, + { + "epoch": 1.1042092520192899, + "grad_norm": 0.718763530254364, + "learning_rate": 3.160011642983779e-05, + "loss": 0.832, + "step": 13910 + }, + { + "epoch": 1.1050030760681895, + "grad_norm": 0.873539924621582, + "learning_rate": 3.158688576645233e-05, + "loss": 0.7854, + "step": 13920 + }, + { + "epoch": 1.105796900117089, + "grad_norm": 0.8948822021484375, + "learning_rate": 3.157365510306687e-05, + "loss": 0.7936, + "step": 13930 + }, + { + "epoch": 1.1065907241659887, + "grad_norm": 0.7010993957519531, + "learning_rate": 3.156042443968141e-05, + "loss": 0.7964, + "step": 13940 + }, + { + "epoch": 1.107384548214888, + "grad_norm": 0.8644427061080933, + "learning_rate": 3.154719377629595e-05, + "loss": 0.8249, + "step": 13950 + }, + { + "epoch": 1.1081783722637877, + "grad_norm": 0.8382445573806763, + "learning_rate": 3.153396311291048e-05, + "loss": 0.8446, + "step": 13960 + }, + { + "epoch": 1.1089721963126873, + "grad_norm": 0.9029629230499268, + "learning_rate": 3.152073244952502e-05, + "loss": 0.8784, + "step": 13970 + }, + { + "epoch": 1.109766020361587, + "grad_norm": 0.8613706827163696, + "learning_rate": 3.150750178613956e-05, + "loss": 0.7617, + "step": 13980 + }, + { + "epoch": 1.1105598444104865, + "grad_norm": 1.0319517850875854, + "learning_rate": 3.14942711227541e-05, + "loss": 0.8107, + "step": 13990 + }, + { + "epoch": 1.1113536684593859, + "grad_norm": 0.8257167339324951, + "learning_rate": 3.1481040459368636e-05, + "loss": 0.7137, + "step": 14000 + }, + { + "epoch": 1.1121474925082855, + "grad_norm": 0.8850013017654419, + "learning_rate": 3.1467809795983175e-05, + "loss": 0.8005, + "step": 14010 + }, + { + "epoch": 1.112941316557185, + "grad_norm": 0.7616755366325378, + "learning_rate": 3.1454579132597713e-05, + "loss": 0.7917, + "step": 14020 + }, + { + "epoch": 1.1137351406060847, + "grad_norm": 0.891237735748291, + "learning_rate": 3.1441348469212246e-05, + "loss": 0.7803, + "step": 14030 + }, + { + "epoch": 1.1145289646549843, + "grad_norm": 0.7421796917915344, + "learning_rate": 3.1428117805826785e-05, + "loss": 0.8219, + "step": 14040 + }, + { + "epoch": 1.1153227887038837, + "grad_norm": 0.8168928027153015, + "learning_rate": 3.1414887142441324e-05, + "loss": 0.7825, + "step": 14050 + }, + { + "epoch": 1.1161166127527833, + "grad_norm": 0.7771959900856018, + "learning_rate": 3.140165647905586e-05, + "loss": 0.8165, + "step": 14060 + }, + { + "epoch": 1.116910436801683, + "grad_norm": 0.7545336484909058, + "learning_rate": 3.1388425815670395e-05, + "loss": 0.9121, + "step": 14070 + }, + { + "epoch": 1.1177042608505825, + "grad_norm": 0.626607358455658, + "learning_rate": 3.1375195152284934e-05, + "loss": 0.8899, + "step": 14080 + }, + { + "epoch": 1.118498084899482, + "grad_norm": 0.8673813343048096, + "learning_rate": 3.136196448889947e-05, + "loss": 0.8339, + "step": 14090 + }, + { + "epoch": 1.1192919089483815, + "grad_norm": 0.8372191786766052, + "learning_rate": 3.134873382551402e-05, + "loss": 0.7954, + "step": 14100 + }, + { + "epoch": 1.120085732997281, + "grad_norm": 0.9092103838920593, + "learning_rate": 3.133550316212855e-05, + "loss": 0.8326, + "step": 14110 + }, + { + "epoch": 1.1208795570461807, + "grad_norm": 0.7805484533309937, + "learning_rate": 3.132227249874309e-05, + "loss": 0.799, + "step": 14120 + }, + { + "epoch": 1.1216733810950803, + "grad_norm": 0.8048092126846313, + "learning_rate": 3.130904183535763e-05, + "loss": 0.8497, + "step": 14130 + }, + { + "epoch": 1.12246720514398, + "grad_norm": 0.922804594039917, + "learning_rate": 3.129581117197217e-05, + "loss": 0.7648, + "step": 14140 + }, + { + "epoch": 1.1232610291928795, + "grad_norm": 0.7149479985237122, + "learning_rate": 3.12825805085867e-05, + "loss": 0.8884, + "step": 14150 + }, + { + "epoch": 1.124054853241779, + "grad_norm": 0.6718799471855164, + "learning_rate": 3.126934984520124e-05, + "loss": 0.7914, + "step": 14160 + }, + { + "epoch": 1.1248486772906785, + "grad_norm": 0.6949440836906433, + "learning_rate": 3.125611918181578e-05, + "loss": 0.8319, + "step": 14170 + }, + { + "epoch": 1.125642501339578, + "grad_norm": 0.8410897254943848, + "learning_rate": 3.1242888518430316e-05, + "loss": 0.8594, + "step": 14180 + }, + { + "epoch": 1.1264363253884777, + "grad_norm": 0.6044743657112122, + "learning_rate": 3.1229657855044855e-05, + "loss": 0.8168, + "step": 14190 + }, + { + "epoch": 1.127230149437377, + "grad_norm": 1.078133225440979, + "learning_rate": 3.1216427191659394e-05, + "loss": 0.834, + "step": 14200 + }, + { + "epoch": 1.1280239734862767, + "grad_norm": 0.9174512624740601, + "learning_rate": 3.120319652827393e-05, + "loss": 0.7889, + "step": 14210 + }, + { + "epoch": 1.1288177975351763, + "grad_norm": 0.5952799916267395, + "learning_rate": 3.1189965864888465e-05, + "loss": 0.79, + "step": 14220 + }, + { + "epoch": 1.129611621584076, + "grad_norm": 0.8574535250663757, + "learning_rate": 3.1176735201503004e-05, + "loss": 0.8347, + "step": 14230 + }, + { + "epoch": 1.1304054456329755, + "grad_norm": 0.882088303565979, + "learning_rate": 3.116350453811754e-05, + "loss": 0.8774, + "step": 14240 + }, + { + "epoch": 1.1311992696818751, + "grad_norm": 0.8594134449958801, + "learning_rate": 3.115027387473208e-05, + "loss": 0.7897, + "step": 14250 + }, + { + "epoch": 1.1319930937307745, + "grad_norm": 0.739986002445221, + "learning_rate": 3.1137043211346614e-05, + "loss": 0.8021, + "step": 14260 + }, + { + "epoch": 1.132786917779674, + "grad_norm": 0.8186190724372864, + "learning_rate": 3.112381254796116e-05, + "loss": 0.8212, + "step": 14270 + }, + { + "epoch": 1.1335807418285737, + "grad_norm": 0.5985064506530762, + "learning_rate": 3.11105818845757e-05, + "loss": 0.8579, + "step": 14280 + }, + { + "epoch": 1.1343745658774733, + "grad_norm": 0.7839154601097107, + "learning_rate": 3.109735122119023e-05, + "loss": 0.8212, + "step": 14290 + }, + { + "epoch": 1.1351683899263727, + "grad_norm": 0.8646635413169861, + "learning_rate": 3.108412055780477e-05, + "loss": 0.8486, + "step": 14300 + }, + { + "epoch": 1.1359622139752723, + "grad_norm": 0.6865241527557373, + "learning_rate": 3.107088989441931e-05, + "loss": 0.8799, + "step": 14310 + }, + { + "epoch": 1.136756038024172, + "grad_norm": 0.7246029376983643, + "learning_rate": 3.105765923103385e-05, + "loss": 0.8432, + "step": 14320 + }, + { + "epoch": 1.1375498620730715, + "grad_norm": 0.8030770421028137, + "learning_rate": 3.104442856764838e-05, + "loss": 0.7673, + "step": 14330 + }, + { + "epoch": 1.1383436861219711, + "grad_norm": 0.7397379279136658, + "learning_rate": 3.103119790426292e-05, + "loss": 0.8608, + "step": 14340 + }, + { + "epoch": 1.1391375101708707, + "grad_norm": 0.7867853045463562, + "learning_rate": 3.101796724087746e-05, + "loss": 0.8263, + "step": 14350 + }, + { + "epoch": 1.13993133421977, + "grad_norm": 0.6703535914421082, + "learning_rate": 3.1004736577491996e-05, + "loss": 0.8149, + "step": 14360 + }, + { + "epoch": 1.1407251582686697, + "grad_norm": 1.018248200416565, + "learning_rate": 3.0991505914106535e-05, + "loss": 0.8686, + "step": 14370 + }, + { + "epoch": 1.1415189823175693, + "grad_norm": 0.8166862726211548, + "learning_rate": 3.0978275250721074e-05, + "loss": 0.8326, + "step": 14380 + }, + { + "epoch": 1.142312806366469, + "grad_norm": 0.8093626499176025, + "learning_rate": 3.096504458733561e-05, + "loss": 0.7861, + "step": 14390 + }, + { + "epoch": 1.1431066304153685, + "grad_norm": 1.1167017221450806, + "learning_rate": 3.0951813923950145e-05, + "loss": 0.8208, + "step": 14400 + }, + { + "epoch": 1.143900454464268, + "grad_norm": 0.6292011141777039, + "learning_rate": 3.0938583260564684e-05, + "loss": 0.7954, + "step": 14410 + }, + { + "epoch": 1.1446942785131675, + "grad_norm": 0.7709428071975708, + "learning_rate": 3.092535259717922e-05, + "loss": 0.8027, + "step": 14420 + }, + { + "epoch": 1.1454881025620671, + "grad_norm": 0.8300706744194031, + "learning_rate": 3.091212193379376e-05, + "loss": 0.8297, + "step": 14430 + }, + { + "epoch": 1.1462819266109667, + "grad_norm": 0.7700504064559937, + "learning_rate": 3.08988912704083e-05, + "loss": 0.8367, + "step": 14440 + }, + { + "epoch": 1.1470757506598663, + "grad_norm": 1.0189820528030396, + "learning_rate": 3.088566060702284e-05, + "loss": 0.7969, + "step": 14450 + }, + { + "epoch": 1.1478695747087657, + "grad_norm": 0.7077481150627136, + "learning_rate": 3.087242994363738e-05, + "loss": 0.7619, + "step": 14460 + }, + { + "epoch": 1.1486633987576653, + "grad_norm": 0.8861839175224304, + "learning_rate": 3.085919928025192e-05, + "loss": 0.7641, + "step": 14470 + }, + { + "epoch": 1.149457222806565, + "grad_norm": 0.8999897837638855, + "learning_rate": 3.084596861686645e-05, + "loss": 0.7736, + "step": 14480 + }, + { + "epoch": 1.1502510468554645, + "grad_norm": 0.9881305694580078, + "learning_rate": 3.083273795348099e-05, + "loss": 0.8508, + "step": 14490 + }, + { + "epoch": 1.1510448709043641, + "grad_norm": 0.6451756358146667, + "learning_rate": 3.081950729009553e-05, + "loss": 0.7832, + "step": 14500 + }, + { + "epoch": 1.1518386949532635, + "grad_norm": 0.9463132619857788, + "learning_rate": 3.080627662671006e-05, + "loss": 0.8006, + "step": 14510 + }, + { + "epoch": 1.1526325190021631, + "grad_norm": 0.8050596117973328, + "learning_rate": 3.07930459633246e-05, + "loss": 0.8835, + "step": 14520 + }, + { + "epoch": 1.1534263430510627, + "grad_norm": 0.6816156506538391, + "learning_rate": 3.077981529993914e-05, + "loss": 0.8281, + "step": 14530 + }, + { + "epoch": 1.1542201670999623, + "grad_norm": 0.7849035263061523, + "learning_rate": 3.0766584636553683e-05, + "loss": 0.7944, + "step": 14540 + }, + { + "epoch": 1.155013991148862, + "grad_norm": 0.7930827140808105, + "learning_rate": 3.0753353973168216e-05, + "loss": 0.8804, + "step": 14550 + }, + { + "epoch": 1.1558078151977613, + "grad_norm": 0.8753893971443176, + "learning_rate": 3.0740123309782755e-05, + "loss": 0.8789, + "step": 14560 + }, + { + "epoch": 1.156601639246661, + "grad_norm": 0.974399983882904, + "learning_rate": 3.0726892646397293e-05, + "loss": 0.8217, + "step": 14570 + }, + { + "epoch": 1.1573954632955605, + "grad_norm": 0.7714707255363464, + "learning_rate": 3.071366198301183e-05, + "loss": 0.767, + "step": 14580 + }, + { + "epoch": 1.1581892873444601, + "grad_norm": 0.7159197926521301, + "learning_rate": 3.0700431319626365e-05, + "loss": 0.7676, + "step": 14590 + }, + { + "epoch": 1.1589831113933597, + "grad_norm": 0.6954260468482971, + "learning_rate": 3.0687200656240904e-05, + "loss": 0.8319, + "step": 14600 + }, + { + "epoch": 1.1597769354422591, + "grad_norm": 0.6127654910087585, + "learning_rate": 3.067396999285544e-05, + "loss": 0.8533, + "step": 14610 + }, + { + "epoch": 1.1605707594911587, + "grad_norm": 0.6652526259422302, + "learning_rate": 3.066073932946998e-05, + "loss": 0.8525, + "step": 14620 + }, + { + "epoch": 1.1613645835400583, + "grad_norm": 0.9769663214683533, + "learning_rate": 3.064750866608452e-05, + "loss": 0.8263, + "step": 14630 + }, + { + "epoch": 1.162158407588958, + "grad_norm": 0.8271260261535645, + "learning_rate": 3.063427800269906e-05, + "loss": 0.8327, + "step": 14640 + }, + { + "epoch": 1.1629522316378575, + "grad_norm": 0.7566442489624023, + "learning_rate": 3.06210473393136e-05, + "loss": 0.7574, + "step": 14650 + }, + { + "epoch": 1.1637460556867572, + "grad_norm": 0.7964077591896057, + "learning_rate": 3.060781667592813e-05, + "loss": 0.7874, + "step": 14660 + }, + { + "epoch": 1.1645398797356565, + "grad_norm": 0.7571009993553162, + "learning_rate": 3.059458601254267e-05, + "loss": 0.7855, + "step": 14670 + }, + { + "epoch": 1.1653337037845561, + "grad_norm": 0.8210294842720032, + "learning_rate": 3.058135534915721e-05, + "loss": 0.8045, + "step": 14680 + }, + { + "epoch": 1.1661275278334557, + "grad_norm": 0.6790111064910889, + "learning_rate": 3.056812468577175e-05, + "loss": 0.8583, + "step": 14690 + }, + { + "epoch": 1.1669213518823554, + "grad_norm": 0.8836639523506165, + "learning_rate": 3.055489402238628e-05, + "loss": 0.81, + "step": 14700 + }, + { + "epoch": 1.1677151759312547, + "grad_norm": 0.92630535364151, + "learning_rate": 3.0541663359000825e-05, + "loss": 0.8531, + "step": 14710 + }, + { + "epoch": 1.1685089999801543, + "grad_norm": 0.681800365447998, + "learning_rate": 3.0528432695615364e-05, + "loss": 0.817, + "step": 14720 + }, + { + "epoch": 1.169302824029054, + "grad_norm": 0.7862175107002258, + "learning_rate": 3.0515202032229896e-05, + "loss": 0.8026, + "step": 14730 + }, + { + "epoch": 1.1700966480779535, + "grad_norm": 0.7809861898422241, + "learning_rate": 3.0501971368844435e-05, + "loss": 0.8225, + "step": 14740 + }, + { + "epoch": 1.1708904721268532, + "grad_norm": 0.9233412146568298, + "learning_rate": 3.0488740705458974e-05, + "loss": 0.8066, + "step": 14750 + }, + { + "epoch": 1.1716842961757528, + "grad_norm": 0.8254870772361755, + "learning_rate": 3.0475510042073513e-05, + "loss": 0.8538, + "step": 14760 + }, + { + "epoch": 1.1724781202246521, + "grad_norm": 0.8267969489097595, + "learning_rate": 3.046227937868805e-05, + "loss": 0.8346, + "step": 14770 + }, + { + "epoch": 1.1732719442735517, + "grad_norm": 0.7673428058624268, + "learning_rate": 3.0449048715302587e-05, + "loss": 0.8336, + "step": 14780 + }, + { + "epoch": 1.1740657683224514, + "grad_norm": 0.8175097107887268, + "learning_rate": 3.0435818051917126e-05, + "loss": 0.8939, + "step": 14790 + }, + { + "epoch": 1.174859592371351, + "grad_norm": 0.8476851582527161, + "learning_rate": 3.0422587388531665e-05, + "loss": 0.828, + "step": 14800 + }, + { + "epoch": 1.1756534164202503, + "grad_norm": 0.8046305179595947, + "learning_rate": 3.0409356725146197e-05, + "loss": 0.8243, + "step": 14810 + }, + { + "epoch": 1.17644724046915, + "grad_norm": 0.8776468634605408, + "learning_rate": 3.039612606176074e-05, + "loss": 0.8292, + "step": 14820 + }, + { + "epoch": 1.1772410645180496, + "grad_norm": 0.7931563258171082, + "learning_rate": 3.038289539837528e-05, + "loss": 0.7834, + "step": 14830 + }, + { + "epoch": 1.1780348885669492, + "grad_norm": 1.0599154233932495, + "learning_rate": 3.036966473498981e-05, + "loss": 0.7835, + "step": 14840 + }, + { + "epoch": 1.1788287126158488, + "grad_norm": 0.902905285358429, + "learning_rate": 3.035643407160435e-05, + "loss": 0.8089, + "step": 14850 + }, + { + "epoch": 1.1796225366647484, + "grad_norm": 0.670547604560852, + "learning_rate": 3.034320340821889e-05, + "loss": 0.7964, + "step": 14860 + }, + { + "epoch": 1.1804163607136477, + "grad_norm": 0.7922139763832092, + "learning_rate": 3.032997274483343e-05, + "loss": 0.8179, + "step": 14870 + }, + { + "epoch": 1.1812101847625474, + "grad_norm": 0.8534002304077148, + "learning_rate": 3.0316742081447963e-05, + "loss": 0.8396, + "step": 14880 + }, + { + "epoch": 1.182004008811447, + "grad_norm": 0.8879272937774658, + "learning_rate": 3.0303511418062502e-05, + "loss": 0.7891, + "step": 14890 + }, + { + "epoch": 1.1827978328603466, + "grad_norm": 0.711462676525116, + "learning_rate": 3.029028075467704e-05, + "loss": 0.7261, + "step": 14900 + }, + { + "epoch": 1.183591656909246, + "grad_norm": 0.790237545967102, + "learning_rate": 3.027705009129158e-05, + "loss": 0.8176, + "step": 14910 + }, + { + "epoch": 1.1843854809581456, + "grad_norm": 0.7441157102584839, + "learning_rate": 3.0263819427906115e-05, + "loss": 0.8004, + "step": 14920 + }, + { + "epoch": 1.1851793050070452, + "grad_norm": 0.6485238075256348, + "learning_rate": 3.0250588764520654e-05, + "loss": 0.8286, + "step": 14930 + }, + { + "epoch": 1.1859731290559448, + "grad_norm": 0.5834975242614746, + "learning_rate": 3.0237358101135193e-05, + "loss": 0.8362, + "step": 14940 + }, + { + "epoch": 1.1867669531048444, + "grad_norm": 0.7251124382019043, + "learning_rate": 3.0224127437749732e-05, + "loss": 0.8968, + "step": 14950 + }, + { + "epoch": 1.187560777153744, + "grad_norm": 0.9666823148727417, + "learning_rate": 3.0210896774364268e-05, + "loss": 0.8247, + "step": 14960 + }, + { + "epoch": 1.1883546012026434, + "grad_norm": 0.839264988899231, + "learning_rate": 3.0197666110978807e-05, + "loss": 0.8347, + "step": 14970 + }, + { + "epoch": 1.189148425251543, + "grad_norm": 0.6984066367149353, + "learning_rate": 3.0184435447593345e-05, + "loss": 0.9087, + "step": 14980 + }, + { + "epoch": 1.1899422493004426, + "grad_norm": 0.6566882729530334, + "learning_rate": 3.017120478420788e-05, + "loss": 0.8933, + "step": 14990 + }, + { + "epoch": 1.1907360733493422, + "grad_norm": 0.880733847618103, + "learning_rate": 3.015797412082242e-05, + "loss": 0.8226, + "step": 15000 + }, + { + "epoch": 1.1915298973982416, + "grad_norm": 0.6445071697235107, + "learning_rate": 3.014474345743696e-05, + "loss": 0.8331, + "step": 15010 + }, + { + "epoch": 1.1923237214471412, + "grad_norm": 1.1646579504013062, + "learning_rate": 3.0131512794051498e-05, + "loss": 0.8809, + "step": 15020 + }, + { + "epoch": 1.1931175454960408, + "grad_norm": 0.7741886973381042, + "learning_rate": 3.011828213066603e-05, + "loss": 0.8101, + "step": 15030 + }, + { + "epoch": 1.1939113695449404, + "grad_norm": 0.760037899017334, + "learning_rate": 3.0105051467280572e-05, + "loss": 0.7767, + "step": 15040 + }, + { + "epoch": 1.19470519359384, + "grad_norm": 1.031218409538269, + "learning_rate": 3.009182080389511e-05, + "loss": 0.8009, + "step": 15050 + }, + { + "epoch": 1.1954990176427396, + "grad_norm": 0.6918702125549316, + "learning_rate": 3.007859014050965e-05, + "loss": 0.7885, + "step": 15060 + }, + { + "epoch": 1.196292841691639, + "grad_norm": 1.0237363576889038, + "learning_rate": 3.0065359477124182e-05, + "loss": 0.8503, + "step": 15070 + }, + { + "epoch": 1.1970866657405386, + "grad_norm": 1.026094913482666, + "learning_rate": 3.005212881373872e-05, + "loss": 0.7394, + "step": 15080 + }, + { + "epoch": 1.1978804897894382, + "grad_norm": 0.8072192072868347, + "learning_rate": 3.0038898150353263e-05, + "loss": 0.8343, + "step": 15090 + }, + { + "epoch": 1.1986743138383378, + "grad_norm": 0.8054350018501282, + "learning_rate": 3.0025667486967796e-05, + "loss": 0.8339, + "step": 15100 + }, + { + "epoch": 1.1994681378872374, + "grad_norm": 0.7186553478240967, + "learning_rate": 3.0012436823582335e-05, + "loss": 0.7915, + "step": 15110 + }, + { + "epoch": 1.2002619619361368, + "grad_norm": 0.9386051297187805, + "learning_rate": 2.9999206160196874e-05, + "loss": 0.8306, + "step": 15120 + }, + { + "epoch": 1.2010557859850364, + "grad_norm": 0.7570390105247498, + "learning_rate": 2.9985975496811412e-05, + "loss": 0.832, + "step": 15130 + }, + { + "epoch": 1.201849610033936, + "grad_norm": 0.8414233326911926, + "learning_rate": 2.9972744833425948e-05, + "loss": 0.8668, + "step": 15140 + }, + { + "epoch": 1.2026434340828356, + "grad_norm": 0.790988564491272, + "learning_rate": 2.9959514170040487e-05, + "loss": 0.802, + "step": 15150 + }, + { + "epoch": 1.2034372581317352, + "grad_norm": 1.0100650787353516, + "learning_rate": 2.9946283506655026e-05, + "loss": 0.8767, + "step": 15160 + }, + { + "epoch": 1.2042310821806346, + "grad_norm": 0.8488495349884033, + "learning_rate": 2.9933052843269565e-05, + "loss": 0.8674, + "step": 15170 + }, + { + "epoch": 1.2050249062295342, + "grad_norm": 0.8193151354789734, + "learning_rate": 2.99198221798841e-05, + "loss": 0.8027, + "step": 15180 + }, + { + "epoch": 1.2058187302784338, + "grad_norm": 0.7268685698509216, + "learning_rate": 2.990659151649864e-05, + "loss": 0.8172, + "step": 15190 + }, + { + "epoch": 1.2066125543273334, + "grad_norm": 0.8344203233718872, + "learning_rate": 2.9893360853113178e-05, + "loss": 0.8125, + "step": 15200 + }, + { + "epoch": 1.207406378376233, + "grad_norm": 0.8553740978240967, + "learning_rate": 2.988013018972771e-05, + "loss": 0.7997, + "step": 15210 + }, + { + "epoch": 1.2082002024251324, + "grad_norm": 0.7145830988883972, + "learning_rate": 2.9866899526342253e-05, + "loss": 0.7697, + "step": 15220 + }, + { + "epoch": 1.208994026474032, + "grad_norm": 0.7507617473602295, + "learning_rate": 2.985366886295679e-05, + "loss": 0.8852, + "step": 15230 + }, + { + "epoch": 1.2097878505229316, + "grad_norm": 0.9048145413398743, + "learning_rate": 2.984043819957133e-05, + "loss": 0.8967, + "step": 15240 + }, + { + "epoch": 1.2105816745718312, + "grad_norm": 0.7048582434654236, + "learning_rate": 2.9827207536185863e-05, + "loss": 0.8613, + "step": 15250 + }, + { + "epoch": 1.2113754986207308, + "grad_norm": 0.6709416508674622, + "learning_rate": 2.98139768728004e-05, + "loss": 0.8725, + "step": 15260 + }, + { + "epoch": 1.2121693226696304, + "grad_norm": 0.7239158749580383, + "learning_rate": 2.9800746209414944e-05, + "loss": 0.8236, + "step": 15270 + }, + { + "epoch": 1.2129631467185298, + "grad_norm": 0.7127059102058411, + "learning_rate": 2.9787515546029483e-05, + "loss": 0.8302, + "step": 15280 + }, + { + "epoch": 1.2137569707674294, + "grad_norm": 0.8340943455696106, + "learning_rate": 2.9774284882644015e-05, + "loss": 0.7475, + "step": 15290 + }, + { + "epoch": 1.214550794816329, + "grad_norm": 0.9518420696258545, + "learning_rate": 2.9761054219258554e-05, + "loss": 0.8673, + "step": 15300 + }, + { + "epoch": 1.2153446188652286, + "grad_norm": 0.6714077591896057, + "learning_rate": 2.9747823555873093e-05, + "loss": 0.7967, + "step": 15310 + }, + { + "epoch": 1.216138442914128, + "grad_norm": 0.9402453303337097, + "learning_rate": 2.973459289248763e-05, + "loss": 0.8375, + "step": 15320 + }, + { + "epoch": 1.2169322669630276, + "grad_norm": 0.8551952242851257, + "learning_rate": 2.9721362229102167e-05, + "loss": 0.8432, + "step": 15330 + }, + { + "epoch": 1.2177260910119272, + "grad_norm": 0.9510105848312378, + "learning_rate": 2.9708131565716706e-05, + "loss": 0.8647, + "step": 15340 + }, + { + "epoch": 1.2185199150608268, + "grad_norm": 0.6727361679077148, + "learning_rate": 2.9694900902331245e-05, + "loss": 0.8573, + "step": 15350 + }, + { + "epoch": 1.2193137391097264, + "grad_norm": 0.8175778388977051, + "learning_rate": 2.968167023894578e-05, + "loss": 0.793, + "step": 15360 + }, + { + "epoch": 1.220107563158626, + "grad_norm": 0.771083652973175, + "learning_rate": 2.966843957556032e-05, + "loss": 0.8401, + "step": 15370 + }, + { + "epoch": 1.2209013872075254, + "grad_norm": 0.8669653534889221, + "learning_rate": 2.965520891217486e-05, + "loss": 0.7758, + "step": 15380 + }, + { + "epoch": 1.221695211256425, + "grad_norm": 0.7607120871543884, + "learning_rate": 2.9641978248789397e-05, + "loss": 0.8172, + "step": 15390 + }, + { + "epoch": 1.2224890353053246, + "grad_norm": 0.9306351542472839, + "learning_rate": 2.9628747585403933e-05, + "loss": 0.8082, + "step": 15400 + }, + { + "epoch": 1.2232828593542242, + "grad_norm": 0.8709003329277039, + "learning_rate": 2.9615516922018472e-05, + "loss": 0.7716, + "step": 15410 + }, + { + "epoch": 1.2240766834031236, + "grad_norm": 0.9127326011657715, + "learning_rate": 2.960228625863301e-05, + "loss": 0.8231, + "step": 15420 + }, + { + "epoch": 1.2248705074520232, + "grad_norm": 0.7594894766807556, + "learning_rate": 2.9589055595247543e-05, + "loss": 0.7788, + "step": 15430 + }, + { + "epoch": 1.2256643315009228, + "grad_norm": 0.7096917033195496, + "learning_rate": 2.9575824931862085e-05, + "loss": 0.8266, + "step": 15440 + }, + { + "epoch": 1.2264581555498224, + "grad_norm": 0.8250033855438232, + "learning_rate": 2.9562594268476624e-05, + "loss": 0.8823, + "step": 15450 + }, + { + "epoch": 1.227251979598722, + "grad_norm": 0.8161039352416992, + "learning_rate": 2.9550686671429705e-05, + "loss": 0.7683, + "step": 15460 + }, + { + "epoch": 1.2280458036476216, + "grad_norm": 0.9151235222816467, + "learning_rate": 2.9537456008044244e-05, + "loss": 0.7988, + "step": 15470 + }, + { + "epoch": 1.228839627696521, + "grad_norm": 0.696901798248291, + "learning_rate": 2.9524225344658783e-05, + "loss": 0.8231, + "step": 15480 + }, + { + "epoch": 1.2296334517454206, + "grad_norm": Infinity, + "learning_rate": 2.9512317747611867e-05, + "loss": 0.8496, + "step": 15490 + }, + { + "epoch": 1.2304272757943202, + "grad_norm": 0.7194181084632874, + "learning_rate": 2.9499087084226406e-05, + "loss": 0.8621, + "step": 15500 + }, + { + "epoch": 1.2312210998432198, + "grad_norm": 0.7951611280441284, + "learning_rate": 2.9485856420840942e-05, + "loss": 0.7799, + "step": 15510 + }, + { + "epoch": 1.2320149238921192, + "grad_norm": 0.8867210149765015, + "learning_rate": 2.947262575745548e-05, + "loss": 0.8452, + "step": 15520 + }, + { + "epoch": 1.2328087479410188, + "grad_norm": 0.842461109161377, + "learning_rate": 2.945939509407002e-05, + "loss": 0.8028, + "step": 15530 + }, + { + "epoch": 1.2336025719899184, + "grad_norm": 0.7903575897216797, + "learning_rate": 2.944616443068456e-05, + "loss": 0.8713, + "step": 15540 + }, + { + "epoch": 1.234396396038818, + "grad_norm": 0.7790343165397644, + "learning_rate": 2.943293376729909e-05, + "loss": 0.7852, + "step": 15550 + }, + { + "epoch": 1.2351902200877176, + "grad_norm": 0.8206233978271484, + "learning_rate": 2.9419703103913633e-05, + "loss": 0.7899, + "step": 15560 + }, + { + "epoch": 1.2359840441366172, + "grad_norm": 0.8968833684921265, + "learning_rate": 2.9406472440528172e-05, + "loss": 0.8477, + "step": 15570 + }, + { + "epoch": 1.2367778681855166, + "grad_norm": 0.9603748917579651, + "learning_rate": 2.9393241777142704e-05, + "loss": 0.79, + "step": 15580 + }, + { + "epoch": 1.2375716922344162, + "grad_norm": 0.7768341898918152, + "learning_rate": 2.9380011113757243e-05, + "loss": 0.7812, + "step": 15590 + }, + { + "epoch": 1.2383655162833158, + "grad_norm": 0.7468785047531128, + "learning_rate": 2.9366780450371782e-05, + "loss": 0.8008, + "step": 15600 + }, + { + "epoch": 1.2391593403322154, + "grad_norm": 1.0277005434036255, + "learning_rate": 2.9353549786986324e-05, + "loss": 0.8168, + "step": 15610 + }, + { + "epoch": 1.2399531643811148, + "grad_norm": 0.7243433594703674, + "learning_rate": 2.9340319123600857e-05, + "loss": 0.8091, + "step": 15620 + }, + { + "epoch": 1.2407469884300144, + "grad_norm": 0.8666514158248901, + "learning_rate": 2.9327088460215395e-05, + "loss": 0.8132, + "step": 15630 + }, + { + "epoch": 1.241540812478914, + "grad_norm": 0.80784010887146, + "learning_rate": 2.9313857796829934e-05, + "loss": 0.8182, + "step": 15640 + }, + { + "epoch": 1.2423346365278136, + "grad_norm": 0.743398129940033, + "learning_rate": 2.9300627133444473e-05, + "loss": 0.8264, + "step": 15650 + }, + { + "epoch": 1.2431284605767132, + "grad_norm": 0.6970568895339966, + "learning_rate": 2.928739647005901e-05, + "loss": 0.8156, + "step": 15660 + }, + { + "epoch": 1.2439222846256128, + "grad_norm": 0.640468955039978, + "learning_rate": 2.9274165806673548e-05, + "loss": 0.8191, + "step": 15670 + }, + { + "epoch": 1.2447161086745122, + "grad_norm": 0.9808381795883179, + "learning_rate": 2.9260935143288087e-05, + "loss": 0.8104, + "step": 15680 + }, + { + "epoch": 1.2455099327234118, + "grad_norm": 0.7460395693778992, + "learning_rate": 2.9247704479902622e-05, + "loss": 0.8131, + "step": 15690 + }, + { + "epoch": 1.2463037567723114, + "grad_norm": 0.8486602306365967, + "learning_rate": 2.923447381651716e-05, + "loss": 0.7889, + "step": 15700 + }, + { + "epoch": 1.247097580821211, + "grad_norm": 0.9149585962295532, + "learning_rate": 2.92212431531317e-05, + "loss": 0.782, + "step": 15710 + }, + { + "epoch": 1.2478914048701106, + "grad_norm": 0.7191526889801025, + "learning_rate": 2.920801248974624e-05, + "loss": 0.8407, + "step": 15720 + }, + { + "epoch": 1.24868522891901, + "grad_norm": 0.7577335834503174, + "learning_rate": 2.9194781826360775e-05, + "loss": 0.8428, + "step": 15730 + }, + { + "epoch": 1.2494790529679096, + "grad_norm": 0.8915782570838928, + "learning_rate": 2.9181551162975313e-05, + "loss": 0.7993, + "step": 15740 + }, + { + "epoch": 1.2502728770168092, + "grad_norm": 0.9318637847900391, + "learning_rate": 2.9168320499589852e-05, + "loss": 0.7586, + "step": 15750 + }, + { + "epoch": 1.2510667010657088, + "grad_norm": 0.7829806208610535, + "learning_rate": 2.915508983620439e-05, + "loss": 0.8027, + "step": 15760 + }, + { + "epoch": 1.2518605251146084, + "grad_norm": 0.7112168073654175, + "learning_rate": 2.9141859172818923e-05, + "loss": 0.8171, + "step": 15770 + }, + { + "epoch": 1.252654349163508, + "grad_norm": 0.935157060623169, + "learning_rate": 2.9128628509433466e-05, + "loss": 0.7728, + "step": 15780 + }, + { + "epoch": 1.2534481732124074, + "grad_norm": 0.8060632348060608, + "learning_rate": 2.9115397846048005e-05, + "loss": 0.835, + "step": 15790 + }, + { + "epoch": 1.254241997261307, + "grad_norm": 0.7491572499275208, + "learning_rate": 2.9102167182662537e-05, + "loss": 0.7851, + "step": 15800 + }, + { + "epoch": 1.2550358213102066, + "grad_norm": 0.844757080078125, + "learning_rate": 2.9088936519277076e-05, + "loss": 0.7442, + "step": 15810 + }, + { + "epoch": 1.255829645359106, + "grad_norm": 0.5904254913330078, + "learning_rate": 2.9075705855891615e-05, + "loss": 0.77, + "step": 15820 + }, + { + "epoch": 1.2566234694080056, + "grad_norm": 0.7826739549636841, + "learning_rate": 2.9062475192506157e-05, + "loss": 0.8496, + "step": 15830 + }, + { + "epoch": 1.2574172934569052, + "grad_norm": 0.7734588980674744, + "learning_rate": 2.904924452912069e-05, + "loss": 0.8367, + "step": 15840 + }, + { + "epoch": 1.2582111175058048, + "grad_norm": 0.712218701839447, + "learning_rate": 2.9036013865735228e-05, + "loss": 0.7709, + "step": 15850 + }, + { + "epoch": 1.2590049415547044, + "grad_norm": 0.7345622181892395, + "learning_rate": 2.9022783202349767e-05, + "loss": 0.8509, + "step": 15860 + }, + { + "epoch": 1.259798765603604, + "grad_norm": 0.9125804901123047, + "learning_rate": 2.9009552538964306e-05, + "loss": 0.8151, + "step": 15870 + }, + { + "epoch": 1.2605925896525036, + "grad_norm": 0.7302107214927673, + "learning_rate": 2.899632187557884e-05, + "loss": 0.8181, + "step": 15880 + }, + { + "epoch": 1.261386413701403, + "grad_norm": 0.921686589717865, + "learning_rate": 2.898309121219338e-05, + "loss": 0.8133, + "step": 15890 + }, + { + "epoch": 1.2621802377503026, + "grad_norm": 0.9053015112876892, + "learning_rate": 2.896986054880792e-05, + "loss": 0.822, + "step": 15900 + }, + { + "epoch": 1.2629740617992022, + "grad_norm": 0.8016523718833923, + "learning_rate": 2.8956629885422455e-05, + "loss": 0.8147, + "step": 15910 + }, + { + "epoch": 1.2637678858481018, + "grad_norm": 0.9724611043930054, + "learning_rate": 2.8943399222036994e-05, + "loss": 0.8608, + "step": 15920 + }, + { + "epoch": 1.2645617098970012, + "grad_norm": 0.9794032573699951, + "learning_rate": 2.8930168558651533e-05, + "loss": 0.7779, + "step": 15930 + }, + { + "epoch": 1.2653555339459008, + "grad_norm": 0.6316156387329102, + "learning_rate": 2.891693789526607e-05, + "loss": 0.8147, + "step": 15940 + }, + { + "epoch": 1.2661493579948004, + "grad_norm": 0.835666298866272, + "learning_rate": 2.8903707231880607e-05, + "loss": 0.8002, + "step": 15950 + }, + { + "epoch": 1.2669431820437, + "grad_norm": 0.6945637464523315, + "learning_rate": 2.8890476568495146e-05, + "loss": 0.9178, + "step": 15960 + }, + { + "epoch": 1.2677370060925996, + "grad_norm": 0.6459661722183228, + "learning_rate": 2.8877245905109685e-05, + "loss": 0.8289, + "step": 15970 + }, + { + "epoch": 1.2685308301414993, + "grad_norm": 0.8627393841743469, + "learning_rate": 2.8864015241724224e-05, + "loss": 0.8809, + "step": 15980 + }, + { + "epoch": 1.2693246541903986, + "grad_norm": 0.8296471834182739, + "learning_rate": 2.8850784578338756e-05, + "loss": 0.8025, + "step": 15990 + }, + { + "epoch": 1.2701184782392982, + "grad_norm": 0.8422446846961975, + "learning_rate": 2.88375539149533e-05, + "loss": 0.8378, + "step": 16000 + }, + { + "epoch": 1.2709123022881978, + "grad_norm": 0.8606308102607727, + "learning_rate": 2.8824323251567837e-05, + "loss": 0.8543, + "step": 16010 + }, + { + "epoch": 1.2717061263370975, + "grad_norm": 0.9312155842781067, + "learning_rate": 2.881109258818237e-05, + "loss": 0.8339, + "step": 16020 + }, + { + "epoch": 1.2724999503859968, + "grad_norm": 0.782143771648407, + "learning_rate": 2.879786192479691e-05, + "loss": 0.7537, + "step": 16030 + }, + { + "epoch": 1.2732937744348964, + "grad_norm": 0.7882851958274841, + "learning_rate": 2.8784631261411447e-05, + "loss": 0.8494, + "step": 16040 + }, + { + "epoch": 1.274087598483796, + "grad_norm": 0.8223291039466858, + "learning_rate": 2.877140059802599e-05, + "loss": 0.788, + "step": 16050 + }, + { + "epoch": 1.2748814225326957, + "grad_norm": 0.7009278535842896, + "learning_rate": 2.8758169934640522e-05, + "loss": 0.8456, + "step": 16060 + }, + { + "epoch": 1.2756752465815953, + "grad_norm": 0.7133784294128418, + "learning_rate": 2.874493927125506e-05, + "loss": 0.7725, + "step": 16070 + }, + { + "epoch": 1.2764690706304949, + "grad_norm": 0.7193998098373413, + "learning_rate": 2.87317086078696e-05, + "loss": 0.8465, + "step": 16080 + }, + { + "epoch": 1.2772628946793942, + "grad_norm": 0.6710327863693237, + "learning_rate": 2.871847794448414e-05, + "loss": 0.8016, + "step": 16090 + }, + { + "epoch": 1.2780567187282939, + "grad_norm": 0.8942824006080627, + "learning_rate": 2.8705247281098674e-05, + "loss": 0.8741, + "step": 16100 + }, + { + "epoch": 1.2788505427771935, + "grad_norm": 0.891715407371521, + "learning_rate": 2.8692016617713213e-05, + "loss": 0.792, + "step": 16110 + }, + { + "epoch": 1.279644366826093, + "grad_norm": 0.7947127819061279, + "learning_rate": 2.8678785954327752e-05, + "loss": 0.8215, + "step": 16120 + }, + { + "epoch": 1.2804381908749924, + "grad_norm": 0.7644555568695068, + "learning_rate": 2.866555529094229e-05, + "loss": 0.7671, + "step": 16130 + }, + { + "epoch": 1.281232014923892, + "grad_norm": 0.9834242463111877, + "learning_rate": 2.8652324627556827e-05, + "loss": 0.7838, + "step": 16140 + }, + { + "epoch": 1.2820258389727917, + "grad_norm": 0.851807177066803, + "learning_rate": 2.8639093964171365e-05, + "loss": 0.9058, + "step": 16150 + }, + { + "epoch": 1.2828196630216913, + "grad_norm": 0.7503984570503235, + "learning_rate": 2.8625863300785904e-05, + "loss": 0.783, + "step": 16160 + }, + { + "epoch": 1.2836134870705909, + "grad_norm": 0.7916014790534973, + "learning_rate": 2.861263263740044e-05, + "loss": 0.888, + "step": 16170 + }, + { + "epoch": 1.2844073111194905, + "grad_norm": 0.8114141225814819, + "learning_rate": 2.859940197401498e-05, + "loss": 0.8192, + "step": 16180 + }, + { + "epoch": 1.2852011351683899, + "grad_norm": 0.7763829827308655, + "learning_rate": 2.8586171310629518e-05, + "loss": 0.8159, + "step": 16190 + }, + { + "epoch": 1.2859949592172895, + "grad_norm": 0.7763867378234863, + "learning_rate": 2.8572940647244057e-05, + "loss": 0.83, + "step": 16200 + }, + { + "epoch": 1.286788783266189, + "grad_norm": 0.969960629940033, + "learning_rate": 2.855970998385859e-05, + "loss": 0.8671, + "step": 16210 + }, + { + "epoch": 1.2875826073150887, + "grad_norm": 0.7875334620475769, + "learning_rate": 2.854647932047313e-05, + "loss": 0.7545, + "step": 16220 + }, + { + "epoch": 1.288376431363988, + "grad_norm": 0.9192537665367126, + "learning_rate": 2.853324865708767e-05, + "loss": 0.8174, + "step": 16230 + }, + { + "epoch": 1.2891702554128877, + "grad_norm": 0.8982329368591309, + "learning_rate": 2.852001799370221e-05, + "loss": 0.8373, + "step": 16240 + }, + { + "epoch": 1.2899640794617873, + "grad_norm": 0.7992169857025146, + "learning_rate": 2.850678733031674e-05, + "loss": 0.8858, + "step": 16250 + }, + { + "epoch": 1.2907579035106869, + "grad_norm": 0.7234670519828796, + "learning_rate": 2.849355666693128e-05, + "loss": 0.8548, + "step": 16260 + }, + { + "epoch": 1.2915517275595865, + "grad_norm": 0.803986132144928, + "learning_rate": 2.8480326003545822e-05, + "loss": 0.843, + "step": 16270 + }, + { + "epoch": 1.292345551608486, + "grad_norm": 0.6676992177963257, + "learning_rate": 2.8467095340160355e-05, + "loss": 0.8432, + "step": 16280 + }, + { + "epoch": 1.2931393756573855, + "grad_norm": 0.7712359428405762, + "learning_rate": 2.8453864676774893e-05, + "loss": 0.7889, + "step": 16290 + }, + { + "epoch": 1.293933199706285, + "grad_norm": 0.9185086488723755, + "learning_rate": 2.8440634013389432e-05, + "loss": 0.8816, + "step": 16300 + }, + { + "epoch": 1.2947270237551847, + "grad_norm": 0.7256115078926086, + "learning_rate": 2.842740335000397e-05, + "loss": 0.8636, + "step": 16310 + }, + { + "epoch": 1.2955208478040843, + "grad_norm": 0.9436436295509338, + "learning_rate": 2.8414172686618507e-05, + "loss": 0.8138, + "step": 16320 + }, + { + "epoch": 1.2963146718529837, + "grad_norm": 0.8887737393379211, + "learning_rate": 2.8400942023233046e-05, + "loss": 0.7817, + "step": 16330 + }, + { + "epoch": 1.2971084959018833, + "grad_norm": 0.8305734395980835, + "learning_rate": 2.8387711359847585e-05, + "loss": 0.7737, + "step": 16340 + }, + { + "epoch": 1.2979023199507829, + "grad_norm": 0.8457329273223877, + "learning_rate": 2.8374480696462124e-05, + "loss": 0.7743, + "step": 16350 + }, + { + "epoch": 1.2986961439996825, + "grad_norm": 0.7918816208839417, + "learning_rate": 2.836125003307666e-05, + "loss": 0.8273, + "step": 16360 + }, + { + "epoch": 1.299489968048582, + "grad_norm": 0.6060484647750854, + "learning_rate": 2.8348019369691198e-05, + "loss": 0.7823, + "step": 16370 + }, + { + "epoch": 1.3002837920974817, + "grad_norm": 0.9133262038230896, + "learning_rate": 2.8334788706305737e-05, + "loss": 0.7733, + "step": 16380 + }, + { + "epoch": 1.3010776161463813, + "grad_norm": 0.8485836386680603, + "learning_rate": 2.8321558042920273e-05, + "loss": 0.7581, + "step": 16390 + }, + { + "epoch": 1.3018714401952807, + "grad_norm": 0.7613677978515625, + "learning_rate": 2.830832737953481e-05, + "loss": 0.8436, + "step": 16400 + }, + { + "epoch": 1.3026652642441803, + "grad_norm": 0.8914282917976379, + "learning_rate": 2.829509671614935e-05, + "loss": 0.8283, + "step": 16410 + }, + { + "epoch": 1.3034590882930799, + "grad_norm": 0.8663108944892883, + "learning_rate": 2.828186605276389e-05, + "loss": 0.7854, + "step": 16420 + }, + { + "epoch": 1.3042529123419793, + "grad_norm": 0.8851757049560547, + "learning_rate": 2.826863538937842e-05, + "loss": 0.824, + "step": 16430 + }, + { + "epoch": 1.3050467363908789, + "grad_norm": 0.8618358969688416, + "learning_rate": 2.8255404725992964e-05, + "loss": 0.8277, + "step": 16440 + }, + { + "epoch": 1.3058405604397785, + "grad_norm": 0.8172771334648132, + "learning_rate": 2.8242174062607503e-05, + "loss": 0.8929, + "step": 16450 + }, + { + "epoch": 1.306634384488678, + "grad_norm": 0.898499608039856, + "learning_rate": 2.822894339922204e-05, + "loss": 0.8683, + "step": 16460 + }, + { + "epoch": 1.3074282085375777, + "grad_norm": 0.7734599113464355, + "learning_rate": 2.8215712735836574e-05, + "loss": 0.8588, + "step": 16470 + }, + { + "epoch": 1.3082220325864773, + "grad_norm": 0.8357298374176025, + "learning_rate": 2.8202482072451113e-05, + "loss": 0.7965, + "step": 16480 + }, + { + "epoch": 1.309015856635377, + "grad_norm": 0.8872872591018677, + "learning_rate": 2.8189251409065655e-05, + "loss": 0.7839, + "step": 16490 + }, + { + "epoch": 1.3098096806842763, + "grad_norm": 0.9397867321968079, + "learning_rate": 2.8176020745680187e-05, + "loss": 0.8508, + "step": 16500 + }, + { + "epoch": 1.3106035047331759, + "grad_norm": 0.790989339351654, + "learning_rate": 2.8162790082294726e-05, + "loss": 0.8086, + "step": 16510 + }, + { + "epoch": 1.3113973287820755, + "grad_norm": 0.747873842716217, + "learning_rate": 2.8149559418909265e-05, + "loss": 0.8536, + "step": 16520 + }, + { + "epoch": 1.312191152830975, + "grad_norm": 0.8547879457473755, + "learning_rate": 2.8136328755523804e-05, + "loss": 0.7932, + "step": 16530 + }, + { + "epoch": 1.3129849768798745, + "grad_norm": 0.8623278141021729, + "learning_rate": 2.812309809213834e-05, + "loss": 0.765, + "step": 16540 + }, + { + "epoch": 1.313778800928774, + "grad_norm": 0.7240385413169861, + "learning_rate": 2.810986742875288e-05, + "loss": 0.8292, + "step": 16550 + }, + { + "epoch": 1.3145726249776737, + "grad_norm": 0.9323850274085999, + "learning_rate": 2.8096636765367417e-05, + "loss": 0.76, + "step": 16560 + }, + { + "epoch": 1.3153664490265733, + "grad_norm": 0.695256769657135, + "learning_rate": 2.8083406101981956e-05, + "loss": 0.8158, + "step": 16570 + }, + { + "epoch": 1.316160273075473, + "grad_norm": 0.6848450899124146, + "learning_rate": 2.8070175438596492e-05, + "loss": 0.7873, + "step": 16580 + }, + { + "epoch": 1.3169540971243725, + "grad_norm": 0.7952008247375488, + "learning_rate": 2.805694477521103e-05, + "loss": 0.8798, + "step": 16590 + }, + { + "epoch": 1.3177479211732719, + "grad_norm": 0.8499898314476013, + "learning_rate": 2.804371411182557e-05, + "loss": 0.7545, + "step": 16600 + }, + { + "epoch": 1.3185417452221715, + "grad_norm": 0.8064092993736267, + "learning_rate": 2.8030483448440102e-05, + "loss": 0.7962, + "step": 16610 + }, + { + "epoch": 1.319335569271071, + "grad_norm": 0.9621743559837341, + "learning_rate": 2.8017252785054644e-05, + "loss": 0.853, + "step": 16620 + }, + { + "epoch": 1.3201293933199707, + "grad_norm": 0.7509051561355591, + "learning_rate": 2.8004022121669183e-05, + "loss": 0.8417, + "step": 16630 + }, + { + "epoch": 1.32092321736887, + "grad_norm": 0.7417434453964233, + "learning_rate": 2.7990791458283722e-05, + "loss": 0.8704, + "step": 16640 + }, + { + "epoch": 1.3217170414177697, + "grad_norm": 0.8676928877830505, + "learning_rate": 2.7977560794898254e-05, + "loss": 0.789, + "step": 16650 + }, + { + "epoch": 1.3225108654666693, + "grad_norm": 0.907887876033783, + "learning_rate": 2.7964330131512793e-05, + "loss": 0.8448, + "step": 16660 + }, + { + "epoch": 1.323304689515569, + "grad_norm": 0.7819181084632874, + "learning_rate": 2.7951099468127335e-05, + "loss": 0.8467, + "step": 16670 + }, + { + "epoch": 1.3240985135644685, + "grad_norm": 0.7135961651802063, + "learning_rate": 2.7937868804741874e-05, + "loss": 0.8322, + "step": 16680 + }, + { + "epoch": 1.324892337613368, + "grad_norm": 0.7459518313407898, + "learning_rate": 2.7924638141356407e-05, + "loss": 0.8954, + "step": 16690 + }, + { + "epoch": 1.3256861616622675, + "grad_norm": 0.8480586409568787, + "learning_rate": 2.7911407477970945e-05, + "loss": 0.8421, + "step": 16700 + }, + { + "epoch": 1.326479985711167, + "grad_norm": 0.7946953773498535, + "learning_rate": 2.7898176814585484e-05, + "loss": 0.7894, + "step": 16710 + }, + { + "epoch": 1.3272738097600667, + "grad_norm": 0.9292244911193848, + "learning_rate": 2.788494615120002e-05, + "loss": 0.8569, + "step": 16720 + }, + { + "epoch": 1.3280676338089663, + "grad_norm": 0.8674454689025879, + "learning_rate": 2.787171548781456e-05, + "loss": 0.8093, + "step": 16730 + }, + { + "epoch": 1.3288614578578657, + "grad_norm": 0.9656598567962646, + "learning_rate": 2.7858484824429098e-05, + "loss": 0.7962, + "step": 16740 + }, + { + "epoch": 1.3296552819067653, + "grad_norm": 0.7918367385864258, + "learning_rate": 2.7845254161043637e-05, + "loss": 0.8784, + "step": 16750 + }, + { + "epoch": 1.330449105955665, + "grad_norm": 0.8442765474319458, + "learning_rate": 2.7832023497658172e-05, + "loss": 0.8143, + "step": 16760 + }, + { + "epoch": 1.3312429300045645, + "grad_norm": 0.8403950333595276, + "learning_rate": 2.781879283427271e-05, + "loss": 0.8452, + "step": 16770 + }, + { + "epoch": 1.3320367540534641, + "grad_norm": 0.8802147507667542, + "learning_rate": 2.780556217088725e-05, + "loss": 0.8828, + "step": 16780 + }, + { + "epoch": 1.3328305781023637, + "grad_norm": 0.8321970105171204, + "learning_rate": 2.779233150750179e-05, + "loss": 0.8389, + "step": 16790 + }, + { + "epoch": 1.333624402151263, + "grad_norm": 0.814605176448822, + "learning_rate": 2.7779100844116325e-05, + "loss": 0.8263, + "step": 16800 + }, + { + "epoch": 1.3344182262001627, + "grad_norm": 0.8342418670654297, + "learning_rate": 2.7765870180730863e-05, + "loss": 0.7823, + "step": 16810 + }, + { + "epoch": 1.3352120502490623, + "grad_norm": 0.8523891568183899, + "learning_rate": 2.7752639517345402e-05, + "loss": 0.8527, + "step": 16820 + }, + { + "epoch": 1.336005874297962, + "grad_norm": 0.9738366603851318, + "learning_rate": 2.7739408853959935e-05, + "loss": 0.8452, + "step": 16830 + }, + { + "epoch": 1.3367996983468613, + "grad_norm": 0.6618583798408508, + "learning_rate": 2.7726178190574477e-05, + "loss": 0.9022, + "step": 16840 + }, + { + "epoch": 1.337593522395761, + "grad_norm": 0.6916811466217041, + "learning_rate": 2.7712947527189016e-05, + "loss": 0.8484, + "step": 16850 + }, + { + "epoch": 1.3383873464446605, + "grad_norm": 0.8240322470664978, + "learning_rate": 2.7699716863803555e-05, + "loss": 0.7851, + "step": 16860 + }, + { + "epoch": 1.3391811704935601, + "grad_norm": 0.8770764470100403, + "learning_rate": 2.7686486200418087e-05, + "loss": 0.8042, + "step": 16870 + }, + { + "epoch": 1.3399749945424597, + "grad_norm": 0.8255194425582886, + "learning_rate": 2.7673255537032626e-05, + "loss": 0.8021, + "step": 16880 + }, + { + "epoch": 1.3407688185913593, + "grad_norm": 0.822505533695221, + "learning_rate": 2.7660024873647168e-05, + "loss": 0.7602, + "step": 16890 + }, + { + "epoch": 1.3415626426402587, + "grad_norm": 1.0661057233810425, + "learning_rate": 2.7646794210261707e-05, + "loss": 0.8531, + "step": 16900 + }, + { + "epoch": 1.3423564666891583, + "grad_norm": 0.8882428407669067, + "learning_rate": 2.763356354687624e-05, + "loss": 0.8681, + "step": 16910 + }, + { + "epoch": 1.343150290738058, + "grad_norm": 0.8333000540733337, + "learning_rate": 2.7620332883490778e-05, + "loss": 0.8016, + "step": 16920 + }, + { + "epoch": 1.3439441147869575, + "grad_norm": 0.7466040849685669, + "learning_rate": 2.7607102220105317e-05, + "loss": 0.8253, + "step": 16930 + }, + { + "epoch": 1.344737938835857, + "grad_norm": 0.9921952486038208, + "learning_rate": 2.759387155671986e-05, + "loss": 0.8058, + "step": 16940 + }, + { + "epoch": 1.3455317628847565, + "grad_norm": 0.7671175599098206, + "learning_rate": 2.758064089333439e-05, + "loss": 0.8876, + "step": 16950 + }, + { + "epoch": 1.3463255869336561, + "grad_norm": 0.5953800678253174, + "learning_rate": 2.756741022994893e-05, + "loss": 0.8096, + "step": 16960 + }, + { + "epoch": 1.3471194109825557, + "grad_norm": 0.8293882012367249, + "learning_rate": 2.755417956656347e-05, + "loss": 0.7875, + "step": 16970 + }, + { + "epoch": 1.3479132350314553, + "grad_norm": 0.7418809533119202, + "learning_rate": 2.7540948903178005e-05, + "loss": 0.7676, + "step": 16980 + }, + { + "epoch": 1.348707059080355, + "grad_norm": 0.8036984205245972, + "learning_rate": 2.7527718239792544e-05, + "loss": 0.8258, + "step": 16990 + }, + { + "epoch": 1.3495008831292545, + "grad_norm": 0.9726935625076294, + "learning_rate": 2.7514487576407083e-05, + "loss": 0.8252, + "step": 17000 + }, + { + "epoch": 1.350294707178154, + "grad_norm": 0.9548497796058655, + "learning_rate": 2.750125691302162e-05, + "loss": 0.7797, + "step": 17010 + }, + { + "epoch": 1.3510885312270535, + "grad_norm": 0.84930819272995, + "learning_rate": 2.7488026249636157e-05, + "loss": 0.8566, + "step": 17020 + }, + { + "epoch": 1.3518823552759531, + "grad_norm": 0.8506318926811218, + "learning_rate": 2.7474795586250696e-05, + "loss": 0.8499, + "step": 17030 + }, + { + "epoch": 1.3526761793248525, + "grad_norm": 0.7205038070678711, + "learning_rate": 2.7461564922865235e-05, + "loss": 0.8265, + "step": 17040 + }, + { + "epoch": 1.3534700033737521, + "grad_norm": 0.8020601868629456, + "learning_rate": 2.7448334259479774e-05, + "loss": 0.7768, + "step": 17050 + }, + { + "epoch": 1.3542638274226517, + "grad_norm": 1.0212651491165161, + "learning_rate": 2.743510359609431e-05, + "loss": 0.8562, + "step": 17060 + }, + { + "epoch": 1.3550576514715513, + "grad_norm": 0.9957228302955627, + "learning_rate": 2.742187293270885e-05, + "loss": 0.8412, + "step": 17070 + }, + { + "epoch": 1.355851475520451, + "grad_norm": 0.8726456165313721, + "learning_rate": 2.7408642269323387e-05, + "loss": 0.8624, + "step": 17080 + }, + { + "epoch": 1.3566452995693505, + "grad_norm": 0.9366957545280457, + "learning_rate": 2.739541160593792e-05, + "loss": 0.8055, + "step": 17090 + }, + { + "epoch": 1.3574391236182501, + "grad_norm": 0.7526766657829285, + "learning_rate": 2.738218094255246e-05, + "loss": 0.901, + "step": 17100 + }, + { + "epoch": 1.3582329476671495, + "grad_norm": 0.8756746053695679, + "learning_rate": 2.7368950279167e-05, + "loss": 0.8179, + "step": 17110 + }, + { + "epoch": 1.3590267717160491, + "grad_norm": 0.7645390629768372, + "learning_rate": 2.735571961578154e-05, + "loss": 0.8382, + "step": 17120 + }, + { + "epoch": 1.3598205957649487, + "grad_norm": 0.8134385943412781, + "learning_rate": 2.7342488952396072e-05, + "loss": 0.8029, + "step": 17130 + }, + { + "epoch": 1.3606144198138483, + "grad_norm": 0.8174660205841064, + "learning_rate": 2.732925828901061e-05, + "loss": 0.8108, + "step": 17140 + }, + { + "epoch": 1.3614082438627477, + "grad_norm": 0.8981174230575562, + "learning_rate": 2.731602762562515e-05, + "loss": 0.8636, + "step": 17150 + }, + { + "epoch": 1.3622020679116473, + "grad_norm": 0.7757834792137146, + "learning_rate": 2.7302796962239692e-05, + "loss": 0.8198, + "step": 17160 + }, + { + "epoch": 1.362995891960547, + "grad_norm": 0.850306510925293, + "learning_rate": 2.7289566298854224e-05, + "loss": 0.8118, + "step": 17170 + }, + { + "epoch": 1.3637897160094465, + "grad_norm": 0.7254217863082886, + "learning_rate": 2.7276335635468763e-05, + "loss": 0.827, + "step": 17180 + }, + { + "epoch": 1.3645835400583461, + "grad_norm": 1.0736218690872192, + "learning_rate": 2.7263104972083302e-05, + "loss": 0.8839, + "step": 17190 + }, + { + "epoch": 1.3653773641072458, + "grad_norm": 0.8943197131156921, + "learning_rate": 2.7249874308697838e-05, + "loss": 0.7996, + "step": 17200 + }, + { + "epoch": 1.3661711881561451, + "grad_norm": 0.9427874684333801, + "learning_rate": 2.7236643645312377e-05, + "loss": 0.8843, + "step": 17210 + }, + { + "epoch": 1.3669650122050447, + "grad_norm": 1.0540951490402222, + "learning_rate": 2.7223412981926915e-05, + "loss": 0.8258, + "step": 17220 + }, + { + "epoch": 1.3677588362539443, + "grad_norm": 0.827373206615448, + "learning_rate": 2.7210182318541454e-05, + "loss": 0.8492, + "step": 17230 + }, + { + "epoch": 1.368552660302844, + "grad_norm": 0.7452979683876038, + "learning_rate": 2.719695165515599e-05, + "loss": 0.8457, + "step": 17240 + }, + { + "epoch": 1.3693464843517433, + "grad_norm": 1.0606896877288818, + "learning_rate": 2.718372099177053e-05, + "loss": 0.8411, + "step": 17250 + }, + { + "epoch": 1.370140308400643, + "grad_norm": 0.6817187070846558, + "learning_rate": 2.7170490328385068e-05, + "loss": 0.7858, + "step": 17260 + }, + { + "epoch": 1.3709341324495425, + "grad_norm": 0.9781541228294373, + "learning_rate": 2.7157259664999607e-05, + "loss": 0.8377, + "step": 17270 + }, + { + "epoch": 1.3717279564984421, + "grad_norm": 0.787799060344696, + "learning_rate": 2.7144029001614142e-05, + "loss": 0.8435, + "step": 17280 + }, + { + "epoch": 1.3725217805473418, + "grad_norm": 0.7571372389793396, + "learning_rate": 2.713079833822868e-05, + "loss": 0.8434, + "step": 17290 + }, + { + "epoch": 1.3733156045962414, + "grad_norm": 0.6738792061805725, + "learning_rate": 2.711756767484322e-05, + "loss": 0.8412, + "step": 17300 + }, + { + "epoch": 1.3741094286451407, + "grad_norm": 1.037701964378357, + "learning_rate": 2.7104337011457752e-05, + "loss": 0.831, + "step": 17310 + }, + { + "epoch": 1.3749032526940403, + "grad_norm": 0.7781122326850891, + "learning_rate": 2.709110634807229e-05, + "loss": 0.8592, + "step": 17320 + }, + { + "epoch": 1.37569707674294, + "grad_norm": 0.9618025422096252, + "learning_rate": 2.7077875684686833e-05, + "loss": 0.808, + "step": 17330 + }, + { + "epoch": 1.3764909007918396, + "grad_norm": 0.768357515335083, + "learning_rate": 2.7064645021301372e-05, + "loss": 0.8701, + "step": 17340 + }, + { + "epoch": 1.377284724840739, + "grad_norm": 0.8587696552276611, + "learning_rate": 2.7051414357915905e-05, + "loss": 0.8703, + "step": 17350 + }, + { + "epoch": 1.3780785488896385, + "grad_norm": 0.9739010334014893, + "learning_rate": 2.7038183694530443e-05, + "loss": 0.7343, + "step": 17360 + }, + { + "epoch": 1.3788723729385381, + "grad_norm": 0.7748016119003296, + "learning_rate": 2.7024953031144982e-05, + "loss": 0.7732, + "step": 17370 + }, + { + "epoch": 1.3796661969874378, + "grad_norm": 0.7629042863845825, + "learning_rate": 2.7011722367759525e-05, + "loss": 0.7921, + "step": 17380 + }, + { + "epoch": 1.3804600210363374, + "grad_norm": 0.9152623414993286, + "learning_rate": 2.6998491704374057e-05, + "loss": 0.8426, + "step": 17390 + }, + { + "epoch": 1.381253845085237, + "grad_norm": 0.8051658868789673, + "learning_rate": 2.6985261040988596e-05, + "loss": 0.8433, + "step": 17400 + }, + { + "epoch": 1.3820476691341363, + "grad_norm": 0.7106397151947021, + "learning_rate": 2.6972030377603135e-05, + "loss": 0.8297, + "step": 17410 + }, + { + "epoch": 1.382841493183036, + "grad_norm": 0.7966054677963257, + "learning_rate": 2.695879971421767e-05, + "loss": 0.9135, + "step": 17420 + }, + { + "epoch": 1.3836353172319356, + "grad_norm": 0.8528908491134644, + "learning_rate": 2.694556905083221e-05, + "loss": 0.7658, + "step": 17430 + }, + { + "epoch": 1.3844291412808352, + "grad_norm": 0.7444737553596497, + "learning_rate": 2.6932338387446748e-05, + "loss": 0.7901, + "step": 17440 + }, + { + "epoch": 1.3852229653297345, + "grad_norm": 0.6893477439880371, + "learning_rate": 2.6919107724061287e-05, + "loss": 0.8823, + "step": 17450 + }, + { + "epoch": 1.3860167893786342, + "grad_norm": 0.8451089859008789, + "learning_rate": 2.6905877060675823e-05, + "loss": 0.8699, + "step": 17460 + }, + { + "epoch": 1.3868106134275338, + "grad_norm": 0.7939800024032593, + "learning_rate": 2.689264639729036e-05, + "loss": 0.8529, + "step": 17470 + }, + { + "epoch": 1.3876044374764334, + "grad_norm": 0.7365521788597107, + "learning_rate": 2.68794157339049e-05, + "loss": 0.8272, + "step": 17480 + }, + { + "epoch": 1.388398261525333, + "grad_norm": 0.8047791123390198, + "learning_rate": 2.686618507051944e-05, + "loss": 0.7692, + "step": 17490 + }, + { + "epoch": 1.3891920855742326, + "grad_norm": 0.7910019755363464, + "learning_rate": 2.6852954407133975e-05, + "loss": 0.8384, + "step": 17500 + }, + { + "epoch": 1.389985909623132, + "grad_norm": 0.8277596235275269, + "learning_rate": 2.6839723743748514e-05, + "loss": 0.7949, + "step": 17510 + }, + { + "epoch": 1.3907797336720316, + "grad_norm": 1.008076786994934, + "learning_rate": 2.6826493080363053e-05, + "loss": 0.8341, + "step": 17520 + }, + { + "epoch": 1.3915735577209312, + "grad_norm": 0.7078754305839539, + "learning_rate": 2.6813262416977585e-05, + "loss": 0.8233, + "step": 17530 + }, + { + "epoch": 1.3923673817698308, + "grad_norm": 0.8263536095619202, + "learning_rate": 2.6800031753592124e-05, + "loss": 0.8366, + "step": 17540 + }, + { + "epoch": 1.3931612058187302, + "grad_norm": 0.8170019388198853, + "learning_rate": 2.6786801090206666e-05, + "loss": 0.83, + "step": 17550 + }, + { + "epoch": 1.3939550298676298, + "grad_norm": 0.8896247744560242, + "learning_rate": 2.6773570426821205e-05, + "loss": 0.8493, + "step": 17560 + }, + { + "epoch": 1.3947488539165294, + "grad_norm": 0.6989545822143555, + "learning_rate": 2.6760339763435737e-05, + "loss": 0.8151, + "step": 17570 + }, + { + "epoch": 1.395542677965429, + "grad_norm": 0.956941545009613, + "learning_rate": 2.6747109100050276e-05, + "loss": 0.8058, + "step": 17580 + }, + { + "epoch": 1.3963365020143286, + "grad_norm": 0.8271597027778625, + "learning_rate": 2.6733878436664815e-05, + "loss": 0.7853, + "step": 17590 + }, + { + "epoch": 1.3971303260632282, + "grad_norm": 0.7426630258560181, + "learning_rate": 2.6720647773279357e-05, + "loss": 0.7689, + "step": 17600 + }, + { + "epoch": 1.3979241501121278, + "grad_norm": 0.8329174518585205, + "learning_rate": 2.670741710989389e-05, + "loss": 0.8266, + "step": 17610 + }, + { + "epoch": 1.3987179741610272, + "grad_norm": 0.918170154094696, + "learning_rate": 2.669418644650843e-05, + "loss": 0.8444, + "step": 17620 + }, + { + "epoch": 1.3995117982099268, + "grad_norm": 0.6863502264022827, + "learning_rate": 2.6680955783122967e-05, + "loss": 0.8654, + "step": 17630 + }, + { + "epoch": 1.4003056222588264, + "grad_norm": 0.8955294489860535, + "learning_rate": 2.6667725119737503e-05, + "loss": 0.8649, + "step": 17640 + }, + { + "epoch": 1.4010994463077258, + "grad_norm": 0.8615051507949829, + "learning_rate": 2.6654494456352042e-05, + "loss": 0.7899, + "step": 17650 + }, + { + "epoch": 1.4018932703566254, + "grad_norm": 0.9451674222946167, + "learning_rate": 2.664126379296658e-05, + "loss": 0.8824, + "step": 17660 + }, + { + "epoch": 1.402687094405525, + "grad_norm": 0.8407998085021973, + "learning_rate": 2.662803312958112e-05, + "loss": 0.7696, + "step": 17670 + }, + { + "epoch": 1.4034809184544246, + "grad_norm": 0.8204694390296936, + "learning_rate": 2.6614802466195655e-05, + "loss": 0.7892, + "step": 17680 + }, + { + "epoch": 1.4042747425033242, + "grad_norm": 0.7346405982971191, + "learning_rate": 2.6601571802810194e-05, + "loss": 0.794, + "step": 17690 + }, + { + "epoch": 1.4050685665522238, + "grad_norm": 0.7684462070465088, + "learning_rate": 2.6588341139424733e-05, + "loss": 0.8656, + "step": 17700 + }, + { + "epoch": 1.4058623906011234, + "grad_norm": 0.7464351058006287, + "learning_rate": 2.6575110476039272e-05, + "loss": 0.8215, + "step": 17710 + }, + { + "epoch": 1.4066562146500228, + "grad_norm": 0.8222455382347107, + "learning_rate": 2.6561879812653808e-05, + "loss": 0.835, + "step": 17720 + }, + { + "epoch": 1.4074500386989224, + "grad_norm": 0.9029430150985718, + "learning_rate": 2.6548649149268347e-05, + "loss": 0.8038, + "step": 17730 + }, + { + "epoch": 1.408243862747822, + "grad_norm": 0.6402363777160645, + "learning_rate": 2.6535418485882885e-05, + "loss": 0.8557, + "step": 17740 + }, + { + "epoch": 1.4090376867967216, + "grad_norm": 0.7686623930931091, + "learning_rate": 2.6522187822497424e-05, + "loss": 0.7969, + "step": 17750 + }, + { + "epoch": 1.409831510845621, + "grad_norm": 0.8864317536354065, + "learning_rate": 2.6508957159111957e-05, + "loss": 0.8525, + "step": 17760 + }, + { + "epoch": 1.4106253348945206, + "grad_norm": 0.8594456911087036, + "learning_rate": 2.64957264957265e-05, + "loss": 0.8271, + "step": 17770 + }, + { + "epoch": 1.4114191589434202, + "grad_norm": 0.8445634841918945, + "learning_rate": 2.6482495832341038e-05, + "loss": 0.761, + "step": 17780 + }, + { + "epoch": 1.4122129829923198, + "grad_norm": 0.8919235467910767, + "learning_rate": 2.646926516895557e-05, + "loss": 0.8448, + "step": 17790 + }, + { + "epoch": 1.4130068070412194, + "grad_norm": 0.9002474546432495, + "learning_rate": 2.645603450557011e-05, + "loss": 0.873, + "step": 17800 + }, + { + "epoch": 1.413800631090119, + "grad_norm": 0.8635540008544922, + "learning_rate": 2.6442803842184648e-05, + "loss": 0.8874, + "step": 17810 + }, + { + "epoch": 1.4145944551390184, + "grad_norm": 0.8504475355148315, + "learning_rate": 2.642957317879919e-05, + "loss": 0.8296, + "step": 17820 + }, + { + "epoch": 1.415388279187918, + "grad_norm": 0.8239719867706299, + "learning_rate": 2.6416342515413722e-05, + "loss": 0.7893, + "step": 17830 + }, + { + "epoch": 1.4161821032368176, + "grad_norm": 0.7520464062690735, + "learning_rate": 2.640311185202826e-05, + "loss": 0.8056, + "step": 17840 + }, + { + "epoch": 1.4169759272857172, + "grad_norm": 0.8690072894096375, + "learning_rate": 2.63898811886428e-05, + "loss": 0.8513, + "step": 17850 + }, + { + "epoch": 1.4177697513346166, + "grad_norm": 0.6335851550102234, + "learning_rate": 2.637665052525734e-05, + "loss": 0.8591, + "step": 17860 + }, + { + "epoch": 1.4185635753835162, + "grad_norm": 0.9497599005699158, + "learning_rate": 2.6363419861871875e-05, + "loss": 0.8655, + "step": 17870 + }, + { + "epoch": 1.4193573994324158, + "grad_norm": 0.7551174163818359, + "learning_rate": 2.6350189198486413e-05, + "loss": 0.7851, + "step": 17880 + }, + { + "epoch": 1.4201512234813154, + "grad_norm": 0.9593257904052734, + "learning_rate": 2.6336958535100952e-05, + "loss": 0.8049, + "step": 17890 + }, + { + "epoch": 1.420945047530215, + "grad_norm": 0.8998299837112427, + "learning_rate": 2.6323727871715488e-05, + "loss": 0.8266, + "step": 17900 + }, + { + "epoch": 1.4217388715791146, + "grad_norm": 0.8189812302589417, + "learning_rate": 2.6310497208330027e-05, + "loss": 0.8333, + "step": 17910 + }, + { + "epoch": 1.422532695628014, + "grad_norm": 0.7574518322944641, + "learning_rate": 2.6297266544944566e-05, + "loss": 0.8794, + "step": 17920 + }, + { + "epoch": 1.4233265196769136, + "grad_norm": 0.815168559551239, + "learning_rate": 2.6284035881559105e-05, + "loss": 0.8662, + "step": 17930 + }, + { + "epoch": 1.4241203437258132, + "grad_norm": 0.7286579012870789, + "learning_rate": 2.627080521817364e-05, + "loss": 0.8239, + "step": 17940 + }, + { + "epoch": 1.4249141677747128, + "grad_norm": 0.8906771540641785, + "learning_rate": 2.625757455478818e-05, + "loss": 0.8331, + "step": 17950 + }, + { + "epoch": 1.4257079918236122, + "grad_norm": 0.7556918859481812, + "learning_rate": 2.6244343891402718e-05, + "loss": 0.8623, + "step": 17960 + }, + { + "epoch": 1.4265018158725118, + "grad_norm": 0.8590714931488037, + "learning_rate": 2.6231113228017257e-05, + "loss": 0.7373, + "step": 17970 + }, + { + "epoch": 1.4272956399214114, + "grad_norm": 0.827892541885376, + "learning_rate": 2.621788256463179e-05, + "loss": 0.7714, + "step": 17980 + }, + { + "epoch": 1.428089463970311, + "grad_norm": 0.7814650535583496, + "learning_rate": 2.620465190124633e-05, + "loss": 0.8154, + "step": 17990 + }, + { + "epoch": 1.4288832880192106, + "grad_norm": 0.8106557130813599, + "learning_rate": 2.619142123786087e-05, + "loss": 0.7834, + "step": 18000 + }, + { + "epoch": 1.4296771120681102, + "grad_norm": 0.7437555193901062, + "learning_rate": 2.6178190574475403e-05, + "loss": 0.7773, + "step": 18010 + }, + { + "epoch": 1.4304709361170096, + "grad_norm": 0.9069659113883972, + "learning_rate": 2.616495991108994e-05, + "loss": 0.7767, + "step": 18020 + }, + { + "epoch": 1.4312647601659092, + "grad_norm": 0.8167043924331665, + "learning_rate": 2.615172924770448e-05, + "loss": 0.8079, + "step": 18030 + }, + { + "epoch": 1.4320585842148088, + "grad_norm": 0.8131256103515625, + "learning_rate": 2.6138498584319023e-05, + "loss": 0.806, + "step": 18040 + }, + { + "epoch": 1.4328524082637084, + "grad_norm": 0.8274345993995667, + "learning_rate": 2.6125267920933555e-05, + "loss": 0.8135, + "step": 18050 + }, + { + "epoch": 1.4336462323126078, + "grad_norm": 0.9551146626472473, + "learning_rate": 2.6112037257548094e-05, + "loss": 0.852, + "step": 18060 + }, + { + "epoch": 1.4344400563615074, + "grad_norm": 0.8088938593864441, + "learning_rate": 2.6098806594162633e-05, + "loss": 0.7876, + "step": 18070 + }, + { + "epoch": 1.435233880410407, + "grad_norm": 0.7884169816970825, + "learning_rate": 2.608557593077717e-05, + "loss": 0.7942, + "step": 18080 + }, + { + "epoch": 1.4360277044593066, + "grad_norm": 0.8303605914115906, + "learning_rate": 2.6072345267391707e-05, + "loss": 0.8792, + "step": 18090 + }, + { + "epoch": 1.4368215285082062, + "grad_norm": 0.75099778175354, + "learning_rate": 2.6059114604006246e-05, + "loss": 0.7786, + "step": 18100 + }, + { + "epoch": 1.4376153525571058, + "grad_norm": 0.7984867095947266, + "learning_rate": 2.6045883940620785e-05, + "loss": 0.786, + "step": 18110 + }, + { + "epoch": 1.4384091766060052, + "grad_norm": 0.7262865900993347, + "learning_rate": 2.603265327723532e-05, + "loss": 0.8207, + "step": 18120 + }, + { + "epoch": 1.4392030006549048, + "grad_norm": 0.8592838048934937, + "learning_rate": 2.601942261384986e-05, + "loss": 0.8127, + "step": 18130 + }, + { + "epoch": 1.4399968247038044, + "grad_norm": 0.6876896619796753, + "learning_rate": 2.60061919504644e-05, + "loss": 0.8638, + "step": 18140 + }, + { + "epoch": 1.440790648752704, + "grad_norm": 0.9342442154884338, + "learning_rate": 2.5992961287078937e-05, + "loss": 0.8032, + "step": 18150 + }, + { + "epoch": 1.4415844728016034, + "grad_norm": 0.9930727481842041, + "learning_rate": 2.597973062369347e-05, + "loss": 0.7582, + "step": 18160 + }, + { + "epoch": 1.442378296850503, + "grad_norm": 0.8912200331687927, + "learning_rate": 2.5966499960308012e-05, + "loss": 0.8152, + "step": 18170 + }, + { + "epoch": 1.4431721208994026, + "grad_norm": 0.7123863697052002, + "learning_rate": 2.595326929692255e-05, + "loss": 0.7756, + "step": 18180 + }, + { + "epoch": 1.4439659449483022, + "grad_norm": 0.8635473251342773, + "learning_rate": 2.594003863353709e-05, + "loss": 0.8072, + "step": 18190 + }, + { + "epoch": 1.4447597689972018, + "grad_norm": 0.7606240510940552, + "learning_rate": 2.5926807970151622e-05, + "loss": 0.8457, + "step": 18200 + }, + { + "epoch": 1.4455535930461014, + "grad_norm": 0.7793667912483215, + "learning_rate": 2.591357730676616e-05, + "loss": 0.8179, + "step": 18210 + }, + { + "epoch": 1.446347417095001, + "grad_norm": 0.6772461533546448, + "learning_rate": 2.5900346643380703e-05, + "loss": 0.8746, + "step": 18220 + }, + { + "epoch": 1.4471412411439004, + "grad_norm": 0.9796277284622192, + "learning_rate": 2.5887115979995235e-05, + "loss": 0.8008, + "step": 18230 + }, + { + "epoch": 1.4479350651928, + "grad_norm": 0.8049467206001282, + "learning_rate": 2.5873885316609774e-05, + "loss": 0.7848, + "step": 18240 + }, + { + "epoch": 1.4487288892416996, + "grad_norm": 0.8529013991355896, + "learning_rate": 2.5860654653224313e-05, + "loss": 0.7956, + "step": 18250 + }, + { + "epoch": 1.449522713290599, + "grad_norm": 1.007569432258606, + "learning_rate": 2.5847423989838852e-05, + "loss": 0.7736, + "step": 18260 + }, + { + "epoch": 1.4503165373394986, + "grad_norm": 0.8529340624809265, + "learning_rate": 2.5834193326453388e-05, + "loss": 0.8122, + "step": 18270 + }, + { + "epoch": 1.4511103613883982, + "grad_norm": 0.8169769644737244, + "learning_rate": 2.5820962663067927e-05, + "loss": 0.841, + "step": 18280 + }, + { + "epoch": 1.4519041854372978, + "grad_norm": 0.7556354999542236, + "learning_rate": 2.5807731999682465e-05, + "loss": 0.8136, + "step": 18290 + }, + { + "epoch": 1.4526980094861974, + "grad_norm": 0.867939293384552, + "learning_rate": 2.5794501336297004e-05, + "loss": 0.8153, + "step": 18300 + }, + { + "epoch": 1.453491833535097, + "grad_norm": 0.9463819861412048, + "learning_rate": 2.578127067291154e-05, + "loss": 0.8175, + "step": 18310 + }, + { + "epoch": 1.4542856575839966, + "grad_norm": 0.9238904714584351, + "learning_rate": 2.576804000952608e-05, + "loss": 0.7291, + "step": 18320 + }, + { + "epoch": 1.455079481632896, + "grad_norm": 0.8403037786483765, + "learning_rate": 2.5754809346140618e-05, + "loss": 0.864, + "step": 18330 + }, + { + "epoch": 1.4558733056817956, + "grad_norm": 0.8228583931922913, + "learning_rate": 2.5741578682755153e-05, + "loss": 0.8357, + "step": 18340 + }, + { + "epoch": 1.4566671297306952, + "grad_norm": 0.730689287185669, + "learning_rate": 2.5728348019369692e-05, + "loss": 0.8575, + "step": 18350 + }, + { + "epoch": 1.4574609537795946, + "grad_norm": 0.7443234920501709, + "learning_rate": 2.571511735598423e-05, + "loss": 0.8414, + "step": 18360 + }, + { + "epoch": 1.4582547778284942, + "grad_norm": 0.9750908017158508, + "learning_rate": 2.570188669259877e-05, + "loss": 0.8359, + "step": 18370 + }, + { + "epoch": 1.4590486018773938, + "grad_norm": 0.9453748464584351, + "learning_rate": 2.5688656029213302e-05, + "loss": 0.7975, + "step": 18380 + }, + { + "epoch": 1.4598424259262934, + "grad_norm": 0.826844334602356, + "learning_rate": 2.5675425365827845e-05, + "loss": 0.7952, + "step": 18390 + }, + { + "epoch": 1.460636249975193, + "grad_norm": 0.937788724899292, + "learning_rate": 2.5662194702442383e-05, + "loss": 0.8504, + "step": 18400 + }, + { + "epoch": 1.4614300740240926, + "grad_norm": 0.9471660852432251, + "learning_rate": 2.5648964039056922e-05, + "loss": 0.8499, + "step": 18410 + }, + { + "epoch": 1.4622238980729922, + "grad_norm": 0.8096206784248352, + "learning_rate": 2.5635733375671455e-05, + "loss": 0.8297, + "step": 18420 + }, + { + "epoch": 1.4630177221218916, + "grad_norm": 1.0029072761535645, + "learning_rate": 2.5622502712285993e-05, + "loss": 0.8496, + "step": 18430 + }, + { + "epoch": 1.4638115461707912, + "grad_norm": 0.9172188639640808, + "learning_rate": 2.5609272048900536e-05, + "loss": 0.823, + "step": 18440 + }, + { + "epoch": 1.4646053702196908, + "grad_norm": 0.8738644123077393, + "learning_rate": 2.5596041385515068e-05, + "loss": 0.8175, + "step": 18450 + }, + { + "epoch": 1.4653991942685904, + "grad_norm": 0.7507184743881226, + "learning_rate": 2.5582810722129607e-05, + "loss": 0.7805, + "step": 18460 + }, + { + "epoch": 1.4661930183174898, + "grad_norm": 0.8251667022705078, + "learning_rate": 2.5569580058744146e-05, + "loss": 0.8151, + "step": 18470 + }, + { + "epoch": 1.4669868423663894, + "grad_norm": 0.9038745164871216, + "learning_rate": 2.5556349395358685e-05, + "loss": 0.8222, + "step": 18480 + }, + { + "epoch": 1.467780666415289, + "grad_norm": 0.7549251914024353, + "learning_rate": 2.554311873197322e-05, + "loss": 0.7928, + "step": 18490 + }, + { + "epoch": 1.4685744904641886, + "grad_norm": 0.8208056688308716, + "learning_rate": 2.552988806858776e-05, + "loss": 0.8973, + "step": 18500 + }, + { + "epoch": 1.4693683145130882, + "grad_norm": 1.0672497749328613, + "learning_rate": 2.5516657405202298e-05, + "loss": 0.8407, + "step": 18510 + }, + { + "epoch": 1.4701621385619879, + "grad_norm": 0.8073993921279907, + "learning_rate": 2.5503426741816837e-05, + "loss": 0.8038, + "step": 18520 + }, + { + "epoch": 1.4709559626108872, + "grad_norm": 0.9057318568229675, + "learning_rate": 2.5490196078431373e-05, + "loss": 0.829, + "step": 18530 + }, + { + "epoch": 1.4717497866597868, + "grad_norm": 0.7318175435066223, + "learning_rate": 2.547696541504591e-05, + "loss": 0.8253, + "step": 18540 + }, + { + "epoch": 1.4725436107086864, + "grad_norm": 0.819756805896759, + "learning_rate": 2.546373475166045e-05, + "loss": 0.8265, + "step": 18550 + }, + { + "epoch": 1.473337434757586, + "grad_norm": 0.852188766002655, + "learning_rate": 2.5450504088274986e-05, + "loss": 0.7702, + "step": 18560 + }, + { + "epoch": 1.4741312588064854, + "grad_norm": 0.8986577391624451, + "learning_rate": 2.5437273424889525e-05, + "loss": 0.8488, + "step": 18570 + }, + { + "epoch": 1.474925082855385, + "grad_norm": 0.6667974591255188, + "learning_rate": 2.5424042761504064e-05, + "loss": 0.8424, + "step": 18580 + }, + { + "epoch": 1.4757189069042846, + "grad_norm": 0.7348021864891052, + "learning_rate": 2.5410812098118603e-05, + "loss": 0.7877, + "step": 18590 + }, + { + "epoch": 1.4765127309531842, + "grad_norm": 0.7171440720558167, + "learning_rate": 2.5397581434733135e-05, + "loss": 0.8142, + "step": 18600 + }, + { + "epoch": 1.4773065550020839, + "grad_norm": 0.7128421068191528, + "learning_rate": 2.5384350771347677e-05, + "loss": 0.8373, + "step": 18610 + }, + { + "epoch": 1.4781003790509835, + "grad_norm": 0.7781753540039062, + "learning_rate": 2.5371120107962216e-05, + "loss": 0.7994, + "step": 18620 + }, + { + "epoch": 1.4788942030998828, + "grad_norm": 0.8336313366889954, + "learning_rate": 2.5357889444576755e-05, + "loss": 0.8565, + "step": 18630 + }, + { + "epoch": 1.4796880271487824, + "grad_norm": 0.9292792677879333, + "learning_rate": 2.5344658781191287e-05, + "loss": 0.7621, + "step": 18640 + }, + { + "epoch": 1.480481851197682, + "grad_norm": 0.81519615650177, + "learning_rate": 2.5331428117805826e-05, + "loss": 0.8161, + "step": 18650 + }, + { + "epoch": 1.4812756752465817, + "grad_norm": 0.7871677279472351, + "learning_rate": 2.531819745442037e-05, + "loss": 0.8304, + "step": 18660 + }, + { + "epoch": 1.482069499295481, + "grad_norm": 0.8033188581466675, + "learning_rate": 2.5304966791034907e-05, + "loss": 0.859, + "step": 18670 + }, + { + "epoch": 1.4828633233443806, + "grad_norm": 0.8543998003005981, + "learning_rate": 2.529173612764944e-05, + "loss": 0.8521, + "step": 18680 + }, + { + "epoch": 1.4836571473932803, + "grad_norm": 0.7945504188537598, + "learning_rate": 2.527850546426398e-05, + "loss": 0.8477, + "step": 18690 + }, + { + "epoch": 1.4844509714421799, + "grad_norm": 0.9741247892379761, + "learning_rate": 2.5265274800878517e-05, + "loss": 0.8306, + "step": 18700 + }, + { + "epoch": 1.4852447954910795, + "grad_norm": 1.0190260410308838, + "learning_rate": 2.5252044137493053e-05, + "loss": 0.7352, + "step": 18710 + }, + { + "epoch": 1.486038619539979, + "grad_norm": 0.8218889832496643, + "learning_rate": 2.5238813474107592e-05, + "loss": 0.8306, + "step": 18720 + }, + { + "epoch": 1.4868324435888784, + "grad_norm": 0.9101859331130981, + "learning_rate": 2.522558281072213e-05, + "loss": 0.8644, + "step": 18730 + }, + { + "epoch": 1.487626267637778, + "grad_norm": 0.8741206526756287, + "learning_rate": 2.521235214733667e-05, + "loss": 0.811, + "step": 18740 + }, + { + "epoch": 1.4884200916866777, + "grad_norm": 0.7072809338569641, + "learning_rate": 2.5199121483951205e-05, + "loss": 0.8437, + "step": 18750 + }, + { + "epoch": 1.4892139157355773, + "grad_norm": 0.8772575855255127, + "learning_rate": 2.5185890820565744e-05, + "loss": 0.7746, + "step": 18760 + }, + { + "epoch": 1.4900077397844766, + "grad_norm": 0.8278160095214844, + "learning_rate": 2.5172660157180283e-05, + "loss": 0.8073, + "step": 18770 + }, + { + "epoch": 1.4908015638333763, + "grad_norm": 0.8926985859870911, + "learning_rate": 2.5159429493794822e-05, + "loss": 0.8103, + "step": 18780 + }, + { + "epoch": 1.4915953878822759, + "grad_norm": 1.0303518772125244, + "learning_rate": 2.5146198830409358e-05, + "loss": 0.8209, + "step": 18790 + }, + { + "epoch": 1.4923892119311755, + "grad_norm": 0.7462199926376343, + "learning_rate": 2.5132968167023897e-05, + "loss": 0.7996, + "step": 18800 + }, + { + "epoch": 1.493183035980075, + "grad_norm": 0.8517622351646423, + "learning_rate": 2.5119737503638435e-05, + "loss": 0.7538, + "step": 18810 + }, + { + "epoch": 1.4939768600289747, + "grad_norm": 0.8433437347412109, + "learning_rate": 2.5106506840252968e-05, + "loss": 0.8481, + "step": 18820 + }, + { + "epoch": 1.494770684077874, + "grad_norm": 0.8017115592956543, + "learning_rate": 2.509327617686751e-05, + "loss": 0.7835, + "step": 18830 + }, + { + "epoch": 1.4955645081267737, + "grad_norm": 0.8956550359725952, + "learning_rate": 2.508004551348205e-05, + "loss": 0.746, + "step": 18840 + }, + { + "epoch": 1.4963583321756733, + "grad_norm": 0.8071773648262024, + "learning_rate": 2.5066814850096588e-05, + "loss": 0.8029, + "step": 18850 + }, + { + "epoch": 1.4971521562245729, + "grad_norm": 0.8614389300346375, + "learning_rate": 2.505358418671112e-05, + "loss": 0.8419, + "step": 18860 + }, + { + "epoch": 1.4979459802734723, + "grad_norm": 0.8073408007621765, + "learning_rate": 2.504035352332566e-05, + "loss": 0.8295, + "step": 18870 + }, + { + "epoch": 1.4987398043223719, + "grad_norm": 0.8368566036224365, + "learning_rate": 2.50271228599402e-05, + "loss": 0.8155, + "step": 18880 + }, + { + "epoch": 1.4995336283712715, + "grad_norm": 0.7750986814498901, + "learning_rate": 2.501389219655474e-05, + "loss": 0.7991, + "step": 18890 + }, + { + "epoch": 1.500327452420171, + "grad_norm": 0.843612015247345, + "learning_rate": 2.5000661533169272e-05, + "loss": 0.8506, + "step": 18900 + }, + { + "epoch": 1.5011212764690707, + "grad_norm": 0.9953761100769043, + "learning_rate": 2.498743086978381e-05, + "loss": 0.8303, + "step": 18910 + }, + { + "epoch": 1.5019151005179703, + "grad_norm": 0.8304370641708374, + "learning_rate": 2.497420020639835e-05, + "loss": 0.7987, + "step": 18920 + }, + { + "epoch": 1.5027089245668699, + "grad_norm": 0.6083853840827942, + "learning_rate": 2.496096954301289e-05, + "loss": 0.7744, + "step": 18930 + }, + { + "epoch": 1.5035027486157693, + "grad_norm": 0.8940452933311462, + "learning_rate": 2.4947738879627425e-05, + "loss": 0.7647, + "step": 18940 + }, + { + "epoch": 1.5042965726646689, + "grad_norm": 0.723276674747467, + "learning_rate": 2.4934508216241963e-05, + "loss": 0.8258, + "step": 18950 + }, + { + "epoch": 1.5050903967135685, + "grad_norm": 0.8840285539627075, + "learning_rate": 2.49212775528565e-05, + "loss": 0.8022, + "step": 18960 + }, + { + "epoch": 1.5058842207624679, + "grad_norm": 0.775147020816803, + "learning_rate": 2.490804688947104e-05, + "loss": 0.8434, + "step": 18970 + }, + { + "epoch": 1.5066780448113675, + "grad_norm": 0.9234639406204224, + "learning_rate": 2.4894816226085577e-05, + "loss": 0.8079, + "step": 18980 + }, + { + "epoch": 1.507471868860267, + "grad_norm": 0.7386789321899414, + "learning_rate": 2.4881585562700116e-05, + "loss": 0.8194, + "step": 18990 + }, + { + "epoch": 1.5082656929091667, + "grad_norm": 0.9541153311729431, + "learning_rate": 2.486835489931465e-05, + "loss": 0.7625, + "step": 19000 + }, + { + "epoch": 1.5090595169580663, + "grad_norm": 0.9702879190444946, + "learning_rate": 2.485512423592919e-05, + "loss": 0.8113, + "step": 19010 + }, + { + "epoch": 1.5098533410069659, + "grad_norm": 0.7624096870422363, + "learning_rate": 2.484189357254373e-05, + "loss": 0.8299, + "step": 19020 + }, + { + "epoch": 1.5106471650558655, + "grad_norm": 0.734418511390686, + "learning_rate": 2.4828662909158265e-05, + "loss": 0.8394, + "step": 19030 + }, + { + "epoch": 1.5114409891047649, + "grad_norm": 0.8682621717453003, + "learning_rate": 2.4815432245772804e-05, + "loss": 0.8202, + "step": 19040 + }, + { + "epoch": 1.5122348131536645, + "grad_norm": 0.7648998498916626, + "learning_rate": 2.4802201582387343e-05, + "loss": 0.8419, + "step": 19050 + }, + { + "epoch": 1.513028637202564, + "grad_norm": 0.702462911605835, + "learning_rate": 2.478897091900188e-05, + "loss": 0.8125, + "step": 19060 + }, + { + "epoch": 1.5138224612514635, + "grad_norm": 0.7904557585716248, + "learning_rate": 2.4775740255616417e-05, + "loss": 0.8679, + "step": 19070 + }, + { + "epoch": 1.514616285300363, + "grad_norm": 1.0039995908737183, + "learning_rate": 2.4762509592230956e-05, + "loss": 0.8623, + "step": 19080 + }, + { + "epoch": 1.5154101093492627, + "grad_norm": 0.8769594430923462, + "learning_rate": 2.474927892884549e-05, + "loss": 0.8309, + "step": 19090 + }, + { + "epoch": 1.5162039333981623, + "grad_norm": 0.742843508720398, + "learning_rate": 2.473737133179858e-05, + "loss": 0.8068, + "step": 19100 + }, + { + "epoch": 1.516997757447062, + "grad_norm": 0.7758870720863342, + "learning_rate": 2.4724140668413115e-05, + "loss": 0.8617, + "step": 19110 + }, + { + "epoch": 1.5177915814959615, + "grad_norm": 0.8473050594329834, + "learning_rate": 2.4710910005027654e-05, + "loss": 0.8175, + "step": 19120 + }, + { + "epoch": 1.518585405544861, + "grad_norm": 0.6727568507194519, + "learning_rate": 2.469767934164219e-05, + "loss": 0.8389, + "step": 19130 + }, + { + "epoch": 1.5193792295937607, + "grad_norm": 0.7093884944915771, + "learning_rate": 2.468444867825673e-05, + "loss": 0.8226, + "step": 19140 + }, + { + "epoch": 1.52017305364266, + "grad_norm": 0.8359051942825317, + "learning_rate": 2.4671218014871267e-05, + "loss": 0.7772, + "step": 19150 + }, + { + "epoch": 1.5209668776915597, + "grad_norm": 0.6940776705741882, + "learning_rate": 2.4657987351485803e-05, + "loss": 0.7137, + "step": 19160 + }, + { + "epoch": 1.521760701740459, + "grad_norm": 0.8422114253044128, + "learning_rate": 2.464475668810034e-05, + "loss": 0.8425, + "step": 19170 + }, + { + "epoch": 1.5225545257893587, + "grad_norm": 1.108744740486145, + "learning_rate": 2.463152602471488e-05, + "loss": 0.8392, + "step": 19180 + }, + { + "epoch": 1.5233483498382583, + "grad_norm": 0.9370563626289368, + "learning_rate": 2.461829536132942e-05, + "loss": 0.8278, + "step": 19190 + }, + { + "epoch": 1.524142173887158, + "grad_norm": 0.8219224214553833, + "learning_rate": 2.4605064697943955e-05, + "loss": 0.7916, + "step": 19200 + }, + { + "epoch": 1.5249359979360575, + "grad_norm": 0.8902950882911682, + "learning_rate": 2.4591834034558494e-05, + "loss": 0.8494, + "step": 19210 + }, + { + "epoch": 1.525729821984957, + "grad_norm": 0.884885311126709, + "learning_rate": 2.4578603371173033e-05, + "loss": 0.8258, + "step": 19220 + }, + { + "epoch": 1.5265236460338567, + "grad_norm": 0.6714193224906921, + "learning_rate": 2.4565372707787572e-05, + "loss": 0.8377, + "step": 19230 + }, + { + "epoch": 1.5273174700827563, + "grad_norm": 0.9097526669502258, + "learning_rate": 2.4552142044402107e-05, + "loss": 0.8093, + "step": 19240 + }, + { + "epoch": 1.5281112941316557, + "grad_norm": 0.6362065672874451, + "learning_rate": 2.4538911381016646e-05, + "loss": 0.8245, + "step": 19250 + }, + { + "epoch": 1.5289051181805553, + "grad_norm": 0.7581408619880676, + "learning_rate": 2.4525680717631182e-05, + "loss": 0.8677, + "step": 19260 + }, + { + "epoch": 1.5296989422294547, + "grad_norm": 0.7132633328437805, + "learning_rate": 2.451245005424572e-05, + "loss": 0.8338, + "step": 19270 + }, + { + "epoch": 1.5304927662783543, + "grad_norm": 0.8469735383987427, + "learning_rate": 2.449921939086026e-05, + "loss": 0.9025, + "step": 19280 + }, + { + "epoch": 1.531286590327254, + "grad_norm": 0.8352085947990417, + "learning_rate": 2.4485988727474795e-05, + "loss": 0.8216, + "step": 19290 + }, + { + "epoch": 1.5320804143761535, + "grad_norm": 0.7701964378356934, + "learning_rate": 2.4472758064089334e-05, + "loss": 0.7969, + "step": 19300 + }, + { + "epoch": 1.532874238425053, + "grad_norm": 0.8008362054824829, + "learning_rate": 2.4459527400703873e-05, + "loss": 0.781, + "step": 19310 + }, + { + "epoch": 1.5336680624739527, + "grad_norm": 0.7611208558082581, + "learning_rate": 2.4446296737318412e-05, + "loss": 0.7799, + "step": 19320 + }, + { + "epoch": 1.5344618865228523, + "grad_norm": 0.7576231360435486, + "learning_rate": 2.4433066073932947e-05, + "loss": 0.8071, + "step": 19330 + }, + { + "epoch": 1.535255710571752, + "grad_norm": 0.9794348478317261, + "learning_rate": 2.4419835410547486e-05, + "loss": 0.8337, + "step": 19340 + }, + { + "epoch": 1.5360495346206513, + "grad_norm": 0.8349586725234985, + "learning_rate": 2.4406604747162022e-05, + "loss": 0.7267, + "step": 19350 + }, + { + "epoch": 1.536843358669551, + "grad_norm": 0.794275164604187, + "learning_rate": 2.4393374083776564e-05, + "loss": 0.7525, + "step": 19360 + }, + { + "epoch": 1.5376371827184503, + "grad_norm": 0.9255030155181885, + "learning_rate": 2.43801434203911e-05, + "loss": 0.7845, + "step": 19370 + }, + { + "epoch": 1.53843100676735, + "grad_norm": 0.7629058957099915, + "learning_rate": 2.436691275700564e-05, + "loss": 0.7849, + "step": 19380 + }, + { + "epoch": 1.5392248308162495, + "grad_norm": 0.8714119791984558, + "learning_rate": 2.4353682093620174e-05, + "loss": 0.8106, + "step": 19390 + }, + { + "epoch": 1.540018654865149, + "grad_norm": 0.9720879793167114, + "learning_rate": 2.4340451430234713e-05, + "loss": 0.868, + "step": 19400 + }, + { + "epoch": 1.5408124789140487, + "grad_norm": 0.848136842250824, + "learning_rate": 2.4327220766849252e-05, + "loss": 0.8204, + "step": 19410 + }, + { + "epoch": 1.5416063029629483, + "grad_norm": 0.8804022073745728, + "learning_rate": 2.4313990103463788e-05, + "loss": 0.8423, + "step": 19420 + }, + { + "epoch": 1.542400127011848, + "grad_norm": 0.7391209602355957, + "learning_rate": 2.4300759440078327e-05, + "loss": 0.8378, + "step": 19430 + }, + { + "epoch": 1.5431939510607475, + "grad_norm": 0.6705385446548462, + "learning_rate": 2.4287528776692866e-05, + "loss": 0.7564, + "step": 19440 + }, + { + "epoch": 1.543987775109647, + "grad_norm": 0.8526281118392944, + "learning_rate": 2.4274298113307404e-05, + "loss": 0.7472, + "step": 19450 + }, + { + "epoch": 1.5447815991585465, + "grad_norm": 0.7899938225746155, + "learning_rate": 2.426106744992194e-05, + "loss": 0.8292, + "step": 19460 + }, + { + "epoch": 1.5455754232074461, + "grad_norm": 0.9177964925765991, + "learning_rate": 2.424783678653648e-05, + "loss": 0.829, + "step": 19470 + }, + { + "epoch": 1.5463692472563455, + "grad_norm": 0.8367957472801208, + "learning_rate": 2.4234606123151014e-05, + "loss": 0.8395, + "step": 19480 + }, + { + "epoch": 1.547163071305245, + "grad_norm": 0.8440412878990173, + "learning_rate": 2.4221375459765557e-05, + "loss": 0.7762, + "step": 19490 + }, + { + "epoch": 1.5479568953541447, + "grad_norm": 0.8187562823295593, + "learning_rate": 2.4208144796380092e-05, + "loss": 0.8345, + "step": 19500 + }, + { + "epoch": 1.5487507194030443, + "grad_norm": 0.8997789025306702, + "learning_rate": 2.4194914132994628e-05, + "loss": 0.8156, + "step": 19510 + }, + { + "epoch": 1.549544543451944, + "grad_norm": 0.8826847672462463, + "learning_rate": 2.4181683469609167e-05, + "loss": 0.8558, + "step": 19520 + }, + { + "epoch": 1.5503383675008435, + "grad_norm": 0.8336629867553711, + "learning_rate": 2.4168452806223706e-05, + "loss": 0.9098, + "step": 19530 + }, + { + "epoch": 1.5511321915497431, + "grad_norm": 0.9518599510192871, + "learning_rate": 2.4155222142838245e-05, + "loss": 0.7868, + "step": 19540 + }, + { + "epoch": 1.5519260155986425, + "grad_norm": 0.6656529903411865, + "learning_rate": 2.414199147945278e-05, + "loss": 0.8164, + "step": 19550 + }, + { + "epoch": 1.5527198396475421, + "grad_norm": 0.8456233739852905, + "learning_rate": 2.412876081606732e-05, + "loss": 0.7531, + "step": 19560 + }, + { + "epoch": 1.5535136636964417, + "grad_norm": 0.8264173269271851, + "learning_rate": 2.4115530152681855e-05, + "loss": 0.8304, + "step": 19570 + }, + { + "epoch": 1.554307487745341, + "grad_norm": 0.9435857534408569, + "learning_rate": 2.4102299489296397e-05, + "loss": 0.8006, + "step": 19580 + }, + { + "epoch": 1.5551013117942407, + "grad_norm": 0.7315828204154968, + "learning_rate": 2.4089068825910932e-05, + "loss": 0.8037, + "step": 19590 + }, + { + "epoch": 1.5558951358431403, + "grad_norm": 0.8221999406814575, + "learning_rate": 2.407583816252547e-05, + "loss": 0.8318, + "step": 19600 + }, + { + "epoch": 1.55668895989204, + "grad_norm": 0.7483288645744324, + "learning_rate": 2.4062607499140007e-05, + "loss": 0.8202, + "step": 19610 + }, + { + "epoch": 1.5574827839409395, + "grad_norm": 0.7994224429130554, + "learning_rate": 2.4049376835754546e-05, + "loss": 0.7977, + "step": 19620 + }, + { + "epoch": 1.5582766079898391, + "grad_norm": 0.7807520031929016, + "learning_rate": 2.4036146172369085e-05, + "loss": 0.7979, + "step": 19630 + }, + { + "epoch": 1.5590704320387387, + "grad_norm": 0.771969199180603, + "learning_rate": 2.402291550898362e-05, + "loss": 0.8071, + "step": 19640 + }, + { + "epoch": 1.5598642560876381, + "grad_norm": 0.7460570931434631, + "learning_rate": 2.400968484559816e-05, + "loss": 0.7829, + "step": 19650 + }, + { + "epoch": 1.5606580801365377, + "grad_norm": 0.737625241279602, + "learning_rate": 2.3996454182212695e-05, + "loss": 0.8223, + "step": 19660 + }, + { + "epoch": 1.5614519041854373, + "grad_norm": 0.9518927335739136, + "learning_rate": 2.3983223518827237e-05, + "loss": 0.7475, + "step": 19670 + }, + { + "epoch": 1.5622457282343367, + "grad_norm": 0.719597578048706, + "learning_rate": 2.3969992855441773e-05, + "loss": 0.8264, + "step": 19680 + }, + { + "epoch": 1.5630395522832363, + "grad_norm": 0.8887612223625183, + "learning_rate": 2.395676219205631e-05, + "loss": 0.7934, + "step": 19690 + }, + { + "epoch": 1.563833376332136, + "grad_norm": 0.801966667175293, + "learning_rate": 2.3943531528670847e-05, + "loss": 0.7757, + "step": 19700 + }, + { + "epoch": 1.5646272003810355, + "grad_norm": 0.9171623587608337, + "learning_rate": 2.3930300865285386e-05, + "loss": 0.7704, + "step": 19710 + }, + { + "epoch": 1.5654210244299351, + "grad_norm": 0.8287226557731628, + "learning_rate": 2.3917070201899925e-05, + "loss": 0.8069, + "step": 19720 + }, + { + "epoch": 1.5662148484788347, + "grad_norm": 0.8021081686019897, + "learning_rate": 2.390383953851446e-05, + "loss": 0.8093, + "step": 19730 + }, + { + "epoch": 1.5670086725277343, + "grad_norm": 0.8900328874588013, + "learning_rate": 2.3890608875129e-05, + "loss": 0.7636, + "step": 19740 + }, + { + "epoch": 1.567802496576634, + "grad_norm": 0.9409016966819763, + "learning_rate": 2.387737821174354e-05, + "loss": 0.7915, + "step": 19750 + }, + { + "epoch": 1.5685963206255333, + "grad_norm": 0.8233746290206909, + "learning_rate": 2.3864147548358077e-05, + "loss": 0.8201, + "step": 19760 + }, + { + "epoch": 1.569390144674433, + "grad_norm": 0.8792815804481506, + "learning_rate": 2.3850916884972613e-05, + "loss": 0.7767, + "step": 19770 + }, + { + "epoch": 1.5701839687233323, + "grad_norm": 0.8257175087928772, + "learning_rate": 2.3837686221587152e-05, + "loss": 0.7879, + "step": 19780 + }, + { + "epoch": 1.570977792772232, + "grad_norm": 0.9405742883682251, + "learning_rate": 2.3824455558201687e-05, + "loss": 0.7247, + "step": 19790 + }, + { + "epoch": 1.5717716168211315, + "grad_norm": 0.8703904747962952, + "learning_rate": 2.381122489481623e-05, + "loss": 0.8193, + "step": 19800 + }, + { + "epoch": 1.5725654408700311, + "grad_norm": 0.9303200244903564, + "learning_rate": 2.3797994231430765e-05, + "loss": 0.7814, + "step": 19810 + }, + { + "epoch": 1.5733592649189307, + "grad_norm": 0.8841179609298706, + "learning_rate": 2.3784763568045304e-05, + "loss": 0.8572, + "step": 19820 + }, + { + "epoch": 1.5741530889678303, + "grad_norm": 0.7013158798217773, + "learning_rate": 2.377153290465984e-05, + "loss": 0.8371, + "step": 19830 + }, + { + "epoch": 1.57494691301673, + "grad_norm": 0.8166126012802124, + "learning_rate": 2.375830224127438e-05, + "loss": 0.7561, + "step": 19840 + }, + { + "epoch": 1.5757407370656296, + "grad_norm": 0.8469328284263611, + "learning_rate": 2.3745071577888917e-05, + "loss": 0.8203, + "step": 19850 + }, + { + "epoch": 1.576534561114529, + "grad_norm": 0.90618497133255, + "learning_rate": 2.3731840914503453e-05, + "loss": 0.808, + "step": 19860 + }, + { + "epoch": 1.5773283851634285, + "grad_norm": 0.9630061984062195, + "learning_rate": 2.3718610251117992e-05, + "loss": 0.7956, + "step": 19870 + }, + { + "epoch": 1.578122209212328, + "grad_norm": 0.8445176482200623, + "learning_rate": 2.3705379587732527e-05, + "loss": 0.8172, + "step": 19880 + }, + { + "epoch": 1.5789160332612275, + "grad_norm": 0.834750235080719, + "learning_rate": 2.369214892434707e-05, + "loss": 0.9094, + "step": 19890 + }, + { + "epoch": 1.5797098573101271, + "grad_norm": 0.8799012303352356, + "learning_rate": 2.3678918260961605e-05, + "loss": 0.8587, + "step": 19900 + }, + { + "epoch": 1.5805036813590267, + "grad_norm": 0.9250380992889404, + "learning_rate": 2.3665687597576144e-05, + "loss": 0.8404, + "step": 19910 + }, + { + "epoch": 1.5812975054079264, + "grad_norm": 0.9735758304595947, + "learning_rate": 2.365245693419068e-05, + "loss": 0.8144, + "step": 19920 + }, + { + "epoch": 1.582091329456826, + "grad_norm": 0.7764338850975037, + "learning_rate": 2.363922627080522e-05, + "loss": 0.7696, + "step": 19930 + }, + { + "epoch": 1.5828851535057256, + "grad_norm": 0.7918556332588196, + "learning_rate": 2.3625995607419758e-05, + "loss": 0.7751, + "step": 19940 + }, + { + "epoch": 1.5836789775546252, + "grad_norm": 0.980975866317749, + "learning_rate": 2.3612764944034297e-05, + "loss": 0.8112, + "step": 19950 + }, + { + "epoch": 1.5844728016035245, + "grad_norm": 0.6612667441368103, + "learning_rate": 2.3599534280648832e-05, + "loss": 0.7655, + "step": 19960 + }, + { + "epoch": 1.5852666256524242, + "grad_norm": 0.7851887941360474, + "learning_rate": 2.358630361726337e-05, + "loss": 0.7865, + "step": 19970 + }, + { + "epoch": 1.5860604497013235, + "grad_norm": 0.8260042071342468, + "learning_rate": 2.357307295387791e-05, + "loss": 0.7995, + "step": 19980 + }, + { + "epoch": 1.5868542737502231, + "grad_norm": 0.918483555316925, + "learning_rate": 2.3559842290492446e-05, + "loss": 0.7792, + "step": 19990 + }, + { + "epoch": 1.5876480977991227, + "grad_norm": 0.9035216569900513, + "learning_rate": 2.3546611627106984e-05, + "loss": 0.8465, + "step": 20000 + }, + { + "epoch": 1.5884419218480224, + "grad_norm": 0.7393106818199158, + "learning_rate": 2.353338096372152e-05, + "loss": 0.7915, + "step": 20010 + }, + { + "epoch": 1.589235745896922, + "grad_norm": 0.9715613722801208, + "learning_rate": 2.3520150300336062e-05, + "loss": 0.8255, + "step": 20020 + }, + { + "epoch": 1.5900295699458216, + "grad_norm": 0.79570472240448, + "learning_rate": 2.3506919636950598e-05, + "loss": 0.8283, + "step": 20030 + }, + { + "epoch": 1.5908233939947212, + "grad_norm": 0.738107442855835, + "learning_rate": 2.3493688973565137e-05, + "loss": 0.8292, + "step": 20040 + }, + { + "epoch": 1.5916172180436208, + "grad_norm": 1.0753180980682373, + "learning_rate": 2.3480458310179672e-05, + "loss": 0.8189, + "step": 20050 + }, + { + "epoch": 1.5924110420925202, + "grad_norm": 0.9600769877433777, + "learning_rate": 2.346722764679421e-05, + "loss": 0.855, + "step": 20060 + }, + { + "epoch": 1.5932048661414198, + "grad_norm": 0.7264878749847412, + "learning_rate": 2.345399698340875e-05, + "loss": 0.8333, + "step": 20070 + }, + { + "epoch": 1.5939986901903194, + "grad_norm": 1.0567606687545776, + "learning_rate": 2.3440766320023286e-05, + "loss": 0.842, + "step": 20080 + }, + { + "epoch": 1.5947925142392188, + "grad_norm": 0.8307549953460693, + "learning_rate": 2.3427535656637825e-05, + "loss": 0.8132, + "step": 20090 + }, + { + "epoch": 1.5955863382881184, + "grad_norm": 0.7764892578125, + "learning_rate": 2.341430499325236e-05, + "loss": 0.783, + "step": 20100 + }, + { + "epoch": 1.596380162337018, + "grad_norm": 0.8395814895629883, + "learning_rate": 2.3401074329866902e-05, + "loss": 0.8022, + "step": 20110 + }, + { + "epoch": 1.5971739863859176, + "grad_norm": 0.7872236371040344, + "learning_rate": 2.3387843666481438e-05, + "loss": 0.8471, + "step": 20120 + }, + { + "epoch": 1.5979678104348172, + "grad_norm": 0.9380735158920288, + "learning_rate": 2.3374613003095977e-05, + "loss": 0.7703, + "step": 20130 + }, + { + "epoch": 1.5987616344837168, + "grad_norm": 1.0980794429779053, + "learning_rate": 2.3361382339710512e-05, + "loss": 0.7735, + "step": 20140 + }, + { + "epoch": 1.5995554585326164, + "grad_norm": 0.8633868098258972, + "learning_rate": 2.334815167632505e-05, + "loss": 0.7795, + "step": 20150 + }, + { + "epoch": 1.6003492825815158, + "grad_norm": 0.8296395540237427, + "learning_rate": 2.333492101293959e-05, + "loss": 0.783, + "step": 20160 + }, + { + "epoch": 1.6011431066304154, + "grad_norm": 0.9974943995475769, + "learning_rate": 2.332169034955413e-05, + "loss": 0.8203, + "step": 20170 + }, + { + "epoch": 1.601936930679315, + "grad_norm": 0.932245671749115, + "learning_rate": 2.3308459686168665e-05, + "loss": 0.8744, + "step": 20180 + }, + { + "epoch": 1.6027307547282144, + "grad_norm": 0.8958556652069092, + "learning_rate": 2.3295229022783204e-05, + "loss": 0.7615, + "step": 20190 + }, + { + "epoch": 1.603524578777114, + "grad_norm": 0.8927448391914368, + "learning_rate": 2.3281998359397743e-05, + "loss": 0.8409, + "step": 20200 + }, + { + "epoch": 1.6043184028260136, + "grad_norm": 0.8587812185287476, + "learning_rate": 2.3268767696012278e-05, + "loss": 0.7895, + "step": 20210 + }, + { + "epoch": 1.6051122268749132, + "grad_norm": 0.9127293825149536, + "learning_rate": 2.3255537032626817e-05, + "loss": 0.7561, + "step": 20220 + }, + { + "epoch": 1.6059060509238128, + "grad_norm": 0.8766940832138062, + "learning_rate": 2.3242306369241353e-05, + "loss": 0.8178, + "step": 20230 + }, + { + "epoch": 1.6066998749727124, + "grad_norm": 0.9021517634391785, + "learning_rate": 2.3229075705855895e-05, + "loss": 0.7978, + "step": 20240 + }, + { + "epoch": 1.607493699021612, + "grad_norm": 0.7711321711540222, + "learning_rate": 2.321584504247043e-05, + "loss": 0.8566, + "step": 20250 + }, + { + "epoch": 1.6082875230705114, + "grad_norm": 0.7530736327171326, + "learning_rate": 2.320261437908497e-05, + "loss": 0.772, + "step": 20260 + }, + { + "epoch": 1.609081347119411, + "grad_norm": 0.7428927421569824, + "learning_rate": 2.3189383715699505e-05, + "loss": 0.8944, + "step": 20270 + }, + { + "epoch": 1.6098751711683106, + "grad_norm": 0.7444825768470764, + "learning_rate": 2.3176153052314044e-05, + "loss": 0.8346, + "step": 20280 + }, + { + "epoch": 1.61066899521721, + "grad_norm": 1.0045958757400513, + "learning_rate": 2.3162922388928583e-05, + "loss": 0.8444, + "step": 20290 + }, + { + "epoch": 1.6114628192661096, + "grad_norm": 1.0150105953216553, + "learning_rate": 2.3149691725543122e-05, + "loss": 0.7973, + "step": 20300 + }, + { + "epoch": 1.6122566433150092, + "grad_norm": 0.9908817410469055, + "learning_rate": 2.3136461062157657e-05, + "loss": 0.7855, + "step": 20310 + }, + { + "epoch": 1.6130504673639088, + "grad_norm": 0.8784881830215454, + "learning_rate": 2.3123230398772193e-05, + "loss": 0.7707, + "step": 20320 + }, + { + "epoch": 1.6138442914128084, + "grad_norm": 0.7329348921775818, + "learning_rate": 2.3109999735386735e-05, + "loss": 0.8465, + "step": 20330 + }, + { + "epoch": 1.614638115461708, + "grad_norm": 0.9049598574638367, + "learning_rate": 2.309676907200127e-05, + "loss": 0.8031, + "step": 20340 + }, + { + "epoch": 1.6154319395106076, + "grad_norm": 0.8993352651596069, + "learning_rate": 2.308353840861581e-05, + "loss": 0.8029, + "step": 20350 + }, + { + "epoch": 1.6162257635595072, + "grad_norm": 0.7339730858802795, + "learning_rate": 2.3070307745230345e-05, + "loss": 0.8168, + "step": 20360 + }, + { + "epoch": 1.6170195876084066, + "grad_norm": 0.8246588706970215, + "learning_rate": 2.3057077081844884e-05, + "loss": 0.8413, + "step": 20370 + }, + { + "epoch": 1.6178134116573062, + "grad_norm": 0.8873500227928162, + "learning_rate": 2.3043846418459423e-05, + "loss": 0.822, + "step": 20380 + }, + { + "epoch": 1.6186072357062056, + "grad_norm": 0.7149674892425537, + "learning_rate": 2.3030615755073962e-05, + "loss": 0.8343, + "step": 20390 + }, + { + "epoch": 1.6194010597551052, + "grad_norm": 0.8390538096427917, + "learning_rate": 2.3017385091688497e-05, + "loss": 0.7945, + "step": 20400 + }, + { + "epoch": 1.6201948838040048, + "grad_norm": 0.7026415467262268, + "learning_rate": 2.3004154428303036e-05, + "loss": 0.7255, + "step": 20410 + }, + { + "epoch": 1.6209887078529044, + "grad_norm": 0.8849766254425049, + "learning_rate": 2.2990923764917575e-05, + "loss": 0.7589, + "step": 20420 + }, + { + "epoch": 1.621782531901804, + "grad_norm": 0.8478531241416931, + "learning_rate": 2.297769310153211e-05, + "loss": 0.7977, + "step": 20430 + }, + { + "epoch": 1.6225763559507036, + "grad_norm": 0.8617717027664185, + "learning_rate": 2.296446243814665e-05, + "loss": 0.8657, + "step": 20440 + }, + { + "epoch": 1.6233701799996032, + "grad_norm": 0.913943350315094, + "learning_rate": 2.2951231774761185e-05, + "loss": 0.8567, + "step": 20450 + }, + { + "epoch": 1.6241640040485028, + "grad_norm": 0.6891298294067383, + "learning_rate": 2.2938001111375724e-05, + "loss": 0.806, + "step": 20460 + }, + { + "epoch": 1.6249578280974022, + "grad_norm": 0.914467990398407, + "learning_rate": 2.2924770447990263e-05, + "loss": 0.7876, + "step": 20470 + }, + { + "epoch": 1.6257516521463018, + "grad_norm": 0.8670864701271057, + "learning_rate": 2.2911539784604802e-05, + "loss": 0.7589, + "step": 20480 + }, + { + "epoch": 1.6265454761952012, + "grad_norm": 0.8266043663024902, + "learning_rate": 2.2898309121219338e-05, + "loss": 0.8038, + "step": 20490 + }, + { + "epoch": 1.6273393002441008, + "grad_norm": 0.7241661548614502, + "learning_rate": 2.2885078457833877e-05, + "loss": 0.8191, + "step": 20500 + }, + { + "epoch": 1.6281331242930004, + "grad_norm": 0.7914606332778931, + "learning_rate": 2.2871847794448416e-05, + "loss": 0.759, + "step": 20510 + }, + { + "epoch": 1.6289269483419, + "grad_norm": 0.8614340424537659, + "learning_rate": 2.2858617131062954e-05, + "loss": 0.7668, + "step": 20520 + }, + { + "epoch": 1.6297207723907996, + "grad_norm": 0.9252728223800659, + "learning_rate": 2.284538646767749e-05, + "loss": 0.7982, + "step": 20530 + }, + { + "epoch": 1.6305145964396992, + "grad_norm": 0.808665931224823, + "learning_rate": 2.2832155804292026e-05, + "loss": 0.8616, + "step": 20540 + }, + { + "epoch": 1.6313084204885988, + "grad_norm": 0.6369495987892151, + "learning_rate": 2.2818925140906568e-05, + "loss": 0.8265, + "step": 20550 + }, + { + "epoch": 1.6321022445374984, + "grad_norm": 0.8415317535400391, + "learning_rate": 2.2805694477521103e-05, + "loss": 0.8126, + "step": 20560 + }, + { + "epoch": 1.6328960685863978, + "grad_norm": 0.9225695133209229, + "learning_rate": 2.2792463814135642e-05, + "loss": 0.8399, + "step": 20570 + }, + { + "epoch": 1.6336898926352974, + "grad_norm": 0.6714004278182983, + "learning_rate": 2.2779233150750178e-05, + "loss": 0.8543, + "step": 20580 + }, + { + "epoch": 1.6344837166841968, + "grad_norm": 0.8270373940467834, + "learning_rate": 2.2766002487364717e-05, + "loss": 0.8255, + "step": 20590 + }, + { + "epoch": 1.6352775407330964, + "grad_norm": 0.7550914883613586, + "learning_rate": 2.2752771823979256e-05, + "loss": 0.7879, + "step": 20600 + }, + { + "epoch": 1.636071364781996, + "grad_norm": 0.865746796131134, + "learning_rate": 2.2739541160593795e-05, + "loss": 0.857, + "step": 20610 + }, + { + "epoch": 1.6368651888308956, + "grad_norm": 0.8743048906326294, + "learning_rate": 2.272631049720833e-05, + "loss": 0.8459, + "step": 20620 + }, + { + "epoch": 1.6376590128797952, + "grad_norm": 0.7982882261276245, + "learning_rate": 2.271307983382287e-05, + "loss": 0.8263, + "step": 20630 + }, + { + "epoch": 1.6384528369286948, + "grad_norm": 0.7997922897338867, + "learning_rate": 2.2699849170437408e-05, + "loss": 0.7892, + "step": 20640 + }, + { + "epoch": 1.6392466609775944, + "grad_norm": 1.0065598487854004, + "learning_rate": 2.2686618507051947e-05, + "loss": 0.7943, + "step": 20650 + }, + { + "epoch": 1.640040485026494, + "grad_norm": 0.828996479511261, + "learning_rate": 2.2673387843666482e-05, + "loss": 0.7368, + "step": 20660 + }, + { + "epoch": 1.6408343090753934, + "grad_norm": 0.8274086117744446, + "learning_rate": 2.2660157180281018e-05, + "loss": 0.8166, + "step": 20670 + }, + { + "epoch": 1.641628133124293, + "grad_norm": 0.8134365081787109, + "learning_rate": 2.2646926516895557e-05, + "loss": 0.8618, + "step": 20680 + }, + { + "epoch": 1.6424219571731926, + "grad_norm": 0.8807488679885864, + "learning_rate": 2.2633695853510096e-05, + "loss": 0.8119, + "step": 20690 + }, + { + "epoch": 1.643215781222092, + "grad_norm": 0.93861985206604, + "learning_rate": 2.2620465190124635e-05, + "loss": 0.8055, + "step": 20700 + }, + { + "epoch": 1.6440096052709916, + "grad_norm": 0.8421952724456787, + "learning_rate": 2.260723452673917e-05, + "loss": 0.8429, + "step": 20710 + }, + { + "epoch": 1.6448034293198912, + "grad_norm": 0.9165804982185364, + "learning_rate": 2.259400386335371e-05, + "loss": 0.7859, + "step": 20720 + }, + { + "epoch": 1.6455972533687908, + "grad_norm": 0.7905272841453552, + "learning_rate": 2.2580773199968248e-05, + "loss": 0.8527, + "step": 20730 + }, + { + "epoch": 1.6463910774176904, + "grad_norm": 0.9320306777954102, + "learning_rate": 2.2567542536582787e-05, + "loss": 0.7826, + "step": 20740 + }, + { + "epoch": 1.64718490146659, + "grad_norm": 0.8871251940727234, + "learning_rate": 2.2554311873197323e-05, + "loss": 0.7819, + "step": 20750 + }, + { + "epoch": 1.6479787255154896, + "grad_norm": 1.1218063831329346, + "learning_rate": 2.254108120981186e-05, + "loss": 0.8509, + "step": 20760 + }, + { + "epoch": 1.648772549564389, + "grad_norm": 0.890609622001648, + "learning_rate": 2.25278505464264e-05, + "loss": 0.8236, + "step": 20770 + }, + { + "epoch": 1.6495663736132886, + "grad_norm": 0.8825326561927795, + "learning_rate": 2.2514619883040936e-05, + "loss": 0.7444, + "step": 20780 + }, + { + "epoch": 1.6503601976621882, + "grad_norm": 0.8295640349388123, + "learning_rate": 2.2501389219655475e-05, + "loss": 0.7725, + "step": 20790 + }, + { + "epoch": 1.6511540217110876, + "grad_norm": 0.9995619654655457, + "learning_rate": 2.248815855627001e-05, + "loss": 0.7952, + "step": 20800 + }, + { + "epoch": 1.6519478457599872, + "grad_norm": 1.0375027656555176, + "learning_rate": 2.247492789288455e-05, + "loss": 0.7575, + "step": 20810 + }, + { + "epoch": 1.6527416698088868, + "grad_norm": 0.7434347867965698, + "learning_rate": 2.246169722949909e-05, + "loss": 0.7775, + "step": 20820 + }, + { + "epoch": 1.6535354938577864, + "grad_norm": 0.7898569107055664, + "learning_rate": 2.2448466566113627e-05, + "loss": 0.8458, + "step": 20830 + }, + { + "epoch": 1.654329317906686, + "grad_norm": 0.8854445815086365, + "learning_rate": 2.2435235902728163e-05, + "loss": 0.8005, + "step": 20840 + }, + { + "epoch": 1.6551231419555856, + "grad_norm": 0.8585209846496582, + "learning_rate": 2.2422005239342702e-05, + "loss": 0.7762, + "step": 20850 + }, + { + "epoch": 1.6559169660044852, + "grad_norm": 0.7355554103851318, + "learning_rate": 2.240877457595724e-05, + "loss": 0.7967, + "step": 20860 + }, + { + "epoch": 1.6567107900533846, + "grad_norm": 0.8480395078659058, + "learning_rate": 2.239554391257178e-05, + "loss": 0.8113, + "step": 20870 + }, + { + "epoch": 1.6575046141022842, + "grad_norm": 0.7819201946258545, + "learning_rate": 2.2382313249186315e-05, + "loss": 0.7618, + "step": 20880 + }, + { + "epoch": 1.6582984381511838, + "grad_norm": 0.7378450036048889, + "learning_rate": 2.236908258580085e-05, + "loss": 0.8375, + "step": 20890 + }, + { + "epoch": 1.6590922622000832, + "grad_norm": 0.8621236085891724, + "learning_rate": 2.235585192241539e-05, + "loss": 0.7767, + "step": 20900 + }, + { + "epoch": 1.6598860862489828, + "grad_norm": 0.720856785774231, + "learning_rate": 2.234262125902993e-05, + "loss": 0.7847, + "step": 20910 + }, + { + "epoch": 1.6606799102978824, + "grad_norm": 0.9445878863334656, + "learning_rate": 2.2329390595644467e-05, + "loss": 0.832, + "step": 20920 + }, + { + "epoch": 1.661473734346782, + "grad_norm": 0.7476319670677185, + "learning_rate": 2.2316159932259003e-05, + "loss": 0.8122, + "step": 20930 + }, + { + "epoch": 1.6622675583956816, + "grad_norm": 0.7706652283668518, + "learning_rate": 2.2302929268873542e-05, + "loss": 0.7505, + "step": 20940 + }, + { + "epoch": 1.6630613824445812, + "grad_norm": 1.0088225603103638, + "learning_rate": 2.228969860548808e-05, + "loss": 0.8042, + "step": 20950 + }, + { + "epoch": 1.6638552064934808, + "grad_norm": 0.906871497631073, + "learning_rate": 2.227646794210262e-05, + "loss": 0.8492, + "step": 20960 + }, + { + "epoch": 1.6646490305423804, + "grad_norm": 0.8510369062423706, + "learning_rate": 2.2263237278717155e-05, + "loss": 0.789, + "step": 20970 + }, + { + "epoch": 1.6654428545912798, + "grad_norm": 0.842056930065155, + "learning_rate": 2.2250006615331694e-05, + "loss": 0.7597, + "step": 20980 + }, + { + "epoch": 1.6662366786401794, + "grad_norm": 0.8941336274147034, + "learning_rate": 2.2236775951946233e-05, + "loss": 0.7614, + "step": 20990 + }, + { + "epoch": 1.6670305026890788, + "grad_norm": 0.6981929540634155, + "learning_rate": 2.222354528856077e-05, + "loss": 0.8553, + "step": 21000 + }, + { + "epoch": 1.6678243267379784, + "grad_norm": 0.7219326496124268, + "learning_rate": 2.2210314625175308e-05, + "loss": 0.8378, + "step": 21010 + }, + { + "epoch": 1.668618150786878, + "grad_norm": 0.6784833073616028, + "learning_rate": 2.2197083961789843e-05, + "loss": 0.87, + "step": 21020 + }, + { + "epoch": 1.6694119748357776, + "grad_norm": 0.7664601802825928, + "learning_rate": 2.2183853298404382e-05, + "loss": 0.8162, + "step": 21030 + }, + { + "epoch": 1.6702057988846772, + "grad_norm": 0.7743760347366333, + "learning_rate": 2.217062263501892e-05, + "loss": 0.7797, + "step": 21040 + }, + { + "epoch": 1.6709996229335768, + "grad_norm": 0.9794851541519165, + "learning_rate": 2.215739197163346e-05, + "loss": 0.8093, + "step": 21050 + }, + { + "epoch": 1.6717934469824764, + "grad_norm": 0.8463415503501892, + "learning_rate": 2.2144161308247996e-05, + "loss": 0.7824, + "step": 21060 + }, + { + "epoch": 1.672587271031376, + "grad_norm": 0.8329752087593079, + "learning_rate": 2.2130930644862534e-05, + "loss": 0.824, + "step": 21070 + }, + { + "epoch": 1.6733810950802754, + "grad_norm": 0.7661581039428711, + "learning_rate": 2.2117699981477073e-05, + "loss": 0.8068, + "step": 21080 + }, + { + "epoch": 1.674174919129175, + "grad_norm": 0.8593927621841431, + "learning_rate": 2.2104469318091612e-05, + "loss": 0.8332, + "step": 21090 + }, + { + "epoch": 1.6749687431780744, + "grad_norm": 0.7758104801177979, + "learning_rate": 2.2091238654706148e-05, + "loss": 0.8511, + "step": 21100 + }, + { + "epoch": 1.675762567226974, + "grad_norm": 0.8966826796531677, + "learning_rate": 2.2078007991320687e-05, + "loss": 0.8041, + "step": 21110 + }, + { + "epoch": 1.6765563912758736, + "grad_norm": 0.8776934146881104, + "learning_rate": 2.2064777327935222e-05, + "loss": 0.8093, + "step": 21120 + }, + { + "epoch": 1.6773502153247732, + "grad_norm": 0.6183434724807739, + "learning_rate": 2.205154666454976e-05, + "loss": 0.8896, + "step": 21130 + }, + { + "epoch": 1.6781440393736728, + "grad_norm": 0.7311633825302124, + "learning_rate": 2.20383160011643e-05, + "loss": 0.7782, + "step": 21140 + }, + { + "epoch": 1.6789378634225725, + "grad_norm": 0.8047720789909363, + "learning_rate": 2.2025085337778836e-05, + "loss": 0.8534, + "step": 21150 + }, + { + "epoch": 1.679731687471472, + "grad_norm": 0.7538871169090271, + "learning_rate": 2.2011854674393375e-05, + "loss": 0.791, + "step": 21160 + }, + { + "epoch": 1.6805255115203717, + "grad_norm": 0.7678347826004028, + "learning_rate": 2.1998624011007914e-05, + "loss": 0.8283, + "step": 21170 + }, + { + "epoch": 1.681319335569271, + "grad_norm": 0.7614843249320984, + "learning_rate": 2.1985393347622452e-05, + "loss": 0.7892, + "step": 21180 + }, + { + "epoch": 1.6821131596181707, + "grad_norm": 0.7208516597747803, + "learning_rate": 2.1972162684236988e-05, + "loss": 0.7965, + "step": 21190 + }, + { + "epoch": 1.68290698366707, + "grad_norm": 0.7032837867736816, + "learning_rate": 2.1958932020851527e-05, + "loss": 0.7784, + "step": 21200 + }, + { + "epoch": 1.6837008077159696, + "grad_norm": 0.705690860748291, + "learning_rate": 2.1945701357466062e-05, + "loss": 0.829, + "step": 21210 + }, + { + "epoch": 1.6844946317648692, + "grad_norm": 0.8606507778167725, + "learning_rate": 2.1932470694080605e-05, + "loss": 0.8002, + "step": 21220 + }, + { + "epoch": 1.6852884558137688, + "grad_norm": 0.7552010416984558, + "learning_rate": 2.191924003069514e-05, + "loss": 0.7753, + "step": 21230 + }, + { + "epoch": 1.6860822798626685, + "grad_norm": 0.9295687079429626, + "learning_rate": 2.1906009367309676e-05, + "loss": 0.8016, + "step": 21240 + }, + { + "epoch": 1.686876103911568, + "grad_norm": 0.8093982934951782, + "learning_rate": 2.1892778703924215e-05, + "loss": 0.8157, + "step": 21250 + }, + { + "epoch": 1.6876699279604677, + "grad_norm": 0.8069568872451782, + "learning_rate": 2.1879548040538754e-05, + "loss": 0.836, + "step": 21260 + }, + { + "epoch": 1.6884637520093673, + "grad_norm": 0.799312174320221, + "learning_rate": 2.1866317377153293e-05, + "loss": 0.8299, + "step": 21270 + }, + { + "epoch": 1.6892575760582667, + "grad_norm": 0.8665412068367004, + "learning_rate": 2.1853086713767828e-05, + "loss": 0.795, + "step": 21280 + }, + { + "epoch": 1.6900514001071663, + "grad_norm": 0.8677714467048645, + "learning_rate": 2.1839856050382367e-05, + "loss": 0.8833, + "step": 21290 + }, + { + "epoch": 1.6908452241560659, + "grad_norm": 0.7136849164962769, + "learning_rate": 2.1826625386996906e-05, + "loss": 0.83, + "step": 21300 + }, + { + "epoch": 1.6916390482049652, + "grad_norm": 0.8250473737716675, + "learning_rate": 2.1813394723611445e-05, + "loss": 0.8308, + "step": 21310 + }, + { + "epoch": 1.6924328722538649, + "grad_norm": 0.7840633988380432, + "learning_rate": 2.180016406022598e-05, + "loss": 0.8283, + "step": 21320 + }, + { + "epoch": 1.6932266963027645, + "grad_norm": 0.8209207653999329, + "learning_rate": 2.178693339684052e-05, + "loss": 0.7752, + "step": 21330 + }, + { + "epoch": 1.694020520351664, + "grad_norm": 0.7074013948440552, + "learning_rate": 2.1773702733455055e-05, + "loss": 0.7849, + "step": 21340 + }, + { + "epoch": 1.6948143444005637, + "grad_norm": 0.8547490835189819, + "learning_rate": 2.1760472070069594e-05, + "loss": 0.8125, + "step": 21350 + }, + { + "epoch": 1.6956081684494633, + "grad_norm": 0.7698444724082947, + "learning_rate": 2.1747241406684133e-05, + "loss": 0.766, + "step": 21360 + }, + { + "epoch": 1.6964019924983629, + "grad_norm": 0.9371459484100342, + "learning_rate": 2.173401074329867e-05, + "loss": 0.7663, + "step": 21370 + }, + { + "epoch": 1.6971958165472623, + "grad_norm": 0.8051425218582153, + "learning_rate": 2.1720780079913207e-05, + "loss": 0.8333, + "step": 21380 + }, + { + "epoch": 1.6979896405961619, + "grad_norm": 0.7680243253707886, + "learning_rate": 2.1707549416527746e-05, + "loss": 0.8129, + "step": 21390 + }, + { + "epoch": 1.6987834646450615, + "grad_norm": 0.6620742082595825, + "learning_rate": 2.1694318753142285e-05, + "loss": 0.8581, + "step": 21400 + }, + { + "epoch": 1.6995772886939609, + "grad_norm": 0.8950489163398743, + "learning_rate": 2.168108808975682e-05, + "loss": 0.8185, + "step": 21410 + }, + { + "epoch": 1.7003711127428605, + "grad_norm": 0.7099031805992126, + "learning_rate": 2.166785742637136e-05, + "loss": 0.8097, + "step": 21420 + }, + { + "epoch": 1.70116493679176, + "grad_norm": 0.9240164756774902, + "learning_rate": 2.1654626762985895e-05, + "loss": 0.8416, + "step": 21430 + }, + { + "epoch": 1.7019587608406597, + "grad_norm": 0.7131415009498596, + "learning_rate": 2.1641396099600437e-05, + "loss": 0.7934, + "step": 21440 + }, + { + "epoch": 1.7027525848895593, + "grad_norm": 0.8093000650405884, + "learning_rate": 2.1628165436214973e-05, + "loss": 0.8598, + "step": 21450 + }, + { + "epoch": 1.7035464089384589, + "grad_norm": 0.8821578025817871, + "learning_rate": 2.1614934772829512e-05, + "loss": 0.8821, + "step": 21460 + }, + { + "epoch": 1.7043402329873585, + "grad_norm": 0.835568904876709, + "learning_rate": 2.1601704109444047e-05, + "loss": 0.8338, + "step": 21470 + }, + { + "epoch": 1.7051340570362579, + "grad_norm": 0.9917017817497253, + "learning_rate": 2.1588473446058586e-05, + "loss": 0.8007, + "step": 21480 + }, + { + "epoch": 1.7059278810851575, + "grad_norm": 0.8382050395011902, + "learning_rate": 2.1575242782673125e-05, + "loss": 0.733, + "step": 21490 + }, + { + "epoch": 1.706721705134057, + "grad_norm": 0.9868109226226807, + "learning_rate": 2.156201211928766e-05, + "loss": 0.8729, + "step": 21500 + }, + { + "epoch": 1.7075155291829565, + "grad_norm": 0.8536074161529541, + "learning_rate": 2.15487814559022e-05, + "loss": 0.8462, + "step": 21510 + }, + { + "epoch": 1.708309353231856, + "grad_norm": 0.7851694822311401, + "learning_rate": 2.153555079251674e-05, + "loss": 0.8142, + "step": 21520 + }, + { + "epoch": 1.7091031772807557, + "grad_norm": 0.9547113180160522, + "learning_rate": 2.1522320129131278e-05, + "loss": 0.8597, + "step": 21530 + }, + { + "epoch": 1.7098970013296553, + "grad_norm": 0.9159030914306641, + "learning_rate": 2.1509089465745813e-05, + "loss": 0.8335, + "step": 21540 + }, + { + "epoch": 1.7106908253785549, + "grad_norm": 0.6976479887962341, + "learning_rate": 2.1495858802360352e-05, + "loss": 0.8117, + "step": 21550 + }, + { + "epoch": 1.7114846494274545, + "grad_norm": 0.7213908433914185, + "learning_rate": 2.1482628138974888e-05, + "loss": 0.8176, + "step": 21560 + }, + { + "epoch": 1.712278473476354, + "grad_norm": 0.8861874341964722, + "learning_rate": 2.146939747558943e-05, + "loss": 0.7886, + "step": 21570 + }, + { + "epoch": 1.7130722975252535, + "grad_norm": 0.87417072057724, + "learning_rate": 2.1456166812203966e-05, + "loss": 0.7848, + "step": 21580 + }, + { + "epoch": 1.713866121574153, + "grad_norm": 0.9183176755905151, + "learning_rate": 2.14429361488185e-05, + "loss": 0.8826, + "step": 21590 + }, + { + "epoch": 1.7146599456230527, + "grad_norm": 0.7000020146369934, + "learning_rate": 2.142970548543304e-05, + "loss": 0.8566, + "step": 21600 + }, + { + "epoch": 1.715453769671952, + "grad_norm": 0.7864513993263245, + "learning_rate": 2.141647482204758e-05, + "loss": 0.7844, + "step": 21610 + }, + { + "epoch": 1.7162475937208517, + "grad_norm": 0.8153032660484314, + "learning_rate": 2.1403244158662118e-05, + "loss": 0.8072, + "step": 21620 + }, + { + "epoch": 1.7170414177697513, + "grad_norm": 0.8709026575088501, + "learning_rate": 2.1390013495276653e-05, + "loss": 0.8276, + "step": 21630 + }, + { + "epoch": 1.7178352418186509, + "grad_norm": 0.8884004950523376, + "learning_rate": 2.1376782831891192e-05, + "loss": 0.8013, + "step": 21640 + }, + { + "epoch": 1.7186290658675505, + "grad_norm": 0.7052041292190552, + "learning_rate": 2.1363552168505728e-05, + "loss": 0.8273, + "step": 21650 + }, + { + "epoch": 1.71942288991645, + "grad_norm": 0.8561339378356934, + "learning_rate": 2.135032150512027e-05, + "loss": 0.8321, + "step": 21660 + }, + { + "epoch": 1.7202167139653497, + "grad_norm": 0.8964945077896118, + "learning_rate": 2.1337090841734806e-05, + "loss": 0.7822, + "step": 21670 + }, + { + "epoch": 1.7210105380142493, + "grad_norm": 0.702278733253479, + "learning_rate": 2.1323860178349345e-05, + "loss": 0.8043, + "step": 21680 + }, + { + "epoch": 1.7218043620631487, + "grad_norm": 0.8085154891014099, + "learning_rate": 2.131062951496388e-05, + "loss": 0.8019, + "step": 21690 + }, + { + "epoch": 1.7225981861120483, + "grad_norm": 0.8028178811073303, + "learning_rate": 2.129739885157842e-05, + "loss": 0.8023, + "step": 21700 + }, + { + "epoch": 1.7233920101609477, + "grad_norm": 0.8829013705253601, + "learning_rate": 2.1284168188192958e-05, + "loss": 0.7858, + "step": 21710 + }, + { + "epoch": 1.7241858342098473, + "grad_norm": 0.8301076889038086, + "learning_rate": 2.1270937524807494e-05, + "loss": 0.8117, + "step": 21720 + }, + { + "epoch": 1.7249796582587469, + "grad_norm": 0.8090790510177612, + "learning_rate": 2.1257706861422032e-05, + "loss": 0.7803, + "step": 21730 + }, + { + "epoch": 1.7257734823076465, + "grad_norm": 0.8940252065658569, + "learning_rate": 2.124447619803657e-05, + "loss": 0.7733, + "step": 21740 + }, + { + "epoch": 1.726567306356546, + "grad_norm": 0.8481318950653076, + "learning_rate": 2.123124553465111e-05, + "loss": 0.826, + "step": 21750 + }, + { + "epoch": 1.7273611304054457, + "grad_norm": 0.7668688297271729, + "learning_rate": 2.1218014871265646e-05, + "loss": 0.8235, + "step": 21760 + }, + { + "epoch": 1.7281549544543453, + "grad_norm": 0.8397562503814697, + "learning_rate": 2.1204784207880185e-05, + "loss": 0.8275, + "step": 21770 + }, + { + "epoch": 1.728948778503245, + "grad_norm": 0.786025881767273, + "learning_rate": 2.119155354449472e-05, + "loss": 0.862, + "step": 21780 + }, + { + "epoch": 1.7297426025521443, + "grad_norm": 0.7707594633102417, + "learning_rate": 2.1178322881109263e-05, + "loss": 0.8089, + "step": 21790 + }, + { + "epoch": 1.730536426601044, + "grad_norm": 0.7764220237731934, + "learning_rate": 2.1165092217723798e-05, + "loss": 0.7513, + "step": 21800 + }, + { + "epoch": 1.7313302506499433, + "grad_norm": 0.9090325832366943, + "learning_rate": 2.1151861554338334e-05, + "loss": 0.7636, + "step": 21810 + }, + { + "epoch": 1.7321240746988429, + "grad_norm": 0.7767049074172974, + "learning_rate": 2.1138630890952873e-05, + "loss": 0.8391, + "step": 21820 + }, + { + "epoch": 1.7329178987477425, + "grad_norm": 0.7917195558547974, + "learning_rate": 2.112540022756741e-05, + "loss": 0.8291, + "step": 21830 + }, + { + "epoch": 1.733711722796642, + "grad_norm": 0.9477527141571045, + "learning_rate": 2.111216956418195e-05, + "loss": 0.7697, + "step": 21840 + }, + { + "epoch": 1.7345055468455417, + "grad_norm": 0.8348498940467834, + "learning_rate": 2.1098938900796486e-05, + "loss": 0.7575, + "step": 21850 + }, + { + "epoch": 1.7352993708944413, + "grad_norm": 0.7253052592277527, + "learning_rate": 2.1085708237411025e-05, + "loss": 0.7476, + "step": 21860 + }, + { + "epoch": 1.736093194943341, + "grad_norm": 0.8680984973907471, + "learning_rate": 2.107247757402556e-05, + "loss": 0.7865, + "step": 21870 + }, + { + "epoch": 1.7368870189922405, + "grad_norm": 0.7522729635238647, + "learning_rate": 2.1059246910640103e-05, + "loss": 0.8271, + "step": 21880 + }, + { + "epoch": 1.73768084304114, + "grad_norm": 0.6878653764724731, + "learning_rate": 2.104601624725464e-05, + "loss": 0.8229, + "step": 21890 + }, + { + "epoch": 1.7384746670900395, + "grad_norm": 0.9120025038719177, + "learning_rate": 2.1032785583869177e-05, + "loss": 0.8646, + "step": 21900 + }, + { + "epoch": 1.7392684911389389, + "grad_norm": 0.7929124236106873, + "learning_rate": 2.1019554920483713e-05, + "loss": 0.8654, + "step": 21910 + }, + { + "epoch": 1.7400623151878385, + "grad_norm": 0.8900664448738098, + "learning_rate": 2.1006324257098252e-05, + "loss": 0.8068, + "step": 21920 + }, + { + "epoch": 1.740856139236738, + "grad_norm": 0.8378934860229492, + "learning_rate": 2.099309359371279e-05, + "loss": 0.7829, + "step": 21930 + }, + { + "epoch": 1.7416499632856377, + "grad_norm": 0.8364038467407227, + "learning_rate": 2.0979862930327326e-05, + "loss": 0.8372, + "step": 21940 + }, + { + "epoch": 1.7424437873345373, + "grad_norm": 0.7890764474868774, + "learning_rate": 2.0966632266941865e-05, + "loss": 0.8067, + "step": 21950 + }, + { + "epoch": 1.743237611383437, + "grad_norm": 0.8029731512069702, + "learning_rate": 2.09534016035564e-05, + "loss": 0.8142, + "step": 21960 + }, + { + "epoch": 1.7440314354323365, + "grad_norm": 0.9675547480583191, + "learning_rate": 2.0940170940170943e-05, + "loss": 0.8511, + "step": 21970 + }, + { + "epoch": 1.7448252594812361, + "grad_norm": 0.866611897945404, + "learning_rate": 2.092694027678548e-05, + "loss": 0.7468, + "step": 21980 + }, + { + "epoch": 1.7456190835301355, + "grad_norm": 0.7923464775085449, + "learning_rate": 2.0913709613400017e-05, + "loss": 0.8396, + "step": 21990 + }, + { + "epoch": 1.7464129075790351, + "grad_norm": 0.8513892292976379, + "learning_rate": 2.0900478950014553e-05, + "loss": 0.8128, + "step": 22000 + }, + { + "epoch": 1.7472067316279347, + "grad_norm": 0.7640386819839478, + "learning_rate": 2.0887248286629092e-05, + "loss": 0.7826, + "step": 22010 + }, + { + "epoch": 1.748000555676834, + "grad_norm": 0.8362756967544556, + "learning_rate": 2.087401762324363e-05, + "loss": 0.7284, + "step": 22020 + }, + { + "epoch": 1.7487943797257337, + "grad_norm": 0.78795325756073, + "learning_rate": 2.086078695985817e-05, + "loss": 0.8005, + "step": 22030 + }, + { + "epoch": 1.7495882037746333, + "grad_norm": 0.9655894637107849, + "learning_rate": 2.0847556296472705e-05, + "loss": 0.8236, + "step": 22040 + }, + { + "epoch": 1.750382027823533, + "grad_norm": 0.8761208653450012, + "learning_rate": 2.0834325633087244e-05, + "loss": 0.8187, + "step": 22050 + }, + { + "epoch": 1.7511758518724325, + "grad_norm": 0.6492605209350586, + "learning_rate": 2.0821094969701783e-05, + "loss": 0.922, + "step": 22060 + }, + { + "epoch": 1.7519696759213321, + "grad_norm": 0.6981455683708191, + "learning_rate": 2.080786430631632e-05, + "loss": 0.7863, + "step": 22070 + }, + { + "epoch": 1.7527634999702317, + "grad_norm": 0.978775680065155, + "learning_rate": 2.0794633642930858e-05, + "loss": 0.7996, + "step": 22080 + }, + { + "epoch": 1.7535573240191311, + "grad_norm": 0.7167590260505676, + "learning_rate": 2.0781402979545393e-05, + "loss": 0.8866, + "step": 22090 + }, + { + "epoch": 1.7543511480680307, + "grad_norm": 0.7744993567466736, + "learning_rate": 2.0768172316159935e-05, + "loss": 0.7559, + "step": 22100 + }, + { + "epoch": 1.7551449721169303, + "grad_norm": 0.8939769268035889, + "learning_rate": 2.075494165277447e-05, + "loss": 0.7956, + "step": 22110 + }, + { + "epoch": 1.7559387961658297, + "grad_norm": 0.74879390001297, + "learning_rate": 2.074171098938901e-05, + "loss": 0.8159, + "step": 22120 + }, + { + "epoch": 1.7567326202147293, + "grad_norm": 0.7568157911300659, + "learning_rate": 2.0728480326003546e-05, + "loss": 0.7673, + "step": 22130 + }, + { + "epoch": 1.757526444263629, + "grad_norm": 0.9827935695648193, + "learning_rate": 2.0715249662618084e-05, + "loss": 0.8079, + "step": 22140 + }, + { + "epoch": 1.7583202683125285, + "grad_norm": 0.8912889957427979, + "learning_rate": 2.0702018999232623e-05, + "loss": 0.821, + "step": 22150 + }, + { + "epoch": 1.7591140923614281, + "grad_norm": 0.8157911896705627, + "learning_rate": 2.068878833584716e-05, + "loss": 0.7612, + "step": 22160 + }, + { + "epoch": 1.7599079164103277, + "grad_norm": 0.745104968547821, + "learning_rate": 2.0675557672461698e-05, + "loss": 0.8183, + "step": 22170 + }, + { + "epoch": 1.7607017404592273, + "grad_norm": 0.846377432346344, + "learning_rate": 2.0662327009076233e-05, + "loss": 0.7756, + "step": 22180 + }, + { + "epoch": 1.7614955645081267, + "grad_norm": 0.8381850719451904, + "learning_rate": 2.0649096345690776e-05, + "loss": 0.8417, + "step": 22190 + }, + { + "epoch": 1.7622893885570263, + "grad_norm": 0.8737789392471313, + "learning_rate": 2.063586568230531e-05, + "loss": 0.7929, + "step": 22200 + }, + { + "epoch": 1.763083212605926, + "grad_norm": 0.8165319561958313, + "learning_rate": 2.062263501891985e-05, + "loss": 0.7364, + "step": 22210 + }, + { + "epoch": 1.7638770366548253, + "grad_norm": 0.9064796566963196, + "learning_rate": 2.0609404355534386e-05, + "loss": 0.8336, + "step": 22220 + }, + { + "epoch": 1.764670860703725, + "grad_norm": 0.8361899256706238, + "learning_rate": 2.0596173692148925e-05, + "loss": 0.7976, + "step": 22230 + }, + { + "epoch": 1.7654646847526245, + "grad_norm": 0.955283522605896, + "learning_rate": 2.0582943028763464e-05, + "loss": 0.781, + "step": 22240 + }, + { + "epoch": 1.7662585088015241, + "grad_norm": 0.87896329164505, + "learning_rate": 2.0569712365378002e-05, + "loss": 0.8037, + "step": 22250 + }, + { + "epoch": 1.7670523328504237, + "grad_norm": 0.91143399477005, + "learning_rate": 2.0556481701992538e-05, + "loss": 0.748, + "step": 22260 + }, + { + "epoch": 1.7678461568993233, + "grad_norm": 0.8325607180595398, + "learning_rate": 2.0543251038607077e-05, + "loss": 0.8072, + "step": 22270 + }, + { + "epoch": 1.768639980948223, + "grad_norm": 0.904513955116272, + "learning_rate": 2.0530020375221616e-05, + "loss": 0.7791, + "step": 22280 + }, + { + "epoch": 1.7694338049971226, + "grad_norm": 0.9098820686340332, + "learning_rate": 2.051678971183615e-05, + "loss": 0.8965, + "step": 22290 + }, + { + "epoch": 1.770227629046022, + "grad_norm": 0.900877833366394, + "learning_rate": 2.050355904845069e-05, + "loss": 0.8368, + "step": 22300 + }, + { + "epoch": 1.7710214530949215, + "grad_norm": 0.9795036911964417, + "learning_rate": 2.0490328385065226e-05, + "loss": 0.8214, + "step": 22310 + }, + { + "epoch": 1.771815277143821, + "grad_norm": 0.7888408303260803, + "learning_rate": 2.0477097721679768e-05, + "loss": 0.829, + "step": 22320 + }, + { + "epoch": 1.7726091011927205, + "grad_norm": 0.6869066953659058, + "learning_rate": 2.0463867058294304e-05, + "loss": 0.8656, + "step": 22330 + }, + { + "epoch": 1.7734029252416201, + "grad_norm": 0.891853928565979, + "learning_rate": 2.0450636394908843e-05, + "loss": 0.8304, + "step": 22340 + }, + { + "epoch": 1.7741967492905197, + "grad_norm": 0.6878489255905151, + "learning_rate": 2.0437405731523378e-05, + "loss": 0.8594, + "step": 22350 + }, + { + "epoch": 1.7749905733394193, + "grad_norm": 0.9317987561225891, + "learning_rate": 2.0424175068137917e-05, + "loss": 0.8681, + "step": 22360 + }, + { + "epoch": 1.775784397388319, + "grad_norm": 0.8260341882705688, + "learning_rate": 2.0410944404752456e-05, + "loss": 0.8144, + "step": 22370 + }, + { + "epoch": 1.7765782214372186, + "grad_norm": 0.7893334031105042, + "learning_rate": 2.0397713741366995e-05, + "loss": 0.8375, + "step": 22380 + }, + { + "epoch": 1.7773720454861182, + "grad_norm": 0.8867517113685608, + "learning_rate": 2.038448307798153e-05, + "loss": 0.817, + "step": 22390 + }, + { + "epoch": 1.7781658695350175, + "grad_norm": 0.843120813369751, + "learning_rate": 2.0371252414596066e-05, + "loss": 0.8542, + "step": 22400 + }, + { + "epoch": 1.7789596935839171, + "grad_norm": 0.8140029311180115, + "learning_rate": 2.035802175121061e-05, + "loss": 0.9021, + "step": 22410 + }, + { + "epoch": 1.7797535176328165, + "grad_norm": 0.9623441100120544, + "learning_rate": 2.0344791087825144e-05, + "loss": 0.8447, + "step": 22420 + }, + { + "epoch": 1.7805473416817161, + "grad_norm": 0.9128091335296631, + "learning_rate": 2.0331560424439683e-05, + "loss": 0.794, + "step": 22430 + }, + { + "epoch": 1.7813411657306157, + "grad_norm": 0.8841097354888916, + "learning_rate": 2.031832976105422e-05, + "loss": 0.8707, + "step": 22440 + }, + { + "epoch": 1.7821349897795153, + "grad_norm": 0.7501084208488464, + "learning_rate": 2.0305099097668757e-05, + "loss": 0.8113, + "step": 22450 + }, + { + "epoch": 1.782928813828415, + "grad_norm": 0.9023767113685608, + "learning_rate": 2.0291868434283296e-05, + "loss": 0.8584, + "step": 22460 + }, + { + "epoch": 1.7837226378773146, + "grad_norm": 0.9524368047714233, + "learning_rate": 2.0278637770897835e-05, + "loss": 0.7864, + "step": 22470 + }, + { + "epoch": 1.7845164619262142, + "grad_norm": 0.7797056436538696, + "learning_rate": 2.026540710751237e-05, + "loss": 0.7762, + "step": 22480 + }, + { + "epoch": 1.7853102859751138, + "grad_norm": 0.918021559715271, + "learning_rate": 2.025217644412691e-05, + "loss": 0.7703, + "step": 22490 + }, + { + "epoch": 1.7861041100240131, + "grad_norm": 0.9234730005264282, + "learning_rate": 2.023894578074145e-05, + "loss": 0.8419, + "step": 22500 + }, + { + "epoch": 1.7868979340729128, + "grad_norm": 0.6988440155982971, + "learning_rate": 2.0225715117355984e-05, + "loss": 0.8038, + "step": 22510 + }, + { + "epoch": 1.7876917581218121, + "grad_norm": 0.96622633934021, + "learning_rate": 2.0212484453970523e-05, + "loss": 0.792, + "step": 22520 + }, + { + "epoch": 1.7884855821707117, + "grad_norm": 0.7496207356452942, + "learning_rate": 2.019925379058506e-05, + "loss": 0.7702, + "step": 22530 + }, + { + "epoch": 1.7892794062196113, + "grad_norm": 0.9251072406768799, + "learning_rate": 2.01860231271996e-05, + "loss": 0.8013, + "step": 22540 + }, + { + "epoch": 1.790073230268511, + "grad_norm": 0.8455217480659485, + "learning_rate": 2.0172792463814136e-05, + "loss": 0.8374, + "step": 22550 + }, + { + "epoch": 1.7908670543174106, + "grad_norm": 0.8806858062744141, + "learning_rate": 2.0159561800428675e-05, + "loss": 0.8292, + "step": 22560 + }, + { + "epoch": 1.7916608783663102, + "grad_norm": 0.8762655258178711, + "learning_rate": 2.014633113704321e-05, + "loss": 0.774, + "step": 22570 + }, + { + "epoch": 1.7924547024152098, + "grad_norm": 0.8209755420684814, + "learning_rate": 2.013310047365775e-05, + "loss": 0.7664, + "step": 22580 + }, + { + "epoch": 1.7932485264641094, + "grad_norm": 0.8102571964263916, + "learning_rate": 2.011986981027229e-05, + "loss": 0.8032, + "step": 22590 + }, + { + "epoch": 1.7940423505130088, + "grad_norm": 0.7247342467308044, + "learning_rate": 2.0106639146886828e-05, + "loss": 0.832, + "step": 22600 + }, + { + "epoch": 1.7948361745619084, + "grad_norm": 0.9266355633735657, + "learning_rate": 2.0093408483501363e-05, + "loss": 0.7909, + "step": 22610 + }, + { + "epoch": 1.795629998610808, + "grad_norm": 0.7393051981925964, + "learning_rate": 2.00801778201159e-05, + "loss": 0.8408, + "step": 22620 + }, + { + "epoch": 1.7964238226597073, + "grad_norm": 0.8639968633651733, + "learning_rate": 2.006694715673044e-05, + "loss": 0.8002, + "step": 22630 + }, + { + "epoch": 1.797217646708607, + "grad_norm": 0.9779701232910156, + "learning_rate": 2.0053716493344977e-05, + "loss": 0.7608, + "step": 22640 + }, + { + "epoch": 1.7980114707575066, + "grad_norm": 0.8327713012695312, + "learning_rate": 2.0040485829959516e-05, + "loss": 0.8247, + "step": 22650 + }, + { + "epoch": 1.7988052948064062, + "grad_norm": 0.8809974193572998, + "learning_rate": 2.002725516657405e-05, + "loss": 0.8666, + "step": 22660 + }, + { + "epoch": 1.7995991188553058, + "grad_norm": 0.8804794549942017, + "learning_rate": 2.001402450318859e-05, + "loss": 0.8138, + "step": 22670 + }, + { + "epoch": 1.8003929429042054, + "grad_norm": 0.8360373377799988, + "learning_rate": 2.000079383980313e-05, + "loss": 0.8091, + "step": 22680 + }, + { + "epoch": 1.801186766953105, + "grad_norm": 0.8613433241844177, + "learning_rate": 1.9987563176417668e-05, + "loss": 0.7882, + "step": 22690 + }, + { + "epoch": 1.8019805910020044, + "grad_norm": 0.8613284826278687, + "learning_rate": 1.9974332513032203e-05, + "loss": 0.8012, + "step": 22700 + }, + { + "epoch": 1.802774415050904, + "grad_norm": 0.8369642496109009, + "learning_rate": 1.9961101849646742e-05, + "loss": 0.7765, + "step": 22710 + }, + { + "epoch": 1.8035682390998036, + "grad_norm": 0.8004109859466553, + "learning_rate": 1.994787118626128e-05, + "loss": 0.7961, + "step": 22720 + }, + { + "epoch": 1.804362063148703, + "grad_norm": 0.792927086353302, + "learning_rate": 1.993464052287582e-05, + "loss": 0.848, + "step": 22730 + }, + { + "epoch": 1.8051558871976026, + "grad_norm": 0.7693809866905212, + "learning_rate": 1.9921409859490356e-05, + "loss": 0.786, + "step": 22740 + }, + { + "epoch": 1.8059497112465022, + "grad_norm": 1.0815696716308594, + "learning_rate": 1.990817919610489e-05, + "loss": 0.8142, + "step": 22750 + }, + { + "epoch": 1.8067435352954018, + "grad_norm": 0.9764348268508911, + "learning_rate": 1.989494853271943e-05, + "loss": 0.8801, + "step": 22760 + }, + { + "epoch": 1.8075373593443014, + "grad_norm": 0.9634998440742493, + "learning_rate": 1.988171786933397e-05, + "loss": 0.7793, + "step": 22770 + }, + { + "epoch": 1.808331183393201, + "grad_norm": 0.8265504837036133, + "learning_rate": 1.9868487205948508e-05, + "loss": 0.7417, + "step": 22780 + }, + { + "epoch": 1.8091250074421006, + "grad_norm": 0.8806119561195374, + "learning_rate": 1.9855256542563044e-05, + "loss": 0.8474, + "step": 22790 + }, + { + "epoch": 1.809918831491, + "grad_norm": 1.0471200942993164, + "learning_rate": 1.9842025879177582e-05, + "loss": 0.8872, + "step": 22800 + }, + { + "epoch": 1.8107126555398996, + "grad_norm": 0.7680615782737732, + "learning_rate": 1.982879521579212e-05, + "loss": 0.8097, + "step": 22810 + }, + { + "epoch": 1.8115064795887992, + "grad_norm": 0.9899775385856628, + "learning_rate": 1.981556455240666e-05, + "loss": 0.8203, + "step": 22820 + }, + { + "epoch": 1.8123003036376986, + "grad_norm": 0.8771310448646545, + "learning_rate": 1.9802333889021196e-05, + "loss": 0.783, + "step": 22830 + }, + { + "epoch": 1.8130941276865982, + "grad_norm": 0.7819077372550964, + "learning_rate": 1.9789103225635735e-05, + "loss": 0.8192, + "step": 22840 + }, + { + "epoch": 1.8138879517354978, + "grad_norm": 0.9690881371498108, + "learning_rate": 1.9775872562250274e-05, + "loss": 0.7972, + "step": 22850 + }, + { + "epoch": 1.8146817757843974, + "grad_norm": 0.9323807954788208, + "learning_rate": 1.976264189886481e-05, + "loss": 0.8351, + "step": 22860 + }, + { + "epoch": 1.815475599833297, + "grad_norm": 0.7597420811653137, + "learning_rate": 1.9749411235479348e-05, + "loss": 0.7927, + "step": 22870 + }, + { + "epoch": 1.8162694238821966, + "grad_norm": 0.614967942237854, + "learning_rate": 1.9736180572093884e-05, + "loss": 0.8414, + "step": 22880 + }, + { + "epoch": 1.8170632479310962, + "grad_norm": 0.9562491178512573, + "learning_rate": 1.9722949908708423e-05, + "loss": 0.7689, + "step": 22890 + }, + { + "epoch": 1.8178570719799958, + "grad_norm": 0.8071816563606262, + "learning_rate": 1.970971924532296e-05, + "loss": 0.7618, + "step": 22900 + }, + { + "epoch": 1.8186508960288952, + "grad_norm": 0.9454165697097778, + "learning_rate": 1.96964885819375e-05, + "loss": 0.8387, + "step": 22910 + }, + { + "epoch": 1.8194447200777948, + "grad_norm": 0.8835422992706299, + "learning_rate": 1.9683257918552036e-05, + "loss": 0.8217, + "step": 22920 + }, + { + "epoch": 1.8202385441266942, + "grad_norm": 0.9334397912025452, + "learning_rate": 1.9670027255166575e-05, + "loss": 0.7968, + "step": 22930 + }, + { + "epoch": 1.8210323681755938, + "grad_norm": 0.9972975254058838, + "learning_rate": 1.9656796591781114e-05, + "loss": 0.7708, + "step": 22940 + }, + { + "epoch": 1.8218261922244934, + "grad_norm": 1.0508019924163818, + "learning_rate": 1.9643565928395653e-05, + "loss": 0.795, + "step": 22950 + }, + { + "epoch": 1.822620016273393, + "grad_norm": 0.8678830862045288, + "learning_rate": 1.963033526501019e-05, + "loss": 0.8326, + "step": 22960 + }, + { + "epoch": 1.8234138403222926, + "grad_norm": 0.7960614562034607, + "learning_rate": 1.9617104601624724e-05, + "loss": 0.8116, + "step": 22970 + }, + { + "epoch": 1.8242076643711922, + "grad_norm": 0.8850997090339661, + "learning_rate": 1.9603873938239263e-05, + "loss": 0.7878, + "step": 22980 + }, + { + "epoch": 1.8250014884200918, + "grad_norm": 0.7653430700302124, + "learning_rate": 1.9590643274853802e-05, + "loss": 0.7775, + "step": 22990 + }, + { + "epoch": 1.8257953124689914, + "grad_norm": 0.87236487865448, + "learning_rate": 1.957741261146834e-05, + "loss": 0.7627, + "step": 23000 + }, + { + "epoch": 1.8265891365178908, + "grad_norm": 1.0964977741241455, + "learning_rate": 1.9564181948082876e-05, + "loss": 0.7447, + "step": 23010 + }, + { + "epoch": 1.8273829605667904, + "grad_norm": 0.8569262623786926, + "learning_rate": 1.9550951284697415e-05, + "loss": 0.8153, + "step": 23020 + }, + { + "epoch": 1.8281767846156898, + "grad_norm": 0.9798825979232788, + "learning_rate": 1.9537720621311954e-05, + "loss": 0.7993, + "step": 23030 + }, + { + "epoch": 1.8289706086645894, + "grad_norm": 0.8593613505363464, + "learning_rate": 1.9524489957926493e-05, + "loss": 0.8205, + "step": 23040 + }, + { + "epoch": 1.829764432713489, + "grad_norm": 0.6907086372375488, + "learning_rate": 1.951125929454103e-05, + "loss": 0.7571, + "step": 23050 + }, + { + "epoch": 1.8305582567623886, + "grad_norm": 0.8665714263916016, + "learning_rate": 1.9498028631155567e-05, + "loss": 0.8783, + "step": 23060 + }, + { + "epoch": 1.8313520808112882, + "grad_norm": 0.8479406833648682, + "learning_rate": 1.9484797967770106e-05, + "loss": 0.8075, + "step": 23070 + }, + { + "epoch": 1.8321459048601878, + "grad_norm": 1.0376914739608765, + "learning_rate": 1.9471567304384645e-05, + "loss": 0.8745, + "step": 23080 + }, + { + "epoch": 1.8329397289090874, + "grad_norm": 0.7835286855697632, + "learning_rate": 1.945833664099918e-05, + "loss": 0.7769, + "step": 23090 + }, + { + "epoch": 1.833733552957987, + "grad_norm": 0.7931967973709106, + "learning_rate": 1.9445105977613716e-05, + "loss": 0.7833, + "step": 23100 + }, + { + "epoch": 1.8345273770068864, + "grad_norm": 0.9185943007469177, + "learning_rate": 1.9433198380566804e-05, + "loss": 0.8216, + "step": 23110 + }, + { + "epoch": 1.835321201055786, + "grad_norm": 0.9194105267524719, + "learning_rate": 1.941996771718134e-05, + "loss": 0.8119, + "step": 23120 + }, + { + "epoch": 1.8361150251046854, + "grad_norm": 0.9397788047790527, + "learning_rate": 1.940673705379588e-05, + "loss": 0.7952, + "step": 23130 + }, + { + "epoch": 1.836908849153585, + "grad_norm": 0.9412715435028076, + "learning_rate": 1.9393506390410414e-05, + "loss": 0.8362, + "step": 23140 + }, + { + "epoch": 1.8377026732024846, + "grad_norm": 0.8924200534820557, + "learning_rate": 1.9380275727024953e-05, + "loss": 0.8191, + "step": 23150 + }, + { + "epoch": 1.8384964972513842, + "grad_norm": 0.9370428323745728, + "learning_rate": 1.9367045063639492e-05, + "loss": 0.7919, + "step": 23160 + }, + { + "epoch": 1.8392903213002838, + "grad_norm": 0.9872609376907349, + "learning_rate": 1.935381440025403e-05, + "loss": 0.7711, + "step": 23170 + }, + { + "epoch": 1.8400841453491834, + "grad_norm": 0.7494041323661804, + "learning_rate": 1.9340583736868566e-05, + "loss": 0.8043, + "step": 23180 + }, + { + "epoch": 1.840877969398083, + "grad_norm": 0.768407940864563, + "learning_rate": 1.9327353073483105e-05, + "loss": 0.7851, + "step": 23190 + }, + { + "epoch": 1.8416717934469826, + "grad_norm": 0.7478283047676086, + "learning_rate": 1.9314122410097644e-05, + "loss": 0.8769, + "step": 23200 + }, + { + "epoch": 1.842465617495882, + "grad_norm": 1.001345157623291, + "learning_rate": 1.930089174671218e-05, + "loss": 0.8168, + "step": 23210 + }, + { + "epoch": 1.8432594415447816, + "grad_norm": 0.9208034873008728, + "learning_rate": 1.928766108332672e-05, + "loss": 0.7763, + "step": 23220 + }, + { + "epoch": 1.8440532655936812, + "grad_norm": 0.7579790949821472, + "learning_rate": 1.9274430419941254e-05, + "loss": 0.8256, + "step": 23230 + }, + { + "epoch": 1.8448470896425806, + "grad_norm": 0.8360084891319275, + "learning_rate": 1.9261199756555797e-05, + "loss": 0.8554, + "step": 23240 + }, + { + "epoch": 1.8456409136914802, + "grad_norm": 0.8228520154953003, + "learning_rate": 1.9247969093170332e-05, + "loss": 0.7981, + "step": 23250 + }, + { + "epoch": 1.8464347377403798, + "grad_norm": 0.7764120697975159, + "learning_rate": 1.923473842978487e-05, + "loss": 0.8218, + "step": 23260 + }, + { + "epoch": 1.8472285617892794, + "grad_norm": 0.730930507183075, + "learning_rate": 1.9221507766399407e-05, + "loss": 0.8442, + "step": 23270 + }, + { + "epoch": 1.848022385838179, + "grad_norm": 0.8573958277702332, + "learning_rate": 1.9208277103013946e-05, + "loss": 0.8032, + "step": 23280 + }, + { + "epoch": 1.8488162098870786, + "grad_norm": 1.118808388710022, + "learning_rate": 1.9195046439628485e-05, + "loss": 0.8619, + "step": 23290 + }, + { + "epoch": 1.8496100339359782, + "grad_norm": 0.7608010768890381, + "learning_rate": 1.9181815776243023e-05, + "loss": 0.8218, + "step": 23300 + }, + { + "epoch": 1.8504038579848776, + "grad_norm": 0.8794158697128296, + "learning_rate": 1.916858511285756e-05, + "loss": 0.8562, + "step": 23310 + }, + { + "epoch": 1.8511976820337772, + "grad_norm": 0.8464013934135437, + "learning_rate": 1.9155354449472098e-05, + "loss": 0.836, + "step": 23320 + }, + { + "epoch": 1.8519915060826768, + "grad_norm": 0.7255980372428894, + "learning_rate": 1.9142123786086637e-05, + "loss": 0.8553, + "step": 23330 + }, + { + "epoch": 1.8527853301315762, + "grad_norm": 0.9724112153053284, + "learning_rate": 1.9128893122701172e-05, + "loss": 0.8473, + "step": 23340 + }, + { + "epoch": 1.8535791541804758, + "grad_norm": 0.7873225212097168, + "learning_rate": 1.911566245931571e-05, + "loss": 0.8406, + "step": 23350 + }, + { + "epoch": 1.8543729782293754, + "grad_norm": 0.8537048697471619, + "learning_rate": 1.9102431795930247e-05, + "loss": 0.8082, + "step": 23360 + }, + { + "epoch": 1.855166802278275, + "grad_norm": 0.7261394262313843, + "learning_rate": 1.9089201132544786e-05, + "loss": 0.8488, + "step": 23370 + }, + { + "epoch": 1.8559606263271746, + "grad_norm": 0.8339587450027466, + "learning_rate": 1.9075970469159325e-05, + "loss": 0.8027, + "step": 23380 + }, + { + "epoch": 1.8567544503760742, + "grad_norm": 1.3965240716934204, + "learning_rate": 1.9062739805773864e-05, + "loss": 0.7967, + "step": 23390 + }, + { + "epoch": 1.8575482744249738, + "grad_norm": 0.789261519908905, + "learning_rate": 1.90495091423884e-05, + "loss": 0.7911, + "step": 23400 + }, + { + "epoch": 1.8583420984738732, + "grad_norm": 0.6335147619247437, + "learning_rate": 1.9036278479002938e-05, + "loss": 0.8299, + "step": 23410 + }, + { + "epoch": 1.8591359225227728, + "grad_norm": 0.7321164608001709, + "learning_rate": 1.9023047815617477e-05, + "loss": 0.8545, + "step": 23420 + }, + { + "epoch": 1.8599297465716724, + "grad_norm": 0.9617056250572205, + "learning_rate": 1.9009817152232016e-05, + "loss": 0.7811, + "step": 23430 + }, + { + "epoch": 1.8607235706205718, + "grad_norm": 0.8931480050086975, + "learning_rate": 1.899658648884655e-05, + "loss": 0.8112, + "step": 23440 + }, + { + "epoch": 1.8615173946694714, + "grad_norm": 1.094520926475525, + "learning_rate": 1.8983355825461087e-05, + "loss": 0.8078, + "step": 23450 + }, + { + "epoch": 1.862311218718371, + "grad_norm": 0.9582784175872803, + "learning_rate": 1.8970125162075626e-05, + "loss": 0.763, + "step": 23460 + }, + { + "epoch": 1.8631050427672706, + "grad_norm": 0.9741404056549072, + "learning_rate": 1.8956894498690165e-05, + "loss": 0.7808, + "step": 23470 + }, + { + "epoch": 1.8638988668161702, + "grad_norm": 0.758374035358429, + "learning_rate": 1.8943663835304704e-05, + "loss": 0.8281, + "step": 23480 + }, + { + "epoch": 1.8646926908650698, + "grad_norm": 0.815403938293457, + "learning_rate": 1.893043317191924e-05, + "loss": 0.7594, + "step": 23490 + }, + { + "epoch": 1.8654865149139694, + "grad_norm": 0.7608597278594971, + "learning_rate": 1.8917202508533778e-05, + "loss": 0.8072, + "step": 23500 + }, + { + "epoch": 1.866280338962869, + "grad_norm": 0.8437971472740173, + "learning_rate": 1.8903971845148317e-05, + "loss": 0.8153, + "step": 23510 + }, + { + "epoch": 1.8670741630117684, + "grad_norm": 0.8459624648094177, + "learning_rate": 1.8890741181762856e-05, + "loss": 0.7585, + "step": 23520 + }, + { + "epoch": 1.867867987060668, + "grad_norm": 0.9859129786491394, + "learning_rate": 1.887751051837739e-05, + "loss": 0.7822, + "step": 23530 + }, + { + "epoch": 1.8686618111095674, + "grad_norm": 0.7122694849967957, + "learning_rate": 1.886427985499193e-05, + "loss": 0.8052, + "step": 23540 + }, + { + "epoch": 1.869455635158467, + "grad_norm": 0.8045480251312256, + "learning_rate": 1.885104919160647e-05, + "loss": 0.7523, + "step": 23550 + }, + { + "epoch": 1.8702494592073666, + "grad_norm": 0.8591973781585693, + "learning_rate": 1.8837818528221005e-05, + "loss": 0.8087, + "step": 23560 + }, + { + "epoch": 1.8710432832562662, + "grad_norm": 0.8517307043075562, + "learning_rate": 1.8824587864835544e-05, + "loss": 0.7844, + "step": 23570 + }, + { + "epoch": 1.8718371073051658, + "grad_norm": 0.96706223487854, + "learning_rate": 1.881135720145008e-05, + "loss": 0.7595, + "step": 23580 + }, + { + "epoch": 1.8726309313540654, + "grad_norm": 0.966511070728302, + "learning_rate": 1.879812653806462e-05, + "loss": 0.8569, + "step": 23590 + }, + { + "epoch": 1.873424755402965, + "grad_norm": 0.8597899675369263, + "learning_rate": 1.8784895874679157e-05, + "loss": 0.7934, + "step": 23600 + }, + { + "epoch": 1.8742185794518647, + "grad_norm": 0.7398180961608887, + "learning_rate": 1.8771665211293696e-05, + "loss": 0.818, + "step": 23610 + }, + { + "epoch": 1.875012403500764, + "grad_norm": 0.8387320637702942, + "learning_rate": 1.8758434547908232e-05, + "loss": 0.8067, + "step": 23620 + }, + { + "epoch": 1.8758062275496636, + "grad_norm": 0.8745064735412598, + "learning_rate": 1.874520388452277e-05, + "loss": 0.7378, + "step": 23630 + }, + { + "epoch": 1.876600051598563, + "grad_norm": 0.7294278740882874, + "learning_rate": 1.873197322113731e-05, + "loss": 0.8538, + "step": 23640 + }, + { + "epoch": 1.8773938756474626, + "grad_norm": 0.8370188474655151, + "learning_rate": 1.871874255775185e-05, + "loss": 0.837, + "step": 23650 + }, + { + "epoch": 1.8781876996963622, + "grad_norm": 0.8423630595207214, + "learning_rate": 1.8705511894366384e-05, + "loss": 0.7942, + "step": 23660 + }, + { + "epoch": 1.8789815237452618, + "grad_norm": 0.8420581817626953, + "learning_rate": 1.8692281230980923e-05, + "loss": 0.8273, + "step": 23670 + }, + { + "epoch": 1.8797753477941614, + "grad_norm": 0.9311301708221436, + "learning_rate": 1.867905056759546e-05, + "loss": 0.832, + "step": 23680 + }, + { + "epoch": 1.880569171843061, + "grad_norm": 0.7972429990768433, + "learning_rate": 1.8665819904209998e-05, + "loss": 0.8131, + "step": 23690 + }, + { + "epoch": 1.8813629958919607, + "grad_norm": 0.7214037775993347, + "learning_rate": 1.8652589240824536e-05, + "loss": 0.8298, + "step": 23700 + }, + { + "epoch": 1.8821568199408603, + "grad_norm": 0.8672298789024353, + "learning_rate": 1.8639358577439072e-05, + "loss": 0.8283, + "step": 23710 + }, + { + "epoch": 1.8829506439897596, + "grad_norm": 0.8380793333053589, + "learning_rate": 1.862612791405361e-05, + "loss": 0.7733, + "step": 23720 + }, + { + "epoch": 1.8837444680386592, + "grad_norm": 0.8176583051681519, + "learning_rate": 1.861289725066815e-05, + "loss": 0.8774, + "step": 23730 + }, + { + "epoch": 1.8845382920875586, + "grad_norm": 0.8315280079841614, + "learning_rate": 1.859966658728269e-05, + "loss": 0.8133, + "step": 23740 + }, + { + "epoch": 1.8853321161364582, + "grad_norm": 0.7591937184333801, + "learning_rate": 1.8586435923897224e-05, + "loss": 0.8648, + "step": 23750 + }, + { + "epoch": 1.8861259401853578, + "grad_norm": 0.9065371155738831, + "learning_rate": 1.8573205260511763e-05, + "loss": 0.7526, + "step": 23760 + }, + { + "epoch": 1.8869197642342574, + "grad_norm": 0.8125611543655396, + "learning_rate": 1.8559974597126302e-05, + "loss": 0.7813, + "step": 23770 + }, + { + "epoch": 1.887713588283157, + "grad_norm": 0.7974992990493774, + "learning_rate": 1.854674393374084e-05, + "loss": 0.8515, + "step": 23780 + }, + { + "epoch": 1.8885074123320567, + "grad_norm": 0.6835601925849915, + "learning_rate": 1.8533513270355377e-05, + "loss": 0.8259, + "step": 23790 + }, + { + "epoch": 1.8893012363809563, + "grad_norm": 0.82981938123703, + "learning_rate": 1.8520282606969912e-05, + "loss": 0.7867, + "step": 23800 + }, + { + "epoch": 1.8900950604298559, + "grad_norm": 0.7962732911109924, + "learning_rate": 1.850705194358445e-05, + "loss": 0.8032, + "step": 23810 + }, + { + "epoch": 1.8908888844787552, + "grad_norm": 0.7996245622634888, + "learning_rate": 1.849382128019899e-05, + "loss": 0.7464, + "step": 23820 + }, + { + "epoch": 1.8916827085276549, + "grad_norm": 0.8889716267585754, + "learning_rate": 1.848059061681353e-05, + "loss": 0.7978, + "step": 23830 + }, + { + "epoch": 1.8924765325765545, + "grad_norm": 0.8882633447647095, + "learning_rate": 1.8467359953428065e-05, + "loss": 0.7114, + "step": 23840 + }, + { + "epoch": 1.8932703566254538, + "grad_norm": 0.8519427180290222, + "learning_rate": 1.8454129290042603e-05, + "loss": 0.8326, + "step": 23850 + }, + { + "epoch": 1.8940641806743534, + "grad_norm": 1.117575764656067, + "learning_rate": 1.8440898626657142e-05, + "loss": 0.8373, + "step": 23860 + }, + { + "epoch": 1.894858004723253, + "grad_norm": 0.871019184589386, + "learning_rate": 1.842766796327168e-05, + "loss": 0.8082, + "step": 23870 + }, + { + "epoch": 1.8956518287721527, + "grad_norm": 0.8169146776199341, + "learning_rate": 1.8414437299886217e-05, + "loss": 0.7634, + "step": 23880 + }, + { + "epoch": 1.8964456528210523, + "grad_norm": 0.8163456320762634, + "learning_rate": 1.8401206636500756e-05, + "loss": 0.8704, + "step": 23890 + }, + { + "epoch": 1.8972394768699519, + "grad_norm": 0.912982165813446, + "learning_rate": 1.838797597311529e-05, + "loss": 0.7831, + "step": 23900 + }, + { + "epoch": 1.8980333009188515, + "grad_norm": 0.8707506656646729, + "learning_rate": 1.837474530972983e-05, + "loss": 0.7522, + "step": 23910 + }, + { + "epoch": 1.8988271249677509, + "grad_norm": 0.7773687839508057, + "learning_rate": 1.836151464634437e-05, + "loss": 0.8301, + "step": 23920 + }, + { + "epoch": 1.8996209490166505, + "grad_norm": 0.846040666103363, + "learning_rate": 1.8348283982958905e-05, + "loss": 0.8274, + "step": 23930 + }, + { + "epoch": 1.90041477306555, + "grad_norm": 0.890678882598877, + "learning_rate": 1.8335053319573444e-05, + "loss": 0.8097, + "step": 23940 + }, + { + "epoch": 1.9012085971144495, + "grad_norm": 0.8786852955818176, + "learning_rate": 1.8321822656187983e-05, + "loss": 0.8448, + "step": 23950 + }, + { + "epoch": 1.902002421163349, + "grad_norm": 0.8807387948036194, + "learning_rate": 1.830859199280252e-05, + "loss": 0.8331, + "step": 23960 + }, + { + "epoch": 1.9027962452122487, + "grad_norm": 0.6707026958465576, + "learning_rate": 1.8295361329417057e-05, + "loss": 0.8109, + "step": 23970 + }, + { + "epoch": 1.9035900692611483, + "grad_norm": 0.9399641156196594, + "learning_rate": 1.8282130666031596e-05, + "loss": 0.7463, + "step": 23980 + }, + { + "epoch": 1.9043838933100479, + "grad_norm": 0.6597705483436584, + "learning_rate": 1.8268900002646135e-05, + "loss": 0.8391, + "step": 23990 + }, + { + "epoch": 1.9051777173589475, + "grad_norm": 0.869948148727417, + "learning_rate": 1.8255669339260674e-05, + "loss": 0.8085, + "step": 24000 + }, + { + "epoch": 1.905971541407847, + "grad_norm": 0.9739574193954468, + "learning_rate": 1.824243867587521e-05, + "loss": 0.8292, + "step": 24010 + }, + { + "epoch": 1.9067653654567465, + "grad_norm": 0.8347158432006836, + "learning_rate": 1.8229208012489745e-05, + "loss": 0.8148, + "step": 24020 + }, + { + "epoch": 1.907559189505646, + "grad_norm": 0.8817285299301147, + "learning_rate": 1.8215977349104284e-05, + "loss": 0.8662, + "step": 24030 + }, + { + "epoch": 1.9083530135545457, + "grad_norm": 0.8079895377159119, + "learning_rate": 1.8202746685718823e-05, + "loss": 0.8082, + "step": 24040 + }, + { + "epoch": 1.909146837603445, + "grad_norm": 0.9502777457237244, + "learning_rate": 1.818951602233336e-05, + "loss": 0.8375, + "step": 24050 + }, + { + "epoch": 1.9099406616523447, + "grad_norm": 0.7885468602180481, + "learning_rate": 1.8176285358947897e-05, + "loss": 0.7887, + "step": 24060 + }, + { + "epoch": 1.9107344857012443, + "grad_norm": 0.861359179019928, + "learning_rate": 1.8163054695562436e-05, + "loss": 0.8925, + "step": 24070 + }, + { + "epoch": 1.9115283097501439, + "grad_norm": 0.8542492389678955, + "learning_rate": 1.8149824032176975e-05, + "loss": 0.7872, + "step": 24080 + }, + { + "epoch": 1.9123221337990435, + "grad_norm": 0.97325199842453, + "learning_rate": 1.8136593368791514e-05, + "loss": 0.8129, + "step": 24090 + }, + { + "epoch": 1.913115957847943, + "grad_norm": 0.8216763734817505, + "learning_rate": 1.812336270540605e-05, + "loss": 0.8226, + "step": 24100 + }, + { + "epoch": 1.9139097818968427, + "grad_norm": 0.8220085501670837, + "learning_rate": 1.811013204202059e-05, + "loss": 0.8222, + "step": 24110 + }, + { + "epoch": 1.914703605945742, + "grad_norm": 0.8182690739631653, + "learning_rate": 1.8096901378635124e-05, + "loss": 0.843, + "step": 24120 + }, + { + "epoch": 1.9154974299946417, + "grad_norm": 1.0019487142562866, + "learning_rate": 1.8083670715249666e-05, + "loss": 0.826, + "step": 24130 + }, + { + "epoch": 1.9162912540435413, + "grad_norm": 0.9827296137809753, + "learning_rate": 1.8070440051864202e-05, + "loss": 0.876, + "step": 24140 + }, + { + "epoch": 1.9170850780924407, + "grad_norm": 0.848641037940979, + "learning_rate": 1.8057209388478737e-05, + "loss": 0.7753, + "step": 24150 + }, + { + "epoch": 1.9178789021413403, + "grad_norm": 0.9888964891433716, + "learning_rate": 1.8043978725093276e-05, + "loss": 0.7491, + "step": 24160 + }, + { + "epoch": 1.9186727261902399, + "grad_norm": 0.8973375558853149, + "learning_rate": 1.8030748061707815e-05, + "loss": 0.7887, + "step": 24170 + }, + { + "epoch": 1.9194665502391395, + "grad_norm": 0.8632115721702576, + "learning_rate": 1.8017517398322354e-05, + "loss": 0.8052, + "step": 24180 + }, + { + "epoch": 1.920260374288039, + "grad_norm": 0.9406533241271973, + "learning_rate": 1.800428673493689e-05, + "loss": 0.7834, + "step": 24190 + }, + { + "epoch": 1.9210541983369387, + "grad_norm": 0.7621222734451294, + "learning_rate": 1.799105607155143e-05, + "loss": 0.8211, + "step": 24200 + }, + { + "epoch": 1.9218480223858383, + "grad_norm": 0.7019477486610413, + "learning_rate": 1.7977825408165964e-05, + "loss": 0.7818, + "step": 24210 + }, + { + "epoch": 1.922641846434738, + "grad_norm": 0.7617840766906738, + "learning_rate": 1.7964594744780506e-05, + "loss": 0.7691, + "step": 24220 + }, + { + "epoch": 1.9234356704836373, + "grad_norm": 0.8671677112579346, + "learning_rate": 1.7951364081395042e-05, + "loss": 0.8076, + "step": 24230 + }, + { + "epoch": 1.9242294945325369, + "grad_norm": 0.7963345646858215, + "learning_rate": 1.793813341800958e-05, + "loss": 0.8549, + "step": 24240 + }, + { + "epoch": 1.9250233185814363, + "grad_norm": 0.7418803572654724, + "learning_rate": 1.7924902754624116e-05, + "loss": 0.8776, + "step": 24250 + }, + { + "epoch": 1.9258171426303359, + "grad_norm": 0.991839587688446, + "learning_rate": 1.7911672091238655e-05, + "loss": 0.8098, + "step": 24260 + }, + { + "epoch": 1.9266109666792355, + "grad_norm": 0.8886794447898865, + "learning_rate": 1.7898441427853194e-05, + "loss": 0.7906, + "step": 24270 + }, + { + "epoch": 1.927404790728135, + "grad_norm": 0.8058855533599854, + "learning_rate": 1.788521076446773e-05, + "loss": 0.7909, + "step": 24280 + }, + { + "epoch": 1.9281986147770347, + "grad_norm": 0.9611883163452148, + "learning_rate": 1.787198010108227e-05, + "loss": 0.7466, + "step": 24290 + }, + { + "epoch": 1.9289924388259343, + "grad_norm": 0.9158162474632263, + "learning_rate": 1.7858749437696808e-05, + "loss": 0.772, + "step": 24300 + }, + { + "epoch": 1.929786262874834, + "grad_norm": 0.7327207922935486, + "learning_rate": 1.7845518774311347e-05, + "loss": 0.832, + "step": 24310 + }, + { + "epoch": 1.9305800869237335, + "grad_norm": 0.8581303954124451, + "learning_rate": 1.7832288110925882e-05, + "loss": 0.8193, + "step": 24320 + }, + { + "epoch": 1.931373910972633, + "grad_norm": 0.8568522930145264, + "learning_rate": 1.781905744754042e-05, + "loss": 0.8105, + "step": 24330 + }, + { + "epoch": 1.9321677350215325, + "grad_norm": 0.9713951945304871, + "learning_rate": 1.7805826784154957e-05, + "loss": 0.7699, + "step": 24340 + }, + { + "epoch": 1.9329615590704319, + "grad_norm": 0.8259567618370056, + "learning_rate": 1.77925961207695e-05, + "loss": 0.8278, + "step": 24350 + }, + { + "epoch": 1.9337553831193315, + "grad_norm": 0.9013030529022217, + "learning_rate": 1.7779365457384035e-05, + "loss": 0.8677, + "step": 24360 + }, + { + "epoch": 1.934549207168231, + "grad_norm": 0.8050611019134521, + "learning_rate": 1.776613479399857e-05, + "loss": 0.7818, + "step": 24370 + }, + { + "epoch": 1.9353430312171307, + "grad_norm": 0.8828538060188293, + "learning_rate": 1.775290413061311e-05, + "loss": 0.8062, + "step": 24380 + }, + { + "epoch": 1.9361368552660303, + "grad_norm": 0.9584046602249146, + "learning_rate": 1.7739673467227648e-05, + "loss": 0.8004, + "step": 24390 + }, + { + "epoch": 1.93693067931493, + "grad_norm": 1.0166678428649902, + "learning_rate": 1.7726442803842187e-05, + "loss": 0.8063, + "step": 24400 + }, + { + "epoch": 1.9377245033638295, + "grad_norm": 0.8781585693359375, + "learning_rate": 1.7713212140456722e-05, + "loss": 0.8428, + "step": 24410 + }, + { + "epoch": 1.9385183274127291, + "grad_norm": 0.9329415559768677, + "learning_rate": 1.769998147707126e-05, + "loss": 0.8114, + "step": 24420 + }, + { + "epoch": 1.9393121514616285, + "grad_norm": 0.9052295684814453, + "learning_rate": 1.7686750813685797e-05, + "loss": 0.8206, + "step": 24430 + }, + { + "epoch": 1.940105975510528, + "grad_norm": 0.8311532139778137, + "learning_rate": 1.767352015030034e-05, + "loss": 0.83, + "step": 24440 + }, + { + "epoch": 1.9408997995594277, + "grad_norm": 0.8196767568588257, + "learning_rate": 1.7660289486914875e-05, + "loss": 0.8425, + "step": 24450 + }, + { + "epoch": 1.941693623608327, + "grad_norm": 0.7230108976364136, + "learning_rate": 1.7647058823529414e-05, + "loss": 0.8507, + "step": 24460 + }, + { + "epoch": 1.9424874476572267, + "grad_norm": 1.056864857673645, + "learning_rate": 1.763382816014395e-05, + "loss": 0.7798, + "step": 24470 + }, + { + "epoch": 1.9432812717061263, + "grad_norm": 1.0241467952728271, + "learning_rate": 1.7620597496758488e-05, + "loss": 0.8083, + "step": 24480 + }, + { + "epoch": 1.944075095755026, + "grad_norm": 1.016441822052002, + "learning_rate": 1.7607366833373027e-05, + "loss": 0.8201, + "step": 24490 + }, + { + "epoch": 1.9448689198039255, + "grad_norm": 0.9095987677574158, + "learning_rate": 1.7594136169987563e-05, + "loss": 0.8211, + "step": 24500 + }, + { + "epoch": 1.9456627438528251, + "grad_norm": 0.7234585881233215, + "learning_rate": 1.75809055066021e-05, + "loss": 0.8225, + "step": 24510 + }, + { + "epoch": 1.9464565679017247, + "grad_norm": 0.8463301062583923, + "learning_rate": 1.756767484321664e-05, + "loss": 0.8663, + "step": 24520 + }, + { + "epoch": 1.947250391950624, + "grad_norm": 0.8381713628768921, + "learning_rate": 1.755444417983118e-05, + "loss": 0.8309, + "step": 24530 + }, + { + "epoch": 1.9480442159995237, + "grad_norm": 1.2629505395889282, + "learning_rate": 1.7541213516445715e-05, + "loss": 0.8193, + "step": 24540 + }, + { + "epoch": 1.9488380400484233, + "grad_norm": 0.889024555683136, + "learning_rate": 1.7527982853060254e-05, + "loss": 0.7851, + "step": 24550 + }, + { + "epoch": 1.9496318640973227, + "grad_norm": 1.1809180974960327, + "learning_rate": 1.751475218967479e-05, + "loss": 0.8625, + "step": 24560 + }, + { + "epoch": 1.9504256881462223, + "grad_norm": 0.973340630531311, + "learning_rate": 1.750152152628933e-05, + "loss": 0.7819, + "step": 24570 + }, + { + "epoch": 1.951219512195122, + "grad_norm": 0.9036343693733215, + "learning_rate": 1.7488290862903867e-05, + "loss": 0.7523, + "step": 24580 + }, + { + "epoch": 1.9520133362440215, + "grad_norm": 0.9107938408851624, + "learning_rate": 1.7475060199518406e-05, + "loss": 0.8277, + "step": 24590 + }, + { + "epoch": 1.9528071602929211, + "grad_norm": 0.7978836297988892, + "learning_rate": 1.746182953613294e-05, + "loss": 0.8058, + "step": 24600 + }, + { + "epoch": 1.9536009843418207, + "grad_norm": 0.9246600270271301, + "learning_rate": 1.744859887274748e-05, + "loss": 0.862, + "step": 24610 + }, + { + "epoch": 1.9543948083907203, + "grad_norm": 0.96808922290802, + "learning_rate": 1.743536820936202e-05, + "loss": 0.9027, + "step": 24620 + }, + { + "epoch": 1.9551886324396197, + "grad_norm": 0.7712015509605408, + "learning_rate": 1.7422137545976555e-05, + "loss": 0.836, + "step": 24630 + }, + { + "epoch": 1.9559824564885193, + "grad_norm": 0.9321126937866211, + "learning_rate": 1.7408906882591094e-05, + "loss": 0.836, + "step": 24640 + }, + { + "epoch": 1.956776280537419, + "grad_norm": 0.8697831630706787, + "learning_rate": 1.739567621920563e-05, + "loss": 0.8271, + "step": 24650 + }, + { + "epoch": 1.9575701045863183, + "grad_norm": 0.8572379350662231, + "learning_rate": 1.7382445555820172e-05, + "loss": 0.7872, + "step": 24660 + }, + { + "epoch": 1.958363928635218, + "grad_norm": 0.7709526419639587, + "learning_rate": 1.7369214892434707e-05, + "loss": 0.8315, + "step": 24670 + }, + { + "epoch": 1.9591577526841175, + "grad_norm": 0.867958128452301, + "learning_rate": 1.7355984229049246e-05, + "loss": 0.7875, + "step": 24680 + }, + { + "epoch": 1.9599515767330171, + "grad_norm": 0.883253276348114, + "learning_rate": 1.7342753565663782e-05, + "loss": 0.8871, + "step": 24690 + }, + { + "epoch": 1.9607454007819167, + "grad_norm": 0.8314810395240784, + "learning_rate": 1.732952290227832e-05, + "loss": 0.7559, + "step": 24700 + }, + { + "epoch": 1.9615392248308163, + "grad_norm": 0.7700093984603882, + "learning_rate": 1.731629223889286e-05, + "loss": 0.8198, + "step": 24710 + }, + { + "epoch": 1.962333048879716, + "grad_norm": 0.7861685156822205, + "learning_rate": 1.7303061575507395e-05, + "loss": 0.8823, + "step": 24720 + }, + { + "epoch": 1.9631268729286153, + "grad_norm": 0.9376586675643921, + "learning_rate": 1.7289830912121934e-05, + "loss": 0.8059, + "step": 24730 + }, + { + "epoch": 1.963920696977515, + "grad_norm": 0.9346411228179932, + "learning_rate": 1.7276600248736473e-05, + "loss": 0.8167, + "step": 24740 + }, + { + "epoch": 1.9647145210264145, + "grad_norm": 0.7460438013076782, + "learning_rate": 1.7263369585351012e-05, + "loss": 0.7703, + "step": 24750 + }, + { + "epoch": 1.965508345075314, + "grad_norm": 0.7430590391159058, + "learning_rate": 1.7250138921965548e-05, + "loss": 0.7716, + "step": 24760 + }, + { + "epoch": 1.9663021691242135, + "grad_norm": 0.9722541570663452, + "learning_rate": 1.7236908258580086e-05, + "loss": 0.7665, + "step": 24770 + }, + { + "epoch": 1.9670959931731131, + "grad_norm": 0.8619949817657471, + "learning_rate": 1.7223677595194622e-05, + "loss": 0.7947, + "step": 24780 + }, + { + "epoch": 1.9678898172220127, + "grad_norm": 0.864704966545105, + "learning_rate": 1.7210446931809164e-05, + "loss": 0.8233, + "step": 24790 + }, + { + "epoch": 1.9686836412709123, + "grad_norm": 0.8963067531585693, + "learning_rate": 1.71972162684237e-05, + "loss": 0.8249, + "step": 24800 + }, + { + "epoch": 1.969477465319812, + "grad_norm": 0.7496664524078369, + "learning_rate": 1.718398560503824e-05, + "loss": 0.7904, + "step": 24810 + }, + { + "epoch": 1.9702712893687115, + "grad_norm": 0.9141893982887268, + "learning_rate": 1.7170754941652774e-05, + "loss": 0.8094, + "step": 24820 + }, + { + "epoch": 1.9710651134176111, + "grad_norm": 0.8252711296081543, + "learning_rate": 1.7157524278267313e-05, + "loss": 0.744, + "step": 24830 + }, + { + "epoch": 1.9718589374665105, + "grad_norm": 0.7899823188781738, + "learning_rate": 1.7144293614881852e-05, + "loss": 0.7799, + "step": 24840 + }, + { + "epoch": 1.9726527615154101, + "grad_norm": 0.7791532874107361, + "learning_rate": 1.7131062951496388e-05, + "loss": 0.8205, + "step": 24850 + }, + { + "epoch": 1.9734465855643095, + "grad_norm": 0.8147314786911011, + "learning_rate": 1.7117832288110927e-05, + "loss": 0.7866, + "step": 24860 + }, + { + "epoch": 1.9742404096132091, + "grad_norm": 0.9811384677886963, + "learning_rate": 1.7104601624725462e-05, + "loss": 0.8046, + "step": 24870 + }, + { + "epoch": 1.9750342336621087, + "grad_norm": 0.6570988893508911, + "learning_rate": 1.7091370961340004e-05, + "loss": 0.8115, + "step": 24880 + }, + { + "epoch": 1.9758280577110083, + "grad_norm": 0.8411725759506226, + "learning_rate": 1.707814029795454e-05, + "loss": 0.8163, + "step": 24890 + }, + { + "epoch": 1.976621881759908, + "grad_norm": 0.9080935120582581, + "learning_rate": 1.706490963456908e-05, + "loss": 0.7894, + "step": 24900 + }, + { + "epoch": 1.9774157058088075, + "grad_norm": 0.8399326801300049, + "learning_rate": 1.7051678971183615e-05, + "loss": 0.8409, + "step": 24910 + }, + { + "epoch": 1.9782095298577071, + "grad_norm": 0.7995646595954895, + "learning_rate": 1.7038448307798153e-05, + "loss": 0.8039, + "step": 24920 + }, + { + "epoch": 1.9790033539066068, + "grad_norm": 0.7588172554969788, + "learning_rate": 1.7025217644412692e-05, + "loss": 0.8694, + "step": 24930 + }, + { + "epoch": 1.9797971779555061, + "grad_norm": 0.8422114253044128, + "learning_rate": 1.701198698102723e-05, + "loss": 0.8497, + "step": 24940 + }, + { + "epoch": 1.9805910020044057, + "grad_norm": 0.9750681519508362, + "learning_rate": 1.6998756317641767e-05, + "loss": 0.809, + "step": 24950 + }, + { + "epoch": 1.9813848260533051, + "grad_norm": 0.8295944929122925, + "learning_rate": 1.6985525654256302e-05, + "loss": 0.828, + "step": 24960 + }, + { + "epoch": 1.9821786501022047, + "grad_norm": 0.7455227375030518, + "learning_rate": 1.6972294990870845e-05, + "loss": 0.774, + "step": 24970 + }, + { + "epoch": 1.9829724741511043, + "grad_norm": 0.7973991632461548, + "learning_rate": 1.695906432748538e-05, + "loss": 0.833, + "step": 24980 + }, + { + "epoch": 1.983766298200004, + "grad_norm": 0.9970287084579468, + "learning_rate": 1.694583366409992e-05, + "loss": 0.7929, + "step": 24990 + }, + { + "epoch": 1.9845601222489035, + "grad_norm": 1.006111741065979, + "learning_rate": 1.6932603000714455e-05, + "loss": 0.7835, + "step": 25000 + }, + { + "epoch": 1.9853539462978032, + "grad_norm": 0.7539806962013245, + "learning_rate": 1.6919372337328994e-05, + "loss": 0.791, + "step": 25010 + }, + { + "epoch": 1.9861477703467028, + "grad_norm": 0.6668902635574341, + "learning_rate": 1.6906141673943533e-05, + "loss": 0.8413, + "step": 25020 + }, + { + "epoch": 1.9869415943956024, + "grad_norm": 0.7992756366729736, + "learning_rate": 1.689291101055807e-05, + "loss": 0.8181, + "step": 25030 + }, + { + "epoch": 1.9877354184445017, + "grad_norm": 0.836247980594635, + "learning_rate": 1.6879680347172607e-05, + "loss": 0.8255, + "step": 25040 + }, + { + "epoch": 1.9885292424934013, + "grad_norm": 0.701686680316925, + "learning_rate": 1.6866449683787146e-05, + "loss": 0.8523, + "step": 25050 + }, + { + "epoch": 1.9893230665423007, + "grad_norm": 0.8680205941200256, + "learning_rate": 1.6853219020401685e-05, + "loss": 0.8737, + "step": 25060 + }, + { + "epoch": 1.9901168905912003, + "grad_norm": 1.0040700435638428, + "learning_rate": 1.683998835701622e-05, + "loss": 0.8068, + "step": 25070 + }, + { + "epoch": 1.9909107146401, + "grad_norm": 0.7312654852867126, + "learning_rate": 1.682675769363076e-05, + "loss": 0.7744, + "step": 25080 + }, + { + "epoch": 1.9917045386889995, + "grad_norm": 0.8141999244689941, + "learning_rate": 1.6813527030245295e-05, + "loss": 0.8031, + "step": 25090 + }, + { + "epoch": 1.9924983627378992, + "grad_norm": 0.9395736455917358, + "learning_rate": 1.6800296366859837e-05, + "loss": 0.8556, + "step": 25100 + }, + { + "epoch": 1.9932921867867988, + "grad_norm": 1.0903936624526978, + "learning_rate": 1.6788388769812918e-05, + "loss": 0.7715, + "step": 25110 + }, + { + "epoch": 1.9940860108356984, + "grad_norm": 0.9055798053741455, + "learning_rate": 1.6775158106427457e-05, + "loss": 0.8161, + "step": 25120 + }, + { + "epoch": 1.994879834884598, + "grad_norm": 1.0567915439605713, + "learning_rate": 1.6761927443041993e-05, + "loss": 0.7872, + "step": 25130 + }, + { + "epoch": 1.9956736589334974, + "grad_norm": 0.8715507388114929, + "learning_rate": 1.6748696779656535e-05, + "loss": 0.847, + "step": 25140 + }, + { + "epoch": 1.996467482982397, + "grad_norm": 0.9350098371505737, + "learning_rate": 1.673546611627107e-05, + "loss": 0.7932, + "step": 25150 + }, + { + "epoch": 1.9972613070312966, + "grad_norm": 0.8718578219413757, + "learning_rate": 1.672223545288561e-05, + "loss": 0.8038, + "step": 25160 + }, + { + "epoch": 1.998055131080196, + "grad_norm": 0.987808883190155, + "learning_rate": 1.6709004789500145e-05, + "loss": 0.8141, + "step": 25170 + }, + { + "epoch": 1.9988489551290956, + "grad_norm": 0.8378719091415405, + "learning_rate": 1.6695774126114684e-05, + "loss": 0.7715, + "step": 25180 + }, + { + "epoch": 1.9996427791779952, + "grad_norm": 0.8598037958145142, + "learning_rate": 1.6682543462729223e-05, + "loss": 0.7998, + "step": 25190 + }, + { + "epoch": 2.0004366032268948, + "grad_norm": 0.8673446774482727, + "learning_rate": 1.666931279934376e-05, + "loss": 0.8147, + "step": 25200 + }, + { + "epoch": 2.0012304272757944, + "grad_norm": 0.8430289030075073, + "learning_rate": 1.6656082135958297e-05, + "loss": 0.801, + "step": 25210 + }, + { + "epoch": 2.002024251324694, + "grad_norm": 0.8227368593215942, + "learning_rate": 1.6642851472572836e-05, + "loss": 0.7521, + "step": 25220 + }, + { + "epoch": 2.0028180753735936, + "grad_norm": 0.8883048295974731, + "learning_rate": 1.6629620809187375e-05, + "loss": 0.8145, + "step": 25230 + }, + { + "epoch": 2.003611899422493, + "grad_norm": 0.6865272521972656, + "learning_rate": 1.661639014580191e-05, + "loss": 0.776, + "step": 25240 + }, + { + "epoch": 2.004405723471393, + "grad_norm": 0.8548721075057983, + "learning_rate": 1.660315948241645e-05, + "loss": 0.7732, + "step": 25250 + }, + { + "epoch": 2.005199547520292, + "grad_norm": 0.8403987884521484, + "learning_rate": 1.6589928819030985e-05, + "loss": 0.7927, + "step": 25260 + }, + { + "epoch": 2.0059933715691916, + "grad_norm": 0.7534387111663818, + "learning_rate": 1.6576698155645527e-05, + "loss": 0.8369, + "step": 25270 + }, + { + "epoch": 2.006787195618091, + "grad_norm": 0.8503125309944153, + "learning_rate": 1.6563467492260063e-05, + "loss": 0.7955, + "step": 25280 + }, + { + "epoch": 2.0075810196669908, + "grad_norm": 0.6855154633522034, + "learning_rate": 1.6550236828874602e-05, + "loss": 0.7958, + "step": 25290 + }, + { + "epoch": 2.0083748437158904, + "grad_norm": 0.7701917290687561, + "learning_rate": 1.6537006165489137e-05, + "loss": 0.7644, + "step": 25300 + }, + { + "epoch": 2.00916866776479, + "grad_norm": 0.9168187379837036, + "learning_rate": 1.6523775502103676e-05, + "loss": 0.8316, + "step": 25310 + }, + { + "epoch": 2.0099624918136896, + "grad_norm": 0.833348274230957, + "learning_rate": 1.6510544838718215e-05, + "loss": 0.7625, + "step": 25320 + }, + { + "epoch": 2.010756315862589, + "grad_norm": 0.8914290070533752, + "learning_rate": 1.649731417533275e-05, + "loss": 0.7627, + "step": 25330 + }, + { + "epoch": 2.011550139911489, + "grad_norm": 1.0038100481033325, + "learning_rate": 1.648408351194729e-05, + "loss": 0.784, + "step": 25340 + }, + { + "epoch": 2.0123439639603884, + "grad_norm": 0.8875173330307007, + "learning_rate": 1.6470852848561825e-05, + "loss": 0.8098, + "step": 25350 + }, + { + "epoch": 2.0131377880092876, + "grad_norm": 0.9379634857177734, + "learning_rate": 1.6457622185176368e-05, + "loss": 0.8373, + "step": 25360 + }, + { + "epoch": 2.013931612058187, + "grad_norm": 0.7726089954376221, + "learning_rate": 1.6444391521790903e-05, + "loss": 0.7727, + "step": 25370 + }, + { + "epoch": 2.0147254361070868, + "grad_norm": 0.7188129425048828, + "learning_rate": 1.6431160858405442e-05, + "loss": 0.7634, + "step": 25380 + }, + { + "epoch": 2.0155192601559864, + "grad_norm": 0.7986645698547363, + "learning_rate": 1.6417930195019978e-05, + "loss": 0.8123, + "step": 25390 + }, + { + "epoch": 2.016313084204886, + "grad_norm": 0.720370888710022, + "learning_rate": 1.6404699531634517e-05, + "loss": 0.7428, + "step": 25400 + }, + { + "epoch": 2.0171069082537856, + "grad_norm": 0.9025735259056091, + "learning_rate": 1.6391468868249055e-05, + "loss": 0.817, + "step": 25410 + }, + { + "epoch": 2.017900732302685, + "grad_norm": 0.8990045785903931, + "learning_rate": 1.637823820486359e-05, + "loss": 0.7254, + "step": 25420 + }, + { + "epoch": 2.018694556351585, + "grad_norm": 0.9403473138809204, + "learning_rate": 1.636500754147813e-05, + "loss": 0.7958, + "step": 25430 + }, + { + "epoch": 2.0194883804004844, + "grad_norm": 1.0497170686721802, + "learning_rate": 1.635177687809267e-05, + "loss": 0.7647, + "step": 25440 + }, + { + "epoch": 2.020282204449384, + "grad_norm": 1.1026662588119507, + "learning_rate": 1.6338546214707208e-05, + "loss": 0.7541, + "step": 25450 + }, + { + "epoch": 2.021076028498283, + "grad_norm": 0.9010255336761475, + "learning_rate": 1.6325315551321743e-05, + "loss": 0.7501, + "step": 25460 + }, + { + "epoch": 2.0218698525471828, + "grad_norm": 0.6994714736938477, + "learning_rate": 1.6312084887936282e-05, + "loss": 0.7985, + "step": 25470 + }, + { + "epoch": 2.0226636765960824, + "grad_norm": 0.7155155539512634, + "learning_rate": 1.6298854224550818e-05, + "loss": 0.7136, + "step": 25480 + }, + { + "epoch": 2.023457500644982, + "grad_norm": 0.9782127141952515, + "learning_rate": 1.628562356116536e-05, + "loss": 0.782, + "step": 25490 + }, + { + "epoch": 2.0242513246938816, + "grad_norm": 0.8731641173362732, + "learning_rate": 1.6272392897779896e-05, + "loss": 0.7801, + "step": 25500 + }, + { + "epoch": 2.025045148742781, + "grad_norm": 0.8580078482627869, + "learning_rate": 1.6259162234394435e-05, + "loss": 0.833, + "step": 25510 + }, + { + "epoch": 2.025838972791681, + "grad_norm": 0.9071916937828064, + "learning_rate": 1.624593157100897e-05, + "loss": 0.8026, + "step": 25520 + }, + { + "epoch": 2.0266327968405804, + "grad_norm": 0.9354455471038818, + "learning_rate": 1.623270090762351e-05, + "loss": 0.7792, + "step": 25530 + }, + { + "epoch": 2.02742662088948, + "grad_norm": 0.8776410222053528, + "learning_rate": 1.6219470244238048e-05, + "loss": 0.7958, + "step": 25540 + }, + { + "epoch": 2.0282204449383796, + "grad_norm": 0.7995261549949646, + "learning_rate": 1.6206239580852584e-05, + "loss": 0.7811, + "step": 25550 + }, + { + "epoch": 2.0290142689872788, + "grad_norm": 0.8553417325019836, + "learning_rate": 1.6193008917467122e-05, + "loss": 0.7713, + "step": 25560 + }, + { + "epoch": 2.0298080930361784, + "grad_norm": 0.9338173270225525, + "learning_rate": 1.6179778254081658e-05, + "loss": 0.7451, + "step": 25570 + }, + { + "epoch": 2.030601917085078, + "grad_norm": 0.9235241413116455, + "learning_rate": 1.61665475906962e-05, + "loss": 0.7126, + "step": 25580 + }, + { + "epoch": 2.0313957411339776, + "grad_norm": 0.864325761795044, + "learning_rate": 1.6153316927310736e-05, + "loss": 0.7988, + "step": 25590 + }, + { + "epoch": 2.032189565182877, + "grad_norm": 0.7986742854118347, + "learning_rate": 1.6140086263925275e-05, + "loss": 0.7959, + "step": 25600 + }, + { + "epoch": 2.032983389231777, + "grad_norm": 0.9655138254165649, + "learning_rate": 1.612685560053981e-05, + "loss": 0.8415, + "step": 25610 + }, + { + "epoch": 2.0337772132806764, + "grad_norm": 0.6859591603279114, + "learning_rate": 1.611362493715435e-05, + "loss": 0.7661, + "step": 25620 + }, + { + "epoch": 2.034571037329576, + "grad_norm": 0.7954578399658203, + "learning_rate": 1.6100394273768888e-05, + "loss": 0.811, + "step": 25630 + }, + { + "epoch": 2.0353648613784756, + "grad_norm": 0.9830471873283386, + "learning_rate": 1.6087163610383427e-05, + "loss": 0.7648, + "step": 25640 + }, + { + "epoch": 2.036158685427375, + "grad_norm": 0.9153994917869568, + "learning_rate": 1.6073932946997963e-05, + "loss": 0.7447, + "step": 25650 + }, + { + "epoch": 2.0369525094762744, + "grad_norm": 0.9104596972465515, + "learning_rate": 1.60607022836125e-05, + "loss": 0.8326, + "step": 25660 + }, + { + "epoch": 2.037746333525174, + "grad_norm": 0.8871384859085083, + "learning_rate": 1.604747162022704e-05, + "loss": 0.8012, + "step": 25670 + }, + { + "epoch": 2.0385401575740736, + "grad_norm": 0.800845205783844, + "learning_rate": 1.6034240956841576e-05, + "loss": 0.7772, + "step": 25680 + }, + { + "epoch": 2.039333981622973, + "grad_norm": 1.036254644393921, + "learning_rate": 1.6021010293456115e-05, + "loss": 0.7825, + "step": 25690 + }, + { + "epoch": 2.040127805671873, + "grad_norm": 0.8596107363700867, + "learning_rate": 1.600777963007065e-05, + "loss": 0.7614, + "step": 25700 + }, + { + "epoch": 2.0409216297207724, + "grad_norm": 0.7805830240249634, + "learning_rate": 1.5994548966685193e-05, + "loss": 0.8176, + "step": 25710 + }, + { + "epoch": 2.041715453769672, + "grad_norm": 1.0922433137893677, + "learning_rate": 1.598131830329973e-05, + "loss": 0.7965, + "step": 25720 + }, + { + "epoch": 2.0425092778185716, + "grad_norm": 0.7439164519309998, + "learning_rate": 1.5968087639914267e-05, + "loss": 0.8144, + "step": 25730 + }, + { + "epoch": 2.043303101867471, + "grad_norm": 0.7576305270195007, + "learning_rate": 1.5954856976528803e-05, + "loss": 0.8107, + "step": 25740 + }, + { + "epoch": 2.044096925916371, + "grad_norm": 0.7325326800346375, + "learning_rate": 1.5941626313143342e-05, + "loss": 0.7942, + "step": 25750 + }, + { + "epoch": 2.04489074996527, + "grad_norm": 0.7904483079910278, + "learning_rate": 1.592839564975788e-05, + "loss": 0.8301, + "step": 25760 + }, + { + "epoch": 2.0456845740141696, + "grad_norm": 0.8427342772483826, + "learning_rate": 1.5915164986372416e-05, + "loss": 0.7408, + "step": 25770 + }, + { + "epoch": 2.046478398063069, + "grad_norm": 0.9607779383659363, + "learning_rate": 1.5901934322986955e-05, + "loss": 0.7796, + "step": 25780 + }, + { + "epoch": 2.047272222111969, + "grad_norm": 0.9316497445106506, + "learning_rate": 1.588870365960149e-05, + "loss": 0.8237, + "step": 25790 + }, + { + "epoch": 2.0480660461608684, + "grad_norm": 0.8862087726593018, + "learning_rate": 1.5875472996216033e-05, + "loss": 0.7505, + "step": 25800 + }, + { + "epoch": 2.048859870209768, + "grad_norm": 0.9712191820144653, + "learning_rate": 1.586224233283057e-05, + "loss": 0.7987, + "step": 25810 + }, + { + "epoch": 2.0496536942586676, + "grad_norm": 0.8523909449577332, + "learning_rate": 1.5849011669445107e-05, + "loss": 0.7585, + "step": 25820 + }, + { + "epoch": 2.050447518307567, + "grad_norm": 0.9257214069366455, + "learning_rate": 1.5835781006059643e-05, + "loss": 0.7431, + "step": 25830 + }, + { + "epoch": 2.051241342356467, + "grad_norm": 0.9414131045341492, + "learning_rate": 1.5822550342674182e-05, + "loss": 0.7289, + "step": 25840 + }, + { + "epoch": 2.0520351664053664, + "grad_norm": 0.7903403639793396, + "learning_rate": 1.580931967928872e-05, + "loss": 0.7282, + "step": 25850 + }, + { + "epoch": 2.052828990454266, + "grad_norm": 0.7305004000663757, + "learning_rate": 1.579608901590326e-05, + "loss": 0.7352, + "step": 25860 + }, + { + "epoch": 2.053622814503165, + "grad_norm": 0.8797780275344849, + "learning_rate": 1.5782858352517795e-05, + "loss": 0.7377, + "step": 25870 + }, + { + "epoch": 2.054416638552065, + "grad_norm": 1.0097732543945312, + "learning_rate": 1.5769627689132334e-05, + "loss": 0.7732, + "step": 25880 + }, + { + "epoch": 2.0552104626009644, + "grad_norm": 0.8882811069488525, + "learning_rate": 1.5756397025746873e-05, + "loss": 0.8136, + "step": 25890 + }, + { + "epoch": 2.056004286649864, + "grad_norm": 0.7051288485527039, + "learning_rate": 1.574316636236141e-05, + "loss": 0.7464, + "step": 25900 + }, + { + "epoch": 2.0567981106987636, + "grad_norm": 0.9617118239402771, + "learning_rate": 1.5729935698975948e-05, + "loss": 0.7851, + "step": 25910 + }, + { + "epoch": 2.057591934747663, + "grad_norm": 0.8823207020759583, + "learning_rate": 1.5716705035590483e-05, + "loss": 0.8155, + "step": 25920 + }, + { + "epoch": 2.058385758796563, + "grad_norm": 1.0698038339614868, + "learning_rate": 1.5703474372205022e-05, + "loss": 0.798, + "step": 25930 + }, + { + "epoch": 2.0591795828454624, + "grad_norm": 1.0213465690612793, + "learning_rate": 1.569024370881956e-05, + "loss": 0.7397, + "step": 25940 + }, + { + "epoch": 2.059973406894362, + "grad_norm": 0.8252881765365601, + "learning_rate": 1.56770130454341e-05, + "loss": 0.7786, + "step": 25950 + }, + { + "epoch": 2.0607672309432616, + "grad_norm": 0.79705810546875, + "learning_rate": 1.5663782382048635e-05, + "loss": 0.7649, + "step": 25960 + }, + { + "epoch": 2.061561054992161, + "grad_norm": 0.7776365280151367, + "learning_rate": 1.5650551718663174e-05, + "loss": 0.7646, + "step": 25970 + }, + { + "epoch": 2.0623548790410604, + "grad_norm": 0.7812894582748413, + "learning_rate": 1.5637321055277713e-05, + "loss": 0.7598, + "step": 25980 + }, + { + "epoch": 2.06314870308996, + "grad_norm": 0.9928243160247803, + "learning_rate": 1.5624090391892252e-05, + "loss": 0.7991, + "step": 25990 + }, + { + "epoch": 2.0639425271388596, + "grad_norm": 0.9473987221717834, + "learning_rate": 1.5610859728506788e-05, + "loss": 0.7417, + "step": 26000 + }, + { + "epoch": 2.0647363511877592, + "grad_norm": 0.9683297276496887, + "learning_rate": 1.5597629065121323e-05, + "loss": 0.7986, + "step": 26010 + }, + { + "epoch": 2.065530175236659, + "grad_norm": 0.9280180335044861, + "learning_rate": 1.5584398401735866e-05, + "loss": 0.7455, + "step": 26020 + }, + { + "epoch": 2.0663239992855584, + "grad_norm": 0.8765127658843994, + "learning_rate": 1.55711677383504e-05, + "loss": 0.8731, + "step": 26030 + }, + { + "epoch": 2.067117823334458, + "grad_norm": 0.7415598034858704, + "learning_rate": 1.555793707496494e-05, + "loss": 0.8217, + "step": 26040 + }, + { + "epoch": 2.0679116473833576, + "grad_norm": 0.8780352473258972, + "learning_rate": 1.554602947791802e-05, + "loss": 0.8236, + "step": 26050 + }, + { + "epoch": 2.0687054714322572, + "grad_norm": 0.9112974405288696, + "learning_rate": 1.5532798814532563e-05, + "loss": 0.7893, + "step": 26060 + }, + { + "epoch": 2.0694992954811564, + "grad_norm": 0.726959228515625, + "learning_rate": 1.55195681511471e-05, + "loss": 0.7708, + "step": 26070 + }, + { + "epoch": 2.070293119530056, + "grad_norm": 0.9605491161346436, + "learning_rate": 1.5506337487761638e-05, + "loss": 0.7306, + "step": 26080 + }, + { + "epoch": 2.0710869435789556, + "grad_norm": 0.6991214752197266, + "learning_rate": 1.5493106824376173e-05, + "loss": 0.8029, + "step": 26090 + }, + { + "epoch": 2.0718807676278552, + "grad_norm": 0.947589099407196, + "learning_rate": 1.5479876160990712e-05, + "loss": 0.7885, + "step": 26100 + }, + { + "epoch": 2.072674591676755, + "grad_norm": 0.9293213486671448, + "learning_rate": 1.546664549760525e-05, + "loss": 0.7285, + "step": 26110 + }, + { + "epoch": 2.0734684157256544, + "grad_norm": 1.076295018196106, + "learning_rate": 1.545341483421979e-05, + "loss": 0.7704, + "step": 26120 + }, + { + "epoch": 2.074262239774554, + "grad_norm": 0.894410252571106, + "learning_rate": 1.5440184170834326e-05, + "loss": 0.7298, + "step": 26130 + }, + { + "epoch": 2.0750560638234536, + "grad_norm": 0.9316887855529785, + "learning_rate": 1.5426953507448865e-05, + "loss": 0.7989, + "step": 26140 + }, + { + "epoch": 2.0758498878723532, + "grad_norm": 0.8590110540390015, + "learning_rate": 1.5413722844063404e-05, + "loss": 0.7369, + "step": 26150 + }, + { + "epoch": 2.076643711921253, + "grad_norm": 0.8195733428001404, + "learning_rate": 1.540049218067794e-05, + "loss": 0.7837, + "step": 26160 + }, + { + "epoch": 2.077437535970152, + "grad_norm": 0.8563514351844788, + "learning_rate": 1.5387261517292478e-05, + "loss": 0.7514, + "step": 26170 + }, + { + "epoch": 2.0782313600190516, + "grad_norm": 0.8873001933097839, + "learning_rate": 1.5374030853907014e-05, + "loss": 0.7356, + "step": 26180 + }, + { + "epoch": 2.0790251840679512, + "grad_norm": 0.9553200006484985, + "learning_rate": 1.5360800190521556e-05, + "loss": 0.7611, + "step": 26190 + }, + { + "epoch": 2.079819008116851, + "grad_norm": 0.782842218875885, + "learning_rate": 1.534756952713609e-05, + "loss": 0.7383, + "step": 26200 + }, + { + "epoch": 2.0806128321657504, + "grad_norm": 0.9627733826637268, + "learning_rate": 1.533433886375063e-05, + "loss": 0.7796, + "step": 26210 + }, + { + "epoch": 2.08140665621465, + "grad_norm": 0.7810627818107605, + "learning_rate": 1.5321108200365166e-05, + "loss": 0.7968, + "step": 26220 + }, + { + "epoch": 2.0822004802635496, + "grad_norm": 0.8176783919334412, + "learning_rate": 1.5307877536979705e-05, + "loss": 0.7525, + "step": 26230 + }, + { + "epoch": 2.0829943043124493, + "grad_norm": 0.9359534382820129, + "learning_rate": 1.5294646873594244e-05, + "loss": 0.8248, + "step": 26240 + }, + { + "epoch": 2.083788128361349, + "grad_norm": 0.7079921960830688, + "learning_rate": 1.528141621020878e-05, + "loss": 0.8115, + "step": 26250 + }, + { + "epoch": 2.0845819524102485, + "grad_norm": 0.8684226870536804, + "learning_rate": 1.5268185546823318e-05, + "loss": 0.7866, + "step": 26260 + }, + { + "epoch": 2.0853757764591476, + "grad_norm": 0.8505408763885498, + "learning_rate": 1.5254954883437855e-05, + "loss": 0.7144, + "step": 26270 + }, + { + "epoch": 2.0861696005080472, + "grad_norm": 0.953670084476471, + "learning_rate": 1.5241724220052394e-05, + "loss": 0.7999, + "step": 26280 + }, + { + "epoch": 2.086963424556947, + "grad_norm": 1.1968601942062378, + "learning_rate": 1.5228493556666932e-05, + "loss": 0.7863, + "step": 26290 + }, + { + "epoch": 2.0877572486058464, + "grad_norm": 0.7209346294403076, + "learning_rate": 1.521526289328147e-05, + "loss": 0.843, + "step": 26300 + }, + { + "epoch": 2.088551072654746, + "grad_norm": 0.8510392904281616, + "learning_rate": 1.5202032229896008e-05, + "loss": 0.831, + "step": 26310 + }, + { + "epoch": 2.0893448967036456, + "grad_norm": 0.899815559387207, + "learning_rate": 1.5188801566510547e-05, + "loss": 0.7577, + "step": 26320 + }, + { + "epoch": 2.0901387207525453, + "grad_norm": 0.8000263571739197, + "learning_rate": 1.5175570903125084e-05, + "loss": 0.7918, + "step": 26330 + }, + { + "epoch": 2.090932544801445, + "grad_norm": 0.7844998240470886, + "learning_rate": 1.5162340239739623e-05, + "loss": 0.7903, + "step": 26340 + }, + { + "epoch": 2.0917263688503445, + "grad_norm": 1.0130077600479126, + "learning_rate": 1.5149109576354158e-05, + "loss": 0.7545, + "step": 26350 + }, + { + "epoch": 2.092520192899244, + "grad_norm": 0.8703743815422058, + "learning_rate": 1.5135878912968696e-05, + "loss": 0.8129, + "step": 26360 + }, + { + "epoch": 2.0933140169481432, + "grad_norm": 0.6894111633300781, + "learning_rate": 1.5122648249583235e-05, + "loss": 0.7017, + "step": 26370 + }, + { + "epoch": 2.094107840997043, + "grad_norm": 0.9596577882766724, + "learning_rate": 1.5109417586197772e-05, + "loss": 0.6868, + "step": 26380 + }, + { + "epoch": 2.0949016650459424, + "grad_norm": 0.878044843673706, + "learning_rate": 1.509618692281231e-05, + "loss": 0.8629, + "step": 26390 + }, + { + "epoch": 2.095695489094842, + "grad_norm": 0.9057536125183105, + "learning_rate": 1.5082956259426848e-05, + "loss": 0.7812, + "step": 26400 + }, + { + "epoch": 2.0964893131437417, + "grad_norm": 0.8363611698150635, + "learning_rate": 1.5069725596041387e-05, + "loss": 0.7738, + "step": 26410 + }, + { + "epoch": 2.0972831371926413, + "grad_norm": 0.7889995574951172, + "learning_rate": 1.5056494932655924e-05, + "loss": 0.8144, + "step": 26420 + }, + { + "epoch": 2.098076961241541, + "grad_norm": 0.9280676245689392, + "learning_rate": 1.5043264269270463e-05, + "loss": 0.7792, + "step": 26430 + }, + { + "epoch": 2.0988707852904405, + "grad_norm": 1.015638828277588, + "learning_rate": 1.5030033605885e-05, + "loss": 0.7745, + "step": 26440 + }, + { + "epoch": 2.09966460933934, + "grad_norm": 0.9778096079826355, + "learning_rate": 1.501680294249954e-05, + "loss": 0.7636, + "step": 26450 + }, + { + "epoch": 2.1004584333882397, + "grad_norm": 0.9082860946655273, + "learning_rate": 1.5003572279114075e-05, + "loss": 0.7399, + "step": 26460 + }, + { + "epoch": 2.101252257437139, + "grad_norm": 0.8969444632530212, + "learning_rate": 1.4990341615728615e-05, + "loss": 0.7326, + "step": 26470 + }, + { + "epoch": 2.1020460814860384, + "grad_norm": 0.9246445298194885, + "learning_rate": 1.4977110952343151e-05, + "loss": 0.789, + "step": 26480 + }, + { + "epoch": 2.102839905534938, + "grad_norm": 0.9216938614845276, + "learning_rate": 1.4963880288957688e-05, + "loss": 0.821, + "step": 26490 + }, + { + "epoch": 2.1036337295838377, + "grad_norm": 0.7816628813743591, + "learning_rate": 1.4950649625572227e-05, + "loss": 0.7705, + "step": 26500 + }, + { + "epoch": 2.1044275536327373, + "grad_norm": 1.04630446434021, + "learning_rate": 1.4937418962186764e-05, + "loss": 0.7733, + "step": 26510 + }, + { + "epoch": 2.105221377681637, + "grad_norm": 0.9890500903129578, + "learning_rate": 1.4924188298801303e-05, + "loss": 0.8013, + "step": 26520 + }, + { + "epoch": 2.1060152017305365, + "grad_norm": 0.8225802779197693, + "learning_rate": 1.491095763541584e-05, + "loss": 0.7539, + "step": 26530 + }, + { + "epoch": 2.106809025779436, + "grad_norm": 0.7966319918632507, + "learning_rate": 1.489772697203038e-05, + "loss": 0.7814, + "step": 26540 + }, + { + "epoch": 2.1076028498283357, + "grad_norm": 0.9166170954704285, + "learning_rate": 1.4884496308644915e-05, + "loss": 0.6954, + "step": 26550 + }, + { + "epoch": 2.1083966738772353, + "grad_norm": 1.0586912631988525, + "learning_rate": 1.4871265645259456e-05, + "loss": 0.7287, + "step": 26560 + }, + { + "epoch": 2.109190497926135, + "grad_norm": 0.9350046515464783, + "learning_rate": 1.4858034981873991e-05, + "loss": 0.7704, + "step": 26570 + }, + { + "epoch": 2.109984321975034, + "grad_norm": 0.9269388914108276, + "learning_rate": 1.4844804318488532e-05, + "loss": 0.8284, + "step": 26580 + }, + { + "epoch": 2.1107781460239337, + "grad_norm": 0.7918737530708313, + "learning_rate": 1.4831573655103067e-05, + "loss": 0.7947, + "step": 26590 + }, + { + "epoch": 2.1115719700728333, + "grad_norm": 0.8778436183929443, + "learning_rate": 1.4818342991717604e-05, + "loss": 0.7774, + "step": 26600 + }, + { + "epoch": 2.112365794121733, + "grad_norm": 1.0038212537765503, + "learning_rate": 1.4805112328332143e-05, + "loss": 0.7356, + "step": 26610 + }, + { + "epoch": 2.1131596181706325, + "grad_norm": 0.811038076877594, + "learning_rate": 1.479188166494668e-05, + "loss": 0.811, + "step": 26620 + }, + { + "epoch": 2.113953442219532, + "grad_norm": 1.1587707996368408, + "learning_rate": 1.477865100156122e-05, + "loss": 0.8549, + "step": 26630 + }, + { + "epoch": 2.1147472662684317, + "grad_norm": 0.74503493309021, + "learning_rate": 1.4765420338175757e-05, + "loss": 0.7897, + "step": 26640 + }, + { + "epoch": 2.1155410903173313, + "grad_norm": 0.9020684361457825, + "learning_rate": 1.4752189674790296e-05, + "loss": 0.7894, + "step": 26650 + }, + { + "epoch": 2.116334914366231, + "grad_norm": 0.7925030589103699, + "learning_rate": 1.4738959011404831e-05, + "loss": 0.8061, + "step": 26660 + }, + { + "epoch": 2.1171287384151305, + "grad_norm": 0.8654730916023254, + "learning_rate": 1.4725728348019372e-05, + "loss": 0.7569, + "step": 26670 + }, + { + "epoch": 2.1179225624640297, + "grad_norm": 0.9839343428611755, + "learning_rate": 1.4712497684633907e-05, + "loss": 0.8012, + "step": 26680 + }, + { + "epoch": 2.1187163865129293, + "grad_norm": 0.8837997317314148, + "learning_rate": 1.4699267021248448e-05, + "loss": 0.8263, + "step": 26690 + }, + { + "epoch": 2.119510210561829, + "grad_norm": 1.0431504249572754, + "learning_rate": 1.4686036357862984e-05, + "loss": 0.8411, + "step": 26700 + }, + { + "epoch": 2.1203040346107285, + "grad_norm": 0.9431108236312866, + "learning_rate": 1.467280569447752e-05, + "loss": 0.786, + "step": 26710 + }, + { + "epoch": 2.121097858659628, + "grad_norm": 0.8475701212882996, + "learning_rate": 1.465957503109206e-05, + "loss": 0.8422, + "step": 26720 + }, + { + "epoch": 2.1218916827085277, + "grad_norm": 0.9545729160308838, + "learning_rate": 1.4646344367706597e-05, + "loss": 0.7607, + "step": 26730 + }, + { + "epoch": 2.1226855067574273, + "grad_norm": 0.9317882061004639, + "learning_rate": 1.4633113704321136e-05, + "loss": 0.7591, + "step": 26740 + }, + { + "epoch": 2.123479330806327, + "grad_norm": 0.8636899590492249, + "learning_rate": 1.4619883040935673e-05, + "loss": 0.7443, + "step": 26750 + }, + { + "epoch": 2.1242731548552265, + "grad_norm": 0.9780907034873962, + "learning_rate": 1.4606652377550212e-05, + "loss": 0.7513, + "step": 26760 + }, + { + "epoch": 2.125066978904126, + "grad_norm": 0.9252169132232666, + "learning_rate": 1.4593421714164748e-05, + "loss": 0.8244, + "step": 26770 + }, + { + "epoch": 2.1258608029530253, + "grad_norm": 0.8407522439956665, + "learning_rate": 1.4580191050779288e-05, + "loss": 0.7896, + "step": 26780 + }, + { + "epoch": 2.126654627001925, + "grad_norm": 0.8842277526855469, + "learning_rate": 1.4566960387393824e-05, + "loss": 0.8509, + "step": 26790 + }, + { + "epoch": 2.1274484510508245, + "grad_norm": 0.8318625092506409, + "learning_rate": 1.4553729724008364e-05, + "loss": 0.7577, + "step": 26800 + }, + { + "epoch": 2.128242275099724, + "grad_norm": 1.0206186771392822, + "learning_rate": 1.45404990606229e-05, + "loss": 0.7706, + "step": 26810 + }, + { + "epoch": 2.1290360991486237, + "grad_norm": 0.7975717782974243, + "learning_rate": 1.4527268397237437e-05, + "loss": 0.7187, + "step": 26820 + }, + { + "epoch": 2.1298299231975233, + "grad_norm": 0.9346230030059814, + "learning_rate": 1.4514037733851976e-05, + "loss": 0.7833, + "step": 26830 + }, + { + "epoch": 2.130623747246423, + "grad_norm": 0.8083703517913818, + "learning_rate": 1.4500807070466513e-05, + "loss": 0.8184, + "step": 26840 + }, + { + "epoch": 2.1314175712953225, + "grad_norm": 1.0353641510009766, + "learning_rate": 1.4487576407081052e-05, + "loss": 0.8394, + "step": 26850 + }, + { + "epoch": 2.132211395344222, + "grad_norm": 0.8950655460357666, + "learning_rate": 1.447434574369559e-05, + "loss": 0.8077, + "step": 26860 + }, + { + "epoch": 2.1330052193931217, + "grad_norm": 1.034495234489441, + "learning_rate": 1.4461115080310128e-05, + "loss": 0.7665, + "step": 26870 + }, + { + "epoch": 2.133799043442021, + "grad_norm": 0.8213537335395813, + "learning_rate": 1.4447884416924664e-05, + "loss": 0.8224, + "step": 26880 + }, + { + "epoch": 2.1345928674909205, + "grad_norm": 1.1027780771255493, + "learning_rate": 1.4434653753539205e-05, + "loss": 0.7717, + "step": 26890 + }, + { + "epoch": 2.13538669153982, + "grad_norm": 1.02985417842865, + "learning_rate": 1.442142309015374e-05, + "loss": 0.7465, + "step": 26900 + }, + { + "epoch": 2.1361805155887197, + "grad_norm": 0.7433731555938721, + "learning_rate": 1.440819242676828e-05, + "loss": 0.7882, + "step": 26910 + }, + { + "epoch": 2.1369743396376193, + "grad_norm": 1.0033385753631592, + "learning_rate": 1.4394961763382816e-05, + "loss": 0.8659, + "step": 26920 + }, + { + "epoch": 2.137768163686519, + "grad_norm": 0.7365638613700867, + "learning_rate": 1.4381731099997355e-05, + "loss": 0.7062, + "step": 26930 + }, + { + "epoch": 2.1385619877354185, + "grad_norm": 1.0388671159744263, + "learning_rate": 1.4368500436611892e-05, + "loss": 0.8148, + "step": 26940 + }, + { + "epoch": 2.139355811784318, + "grad_norm": 0.9410164952278137, + "learning_rate": 1.435526977322643e-05, + "loss": 0.6894, + "step": 26950 + }, + { + "epoch": 2.1401496358332177, + "grad_norm": 0.903447151184082, + "learning_rate": 1.4342039109840969e-05, + "loss": 0.7527, + "step": 26960 + }, + { + "epoch": 2.1409434598821173, + "grad_norm": 1.121335744857788, + "learning_rate": 1.4328808446455506e-05, + "loss": 0.8063, + "step": 26970 + }, + { + "epoch": 2.1417372839310165, + "grad_norm": 0.8166680335998535, + "learning_rate": 1.4315577783070045e-05, + "loss": 0.8217, + "step": 26980 + }, + { + "epoch": 2.142531107979916, + "grad_norm": 0.8533350825309753, + "learning_rate": 1.430234711968458e-05, + "loss": 0.8218, + "step": 26990 + }, + { + "epoch": 2.1433249320288157, + "grad_norm": 0.9793769121170044, + "learning_rate": 1.4289116456299121e-05, + "loss": 0.7345, + "step": 27000 + }, + { + "epoch": 2.1441187560777153, + "grad_norm": 0.9998401403427124, + "learning_rate": 1.4275885792913656e-05, + "loss": 0.7736, + "step": 27010 + }, + { + "epoch": 2.144912580126615, + "grad_norm": 1.0122779607772827, + "learning_rate": 1.4262655129528197e-05, + "loss": 0.7703, + "step": 27020 + }, + { + "epoch": 2.1457064041755145, + "grad_norm": 1.040964961051941, + "learning_rate": 1.4249424466142733e-05, + "loss": 0.7899, + "step": 27030 + }, + { + "epoch": 2.146500228224414, + "grad_norm": 0.8178542852401733, + "learning_rate": 1.4236193802757272e-05, + "loss": 0.7742, + "step": 27040 + }, + { + "epoch": 2.1472940522733137, + "grad_norm": 0.8384308815002441, + "learning_rate": 1.4222963139371809e-05, + "loss": 0.8275, + "step": 27050 + }, + { + "epoch": 2.1480878763222133, + "grad_norm": 0.9705979228019714, + "learning_rate": 1.4209732475986346e-05, + "loss": 0.7532, + "step": 27060 + }, + { + "epoch": 2.148881700371113, + "grad_norm": 0.9038788080215454, + "learning_rate": 1.4196501812600885e-05, + "loss": 0.8162, + "step": 27070 + }, + { + "epoch": 2.1496755244200125, + "grad_norm": 0.9565449953079224, + "learning_rate": 1.4183271149215422e-05, + "loss": 0.7742, + "step": 27080 + }, + { + "epoch": 2.1504693484689117, + "grad_norm": 0.9967783093452454, + "learning_rate": 1.4170040485829961e-05, + "loss": 0.8269, + "step": 27090 + }, + { + "epoch": 2.1512631725178113, + "grad_norm": 0.8932458162307739, + "learning_rate": 1.4156809822444497e-05, + "loss": 0.746, + "step": 27100 + }, + { + "epoch": 2.152056996566711, + "grad_norm": 0.9235972762107849, + "learning_rate": 1.4143579159059037e-05, + "loss": 0.8389, + "step": 27110 + }, + { + "epoch": 2.1528508206156105, + "grad_norm": 0.9918680787086487, + "learning_rate": 1.4130348495673573e-05, + "loss": 0.7368, + "step": 27120 + }, + { + "epoch": 2.15364464466451, + "grad_norm": 1.0589817762374878, + "learning_rate": 1.4117117832288113e-05, + "loss": 0.708, + "step": 27130 + }, + { + "epoch": 2.1544384687134097, + "grad_norm": 0.8974472284317017, + "learning_rate": 1.4103887168902649e-05, + "loss": 0.8292, + "step": 27140 + }, + { + "epoch": 2.1552322927623093, + "grad_norm": 0.8955181837081909, + "learning_rate": 1.4090656505517188e-05, + "loss": 0.7798, + "step": 27150 + }, + { + "epoch": 2.156026116811209, + "grad_norm": 1.0225322246551514, + "learning_rate": 1.4077425842131725e-05, + "loss": 0.7567, + "step": 27160 + }, + { + "epoch": 2.1568199408601085, + "grad_norm": 0.7848957180976868, + "learning_rate": 1.4064195178746262e-05, + "loss": 0.8015, + "step": 27170 + }, + { + "epoch": 2.1576137649090077, + "grad_norm": 0.916002631187439, + "learning_rate": 1.4050964515360801e-05, + "loss": 0.8266, + "step": 27180 + }, + { + "epoch": 2.1584075889579073, + "grad_norm": 0.8571447134017944, + "learning_rate": 1.4037733851975337e-05, + "loss": 0.7878, + "step": 27190 + }, + { + "epoch": 2.159201413006807, + "grad_norm": 0.8290430903434753, + "learning_rate": 1.4024503188589877e-05, + "loss": 0.7503, + "step": 27200 + }, + { + "epoch": 2.1599952370557065, + "grad_norm": 0.9717968702316284, + "learning_rate": 1.4011272525204413e-05, + "loss": 0.7839, + "step": 27210 + }, + { + "epoch": 2.160789061104606, + "grad_norm": 0.8097538948059082, + "learning_rate": 1.3998041861818954e-05, + "loss": 0.784, + "step": 27220 + }, + { + "epoch": 2.1615828851535057, + "grad_norm": 1.0527442693710327, + "learning_rate": 1.3984811198433489e-05, + "loss": 0.7795, + "step": 27230 + }, + { + "epoch": 2.1623767092024053, + "grad_norm": 0.7761133909225464, + "learning_rate": 1.397158053504803e-05, + "loss": 0.796, + "step": 27240 + }, + { + "epoch": 2.163170533251305, + "grad_norm": 1.009090781211853, + "learning_rate": 1.3958349871662565e-05, + "loss": 0.7895, + "step": 27250 + }, + { + "epoch": 2.1639643573002045, + "grad_norm": 0.9476297497749329, + "learning_rate": 1.3945119208277104e-05, + "loss": 0.805, + "step": 27260 + }, + { + "epoch": 2.164758181349104, + "grad_norm": 0.8424776792526245, + "learning_rate": 1.3931888544891641e-05, + "loss": 0.7752, + "step": 27270 + }, + { + "epoch": 2.1655520053980037, + "grad_norm": 1.0687159299850464, + "learning_rate": 1.391865788150618e-05, + "loss": 0.7333, + "step": 27280 + }, + { + "epoch": 2.166345829446903, + "grad_norm": 0.7562177777290344, + "learning_rate": 1.3905427218120718e-05, + "loss": 0.766, + "step": 27290 + }, + { + "epoch": 2.1671396534958025, + "grad_norm": 0.9093291163444519, + "learning_rate": 1.3892196554735253e-05, + "loss": 0.7164, + "step": 27300 + }, + { + "epoch": 2.167933477544702, + "grad_norm": 0.8972833156585693, + "learning_rate": 1.3878965891349794e-05, + "loss": 0.7667, + "step": 27310 + }, + { + "epoch": 2.1687273015936017, + "grad_norm": 0.9251981377601624, + "learning_rate": 1.386573522796433e-05, + "loss": 0.7996, + "step": 27320 + }, + { + "epoch": 2.1695211256425013, + "grad_norm": 0.8145211338996887, + "learning_rate": 1.385250456457887e-05, + "loss": 0.7725, + "step": 27330 + }, + { + "epoch": 2.170314949691401, + "grad_norm": 0.8409168124198914, + "learning_rate": 1.3839273901193405e-05, + "loss": 0.7804, + "step": 27340 + }, + { + "epoch": 2.1711087737403005, + "grad_norm": 0.9111427068710327, + "learning_rate": 1.3826043237807944e-05, + "loss": 0.7775, + "step": 27350 + }, + { + "epoch": 2.1719025977892, + "grad_norm": 0.91303551197052, + "learning_rate": 1.3812812574422482e-05, + "loss": 0.8227, + "step": 27360 + }, + { + "epoch": 2.1726964218380997, + "grad_norm": 0.8870943188667297, + "learning_rate": 1.379958191103702e-05, + "loss": 0.7965, + "step": 27370 + }, + { + "epoch": 2.1734902458869994, + "grad_norm": 0.8687139749526978, + "learning_rate": 1.3786351247651558e-05, + "loss": 0.799, + "step": 27380 + }, + { + "epoch": 2.1742840699358985, + "grad_norm": 0.9708724617958069, + "learning_rate": 1.3773120584266097e-05, + "loss": 0.6977, + "step": 27390 + }, + { + "epoch": 2.175077893984798, + "grad_norm": 0.8507083654403687, + "learning_rate": 1.3759889920880634e-05, + "loss": 0.8515, + "step": 27400 + }, + { + "epoch": 2.1758717180336977, + "grad_norm": 0.9613845348358154, + "learning_rate": 1.374665925749517e-05, + "loss": 0.8635, + "step": 27410 + }, + { + "epoch": 2.1766655420825973, + "grad_norm": 0.8473123908042908, + "learning_rate": 1.373342859410971e-05, + "loss": 0.7451, + "step": 27420 + }, + { + "epoch": 2.177459366131497, + "grad_norm": 0.9432036280632019, + "learning_rate": 1.3720197930724246e-05, + "loss": 0.8018, + "step": 27430 + }, + { + "epoch": 2.1782531901803965, + "grad_norm": 0.8323158025741577, + "learning_rate": 1.3706967267338786e-05, + "loss": 0.7799, + "step": 27440 + }, + { + "epoch": 2.179047014229296, + "grad_norm": 1.166176438331604, + "learning_rate": 1.3693736603953322e-05, + "loss": 0.7891, + "step": 27450 + }, + { + "epoch": 2.1798408382781957, + "grad_norm": 0.781601071357727, + "learning_rate": 1.368050594056786e-05, + "loss": 0.7851, + "step": 27460 + }, + { + "epoch": 2.1806346623270954, + "grad_norm": 1.0222710371017456, + "learning_rate": 1.3667275277182398e-05, + "loss": 0.7159, + "step": 27470 + }, + { + "epoch": 2.181428486375995, + "grad_norm": 0.758436381816864, + "learning_rate": 1.3654044613796937e-05, + "loss": 0.7342, + "step": 27480 + }, + { + "epoch": 2.182222310424894, + "grad_norm": 0.8809502124786377, + "learning_rate": 1.3640813950411474e-05, + "loss": 0.8097, + "step": 27490 + }, + { + "epoch": 2.1830161344737937, + "grad_norm": 0.8321342468261719, + "learning_rate": 1.3627583287026013e-05, + "loss": 0.756, + "step": 27500 + }, + { + "epoch": 2.1838099585226933, + "grad_norm": 0.8868312835693359, + "learning_rate": 1.361435262364055e-05, + "loss": 0.8246, + "step": 27510 + }, + { + "epoch": 2.184603782571593, + "grad_norm": 0.8987812995910645, + "learning_rate": 1.3601121960255086e-05, + "loss": 0.8034, + "step": 27520 + }, + { + "epoch": 2.1853976066204925, + "grad_norm": 0.8325653076171875, + "learning_rate": 1.3587891296869626e-05, + "loss": 0.8197, + "step": 27530 + }, + { + "epoch": 2.186191430669392, + "grad_norm": 0.9166516065597534, + "learning_rate": 1.3574660633484162e-05, + "loss": 0.7413, + "step": 27540 + }, + { + "epoch": 2.1869852547182917, + "grad_norm": 0.9565515518188477, + "learning_rate": 1.3561429970098703e-05, + "loss": 0.7971, + "step": 27550 + }, + { + "epoch": 2.1877790787671914, + "grad_norm": 0.8458524346351624, + "learning_rate": 1.3548199306713238e-05, + "loss": 0.756, + "step": 27560 + }, + { + "epoch": 2.188572902816091, + "grad_norm": 0.7264953255653381, + "learning_rate": 1.3534968643327777e-05, + "loss": 0.798, + "step": 27570 + }, + { + "epoch": 2.1893667268649906, + "grad_norm": 0.9435407519340515, + "learning_rate": 1.3521737979942314e-05, + "loss": 0.7956, + "step": 27580 + }, + { + "epoch": 2.19016055091389, + "grad_norm": 0.9455521106719971, + "learning_rate": 1.3508507316556853e-05, + "loss": 0.7712, + "step": 27590 + }, + { + "epoch": 2.1909543749627893, + "grad_norm": 1.1080188751220703, + "learning_rate": 1.349527665317139e-05, + "loss": 0.7544, + "step": 27600 + }, + { + "epoch": 2.191748199011689, + "grad_norm": 0.98456871509552, + "learning_rate": 1.348204598978593e-05, + "loss": 0.8196, + "step": 27610 + }, + { + "epoch": 2.1925420230605885, + "grad_norm": 0.8413136005401611, + "learning_rate": 1.3468815326400467e-05, + "loss": 0.7864, + "step": 27620 + }, + { + "epoch": 2.193335847109488, + "grad_norm": 0.7672378420829773, + "learning_rate": 1.3455584663015002e-05, + "loss": 0.7624, + "step": 27630 + }, + { + "epoch": 2.1941296711583878, + "grad_norm": 0.9120532274246216, + "learning_rate": 1.3442353999629543e-05, + "loss": 0.7425, + "step": 27640 + }, + { + "epoch": 2.1949234952072874, + "grad_norm": 0.9745380878448486, + "learning_rate": 1.3429123336244078e-05, + "loss": 0.8149, + "step": 27650 + }, + { + "epoch": 2.195717319256187, + "grad_norm": 0.8304915428161621, + "learning_rate": 1.3415892672858619e-05, + "loss": 0.7609, + "step": 27660 + }, + { + "epoch": 2.1965111433050866, + "grad_norm": 0.9230349659919739, + "learning_rate": 1.3402662009473154e-05, + "loss": 0.7648, + "step": 27670 + }, + { + "epoch": 2.197304967353986, + "grad_norm": 1.0619224309921265, + "learning_rate": 1.3389431346087693e-05, + "loss": 0.7887, + "step": 27680 + }, + { + "epoch": 2.1980987914028853, + "grad_norm": 1.0033013820648193, + "learning_rate": 1.337620068270223e-05, + "loss": 0.805, + "step": 27690 + }, + { + "epoch": 2.198892615451785, + "grad_norm": 0.9245965480804443, + "learning_rate": 1.336297001931677e-05, + "loss": 0.8012, + "step": 27700 + }, + { + "epoch": 2.1996864395006845, + "grad_norm": 1.0526070594787598, + "learning_rate": 1.3349739355931307e-05, + "loss": 0.7792, + "step": 27710 + }, + { + "epoch": 2.200480263549584, + "grad_norm": 0.7733340859413147, + "learning_rate": 1.3336508692545846e-05, + "loss": 0.7708, + "step": 27720 + }, + { + "epoch": 2.2012740875984838, + "grad_norm": 1.1390515565872192, + "learning_rate": 1.3323278029160383e-05, + "loss": 0.7797, + "step": 27730 + }, + { + "epoch": 2.2020679116473834, + "grad_norm": 0.8812433481216431, + "learning_rate": 1.3310047365774922e-05, + "loss": 0.719, + "step": 27740 + }, + { + "epoch": 2.202861735696283, + "grad_norm": 0.8924884796142578, + "learning_rate": 1.3296816702389459e-05, + "loss": 0.836, + "step": 27750 + }, + { + "epoch": 2.2036555597451826, + "grad_norm": 0.9707908630371094, + "learning_rate": 1.3283586039003995e-05, + "loss": 0.7732, + "step": 27760 + }, + { + "epoch": 2.204449383794082, + "grad_norm": 0.904464602470398, + "learning_rate": 1.3270355375618535e-05, + "loss": 0.8084, + "step": 27770 + }, + { + "epoch": 2.205243207842982, + "grad_norm": 0.8692517280578613, + "learning_rate": 1.325712471223307e-05, + "loss": 0.8074, + "step": 27780 + }, + { + "epoch": 2.2060370318918814, + "grad_norm": 0.9948639869689941, + "learning_rate": 1.324389404884761e-05, + "loss": 0.7936, + "step": 27790 + }, + { + "epoch": 2.2068308559407805, + "grad_norm": 0.915266215801239, + "learning_rate": 1.3230663385462147e-05, + "loss": 0.8084, + "step": 27800 + }, + { + "epoch": 2.20762467998968, + "grad_norm": 0.8853785395622253, + "learning_rate": 1.3217432722076686e-05, + "loss": 0.774, + "step": 27810 + }, + { + "epoch": 2.2084185040385798, + "grad_norm": 1.0445677042007446, + "learning_rate": 1.3204202058691223e-05, + "loss": 0.8096, + "step": 27820 + }, + { + "epoch": 2.2092123280874794, + "grad_norm": 0.9219695925712585, + "learning_rate": 1.3190971395305762e-05, + "loss": 0.8174, + "step": 27830 + }, + { + "epoch": 2.210006152136379, + "grad_norm": 0.727891206741333, + "learning_rate": 1.31777407319203e-05, + "loss": 0.7495, + "step": 27840 + }, + { + "epoch": 2.2107999761852786, + "grad_norm": 0.8732167482376099, + "learning_rate": 1.3164510068534838e-05, + "loss": 0.7907, + "step": 27850 + }, + { + "epoch": 2.211593800234178, + "grad_norm": 0.9940975904464722, + "learning_rate": 1.3151279405149375e-05, + "loss": 0.8013, + "step": 27860 + }, + { + "epoch": 2.212387624283078, + "grad_norm": 0.9644079804420471, + "learning_rate": 1.3138048741763911e-05, + "loss": 0.7781, + "step": 27870 + }, + { + "epoch": 2.2131814483319774, + "grad_norm": 0.957815945148468, + "learning_rate": 1.3124818078378452e-05, + "loss": 0.7853, + "step": 27880 + }, + { + "epoch": 2.2139752723808765, + "grad_norm": 0.9299752712249756, + "learning_rate": 1.3111587414992987e-05, + "loss": 0.7654, + "step": 27890 + }, + { + "epoch": 2.214769096429776, + "grad_norm": 0.763888418674469, + "learning_rate": 1.3098356751607526e-05, + "loss": 0.8254, + "step": 27900 + }, + { + "epoch": 2.2155629204786758, + "grad_norm": 0.9485118389129639, + "learning_rate": 1.3085126088222063e-05, + "loss": 0.7565, + "step": 27910 + }, + { + "epoch": 2.2163567445275754, + "grad_norm": 0.7794759273529053, + "learning_rate": 1.3071895424836602e-05, + "loss": 0.7794, + "step": 27920 + }, + { + "epoch": 2.217150568576475, + "grad_norm": 0.8612314462661743, + "learning_rate": 1.305866476145114e-05, + "loss": 0.8535, + "step": 27930 + }, + { + "epoch": 2.2179443926253746, + "grad_norm": 0.926816463470459, + "learning_rate": 1.3045434098065678e-05, + "loss": 0.7421, + "step": 27940 + }, + { + "epoch": 2.218738216674274, + "grad_norm": 0.8914726376533508, + "learning_rate": 1.3032203434680216e-05, + "loss": 0.7918, + "step": 27950 + }, + { + "epoch": 2.219532040723174, + "grad_norm": 0.7953301072120667, + "learning_rate": 1.3018972771294755e-05, + "loss": 0.7813, + "step": 27960 + }, + { + "epoch": 2.2203258647720734, + "grad_norm": 1.100185513496399, + "learning_rate": 1.3005742107909292e-05, + "loss": 0.8074, + "step": 27970 + }, + { + "epoch": 2.221119688820973, + "grad_norm": 0.9314887523651123, + "learning_rate": 1.2992511444523827e-05, + "loss": 0.7986, + "step": 27980 + }, + { + "epoch": 2.2219135128698726, + "grad_norm": 0.9188593626022339, + "learning_rate": 1.2979280781138366e-05, + "loss": 0.7485, + "step": 27990 + }, + { + "epoch": 2.2227073369187718, + "grad_norm": 1.0341116189956665, + "learning_rate": 1.2966050117752903e-05, + "loss": 0.7407, + "step": 28000 + }, + { + "epoch": 2.2235011609676714, + "grad_norm": 0.8388242125511169, + "learning_rate": 1.2952819454367442e-05, + "loss": 0.8283, + "step": 28010 + }, + { + "epoch": 2.224294985016571, + "grad_norm": 0.9826040267944336, + "learning_rate": 1.293958879098198e-05, + "loss": 0.7214, + "step": 28020 + }, + { + "epoch": 2.2250888090654706, + "grad_norm": 0.7271533608436584, + "learning_rate": 1.2926358127596519e-05, + "loss": 0.836, + "step": 28030 + }, + { + "epoch": 2.22588263311437, + "grad_norm": 0.795552134513855, + "learning_rate": 1.2913127464211056e-05, + "loss": 0.8255, + "step": 28040 + }, + { + "epoch": 2.22667645716327, + "grad_norm": 0.8182919025421143, + "learning_rate": 1.2899896800825595e-05, + "loss": 0.7078, + "step": 28050 + }, + { + "epoch": 2.2274702812121694, + "grad_norm": 0.9969826340675354, + "learning_rate": 1.2886666137440132e-05, + "loss": 0.7659, + "step": 28060 + }, + { + "epoch": 2.228264105261069, + "grad_norm": 0.8955338001251221, + "learning_rate": 1.2873435474054671e-05, + "loss": 0.8228, + "step": 28070 + }, + { + "epoch": 2.2290579293099686, + "grad_norm": 0.9254652261734009, + "learning_rate": 1.2860204810669208e-05, + "loss": 0.8226, + "step": 28080 + }, + { + "epoch": 2.229851753358868, + "grad_norm": 0.8870140910148621, + "learning_rate": 1.2846974147283744e-05, + "loss": 0.8239, + "step": 28090 + }, + { + "epoch": 2.2306455774077674, + "grad_norm": 1.0438228845596313, + "learning_rate": 1.2833743483898283e-05, + "loss": 0.7559, + "step": 28100 + }, + { + "epoch": 2.231439401456667, + "grad_norm": 1.0301085710525513, + "learning_rate": 1.282051282051282e-05, + "loss": 0.7513, + "step": 28110 + }, + { + "epoch": 2.2322332255055666, + "grad_norm": 0.9675314426422119, + "learning_rate": 1.2807282157127359e-05, + "loss": 0.8013, + "step": 28120 + }, + { + "epoch": 2.233027049554466, + "grad_norm": 1.017149806022644, + "learning_rate": 1.2794051493741896e-05, + "loss": 0.7762, + "step": 28130 + }, + { + "epoch": 2.233820873603366, + "grad_norm": 0.8916369080543518, + "learning_rate": 1.2780820830356435e-05, + "loss": 0.8421, + "step": 28140 + }, + { + "epoch": 2.2346146976522654, + "grad_norm": 0.8855636119842529, + "learning_rate": 1.2767590166970972e-05, + "loss": 0.792, + "step": 28150 + }, + { + "epoch": 2.235408521701165, + "grad_norm": 0.758443295955658, + "learning_rate": 1.2754359503585511e-05, + "loss": 0.7603, + "step": 28160 + }, + { + "epoch": 2.2362023457500646, + "grad_norm": 1.0338447093963623, + "learning_rate": 1.2741128840200048e-05, + "loss": 0.787, + "step": 28170 + }, + { + "epoch": 2.236996169798964, + "grad_norm": 1.0957199335098267, + "learning_rate": 1.2727898176814587e-05, + "loss": 0.7863, + "step": 28180 + }, + { + "epoch": 2.237789993847864, + "grad_norm": 0.8449088335037231, + "learning_rate": 1.2714667513429124e-05, + "loss": 0.782, + "step": 28190 + }, + { + "epoch": 2.238583817896763, + "grad_norm": 0.9781031012535095, + "learning_rate": 1.2701436850043663e-05, + "loss": 0.777, + "step": 28200 + }, + { + "epoch": 2.2393776419456626, + "grad_norm": 0.9978182315826416, + "learning_rate": 1.2688206186658199e-05, + "loss": 0.7937, + "step": 28210 + }, + { + "epoch": 2.240171465994562, + "grad_norm": 1.0001929998397827, + "learning_rate": 1.2674975523272736e-05, + "loss": 0.8321, + "step": 28220 + }, + { + "epoch": 2.240965290043462, + "grad_norm": 0.9343948364257812, + "learning_rate": 1.2661744859887275e-05, + "loss": 0.7676, + "step": 28230 + }, + { + "epoch": 2.2417591140923614, + "grad_norm": 1.0100466012954712, + "learning_rate": 1.2648514196501812e-05, + "loss": 0.7627, + "step": 28240 + }, + { + "epoch": 2.242552938141261, + "grad_norm": 0.7554891109466553, + "learning_rate": 1.2635283533116351e-05, + "loss": 0.7973, + "step": 28250 + }, + { + "epoch": 2.2433467621901606, + "grad_norm": 0.9241369366645813, + "learning_rate": 1.2622052869730888e-05, + "loss": 0.7823, + "step": 28260 + }, + { + "epoch": 2.24414058623906, + "grad_norm": 0.9405050277709961, + "learning_rate": 1.2608822206345427e-05, + "loss": 0.7655, + "step": 28270 + }, + { + "epoch": 2.24493441028796, + "grad_norm": 0.8088013529777527, + "learning_rate": 1.2595591542959965e-05, + "loss": 0.7726, + "step": 28280 + }, + { + "epoch": 2.2457282343368594, + "grad_norm": 0.9160634279251099, + "learning_rate": 1.2582360879574504e-05, + "loss": 0.7966, + "step": 28290 + }, + { + "epoch": 2.246522058385759, + "grad_norm": 0.9134334921836853, + "learning_rate": 1.256913021618904e-05, + "loss": 0.7318, + "step": 28300 + }, + { + "epoch": 2.247315882434658, + "grad_norm": 0.9910647869110107, + "learning_rate": 1.255589955280358e-05, + "loss": 0.8372, + "step": 28310 + }, + { + "epoch": 2.248109706483558, + "grad_norm": 0.9692395329475403, + "learning_rate": 1.2542668889418115e-05, + "loss": 0.7783, + "step": 28320 + }, + { + "epoch": 2.2489035305324574, + "grad_norm": 0.9558752179145813, + "learning_rate": 1.2529438226032653e-05, + "loss": 0.7304, + "step": 28330 + }, + { + "epoch": 2.249697354581357, + "grad_norm": 1.0415302515029907, + "learning_rate": 1.2516207562647191e-05, + "loss": 0.8635, + "step": 28340 + }, + { + "epoch": 2.2504911786302566, + "grad_norm": 0.7867638468742371, + "learning_rate": 1.2502976899261729e-05, + "loss": 0.8164, + "step": 28350 + }, + { + "epoch": 2.251285002679156, + "grad_norm": 0.7861484885215759, + "learning_rate": 1.2489746235876268e-05, + "loss": 0.8794, + "step": 28360 + }, + { + "epoch": 2.252078826728056, + "grad_norm": 0.8788695335388184, + "learning_rate": 1.2476515572490807e-05, + "loss": 0.8134, + "step": 28370 + }, + { + "epoch": 2.2528726507769554, + "grad_norm": 0.9511646032333374, + "learning_rate": 1.2463284909105342e-05, + "loss": 0.7574, + "step": 28380 + }, + { + "epoch": 2.253666474825855, + "grad_norm": 1.1273367404937744, + "learning_rate": 1.2450054245719881e-05, + "loss": 0.7506, + "step": 28390 + }, + { + "epoch": 2.254460298874754, + "grad_norm": 0.773710310459137, + "learning_rate": 1.2436823582334418e-05, + "loss": 0.8182, + "step": 28400 + }, + { + "epoch": 2.255254122923654, + "grad_norm": 0.8454147577285767, + "learning_rate": 1.2423592918948957e-05, + "loss": 0.7827, + "step": 28410 + }, + { + "epoch": 2.2560479469725534, + "grad_norm": 0.7909261584281921, + "learning_rate": 1.2410362255563494e-05, + "loss": 0.854, + "step": 28420 + }, + { + "epoch": 2.256841771021453, + "grad_norm": 1.1431065797805786, + "learning_rate": 1.2397131592178032e-05, + "loss": 0.7735, + "step": 28430 + }, + { + "epoch": 2.2576355950703526, + "grad_norm": 0.8554138541221619, + "learning_rate": 1.238390092879257e-05, + "loss": 0.7886, + "step": 28440 + }, + { + "epoch": 2.258429419119252, + "grad_norm": 0.801655113697052, + "learning_rate": 1.2370670265407108e-05, + "loss": 0.8245, + "step": 28450 + }, + { + "epoch": 2.259223243168152, + "grad_norm": 0.8901875019073486, + "learning_rate": 1.2357439602021647e-05, + "loss": 0.7802, + "step": 28460 + }, + { + "epoch": 2.2600170672170514, + "grad_norm": 1.0075801610946655, + "learning_rate": 1.2344208938636184e-05, + "loss": 0.763, + "step": 28470 + }, + { + "epoch": 2.260810891265951, + "grad_norm": 1.1321099996566772, + "learning_rate": 1.2330978275250723e-05, + "loss": 0.8004, + "step": 28480 + }, + { + "epoch": 2.2616047153148506, + "grad_norm": 0.7835102081298828, + "learning_rate": 1.231774761186526e-05, + "loss": 0.7653, + "step": 28490 + }, + { + "epoch": 2.2623985393637502, + "grad_norm": 0.9481699466705322, + "learning_rate": 1.2304516948479797e-05, + "loss": 0.7393, + "step": 28500 + }, + { + "epoch": 2.2631923634126494, + "grad_norm": 0.876270055770874, + "learning_rate": 1.2291286285094335e-05, + "loss": 0.7817, + "step": 28510 + }, + { + "epoch": 2.263986187461549, + "grad_norm": 0.9594854712486267, + "learning_rate": 1.2278055621708873e-05, + "loss": 0.8727, + "step": 28520 + }, + { + "epoch": 2.2647800115104486, + "grad_norm": 0.919558584690094, + "learning_rate": 1.226482495832341e-05, + "loss": 0.8242, + "step": 28530 + }, + { + "epoch": 2.265573835559348, + "grad_norm": 0.7271903157234192, + "learning_rate": 1.2251594294937948e-05, + "loss": 0.7551, + "step": 28540 + }, + { + "epoch": 2.266367659608248, + "grad_norm": 1.0002189874649048, + "learning_rate": 1.2238363631552487e-05, + "loss": 0.7631, + "step": 28550 + }, + { + "epoch": 2.2671614836571474, + "grad_norm": 0.8755606412887573, + "learning_rate": 1.2225132968167024e-05, + "loss": 0.7551, + "step": 28560 + }, + { + "epoch": 2.267955307706047, + "grad_norm": 0.8737064599990845, + "learning_rate": 1.2211902304781563e-05, + "loss": 0.7563, + "step": 28570 + }, + { + "epoch": 2.2687491317549466, + "grad_norm": 1.0743333101272583, + "learning_rate": 1.21986716413961e-05, + "loss": 0.7898, + "step": 28580 + }, + { + "epoch": 2.2695429558038462, + "grad_norm": 0.7818424105644226, + "learning_rate": 1.218544097801064e-05, + "loss": 0.8337, + "step": 28590 + }, + { + "epoch": 2.2703367798527454, + "grad_norm": 0.8167740702629089, + "learning_rate": 1.2172210314625176e-05, + "loss": 0.7188, + "step": 28600 + }, + { + "epoch": 2.271130603901645, + "grad_norm": 1.0544767379760742, + "learning_rate": 1.2158979651239714e-05, + "loss": 0.7083, + "step": 28610 + }, + { + "epoch": 2.2719244279505446, + "grad_norm": 0.9623509049415588, + "learning_rate": 1.2145748987854251e-05, + "loss": 0.8609, + "step": 28620 + }, + { + "epoch": 2.272718251999444, + "grad_norm": 0.9412724375724792, + "learning_rate": 1.213251832446879e-05, + "loss": 0.7785, + "step": 28630 + }, + { + "epoch": 2.273512076048344, + "grad_norm": 0.9624713063240051, + "learning_rate": 1.2119287661083327e-05, + "loss": 0.7225, + "step": 28640 + }, + { + "epoch": 2.2743059000972434, + "grad_norm": 0.958265483379364, + "learning_rate": 1.2106056997697864e-05, + "loss": 0.7591, + "step": 28650 + }, + { + "epoch": 2.275099724146143, + "grad_norm": 1.0588797330856323, + "learning_rate": 1.2092826334312403e-05, + "loss": 0.7488, + "step": 28660 + }, + { + "epoch": 2.2758935481950426, + "grad_norm": 0.8400679230690002, + "learning_rate": 1.207959567092694e-05, + "loss": 0.7603, + "step": 28670 + }, + { + "epoch": 2.2766873722439422, + "grad_norm": 0.8208017349243164, + "learning_rate": 1.206636500754148e-05, + "loss": 0.7729, + "step": 28680 + }, + { + "epoch": 2.277481196292842, + "grad_norm": 0.8255361914634705, + "learning_rate": 1.2053134344156017e-05, + "loss": 0.7548, + "step": 28690 + }, + { + "epoch": 2.2782750203417415, + "grad_norm": 1.0541584491729736, + "learning_rate": 1.2039903680770556e-05, + "loss": 0.7856, + "step": 28700 + }, + { + "epoch": 2.2790688443906406, + "grad_norm": 1.1183170080184937, + "learning_rate": 1.2026673017385093e-05, + "loss": 0.7884, + "step": 28710 + }, + { + "epoch": 2.27986266843954, + "grad_norm": 0.8430930376052856, + "learning_rate": 1.201344235399963e-05, + "loss": 0.8003, + "step": 28720 + }, + { + "epoch": 2.28065649248844, + "grad_norm": 0.872404932975769, + "learning_rate": 1.2000211690614167e-05, + "loss": 0.7212, + "step": 28730 + }, + { + "epoch": 2.2814503165373394, + "grad_norm": 0.8843950033187866, + "learning_rate": 1.1986981027228704e-05, + "loss": 0.7738, + "step": 28740 + }, + { + "epoch": 2.282244140586239, + "grad_norm": 0.8415934443473816, + "learning_rate": 1.1973750363843243e-05, + "loss": 0.7609, + "step": 28750 + }, + { + "epoch": 2.2830379646351386, + "grad_norm": 0.7827284932136536, + "learning_rate": 1.196051970045778e-05, + "loss": 0.7876, + "step": 28760 + }, + { + "epoch": 2.2838317886840382, + "grad_norm": 1.0107696056365967, + "learning_rate": 1.194728903707232e-05, + "loss": 0.7452, + "step": 28770 + }, + { + "epoch": 2.284625612732938, + "grad_norm": 0.802697479724884, + "learning_rate": 1.1934058373686857e-05, + "loss": 0.8176, + "step": 28780 + }, + { + "epoch": 2.2854194367818375, + "grad_norm": 0.8944302797317505, + "learning_rate": 1.1920827710301396e-05, + "loss": 0.7346, + "step": 28790 + }, + { + "epoch": 2.286213260830737, + "grad_norm": 0.8373863101005554, + "learning_rate": 1.1907597046915933e-05, + "loss": 0.7585, + "step": 28800 + }, + { + "epoch": 2.2870070848796367, + "grad_norm": 0.964019238948822, + "learning_rate": 1.1894366383530472e-05, + "loss": 0.7473, + "step": 28810 + }, + { + "epoch": 2.287800908928536, + "grad_norm": 1.0196411609649658, + "learning_rate": 1.1881135720145009e-05, + "loss": 0.7552, + "step": 28820 + }, + { + "epoch": 2.2885947329774354, + "grad_norm": 0.758881688117981, + "learning_rate": 1.1867905056759546e-05, + "loss": 0.8031, + "step": 28830 + }, + { + "epoch": 2.289388557026335, + "grad_norm": 0.768775999546051, + "learning_rate": 1.1854674393374085e-05, + "loss": 0.7993, + "step": 28840 + }, + { + "epoch": 2.2901823810752346, + "grad_norm": 0.9451555013656616, + "learning_rate": 1.184144372998862e-05, + "loss": 0.8086, + "step": 28850 + }, + { + "epoch": 2.2909762051241342, + "grad_norm": 0.9371961951255798, + "learning_rate": 1.182821306660316e-05, + "loss": 0.7808, + "step": 28860 + }, + { + "epoch": 2.291770029173034, + "grad_norm": 0.7734199166297913, + "learning_rate": 1.1814982403217697e-05, + "loss": 0.7633, + "step": 28870 + }, + { + "epoch": 2.2925638532219335, + "grad_norm": 0.787481427192688, + "learning_rate": 1.1801751739832236e-05, + "loss": 0.837, + "step": 28880 + }, + { + "epoch": 2.293357677270833, + "grad_norm": 0.9613273739814758, + "learning_rate": 1.1788521076446773e-05, + "loss": 0.726, + "step": 28890 + }, + { + "epoch": 2.2941515013197327, + "grad_norm": 0.9527220129966736, + "learning_rate": 1.1775290413061312e-05, + "loss": 0.7615, + "step": 28900 + }, + { + "epoch": 2.294945325368632, + "grad_norm": 0.9209556579589844, + "learning_rate": 1.176205974967585e-05, + "loss": 0.8216, + "step": 28910 + }, + { + "epoch": 2.2957391494175314, + "grad_norm": 0.9544232487678528, + "learning_rate": 1.1748829086290388e-05, + "loss": 0.794, + "step": 28920 + }, + { + "epoch": 2.296532973466431, + "grad_norm": 0.9990895390510559, + "learning_rate": 1.1735598422904925e-05, + "loss": 0.7551, + "step": 28930 + }, + { + "epoch": 2.2973267975153306, + "grad_norm": 1.1743451356887817, + "learning_rate": 1.1722367759519463e-05, + "loss": 0.7476, + "step": 28940 + }, + { + "epoch": 2.2981206215642302, + "grad_norm": 0.9132225513458252, + "learning_rate": 1.1709137096134002e-05, + "loss": 0.7687, + "step": 28950 + }, + { + "epoch": 2.29891444561313, + "grad_norm": 0.7507365345954895, + "learning_rate": 1.1695906432748537e-05, + "loss": 0.7469, + "step": 28960 + }, + { + "epoch": 2.2997082696620295, + "grad_norm": 0.8947992920875549, + "learning_rate": 1.1682675769363076e-05, + "loss": 0.8074, + "step": 28970 + }, + { + "epoch": 2.300502093710929, + "grad_norm": 1.213215708732605, + "learning_rate": 1.1669445105977613e-05, + "loss": 0.7489, + "step": 28980 + }, + { + "epoch": 2.3012959177598287, + "grad_norm": 0.8981990218162537, + "learning_rate": 1.1656214442592152e-05, + "loss": 0.7278, + "step": 28990 + }, + { + "epoch": 2.3020897418087283, + "grad_norm": 0.820511519908905, + "learning_rate": 1.164298377920669e-05, + "loss": 0.7792, + "step": 29000 + }, + { + "epoch": 2.302883565857628, + "grad_norm": 1.1118419170379639, + "learning_rate": 1.1629753115821228e-05, + "loss": 0.7267, + "step": 29010 + }, + { + "epoch": 2.303677389906527, + "grad_norm": 1.0339609384536743, + "learning_rate": 1.1616522452435766e-05, + "loss": 0.7933, + "step": 29020 + }, + { + "epoch": 2.3044712139554266, + "grad_norm": 0.8476637601852417, + "learning_rate": 1.1603291789050305e-05, + "loss": 0.7775, + "step": 29030 + }, + { + "epoch": 2.3052650380043263, + "grad_norm": 0.7746336460113525, + "learning_rate": 1.1590061125664842e-05, + "loss": 0.7619, + "step": 29040 + }, + { + "epoch": 2.306058862053226, + "grad_norm": 1.1253118515014648, + "learning_rate": 1.1576830462279379e-05, + "loss": 0.7724, + "step": 29050 + }, + { + "epoch": 2.3068526861021255, + "grad_norm": 1.1791478395462036, + "learning_rate": 1.1563599798893918e-05, + "loss": 0.794, + "step": 29060 + }, + { + "epoch": 2.307646510151025, + "grad_norm": 0.935023307800293, + "learning_rate": 1.1550369135508455e-05, + "loss": 0.7548, + "step": 29070 + }, + { + "epoch": 2.3084403341999247, + "grad_norm": 0.8298285603523254, + "learning_rate": 1.1537138472122992e-05, + "loss": 0.7939, + "step": 29080 + }, + { + "epoch": 2.3092341582488243, + "grad_norm": 0.8414050340652466, + "learning_rate": 1.152390780873753e-05, + "loss": 0.8132, + "step": 29090 + }, + { + "epoch": 2.310027982297724, + "grad_norm": 0.9045025110244751, + "learning_rate": 1.1510677145352069e-05, + "loss": 0.7828, + "step": 29100 + }, + { + "epoch": 2.310821806346623, + "grad_norm": 0.8417425751686096, + "learning_rate": 1.1497446481966606e-05, + "loss": 0.8046, + "step": 29110 + }, + { + "epoch": 2.3116156303955226, + "grad_norm": 0.9332298040390015, + "learning_rate": 1.1484215818581145e-05, + "loss": 0.8187, + "step": 29120 + }, + { + "epoch": 2.3124094544444223, + "grad_norm": 1.017712116241455, + "learning_rate": 1.1470985155195682e-05, + "loss": 0.8168, + "step": 29130 + }, + { + "epoch": 2.313203278493322, + "grad_norm": 0.9712084531784058, + "learning_rate": 1.145775449181022e-05, + "loss": 0.7914, + "step": 29140 + }, + { + "epoch": 2.3139971025422215, + "grad_norm": 0.8326614499092102, + "learning_rate": 1.1444523828424758e-05, + "loss": 0.757, + "step": 29150 + }, + { + "epoch": 2.314790926591121, + "grad_norm": 0.7456022500991821, + "learning_rate": 1.1431293165039295e-05, + "loss": 0.8309, + "step": 29160 + }, + { + "epoch": 2.3155847506400207, + "grad_norm": 0.895487368106842, + "learning_rate": 1.1418062501653834e-05, + "loss": 0.7881, + "step": 29170 + }, + { + "epoch": 2.3163785746889203, + "grad_norm": 0.9077318906784058, + "learning_rate": 1.1404831838268372e-05, + "loss": 0.8497, + "step": 29180 + }, + { + "epoch": 2.31717239873782, + "grad_norm": 0.7892618775367737, + "learning_rate": 1.1391601174882909e-05, + "loss": 0.7859, + "step": 29190 + }, + { + "epoch": 2.3179662227867195, + "grad_norm": 0.8005306720733643, + "learning_rate": 1.1378370511497446e-05, + "loss": 0.7707, + "step": 29200 + }, + { + "epoch": 2.318760046835619, + "grad_norm": 1.0797233581542969, + "learning_rate": 1.1365139848111985e-05, + "loss": 0.7446, + "step": 29210 + }, + { + "epoch": 2.3195538708845183, + "grad_norm": 0.9487477540969849, + "learning_rate": 1.135323225106507e-05, + "loss": 0.8089, + "step": 29220 + }, + { + "epoch": 2.320347694933418, + "grad_norm": 0.9373265504837036, + "learning_rate": 1.1340001587679607e-05, + "loss": 0.7926, + "step": 29230 + }, + { + "epoch": 2.3211415189823175, + "grad_norm": 1.0408942699432373, + "learning_rate": 1.1326770924294144e-05, + "loss": 0.8057, + "step": 29240 + }, + { + "epoch": 2.321935343031217, + "grad_norm": 0.9446254968643188, + "learning_rate": 1.1313540260908683e-05, + "loss": 0.8202, + "step": 29250 + }, + { + "epoch": 2.3227291670801167, + "grad_norm": 1.1571518182754517, + "learning_rate": 1.130030959752322e-05, + "loss": 0.7437, + "step": 29260 + }, + { + "epoch": 2.3235229911290163, + "grad_norm": 0.8599859476089478, + "learning_rate": 1.1287078934137759e-05, + "loss": 0.7217, + "step": 29270 + }, + { + "epoch": 2.324316815177916, + "grad_norm": 0.9582074880599976, + "learning_rate": 1.1273848270752296e-05, + "loss": 0.8472, + "step": 29280 + }, + { + "epoch": 2.3251106392268155, + "grad_norm": 0.9550096392631531, + "learning_rate": 1.1260617607366835e-05, + "loss": 0.7503, + "step": 29290 + }, + { + "epoch": 2.325904463275715, + "grad_norm": 0.972760796546936, + "learning_rate": 1.1247386943981372e-05, + "loss": 0.7296, + "step": 29300 + }, + { + "epoch": 2.3266982873246143, + "grad_norm": 0.9969944953918457, + "learning_rate": 1.123415628059591e-05, + "loss": 0.7982, + "step": 29310 + }, + { + "epoch": 2.3274921113735143, + "grad_norm": 0.9653575420379639, + "learning_rate": 1.1220925617210447e-05, + "loss": 0.8543, + "step": 29320 + }, + { + "epoch": 2.3282859354224135, + "grad_norm": 0.8924707174301147, + "learning_rate": 1.1207694953824986e-05, + "loss": 0.7601, + "step": 29330 + }, + { + "epoch": 2.329079759471313, + "grad_norm": 0.9261003136634827, + "learning_rate": 1.1194464290439523e-05, + "loss": 0.7984, + "step": 29340 + }, + { + "epoch": 2.3298735835202127, + "grad_norm": 0.678521454334259, + "learning_rate": 1.118123362705406e-05, + "loss": 0.8383, + "step": 29350 + }, + { + "epoch": 2.3306674075691123, + "grad_norm": 1.122550368309021, + "learning_rate": 1.1168002963668599e-05, + "loss": 0.7964, + "step": 29360 + }, + { + "epoch": 2.331461231618012, + "grad_norm": 0.8651812672615051, + "learning_rate": 1.1154772300283136e-05, + "loss": 0.7974, + "step": 29370 + }, + { + "epoch": 2.3322550556669115, + "grad_norm": 1.018391489982605, + "learning_rate": 1.1141541636897675e-05, + "loss": 0.7908, + "step": 29380 + }, + { + "epoch": 2.333048879715811, + "grad_norm": 0.8569789528846741, + "learning_rate": 1.1128310973512212e-05, + "loss": 0.7332, + "step": 29390 + }, + { + "epoch": 2.3338427037647107, + "grad_norm": 0.7940601706504822, + "learning_rate": 1.1115080310126751e-05, + "loss": 0.8628, + "step": 29400 + }, + { + "epoch": 2.3346365278136103, + "grad_norm": 0.9954172372817993, + "learning_rate": 1.1101849646741289e-05, + "loss": 0.7837, + "step": 29410 + }, + { + "epoch": 2.3354303518625095, + "grad_norm": 0.9305082559585571, + "learning_rate": 1.1088618983355826e-05, + "loss": 0.7587, + "step": 29420 + }, + { + "epoch": 2.336224175911409, + "grad_norm": 1.003757357597351, + "learning_rate": 1.1075388319970365e-05, + "loss": 0.8239, + "step": 29430 + }, + { + "epoch": 2.3370179999603087, + "grad_norm": 0.8799276947975159, + "learning_rate": 1.1062157656584902e-05, + "loss": 0.728, + "step": 29440 + }, + { + "epoch": 2.3378118240092083, + "grad_norm": 0.758741021156311, + "learning_rate": 1.104892699319944e-05, + "loss": 0.806, + "step": 29450 + }, + { + "epoch": 2.338605648058108, + "grad_norm": 0.9049004912376404, + "learning_rate": 1.1035696329813976e-05, + "loss": 0.7504, + "step": 29460 + }, + { + "epoch": 2.3393994721070075, + "grad_norm": 1.0414828062057495, + "learning_rate": 1.1022465666428515e-05, + "loss": 0.77, + "step": 29470 + }, + { + "epoch": 2.340193296155907, + "grad_norm": 0.8456864356994629, + "learning_rate": 1.1009235003043053e-05, + "loss": 0.7447, + "step": 29480 + }, + { + "epoch": 2.3409871202048067, + "grad_norm": 0.7757733464241028, + "learning_rate": 1.0996004339657591e-05, + "loss": 0.7494, + "step": 29490 + }, + { + "epoch": 2.3417809442537063, + "grad_norm": 1.131042718887329, + "learning_rate": 1.0982773676272129e-05, + "loss": 0.8584, + "step": 29500 + }, + { + "epoch": 2.342574768302606, + "grad_norm": 0.8263581395149231, + "learning_rate": 1.0969543012886668e-05, + "loss": 0.741, + "step": 29510 + }, + { + "epoch": 2.3433685923515055, + "grad_norm": 1.135141134262085, + "learning_rate": 1.0956312349501205e-05, + "loss": 0.7473, + "step": 29520 + }, + { + "epoch": 2.3441624164004047, + "grad_norm": 1.2775719165802002, + "learning_rate": 1.0943081686115742e-05, + "loss": 0.7666, + "step": 29530 + }, + { + "epoch": 2.3449562404493043, + "grad_norm": 0.8533919453620911, + "learning_rate": 1.0929851022730281e-05, + "loss": 0.7309, + "step": 29540 + }, + { + "epoch": 2.345750064498204, + "grad_norm": 0.7968776822090149, + "learning_rate": 1.0916620359344817e-05, + "loss": 0.8046, + "step": 29550 + }, + { + "epoch": 2.3465438885471035, + "grad_norm": 0.9204419851303101, + "learning_rate": 1.0903389695959356e-05, + "loss": 0.7839, + "step": 29560 + }, + { + "epoch": 2.347337712596003, + "grad_norm": 1.0441004037857056, + "learning_rate": 1.0890159032573893e-05, + "loss": 0.7742, + "step": 29570 + }, + { + "epoch": 2.3481315366449027, + "grad_norm": 0.9476767182350159, + "learning_rate": 1.0876928369188432e-05, + "loss": 0.7271, + "step": 29580 + }, + { + "epoch": 2.3489253606938023, + "grad_norm": 0.8613219857215881, + "learning_rate": 1.0863697705802969e-05, + "loss": 0.7584, + "step": 29590 + }, + { + "epoch": 2.349719184742702, + "grad_norm": 0.8703140616416931, + "learning_rate": 1.0850467042417508e-05, + "loss": 0.7311, + "step": 29600 + }, + { + "epoch": 2.3505130087916015, + "grad_norm": 0.8106393218040466, + "learning_rate": 1.0837236379032045e-05, + "loss": 0.8058, + "step": 29610 + }, + { + "epoch": 2.3513068328405007, + "grad_norm": 1.057108759880066, + "learning_rate": 1.0824005715646584e-05, + "loss": 0.79, + "step": 29620 + }, + { + "epoch": 2.3521006568894003, + "grad_norm": 1.0854650735855103, + "learning_rate": 1.0810775052261121e-05, + "loss": 0.8291, + "step": 29630 + }, + { + "epoch": 2.3528944809383, + "grad_norm": 0.9281707406044006, + "learning_rate": 1.0797544388875658e-05, + "loss": 0.7736, + "step": 29640 + }, + { + "epoch": 2.3536883049871995, + "grad_norm": 0.7807760238647461, + "learning_rate": 1.0784313725490197e-05, + "loss": 0.7909, + "step": 29650 + }, + { + "epoch": 2.354482129036099, + "grad_norm": 0.9227698445320129, + "learning_rate": 1.0771083062104735e-05, + "loss": 0.8431, + "step": 29660 + }, + { + "epoch": 2.3552759530849987, + "grad_norm": 0.8814806938171387, + "learning_rate": 1.0757852398719272e-05, + "loss": 0.8081, + "step": 29670 + }, + { + "epoch": 2.3560697771338983, + "grad_norm": 0.7971763014793396, + "learning_rate": 1.0744621735333809e-05, + "loss": 0.7334, + "step": 29680 + }, + { + "epoch": 2.356863601182798, + "grad_norm": 0.8405244946479797, + "learning_rate": 1.0731391071948348e-05, + "loss": 0.7273, + "step": 29690 + }, + { + "epoch": 2.3576574252316975, + "grad_norm": 1.0113767385482788, + "learning_rate": 1.0718160408562885e-05, + "loss": 0.832, + "step": 29700 + }, + { + "epoch": 2.358451249280597, + "grad_norm": 0.9246639013290405, + "learning_rate": 1.0704929745177424e-05, + "loss": 0.8362, + "step": 29710 + }, + { + "epoch": 2.3592450733294967, + "grad_norm": 1.0259578227996826, + "learning_rate": 1.0691699081791961e-05, + "loss": 0.7198, + "step": 29720 + }, + { + "epoch": 2.360038897378396, + "grad_norm": 0.9511812925338745, + "learning_rate": 1.06784684184065e-05, + "loss": 0.7828, + "step": 29730 + }, + { + "epoch": 2.3608327214272955, + "grad_norm": 0.87998366355896, + "learning_rate": 1.0665237755021038e-05, + "loss": 0.7572, + "step": 29740 + }, + { + "epoch": 2.361626545476195, + "grad_norm": 0.6550692915916443, + "learning_rate": 1.0652007091635575e-05, + "loss": 0.7808, + "step": 29750 + }, + { + "epoch": 2.3624203695250947, + "grad_norm": 0.8000962734222412, + "learning_rate": 1.0638776428250114e-05, + "loss": 0.784, + "step": 29760 + }, + { + "epoch": 2.3632141935739943, + "grad_norm": 0.8203931450843811, + "learning_rate": 1.0625545764864651e-05, + "loss": 0.8397, + "step": 29770 + }, + { + "epoch": 2.364008017622894, + "grad_norm": 0.9608734250068665, + "learning_rate": 1.0612315101479188e-05, + "loss": 0.7318, + "step": 29780 + }, + { + "epoch": 2.3648018416717935, + "grad_norm": 0.9658749103546143, + "learning_rate": 1.0599084438093725e-05, + "loss": 0.8222, + "step": 29790 + }, + { + "epoch": 2.365595665720693, + "grad_norm": 0.9885733127593994, + "learning_rate": 1.0585853774708264e-05, + "loss": 0.7645, + "step": 29800 + }, + { + "epoch": 2.3663894897695927, + "grad_norm": 0.9032514691352844, + "learning_rate": 1.0572623111322802e-05, + "loss": 0.8039, + "step": 29810 + }, + { + "epoch": 2.367183313818492, + "grad_norm": 0.9113055467605591, + "learning_rate": 1.055939244793734e-05, + "loss": 0.8142, + "step": 29820 + }, + { + "epoch": 2.3679771378673915, + "grad_norm": 0.9187504649162292, + "learning_rate": 1.0546161784551878e-05, + "loss": 0.7926, + "step": 29830 + }, + { + "epoch": 2.368770961916291, + "grad_norm": 0.8882543444633484, + "learning_rate": 1.0532931121166417e-05, + "loss": 0.7267, + "step": 29840 + }, + { + "epoch": 2.3695647859651907, + "grad_norm": 0.9198811054229736, + "learning_rate": 1.0519700457780954e-05, + "loss": 0.7839, + "step": 29850 + }, + { + "epoch": 2.3703586100140903, + "grad_norm": 0.8237046003341675, + "learning_rate": 1.0506469794395491e-05, + "loss": 0.7327, + "step": 29860 + }, + { + "epoch": 2.37115243406299, + "grad_norm": 0.9879904985427856, + "learning_rate": 1.049323913101003e-05, + "loss": 0.759, + "step": 29870 + }, + { + "epoch": 2.3719462581118895, + "grad_norm": 0.7641343474388123, + "learning_rate": 1.0480008467624567e-05, + "loss": 0.7876, + "step": 29880 + }, + { + "epoch": 2.372740082160789, + "grad_norm": 0.8497799634933472, + "learning_rate": 1.0466777804239106e-05, + "loss": 0.788, + "step": 29890 + }, + { + "epoch": 2.3735339062096887, + "grad_norm": 0.7273193001747131, + "learning_rate": 1.0453547140853642e-05, + "loss": 0.8189, + "step": 29900 + }, + { + "epoch": 2.3743277302585883, + "grad_norm": 0.9262228012084961, + "learning_rate": 1.044031647746818e-05, + "loss": 0.8058, + "step": 29910 + }, + { + "epoch": 2.375121554307488, + "grad_norm": 0.8716014623641968, + "learning_rate": 1.0427085814082718e-05, + "loss": 0.7813, + "step": 29920 + }, + { + "epoch": 2.375915378356387, + "grad_norm": 1.047127366065979, + "learning_rate": 1.0413855150697257e-05, + "loss": 0.8362, + "step": 29930 + }, + { + "epoch": 2.3767092024052867, + "grad_norm": 1.0029412508010864, + "learning_rate": 1.0400624487311794e-05, + "loss": 0.7892, + "step": 29940 + }, + { + "epoch": 2.3775030264541863, + "grad_norm": 0.8537968397140503, + "learning_rate": 1.0387393823926331e-05, + "loss": 0.7837, + "step": 29950 + }, + { + "epoch": 2.378296850503086, + "grad_norm": 0.8679432272911072, + "learning_rate": 1.037416316054087e-05, + "loss": 0.7785, + "step": 29960 + }, + { + "epoch": 2.3790906745519855, + "grad_norm": 1.063770055770874, + "learning_rate": 1.0360932497155407e-05, + "loss": 0.7029, + "step": 29970 + }, + { + "epoch": 2.379884498600885, + "grad_norm": 0.8927010297775269, + "learning_rate": 1.0347701833769946e-05, + "loss": 0.7595, + "step": 29980 + }, + { + "epoch": 2.3806783226497847, + "grad_norm": 0.8152598738670349, + "learning_rate": 1.0334471170384484e-05, + "loss": 0.8162, + "step": 29990 + }, + { + "epoch": 2.3814721466986843, + "grad_norm": 0.9757214188575745, + "learning_rate": 1.0321240506999023e-05, + "loss": 0.7675, + "step": 30000 + }, + { + "epoch": 2.382265970747584, + "grad_norm": 0.8075728416442871, + "learning_rate": 1.0308009843613558e-05, + "loss": 0.8226, + "step": 30010 + }, + { + "epoch": 2.383059794796483, + "grad_norm": 0.7815035581588745, + "learning_rate": 1.0294779180228097e-05, + "loss": 0.8277, + "step": 30020 + }, + { + "epoch": 2.383853618845383, + "grad_norm": 0.8908788561820984, + "learning_rate": 1.0281548516842634e-05, + "loss": 0.773, + "step": 30030 + }, + { + "epoch": 2.3846474428942823, + "grad_norm": 0.8850157260894775, + "learning_rate": 1.0268317853457173e-05, + "loss": 0.7643, + "step": 30040 + }, + { + "epoch": 2.385441266943182, + "grad_norm": 0.802002489566803, + "learning_rate": 1.025508719007171e-05, + "loss": 0.8312, + "step": 30050 + }, + { + "epoch": 2.3862350909920815, + "grad_norm": 0.9820096492767334, + "learning_rate": 1.0241856526686248e-05, + "loss": 0.8148, + "step": 30060 + }, + { + "epoch": 2.387028915040981, + "grad_norm": 0.934891402721405, + "learning_rate": 1.0228625863300787e-05, + "loss": 0.7546, + "step": 30070 + }, + { + "epoch": 2.3878227390898807, + "grad_norm": 0.9393154978752136, + "learning_rate": 1.0215395199915324e-05, + "loss": 0.7793, + "step": 30080 + }, + { + "epoch": 2.3886165631387803, + "grad_norm": 0.9475399255752563, + "learning_rate": 1.0202164536529863e-05, + "loss": 0.7756, + "step": 30090 + }, + { + "epoch": 2.38941038718768, + "grad_norm": 0.8171314001083374, + "learning_rate": 1.01889338731444e-05, + "loss": 0.78, + "step": 30100 + }, + { + "epoch": 2.3902042112365796, + "grad_norm": 0.85038822889328, + "learning_rate": 1.0175703209758939e-05, + "loss": 0.7777, + "step": 30110 + }, + { + "epoch": 2.390998035285479, + "grad_norm": 0.9316538572311401, + "learning_rate": 1.0162472546373476e-05, + "loss": 0.8058, + "step": 30120 + }, + { + "epoch": 2.3917918593343783, + "grad_norm": 0.7901173233985901, + "learning_rate": 1.0149241882988013e-05, + "loss": 0.7778, + "step": 30130 + }, + { + "epoch": 2.392585683383278, + "grad_norm": 0.907632052898407, + "learning_rate": 1.013601121960255e-05, + "loss": 0.7577, + "step": 30140 + }, + { + "epoch": 2.3933795074321775, + "grad_norm": 0.7853071689605713, + "learning_rate": 1.012278055621709e-05, + "loss": 0.7905, + "step": 30150 + }, + { + "epoch": 2.394173331481077, + "grad_norm": 0.9845629930496216, + "learning_rate": 1.0109549892831627e-05, + "loss": 0.8268, + "step": 30160 + }, + { + "epoch": 2.3949671555299767, + "grad_norm": 0.8937403559684753, + "learning_rate": 1.0096319229446164e-05, + "loss": 0.842, + "step": 30170 + }, + { + "epoch": 2.3957609795788763, + "grad_norm": 0.9141705632209778, + "learning_rate": 1.0083088566060703e-05, + "loss": 0.7376, + "step": 30180 + }, + { + "epoch": 2.396554803627776, + "grad_norm": 0.8283079266548157, + "learning_rate": 1.006985790267524e-05, + "loss": 0.7864, + "step": 30190 + }, + { + "epoch": 2.3973486276766756, + "grad_norm": 1.2408968210220337, + "learning_rate": 1.0056627239289779e-05, + "loss": 0.8002, + "step": 30200 + }, + { + "epoch": 2.398142451725575, + "grad_norm": 0.8985006809234619, + "learning_rate": 1.0043396575904316e-05, + "loss": 0.7865, + "step": 30210 + }, + { + "epoch": 2.3989362757744748, + "grad_norm": 0.8631697297096252, + "learning_rate": 1.0030165912518855e-05, + "loss": 0.8899, + "step": 30220 + }, + { + "epoch": 2.3997300998233744, + "grad_norm": 0.8567438721656799, + "learning_rate": 1.0016935249133392e-05, + "loss": 0.7726, + "step": 30230 + }, + { + "epoch": 2.4005239238722735, + "grad_norm": 0.780764102935791, + "learning_rate": 1.0003704585747931e-05, + "loss": 0.8075, + "step": 30240 + }, + { + "epoch": 2.401317747921173, + "grad_norm": 0.9728577136993408, + "learning_rate": 9.990473922362467e-06, + "loss": 0.7686, + "step": 30250 + }, + { + "epoch": 2.4021115719700727, + "grad_norm": 0.9801949858665466, + "learning_rate": 9.977243258977006e-06, + "loss": 0.7584, + "step": 30260 + }, + { + "epoch": 2.4029053960189724, + "grad_norm": 1.1175134181976318, + "learning_rate": 9.964012595591543e-06, + "loss": 0.7502, + "step": 30270 + }, + { + "epoch": 2.403699220067872, + "grad_norm": 1.078339695930481, + "learning_rate": 9.95078193220608e-06, + "loss": 0.8411, + "step": 30280 + }, + { + "epoch": 2.4044930441167716, + "grad_norm": 1.1007136106491089, + "learning_rate": 9.93755126882062e-06, + "loss": 0.8423, + "step": 30290 + }, + { + "epoch": 2.405286868165671, + "grad_norm": 0.8614094853401184, + "learning_rate": 9.924320605435157e-06, + "loss": 0.8043, + "step": 30300 + }, + { + "epoch": 2.4060806922145708, + "grad_norm": 0.723034679889679, + "learning_rate": 9.911089942049695e-06, + "loss": 0.822, + "step": 30310 + }, + { + "epoch": 2.4068745162634704, + "grad_norm": 0.899814784526825, + "learning_rate": 9.897859278664233e-06, + "loss": 0.8256, + "step": 30320 + }, + { + "epoch": 2.4076683403123695, + "grad_norm": 0.8916921615600586, + "learning_rate": 9.884628615278772e-06, + "loss": 0.8253, + "step": 30330 + }, + { + "epoch": 2.408462164361269, + "grad_norm": 0.8144512176513672, + "learning_rate": 9.871397951893309e-06, + "loss": 0.8021, + "step": 30340 + }, + { + "epoch": 2.4092559884101687, + "grad_norm": 0.9181259274482727, + "learning_rate": 9.858167288507846e-06, + "loss": 0.8385, + "step": 30350 + }, + { + "epoch": 2.4100498124590684, + "grad_norm": 0.9962033629417419, + "learning_rate": 9.844936625122383e-06, + "loss": 0.8356, + "step": 30360 + }, + { + "epoch": 2.410843636507968, + "grad_norm": 1.0186583995819092, + "learning_rate": 9.831705961736922e-06, + "loss": 0.7458, + "step": 30370 + }, + { + "epoch": 2.4116374605568676, + "grad_norm": 0.9851090312004089, + "learning_rate": 9.81847529835146e-06, + "loss": 0.8241, + "step": 30380 + }, + { + "epoch": 2.412431284605767, + "grad_norm": 0.8016131520271301, + "learning_rate": 9.805244634965997e-06, + "loss": 0.7743, + "step": 30390 + }, + { + "epoch": 2.4132251086546668, + "grad_norm": 0.937201976776123, + "learning_rate": 9.792013971580536e-06, + "loss": 0.7902, + "step": 30400 + }, + { + "epoch": 2.4140189327035664, + "grad_norm": 0.6954747438430786, + "learning_rate": 9.778783308195073e-06, + "loss": 0.8559, + "step": 30410 + }, + { + "epoch": 2.414812756752466, + "grad_norm": 0.8315379023551941, + "learning_rate": 9.765552644809612e-06, + "loss": 0.7645, + "step": 30420 + }, + { + "epoch": 2.4156065808013656, + "grad_norm": 0.86857008934021, + "learning_rate": 9.752321981424149e-06, + "loss": 0.7826, + "step": 30430 + }, + { + "epoch": 2.4164004048502647, + "grad_norm": 0.9260841012001038, + "learning_rate": 9.739091318038688e-06, + "loss": 0.8496, + "step": 30440 + }, + { + "epoch": 2.4171942288991644, + "grad_norm": 1.021768569946289, + "learning_rate": 9.725860654653225e-06, + "loss": 0.7887, + "step": 30450 + }, + { + "epoch": 2.417988052948064, + "grad_norm": 0.9660463929176331, + "learning_rate": 9.712629991267762e-06, + "loss": 0.7776, + "step": 30460 + }, + { + "epoch": 2.4187818769969636, + "grad_norm": 0.7337642908096313, + "learning_rate": 9.699399327882301e-06, + "loss": 0.786, + "step": 30470 + }, + { + "epoch": 2.419575701045863, + "grad_norm": 1.0514928102493286, + "learning_rate": 9.686168664496839e-06, + "loss": 0.7785, + "step": 30480 + }, + { + "epoch": 2.4203695250947628, + "grad_norm": 1.1055537462234497, + "learning_rate": 9.672938001111376e-06, + "loss": 0.7879, + "step": 30490 + }, + { + "epoch": 2.4211633491436624, + "grad_norm": 0.9621986746788025, + "learning_rate": 9.659707337725913e-06, + "loss": 0.7828, + "step": 30500 + }, + { + "epoch": 2.421957173192562, + "grad_norm": 0.9570415019989014, + "learning_rate": 9.646476674340452e-06, + "loss": 0.7945, + "step": 30510 + }, + { + "epoch": 2.4227509972414616, + "grad_norm": 0.831770122051239, + "learning_rate": 9.63324601095499e-06, + "loss": 0.772, + "step": 30520 + }, + { + "epoch": 2.4235448212903608, + "grad_norm": 0.7580466270446777, + "learning_rate": 9.620015347569528e-06, + "loss": 0.8456, + "step": 30530 + }, + { + "epoch": 2.424338645339261, + "grad_norm": 1.027631163597107, + "learning_rate": 9.606784684184065e-06, + "loss": 0.7945, + "step": 30540 + }, + { + "epoch": 2.42513246938816, + "grad_norm": 1.0360970497131348, + "learning_rate": 9.593554020798604e-06, + "loss": 0.8, + "step": 30550 + }, + { + "epoch": 2.4259262934370596, + "grad_norm": 0.9129590392112732, + "learning_rate": 9.580323357413142e-06, + "loss": 0.8057, + "step": 30560 + }, + { + "epoch": 2.426720117485959, + "grad_norm": 0.7934035658836365, + "learning_rate": 9.567092694027679e-06, + "loss": 0.8296, + "step": 30570 + }, + { + "epoch": 2.4275139415348588, + "grad_norm": 0.9812830090522766, + "learning_rate": 9.553862030642218e-06, + "loss": 0.7877, + "step": 30580 + }, + { + "epoch": 2.4283077655837584, + "grad_norm": 0.9974318146705627, + "learning_rate": 9.540631367256755e-06, + "loss": 0.7479, + "step": 30590 + }, + { + "epoch": 2.429101589632658, + "grad_norm": 0.9436841011047363, + "learning_rate": 9.527400703871292e-06, + "loss": 0.8279, + "step": 30600 + }, + { + "epoch": 2.4298954136815576, + "grad_norm": 0.9886923432350159, + "learning_rate": 9.51417004048583e-06, + "loss": 0.7854, + "step": 30610 + }, + { + "epoch": 2.430689237730457, + "grad_norm": 0.9457038044929504, + "learning_rate": 9.500939377100368e-06, + "loss": 0.827, + "step": 30620 + }, + { + "epoch": 2.431483061779357, + "grad_norm": 0.93290776014328, + "learning_rate": 9.487708713714906e-06, + "loss": 0.7439, + "step": 30630 + }, + { + "epoch": 2.432276885828256, + "grad_norm": 0.9726458787918091, + "learning_rate": 9.474478050329444e-06, + "loss": 0.7855, + "step": 30640 + }, + { + "epoch": 2.4330707098771556, + "grad_norm": 1.1130033731460571, + "learning_rate": 9.461247386943982e-06, + "loss": 0.7976, + "step": 30650 + }, + { + "epoch": 2.433864533926055, + "grad_norm": 0.7888174653053284, + "learning_rate": 9.44801672355852e-06, + "loss": 0.7857, + "step": 30660 + }, + { + "epoch": 2.434658357974955, + "grad_norm": 0.9603310227394104, + "learning_rate": 9.434786060173058e-06, + "loss": 0.7558, + "step": 30670 + }, + { + "epoch": 2.4354521820238544, + "grad_norm": 0.9045382738113403, + "learning_rate": 9.421555396787595e-06, + "loss": 0.7363, + "step": 30680 + }, + { + "epoch": 2.436246006072754, + "grad_norm": 1.1511313915252686, + "learning_rate": 9.408324733402134e-06, + "loss": 0.8566, + "step": 30690 + }, + { + "epoch": 2.4370398301216536, + "grad_norm": 0.9795113205909729, + "learning_rate": 9.395094070016671e-06, + "loss": 0.7888, + "step": 30700 + }, + { + "epoch": 2.437833654170553, + "grad_norm": 1.0283374786376953, + "learning_rate": 9.381863406631208e-06, + "loss": 0.8044, + "step": 30710 + }, + { + "epoch": 2.438627478219453, + "grad_norm": 1.023207664489746, + "learning_rate": 9.368632743245746e-06, + "loss": 0.818, + "step": 30720 + }, + { + "epoch": 2.4394213022683524, + "grad_norm": 0.7632606029510498, + "learning_rate": 9.355402079860285e-06, + "loss": 0.8481, + "step": 30730 + }, + { + "epoch": 2.440215126317252, + "grad_norm": 0.8076527118682861, + "learning_rate": 9.342171416474822e-06, + "loss": 0.772, + "step": 30740 + }, + { + "epoch": 2.441008950366151, + "grad_norm": 0.9960797429084778, + "learning_rate": 9.32894075308936e-06, + "loss": 0.7411, + "step": 30750 + }, + { + "epoch": 2.441802774415051, + "grad_norm": 0.8365696668624878, + "learning_rate": 9.315710089703898e-06, + "loss": 0.804, + "step": 30760 + }, + { + "epoch": 2.4425965984639504, + "grad_norm": 0.9081131815910339, + "learning_rate": 9.302479426318437e-06, + "loss": 0.797, + "step": 30770 + }, + { + "epoch": 2.44339042251285, + "grad_norm": 0.9612648487091064, + "learning_rate": 9.289248762932974e-06, + "loss": 0.7829, + "step": 30780 + }, + { + "epoch": 2.4441842465617496, + "grad_norm": 0.8103266954421997, + "learning_rate": 9.276018099547511e-06, + "loss": 0.8183, + "step": 30790 + }, + { + "epoch": 2.444978070610649, + "grad_norm": 0.8949236273765564, + "learning_rate": 9.26278743616205e-06, + "loss": 0.7561, + "step": 30800 + }, + { + "epoch": 2.445771894659549, + "grad_norm": 0.8719674348831177, + "learning_rate": 9.249556772776588e-06, + "loss": 0.7777, + "step": 30810 + }, + { + "epoch": 2.4465657187084484, + "grad_norm": 0.959321916103363, + "learning_rate": 9.236326109391125e-06, + "loss": 0.7687, + "step": 30820 + }, + { + "epoch": 2.447359542757348, + "grad_norm": 0.8504953980445862, + "learning_rate": 9.223095446005662e-06, + "loss": 0.7785, + "step": 30830 + }, + { + "epoch": 2.448153366806247, + "grad_norm": 1.073606014251709, + "learning_rate": 9.209864782620201e-06, + "loss": 0.8652, + "step": 30840 + }, + { + "epoch": 2.448947190855147, + "grad_norm": 1.1981532573699951, + "learning_rate": 9.196634119234738e-06, + "loss": 0.7746, + "step": 30850 + }, + { + "epoch": 2.4497410149040464, + "grad_norm": 0.819255530834198, + "learning_rate": 9.183403455849277e-06, + "loss": 0.8058, + "step": 30860 + }, + { + "epoch": 2.450534838952946, + "grad_norm": 1.0335006713867188, + "learning_rate": 9.170172792463814e-06, + "loss": 0.8036, + "step": 30870 + }, + { + "epoch": 2.4513286630018456, + "grad_norm": 0.7458271980285645, + "learning_rate": 9.156942129078353e-06, + "loss": 0.7471, + "step": 30880 + }, + { + "epoch": 2.452122487050745, + "grad_norm": 0.7569029331207275, + "learning_rate": 9.14371146569289e-06, + "loss": 0.768, + "step": 30890 + }, + { + "epoch": 2.452916311099645, + "grad_norm": 1.0082522630691528, + "learning_rate": 9.130480802307428e-06, + "loss": 0.7802, + "step": 30900 + }, + { + "epoch": 2.4537101351485444, + "grad_norm": 1.024379014968872, + "learning_rate": 9.117250138921967e-06, + "loss": 0.7639, + "step": 30910 + }, + { + "epoch": 2.454503959197444, + "grad_norm": 1.1425983905792236, + "learning_rate": 9.104019475536504e-06, + "loss": 0.7593, + "step": 30920 + }, + { + "epoch": 2.4552977832463436, + "grad_norm": 0.9849503040313721, + "learning_rate": 9.090788812151043e-06, + "loss": 0.8317, + "step": 30930 + }, + { + "epoch": 2.4560916072952432, + "grad_norm": 1.0086652040481567, + "learning_rate": 9.077558148765578e-06, + "loss": 0.7548, + "step": 30940 + }, + { + "epoch": 2.4568854313441424, + "grad_norm": 0.8764373660087585, + "learning_rate": 9.064327485380117e-06, + "loss": 0.7411, + "step": 30950 + }, + { + "epoch": 2.457679255393042, + "grad_norm": 1.1179012060165405, + "learning_rate": 9.051096821994655e-06, + "loss": 0.734, + "step": 30960 + }, + { + "epoch": 2.4584730794419416, + "grad_norm": 1.0787594318389893, + "learning_rate": 9.037866158609193e-06, + "loss": 0.8439, + "step": 30970 + }, + { + "epoch": 2.459266903490841, + "grad_norm": 0.9047144651412964, + "learning_rate": 9.02463549522373e-06, + "loss": 0.7876, + "step": 30980 + }, + { + "epoch": 2.460060727539741, + "grad_norm": 0.771873414516449, + "learning_rate": 9.01140483183827e-06, + "loss": 0.8258, + "step": 30990 + }, + { + "epoch": 2.4608545515886404, + "grad_norm": 0.8397606611251831, + "learning_rate": 8.998174168452807e-06, + "loss": 0.8014, + "step": 31000 + }, + { + "epoch": 2.46164837563754, + "grad_norm": 1.1035096645355225, + "learning_rate": 8.984943505067344e-06, + "loss": 0.7098, + "step": 31010 + }, + { + "epoch": 2.4624421996864396, + "grad_norm": 0.8682432770729065, + "learning_rate": 8.971712841681883e-06, + "loss": 0.7, + "step": 31020 + }, + { + "epoch": 2.4632360237353392, + "grad_norm": 0.9221312403678894, + "learning_rate": 8.95848217829642e-06, + "loss": 0.8001, + "step": 31030 + }, + { + "epoch": 2.4640298477842384, + "grad_norm": 1.064479947090149, + "learning_rate": 8.94525151491096e-06, + "loss": 0.727, + "step": 31040 + }, + { + "epoch": 2.464823671833138, + "grad_norm": 1.1401010751724243, + "learning_rate": 8.932020851525495e-06, + "loss": 0.7556, + "step": 31050 + }, + { + "epoch": 2.4656174958820376, + "grad_norm": 0.9812748432159424, + "learning_rate": 8.918790188140034e-06, + "loss": 0.7788, + "step": 31060 + }, + { + "epoch": 2.466411319930937, + "grad_norm": 0.8414878845214844, + "learning_rate": 8.905559524754571e-06, + "loss": 0.751, + "step": 31070 + }, + { + "epoch": 2.467205143979837, + "grad_norm": 0.9719010591506958, + "learning_rate": 8.89232886136911e-06, + "loss": 0.7413, + "step": 31080 + }, + { + "epoch": 2.4679989680287364, + "grad_norm": 0.8958083987236023, + "learning_rate": 8.879098197983647e-06, + "loss": 0.7575, + "step": 31090 + }, + { + "epoch": 2.468792792077636, + "grad_norm": 0.94130939245224, + "learning_rate": 8.865867534598184e-06, + "loss": 0.7973, + "step": 31100 + }, + { + "epoch": 2.4695866161265356, + "grad_norm": 0.8929863572120667, + "learning_rate": 8.852636871212723e-06, + "loss": 0.8096, + "step": 31110 + }, + { + "epoch": 2.4703804401754352, + "grad_norm": 0.9234357476234436, + "learning_rate": 8.83940620782726e-06, + "loss": 0.8093, + "step": 31120 + }, + { + "epoch": 2.471174264224335, + "grad_norm": 0.8647427558898926, + "learning_rate": 8.8261755444418e-06, + "loss": 0.7498, + "step": 31130 + }, + { + "epoch": 2.4719680882732344, + "grad_norm": 0.9515289068222046, + "learning_rate": 8.812944881056337e-06, + "loss": 0.814, + "step": 31140 + }, + { + "epoch": 2.4727619123221336, + "grad_norm": 1.1594178676605225, + "learning_rate": 8.799714217670876e-06, + "loss": 0.8102, + "step": 31150 + }, + { + "epoch": 2.473555736371033, + "grad_norm": 0.7505339980125427, + "learning_rate": 8.786483554285413e-06, + "loss": 0.8073, + "step": 31160 + }, + { + "epoch": 2.474349560419933, + "grad_norm": 1.0167568922042847, + "learning_rate": 8.77325289089995e-06, + "loss": 0.7955, + "step": 31170 + }, + { + "epoch": 2.4751433844688324, + "grad_norm": 0.8681299090385437, + "learning_rate": 8.760022227514487e-06, + "loss": 0.8455, + "step": 31180 + }, + { + "epoch": 2.475937208517732, + "grad_norm": 0.9660631418228149, + "learning_rate": 8.746791564129026e-06, + "loss": 0.7721, + "step": 31190 + }, + { + "epoch": 2.4767310325666316, + "grad_norm": 1.0215500593185425, + "learning_rate": 8.733560900743563e-06, + "loss": 0.7798, + "step": 31200 + }, + { + "epoch": 2.4775248566155312, + "grad_norm": 1.1119939088821411, + "learning_rate": 8.7203302373581e-06, + "loss": 0.7486, + "step": 31210 + }, + { + "epoch": 2.478318680664431, + "grad_norm": 0.8434147238731384, + "learning_rate": 8.70709957397264e-06, + "loss": 0.7727, + "step": 31220 + }, + { + "epoch": 2.4791125047133304, + "grad_norm": 0.9059457778930664, + "learning_rate": 8.693868910587177e-06, + "loss": 0.7878, + "step": 31230 + }, + { + "epoch": 2.4799063287622296, + "grad_norm": 0.9416773319244385, + "learning_rate": 8.680638247201716e-06, + "loss": 0.7827, + "step": 31240 + }, + { + "epoch": 2.4807001528111297, + "grad_norm": 1.0320826768875122, + "learning_rate": 8.667407583816253e-06, + "loss": 0.803, + "step": 31250 + }, + { + "epoch": 2.481493976860029, + "grad_norm": 1.0947669744491577, + "learning_rate": 8.654176920430792e-06, + "loss": 0.7813, + "step": 31260 + }, + { + "epoch": 2.4822878009089284, + "grad_norm": 0.9328486919403076, + "learning_rate": 8.640946257045329e-06, + "loss": 0.7644, + "step": 31270 + }, + { + "epoch": 2.483081624957828, + "grad_norm": 1.0494132041931152, + "learning_rate": 8.627715593659868e-06, + "loss": 0.8421, + "step": 31280 + }, + { + "epoch": 2.4838754490067276, + "grad_norm": 0.7501383423805237, + "learning_rate": 8.614484930274404e-06, + "loss": 0.7802, + "step": 31290 + }, + { + "epoch": 2.4846692730556272, + "grad_norm": 0.9962626099586487, + "learning_rate": 8.601254266888942e-06, + "loss": 0.7712, + "step": 31300 + }, + { + "epoch": 2.485463097104527, + "grad_norm": 0.8576439023017883, + "learning_rate": 8.58802360350348e-06, + "loss": 0.8233, + "step": 31310 + }, + { + "epoch": 2.4862569211534264, + "grad_norm": 1.1776164770126343, + "learning_rate": 8.574792940118017e-06, + "loss": 0.7753, + "step": 31320 + }, + { + "epoch": 2.487050745202326, + "grad_norm": 0.8087422847747803, + "learning_rate": 8.561562276732556e-06, + "loss": 0.7315, + "step": 31330 + }, + { + "epoch": 2.4878445692512257, + "grad_norm": 0.8869718909263611, + "learning_rate": 8.548331613347093e-06, + "loss": 0.7489, + "step": 31340 + }, + { + "epoch": 2.488638393300125, + "grad_norm": 0.9648873805999756, + "learning_rate": 8.535100949961632e-06, + "loss": 0.7823, + "step": 31350 + }, + { + "epoch": 2.4894322173490244, + "grad_norm": 0.9656558632850647, + "learning_rate": 8.52187028657617e-06, + "loss": 0.7554, + "step": 31360 + }, + { + "epoch": 2.490226041397924, + "grad_norm": 1.0111730098724365, + "learning_rate": 8.508639623190708e-06, + "loss": 0.7656, + "step": 31370 + }, + { + "epoch": 2.4910198654468236, + "grad_norm": 0.9099489450454712, + "learning_rate": 8.495408959805245e-06, + "loss": 0.8378, + "step": 31380 + }, + { + "epoch": 2.4918136894957232, + "grad_norm": 0.9399808645248413, + "learning_rate": 8.482178296419784e-06, + "loss": 0.8008, + "step": 31390 + }, + { + "epoch": 2.492607513544623, + "grad_norm": 0.6809582710266113, + "learning_rate": 8.46894763303432e-06, + "loss": 0.8124, + "step": 31400 + }, + { + "epoch": 2.4934013375935224, + "grad_norm": 0.8552061915397644, + "learning_rate": 8.455716969648859e-06, + "loss": 0.7942, + "step": 31410 + }, + { + "epoch": 2.494195161642422, + "grad_norm": 0.8102262616157532, + "learning_rate": 8.442486306263396e-06, + "loss": 0.7476, + "step": 31420 + }, + { + "epoch": 2.4949889856913217, + "grad_norm": 0.8236813545227051, + "learning_rate": 8.429255642877933e-06, + "loss": 0.8093, + "step": 31430 + }, + { + "epoch": 2.4957828097402213, + "grad_norm": 0.8333600163459778, + "learning_rate": 8.416024979492472e-06, + "loss": 0.7869, + "step": 31440 + }, + { + "epoch": 2.496576633789121, + "grad_norm": 1.0917775630950928, + "learning_rate": 8.40279431610701e-06, + "loss": 0.7584, + "step": 31450 + }, + { + "epoch": 2.49737045783802, + "grad_norm": 1.0433661937713623, + "learning_rate": 8.389563652721548e-06, + "loss": 0.8002, + "step": 31460 + }, + { + "epoch": 2.4981642818869196, + "grad_norm": 0.8462589979171753, + "learning_rate": 8.376332989336086e-06, + "loss": 0.7642, + "step": 31470 + }, + { + "epoch": 2.4989581059358192, + "grad_norm": 0.8918420076370239, + "learning_rate": 8.363102325950625e-06, + "loss": 0.7486, + "step": 31480 + }, + { + "epoch": 2.499751929984719, + "grad_norm": 0.9466501474380493, + "learning_rate": 8.349871662565162e-06, + "loss": 0.8042, + "step": 31490 + }, + { + "epoch": 2.5005457540336185, + "grad_norm": 0.9120933413505554, + "learning_rate": 8.336640999179699e-06, + "loss": 0.761, + "step": 31500 + }, + { + "epoch": 2.501339578082518, + "grad_norm": 0.8320806622505188, + "learning_rate": 8.323410335794238e-06, + "loss": 0.7678, + "step": 31510 + }, + { + "epoch": 2.5021334021314177, + "grad_norm": 0.9464184641838074, + "learning_rate": 8.310179672408775e-06, + "loss": 0.728, + "step": 31520 + }, + { + "epoch": 2.5029272261803173, + "grad_norm": 0.831498384475708, + "learning_rate": 8.296949009023312e-06, + "loss": 0.7438, + "step": 31530 + }, + { + "epoch": 2.503721050229217, + "grad_norm": 0.7958276271820068, + "learning_rate": 8.28371834563785e-06, + "loss": 0.7579, + "step": 31540 + }, + { + "epoch": 2.504514874278116, + "grad_norm": 0.859502375125885, + "learning_rate": 8.270487682252389e-06, + "loss": 0.8299, + "step": 31550 + }, + { + "epoch": 2.505308698327016, + "grad_norm": 0.9718039035797119, + "learning_rate": 8.257257018866926e-06, + "loss": 0.7772, + "step": 31560 + }, + { + "epoch": 2.5061025223759152, + "grad_norm": 0.8938490152359009, + "learning_rate": 8.244026355481465e-06, + "loss": 0.823, + "step": 31570 + }, + { + "epoch": 2.506896346424815, + "grad_norm": 0.9374427199363708, + "learning_rate": 8.230795692096002e-06, + "loss": 0.8246, + "step": 31580 + }, + { + "epoch": 2.5076901704737145, + "grad_norm": 0.8214637637138367, + "learning_rate": 8.217565028710541e-06, + "loss": 0.7747, + "step": 31590 + }, + { + "epoch": 2.508483994522614, + "grad_norm": 1.0425516366958618, + "learning_rate": 8.204334365325078e-06, + "loss": 0.7211, + "step": 31600 + }, + { + "epoch": 2.5092778185715137, + "grad_norm": 0.8850832581520081, + "learning_rate": 8.191103701939615e-06, + "loss": 0.7707, + "step": 31610 + }, + { + "epoch": 2.5100716426204133, + "grad_norm": 0.7210223078727722, + "learning_rate": 8.177873038554154e-06, + "loss": 0.7833, + "step": 31620 + }, + { + "epoch": 2.510865466669313, + "grad_norm": 0.9423604011535645, + "learning_rate": 8.164642375168692e-06, + "loss": 0.7697, + "step": 31630 + }, + { + "epoch": 2.511659290718212, + "grad_norm": 0.937473475933075, + "learning_rate": 8.151411711783229e-06, + "loss": 0.7924, + "step": 31640 + }, + { + "epoch": 2.512453114767112, + "grad_norm": 1.0225754976272583, + "learning_rate": 8.138181048397766e-06, + "loss": 0.8051, + "step": 31650 + }, + { + "epoch": 2.5132469388160112, + "grad_norm": 0.9610739350318909, + "learning_rate": 8.124950385012305e-06, + "loss": 0.7844, + "step": 31660 + }, + { + "epoch": 2.514040762864911, + "grad_norm": 0.6740348935127258, + "learning_rate": 8.111719721626842e-06, + "loss": 0.7502, + "step": 31670 + }, + { + "epoch": 2.5148345869138105, + "grad_norm": 0.9693344235420227, + "learning_rate": 8.098489058241381e-06, + "loss": 0.7821, + "step": 31680 + }, + { + "epoch": 2.51562841096271, + "grad_norm": 1.042784333229065, + "learning_rate": 8.085258394855918e-06, + "loss": 0.8042, + "step": 31690 + }, + { + "epoch": 2.5164222350116097, + "grad_norm": 0.726180911064148, + "learning_rate": 8.072027731470457e-06, + "loss": 0.7687, + "step": 31700 + }, + { + "epoch": 2.5172160590605093, + "grad_norm": 0.8935758471488953, + "learning_rate": 8.058797068084994e-06, + "loss": 0.7846, + "step": 31710 + }, + { + "epoch": 2.518009883109409, + "grad_norm": 0.884282648563385, + "learning_rate": 8.045566404699532e-06, + "loss": 0.7613, + "step": 31720 + }, + { + "epoch": 2.5188037071583085, + "grad_norm": 0.8129022717475891, + "learning_rate": 8.03233574131407e-06, + "loss": 0.8585, + "step": 31730 + }, + { + "epoch": 2.519597531207208, + "grad_norm": 0.9726604223251343, + "learning_rate": 8.019105077928608e-06, + "loss": 0.7648, + "step": 31740 + }, + { + "epoch": 2.5203913552561072, + "grad_norm": 0.7345698475837708, + "learning_rate": 8.005874414543145e-06, + "loss": 0.8031, + "step": 31750 + }, + { + "epoch": 2.5211851793050073, + "grad_norm": 0.9011138081550598, + "learning_rate": 7.992643751157682e-06, + "loss": 0.7917, + "step": 31760 + }, + { + "epoch": 2.5219790033539065, + "grad_norm": 0.9308391213417053, + "learning_rate": 7.979413087772221e-06, + "loss": 0.8107, + "step": 31770 + }, + { + "epoch": 2.522772827402806, + "grad_norm": 0.9620044827461243, + "learning_rate": 7.966182424386758e-06, + "loss": 0.7515, + "step": 31780 + }, + { + "epoch": 2.5235666514517057, + "grad_norm": 0.7784889936447144, + "learning_rate": 7.952951761001297e-06, + "loss": 0.7873, + "step": 31790 + }, + { + "epoch": 2.5243604755006053, + "grad_norm": 0.9141636490821838, + "learning_rate": 7.939721097615835e-06, + "loss": 0.8912, + "step": 31800 + }, + { + "epoch": 2.525154299549505, + "grad_norm": 0.7752735614776611, + "learning_rate": 7.926490434230374e-06, + "loss": 0.7585, + "step": 31810 + }, + { + "epoch": 2.5259481235984045, + "grad_norm": 0.7791392803192139, + "learning_rate": 7.91325977084491e-06, + "loss": 0.8186, + "step": 31820 + }, + { + "epoch": 2.526741947647304, + "grad_norm": 1.1256967782974243, + "learning_rate": 7.900029107459448e-06, + "loss": 0.7574, + "step": 31830 + }, + { + "epoch": 2.5275357716962037, + "grad_norm": 1.1258056163787842, + "learning_rate": 7.886798444073987e-06, + "loss": 0.7826, + "step": 31840 + }, + { + "epoch": 2.5283295957451033, + "grad_norm": 1.118212103843689, + "learning_rate": 7.873567780688524e-06, + "loss": 0.8267, + "step": 31850 + }, + { + "epoch": 2.5291234197940025, + "grad_norm": 1.0277642011642456, + "learning_rate": 7.860337117303061e-06, + "loss": 0.7603, + "step": 31860 + }, + { + "epoch": 2.529917243842902, + "grad_norm": 0.9534600377082825, + "learning_rate": 7.847106453917599e-06, + "loss": 0.7872, + "step": 31870 + }, + { + "epoch": 2.5307110678918017, + "grad_norm": 0.9541226625442505, + "learning_rate": 7.833875790532138e-06, + "loss": 0.7756, + "step": 31880 + }, + { + "epoch": 2.5315048919407013, + "grad_norm": 0.8130003213882446, + "learning_rate": 7.820645127146675e-06, + "loss": 0.8042, + "step": 31890 + }, + { + "epoch": 2.532298715989601, + "grad_norm": 1.0782946348190308, + "learning_rate": 7.807414463761214e-06, + "loss": 0.7568, + "step": 31900 + }, + { + "epoch": 2.5330925400385005, + "grad_norm": 1.0216178894042969, + "learning_rate": 7.794183800375751e-06, + "loss": 0.7805, + "step": 31910 + }, + { + "epoch": 2.5338863640874, + "grad_norm": 1.0545295476913452, + "learning_rate": 7.78095313699029e-06, + "loss": 0.7286, + "step": 31920 + }, + { + "epoch": 2.5346801881362997, + "grad_norm": 0.8440236449241638, + "learning_rate": 7.767722473604827e-06, + "loss": 0.731, + "step": 31930 + }, + { + "epoch": 2.5354740121851993, + "grad_norm": 1.0648548603057861, + "learning_rate": 7.754491810219364e-06, + "loss": 0.8121, + "step": 31940 + }, + { + "epoch": 2.5362678362340985, + "grad_norm": 1.0364627838134766, + "learning_rate": 7.741261146833903e-06, + "loss": 0.7799, + "step": 31950 + }, + { + "epoch": 2.5370616602829985, + "grad_norm": 1.0399836301803589, + "learning_rate": 7.72803048344844e-06, + "loss": 0.8083, + "step": 31960 + }, + { + "epoch": 2.5378554843318977, + "grad_norm": 0.8876070380210876, + "learning_rate": 7.71479982006298e-06, + "loss": 0.8043, + "step": 31970 + }, + { + "epoch": 2.5386493083807973, + "grad_norm": 0.9416910409927368, + "learning_rate": 7.701569156677515e-06, + "loss": 0.7913, + "step": 31980 + }, + { + "epoch": 2.539443132429697, + "grad_norm": 0.7986873984336853, + "learning_rate": 7.688338493292054e-06, + "loss": 0.7693, + "step": 31990 + }, + { + "epoch": 2.5402369564785965, + "grad_norm": 1.0350062847137451, + "learning_rate": 7.675107829906591e-06, + "loss": 0.7858, + "step": 32000 + }, + { + "epoch": 2.541030780527496, + "grad_norm": 0.820899486541748, + "learning_rate": 7.66187716652113e-06, + "loss": 0.781, + "step": 32010 + }, + { + "epoch": 2.5418246045763957, + "grad_norm": 0.9244166612625122, + "learning_rate": 7.648646503135667e-06, + "loss": 0.7373, + "step": 32020 + }, + { + "epoch": 2.5426184286252953, + "grad_norm": 0.9774866104125977, + "learning_rate": 7.635415839750206e-06, + "loss": 0.7804, + "step": 32030 + }, + { + "epoch": 2.543412252674195, + "grad_norm": 0.9632622599601746, + "learning_rate": 7.6221851763647435e-06, + "loss": 0.8138, + "step": 32040 + }, + { + "epoch": 2.5442060767230945, + "grad_norm": 0.9499511122703552, + "learning_rate": 7.6089545129792815e-06, + "loss": 0.8123, + "step": 32050 + }, + { + "epoch": 2.5449999007719937, + "grad_norm": 0.9002213478088379, + "learning_rate": 7.59572384959382e-06, + "loss": 0.7807, + "step": 32060 + }, + { + "epoch": 2.5457937248208937, + "grad_norm": 0.965908944606781, + "learning_rate": 7.582493186208358e-06, + "loss": 0.8199, + "step": 32070 + }, + { + "epoch": 2.546587548869793, + "grad_norm": 0.9211716055870056, + "learning_rate": 7.569262522822895e-06, + "loss": 0.6892, + "step": 32080 + }, + { + "epoch": 2.5473813729186925, + "grad_norm": 1.01425039768219, + "learning_rate": 7.556031859437433e-06, + "loss": 0.7395, + "step": 32090 + }, + { + "epoch": 2.548175196967592, + "grad_norm": 0.9339504241943359, + "learning_rate": 7.54280119605197e-06, + "loss": 0.7235, + "step": 32100 + }, + { + "epoch": 2.5489690210164917, + "grad_norm": 1.2052608728408813, + "learning_rate": 7.5295705326665075e-06, + "loss": 0.7632, + "step": 32110 + }, + { + "epoch": 2.5497628450653913, + "grad_norm": 0.9481042623519897, + "learning_rate": 7.5163398692810456e-06, + "loss": 0.7757, + "step": 32120 + }, + { + "epoch": 2.550556669114291, + "grad_norm": 1.060491681098938, + "learning_rate": 7.503109205895584e-06, + "loss": 0.8118, + "step": 32130 + }, + { + "epoch": 2.5513504931631905, + "grad_norm": 0.9081485867500305, + "learning_rate": 7.489878542510122e-06, + "loss": 0.7584, + "step": 32140 + }, + { + "epoch": 2.5521443172120897, + "grad_norm": 0.8219790458679199, + "learning_rate": 7.47664787912466e-06, + "loss": 0.8306, + "step": 32150 + }, + { + "epoch": 2.5529381412609897, + "grad_norm": 0.9022849202156067, + "learning_rate": 7.463417215739198e-06, + "loss": 0.7768, + "step": 32160 + }, + { + "epoch": 2.553731965309889, + "grad_norm": 1.0158053636550903, + "learning_rate": 7.450186552353736e-06, + "loss": 0.7782, + "step": 32170 + }, + { + "epoch": 2.5545257893587885, + "grad_norm": 0.8737016916275024, + "learning_rate": 7.436955888968273e-06, + "loss": 0.744, + "step": 32180 + }, + { + "epoch": 2.555319613407688, + "grad_norm": 0.9380881190299988, + "learning_rate": 7.423725225582811e-06, + "loss": 0.7312, + "step": 32190 + }, + { + "epoch": 2.5561134374565877, + "grad_norm": 0.9312672019004822, + "learning_rate": 7.410494562197349e-06, + "loss": 0.7955, + "step": 32200 + }, + { + "epoch": 2.5569072615054873, + "grad_norm": 1.055057168006897, + "learning_rate": 7.397263898811886e-06, + "loss": 0.7535, + "step": 32210 + }, + { + "epoch": 2.557701085554387, + "grad_norm": 0.9864413142204285, + "learning_rate": 7.384033235426424e-06, + "loss": 0.8113, + "step": 32220 + }, + { + "epoch": 2.5584949096032865, + "grad_norm": 0.8412654995918274, + "learning_rate": 7.370802572040962e-06, + "loss": 0.77, + "step": 32230 + }, + { + "epoch": 2.559288733652186, + "grad_norm": 0.8634358048439026, + "learning_rate": 7.3575719086555e-06, + "loss": 0.7868, + "step": 32240 + }, + { + "epoch": 2.5600825577010857, + "grad_norm": 1.081010103225708, + "learning_rate": 7.344341245270038e-06, + "loss": 0.8323, + "step": 32250 + }, + { + "epoch": 2.560876381749985, + "grad_norm": 0.9374126195907593, + "learning_rate": 7.331110581884576e-06, + "loss": 0.78, + "step": 32260 + }, + { + "epoch": 2.561670205798885, + "grad_norm": 1.0620479583740234, + "learning_rate": 7.317879918499114e-06, + "loss": 0.7825, + "step": 32270 + }, + { + "epoch": 2.562464029847784, + "grad_norm": 0.8730692863464355, + "learning_rate": 7.304649255113652e-06, + "loss": 0.7922, + "step": 32280 + }, + { + "epoch": 2.5632578538966837, + "grad_norm": 1.0309388637542725, + "learning_rate": 7.2914185917281895e-06, + "loss": 0.79, + "step": 32290 + }, + { + "epoch": 2.5640516779455833, + "grad_norm": 0.8236114382743835, + "learning_rate": 7.278187928342728e-06, + "loss": 0.7463, + "step": 32300 + }, + { + "epoch": 2.564845501994483, + "grad_norm": 0.9980899095535278, + "learning_rate": 7.264957264957266e-06, + "loss": 0.7699, + "step": 32310 + }, + { + "epoch": 2.5656393260433825, + "grad_norm": 0.8803249597549438, + "learning_rate": 7.251726601571804e-06, + "loss": 0.8637, + "step": 32320 + }, + { + "epoch": 2.566433150092282, + "grad_norm": 0.9461043477058411, + "learning_rate": 7.23849593818634e-06, + "loss": 0.8058, + "step": 32330 + }, + { + "epoch": 2.5672269741411817, + "grad_norm": 1.0341662168502808, + "learning_rate": 7.225265274800878e-06, + "loss": 0.8059, + "step": 32340 + }, + { + "epoch": 2.5680207981900813, + "grad_norm": 0.7638232111930847, + "learning_rate": 7.212034611415416e-06, + "loss": 0.8157, + "step": 32350 + }, + { + "epoch": 2.568814622238981, + "grad_norm": 0.8748646974563599, + "learning_rate": 7.198803948029954e-06, + "loss": 0.7982, + "step": 32360 + }, + { + "epoch": 2.56960844628788, + "grad_norm": 0.9040183424949646, + "learning_rate": 7.1855732846444925e-06, + "loss": 0.8113, + "step": 32370 + }, + { + "epoch": 2.5704022703367797, + "grad_norm": 0.7992666959762573, + "learning_rate": 7.1723426212590306e-06, + "loss": 0.791, + "step": 32380 + }, + { + "epoch": 2.5711960943856793, + "grad_norm": 0.9834904074668884, + "learning_rate": 7.159111957873569e-06, + "loss": 0.8094, + "step": 32390 + }, + { + "epoch": 2.571989918434579, + "grad_norm": 1.0673480033874512, + "learning_rate": 7.145881294488106e-06, + "loss": 0.7499, + "step": 32400 + }, + { + "epoch": 2.5727837424834785, + "grad_norm": 1.0334373712539673, + "learning_rate": 7.132650631102644e-06, + "loss": 0.7715, + "step": 32410 + }, + { + "epoch": 2.573577566532378, + "grad_norm": 0.9567040205001831, + "learning_rate": 7.119419967717182e-06, + "loss": 0.7818, + "step": 32420 + }, + { + "epoch": 2.5743713905812777, + "grad_norm": 0.890590488910675, + "learning_rate": 7.10618930433172e-06, + "loss": 0.7509, + "step": 32430 + }, + { + "epoch": 2.5751652146301773, + "grad_norm": 0.8644120693206787, + "learning_rate": 7.0929586409462565e-06, + "loss": 0.7556, + "step": 32440 + }, + { + "epoch": 2.575959038679077, + "grad_norm": 0.8161708116531372, + "learning_rate": 7.079727977560795e-06, + "loss": 0.8253, + "step": 32450 + }, + { + "epoch": 2.576752862727976, + "grad_norm": 0.8945040106773376, + "learning_rate": 7.066497314175333e-06, + "loss": 0.754, + "step": 32460 + }, + { + "epoch": 2.577546686776876, + "grad_norm": 0.9900794625282288, + "learning_rate": 7.053266650789871e-06, + "loss": 0.7775, + "step": 32470 + }, + { + "epoch": 2.5783405108257753, + "grad_norm": 0.7387442588806152, + "learning_rate": 7.040035987404409e-06, + "loss": 0.8277, + "step": 32480 + }, + { + "epoch": 2.579134334874675, + "grad_norm": 0.8887678980827332, + "learning_rate": 7.026805324018947e-06, + "loss": 0.8033, + "step": 32490 + }, + { + "epoch": 2.5799281589235745, + "grad_norm": 1.0268609523773193, + "learning_rate": 7.013574660633485e-06, + "loss": 0.7224, + "step": 32500 + }, + { + "epoch": 2.580721982972474, + "grad_norm": 0.7121632099151611, + "learning_rate": 7.000343997248022e-06, + "loss": 0.8311, + "step": 32510 + }, + { + "epoch": 2.5815158070213737, + "grad_norm": 0.9221680164337158, + "learning_rate": 6.98711333386256e-06, + "loss": 0.8105, + "step": 32520 + }, + { + "epoch": 2.5823096310702733, + "grad_norm": 0.8981099128723145, + "learning_rate": 6.973882670477098e-06, + "loss": 0.7886, + "step": 32530 + }, + { + "epoch": 2.583103455119173, + "grad_norm": 0.8529419302940369, + "learning_rate": 6.9606520070916365e-06, + "loss": 0.7879, + "step": 32540 + }, + { + "epoch": 2.5838972791680725, + "grad_norm": 0.9864525198936462, + "learning_rate": 6.9474213437061745e-06, + "loss": 0.7829, + "step": 32550 + }, + { + "epoch": 2.584691103216972, + "grad_norm": 1.1552358865737915, + "learning_rate": 6.934190680320711e-06, + "loss": 0.737, + "step": 32560 + }, + { + "epoch": 2.5854849272658713, + "grad_norm": 0.9277712106704712, + "learning_rate": 6.920960016935249e-06, + "loss": 0.773, + "step": 32570 + }, + { + "epoch": 2.586278751314771, + "grad_norm": 1.1222898960113525, + "learning_rate": 6.907729353549787e-06, + "loss": 0.7899, + "step": 32580 + }, + { + "epoch": 2.5870725753636705, + "grad_norm": 0.9069304466247559, + "learning_rate": 6.894498690164325e-06, + "loss": 0.7647, + "step": 32590 + }, + { + "epoch": 2.58786639941257, + "grad_norm": 1.0726667642593384, + "learning_rate": 6.881268026778863e-06, + "loss": 0.752, + "step": 32600 + }, + { + "epoch": 2.5886602234614697, + "grad_norm": 1.020435094833374, + "learning_rate": 6.8680373633934005e-06, + "loss": 0.7844, + "step": 32610 + }, + { + "epoch": 2.5894540475103693, + "grad_norm": 0.928993821144104, + "learning_rate": 6.8548067000079386e-06, + "loss": 0.8021, + "step": 32620 + }, + { + "epoch": 2.590247871559269, + "grad_norm": 0.8852306604385376, + "learning_rate": 6.841576036622477e-06, + "loss": 0.7672, + "step": 32630 + }, + { + "epoch": 2.5910416956081685, + "grad_norm": 0.8123989701271057, + "learning_rate": 6.828345373237015e-06, + "loss": 0.8049, + "step": 32640 + }, + { + "epoch": 2.591835519657068, + "grad_norm": 0.8990057110786438, + "learning_rate": 6.815114709851553e-06, + "loss": 0.72, + "step": 32650 + }, + { + "epoch": 2.5926293437059673, + "grad_norm": 1.0066652297973633, + "learning_rate": 6.801884046466091e-06, + "loss": 0.7786, + "step": 32660 + }, + { + "epoch": 2.5934231677548674, + "grad_norm": 0.984686017036438, + "learning_rate": 6.788653383080627e-06, + "loss": 0.7876, + "step": 32670 + }, + { + "epoch": 2.5942169918037665, + "grad_norm": 0.9421548247337341, + "learning_rate": 6.775422719695165e-06, + "loss": 0.7983, + "step": 32680 + }, + { + "epoch": 2.595010815852666, + "grad_norm": 0.9953796863555908, + "learning_rate": 6.762192056309703e-06, + "loss": 0.7933, + "step": 32690 + }, + { + "epoch": 2.5958046399015657, + "grad_norm": 1.0435398817062378, + "learning_rate": 6.7489613929242415e-06, + "loss": 0.8087, + "step": 32700 + }, + { + "epoch": 2.5965984639504653, + "grad_norm": 0.8738863468170166, + "learning_rate": 6.73573072953878e-06, + "loss": 0.7782, + "step": 32710 + }, + { + "epoch": 2.597392287999365, + "grad_norm": 1.0245190858840942, + "learning_rate": 6.722500066153317e-06, + "loss": 0.8013, + "step": 32720 + }, + { + "epoch": 2.5981861120482646, + "grad_norm": 0.965116560459137, + "learning_rate": 6.709269402767855e-06, + "loss": 0.8208, + "step": 32730 + }, + { + "epoch": 2.598979936097164, + "grad_norm": 1.0972223281860352, + "learning_rate": 6.696038739382393e-06, + "loss": 0.7588, + "step": 32740 + }, + { + "epoch": 2.5997737601460638, + "grad_norm": 1.1245334148406982, + "learning_rate": 6.682808075996931e-06, + "loss": 0.8014, + "step": 32750 + }, + { + "epoch": 2.6005675841949634, + "grad_norm": 0.9737082719802856, + "learning_rate": 6.669577412611469e-06, + "loss": 0.7959, + "step": 32760 + }, + { + "epoch": 2.6013614082438625, + "grad_norm": 0.9256748557090759, + "learning_rate": 6.656346749226007e-06, + "loss": 0.7476, + "step": 32770 + }, + { + "epoch": 2.6021552322927626, + "grad_norm": 0.9990182518959045, + "learning_rate": 6.643116085840545e-06, + "loss": 0.8093, + "step": 32780 + }, + { + "epoch": 2.6029490563416617, + "grad_norm": 1.174641489982605, + "learning_rate": 6.629885422455082e-06, + "loss": 0.7579, + "step": 32790 + }, + { + "epoch": 2.6037428803905613, + "grad_norm": 0.8486537933349609, + "learning_rate": 6.61665475906962e-06, + "loss": 0.755, + "step": 32800 + }, + { + "epoch": 2.604536704439461, + "grad_norm": 1.0289491415023804, + "learning_rate": 6.603424095684158e-06, + "loss": 0.7426, + "step": 32810 + }, + { + "epoch": 2.6053305284883606, + "grad_norm": 0.9940427541732788, + "learning_rate": 6.590193432298696e-06, + "loss": 0.802, + "step": 32820 + }, + { + "epoch": 2.60612435253726, + "grad_norm": 0.9397895336151123, + "learning_rate": 6.576962768913233e-06, + "loss": 0.7455, + "step": 32830 + }, + { + "epoch": 2.6069181765861598, + "grad_norm": 0.9215680360794067, + "learning_rate": 6.563732105527771e-06, + "loss": 0.8156, + "step": 32840 + }, + { + "epoch": 2.6077120006350594, + "grad_norm": 0.9983633160591125, + "learning_rate": 6.550501442142309e-06, + "loss": 0.749, + "step": 32850 + }, + { + "epoch": 2.6085058246839585, + "grad_norm": 0.9694678783416748, + "learning_rate": 6.537270778756847e-06, + "loss": 0.7625, + "step": 32860 + }, + { + "epoch": 2.6092996487328586, + "grad_norm": 0.9712207913398743, + "learning_rate": 6.5240401153713855e-06, + "loss": 0.7712, + "step": 32870 + }, + { + "epoch": 2.6100934727817577, + "grad_norm": 0.8602631688117981, + "learning_rate": 6.5108094519859236e-06, + "loss": 0.7941, + "step": 32880 + }, + { + "epoch": 2.6108872968306573, + "grad_norm": 1.1966452598571777, + "learning_rate": 6.497578788600462e-06, + "loss": 0.7736, + "step": 32890 + }, + { + "epoch": 2.611681120879557, + "grad_norm": 1.0157437324523926, + "learning_rate": 6.484348125214998e-06, + "loss": 0.7439, + "step": 32900 + }, + { + "epoch": 2.6124749449284566, + "grad_norm": 0.9984763264656067, + "learning_rate": 6.471117461829536e-06, + "loss": 0.7874, + "step": 32910 + }, + { + "epoch": 2.613268768977356, + "grad_norm": 0.9150610566139221, + "learning_rate": 6.457886798444074e-06, + "loss": 0.6659, + "step": 32920 + }, + { + "epoch": 2.6140625930262558, + "grad_norm": 1.074073076248169, + "learning_rate": 6.444656135058611e-06, + "loss": 0.8417, + "step": 32930 + }, + { + "epoch": 2.6148564170751554, + "grad_norm": 0.8862860798835754, + "learning_rate": 6.4314254716731495e-06, + "loss": 0.7755, + "step": 32940 + }, + { + "epoch": 2.615650241124055, + "grad_norm": 0.9611344933509827, + "learning_rate": 6.4181948082876876e-06, + "loss": 0.7053, + "step": 32950 + }, + { + "epoch": 2.6164440651729546, + "grad_norm": 0.8691285252571106, + "learning_rate": 6.404964144902226e-06, + "loss": 0.7922, + "step": 32960 + }, + { + "epoch": 2.6172378892218537, + "grad_norm": 0.7891274690628052, + "learning_rate": 6.391733481516764e-06, + "loss": 0.7849, + "step": 32970 + }, + { + "epoch": 2.618031713270754, + "grad_norm": 0.9660747647285461, + "learning_rate": 6.378502818131302e-06, + "loss": 0.8229, + "step": 32980 + }, + { + "epoch": 2.618825537319653, + "grad_norm": 0.9171955585479736, + "learning_rate": 6.36527215474584e-06, + "loss": 0.8054, + "step": 32990 + }, + { + "epoch": 2.6196193613685526, + "grad_norm": 1.118066668510437, + "learning_rate": 6.352041491360378e-06, + "loss": 0.7319, + "step": 33000 + }, + { + "epoch": 2.620413185417452, + "grad_norm": 1.0057705640792847, + "learning_rate": 6.338810827974915e-06, + "loss": 0.7856, + "step": 33010 + }, + { + "epoch": 2.6212070094663518, + "grad_norm": 0.8103092908859253, + "learning_rate": 6.3255801645894524e-06, + "loss": 0.804, + "step": 33020 + }, + { + "epoch": 2.6220008335152514, + "grad_norm": 0.926285982131958, + "learning_rate": 6.3123495012039905e-06, + "loss": 0.8147, + "step": 33030 + }, + { + "epoch": 2.622794657564151, + "grad_norm": 0.8489839434623718, + "learning_rate": 6.299118837818528e-06, + "loss": 0.7151, + "step": 33040 + }, + { + "epoch": 2.6235884816130506, + "grad_norm": 0.8944807648658752, + "learning_rate": 6.285888174433066e-06, + "loss": 0.7859, + "step": 33050 + }, + { + "epoch": 2.62438230566195, + "grad_norm": 0.8567325472831726, + "learning_rate": 6.272657511047604e-06, + "loss": 0.8029, + "step": 33060 + }, + { + "epoch": 2.62517612971085, + "grad_norm": 0.9069671034812927, + "learning_rate": 6.259426847662142e-06, + "loss": 0.7756, + "step": 33070 + }, + { + "epoch": 2.625969953759749, + "grad_norm": 0.924453854560852, + "learning_rate": 6.24619618427668e-06, + "loss": 0.7929, + "step": 33080 + }, + { + "epoch": 2.6267637778086486, + "grad_norm": 0.9638664126396179, + "learning_rate": 6.232965520891218e-06, + "loss": 0.8279, + "step": 33090 + }, + { + "epoch": 2.627557601857548, + "grad_norm": 0.9644804000854492, + "learning_rate": 6.219734857505756e-06, + "loss": 0.7289, + "step": 33100 + }, + { + "epoch": 2.6283514259064478, + "grad_norm": 0.8256565928459167, + "learning_rate": 6.2065041941202935e-06, + "loss": 0.7667, + "step": 33110 + }, + { + "epoch": 2.6291452499553474, + "grad_norm": 0.9192927479743958, + "learning_rate": 6.1932735307348315e-06, + "loss": 0.7884, + "step": 33120 + }, + { + "epoch": 2.629939074004247, + "grad_norm": 0.8394224047660828, + "learning_rate": 6.180042867349369e-06, + "loss": 0.7822, + "step": 33130 + }, + { + "epoch": 2.6307328980531466, + "grad_norm": 0.9371026754379272, + "learning_rate": 6.166812203963907e-06, + "loss": 0.7951, + "step": 33140 + }, + { + "epoch": 2.631526722102046, + "grad_norm": 0.9811797738075256, + "learning_rate": 6.153581540578445e-06, + "loss": 0.7974, + "step": 33150 + }, + { + "epoch": 2.632320546150946, + "grad_norm": 1.0210554599761963, + "learning_rate": 6.140350877192982e-06, + "loss": 0.7797, + "step": 33160 + }, + { + "epoch": 2.633114370199845, + "grad_norm": 1.1501883268356323, + "learning_rate": 6.12712021380752e-06, + "loss": 0.8254, + "step": 33170 + }, + { + "epoch": 2.633908194248745, + "grad_norm": 1.004550576210022, + "learning_rate": 6.113889550422058e-06, + "loss": 0.7635, + "step": 33180 + }, + { + "epoch": 2.634702018297644, + "grad_norm": 0.8310564756393433, + "learning_rate": 6.100658887036596e-06, + "loss": 0.835, + "step": 33190 + }, + { + "epoch": 2.6354958423465438, + "grad_norm": 0.8766066431999207, + "learning_rate": 6.0874282236511345e-06, + "loss": 0.8109, + "step": 33200 + }, + { + "epoch": 2.6362896663954434, + "grad_norm": 1.1657209396362305, + "learning_rate": 6.0741975602656726e-06, + "loss": 0.7254, + "step": 33210 + }, + { + "epoch": 2.637083490444343, + "grad_norm": 0.9214816689491272, + "learning_rate": 6.062289963218756e-06, + "loss": 0.7934, + "step": 33220 + }, + { + "epoch": 2.6378773144932426, + "grad_norm": 0.9850200414657593, + "learning_rate": 6.049059299833293e-06, + "loss": 0.814, + "step": 33230 + }, + { + "epoch": 2.638671138542142, + "grad_norm": 0.7920827865600586, + "learning_rate": 6.035828636447831e-06, + "loss": 0.7924, + "step": 33240 + }, + { + "epoch": 2.639464962591042, + "grad_norm": 0.9591189026832581, + "learning_rate": 6.0225979730623695e-06, + "loss": 0.793, + "step": 33250 + }, + { + "epoch": 2.6402587866399414, + "grad_norm": 0.8499795198440552, + "learning_rate": 6.0093673096769076e-06, + "loss": 0.8147, + "step": 33260 + }, + { + "epoch": 2.641052610688841, + "grad_norm": 0.9025987982749939, + "learning_rate": 5.996136646291446e-06, + "loss": 0.7892, + "step": 33270 + }, + { + "epoch": 2.64184643473774, + "grad_norm": 0.8324596881866455, + "learning_rate": 5.982905982905984e-06, + "loss": 0.7845, + "step": 33280 + }, + { + "epoch": 2.64264025878664, + "grad_norm": 0.948970377445221, + "learning_rate": 5.969675319520521e-06, + "loss": 0.7481, + "step": 33290 + }, + { + "epoch": 2.6434340828355394, + "grad_norm": 0.7791500687599182, + "learning_rate": 5.956444656135059e-06, + "loss": 0.7557, + "step": 33300 + }, + { + "epoch": 2.644227906884439, + "grad_norm": 0.7366840243339539, + "learning_rate": 5.943213992749596e-06, + "loss": 0.8133, + "step": 33310 + }, + { + "epoch": 2.6450217309333386, + "grad_norm": 0.9585718512535095, + "learning_rate": 5.929983329364134e-06, + "loss": 0.7364, + "step": 33320 + }, + { + "epoch": 2.645815554982238, + "grad_norm": 0.8997151851654053, + "learning_rate": 5.916752665978672e-06, + "loss": 0.8047, + "step": 33330 + }, + { + "epoch": 2.646609379031138, + "grad_norm": 1.0085241794586182, + "learning_rate": 5.9035220025932105e-06, + "loss": 0.7328, + "step": 33340 + }, + { + "epoch": 2.6474032030800374, + "grad_norm": 1.000828742980957, + "learning_rate": 5.890291339207748e-06, + "loss": 0.756, + "step": 33350 + }, + { + "epoch": 2.648197027128937, + "grad_norm": 0.913764476776123, + "learning_rate": 5.877060675822286e-06, + "loss": 0.8004, + "step": 33360 + }, + { + "epoch": 2.648990851177836, + "grad_norm": 0.861119270324707, + "learning_rate": 5.863830012436824e-06, + "loss": 0.8222, + "step": 33370 + }, + { + "epoch": 2.649784675226736, + "grad_norm": 0.7925031185150146, + "learning_rate": 5.850599349051362e-06, + "loss": 0.7273, + "step": 33380 + }, + { + "epoch": 2.6505784992756354, + "grad_norm": 1.0246551036834717, + "learning_rate": 5.8373686856659e-06, + "loss": 0.8253, + "step": 33390 + }, + { + "epoch": 2.651372323324535, + "grad_norm": 0.970195472240448, + "learning_rate": 5.824138022280438e-06, + "loss": 0.7488, + "step": 33400 + }, + { + "epoch": 2.6521661473734346, + "grad_norm": 0.955636203289032, + "learning_rate": 5.810907358894975e-06, + "loss": 0.7855, + "step": 33410 + }, + { + "epoch": 2.652959971422334, + "grad_norm": 0.837679386138916, + "learning_rate": 5.797676695509513e-06, + "loss": 0.7337, + "step": 33420 + }, + { + "epoch": 2.653753795471234, + "grad_norm": 0.9340305328369141, + "learning_rate": 5.784446032124051e-06, + "loss": 0.7628, + "step": 33430 + }, + { + "epoch": 2.6545476195201334, + "grad_norm": 0.8416723608970642, + "learning_rate": 5.771215368738589e-06, + "loss": 0.818, + "step": 33440 + }, + { + "epoch": 2.655341443569033, + "grad_norm": 0.9004825353622437, + "learning_rate": 5.757984705353127e-06, + "loss": 0.7861, + "step": 33450 + }, + { + "epoch": 2.6561352676179326, + "grad_norm": 0.7813234925270081, + "learning_rate": 5.744754041967665e-06, + "loss": 0.804, + "step": 33460 + }, + { + "epoch": 2.656929091666832, + "grad_norm": 1.1214011907577515, + "learning_rate": 5.731523378582202e-06, + "loss": 0.7651, + "step": 33470 + }, + { + "epoch": 2.6577229157157314, + "grad_norm": 0.9582983255386353, + "learning_rate": 5.71829271519674e-06, + "loss": 0.7695, + "step": 33480 + }, + { + "epoch": 2.6585167397646314, + "grad_norm": 0.7920512557029724, + "learning_rate": 5.705062051811278e-06, + "loss": 0.8515, + "step": 33490 + }, + { + "epoch": 2.6593105638135306, + "grad_norm": 0.904657781124115, + "learning_rate": 5.691831388425816e-06, + "loss": 0.8063, + "step": 33500 + }, + { + "epoch": 2.66010438786243, + "grad_norm": 0.9910812377929688, + "learning_rate": 5.678600725040354e-06, + "loss": 0.7586, + "step": 33510 + }, + { + "epoch": 2.66089821191133, + "grad_norm": 1.2386099100112915, + "learning_rate": 5.665370061654892e-06, + "loss": 0.764, + "step": 33520 + }, + { + "epoch": 2.6616920359602294, + "grad_norm": 0.9596011638641357, + "learning_rate": 5.652139398269429e-06, + "loss": 0.7142, + "step": 33530 + }, + { + "epoch": 2.662485860009129, + "grad_norm": 1.1034601926803589, + "learning_rate": 5.638908734883967e-06, + "loss": 0.8422, + "step": 33540 + }, + { + "epoch": 2.6632796840580286, + "grad_norm": 0.8817473649978638, + "learning_rate": 5.625678071498505e-06, + "loss": 0.7953, + "step": 33550 + }, + { + "epoch": 2.6640735081069282, + "grad_norm": 0.9314897656440735, + "learning_rate": 5.612447408113043e-06, + "loss": 0.8269, + "step": 33560 + }, + { + "epoch": 2.6648673321558274, + "grad_norm": 0.8691668510437012, + "learning_rate": 5.599216744727581e-06, + "loss": 0.7674, + "step": 33570 + }, + { + "epoch": 2.6656611562047274, + "grad_norm": 0.869050145149231, + "learning_rate": 5.5859860813421185e-06, + "loss": 0.7608, + "step": 33580 + }, + { + "epoch": 2.6664549802536266, + "grad_norm": 0.7281513214111328, + "learning_rate": 5.5727554179566566e-06, + "loss": 0.7388, + "step": 33590 + }, + { + "epoch": 2.667248804302526, + "grad_norm": 0.9663460850715637, + "learning_rate": 5.559524754571195e-06, + "loss": 0.7931, + "step": 33600 + }, + { + "epoch": 2.668042628351426, + "grad_norm": 0.8808895945549011, + "learning_rate": 5.546294091185733e-06, + "loss": 0.7915, + "step": 33610 + }, + { + "epoch": 2.6688364524003254, + "grad_norm": 1.0292035341262817, + "learning_rate": 5.53306342780027e-06, + "loss": 0.7188, + "step": 33620 + }, + { + "epoch": 2.669630276449225, + "grad_norm": 0.8974072337150574, + "learning_rate": 5.519832764414808e-06, + "loss": 0.7602, + "step": 33630 + }, + { + "epoch": 2.6704241004981246, + "grad_norm": 0.8630101084709167, + "learning_rate": 5.506602101029345e-06, + "loss": 0.806, + "step": 33640 + }, + { + "epoch": 2.6712179245470242, + "grad_norm": 0.958050549030304, + "learning_rate": 5.493371437643883e-06, + "loss": 0.7824, + "step": 33650 + }, + { + "epoch": 2.672011748595924, + "grad_norm": 0.9174726605415344, + "learning_rate": 5.4801407742584214e-06, + "loss": 0.848, + "step": 33660 + }, + { + "epoch": 2.6728055726448234, + "grad_norm": 0.9651468992233276, + "learning_rate": 5.4669101108729595e-06, + "loss": 0.7625, + "step": 33670 + }, + { + "epoch": 2.6735993966937226, + "grad_norm": 0.8684685230255127, + "learning_rate": 5.453679447487498e-06, + "loss": 0.7831, + "step": 33680 + }, + { + "epoch": 2.6743932207426226, + "grad_norm": 0.9022814631462097, + "learning_rate": 5.440448784102036e-06, + "loss": 0.7718, + "step": 33690 + }, + { + "epoch": 2.675187044791522, + "grad_norm": 0.9168913960456848, + "learning_rate": 5.427218120716573e-06, + "loss": 0.87, + "step": 33700 + }, + { + "epoch": 2.6759808688404214, + "grad_norm": 1.065454125404358, + "learning_rate": 5.413987457331111e-06, + "loss": 0.7865, + "step": 33710 + }, + { + "epoch": 2.676774692889321, + "grad_norm": 1.0886614322662354, + "learning_rate": 5.400756793945649e-06, + "loss": 0.7529, + "step": 33720 + }, + { + "epoch": 2.6775685169382206, + "grad_norm": 1.0097647905349731, + "learning_rate": 5.387526130560186e-06, + "loss": 0.7951, + "step": 33730 + }, + { + "epoch": 2.6783623409871202, + "grad_norm": 0.7832012176513672, + "learning_rate": 5.374295467174724e-06, + "loss": 0.7762, + "step": 33740 + }, + { + "epoch": 2.67915616503602, + "grad_norm": 0.7780231833457947, + "learning_rate": 5.3610648037892625e-06, + "loss": 0.8242, + "step": 33750 + }, + { + "epoch": 2.6799499890849194, + "grad_norm": 0.9866840243339539, + "learning_rate": 5.3478341404038e-06, + "loss": 0.7879, + "step": 33760 + }, + { + "epoch": 2.680743813133819, + "grad_norm": 1.0353221893310547, + "learning_rate": 5.334603477018338e-06, + "loss": 0.7558, + "step": 33770 + }, + { + "epoch": 2.6815376371827186, + "grad_norm": 1.043462872505188, + "learning_rate": 5.321372813632876e-06, + "loss": 0.7572, + "step": 33780 + }, + { + "epoch": 2.682331461231618, + "grad_norm": 0.7620270848274231, + "learning_rate": 5.308142150247414e-06, + "loss": 0.8275, + "step": 33790 + }, + { + "epoch": 2.6831252852805174, + "grad_norm": 0.835926353931427, + "learning_rate": 5.294911486861952e-06, + "loss": 0.8125, + "step": 33800 + }, + { + "epoch": 2.683919109329417, + "grad_norm": 1.1094342470169067, + "learning_rate": 5.281680823476489e-06, + "loss": 0.8078, + "step": 33810 + }, + { + "epoch": 2.6847129333783166, + "grad_norm": 0.8570294380187988, + "learning_rate": 5.268450160091027e-06, + "loss": 0.7878, + "step": 33820 + }, + { + "epoch": 2.6855067574272162, + "grad_norm": 0.7822785377502441, + "learning_rate": 5.255219496705565e-06, + "loss": 0.7786, + "step": 33830 + }, + { + "epoch": 2.686300581476116, + "grad_norm": 1.0392347574234009, + "learning_rate": 5.241988833320103e-06, + "loss": 0.7568, + "step": 33840 + }, + { + "epoch": 2.6870944055250154, + "grad_norm": 0.9113541841506958, + "learning_rate": 5.228758169934641e-06, + "loss": 0.82, + "step": 33850 + }, + { + "epoch": 2.687888229573915, + "grad_norm": 0.928196370601654, + "learning_rate": 5.215527506549179e-06, + "loss": 0.7763, + "step": 33860 + }, + { + "epoch": 2.6886820536228146, + "grad_norm": 1.0967717170715332, + "learning_rate": 5.202296843163716e-06, + "loss": 0.7269, + "step": 33870 + }, + { + "epoch": 2.689475877671714, + "grad_norm": 1.093342900276184, + "learning_rate": 5.189066179778254e-06, + "loss": 0.8291, + "step": 33880 + }, + { + "epoch": 2.690269701720614, + "grad_norm": 1.088734745979309, + "learning_rate": 5.175835516392792e-06, + "loss": 0.7851, + "step": 33890 + }, + { + "epoch": 2.691063525769513, + "grad_norm": 0.8813207149505615, + "learning_rate": 5.16260485300733e-06, + "loss": 0.7603, + "step": 33900 + }, + { + "epoch": 2.6918573498184126, + "grad_norm": 0.9411490559577942, + "learning_rate": 5.149374189621868e-06, + "loss": 0.8174, + "step": 33910 + }, + { + "epoch": 2.6926511738673122, + "grad_norm": 0.6878808736801147, + "learning_rate": 5.1361435262364064e-06, + "loss": 0.7783, + "step": 33920 + }, + { + "epoch": 2.693444997916212, + "grad_norm": 1.0253195762634277, + "learning_rate": 5.122912862850944e-06, + "loss": 0.8235, + "step": 33930 + }, + { + "epoch": 2.6942388219651114, + "grad_norm": 1.0071438550949097, + "learning_rate": 5.109682199465481e-06, + "loss": 0.7548, + "step": 33940 + }, + { + "epoch": 2.695032646014011, + "grad_norm": 0.876988410949707, + "learning_rate": 5.096451536080019e-06, + "loss": 0.7692, + "step": 33950 + }, + { + "epoch": 2.6958264700629107, + "grad_norm": 0.8041113615036011, + "learning_rate": 5.083220872694557e-06, + "loss": 0.7844, + "step": 33960 + }, + { + "epoch": 2.6966202941118103, + "grad_norm": 0.8075045943260193, + "learning_rate": 5.069990209309095e-06, + "loss": 0.7855, + "step": 33970 + }, + { + "epoch": 2.69741411816071, + "grad_norm": 0.9511701464653015, + "learning_rate": 5.056759545923633e-06, + "loss": 0.7444, + "step": 33980 + }, + { + "epoch": 2.698207942209609, + "grad_norm": 0.9707065224647522, + "learning_rate": 5.0435288825381705e-06, + "loss": 0.757, + "step": 33990 + }, + { + "epoch": 2.699001766258509, + "grad_norm": 1.0104913711547852, + "learning_rate": 5.0302982191527085e-06, + "loss": 0.7089, + "step": 34000 + }, + { + "epoch": 2.6997955903074082, + "grad_norm": 0.9136759042739868, + "learning_rate": 5.017067555767247e-06, + "loss": 0.7715, + "step": 34010 + }, + { + "epoch": 2.700589414356308, + "grad_norm": 1.179523229598999, + "learning_rate": 5.003836892381785e-06, + "loss": 0.7927, + "step": 34020 + }, + { + "epoch": 2.7013832384052074, + "grad_norm": 0.7838528156280518, + "learning_rate": 4.990606228996322e-06, + "loss": 0.7484, + "step": 34030 + }, + { + "epoch": 2.702177062454107, + "grad_norm": 1.034165859222412, + "learning_rate": 4.97737556561086e-06, + "loss": 0.8256, + "step": 34040 + }, + { + "epoch": 2.7029708865030067, + "grad_norm": 1.1382567882537842, + "learning_rate": 4.964144902225397e-06, + "loss": 0.7398, + "step": 34050 + }, + { + "epoch": 2.7037647105519063, + "grad_norm": 0.9856430292129517, + "learning_rate": 4.950914238839935e-06, + "loss": 0.7579, + "step": 34060 + }, + { + "epoch": 2.704558534600806, + "grad_norm": 0.8700586557388306, + "learning_rate": 4.937683575454473e-06, + "loss": 0.7864, + "step": 34070 + }, + { + "epoch": 2.705352358649705, + "grad_norm": 0.9212909936904907, + "learning_rate": 4.9244529120690115e-06, + "loss": 0.7534, + "step": 34080 + }, + { + "epoch": 2.706146182698605, + "grad_norm": 0.9201529026031494, + "learning_rate": 4.912545315022096e-06, + "loss": 0.7437, + "step": 34090 + }, + { + "epoch": 2.7069400067475042, + "grad_norm": 0.9818313121795654, + "learning_rate": 4.899314651636634e-06, + "loss": 0.7595, + "step": 34100 + }, + { + "epoch": 2.707733830796404, + "grad_norm": 0.9335930347442627, + "learning_rate": 4.886083988251171e-06, + "loss": 0.7705, + "step": 34110 + }, + { + "epoch": 2.7085276548453034, + "grad_norm": 1.0011483430862427, + "learning_rate": 4.872853324865708e-06, + "loss": 0.7944, + "step": 34120 + }, + { + "epoch": 2.709321478894203, + "grad_norm": 0.9489617943763733, + "learning_rate": 4.8596226614802465e-06, + "loss": 0.7808, + "step": 34130 + }, + { + "epoch": 2.7101153029431027, + "grad_norm": 1.0148978233337402, + "learning_rate": 4.8463919980947845e-06, + "loss": 0.7909, + "step": 34140 + }, + { + "epoch": 2.7109091269920023, + "grad_norm": 0.89469313621521, + "learning_rate": 4.833161334709323e-06, + "loss": 0.8341, + "step": 34150 + }, + { + "epoch": 2.711702951040902, + "grad_norm": 1.135300874710083, + "learning_rate": 4.819930671323861e-06, + "loss": 0.785, + "step": 34160 + }, + { + "epoch": 2.7124967750898015, + "grad_norm": 0.9394294619560242, + "learning_rate": 4.806700007938398e-06, + "loss": 0.7499, + "step": 34170 + }, + { + "epoch": 2.713290599138701, + "grad_norm": 0.9445950984954834, + "learning_rate": 4.793469344552936e-06, + "loss": 0.7784, + "step": 34180 + }, + { + "epoch": 2.7140844231876002, + "grad_norm": 0.7616167664527893, + "learning_rate": 4.780238681167474e-06, + "loss": 0.8297, + "step": 34190 + }, + { + "epoch": 2.7148782472365003, + "grad_norm": 0.9969817399978638, + "learning_rate": 4.767008017782012e-06, + "loss": 0.771, + "step": 34200 + }, + { + "epoch": 2.7156720712853994, + "grad_norm": 0.9528368711471558, + "learning_rate": 4.75377735439655e-06, + "loss": 0.8011, + "step": 34210 + }, + { + "epoch": 2.716465895334299, + "grad_norm": 0.9359155893325806, + "learning_rate": 4.7405466910110875e-06, + "loss": 0.7494, + "step": 34220 + }, + { + "epoch": 2.7172597193831987, + "grad_norm": 0.9772827625274658, + "learning_rate": 4.727316027625625e-06, + "loss": 0.7775, + "step": 34230 + }, + { + "epoch": 2.7180535434320983, + "grad_norm": 0.8860666751861572, + "learning_rate": 4.714085364240163e-06, + "loss": 0.7459, + "step": 34240 + }, + { + "epoch": 2.718847367480998, + "grad_norm": 0.9986729621887207, + "learning_rate": 4.700854700854701e-06, + "loss": 0.7979, + "step": 34250 + }, + { + "epoch": 2.7196411915298975, + "grad_norm": 1.0801081657409668, + "learning_rate": 4.687624037469239e-06, + "loss": 0.765, + "step": 34260 + }, + { + "epoch": 2.720435015578797, + "grad_norm": 1.139674186706543, + "learning_rate": 4.674393374083777e-06, + "loss": 0.7262, + "step": 34270 + }, + { + "epoch": 2.7212288396276967, + "grad_norm": 1.0824925899505615, + "learning_rate": 4.661162710698315e-06, + "loss": 0.7435, + "step": 34280 + }, + { + "epoch": 2.7220226636765963, + "grad_norm": 1.0776777267456055, + "learning_rate": 4.647932047312852e-06, + "loss": 0.741, + "step": 34290 + }, + { + "epoch": 2.7228164877254954, + "grad_norm": 0.7067087292671204, + "learning_rate": 4.6347013839273904e-06, + "loss": 0.7718, + "step": 34300 + }, + { + "epoch": 2.723610311774395, + "grad_norm": 0.9206405282020569, + "learning_rate": 4.6214707205419285e-06, + "loss": 0.8099, + "step": 34310 + }, + { + "epoch": 2.7244041358232947, + "grad_norm": 0.9427090287208557, + "learning_rate": 4.608240057156466e-06, + "loss": 0.8307, + "step": 34320 + }, + { + "epoch": 2.7251979598721943, + "grad_norm": 0.9077566862106323, + "learning_rate": 4.595009393771004e-06, + "loss": 0.7284, + "step": 34330 + }, + { + "epoch": 2.725991783921094, + "grad_norm": 1.0393189191818237, + "learning_rate": 4.581778730385542e-06, + "loss": 0.76, + "step": 34340 + }, + { + "epoch": 2.7267856079699935, + "grad_norm": 0.7990016937255859, + "learning_rate": 4.568548067000079e-06, + "loss": 0.8172, + "step": 34350 + }, + { + "epoch": 2.727579432018893, + "grad_norm": 0.9199971556663513, + "learning_rate": 4.555317403614617e-06, + "loss": 0.8014, + "step": 34360 + }, + { + "epoch": 2.7283732560677927, + "grad_norm": 1.028395175933838, + "learning_rate": 4.542086740229155e-06, + "loss": 0.7645, + "step": 34370 + }, + { + "epoch": 2.7291670801166923, + "grad_norm": 0.9020273089408875, + "learning_rate": 4.528856076843693e-06, + "loss": 0.7711, + "step": 34380 + }, + { + "epoch": 2.7299609041655915, + "grad_norm": 1.0992895364761353, + "learning_rate": 4.5156254134582315e-06, + "loss": 0.7861, + "step": 34390 + }, + { + "epoch": 2.7307547282144915, + "grad_norm": 0.8821068406105042, + "learning_rate": 4.502394750072769e-06, + "loss": 0.7572, + "step": 34400 + }, + { + "epoch": 2.7315485522633907, + "grad_norm": 0.8414868712425232, + "learning_rate": 4.489164086687307e-06, + "loss": 0.8069, + "step": 34410 + }, + { + "epoch": 2.7323423763122903, + "grad_norm": 1.0444607734680176, + "learning_rate": 4.475933423301845e-06, + "loss": 0.8252, + "step": 34420 + }, + { + "epoch": 2.73313620036119, + "grad_norm": 0.7695857882499695, + "learning_rate": 4.462702759916382e-06, + "loss": 0.8368, + "step": 34430 + }, + { + "epoch": 2.7339300244100895, + "grad_norm": 0.9509304761886597, + "learning_rate": 4.44947209653092e-06, + "loss": 0.796, + "step": 34440 + }, + { + "epoch": 2.734723848458989, + "grad_norm": 0.798905074596405, + "learning_rate": 4.436241433145458e-06, + "loss": 0.7507, + "step": 34450 + }, + { + "epoch": 2.7355176725078887, + "grad_norm": 0.9972504377365112, + "learning_rate": 4.4230107697599955e-06, + "loss": 0.8479, + "step": 34460 + }, + { + "epoch": 2.7363114965567883, + "grad_norm": 0.9366284012794495, + "learning_rate": 4.4097801063745336e-06, + "loss": 0.7644, + "step": 34470 + }, + { + "epoch": 2.737105320605688, + "grad_norm": 1.0526267290115356, + "learning_rate": 4.396549442989072e-06, + "loss": 0.7578, + "step": 34480 + }, + { + "epoch": 2.7378991446545875, + "grad_norm": 0.8934128880500793, + "learning_rate": 4.38331877960361e-06, + "loss": 0.8356, + "step": 34490 + }, + { + "epoch": 2.7386929687034867, + "grad_norm": 1.0690593719482422, + "learning_rate": 4.370088116218148e-06, + "loss": 0.78, + "step": 34500 + }, + { + "epoch": 2.7394867927523863, + "grad_norm": 0.9687326550483704, + "learning_rate": 4.356857452832686e-06, + "loss": 0.7375, + "step": 34510 + }, + { + "epoch": 2.740280616801286, + "grad_norm": 0.9082489609718323, + "learning_rate": 4.343626789447223e-06, + "loss": 0.7743, + "step": 34520 + }, + { + "epoch": 2.7410744408501855, + "grad_norm": 0.951102614402771, + "learning_rate": 4.330396126061761e-06, + "loss": 0.797, + "step": 34530 + }, + { + "epoch": 2.741868264899085, + "grad_norm": 1.032860517501831, + "learning_rate": 4.3171654626762984e-06, + "loss": 0.8604, + "step": 34540 + }, + { + "epoch": 2.7426620889479847, + "grad_norm": 0.9105594158172607, + "learning_rate": 4.3039347992908365e-06, + "loss": 0.7811, + "step": 34550 + }, + { + "epoch": 2.7434559129968843, + "grad_norm": 0.9474354982376099, + "learning_rate": 4.290704135905375e-06, + "loss": 0.7872, + "step": 34560 + }, + { + "epoch": 2.744249737045784, + "grad_norm": 0.7172473669052124, + "learning_rate": 4.277473472519913e-06, + "loss": 0.7295, + "step": 34570 + }, + { + "epoch": 2.7450435610946835, + "grad_norm": 0.8522198796272278, + "learning_rate": 4.26424280913445e-06, + "loss": 0.7326, + "step": 34580 + }, + { + "epoch": 2.7458373851435827, + "grad_norm": 1.0782872438430786, + "learning_rate": 4.251012145748988e-06, + "loss": 0.7579, + "step": 34590 + }, + { + "epoch": 2.7466312091924827, + "grad_norm": 1.02603280544281, + "learning_rate": 4.237781482363526e-06, + "loss": 0.7964, + "step": 34600 + }, + { + "epoch": 2.747425033241382, + "grad_norm": 0.9840404391288757, + "learning_rate": 4.224550818978064e-06, + "loss": 0.8247, + "step": 34610 + }, + { + "epoch": 2.7482188572902815, + "grad_norm": 0.8115825653076172, + "learning_rate": 4.211320155592602e-06, + "loss": 0.7881, + "step": 34620 + }, + { + "epoch": 2.749012681339181, + "grad_norm": 1.053634524345398, + "learning_rate": 4.1980894922071395e-06, + "loss": 0.7765, + "step": 34630 + }, + { + "epoch": 2.7498065053880807, + "grad_norm": 0.8972168564796448, + "learning_rate": 4.184858828821677e-06, + "loss": 0.7956, + "step": 34640 + }, + { + "epoch": 2.7506003294369803, + "grad_norm": 0.9098021388053894, + "learning_rate": 4.171628165436215e-06, + "loss": 0.7835, + "step": 34650 + }, + { + "epoch": 2.75139415348588, + "grad_norm": 0.8237542510032654, + "learning_rate": 4.158397502050753e-06, + "loss": 0.7629, + "step": 34660 + }, + { + "epoch": 2.7521879775347795, + "grad_norm": 1.1170654296875, + "learning_rate": 4.145166838665291e-06, + "loss": 0.7887, + "step": 34670 + }, + { + "epoch": 2.752981801583679, + "grad_norm": 1.165892243385315, + "learning_rate": 4.131936175279829e-06, + "loss": 0.7624, + "step": 34680 + }, + { + "epoch": 2.7537756256325787, + "grad_norm": 0.8696096539497375, + "learning_rate": 4.118705511894366e-06, + "loss": 0.7976, + "step": 34690 + }, + { + "epoch": 2.754569449681478, + "grad_norm": 1.0606629848480225, + "learning_rate": 4.105474848508904e-06, + "loss": 0.7863, + "step": 34700 + }, + { + "epoch": 2.755363273730378, + "grad_norm": 0.9789169430732727, + "learning_rate": 4.092244185123442e-06, + "loss": 0.7347, + "step": 34710 + }, + { + "epoch": 2.756157097779277, + "grad_norm": 0.9391206502914429, + "learning_rate": 4.0790135217379805e-06, + "loss": 0.8182, + "step": 34720 + }, + { + "epoch": 2.7569509218281767, + "grad_norm": 0.9895641803741455, + "learning_rate": 4.0657828583525186e-06, + "loss": 0.7514, + "step": 34730 + }, + { + "epoch": 2.7577447458770763, + "grad_norm": 0.8437541723251343, + "learning_rate": 4.052552194967056e-06, + "loss": 0.7866, + "step": 34740 + }, + { + "epoch": 2.758538569925976, + "grad_norm": 1.0394833087921143, + "learning_rate": 4.039321531581593e-06, + "loss": 0.8191, + "step": 34750 + }, + { + "epoch": 2.7593323939748755, + "grad_norm": 0.9118010997772217, + "learning_rate": 4.026090868196131e-06, + "loss": 0.7347, + "step": 34760 + }, + { + "epoch": 2.760126218023775, + "grad_norm": 0.8801964521408081, + "learning_rate": 4.012860204810669e-06, + "loss": 0.7648, + "step": 34770 + }, + { + "epoch": 2.7609200420726747, + "grad_norm": 0.9062778353691101, + "learning_rate": 3.999629541425207e-06, + "loss": 0.7519, + "step": 34780 + }, + { + "epoch": 2.761713866121574, + "grad_norm": 0.9668712615966797, + "learning_rate": 3.986398878039745e-06, + "loss": 0.7754, + "step": 34790 + }, + { + "epoch": 2.762507690170474, + "grad_norm": 0.7536428570747375, + "learning_rate": 3.9731682146542834e-06, + "loss": 0.8258, + "step": 34800 + }, + { + "epoch": 2.763301514219373, + "grad_norm": 0.9052528142929077, + "learning_rate": 3.959937551268821e-06, + "loss": 0.829, + "step": 34810 + }, + { + "epoch": 2.7640953382682727, + "grad_norm": 0.9865114688873291, + "learning_rate": 3.946706887883359e-06, + "loss": 0.8256, + "step": 34820 + }, + { + "epoch": 2.7648891623171723, + "grad_norm": 0.8684812188148499, + "learning_rate": 3.933476224497897e-06, + "loss": 0.7742, + "step": 34830 + }, + { + "epoch": 2.765682986366072, + "grad_norm": 1.075271725654602, + "learning_rate": 3.920245561112434e-06, + "loss": 0.8008, + "step": 34840 + }, + { + "epoch": 2.7664768104149715, + "grad_norm": 0.9537501335144043, + "learning_rate": 3.907014897726972e-06, + "loss": 0.8286, + "step": 34850 + }, + { + "epoch": 2.767270634463871, + "grad_norm": 0.8851303458213806, + "learning_rate": 3.89378423434151e-06, + "loss": 0.815, + "step": 34860 + }, + { + "epoch": 2.7680644585127707, + "grad_norm": 0.8173339366912842, + "learning_rate": 3.8805535709560474e-06, + "loss": 0.8254, + "step": 34870 + }, + { + "epoch": 2.7688582825616703, + "grad_norm": 1.0855127573013306, + "learning_rate": 3.8673229075705855e-06, + "loss": 0.7537, + "step": 34880 + }, + { + "epoch": 2.76965210661057, + "grad_norm": 1.0475999116897583, + "learning_rate": 3.854092244185124e-06, + "loss": 0.7576, + "step": 34890 + }, + { + "epoch": 2.770445930659469, + "grad_norm": 0.9665488004684448, + "learning_rate": 3.840861580799662e-06, + "loss": 0.728, + "step": 34900 + }, + { + "epoch": 2.771239754708369, + "grad_norm": 0.8024313449859619, + "learning_rate": 3.8276309174142e-06, + "loss": 0.7808, + "step": 34910 + }, + { + "epoch": 2.7720335787572683, + "grad_norm": 0.955835223197937, + "learning_rate": 3.814400254028737e-06, + "loss": 0.7961, + "step": 34920 + }, + { + "epoch": 2.772827402806168, + "grad_norm": 0.996760368347168, + "learning_rate": 3.8011695906432747e-06, + "loss": 0.7857, + "step": 34930 + }, + { + "epoch": 2.7736212268550675, + "grad_norm": 0.9988394975662231, + "learning_rate": 3.7879389272578127e-06, + "loss": 0.7821, + "step": 34940 + }, + { + "epoch": 2.774415050903967, + "grad_norm": 0.9682359099388123, + "learning_rate": 3.774708263872351e-06, + "loss": 0.8208, + "step": 34950 + }, + { + "epoch": 2.7752088749528667, + "grad_norm": 0.8783732652664185, + "learning_rate": 3.761477600486889e-06, + "loss": 0.8191, + "step": 34960 + }, + { + "epoch": 2.7760026990017663, + "grad_norm": 1.049475073814392, + "learning_rate": 3.7482469371014266e-06, + "loss": 0.733, + "step": 34970 + }, + { + "epoch": 2.776796523050666, + "grad_norm": 0.9063088893890381, + "learning_rate": 3.7350162737159638e-06, + "loss": 0.8224, + "step": 34980 + }, + { + "epoch": 2.7775903470995655, + "grad_norm": 0.9383750557899475, + "learning_rate": 3.721785610330502e-06, + "loss": 0.7338, + "step": 34990 + }, + { + "epoch": 2.778384171148465, + "grad_norm": 0.849460244178772, + "learning_rate": 3.70855494694504e-06, + "loss": 0.7359, + "step": 35000 + }, + { + "epoch": 2.7791779951973643, + "grad_norm": 1.0074882507324219, + "learning_rate": 3.695324283559578e-06, + "loss": 0.7969, + "step": 35010 + }, + { + "epoch": 2.779971819246264, + "grad_norm": 0.9304841160774231, + "learning_rate": 3.6820936201741157e-06, + "loss": 0.7987, + "step": 35020 + }, + { + "epoch": 2.7807656432951635, + "grad_norm": 1.1180306673049927, + "learning_rate": 3.6688629567886538e-06, + "loss": 0.7637, + "step": 35030 + }, + { + "epoch": 2.781559467344063, + "grad_norm": 0.9960020184516907, + "learning_rate": 3.655632293403191e-06, + "loss": 0.7664, + "step": 35040 + }, + { + "epoch": 2.7823532913929627, + "grad_norm": 1.1792720556259155, + "learning_rate": 3.642401630017729e-06, + "loss": 0.7887, + "step": 35050 + }, + { + "epoch": 2.7831471154418623, + "grad_norm": 0.7898070216178894, + "learning_rate": 3.629170966632267e-06, + "loss": 0.8356, + "step": 35060 + }, + { + "epoch": 2.783940939490762, + "grad_norm": 0.7194439172744751, + "learning_rate": 3.6159403032468052e-06, + "loss": 0.7888, + "step": 35070 + }, + { + "epoch": 2.7847347635396615, + "grad_norm": 1.077726125717163, + "learning_rate": 3.602709639861343e-06, + "loss": 0.8264, + "step": 35080 + }, + { + "epoch": 2.785528587588561, + "grad_norm": 1.00077486038208, + "learning_rate": 3.589478976475881e-06, + "loss": 0.7843, + "step": 35090 + }, + { + "epoch": 2.7863224116374603, + "grad_norm": 1.037901520729065, + "learning_rate": 3.576248313090418e-06, + "loss": 0.7137, + "step": 35100 + }, + { + "epoch": 2.7871162356863604, + "grad_norm": 0.8961439728736877, + "learning_rate": 3.5630176497049563e-06, + "loss": 0.7605, + "step": 35110 + }, + { + "epoch": 2.7879100597352595, + "grad_norm": 0.9663600325584412, + "learning_rate": 3.5497869863194944e-06, + "loss": 0.732, + "step": 35120 + }, + { + "epoch": 2.788703883784159, + "grad_norm": 0.8328473567962646, + "learning_rate": 3.536556322934032e-06, + "loss": 0.7589, + "step": 35130 + }, + { + "epoch": 2.7894977078330587, + "grad_norm": 1.0931073427200317, + "learning_rate": 3.52332565954857e-06, + "loss": 0.8035, + "step": 35140 + }, + { + "epoch": 2.7902915318819583, + "grad_norm": 0.8528993725776672, + "learning_rate": 3.5100949961631073e-06, + "loss": 0.818, + "step": 35150 + }, + { + "epoch": 2.791085355930858, + "grad_norm": 0.9126871228218079, + "learning_rate": 3.4968643327776454e-06, + "loss": 0.7092, + "step": 35160 + }, + { + "epoch": 2.7918791799797575, + "grad_norm": 0.8456578850746155, + "learning_rate": 3.4836336693921835e-06, + "loss": 0.7571, + "step": 35170 + }, + { + "epoch": 2.792673004028657, + "grad_norm": 1.0316028594970703, + "learning_rate": 3.470403006006721e-06, + "loss": 0.7413, + "step": 35180 + }, + { + "epoch": 2.7934668280775568, + "grad_norm": 0.8580575585365295, + "learning_rate": 3.4571723426212592e-06, + "loss": 0.7685, + "step": 35190 + }, + { + "epoch": 2.7942606521264564, + "grad_norm": 0.9380501508712769, + "learning_rate": 3.4439416792357973e-06, + "loss": 0.7756, + "step": 35200 + }, + { + "epoch": 2.7950544761753555, + "grad_norm": 0.8025434613227844, + "learning_rate": 3.4307110158503345e-06, + "loss": 0.7646, + "step": 35210 + }, + { + "epoch": 2.7958483002242556, + "grad_norm": 0.828920841217041, + "learning_rate": 3.4174803524648726e-06, + "loss": 0.8187, + "step": 35220 + }, + { + "epoch": 2.7966421242731547, + "grad_norm": 0.8058600425720215, + "learning_rate": 3.4042496890794107e-06, + "loss": 0.7068, + "step": 35230 + }, + { + "epoch": 2.7974359483220543, + "grad_norm": 0.9694280028343201, + "learning_rate": 3.3910190256939484e-06, + "loss": 0.7607, + "step": 35240 + }, + { + "epoch": 2.798229772370954, + "grad_norm": 0.9409134984016418, + "learning_rate": 3.3777883623084864e-06, + "loss": 0.7601, + "step": 35250 + }, + { + "epoch": 2.7990235964198535, + "grad_norm": 1.1917532682418823, + "learning_rate": 3.3645576989230245e-06, + "loss": 0.7952, + "step": 35260 + }, + { + "epoch": 2.799817420468753, + "grad_norm": 0.8570424318313599, + "learning_rate": 3.3513270355375618e-06, + "loss": 0.8205, + "step": 35270 + }, + { + "epoch": 2.8006112445176528, + "grad_norm": 1.0113235712051392, + "learning_rate": 3.3380963721521e-06, + "loss": 0.8328, + "step": 35280 + }, + { + "epoch": 2.8014050685665524, + "grad_norm": 0.9469982385635376, + "learning_rate": 3.3248657087666375e-06, + "loss": 0.7525, + "step": 35290 + }, + { + "epoch": 2.8021988926154515, + "grad_norm": 1.03362238407135, + "learning_rate": 3.3116350453811756e-06, + "loss": 0.7338, + "step": 35300 + }, + { + "epoch": 2.8029927166643516, + "grad_norm": 1.1091042757034302, + "learning_rate": 3.2984043819957137e-06, + "loss": 0.7698, + "step": 35310 + }, + { + "epoch": 2.8037865407132507, + "grad_norm": 1.1740446090698242, + "learning_rate": 3.2851737186102517e-06, + "loss": 0.7686, + "step": 35320 + }, + { + "epoch": 2.8045803647621503, + "grad_norm": 1.0045807361602783, + "learning_rate": 3.271943055224789e-06, + "loss": 0.7481, + "step": 35330 + }, + { + "epoch": 2.80537418881105, + "grad_norm": 0.9175392985343933, + "learning_rate": 3.258712391839327e-06, + "loss": 0.7549, + "step": 35340 + }, + { + "epoch": 2.8061680128599495, + "grad_norm": 1.221364974975586, + "learning_rate": 3.2454817284538647e-06, + "loss": 0.7778, + "step": 35350 + }, + { + "epoch": 2.806961836908849, + "grad_norm": 0.9719924330711365, + "learning_rate": 3.2322510650684028e-06, + "loss": 0.7635, + "step": 35360 + }, + { + "epoch": 2.8077556609577488, + "grad_norm": 0.9859107732772827, + "learning_rate": 3.219020401682941e-06, + "loss": 0.8012, + "step": 35370 + }, + { + "epoch": 2.8085494850066484, + "grad_norm": 1.052498698234558, + "learning_rate": 3.2057897382974785e-06, + "loss": 0.8148, + "step": 35380 + }, + { + "epoch": 2.809343309055548, + "grad_norm": 0.7815044522285461, + "learning_rate": 3.192559074912016e-06, + "loss": 0.7499, + "step": 35390 + }, + { + "epoch": 2.8101371331044476, + "grad_norm": 1.2309107780456543, + "learning_rate": 3.179328411526554e-06, + "loss": 0.8063, + "step": 35400 + }, + { + "epoch": 2.8109309571533467, + "grad_norm": 1.1573877334594727, + "learning_rate": 3.166097748141092e-06, + "loss": 0.7626, + "step": 35410 + }, + { + "epoch": 2.811724781202247, + "grad_norm": 1.0697295665740967, + "learning_rate": 3.15286708475563e-06, + "loss": 0.7613, + "step": 35420 + }, + { + "epoch": 2.812518605251146, + "grad_norm": 0.9957389235496521, + "learning_rate": 3.139636421370168e-06, + "loss": 0.7382, + "step": 35430 + }, + { + "epoch": 2.8133124293000455, + "grad_norm": 0.9990561604499817, + "learning_rate": 3.1264057579847053e-06, + "loss": 0.7597, + "step": 35440 + }, + { + "epoch": 2.814106253348945, + "grad_norm": 1.1225290298461914, + "learning_rate": 3.1131750945992434e-06, + "loss": 0.817, + "step": 35450 + }, + { + "epoch": 2.8149000773978448, + "grad_norm": 1.0921093225479126, + "learning_rate": 3.099944431213781e-06, + "loss": 0.7977, + "step": 35460 + }, + { + "epoch": 2.8156939014467444, + "grad_norm": 0.9584634304046631, + "learning_rate": 3.086713767828319e-06, + "loss": 0.7256, + "step": 35470 + }, + { + "epoch": 2.816487725495644, + "grad_norm": 0.8979785442352295, + "learning_rate": 3.073483104442857e-06, + "loss": 0.8171, + "step": 35480 + }, + { + "epoch": 2.8172815495445436, + "grad_norm": 0.9863408803939819, + "learning_rate": 3.060252441057395e-06, + "loss": 0.8187, + "step": 35490 + }, + { + "epoch": 2.818075373593443, + "grad_norm": 0.9419934153556824, + "learning_rate": 3.0470217776719325e-06, + "loss": 0.7236, + "step": 35500 + }, + { + "epoch": 2.818869197642343, + "grad_norm": 0.7588933706283569, + "learning_rate": 3.0337911142864706e-06, + "loss": 0.7996, + "step": 35510 + }, + { + "epoch": 2.819663021691242, + "grad_norm": 0.9996737837791443, + "learning_rate": 3.0205604509010083e-06, + "loss": 0.7806, + "step": 35520 + }, + { + "epoch": 2.8204568457401415, + "grad_norm": 1.1076947450637817, + "learning_rate": 3.0073297875155463e-06, + "loss": 0.7945, + "step": 35530 + }, + { + "epoch": 2.821250669789041, + "grad_norm": 0.942015528678894, + "learning_rate": 2.9940991241300844e-06, + "loss": 0.7777, + "step": 35540 + }, + { + "epoch": 2.8220444938379408, + "grad_norm": 0.9087643027305603, + "learning_rate": 2.9808684607446216e-06, + "loss": 0.8318, + "step": 35550 + }, + { + "epoch": 2.8228383178868404, + "grad_norm": 1.026443600654602, + "learning_rate": 2.9676377973591597e-06, + "loss": 0.7698, + "step": 35560 + }, + { + "epoch": 2.82363214193574, + "grad_norm": 0.9382554888725281, + "learning_rate": 2.9544071339736974e-06, + "loss": 0.7869, + "step": 35570 + }, + { + "epoch": 2.8244259659846396, + "grad_norm": 0.7953215837478638, + "learning_rate": 2.9411764705882355e-06, + "loss": 0.7498, + "step": 35580 + }, + { + "epoch": 2.825219790033539, + "grad_norm": 0.9464923143386841, + "learning_rate": 2.9279458072027735e-06, + "loss": 0.7229, + "step": 35590 + }, + { + "epoch": 2.826013614082439, + "grad_norm": 0.9360369443893433, + "learning_rate": 2.914715143817311e-06, + "loss": 0.7409, + "step": 35600 + }, + { + "epoch": 2.826807438131338, + "grad_norm": 0.9968975186347961, + "learning_rate": 2.901484480431849e-06, + "loss": 0.8111, + "step": 35610 + }, + { + "epoch": 2.827601262180238, + "grad_norm": 0.7860965132713318, + "learning_rate": 2.888253817046387e-06, + "loss": 0.8074, + "step": 35620 + }, + { + "epoch": 2.828395086229137, + "grad_norm": 0.8920134902000427, + "learning_rate": 2.8750231536609246e-06, + "loss": 0.7975, + "step": 35630 + }, + { + "epoch": 2.8291889102780368, + "grad_norm": 0.802579939365387, + "learning_rate": 2.8617924902754627e-06, + "loss": 0.7383, + "step": 35640 + }, + { + "epoch": 2.8299827343269364, + "grad_norm": 0.790934681892395, + "learning_rate": 2.8485618268900003e-06, + "loss": 0.7811, + "step": 35650 + }, + { + "epoch": 2.830776558375836, + "grad_norm": 0.9820284247398376, + "learning_rate": 2.835331163504538e-06, + "loss": 0.7466, + "step": 35660 + }, + { + "epoch": 2.8315703824247356, + "grad_norm": 1.035416841506958, + "learning_rate": 2.822100500119076e-06, + "loss": 0.7927, + "step": 35670 + }, + { + "epoch": 2.832364206473635, + "grad_norm": 1.0442098379135132, + "learning_rate": 2.808869836733614e-06, + "loss": 0.7825, + "step": 35680 + }, + { + "epoch": 2.833158030522535, + "grad_norm": 1.0599391460418701, + "learning_rate": 2.795639173348152e-06, + "loss": 0.8229, + "step": 35690 + }, + { + "epoch": 2.8339518545714344, + "grad_norm": 1.2767279148101807, + "learning_rate": 2.78240850996269e-06, + "loss": 0.7614, + "step": 35700 + }, + { + "epoch": 2.834745678620334, + "grad_norm": 1.054215431213379, + "learning_rate": 2.7691778465772275e-06, + "loss": 0.799, + "step": 35710 + }, + { + "epoch": 2.835539502669233, + "grad_norm": 0.8956333994865417, + "learning_rate": 2.755947183191765e-06, + "loss": 0.7311, + "step": 35720 + }, + { + "epoch": 2.8363333267181328, + "grad_norm": 1.0596696138381958, + "learning_rate": 2.7427165198063033e-06, + "loss": 0.7634, + "step": 35730 + }, + { + "epoch": 2.8371271507670324, + "grad_norm": 1.0594090223312378, + "learning_rate": 2.7294858564208414e-06, + "loss": 0.8017, + "step": 35740 + }, + { + "epoch": 2.837920974815932, + "grad_norm": 1.278930902481079, + "learning_rate": 2.716255193035379e-06, + "loss": 0.7555, + "step": 35750 + }, + { + "epoch": 2.8387147988648316, + "grad_norm": 1.1408640146255493, + "learning_rate": 2.7030245296499167e-06, + "loss": 0.7985, + "step": 35760 + }, + { + "epoch": 2.839508622913731, + "grad_norm": 0.8677951693534851, + "learning_rate": 2.6897938662644547e-06, + "loss": 0.7973, + "step": 35770 + }, + { + "epoch": 2.840302446962631, + "grad_norm": 0.9274892807006836, + "learning_rate": 2.6765632028789924e-06, + "loss": 0.81, + "step": 35780 + }, + { + "epoch": 2.8410962710115304, + "grad_norm": 0.9693270921707153, + "learning_rate": 2.6633325394935305e-06, + "loss": 0.7442, + "step": 35790 + }, + { + "epoch": 2.84189009506043, + "grad_norm": 0.9193242192268372, + "learning_rate": 2.6501018761080686e-06, + "loss": 0.7411, + "step": 35800 + }, + { + "epoch": 2.842683919109329, + "grad_norm": 0.9392445087432861, + "learning_rate": 2.636871212722606e-06, + "loss": 0.7334, + "step": 35810 + }, + { + "epoch": 2.843477743158229, + "grad_norm": 0.9353786706924438, + "learning_rate": 2.623640549337144e-06, + "loss": 0.8122, + "step": 35820 + }, + { + "epoch": 2.8442715672071284, + "grad_norm": 0.7990022301673889, + "learning_rate": 2.6104098859516815e-06, + "loss": 0.7969, + "step": 35830 + }, + { + "epoch": 2.845065391256028, + "grad_norm": 0.7755838632583618, + "learning_rate": 2.5971792225662196e-06, + "loss": 0.8085, + "step": 35840 + }, + { + "epoch": 2.8458592153049276, + "grad_norm": 0.9300201535224915, + "learning_rate": 2.5839485591807577e-06, + "loss": 0.8288, + "step": 35850 + }, + { + "epoch": 2.846653039353827, + "grad_norm": 1.0524238348007202, + "learning_rate": 2.5707178957952953e-06, + "loss": 0.7261, + "step": 35860 + }, + { + "epoch": 2.847446863402727, + "grad_norm": 1.0584592819213867, + "learning_rate": 2.557487232409833e-06, + "loss": 0.779, + "step": 35870 + }, + { + "epoch": 2.8482406874516264, + "grad_norm": 1.093838095664978, + "learning_rate": 2.544256569024371e-06, + "loss": 0.8021, + "step": 35880 + }, + { + "epoch": 2.849034511500526, + "grad_norm": 0.6603565216064453, + "learning_rate": 2.5310259056389087e-06, + "loss": 0.8322, + "step": 35890 + }, + { + "epoch": 2.8498283355494256, + "grad_norm": 0.9231084585189819, + "learning_rate": 2.517795242253447e-06, + "loss": 0.7669, + "step": 35900 + }, + { + "epoch": 2.850622159598325, + "grad_norm": 0.9503127336502075, + "learning_rate": 2.5045645788679845e-06, + "loss": 0.7969, + "step": 35910 + }, + { + "epoch": 2.8514159836472244, + "grad_norm": 0.9831972122192383, + "learning_rate": 2.491333915482522e-06, + "loss": 0.7686, + "step": 35920 + }, + { + "epoch": 2.8522098076961244, + "grad_norm": 0.9379409551620483, + "learning_rate": 2.4781032520970602e-06, + "loss": 0.7717, + "step": 35930 + }, + { + "epoch": 2.8530036317450236, + "grad_norm": 0.9157143235206604, + "learning_rate": 2.4648725887115983e-06, + "loss": 0.7746, + "step": 35940 + }, + { + "epoch": 2.853797455793923, + "grad_norm": 0.896962583065033, + "learning_rate": 2.451641925326136e-06, + "loss": 0.7007, + "step": 35950 + }, + { + "epoch": 2.854591279842823, + "grad_norm": 1.0108494758605957, + "learning_rate": 2.438411261940674e-06, + "loss": 0.7847, + "step": 35960 + }, + { + "epoch": 2.8553851038917224, + "grad_norm": 1.0298715829849243, + "learning_rate": 2.4251805985552117e-06, + "loss": 0.7752, + "step": 35970 + }, + { + "epoch": 2.856178927940622, + "grad_norm": 1.0451425313949585, + "learning_rate": 2.4119499351697493e-06, + "loss": 0.7909, + "step": 35980 + }, + { + "epoch": 2.8569727519895216, + "grad_norm": 0.9338813424110413, + "learning_rate": 2.3987192717842874e-06, + "loss": 0.7663, + "step": 35990 + }, + { + "epoch": 2.857766576038421, + "grad_norm": 0.905019998550415, + "learning_rate": 2.3854886083988255e-06, + "loss": 0.746, + "step": 36000 + }, + { + "epoch": 2.8585604000873204, + "grad_norm": 0.8356999754905701, + "learning_rate": 2.372257945013363e-06, + "loss": 0.7736, + "step": 36010 + }, + { + "epoch": 2.8593542241362204, + "grad_norm": 1.016014575958252, + "learning_rate": 2.359027281627901e-06, + "loss": 0.8181, + "step": 36020 + }, + { + "epoch": 2.8601480481851196, + "grad_norm": 0.9665111303329468, + "learning_rate": 2.345796618242439e-06, + "loss": 0.7668, + "step": 36030 + }, + { + "epoch": 2.860941872234019, + "grad_norm": 0.8725518584251404, + "learning_rate": 2.3325659548569766e-06, + "loss": 0.8637, + "step": 36040 + }, + { + "epoch": 2.861735696282919, + "grad_norm": 1.1010116338729858, + "learning_rate": 2.3193352914715146e-06, + "loss": 0.7869, + "step": 36050 + }, + { + "epoch": 2.8625295203318184, + "grad_norm": 1.0355409383773804, + "learning_rate": 2.3061046280860527e-06, + "loss": 0.7937, + "step": 36060 + }, + { + "epoch": 2.863323344380718, + "grad_norm": 1.042080283164978, + "learning_rate": 2.29287396470059e-06, + "loss": 0.7468, + "step": 36070 + }, + { + "epoch": 2.8641171684296176, + "grad_norm": 0.9209195375442505, + "learning_rate": 2.279643301315128e-06, + "loss": 0.765, + "step": 36080 + }, + { + "epoch": 2.864910992478517, + "grad_norm": 0.9951355457305908, + "learning_rate": 2.2664126379296657e-06, + "loss": 0.7323, + "step": 36090 + }, + { + "epoch": 2.865704816527417, + "grad_norm": 0.969089925289154, + "learning_rate": 2.2531819745442038e-06, + "loss": 0.7319, + "step": 36100 + }, + { + "epoch": 2.8664986405763164, + "grad_norm": 1.1039822101593018, + "learning_rate": 2.239951311158742e-06, + "loss": 0.7683, + "step": 36110 + }, + { + "epoch": 2.8672924646252156, + "grad_norm": 1.0060070753097534, + "learning_rate": 2.2267206477732795e-06, + "loss": 0.7516, + "step": 36120 + }, + { + "epoch": 2.8680862886741156, + "grad_norm": 0.8177821636199951, + "learning_rate": 2.213489984387817e-06, + "loss": 0.8189, + "step": 36130 + }, + { + "epoch": 2.868880112723015, + "grad_norm": 0.8943607807159424, + "learning_rate": 2.2002593210023552e-06, + "loss": 0.7652, + "step": 36140 + }, + { + "epoch": 2.8696739367719144, + "grad_norm": 0.8499715328216553, + "learning_rate": 2.187028657616893e-06, + "loss": 0.8638, + "step": 36150 + }, + { + "epoch": 2.870467760820814, + "grad_norm": 0.8518121838569641, + "learning_rate": 2.173797994231431e-06, + "loss": 0.8079, + "step": 36160 + }, + { + "epoch": 2.8712615848697136, + "grad_norm": 1.0183836221694946, + "learning_rate": 2.1605673308459686e-06, + "loss": 0.7418, + "step": 36170 + }, + { + "epoch": 2.872055408918613, + "grad_norm": 1.092712640762329, + "learning_rate": 2.1473366674605063e-06, + "loss": 0.7641, + "step": 36180 + }, + { + "epoch": 2.872849232967513, + "grad_norm": 1.0446547269821167, + "learning_rate": 2.1341060040750444e-06, + "loss": 0.79, + "step": 36190 + }, + { + "epoch": 2.8736430570164124, + "grad_norm": 0.9164413213729858, + "learning_rate": 2.1208753406895824e-06, + "loss": 0.8226, + "step": 36200 + }, + { + "epoch": 2.874436881065312, + "grad_norm": 0.9824967980384827, + "learning_rate": 2.10764467730412e-06, + "loss": 0.7573, + "step": 36210 + }, + { + "epoch": 2.8752307051142116, + "grad_norm": 1.1960909366607666, + "learning_rate": 2.094414013918658e-06, + "loss": 0.7664, + "step": 36220 + }, + { + "epoch": 2.876024529163111, + "grad_norm": 1.140021562576294, + "learning_rate": 2.081183350533196e-06, + "loss": 0.7775, + "step": 36230 + }, + { + "epoch": 2.8768183532120104, + "grad_norm": 1.0924111604690552, + "learning_rate": 2.0679526871477335e-06, + "loss": 0.7845, + "step": 36240 + }, + { + "epoch": 2.87761217726091, + "grad_norm": 0.8583391308784485, + "learning_rate": 2.0547220237622716e-06, + "loss": 0.8183, + "step": 36250 + }, + { + "epoch": 2.8784060013098096, + "grad_norm": 1.004576563835144, + "learning_rate": 2.0414913603768097e-06, + "loss": 0.7346, + "step": 36260 + }, + { + "epoch": 2.879199825358709, + "grad_norm": 0.8908334374427795, + "learning_rate": 2.0282606969913473e-06, + "loss": 0.86, + "step": 36270 + }, + { + "epoch": 2.879993649407609, + "grad_norm": 1.0126510858535767, + "learning_rate": 2.015030033605885e-06, + "loss": 0.8082, + "step": 36280 + }, + { + "epoch": 2.8807874734565084, + "grad_norm": 1.0101633071899414, + "learning_rate": 2.001799370220423e-06, + "loss": 0.796, + "step": 36290 + }, + { + "epoch": 2.881581297505408, + "grad_norm": 0.8697376847267151, + "learning_rate": 1.9885687068349607e-06, + "loss": 0.7772, + "step": 36300 + }, + { + "epoch": 2.8823751215543076, + "grad_norm": 1.0588469505310059, + "learning_rate": 1.9753380434494988e-06, + "loss": 0.7941, + "step": 36310 + }, + { + "epoch": 2.883168945603207, + "grad_norm": 0.8329793810844421, + "learning_rate": 1.962107380064037e-06, + "loss": 0.7721, + "step": 36320 + }, + { + "epoch": 2.883962769652107, + "grad_norm": 1.057317852973938, + "learning_rate": 1.948876716678574e-06, + "loss": 0.7572, + "step": 36330 + }, + { + "epoch": 2.884756593701006, + "grad_norm": 0.87096107006073, + "learning_rate": 1.935646053293112e-06, + "loss": 0.7499, + "step": 36340 + }, + { + "epoch": 2.8855504177499056, + "grad_norm": 0.9541813135147095, + "learning_rate": 1.92241538990765e-06, + "loss": 0.7405, + "step": 36350 + }, + { + "epoch": 2.886344241798805, + "grad_norm": 0.7810908555984497, + "learning_rate": 1.909184726522188e-06, + "loss": 0.7629, + "step": 36360 + }, + { + "epoch": 2.887138065847705, + "grad_norm": 0.8539981842041016, + "learning_rate": 1.8959540631367258e-06, + "loss": 0.7746, + "step": 36370 + }, + { + "epoch": 2.8879318898966044, + "grad_norm": 1.008750081062317, + "learning_rate": 1.8827233997512634e-06, + "loss": 0.8307, + "step": 36380 + }, + { + "epoch": 2.888725713945504, + "grad_norm": 0.9009593725204468, + "learning_rate": 1.8694927363658015e-06, + "loss": 0.7134, + "step": 36390 + }, + { + "epoch": 2.8895195379944036, + "grad_norm": 1.03337562084198, + "learning_rate": 1.8562620729803394e-06, + "loss": 0.7966, + "step": 36400 + }, + { + "epoch": 2.8903133620433032, + "grad_norm": 0.9321558475494385, + "learning_rate": 1.843031409594877e-06, + "loss": 0.8321, + "step": 36410 + }, + { + "epoch": 2.891107186092203, + "grad_norm": 0.8705295324325562, + "learning_rate": 1.829800746209415e-06, + "loss": 0.821, + "step": 36420 + }, + { + "epoch": 2.891901010141102, + "grad_norm": 1.093181848526001, + "learning_rate": 1.816570082823953e-06, + "loss": 0.7454, + "step": 36430 + }, + { + "epoch": 2.892694834190002, + "grad_norm": 1.1398403644561768, + "learning_rate": 1.8033394194384906e-06, + "loss": 0.8026, + "step": 36440 + }, + { + "epoch": 2.8934886582389012, + "grad_norm": 0.8322063088417053, + "learning_rate": 1.7901087560530285e-06, + "loss": 0.8001, + "step": 36450 + }, + { + "epoch": 2.894282482287801, + "grad_norm": 0.9073172807693481, + "learning_rate": 1.7768780926675666e-06, + "loss": 0.8225, + "step": 36460 + }, + { + "epoch": 2.8950763063367004, + "grad_norm": 0.8599694967269897, + "learning_rate": 1.7636474292821043e-06, + "loss": 0.7355, + "step": 36470 + }, + { + "epoch": 2.8958701303856, + "grad_norm": 0.9327001571655273, + "learning_rate": 1.7504167658966421e-06, + "loss": 0.7628, + "step": 36480 + }, + { + "epoch": 2.8966639544344996, + "grad_norm": 1.1032301187515259, + "learning_rate": 1.7371861025111802e-06, + "loss": 0.7719, + "step": 36490 + }, + { + "epoch": 2.8974577784833992, + "grad_norm": 1.0107958316802979, + "learning_rate": 1.7239554391257176e-06, + "loss": 0.7785, + "step": 36500 + }, + { + "epoch": 2.898251602532299, + "grad_norm": 0.9559915065765381, + "learning_rate": 1.7107247757402557e-06, + "loss": 0.7749, + "step": 36510 + }, + { + "epoch": 2.899045426581198, + "grad_norm": 0.9474267363548279, + "learning_rate": 1.6974941123547936e-06, + "loss": 0.8143, + "step": 36520 + }, + { + "epoch": 2.899839250630098, + "grad_norm": 0.7721654772758484, + "learning_rate": 1.6842634489693313e-06, + "loss": 0.7488, + "step": 36530 + }, + { + "epoch": 2.9006330746789972, + "grad_norm": 0.8206939697265625, + "learning_rate": 1.6710327855838693e-06, + "loss": 0.7174, + "step": 36540 + }, + { + "epoch": 2.901426898727897, + "grad_norm": 0.8247297406196594, + "learning_rate": 1.6578021221984072e-06, + "loss": 0.7827, + "step": 36550 + }, + { + "epoch": 2.9022207227767964, + "grad_norm": 1.261728286743164, + "learning_rate": 1.6445714588129449e-06, + "loss": 0.7458, + "step": 36560 + }, + { + "epoch": 2.903014546825696, + "grad_norm": 0.9776942729949951, + "learning_rate": 1.631340795427483e-06, + "loss": 0.7707, + "step": 36570 + }, + { + "epoch": 2.9038083708745956, + "grad_norm": 1.0205057859420776, + "learning_rate": 1.6181101320420208e-06, + "loss": 0.7757, + "step": 36580 + }, + { + "epoch": 2.9046021949234953, + "grad_norm": 1.0153295993804932, + "learning_rate": 1.6048794686565585e-06, + "loss": 0.7932, + "step": 36590 + }, + { + "epoch": 2.905396018972395, + "grad_norm": 1.073330044746399, + "learning_rate": 1.5916488052710963e-06, + "loss": 0.742, + "step": 36600 + }, + { + "epoch": 2.9061898430212945, + "grad_norm": 1.0096873044967651, + "learning_rate": 1.578418141885634e-06, + "loss": 0.8228, + "step": 36610 + }, + { + "epoch": 2.906983667070194, + "grad_norm": 1.1333979368209839, + "learning_rate": 1.565187478500172e-06, + "loss": 0.7219, + "step": 36620 + }, + { + "epoch": 2.9077774911190932, + "grad_norm": 1.1649788618087769, + "learning_rate": 1.55195681511471e-06, + "loss": 0.8223, + "step": 36630 + }, + { + "epoch": 2.9085713151679933, + "grad_norm": 0.8990685939788818, + "learning_rate": 1.5387261517292478e-06, + "loss": 0.7569, + "step": 36640 + }, + { + "epoch": 2.9093651392168924, + "grad_norm": 1.097396731376648, + "learning_rate": 1.5254954883437857e-06, + "loss": 0.7456, + "step": 36650 + }, + { + "epoch": 2.910158963265792, + "grad_norm": 0.9843643307685852, + "learning_rate": 1.5122648249583233e-06, + "loss": 0.7238, + "step": 36660 + }, + { + "epoch": 2.9109527873146916, + "grad_norm": 1.0941895246505737, + "learning_rate": 1.4990341615728614e-06, + "loss": 0.8031, + "step": 36670 + }, + { + "epoch": 2.9117466113635913, + "grad_norm": 0.9502320289611816, + "learning_rate": 1.485803498187399e-06, + "loss": 0.8127, + "step": 36680 + }, + { + "epoch": 2.912540435412491, + "grad_norm": 0.8268551826477051, + "learning_rate": 1.472572834801937e-06, + "loss": 0.7619, + "step": 36690 + }, + { + "epoch": 2.9133342594613905, + "grad_norm": 1.1277645826339722, + "learning_rate": 1.459342171416475e-06, + "loss": 0.8046, + "step": 36700 + }, + { + "epoch": 2.91412808351029, + "grad_norm": 0.9089832901954651, + "learning_rate": 1.4461115080310127e-06, + "loss": 0.7497, + "step": 36710 + }, + { + "epoch": 2.9149219075591892, + "grad_norm": 0.9186846017837524, + "learning_rate": 1.4328808446455505e-06, + "loss": 0.771, + "step": 36720 + }, + { + "epoch": 2.9157157316080893, + "grad_norm": 1.0367939472198486, + "learning_rate": 1.4196501812600884e-06, + "loss": 0.7857, + "step": 36730 + }, + { + "epoch": 2.9165095556569884, + "grad_norm": 0.9849185347557068, + "learning_rate": 1.4064195178746263e-06, + "loss": 0.8304, + "step": 36740 + }, + { + "epoch": 2.917303379705888, + "grad_norm": 1.022588849067688, + "learning_rate": 1.3931888544891641e-06, + "loss": 0.7862, + "step": 36750 + }, + { + "epoch": 2.9180972037547876, + "grad_norm": 0.8609658479690552, + "learning_rate": 1.379958191103702e-06, + "loss": 0.7718, + "step": 36760 + }, + { + "epoch": 2.9188910278036873, + "grad_norm": 0.9891222715377808, + "learning_rate": 1.3667275277182399e-06, + "loss": 0.7544, + "step": 36770 + }, + { + "epoch": 2.919684851852587, + "grad_norm": 1.002481460571289, + "learning_rate": 1.3534968643327777e-06, + "loss": 0.8537, + "step": 36780 + }, + { + "epoch": 2.9204786759014865, + "grad_norm": 1.0346823930740356, + "learning_rate": 1.3402662009473156e-06, + "loss": 0.7605, + "step": 36790 + }, + { + "epoch": 2.921272499950386, + "grad_norm": 1.154091715812683, + "learning_rate": 1.3270355375618535e-06, + "loss": 0.7222, + "step": 36800 + }, + { + "epoch": 2.9220663239992857, + "grad_norm": 1.07093346118927, + "learning_rate": 1.3138048741763914e-06, + "loss": 0.8214, + "step": 36810 + }, + { + "epoch": 2.9228601480481853, + "grad_norm": 0.9682141542434692, + "learning_rate": 1.300574210790929e-06, + "loss": 0.7921, + "step": 36820 + }, + { + "epoch": 2.9236539720970844, + "grad_norm": 0.9235316514968872, + "learning_rate": 1.287343547405467e-06, + "loss": 0.8082, + "step": 36830 + }, + { + "epoch": 2.9244477961459845, + "grad_norm": 1.0996876955032349, + "learning_rate": 1.2741128840200047e-06, + "loss": 0.7969, + "step": 36840 + }, + { + "epoch": 2.9252416201948837, + "grad_norm": 0.8696537017822266, + "learning_rate": 1.2608822206345426e-06, + "loss": 0.7344, + "step": 36850 + }, + { + "epoch": 2.9260354442437833, + "grad_norm": 1.0605179071426392, + "learning_rate": 1.2476515572490805e-06, + "loss": 0.7609, + "step": 36860 + }, + { + "epoch": 2.926829268292683, + "grad_norm": 1.034128189086914, + "learning_rate": 1.2344208938636184e-06, + "loss": 0.8119, + "step": 36870 + }, + { + "epoch": 2.9276230923415825, + "grad_norm": 1.083617091178894, + "learning_rate": 1.2211902304781562e-06, + "loss": 0.7628, + "step": 36880 + }, + { + "epoch": 2.928416916390482, + "grad_norm": 1.001076102256775, + "learning_rate": 1.207959567092694e-06, + "loss": 0.7721, + "step": 36890 + }, + { + "epoch": 2.9292107404393817, + "grad_norm": 0.8138130903244019, + "learning_rate": 1.194728903707232e-06, + "loss": 0.7045, + "step": 36900 + }, + { + "epoch": 2.9300045644882813, + "grad_norm": 0.8781431317329407, + "learning_rate": 1.1814982403217698e-06, + "loss": 0.8074, + "step": 36910 + }, + { + "epoch": 2.930798388537181, + "grad_norm": 0.8041318655014038, + "learning_rate": 1.1682675769363077e-06, + "loss": 0.7763, + "step": 36920 + }, + { + "epoch": 2.9315922125860805, + "grad_norm": 0.9655971527099609, + "learning_rate": 1.1550369135508456e-06, + "loss": 0.7511, + "step": 36930 + }, + { + "epoch": 2.9323860366349797, + "grad_norm": 1.0426075458526611, + "learning_rate": 1.1418062501653834e-06, + "loss": 0.7663, + "step": 36940 + }, + { + "epoch": 2.9331798606838793, + "grad_norm": 1.0765414237976074, + "learning_rate": 1.128575586779921e-06, + "loss": 0.8248, + "step": 36950 + }, + { + "epoch": 2.933973684732779, + "grad_norm": 0.8941267132759094, + "learning_rate": 1.1153449233944592e-06, + "loss": 0.8233, + "step": 36960 + }, + { + "epoch": 2.9347675087816785, + "grad_norm": 1.1485576629638672, + "learning_rate": 1.1021142600089968e-06, + "loss": 0.7031, + "step": 36970 + }, + { + "epoch": 2.935561332830578, + "grad_norm": 1.0902643203735352, + "learning_rate": 1.0888835966235347e-06, + "loss": 0.7641, + "step": 36980 + }, + { + "epoch": 2.9363551568794777, + "grad_norm": 0.8667743802070618, + "learning_rate": 1.0756529332380728e-06, + "loss": 0.7518, + "step": 36990 + }, + { + "epoch": 2.9371489809283773, + "grad_norm": 1.2408496141433716, + "learning_rate": 1.0624222698526104e-06, + "loss": 0.8086, + "step": 37000 + }, + { + "epoch": 2.937942804977277, + "grad_norm": 0.9038211703300476, + "learning_rate": 1.0491916064671483e-06, + "loss": 0.7384, + "step": 37010 + }, + { + "epoch": 2.9387366290261765, + "grad_norm": 1.0854371786117554, + "learning_rate": 1.0359609430816862e-06, + "loss": 0.73, + "step": 37020 + }, + { + "epoch": 2.9395304530750757, + "grad_norm": 0.986851155757904, + "learning_rate": 1.022730279696224e-06, + "loss": 0.7916, + "step": 37030 + }, + { + "epoch": 2.9403242771239757, + "grad_norm": 0.9306720495223999, + "learning_rate": 1.009499616310762e-06, + "loss": 0.7434, + "step": 37040 + }, + { + "epoch": 2.941118101172875, + "grad_norm": 1.075705647468567, + "learning_rate": 9.962689529252998e-07, + "loss": 0.7953, + "step": 37050 + }, + { + "epoch": 2.9419119252217745, + "grad_norm": 0.9013558626174927, + "learning_rate": 9.830382895398376e-07, + "loss": 0.7916, + "step": 37060 + }, + { + "epoch": 2.942705749270674, + "grad_norm": 1.0537384748458862, + "learning_rate": 9.698076261543755e-07, + "loss": 0.768, + "step": 37070 + }, + { + "epoch": 2.9434995733195737, + "grad_norm": 0.8394224643707275, + "learning_rate": 9.565769627689132e-07, + "loss": 0.8091, + "step": 37080 + }, + { + "epoch": 2.9442933973684733, + "grad_norm": 0.9808106422424316, + "learning_rate": 9.433462993834511e-07, + "loss": 0.7533, + "step": 37090 + }, + { + "epoch": 2.945087221417373, + "grad_norm": 0.8724024295806885, + "learning_rate": 9.30115635997989e-07, + "loss": 0.736, + "step": 37100 + }, + { + "epoch": 2.9458810454662725, + "grad_norm": 1.0221234560012817, + "learning_rate": 9.168849726125268e-07, + "loss": 0.8032, + "step": 37110 + }, + { + "epoch": 2.946674869515172, + "grad_norm": 0.8556252121925354, + "learning_rate": 9.036543092270647e-07, + "loss": 0.7819, + "step": 37120 + }, + { + "epoch": 2.9474686935640717, + "grad_norm": 0.9892777800559998, + "learning_rate": 8.904236458416025e-07, + "loss": 0.7875, + "step": 37130 + }, + { + "epoch": 2.948262517612971, + "grad_norm": 0.8527591824531555, + "learning_rate": 8.771929824561404e-07, + "loss": 0.7543, + "step": 37140 + }, + { + "epoch": 2.949056341661871, + "grad_norm": 1.1402502059936523, + "learning_rate": 8.639623190706783e-07, + "loss": 0.8077, + "step": 37150 + }, + { + "epoch": 2.94985016571077, + "grad_norm": 0.9675154089927673, + "learning_rate": 8.507316556852161e-07, + "loss": 0.7796, + "step": 37160 + }, + { + "epoch": 2.9506439897596697, + "grad_norm": 0.8908880352973938, + "learning_rate": 8.375009922997539e-07, + "loss": 0.7795, + "step": 37170 + }, + { + "epoch": 2.9514378138085693, + "grad_norm": 0.9992043972015381, + "learning_rate": 8.242703289142918e-07, + "loss": 0.801, + "step": 37180 + }, + { + "epoch": 2.952231637857469, + "grad_norm": 0.9588622450828552, + "learning_rate": 8.110396655288297e-07, + "loss": 0.83, + "step": 37190 + }, + { + "epoch": 2.9530254619063685, + "grad_norm": 0.9657321572303772, + "learning_rate": 7.978090021433675e-07, + "loss": 0.8267, + "step": 37200 + }, + { + "epoch": 2.953819285955268, + "grad_norm": 0.8427868485450745, + "learning_rate": 7.845783387579052e-07, + "loss": 0.8107, + "step": 37210 + }, + { + "epoch": 2.9546131100041677, + "grad_norm": 1.1387544870376587, + "learning_rate": 7.713476753724432e-07, + "loss": 0.7857, + "step": 37220 + }, + { + "epoch": 2.955406934053067, + "grad_norm": 0.8458208441734314, + "learning_rate": 7.581170119869811e-07, + "loss": 0.815, + "step": 37230 + }, + { + "epoch": 2.956200758101967, + "grad_norm": 1.2061816453933716, + "learning_rate": 7.44886348601519e-07, + "loss": 0.7483, + "step": 37240 + }, + { + "epoch": 2.956994582150866, + "grad_norm": 0.93003910779953, + "learning_rate": 7.316556852160567e-07, + "loss": 0.7802, + "step": 37250 + }, + { + "epoch": 2.9577884061997657, + "grad_norm": 0.8982664346694946, + "learning_rate": 7.184250218305946e-07, + "loss": 0.8041, + "step": 37260 + }, + { + "epoch": 2.9585822302486653, + "grad_norm": 1.058793306350708, + "learning_rate": 7.051943584451326e-07, + "loss": 0.7814, + "step": 37270 + }, + { + "epoch": 2.959376054297565, + "grad_norm": 0.9171466827392578, + "learning_rate": 6.919636950596703e-07, + "loss": 0.7798, + "step": 37280 + }, + { + "epoch": 2.9601698783464645, + "grad_norm": 0.8719449043273926, + "learning_rate": 6.787330316742082e-07, + "loss": 0.8213, + "step": 37290 + }, + { + "epoch": 2.960963702395364, + "grad_norm": 0.8667718172073364, + "learning_rate": 6.655023682887459e-07, + "loss": 0.7089, + "step": 37300 + }, + { + "epoch": 2.9617575264442637, + "grad_norm": 0.8706735968589783, + "learning_rate": 6.522717049032839e-07, + "loss": 0.7605, + "step": 37310 + }, + { + "epoch": 2.9625513504931633, + "grad_norm": 0.9747303128242493, + "learning_rate": 6.390410415178218e-07, + "loss": 0.7329, + "step": 37320 + }, + { + "epoch": 2.963345174542063, + "grad_norm": 1.0735442638397217, + "learning_rate": 6.258103781323596e-07, + "loss": 0.784, + "step": 37330 + }, + { + "epoch": 2.964138998590962, + "grad_norm": 0.8519766330718994, + "learning_rate": 6.125797147468974e-07, + "loss": 0.7985, + "step": 37340 + }, + { + "epoch": 2.964932822639862, + "grad_norm": 1.0868384838104248, + "learning_rate": 5.993490513614353e-07, + "loss": 0.8058, + "step": 37350 + }, + { + "epoch": 2.9657266466887613, + "grad_norm": 1.1832605600357056, + "learning_rate": 5.861183879759732e-07, + "loss": 0.7284, + "step": 37360 + }, + { + "epoch": 2.966520470737661, + "grad_norm": 1.0526885986328125, + "learning_rate": 5.72887724590511e-07, + "loss": 0.8052, + "step": 37370 + }, + { + "epoch": 2.9673142947865605, + "grad_norm": 1.137607455253601, + "learning_rate": 5.596570612050488e-07, + "loss": 0.8004, + "step": 37380 + }, + { + "epoch": 2.96810811883546, + "grad_norm": 0.8867448568344116, + "learning_rate": 5.464263978195867e-07, + "loss": 0.7579, + "step": 37390 + }, + { + "epoch": 2.9689019428843597, + "grad_norm": 0.992607057094574, + "learning_rate": 5.331957344341246e-07, + "loss": 0.7234, + "step": 37400 + }, + { + "epoch": 2.9696957669332593, + "grad_norm": 0.9389944076538086, + "learning_rate": 5.199650710486624e-07, + "loss": 0.7204, + "step": 37410 + }, + { + "epoch": 2.970489590982159, + "grad_norm": 1.1689140796661377, + "learning_rate": 5.067344076632003e-07, + "loss": 0.7956, + "step": 37420 + }, + { + "epoch": 2.9712834150310585, + "grad_norm": 1.0063467025756836, + "learning_rate": 4.93503744277738e-07, + "loss": 0.8114, + "step": 37430 + }, + { + "epoch": 2.972077239079958, + "grad_norm": 0.9847369194030762, + "learning_rate": 4.815961472308222e-07, + "loss": 0.7854, + "step": 37440 + }, + { + "epoch": 2.9728710631288573, + "grad_norm": 0.8891132473945618, + "learning_rate": 4.6836548384536e-07, + "loss": 0.8131, + "step": 37450 + }, + { + "epoch": 2.973664887177757, + "grad_norm": 1.1381394863128662, + "learning_rate": 4.551348204598979e-07, + "loss": 0.7877, + "step": 37460 + }, + { + "epoch": 2.9744587112266565, + "grad_norm": 1.0094081163406372, + "learning_rate": 4.4190415707443577e-07, + "loss": 0.7812, + "step": 37470 + }, + { + "epoch": 2.975252535275556, + "grad_norm": 1.1680759191513062, + "learning_rate": 4.286734936889736e-07, + "loss": 0.7642, + "step": 37480 + }, + { + "epoch": 2.9760463593244557, + "grad_norm": 1.0051449537277222, + "learning_rate": 4.1544283030351145e-07, + "loss": 0.7755, + "step": 37490 + }, + { + "epoch": 2.9768401833733553, + "grad_norm": 1.1129390001296997, + "learning_rate": 4.0221216691804927e-07, + "loss": 0.7441, + "step": 37500 + }, + { + "epoch": 2.977634007422255, + "grad_norm": 0.8162450194358826, + "learning_rate": 3.8898150353258714e-07, + "loss": 0.7353, + "step": 37510 + }, + { + "epoch": 2.9784278314711545, + "grad_norm": 0.8343786001205444, + "learning_rate": 3.7575084014712495e-07, + "loss": 0.8653, + "step": 37520 + }, + { + "epoch": 2.979221655520054, + "grad_norm": 0.9422026872634888, + "learning_rate": 3.625201767616629e-07, + "loss": 0.7788, + "step": 37530 + }, + { + "epoch": 2.9800154795689533, + "grad_norm": 0.7513265013694763, + "learning_rate": 3.492895133762007e-07, + "loss": 0.7978, + "step": 37540 + }, + { + "epoch": 2.9808093036178533, + "grad_norm": 1.1932988166809082, + "learning_rate": 3.3605884999073856e-07, + "loss": 0.7766, + "step": 37550 + }, + { + "epoch": 2.9816031276667525, + "grad_norm": 0.8924500942230225, + "learning_rate": 3.228281866052764e-07, + "loss": 0.816, + "step": 37560 + }, + { + "epoch": 2.982396951715652, + "grad_norm": 1.2411748170852661, + "learning_rate": 3.095975232198143e-07, + "loss": 0.7699, + "step": 37570 + }, + { + "epoch": 2.9831907757645517, + "grad_norm": 0.9249628186225891, + "learning_rate": 2.963668598343521e-07, + "loss": 0.8282, + "step": 37580 + }, + { + "epoch": 2.9839845998134513, + "grad_norm": 0.9491875171661377, + "learning_rate": 2.8313619644889e-07, + "loss": 0.848, + "step": 37590 + }, + { + "epoch": 2.984778423862351, + "grad_norm": 1.0644991397857666, + "learning_rate": 2.699055330634278e-07, + "loss": 0.7872, + "step": 37600 + }, + { + "epoch": 2.9855722479112505, + "grad_norm": 1.1010690927505493, + "learning_rate": 2.5667486967796566e-07, + "loss": 0.7806, + "step": 37610 + }, + { + "epoch": 2.98636607196015, + "grad_norm": 0.8743943572044373, + "learning_rate": 2.4344420629250353e-07, + "loss": 0.7946, + "step": 37620 + }, + { + "epoch": 2.9871598960090497, + "grad_norm": 0.909243106842041, + "learning_rate": 2.3021354290704137e-07, + "loss": 0.8322, + "step": 37630 + }, + { + "epoch": 2.9879537200579493, + "grad_norm": 0.8433053493499756, + "learning_rate": 2.1698287952157921e-07, + "loss": 0.8142, + "step": 37640 + }, + { + "epoch": 2.9887475441068485, + "grad_norm": 0.8749028444290161, + "learning_rate": 2.0375221613611706e-07, + "loss": 0.7644, + "step": 37650 + }, + { + "epoch": 2.989541368155748, + "grad_norm": 1.0767362117767334, + "learning_rate": 1.9052155275065492e-07, + "loss": 0.7983, + "step": 37660 + }, + { + "epoch": 2.9903351922046477, + "grad_norm": 1.0285117626190186, + "learning_rate": 1.772908893651928e-07, + "loss": 0.7426, + "step": 37670 + }, + { + "epoch": 2.9911290162535473, + "grad_norm": 0.986137330532074, + "learning_rate": 1.6406022597973063e-07, + "loss": 0.7596, + "step": 37680 + }, + { + "epoch": 2.991922840302447, + "grad_norm": 0.9621859192848206, + "learning_rate": 1.508295625942685e-07, + "loss": 0.7735, + "step": 37690 + }, + { + "epoch": 2.9927166643513465, + "grad_norm": 0.9703050851821899, + "learning_rate": 1.3759889920880634e-07, + "loss": 0.7648, + "step": 37700 + }, + { + "epoch": 2.993510488400246, + "grad_norm": 0.7976683378219604, + "learning_rate": 1.2436823582334419e-07, + "loss": 0.7838, + "step": 37710 + }, + { + "epoch": 2.9943043124491457, + "grad_norm": 0.8955792188644409, + "learning_rate": 1.1113757243788204e-07, + "loss": 0.7638, + "step": 37720 + }, + { + "epoch": 2.9950981364980453, + "grad_norm": 0.8139486312866211, + "learning_rate": 9.79069090524199e-08, + "loss": 0.7634, + "step": 37730 + }, + { + "epoch": 2.9958919605469445, + "grad_norm": 1.1136054992675781, + "learning_rate": 8.467624566695774e-08, + "loss": 0.7807, + "step": 37740 + }, + { + "epoch": 2.9966857845958446, + "grad_norm": 0.8526073098182678, + "learning_rate": 7.144558228149559e-08, + "loss": 0.742, + "step": 37750 + }, + { + "epoch": 2.9974796086447437, + "grad_norm": 0.9740519523620605, + "learning_rate": 5.821491889603345e-08, + "loss": 0.7485, + "step": 37760 + }, + { + "epoch": 2.9982734326936433, + "grad_norm": 0.8623798489570618, + "learning_rate": 4.4984255510571303e-08, + "loss": 0.8157, + "step": 37770 + }, + { + "epoch": 2.999067256742543, + "grad_norm": 1.0553157329559326, + "learning_rate": 3.175359212510916e-08, + "loss": 0.7943, + "step": 37780 + }, + { + "epoch": 2.9998610807914425, + "grad_norm": 0.9887831807136536, + "learning_rate": 1.8522928739647007e-08, + "loss": 0.7406, + "step": 37790 + } + ], + "logging_steps": 10, + "max_steps": 37791, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 6.60495148153779e+18, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}