{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.998933522929257, "eval_steps": 500, "global_step": 6327, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004739898092191018, "grad_norm": 0.36863938554596193, "learning_rate": 9.999938362758687e-05, "loss": 2.2013, "step": 10 }, { "epoch": 0.009479796184382036, "grad_norm": 0.500081400365978, "learning_rate": 9.999753452554404e-05, "loss": 2.0963, "step": 20 }, { "epoch": 0.014219694276573054, "grad_norm": 0.4147328978072978, "learning_rate": 9.999445273946093e-05, "loss": 2.1148, "step": 30 }, { "epoch": 0.018959592368764072, "grad_norm": 0.35930434339916095, "learning_rate": 9.999013834531869e-05, "loss": 2.1787, "step": 40 }, { "epoch": 0.02369949046095509, "grad_norm": 0.5423960911916873, "learning_rate": 9.998459144948825e-05, "loss": 2.1055, "step": 50 }, { "epoch": 0.028439388553146108, "grad_norm": 0.40880961489218204, "learning_rate": 9.997781218872771e-05, "loss": 2.1723, "step": 60 }, { "epoch": 0.033179286645337126, "grad_norm": 0.43317175193770346, "learning_rate": 9.99698007301791e-05, "loss": 2.0316, "step": 70 }, { "epoch": 0.037919184737528144, "grad_norm": 0.37892996565691084, "learning_rate": 9.996055727136406e-05, "loss": 2.0171, "step": 80 }, { "epoch": 0.04265908282971916, "grad_norm": 0.43747110352943336, "learning_rate": 9.995008204017915e-05, "loss": 2.0887, "step": 90 }, { "epoch": 0.04739898092191018, "grad_norm": 0.39459537932523525, "learning_rate": 9.993837529489007e-05, "loss": 2.03, "step": 100 }, { "epoch": 0.0521388790141012, "grad_norm": 0.3839963642801344, "learning_rate": 9.992543732412544e-05, "loss": 1.9306, "step": 110 }, { "epoch": 0.056878777106292217, "grad_norm": 0.40520894461995377, "learning_rate": 9.99112684468696e-05, "loss": 2.0425, "step": 120 }, { "epoch": 0.061618675198483235, "grad_norm": 0.390483069303289, "learning_rate": 9.989586901245472e-05, "loss": 2.087, "step": 130 }, { "epoch": 0.06635857329067425, "grad_norm": 0.4180445176279502, "learning_rate": 9.987923940055228e-05, "loss": 2.0679, "step": 140 }, { "epoch": 0.07109847138286526, "grad_norm": 0.49880430744694115, "learning_rate": 9.986138002116364e-05, "loss": 2.0628, "step": 150 }, { "epoch": 0.07583836947505629, "grad_norm": 0.4427166571522091, "learning_rate": 9.984229131460996e-05, "loss": 2.0681, "step": 160 }, { "epoch": 0.0805782675672473, "grad_norm": 0.38471776319499607, "learning_rate": 9.982197375152129e-05, "loss": 2.0019, "step": 170 }, { "epoch": 0.08531816565943832, "grad_norm": 0.4628072900469101, "learning_rate": 9.980042783282509e-05, "loss": 1.9909, "step": 180 }, { "epoch": 0.09005806375162934, "grad_norm": 0.3939907069527393, "learning_rate": 9.977765408973374e-05, "loss": 2.0713, "step": 190 }, { "epoch": 0.09479796184382036, "grad_norm": 0.4184843314019155, "learning_rate": 9.97536530837315e-05, "loss": 1.9729, "step": 200 }, { "epoch": 0.09953785993601137, "grad_norm": 0.6336861212047761, "learning_rate": 9.97284254065607e-05, "loss": 2.0278, "step": 210 }, { "epoch": 0.1042777580282024, "grad_norm": 0.39737439720337403, "learning_rate": 9.970197168020713e-05, "loss": 2.0603, "step": 220 }, { "epoch": 0.10901765612039341, "grad_norm": 0.4161581082817388, "learning_rate": 9.967429255688468e-05, "loss": 2.0308, "step": 230 }, { "epoch": 0.11375755421258443, "grad_norm": 0.4122381540422074, "learning_rate": 9.964538871901923e-05, "loss": 2.1011, "step": 240 }, { "epoch": 0.11849745230477544, "grad_norm": 0.40792411841005016, "learning_rate": 9.961526087923193e-05, "loss": 2.0535, "step": 250 }, { "epoch": 0.12323735039696647, "grad_norm": 0.4298298302428991, "learning_rate": 9.958390978032157e-05, "loss": 1.9882, "step": 260 }, { "epoch": 0.12797724848915748, "grad_norm": 0.3932772338211237, "learning_rate": 9.955133619524623e-05, "loss": 2.0703, "step": 270 }, { "epoch": 0.1327171465813485, "grad_norm": 0.4304879913642714, "learning_rate": 9.951754092710429e-05, "loss": 2.0661, "step": 280 }, { "epoch": 0.13745704467353953, "grad_norm": 0.3933942358750948, "learning_rate": 9.948252480911458e-05, "loss": 1.9941, "step": 290 }, { "epoch": 0.14219694276573053, "grad_norm": 0.3876898041012675, "learning_rate": 9.944628870459587e-05, "loss": 2.001, "step": 300 }, { "epoch": 0.14693684085792155, "grad_norm": 0.39971149840828696, "learning_rate": 9.940883350694556e-05, "loss": 1.9889, "step": 310 }, { "epoch": 0.15167673895011258, "grad_norm": 0.4322868983437022, "learning_rate": 9.93701601396177e-05, "loss": 2.019, "step": 320 }, { "epoch": 0.1564166370423036, "grad_norm": 0.40679391432223605, "learning_rate": 9.933026955610014e-05, "loss": 2.0402, "step": 330 }, { "epoch": 0.1611565351344946, "grad_norm": 0.40265148647862, "learning_rate": 9.928916273989108e-05, "loss": 1.9488, "step": 340 }, { "epoch": 0.16589643322668562, "grad_norm": 0.4119893126018702, "learning_rate": 9.924684070447483e-05, "loss": 2.0143, "step": 350 }, { "epoch": 0.17063633131887665, "grad_norm": 0.41446853317804655, "learning_rate": 9.92033044932968e-05, "loss": 1.9393, "step": 360 }, { "epoch": 0.17537622941106767, "grad_norm": 0.4775440242382454, "learning_rate": 9.915855517973776e-05, "loss": 1.9899, "step": 370 }, { "epoch": 0.18011612750325867, "grad_norm": 0.41303403265485017, "learning_rate": 9.91125938670874e-05, "loss": 2.0431, "step": 380 }, { "epoch": 0.1848560255954497, "grad_norm": 0.381415505593885, "learning_rate": 9.906542168851715e-05, "loss": 1.9778, "step": 390 }, { "epoch": 0.18959592368764072, "grad_norm": 0.45202098843075295, "learning_rate": 9.901703980705219e-05, "loss": 2.0098, "step": 400 }, { "epoch": 0.19433582177983175, "grad_norm": 0.38808197740496003, "learning_rate": 9.896744941554279e-05, "loss": 1.9467, "step": 410 }, { "epoch": 0.19907571987202274, "grad_norm": 0.40860216072850924, "learning_rate": 9.891665173663492e-05, "loss": 2.0267, "step": 420 }, { "epoch": 0.20381561796421377, "grad_norm": 0.4068044305771888, "learning_rate": 9.886464802274009e-05, "loss": 2.0872, "step": 430 }, { "epoch": 0.2085555160564048, "grad_norm": 0.43039544158069454, "learning_rate": 9.88114395560045e-05, "loss": 2.0094, "step": 440 }, { "epoch": 0.21329541414859582, "grad_norm": 0.37668435282131046, "learning_rate": 9.875702764827737e-05, "loss": 2.0032, "step": 450 }, { "epoch": 0.21803531224078682, "grad_norm": 0.4289799607032317, "learning_rate": 9.87014136410787e-05, "loss": 1.9535, "step": 460 }, { "epoch": 0.22277521033297784, "grad_norm": 0.416501457655663, "learning_rate": 9.864459890556604e-05, "loss": 2.0246, "step": 470 }, { "epoch": 0.22751510842516887, "grad_norm": 0.42709577377722036, "learning_rate": 9.858658484250082e-05, "loss": 1.9675, "step": 480 }, { "epoch": 0.23225500651735986, "grad_norm": 0.38491345570315816, "learning_rate": 9.852737288221378e-05, "loss": 1.9768, "step": 490 }, { "epoch": 0.2369949046095509, "grad_norm": 0.4331220698731146, "learning_rate": 9.846696448456967e-05, "loss": 1.96, "step": 500 }, { "epoch": 0.2417348027017419, "grad_norm": 0.5157356350680703, "learning_rate": 9.840536113893129e-05, "loss": 2.0168, "step": 510 }, { "epoch": 0.24647470079393294, "grad_norm": 0.42673885807943607, "learning_rate": 9.834256436412272e-05, "loss": 1.9192, "step": 520 }, { "epoch": 0.25121459888612396, "grad_norm": 0.399056341637914, "learning_rate": 9.827857570839198e-05, "loss": 2.009, "step": 530 }, { "epoch": 0.25595449697831496, "grad_norm": 0.38514488410609315, "learning_rate": 9.821339674937274e-05, "loss": 2.0237, "step": 540 }, { "epoch": 0.26069439507050596, "grad_norm": 0.43535566879213633, "learning_rate": 9.814702909404547e-05, "loss": 1.9746, "step": 550 }, { "epoch": 0.265434293162697, "grad_norm": 0.4277848981360601, "learning_rate": 9.807947437869788e-05, "loss": 2.0008, "step": 560 }, { "epoch": 0.270174191254888, "grad_norm": 0.42806115487352164, "learning_rate": 9.801073426888447e-05, "loss": 2.0819, "step": 570 }, { "epoch": 0.27491408934707906, "grad_norm": 0.36287005859609833, "learning_rate": 9.794081045938554e-05, "loss": 2.0256, "step": 580 }, { "epoch": 0.27965398743927006, "grad_norm": 0.467970576527151, "learning_rate": 9.786970467416538e-05, "loss": 2.0221, "step": 590 }, { "epoch": 0.28439388553146105, "grad_norm": 0.37993477630266503, "learning_rate": 9.779741866632977e-05, "loss": 1.9589, "step": 600 }, { "epoch": 0.2891337836236521, "grad_norm": 0.44198107142469956, "learning_rate": 9.772395421808274e-05, "loss": 2.0035, "step": 610 }, { "epoch": 0.2938736817158431, "grad_norm": 0.44573447679188816, "learning_rate": 9.764931314068267e-05, "loss": 1.9909, "step": 620 }, { "epoch": 0.2986135798080341, "grad_norm": 0.4731340699659092, "learning_rate": 9.757349727439759e-05, "loss": 2.0103, "step": 630 }, { "epoch": 0.30335347790022515, "grad_norm": 0.3963283837850387, "learning_rate": 9.749650848845984e-05, "loss": 2.0639, "step": 640 }, { "epoch": 0.30809337599241615, "grad_norm": 0.3884422717238912, "learning_rate": 9.741834868101998e-05, "loss": 2.0342, "step": 650 }, { "epoch": 0.3128332740846072, "grad_norm": 0.42096628799860736, "learning_rate": 9.733901977909997e-05, "loss": 2.0037, "step": 660 }, { "epoch": 0.3175731721767982, "grad_norm": 0.3922372868315195, "learning_rate": 9.725852373854568e-05, "loss": 2.0327, "step": 670 }, { "epoch": 0.3223130702689892, "grad_norm": 0.37724258160489493, "learning_rate": 9.717686254397866e-05, "loss": 1.9996, "step": 680 }, { "epoch": 0.32705296836118025, "grad_norm": 0.36849429342184464, "learning_rate": 9.70940382087472e-05, "loss": 1.9789, "step": 690 }, { "epoch": 0.33179286645337125, "grad_norm": 0.38001698944458373, "learning_rate": 9.701005277487673e-05, "loss": 1.8886, "step": 700 }, { "epoch": 0.33653276454556225, "grad_norm": 0.4434394537121414, "learning_rate": 9.692490831301944e-05, "loss": 2.0773, "step": 710 }, { "epoch": 0.3412726626377533, "grad_norm": 0.44409242659624243, "learning_rate": 9.683860692240321e-05, "loss": 1.9944, "step": 720 }, { "epoch": 0.3460125607299443, "grad_norm": 0.3706038723114169, "learning_rate": 9.675115073077989e-05, "loss": 1.9399, "step": 730 }, { "epoch": 0.35075245882213535, "grad_norm": 0.3775340444246396, "learning_rate": 9.666254189437286e-05, "loss": 2.0434, "step": 740 }, { "epoch": 0.35549235691432635, "grad_norm": 0.39740898678838216, "learning_rate": 9.657278259782378e-05, "loss": 2.0483, "step": 750 }, { "epoch": 0.36023225500651734, "grad_norm": 0.3856650140837026, "learning_rate": 9.648187505413886e-05, "loss": 1.9621, "step": 760 }, { "epoch": 0.3649721530987084, "grad_norm": 0.49084336306431187, "learning_rate": 9.638982150463415e-05, "loss": 1.9878, "step": 770 }, { "epoch": 0.3697120511908994, "grad_norm": 0.41318948101107866, "learning_rate": 9.629662421888039e-05, "loss": 2.0805, "step": 780 }, { "epoch": 0.3744519492830904, "grad_norm": 0.402590356367594, "learning_rate": 9.620228549464703e-05, "loss": 2.0258, "step": 790 }, { "epoch": 0.37919184737528144, "grad_norm": 0.4461694641117838, "learning_rate": 9.610680765784556e-05, "loss": 1.9692, "step": 800 }, { "epoch": 0.38393174546747244, "grad_norm": 0.41581795351534184, "learning_rate": 9.601019306247215e-05, "loss": 2.022, "step": 810 }, { "epoch": 0.3886716435596635, "grad_norm": 0.4182347418587252, "learning_rate": 9.591244409054965e-05, "loss": 1.9989, "step": 820 }, { "epoch": 0.3934115416518545, "grad_norm": 0.36463111311757684, "learning_rate": 9.581356315206885e-05, "loss": 2.0483, "step": 830 }, { "epoch": 0.3981514397440455, "grad_norm": 0.4636476781338481, "learning_rate": 9.571355268492907e-05, "loss": 1.9491, "step": 840 }, { "epoch": 0.40289133783623654, "grad_norm": 0.43027600259738763, "learning_rate": 9.561241515487802e-05, "loss": 1.9423, "step": 850 }, { "epoch": 0.40763123592842754, "grad_norm": 0.43322329785996827, "learning_rate": 9.551015305545104e-05, "loss": 1.9349, "step": 860 }, { "epoch": 0.41237113402061853, "grad_norm": 0.3900423005352424, "learning_rate": 9.540676890790962e-05, "loss": 1.9571, "step": 870 }, { "epoch": 0.4171110321128096, "grad_norm": 0.3736027589992883, "learning_rate": 9.53022652611792e-05, "loss": 2.033, "step": 880 }, { "epoch": 0.4218509302050006, "grad_norm": 0.4412678924097936, "learning_rate": 9.519664469178638e-05, "loss": 1.9928, "step": 890 }, { "epoch": 0.42659082829719164, "grad_norm": 0.36064586995797043, "learning_rate": 9.508990980379537e-05, "loss": 2.0181, "step": 900 }, { "epoch": 0.43133072638938263, "grad_norm": 0.36982453028008294, "learning_rate": 9.498206322874381e-05, "loss": 2.0118, "step": 910 }, { "epoch": 0.43607062448157363, "grad_norm": 0.4936789348648113, "learning_rate": 9.487310762557784e-05, "loss": 2.0388, "step": 920 }, { "epoch": 0.4408105225737647, "grad_norm": 0.4192120475618224, "learning_rate": 9.476304568058657e-05, "loss": 2.0001, "step": 930 }, { "epoch": 0.4455504206659557, "grad_norm": 0.4212248975591549, "learning_rate": 9.465188010733586e-05, "loss": 2.0464, "step": 940 }, { "epoch": 0.4502903187581467, "grad_norm": 0.4111853146435081, "learning_rate": 9.453961364660143e-05, "loss": 2.0118, "step": 950 }, { "epoch": 0.45503021685033773, "grad_norm": 0.3911083150496816, "learning_rate": 9.442624906630124e-05, "loss": 1.9256, "step": 960 }, { "epoch": 0.45977011494252873, "grad_norm": 0.4275198886604283, "learning_rate": 9.431178916142731e-05, "loss": 2.0142, "step": 970 }, { "epoch": 0.4645100130347197, "grad_norm": 0.41213645663674664, "learning_rate": 9.419623675397672e-05, "loss": 1.9863, "step": 980 }, { "epoch": 0.4692499111269108, "grad_norm": 0.39744532831875506, "learning_rate": 9.407959469288214e-05, "loss": 1.963, "step": 990 }, { "epoch": 0.4739898092191018, "grad_norm": 0.40358506493166846, "learning_rate": 9.396186585394153e-05, "loss": 1.9724, "step": 1000 }, { "epoch": 0.47872970731129283, "grad_norm": 0.3715075397009002, "learning_rate": 9.384305313974719e-05, "loss": 1.9564, "step": 1010 }, { "epoch": 0.4834696054034838, "grad_norm": 0.41249417731334614, "learning_rate": 9.372315947961434e-05, "loss": 2.0089, "step": 1020 }, { "epoch": 0.4882095034956748, "grad_norm": 0.4477075629260475, "learning_rate": 9.360218782950873e-05, "loss": 2.0249, "step": 1030 }, { "epoch": 0.4929494015878659, "grad_norm": 0.41335031918044873, "learning_rate": 9.34801411719739e-05, "loss": 2.0439, "step": 1040 }, { "epoch": 0.4976892996800569, "grad_norm": 0.4023689824634566, "learning_rate": 9.335702251605756e-05, "loss": 2.0278, "step": 1050 }, { "epoch": 0.5024291977722479, "grad_norm": 0.37476123227339486, "learning_rate": 9.32328348972374e-05, "loss": 2.0854, "step": 1060 }, { "epoch": 0.5071690958644389, "grad_norm": 0.3680109272331818, "learning_rate": 9.310758137734634e-05, "loss": 2.0505, "step": 1070 }, { "epoch": 0.5119089939566299, "grad_norm": 0.47590335433852127, "learning_rate": 9.298126504449697e-05, "loss": 1.9342, "step": 1080 }, { "epoch": 0.5166488920488209, "grad_norm": 0.443747158773761, "learning_rate": 9.285388901300537e-05, "loss": 2.0338, "step": 1090 }, { "epoch": 0.5213887901410119, "grad_norm": 0.4300619230217585, "learning_rate": 9.272545642331443e-05, "loss": 1.9431, "step": 1100 }, { "epoch": 0.526128688233203, "grad_norm": 0.4068927208227842, "learning_rate": 9.259597044191636e-05, "loss": 1.9639, "step": 1110 }, { "epoch": 0.530868586325394, "grad_norm": 0.3904780080331756, "learning_rate": 9.246543426127463e-05, "loss": 2.044, "step": 1120 }, { "epoch": 0.535608484417585, "grad_norm": 0.4074988084895911, "learning_rate": 9.233385109974528e-05, "loss": 1.9209, "step": 1130 }, { "epoch": 0.540348382509776, "grad_norm": 0.48971289458578504, "learning_rate": 9.220122420149753e-05, "loss": 1.9405, "step": 1140 }, { "epoch": 0.545088280601967, "grad_norm": 0.4560990819156225, "learning_rate": 9.206755683643383e-05, "loss": 1.9754, "step": 1150 }, { "epoch": 0.5498281786941581, "grad_norm": 0.4953771996336736, "learning_rate": 9.193285230010923e-05, "loss": 1.9832, "step": 1160 }, { "epoch": 0.5545680767863491, "grad_norm": 0.452270837264993, "learning_rate": 9.179711391365016e-05, "loss": 2.0267, "step": 1170 }, { "epoch": 0.5593079748785401, "grad_norm": 0.38839940667413064, "learning_rate": 9.166034502367246e-05, "loss": 2.0303, "step": 1180 }, { "epoch": 0.5640478729707311, "grad_norm": 0.4434400621892702, "learning_rate": 9.152254900219899e-05, "loss": 2.019, "step": 1190 }, { "epoch": 0.5687877710629221, "grad_norm": 0.4265655972195879, "learning_rate": 9.138372924657638e-05, "loss": 1.9578, "step": 1200 }, { "epoch": 0.5735276691551132, "grad_norm": 0.37712073893593084, "learning_rate": 9.124388917939135e-05, "loss": 1.9002, "step": 1210 }, { "epoch": 0.5782675672473042, "grad_norm": 0.3967821230664083, "learning_rate": 9.110303224838628e-05, "loss": 1.9982, "step": 1220 }, { "epoch": 0.5830074653394952, "grad_norm": 0.4225910574667248, "learning_rate": 9.096116192637424e-05, "loss": 1.9999, "step": 1230 }, { "epoch": 0.5877473634316862, "grad_norm": 0.46005143244561764, "learning_rate": 9.081828171115334e-05, "loss": 1.9269, "step": 1240 }, { "epoch": 0.5924872615238772, "grad_norm": 0.41650738683050376, "learning_rate": 9.067439512542048e-05, "loss": 2.0138, "step": 1250 }, { "epoch": 0.5972271596160682, "grad_norm": 0.4595664788322495, "learning_rate": 9.052950571668457e-05, "loss": 1.8902, "step": 1260 }, { "epoch": 0.6019670577082593, "grad_norm": 0.47181766838174233, "learning_rate": 9.038361705717897e-05, "loss": 2.0354, "step": 1270 }, { "epoch": 0.6067069558004503, "grad_norm": 0.4016620461236779, "learning_rate": 9.023673274377349e-05, "loss": 2.0428, "step": 1280 }, { "epoch": 0.6114468538926413, "grad_norm": 0.44582424551905314, "learning_rate": 9.00888563978857e-05, "loss": 1.9205, "step": 1290 }, { "epoch": 0.6161867519848323, "grad_norm": 0.4731092970060822, "learning_rate": 8.993999166539155e-05, "loss": 1.9468, "step": 1300 }, { "epoch": 0.6209266500770233, "grad_norm": 0.41403788063445784, "learning_rate": 8.979014221653569e-05, "loss": 1.967, "step": 1310 }, { "epoch": 0.6256665481692144, "grad_norm": 0.3824681634104647, "learning_rate": 8.963931174584072e-05, "loss": 1.9764, "step": 1320 }, { "epoch": 0.6304064462614054, "grad_norm": 0.3979138111413701, "learning_rate": 8.94875039720163e-05, "loss": 2.0262, "step": 1330 }, { "epoch": 0.6351463443535964, "grad_norm": 0.41027150705022153, "learning_rate": 8.93347226378674e-05, "loss": 1.9379, "step": 1340 }, { "epoch": 0.6398862424457874, "grad_norm": 0.46333301444068553, "learning_rate": 8.9180971510202e-05, "loss": 1.9551, "step": 1350 }, { "epoch": 0.6446261405379784, "grad_norm": 0.39959859369206574, "learning_rate": 8.902625437973823e-05, "loss": 1.9199, "step": 1360 }, { "epoch": 0.6493660386301695, "grad_norm": 0.42731835258341894, "learning_rate": 8.887057506101096e-05, "loss": 2.0178, "step": 1370 }, { "epoch": 0.6541059367223605, "grad_norm": 0.43891265274307517, "learning_rate": 8.871393739227764e-05, "loss": 1.9369, "step": 1380 }, { "epoch": 0.6588458348145515, "grad_norm": 0.4314210574368562, "learning_rate": 8.855634523542384e-05, "loss": 2.0049, "step": 1390 }, { "epoch": 0.6635857329067425, "grad_norm": 0.44613138847149775, "learning_rate": 8.839780247586785e-05, "loss": 1.9509, "step": 1400 }, { "epoch": 0.6683256309989335, "grad_norm": 0.4379460820834945, "learning_rate": 8.823831302246498e-05, "loss": 1.9541, "step": 1410 }, { "epoch": 0.6730655290911245, "grad_norm": 0.3682639471382051, "learning_rate": 8.807788080741124e-05, "loss": 2.0064, "step": 1420 }, { "epoch": 0.6778054271833156, "grad_norm": 0.3981445155765943, "learning_rate": 8.791650978614627e-05, "loss": 1.9151, "step": 1430 }, { "epoch": 0.6825453252755066, "grad_norm": 0.3868845773205047, "learning_rate": 8.77542039372559e-05, "loss": 2.0033, "step": 1440 }, { "epoch": 0.6872852233676976, "grad_norm": 0.4065050795968265, "learning_rate": 8.759096726237406e-05, "loss": 1.9333, "step": 1450 }, { "epoch": 0.6920251214598886, "grad_norm": 0.4019451177579478, "learning_rate": 8.742680378608405e-05, "loss": 1.9738, "step": 1460 }, { "epoch": 0.6967650195520796, "grad_norm": 0.40929290402886576, "learning_rate": 8.726171755581943e-05, "loss": 1.9054, "step": 1470 }, { "epoch": 0.7015049176442707, "grad_norm": 0.4521322208310143, "learning_rate": 8.709571264176409e-05, "loss": 2.038, "step": 1480 }, { "epoch": 0.7062448157364617, "grad_norm": 0.4152045328204035, "learning_rate": 8.692879313675201e-05, "loss": 2.0632, "step": 1490 }, { "epoch": 0.7109847138286527, "grad_norm": 0.4153887781497306, "learning_rate": 8.676096315616633e-05, "loss": 1.9658, "step": 1500 }, { "epoch": 0.7157246119208437, "grad_norm": 0.4421939758182222, "learning_rate": 8.659222683783785e-05, "loss": 1.9318, "step": 1510 }, { "epoch": 0.7204645100130347, "grad_norm": 0.40964882006156955, "learning_rate": 8.642258834194306e-05, "loss": 1.9843, "step": 1520 }, { "epoch": 0.7252044081052257, "grad_norm": 0.4083908197791484, "learning_rate": 8.625205185090148e-05, "loss": 1.9828, "step": 1530 }, { "epoch": 0.7299443061974168, "grad_norm": 0.39713303306109243, "learning_rate": 8.608062156927267e-05, "loss": 1.9957, "step": 1540 }, { "epoch": 0.7346842042896078, "grad_norm": 0.3984748196137378, "learning_rate": 8.59083017236525e-05, "loss": 1.9756, "step": 1550 }, { "epoch": 0.7394241023817988, "grad_norm": 0.3801131175331665, "learning_rate": 8.57350965625689e-05, "loss": 2.0876, "step": 1560 }, { "epoch": 0.7441640004739898, "grad_norm": 0.40526485533564677, "learning_rate": 8.556101035637723e-05, "loss": 1.9273, "step": 1570 }, { "epoch": 0.7489038985661808, "grad_norm": 0.43256807999674307, "learning_rate": 8.538604739715487e-05, "loss": 1.9965, "step": 1580 }, { "epoch": 0.7536437966583719, "grad_norm": 0.4089571388848955, "learning_rate": 8.521021199859547e-05, "loss": 1.9838, "step": 1590 }, { "epoch": 0.7583836947505629, "grad_norm": 0.43989226476544846, "learning_rate": 8.503350849590261e-05, "loss": 2.0101, "step": 1600 }, { "epoch": 0.7631235928427539, "grad_norm": 0.4312349465343795, "learning_rate": 8.485594124568286e-05, "loss": 2.0024, "step": 1610 }, { "epoch": 0.7678634909349449, "grad_norm": 0.42870468778423404, "learning_rate": 8.467751462583837e-05, "loss": 1.9171, "step": 1620 }, { "epoch": 0.7726033890271359, "grad_norm": 0.37297491856173187, "learning_rate": 8.449823303545902e-05, "loss": 1.9234, "step": 1630 }, { "epoch": 0.777343287119327, "grad_norm": 0.43903627896277525, "learning_rate": 8.431810089471386e-05, "loss": 2.0138, "step": 1640 }, { "epoch": 0.782083185211518, "grad_norm": 0.4356441070614573, "learning_rate": 8.413712264474218e-05, "loss": 1.9822, "step": 1650 }, { "epoch": 0.786823083303709, "grad_norm": 0.42844869008890196, "learning_rate": 8.395530274754401e-05, "loss": 1.9615, "step": 1660 }, { "epoch": 0.7915629813959, "grad_norm": 0.442280918540681, "learning_rate": 8.377264568587012e-05, "loss": 1.9835, "step": 1670 }, { "epoch": 0.796302879488091, "grad_norm": 0.42858220049882395, "learning_rate": 8.358915596311143e-05, "loss": 1.9043, "step": 1680 }, { "epoch": 0.801042777580282, "grad_norm": 0.388683268775689, "learning_rate": 8.340483810318809e-05, "loss": 2.0451, "step": 1690 }, { "epoch": 0.8057826756724731, "grad_norm": 0.4116698984896444, "learning_rate": 8.321969665043785e-05, "loss": 1.9792, "step": 1700 }, { "epoch": 0.8105225737646641, "grad_norm": 0.40384036708963345, "learning_rate": 8.303373616950408e-05, "loss": 1.8407, "step": 1710 }, { "epoch": 0.8152624718568551, "grad_norm": 0.4680015183031998, "learning_rate": 8.28469612452232e-05, "loss": 1.9616, "step": 1720 }, { "epoch": 0.8200023699490461, "grad_norm": 0.43443236620799985, "learning_rate": 8.265937648251162e-05, "loss": 1.9879, "step": 1730 }, { "epoch": 0.8247422680412371, "grad_norm": 0.4892981794701289, "learning_rate": 8.247098650625229e-05, "loss": 1.9988, "step": 1740 }, { "epoch": 0.8294821661334282, "grad_norm": 0.41120558715230104, "learning_rate": 8.228179596118055e-05, "loss": 2.0057, "step": 1750 }, { "epoch": 0.8342220642256192, "grad_norm": 0.3856884225256909, "learning_rate": 8.209180951176972e-05, "loss": 2.0345, "step": 1760 }, { "epoch": 0.8389619623178102, "grad_norm": 0.43262267182183567, "learning_rate": 8.190103184211606e-05, "loss": 2.0506, "step": 1770 }, { "epoch": 0.8437018604100012, "grad_norm": 0.46227543956491046, "learning_rate": 8.170946765582327e-05, "loss": 1.9537, "step": 1780 }, { "epoch": 0.8484417585021922, "grad_norm": 0.41122944892391, "learning_rate": 8.151712167588654e-05, "loss": 1.9481, "step": 1790 }, { "epoch": 0.8531816565943833, "grad_norm": 0.4762971181475547, "learning_rate": 8.13239986445761e-05, "loss": 1.969, "step": 1800 }, { "epoch": 0.8579215546865743, "grad_norm": 0.41348450657088276, "learning_rate": 8.113010332332032e-05, "loss": 2.0127, "step": 1810 }, { "epoch": 0.8626614527787653, "grad_norm": 0.41355376759860496, "learning_rate": 8.093544049258826e-05, "loss": 1.9378, "step": 1820 }, { "epoch": 0.8674013508709563, "grad_norm": 0.4739386141603482, "learning_rate": 8.074001495177187e-05, "loss": 1.9548, "step": 1830 }, { "epoch": 0.8721412489631473, "grad_norm": 0.4067937473126016, "learning_rate": 8.054383151906766e-05, "loss": 1.9588, "step": 1840 }, { "epoch": 0.8768811470553383, "grad_norm": 0.4603727127637402, "learning_rate": 8.034689503135783e-05, "loss": 1.9616, "step": 1850 }, { "epoch": 0.8816210451475294, "grad_norm": 0.404919540874673, "learning_rate": 8.014921034409115e-05, "loss": 1.9476, "step": 1860 }, { "epoch": 0.8863609432397204, "grad_norm": 0.39850400899429533, "learning_rate": 7.99507823311631e-05, "loss": 1.9603, "step": 1870 }, { "epoch": 0.8911008413319114, "grad_norm": 0.48693274229874695, "learning_rate": 7.97516158847958e-05, "loss": 2.0121, "step": 1880 }, { "epoch": 0.8958407394241024, "grad_norm": 0.45401122715232545, "learning_rate": 7.955171591541739e-05, "loss": 1.8593, "step": 1890 }, { "epoch": 0.9005806375162934, "grad_norm": 0.38605278944495364, "learning_rate": 7.935108735154094e-05, "loss": 1.9199, "step": 1900 }, { "epoch": 0.9053205356084845, "grad_norm": 0.4453838492498413, "learning_rate": 7.914973513964291e-05, "loss": 1.9354, "step": 1910 }, { "epoch": 0.9100604337006755, "grad_norm": 0.4123431078009058, "learning_rate": 7.894766424404126e-05, "loss": 1.9807, "step": 1920 }, { "epoch": 0.9148003317928665, "grad_norm": 0.43369573713775106, "learning_rate": 7.874487964677301e-05, "loss": 1.9707, "step": 1930 }, { "epoch": 0.9195402298850575, "grad_norm": 0.3949770503185179, "learning_rate": 7.854138634747145e-05, "loss": 1.9742, "step": 1940 }, { "epoch": 0.9242801279772485, "grad_norm": 0.4224215984268503, "learning_rate": 7.833718936324277e-05, "loss": 1.9465, "step": 1950 }, { "epoch": 0.9290200260694395, "grad_norm": 0.5228997588486322, "learning_rate": 7.813229372854251e-05, "loss": 1.9454, "step": 1960 }, { "epoch": 0.9337599241616306, "grad_norm": 0.42165180512522465, "learning_rate": 7.792670449505135e-05, "loss": 1.9175, "step": 1970 }, { "epoch": 0.9384998222538216, "grad_norm": 0.40378336800384856, "learning_rate": 7.772042673155055e-05, "loss": 1.9237, "step": 1980 }, { "epoch": 0.9432397203460126, "grad_norm": 0.45740238886085255, "learning_rate": 7.751346552379706e-05, "loss": 1.9752, "step": 1990 }, { "epoch": 0.9479796184382036, "grad_norm": 0.39149703066060726, "learning_rate": 7.730582597439799e-05, "loss": 1.98, "step": 2000 }, { "epoch": 0.9527195165303946, "grad_norm": 0.4198989958604622, "learning_rate": 7.709751320268499e-05, "loss": 1.9937, "step": 2010 }, { "epoch": 0.9574594146225857, "grad_norm": 0.45036655944797305, "learning_rate": 7.688853234458786e-05, "loss": 1.9439, "step": 2020 }, { "epoch": 0.9621993127147767, "grad_norm": 0.47886989965002774, "learning_rate": 7.667888855250806e-05, "loss": 1.8984, "step": 2030 }, { "epoch": 0.9669392108069677, "grad_norm": 0.4485436591345206, "learning_rate": 7.646858699519158e-05, "loss": 1.9997, "step": 2040 }, { "epoch": 0.9716791088991587, "grad_norm": 0.4089350286618743, "learning_rate": 7.625763285760154e-05, "loss": 2.0561, "step": 2050 }, { "epoch": 0.9764190069913496, "grad_norm": 0.5012148973934161, "learning_rate": 7.604603134079039e-05, "loss": 1.9108, "step": 2060 }, { "epoch": 0.9811589050835408, "grad_norm": 0.4193397192808331, "learning_rate": 7.583378766177163e-05, "loss": 2.0375, "step": 2070 }, { "epoch": 0.9858988031757318, "grad_norm": 0.3996742152514563, "learning_rate": 7.56209070533912e-05, "loss": 1.8992, "step": 2080 }, { "epoch": 0.9906387012679227, "grad_norm": 0.43312783729617976, "learning_rate": 7.540739476419847e-05, "loss": 2.0202, "step": 2090 }, { "epoch": 0.9953785993601137, "grad_norm": 0.47876561721756805, "learning_rate": 7.519325605831684e-05, "loss": 1.9258, "step": 2100 }, { "epoch": 1.0001184974523047, "grad_norm": 0.40845159679128945, "learning_rate": 7.497849621531396e-05, "loss": 1.8963, "step": 2110 }, { "epoch": 1.0048583955444959, "grad_norm": 0.4911320886031023, "learning_rate": 7.476312053007151e-05, "loss": 1.8763, "step": 2120 }, { "epoch": 1.0095982936366867, "grad_norm": 0.4341191300612264, "learning_rate": 7.454713431265475e-05, "loss": 1.9345, "step": 2130 }, { "epoch": 1.0143381917288778, "grad_norm": 0.44526984352662835, "learning_rate": 7.43305428881815e-05, "loss": 1.9666, "step": 2140 }, { "epoch": 1.019078089821069, "grad_norm": 0.45021419491727926, "learning_rate": 7.411335159669093e-05, "loss": 1.9683, "step": 2150 }, { "epoch": 1.0238179879132598, "grad_norm": 0.46367987121746707, "learning_rate": 7.389556579301186e-05, "loss": 1.884, "step": 2160 }, { "epoch": 1.028557886005451, "grad_norm": 0.518631039907863, "learning_rate": 7.367719084663074e-05, "loss": 1.8473, "step": 2170 }, { "epoch": 1.0332977840976418, "grad_norm": 0.4686244164357671, "learning_rate": 7.345823214155927e-05, "loss": 1.8894, "step": 2180 }, { "epoch": 1.038037682189833, "grad_norm": 0.5124536145999882, "learning_rate": 7.323869507620169e-05, "loss": 1.886, "step": 2190 }, { "epoch": 1.0427775802820238, "grad_norm": 0.428865165913033, "learning_rate": 7.30185850632216e-05, "loss": 1.8934, "step": 2200 }, { "epoch": 1.047517478374215, "grad_norm": 0.4575909980653946, "learning_rate": 7.27979075294086e-05, "loss": 1.8793, "step": 2210 }, { "epoch": 1.052257376466406, "grad_norm": 0.46819042427920937, "learning_rate": 7.257666791554448e-05, "loss": 1.9177, "step": 2220 }, { "epoch": 1.056997274558597, "grad_norm": 0.5869490097444697, "learning_rate": 7.2354871676269e-05, "loss": 1.8888, "step": 2230 }, { "epoch": 1.061737172650788, "grad_norm": 0.4407701363338049, "learning_rate": 7.213252427994547e-05, "loss": 1.9145, "step": 2240 }, { "epoch": 1.066477070742979, "grad_norm": 0.5471189926425418, "learning_rate": 7.1909631208526e-05, "loss": 1.8647, "step": 2250 }, { "epoch": 1.07121696883517, "grad_norm": 0.45247580903783674, "learning_rate": 7.168619795741616e-05, "loss": 1.8793, "step": 2260 }, { "epoch": 1.0759568669273611, "grad_norm": 0.5394937103937341, "learning_rate": 7.146223003533964e-05, "loss": 1.9394, "step": 2270 }, { "epoch": 1.080696765019552, "grad_norm": 0.5010981958648577, "learning_rate": 7.12377329642024e-05, "loss": 1.8009, "step": 2280 }, { "epoch": 1.0854366631117431, "grad_norm": 0.49455090224086273, "learning_rate": 7.101271227895646e-05, "loss": 1.9877, "step": 2290 }, { "epoch": 1.090176561203934, "grad_norm": 0.4487359249312413, "learning_rate": 7.07871735274636e-05, "loss": 1.8578, "step": 2300 }, { "epoch": 1.0949164592961251, "grad_norm": 0.5006725728639967, "learning_rate": 7.056112227035831e-05, "loss": 1.9142, "step": 2310 }, { "epoch": 1.0996563573883162, "grad_norm": 0.46840477309344347, "learning_rate": 7.033456408091103e-05, "loss": 1.9178, "step": 2320 }, { "epoch": 1.1043962554805071, "grad_norm": 0.44881264282080685, "learning_rate": 7.010750454489042e-05, "loss": 1.9011, "step": 2330 }, { "epoch": 1.1091361535726982, "grad_norm": 0.4914874135601711, "learning_rate": 6.987994926042588e-05, "loss": 1.8817, "step": 2340 }, { "epoch": 1.1138760516648891, "grad_norm": 0.4875786937414022, "learning_rate": 6.965190383786938e-05, "loss": 1.9151, "step": 2350 }, { "epoch": 1.1186159497570802, "grad_norm": 0.47374621253430516, "learning_rate": 6.942337389965722e-05, "loss": 1.8652, "step": 2360 }, { "epoch": 1.1233558478492713, "grad_norm": 0.45812614575538185, "learning_rate": 6.919436508017139e-05, "loss": 1.9191, "step": 2370 }, { "epoch": 1.1280957459414622, "grad_norm": 0.5233924389852819, "learning_rate": 6.896488302560062e-05, "loss": 1.8944, "step": 2380 }, { "epoch": 1.1328356440336533, "grad_norm": 0.4760349705385804, "learning_rate": 6.873493339380125e-05, "loss": 1.8896, "step": 2390 }, { "epoch": 1.1375755421258442, "grad_norm": 0.47170548205722757, "learning_rate": 6.850452185415763e-05, "loss": 1.8436, "step": 2400 }, { "epoch": 1.1423154402180353, "grad_norm": 0.4742928761569321, "learning_rate": 6.827365408744244e-05, "loss": 1.938, "step": 2410 }, { "epoch": 1.1470553383102264, "grad_norm": 0.5423850691494456, "learning_rate": 6.804233578567658e-05, "loss": 1.8889, "step": 2420 }, { "epoch": 1.1517952364024173, "grad_norm": 0.48227588856524584, "learning_rate": 6.781057265198885e-05, "loss": 1.9094, "step": 2430 }, { "epoch": 1.1565351344946084, "grad_norm": 0.45425361404028264, "learning_rate": 6.75783704004753e-05, "loss": 1.859, "step": 2440 }, { "epoch": 1.1612750325867993, "grad_norm": 0.4433613473826934, "learning_rate": 6.734573475605846e-05, "loss": 1.9084, "step": 2450 }, { "epoch": 1.1660149306789904, "grad_norm": 0.4943942467439202, "learning_rate": 6.711267145434603e-05, "loss": 1.9647, "step": 2460 }, { "epoch": 1.1707548287711815, "grad_norm": 0.4577985217898985, "learning_rate": 6.687918624148963e-05, "loss": 1.8903, "step": 2470 }, { "epoch": 1.1754947268633724, "grad_norm": 0.5864019689805202, "learning_rate": 6.664528487404298e-05, "loss": 1.8431, "step": 2480 }, { "epoch": 1.1802346249555635, "grad_norm": 0.4979542549244347, "learning_rate": 6.641097311882015e-05, "loss": 1.9381, "step": 2490 }, { "epoch": 1.1849745230477544, "grad_norm": 0.5142117151718176, "learning_rate": 6.617625675275317e-05, "loss": 1.8608, "step": 2500 }, { "epoch": 1.1897144211399455, "grad_norm": 0.5179927851112526, "learning_rate": 6.59411415627498e-05, "loss": 1.9493, "step": 2510 }, { "epoch": 1.1944543192321366, "grad_norm": 0.5221841655224025, "learning_rate": 6.570563334555068e-05, "loss": 1.8724, "step": 2520 }, { "epoch": 1.1991942173243275, "grad_norm": 0.4985837837212232, "learning_rate": 6.546973790758655e-05, "loss": 1.952, "step": 2530 }, { "epoch": 1.2039341154165186, "grad_norm": 0.5552319456240327, "learning_rate": 6.523346106483504e-05, "loss": 1.9397, "step": 2540 }, { "epoch": 1.2086740135087095, "grad_norm": 0.4769628041892156, "learning_rate": 6.499680864267725e-05, "loss": 2.0053, "step": 2550 }, { "epoch": 1.2134139116009006, "grad_norm": 0.4516518959319936, "learning_rate": 6.475978647575416e-05, "loss": 1.9402, "step": 2560 }, { "epoch": 1.2181538096930915, "grad_norm": 0.4913816447981876, "learning_rate": 6.452240040782276e-05, "loss": 1.8451, "step": 2570 }, { "epoch": 1.2228937077852826, "grad_norm": 0.4748765999127487, "learning_rate": 6.4284656291612e-05, "loss": 1.9117, "step": 2580 }, { "epoch": 1.2276336058774737, "grad_norm": 0.5114110285568767, "learning_rate": 6.404655998867848e-05, "loss": 1.8831, "step": 2590 }, { "epoch": 1.2323735039696646, "grad_norm": 0.47839985560769943, "learning_rate": 6.380811736926188e-05, "loss": 1.8627, "step": 2600 }, { "epoch": 1.2371134020618557, "grad_norm": 0.5355232832118345, "learning_rate": 6.356933431214034e-05, "loss": 1.9189, "step": 2610 }, { "epoch": 1.2418533001540466, "grad_norm": 0.4895001261750141, "learning_rate": 6.33302167044854e-05, "loss": 1.9699, "step": 2620 }, { "epoch": 1.2465931982462377, "grad_norm": 0.4635882938471385, "learning_rate": 6.309077044171694e-05, "loss": 1.8779, "step": 2630 }, { "epoch": 1.2513330963384286, "grad_norm": 0.45916609044978873, "learning_rate": 6.285100142735782e-05, "loss": 1.8527, "step": 2640 }, { "epoch": 1.2560729944306197, "grad_norm": 0.46784246908879684, "learning_rate": 6.261091557288826e-05, "loss": 1.8844, "step": 2650 }, { "epoch": 1.2608128925228108, "grad_norm": 0.5131345820024794, "learning_rate": 6.237051879760014e-05, "loss": 1.8402, "step": 2660 }, { "epoch": 1.2655527906150017, "grad_norm": 0.5766279369511716, "learning_rate": 6.21298170284511e-05, "loss": 1.8558, "step": 2670 }, { "epoch": 1.2702926887071928, "grad_norm": 0.48863073587665085, "learning_rate": 6.188881619991834e-05, "loss": 1.9337, "step": 2680 }, { "epoch": 1.2750325867993837, "grad_norm": 0.5958235159214345, "learning_rate": 6.164752225385235e-05, "loss": 1.9018, "step": 2690 }, { "epoch": 1.2797724848915748, "grad_norm": 0.5127854587716114, "learning_rate": 6.140594113933042e-05, "loss": 1.928, "step": 2700 }, { "epoch": 1.284512382983766, "grad_norm": 0.4918233056408275, "learning_rate": 6.116407881250994e-05, "loss": 1.9623, "step": 2710 }, { "epoch": 1.2892522810759568, "grad_norm": 0.4759408966884228, "learning_rate": 6.0921941236481505e-05, "loss": 1.876, "step": 2720 }, { "epoch": 1.293992179168148, "grad_norm": 0.49692255085585224, "learning_rate": 6.067953438112205e-05, "loss": 1.871, "step": 2730 }, { "epoch": 1.2987320772603388, "grad_norm": 0.51069268079758, "learning_rate": 6.043686422294747e-05, "loss": 1.9503, "step": 2740 }, { "epoch": 1.30347197535253, "grad_norm": 0.4848235028179103, "learning_rate": 6.019393674496543e-05, "loss": 1.9636, "step": 2750 }, { "epoch": 1.308211873444721, "grad_norm": 0.7269161906292443, "learning_rate": 5.995075793652775e-05, "loss": 1.8818, "step": 2760 }, { "epoch": 1.312951771536912, "grad_norm": 0.46011103384366614, "learning_rate": 5.9707333793182794e-05, "loss": 1.9123, "step": 2770 }, { "epoch": 1.317691669629103, "grad_norm": 0.5009880993886451, "learning_rate": 5.946367031652761e-05, "loss": 1.9407, "step": 2780 }, { "epoch": 1.3224315677212939, "grad_norm": 0.5049332736921734, "learning_rate": 5.921977351406004e-05, "loss": 1.8624, "step": 2790 }, { "epoch": 1.327171465813485, "grad_norm": 0.4984446750273935, "learning_rate": 5.8975649399030485e-05, "loss": 1.8407, "step": 2800 }, { "epoch": 1.331911363905676, "grad_norm": 0.5202629992326526, "learning_rate": 5.873130399029374e-05, "loss": 1.8723, "step": 2810 }, { "epoch": 1.336651261997867, "grad_norm": 0.57260787674711, "learning_rate": 5.8486743312160584e-05, "loss": 1.9077, "step": 2820 }, { "epoch": 1.341391160090058, "grad_norm": 0.47793956835922086, "learning_rate": 5.824197339424923e-05, "loss": 1.9855, "step": 2830 }, { "epoch": 1.346131058182249, "grad_norm": 0.4699288477951403, "learning_rate": 5.799700027133666e-05, "loss": 1.9131, "step": 2840 }, { "epoch": 1.35087095627444, "grad_norm": 0.504238497502292, "learning_rate": 5.7751829983209896e-05, "loss": 1.9438, "step": 2850 }, { "epoch": 1.3556108543666312, "grad_norm": 0.4814570049600418, "learning_rate": 5.750646857451701e-05, "loss": 1.9549, "step": 2860 }, { "epoch": 1.360350752458822, "grad_norm": 0.5038793494327912, "learning_rate": 5.726092209461814e-05, "loss": 1.9016, "step": 2870 }, { "epoch": 1.3650906505510132, "grad_norm": 0.5240318677978467, "learning_rate": 5.701519659743636e-05, "loss": 1.9323, "step": 2880 }, { "epoch": 1.369830548643204, "grad_norm": 0.5135642745972475, "learning_rate": 5.6769298141308345e-05, "loss": 1.8633, "step": 2890 }, { "epoch": 1.3745704467353952, "grad_norm": 0.5115968529507217, "learning_rate": 5.652323278883511e-05, "loss": 1.8486, "step": 2900 }, { "epoch": 1.3793103448275863, "grad_norm": 0.4973184073827783, "learning_rate": 5.6277006606732465e-05, "loss": 1.9067, "step": 2910 }, { "epoch": 1.3840502429197772, "grad_norm": 0.48576803898302945, "learning_rate": 5.603062566568144e-05, "loss": 1.9167, "step": 2920 }, { "epoch": 1.3887901410119683, "grad_norm": 0.532613823404453, "learning_rate": 5.5784096040178624e-05, "loss": 1.916, "step": 2930 }, { "epoch": 1.3935300391041592, "grad_norm": 0.5402345956070669, "learning_rate": 5.5537423808386457e-05, "loss": 1.9193, "step": 2940 }, { "epoch": 1.3982699371963503, "grad_norm": 0.4920153790997806, "learning_rate": 5.5290615051983276e-05, "loss": 1.8214, "step": 2950 }, { "epoch": 1.4030098352885414, "grad_norm": 0.5305053717830343, "learning_rate": 5.504367585601342e-05, "loss": 1.8724, "step": 2960 }, { "epoch": 1.4077497333807323, "grad_norm": 0.5348665608450567, "learning_rate": 5.479661230873723e-05, "loss": 1.9576, "step": 2970 }, { "epoch": 1.4124896314729234, "grad_norm": 0.5212184732874925, "learning_rate": 5.4549430501480895e-05, "loss": 1.9409, "step": 2980 }, { "epoch": 1.4172295295651143, "grad_norm": 0.513803010422433, "learning_rate": 5.43021365284863e-05, "loss": 1.8691, "step": 2990 }, { "epoch": 1.4219694276573054, "grad_norm": 0.5405731422319697, "learning_rate": 5.405473648676074e-05, "loss": 1.9071, "step": 3000 }, { "epoch": 1.4267093257494965, "grad_norm": 0.5828580104321831, "learning_rate": 5.380723647592668e-05, "loss": 1.8781, "step": 3010 }, { "epoch": 1.4314492238416874, "grad_norm": 0.4730373307838654, "learning_rate": 5.3559642598071244e-05, "loss": 1.9514, "step": 3020 }, { "epoch": 1.4361891219338785, "grad_norm": 0.5098706245647135, "learning_rate": 5.3311960957595885e-05, "loss": 1.9019, "step": 3030 }, { "epoch": 1.4409290200260694, "grad_norm": 0.4902558604014986, "learning_rate": 5.306419766106582e-05, "loss": 1.8003, "step": 3040 }, { "epoch": 1.4456689181182605, "grad_norm": 0.5662981198334492, "learning_rate": 5.2816358817059483e-05, "loss": 1.9584, "step": 3050 }, { "epoch": 1.4504088162104516, "grad_norm": 0.5080795735549143, "learning_rate": 5.2568450536017946e-05, "loss": 1.8299, "step": 3060 }, { "epoch": 1.4551487143026425, "grad_norm": 0.4883320170692768, "learning_rate": 5.23204789300942e-05, "loss": 1.8948, "step": 3070 }, { "epoch": 1.4598886123948336, "grad_norm": 0.5018665885085004, "learning_rate": 5.207245011300256e-05, "loss": 1.9096, "step": 3080 }, { "epoch": 1.4646285104870245, "grad_norm": 0.49985987707909735, "learning_rate": 5.182437019986781e-05, "loss": 1.8725, "step": 3090 }, { "epoch": 1.4693684085792156, "grad_norm": 0.5501802725606001, "learning_rate": 5.157624530707457e-05, "loss": 1.852, "step": 3100 }, { "epoch": 1.4741083066714067, "grad_norm": 0.5050415458131547, "learning_rate": 5.132808155211637e-05, "loss": 1.9234, "step": 3110 }, { "epoch": 1.4788482047635976, "grad_norm": 0.5388328369977669, "learning_rate": 5.107988505344493e-05, "loss": 1.8503, "step": 3120 }, { "epoch": 1.4835881028557887, "grad_norm": 0.5294932998067775, "learning_rate": 5.083166193031924e-05, "loss": 1.8602, "step": 3130 }, { "epoch": 1.4883280009479796, "grad_norm": 0.5081432892581731, "learning_rate": 5.058341830265473e-05, "loss": 1.8916, "step": 3140 }, { "epoch": 1.4930678990401707, "grad_norm": 0.48231454449779565, "learning_rate": 5.033516029087231e-05, "loss": 1.9268, "step": 3150 }, { "epoch": 1.4978077971323618, "grad_norm": 0.5031248301603529, "learning_rate": 5.008689401574762e-05, "loss": 1.8619, "step": 3160 }, { "epoch": 1.5025476952245527, "grad_norm": 0.48955254310210605, "learning_rate": 4.983862559825994e-05, "loss": 1.9342, "step": 3170 }, { "epoch": 1.5072875933167436, "grad_norm": 0.5786990144175583, "learning_rate": 4.959036115944146e-05, "loss": 1.9487, "step": 3180 }, { "epoch": 1.5120274914089347, "grad_norm": 0.5204059056090741, "learning_rate": 4.93421068202262e-05, "loss": 1.9237, "step": 3190 }, { "epoch": 1.5167673895011258, "grad_norm": 0.5063131987653341, "learning_rate": 4.909386870129921e-05, "loss": 1.9752, "step": 3200 }, { "epoch": 1.5215072875933169, "grad_norm": 0.48289993909064316, "learning_rate": 4.884565292294563e-05, "loss": 1.8891, "step": 3210 }, { "epoch": 1.5262471856855078, "grad_norm": 0.5172395191973475, "learning_rate": 4.859746560489979e-05, "loss": 1.8907, "step": 3220 }, { "epoch": 1.5309870837776987, "grad_norm": 0.4807916914066212, "learning_rate": 4.834931286619432e-05, "loss": 1.9074, "step": 3230 }, { "epoch": 1.5357269818698898, "grad_norm": 0.5144939695987174, "learning_rate": 4.810120082500934e-05, "loss": 1.8338, "step": 3240 }, { "epoch": 1.5404668799620809, "grad_norm": 0.5199756044880577, "learning_rate": 4.785313559852156e-05, "loss": 1.965, "step": 3250 }, { "epoch": 1.545206778054272, "grad_norm": 0.5415928562917922, "learning_rate": 4.7605123302753433e-05, "loss": 1.8472, "step": 3260 }, { "epoch": 1.5499466761464629, "grad_norm": 0.5335132590972799, "learning_rate": 4.735717005242248e-05, "loss": 1.8558, "step": 3270 }, { "epoch": 1.5546865742386538, "grad_norm": 0.5581108907205053, "learning_rate": 4.710928196079042e-05, "loss": 1.8794, "step": 3280 }, { "epoch": 1.5594264723308449, "grad_norm": 0.5335645184315633, "learning_rate": 4.6861465139512475e-05, "loss": 1.8271, "step": 3290 }, { "epoch": 1.564166370423036, "grad_norm": 0.5470177997128685, "learning_rate": 4.661372569848678e-05, "loss": 1.8935, "step": 3300 }, { "epoch": 1.568906268515227, "grad_norm": 0.5362519757955545, "learning_rate": 4.636606974570361e-05, "loss": 1.8072, "step": 3310 }, { "epoch": 1.573646166607418, "grad_norm": 0.6040810957613818, "learning_rate": 4.611850338709482e-05, "loss": 1.7864, "step": 3320 }, { "epoch": 1.5783860646996088, "grad_norm": 0.5318403452991018, "learning_rate": 4.5871032726383386e-05, "loss": 1.8524, "step": 3330 }, { "epoch": 1.5831259627918, "grad_norm": 0.5512446332300014, "learning_rate": 4.562366386493286e-05, "loss": 1.8972, "step": 3340 }, { "epoch": 1.587865860883991, "grad_norm": 0.5083043080271707, "learning_rate": 4.537640290159688e-05, "loss": 1.7909, "step": 3350 }, { "epoch": 1.5926057589761822, "grad_norm": 0.516558139348224, "learning_rate": 4.512925593256895e-05, "loss": 1.9006, "step": 3360 }, { "epoch": 1.597345657068373, "grad_norm": 0.5406712324925647, "learning_rate": 4.4882229051232e-05, "loss": 1.9456, "step": 3370 }, { "epoch": 1.602085555160564, "grad_norm": 0.5537236012465999, "learning_rate": 4.463532834800825e-05, "loss": 1.8696, "step": 3380 }, { "epoch": 1.606825453252755, "grad_norm": 0.5501268633544832, "learning_rate": 4.438855991020896e-05, "loss": 1.9089, "step": 3390 }, { "epoch": 1.6115653513449462, "grad_norm": 0.5642376324584947, "learning_rate": 4.414192982188446e-05, "loss": 1.868, "step": 3400 }, { "epoch": 1.616305249437137, "grad_norm": 0.49603254737837815, "learning_rate": 4.3895444163674006e-05, "loss": 1.9261, "step": 3410 }, { "epoch": 1.6210451475293282, "grad_norm": 0.5264212888797052, "learning_rate": 4.364910901265606e-05, "loss": 1.9271, "step": 3420 }, { "epoch": 1.625785045621519, "grad_norm": 0.5165427594444576, "learning_rate": 4.340293044219825e-05, "loss": 1.8798, "step": 3430 }, { "epoch": 1.6305249437137102, "grad_norm": 0.5111756681074762, "learning_rate": 4.315691452180777e-05, "loss": 1.8821, "step": 3440 }, { "epoch": 1.6352648418059013, "grad_norm": 0.5353729238490614, "learning_rate": 4.2911067316981656e-05, "loss": 1.9193, "step": 3450 }, { "epoch": 1.6400047398980921, "grad_norm": 0.5427362289483532, "learning_rate": 4.2665394889057325e-05, "loss": 1.8648, "step": 3460 }, { "epoch": 1.6447446379902833, "grad_norm": 0.5316532712452083, "learning_rate": 4.2419903295063045e-05, "loss": 1.8696, "step": 3470 }, { "epoch": 1.6494845360824741, "grad_norm": 0.5445515739019248, "learning_rate": 4.2174598587568706e-05, "loss": 1.7773, "step": 3480 }, { "epoch": 1.6542244341746652, "grad_norm": 0.515985891781636, "learning_rate": 4.192948681453645e-05, "loss": 1.9528, "step": 3490 }, { "epoch": 1.6589643322668564, "grad_norm": 0.533497568011406, "learning_rate": 4.168457401917169e-05, "loss": 1.9089, "step": 3500 }, { "epoch": 1.6637042303590472, "grad_norm": 0.5034380410666982, "learning_rate": 4.1439866239774065e-05, "loss": 1.902, "step": 3510 }, { "epoch": 1.6684441284512384, "grad_norm": 0.5008886693586585, "learning_rate": 4.119536950958853e-05, "loss": 1.8597, "step": 3520 }, { "epoch": 1.6731840265434292, "grad_norm": 0.5042866133180605, "learning_rate": 4.095108985665668e-05, "loss": 1.941, "step": 3530 }, { "epoch": 1.6779239246356203, "grad_norm": 0.4894456961892347, "learning_rate": 4.070703330366809e-05, "loss": 1.8749, "step": 3540 }, { "epoch": 1.6826638227278115, "grad_norm": 0.5304927617260963, "learning_rate": 4.0463205867811834e-05, "loss": 1.9169, "step": 3550 }, { "epoch": 1.6874037208200023, "grad_norm": 0.5192399220515885, "learning_rate": 4.0219613560628074e-05, "loss": 1.8853, "step": 3560 }, { "epoch": 1.6921436189121932, "grad_norm": 0.5436581114459818, "learning_rate": 3.997626238785997e-05, "loss": 1.9093, "step": 3570 }, { "epoch": 1.6968835170043843, "grad_norm": 0.5671093634463978, "learning_rate": 3.973315834930549e-05, "loss": 1.8667, "step": 3580 }, { "epoch": 1.7016234150965754, "grad_norm": 0.5505401718757482, "learning_rate": 3.949030743866955e-05, "loss": 1.8701, "step": 3590 }, { "epoch": 1.7063633131887666, "grad_norm": 0.5107784655812311, "learning_rate": 3.924771564341621e-05, "loss": 1.8796, "step": 3600 }, { "epoch": 1.7111032112809574, "grad_norm": 0.5123424894974382, "learning_rate": 3.900538894462112e-05, "loss": 1.9345, "step": 3610 }, { "epoch": 1.7158431093731483, "grad_norm": 0.5975803333556319, "learning_rate": 3.876333331682394e-05, "loss": 1.9071, "step": 3620 }, { "epoch": 1.7205830074653394, "grad_norm": 0.5607215795184285, "learning_rate": 3.8521554727881115e-05, "loss": 1.8444, "step": 3630 }, { "epoch": 1.7253229055575305, "grad_norm": 0.5812681320546813, "learning_rate": 3.828005913881876e-05, "loss": 1.8783, "step": 3640 }, { "epoch": 1.7300628036497216, "grad_norm": 0.5809996822930421, "learning_rate": 3.803885250368562e-05, "loss": 1.8667, "step": 3650 }, { "epoch": 1.7348027017419125, "grad_norm": 0.5264379258394054, "learning_rate": 3.7797940769406324e-05, "loss": 1.8832, "step": 3660 }, { "epoch": 1.7395425998341034, "grad_norm": 0.5452547674401557, "learning_rate": 3.755732987563476e-05, "loss": 1.9126, "step": 3670 }, { "epoch": 1.7442824979262945, "grad_norm": 0.5573756045226962, "learning_rate": 3.731702575460763e-05, "loss": 1.9267, "step": 3680 }, { "epoch": 1.7490223960184856, "grad_norm": 0.5891329270301621, "learning_rate": 3.707703433099815e-05, "loss": 1.8927, "step": 3690 }, { "epoch": 1.7537622941106767, "grad_norm": 0.5379354015536967, "learning_rate": 3.683736152177005e-05, "loss": 1.8829, "step": 3700 }, { "epoch": 1.7585021922028676, "grad_norm": 0.584902744080287, "learning_rate": 3.659801323603163e-05, "loss": 1.9032, "step": 3710 }, { "epoch": 1.7632420902950585, "grad_norm": 0.47271945766863005, "learning_rate": 3.63589953748901e-05, "loss": 1.8634, "step": 3720 }, { "epoch": 1.7679819883872496, "grad_norm": 0.5602358756096469, "learning_rate": 3.612031383130612e-05, "loss": 1.8436, "step": 3730 }, { "epoch": 1.7727218864794407, "grad_norm": 0.5171084893952771, "learning_rate": 3.5881974489948456e-05, "loss": 1.8279, "step": 3740 }, { "epoch": 1.7774617845716318, "grad_norm": 0.5085114117110985, "learning_rate": 3.564398322704887e-05, "loss": 1.8842, "step": 3750 }, { "epoch": 1.7822016826638227, "grad_norm": 0.5395255555244833, "learning_rate": 3.5406345910257346e-05, "loss": 1.8974, "step": 3760 }, { "epoch": 1.7869415807560136, "grad_norm": 0.5256917642696852, "learning_rate": 3.5169068398497344e-05, "loss": 1.9247, "step": 3770 }, { "epoch": 1.7916814788482047, "grad_norm": 0.5297510632715654, "learning_rate": 3.493215654182134e-05, "loss": 1.8941, "step": 3780 }, { "epoch": 1.7964213769403958, "grad_norm": 0.4887292770108947, "learning_rate": 3.4695616181266674e-05, "loss": 1.8662, "step": 3790 }, { "epoch": 1.801161275032587, "grad_norm": 0.605286928037954, "learning_rate": 3.445945314871144e-05, "loss": 1.7946, "step": 3800 }, { "epoch": 1.8059011731247778, "grad_norm": 0.5534598174424521, "learning_rate": 3.422367326673079e-05, "loss": 1.9319, "step": 3810 }, { "epoch": 1.8106410712169687, "grad_norm": 0.516541325820194, "learning_rate": 3.398828234845331e-05, "loss": 1.9102, "step": 3820 }, { "epoch": 1.8153809693091598, "grad_norm": 0.5316375380294128, "learning_rate": 3.3753286197417714e-05, "loss": 1.9137, "step": 3830 }, { "epoch": 1.820120867401351, "grad_norm": 0.5048711282201915, "learning_rate": 3.3518690607429784e-05, "loss": 1.8643, "step": 3840 }, { "epoch": 1.824860765493542, "grad_norm": 0.5407400572506997, "learning_rate": 3.3284501362419566e-05, "loss": 1.8524, "step": 3850 }, { "epoch": 1.829600663585733, "grad_norm": 0.5444240928370307, "learning_rate": 3.305072423629862e-05, "loss": 1.9604, "step": 3860 }, { "epoch": 1.8343405616779238, "grad_norm": 0.5259735881080222, "learning_rate": 3.281736499281783e-05, "loss": 1.8699, "step": 3870 }, { "epoch": 1.839080459770115, "grad_norm": 0.5412391021904834, "learning_rate": 3.2584429385425163e-05, "loss": 1.9233, "step": 3880 }, { "epoch": 1.843820357862306, "grad_norm": 0.581528749881215, "learning_rate": 3.235192315712394e-05, "loss": 1.9037, "step": 3890 }, { "epoch": 1.8485602559544971, "grad_norm": 0.486599214527775, "learning_rate": 3.211985204033114e-05, "loss": 1.881, "step": 3900 }, { "epoch": 1.853300154046688, "grad_norm": 0.5732281840924196, "learning_rate": 3.188822175673618e-05, "loss": 1.9289, "step": 3910 }, { "epoch": 1.858040052138879, "grad_norm": 0.5393218742500727, "learning_rate": 3.165703801715969e-05, "loss": 1.8178, "step": 3920 }, { "epoch": 1.86277995023107, "grad_norm": 0.5317421200650526, "learning_rate": 3.142630652141286e-05, "loss": 1.7813, "step": 3930 }, { "epoch": 1.8675198483232611, "grad_norm": 0.4707578563318653, "learning_rate": 3.119603295815685e-05, "loss": 1.8928, "step": 3940 }, { "epoch": 1.8722597464154522, "grad_norm": 0.503217338566424, "learning_rate": 3.096622300476253e-05, "loss": 1.9702, "step": 3950 }, { "epoch": 1.8769996445076431, "grad_norm": 0.5191335631232252, "learning_rate": 3.07368823271705e-05, "loss": 1.8832, "step": 3960 }, { "epoch": 1.881739542599834, "grad_norm": 0.5929718795388419, "learning_rate": 3.050801657975147e-05, "loss": 1.9705, "step": 3970 }, { "epoch": 1.8864794406920251, "grad_norm": 0.5203449537199084, "learning_rate": 3.0279631405166754e-05, "loss": 1.8005, "step": 3980 }, { "epoch": 1.8912193387842162, "grad_norm": 0.6060740003713215, "learning_rate": 3.0051732434229184e-05, "loss": 1.8802, "step": 3990 }, { "epoch": 1.895959236876407, "grad_norm": 0.5254251326665124, "learning_rate": 2.9824325285764332e-05, "loss": 1.9063, "step": 4000 }, { "epoch": 1.9006991349685982, "grad_norm": 0.5412654814841995, "learning_rate": 2.9597415566471874e-05, "loss": 1.7974, "step": 4010 }, { "epoch": 1.905439033060789, "grad_norm": 0.6096977687423671, "learning_rate": 2.9371008870787474e-05, "loss": 1.8789, "step": 4020 }, { "epoch": 1.9101789311529802, "grad_norm": 0.5751076752952912, "learning_rate": 2.914511078074481e-05, "loss": 1.9147, "step": 4030 }, { "epoch": 1.9149188292451713, "grad_norm": 0.5596872085857021, "learning_rate": 2.891972686583791e-05, "loss": 1.8939, "step": 4040 }, { "epoch": 1.9196587273373622, "grad_norm": 0.5205001238706851, "learning_rate": 2.8694862682883866e-05, "loss": 1.8675, "step": 4050 }, { "epoch": 1.9243986254295533, "grad_norm": 0.6060966652232279, "learning_rate": 2.8470523775885816e-05, "loss": 1.8542, "step": 4060 }, { "epoch": 1.9291385235217442, "grad_norm": 0.5060927602134601, "learning_rate": 2.824671567589635e-05, "loss": 1.9095, "step": 4070 }, { "epoch": 1.9338784216139353, "grad_norm": 0.527071756794979, "learning_rate": 2.8023443900880984e-05, "loss": 1.8144, "step": 4080 }, { "epoch": 1.9386183197061264, "grad_norm": 0.6186591144971271, "learning_rate": 2.780071395558222e-05, "loss": 1.9328, "step": 4090 }, { "epoch": 1.9433582177983173, "grad_norm": 0.5084958011646354, "learning_rate": 2.757853133138382e-05, "loss": 1.8292, "step": 4100 }, { "epoch": 1.9480981158905084, "grad_norm": 0.5671058444452819, "learning_rate": 2.7356901506175426e-05, "loss": 1.8621, "step": 4110 }, { "epoch": 1.9528380139826993, "grad_norm": 0.6077250993929268, "learning_rate": 2.7135829944217406e-05, "loss": 1.8969, "step": 4120 }, { "epoch": 1.9575779120748904, "grad_norm": 0.5478709269890887, "learning_rate": 2.6915322096006244e-05, "loss": 1.9648, "step": 4130 }, { "epoch": 1.9623178101670815, "grad_norm": 0.5304846907499281, "learning_rate": 2.6695383398140155e-05, "loss": 1.8867, "step": 4140 }, { "epoch": 1.9670577082592724, "grad_norm": 0.5084950385451593, "learning_rate": 2.6476019273184938e-05, "loss": 1.8987, "step": 4150 }, { "epoch": 1.9717976063514633, "grad_norm": 0.5881914443826771, "learning_rate": 2.6257235129540424e-05, "loss": 1.8718, "step": 4160 }, { "epoch": 1.9765375044436544, "grad_norm": 0.5557425542971698, "learning_rate": 2.603903636130701e-05, "loss": 1.8204, "step": 4170 }, { "epoch": 1.9812774025358455, "grad_norm": 0.5235298330164154, "learning_rate": 2.5821428348152788e-05, "loss": 1.915, "step": 4180 }, { "epoch": 1.9860173006280366, "grad_norm": 0.6107709148392828, "learning_rate": 2.560441645518078e-05, "loss": 1.8223, "step": 4190 }, { "epoch": 1.9907571987202275, "grad_norm": 0.5614697856069703, "learning_rate": 2.538800603279673e-05, "loss": 1.8439, "step": 4200 }, { "epoch": 1.9954970968124184, "grad_norm": 0.5563269995130558, "learning_rate": 2.5172202416577236e-05, "loss": 1.8982, "step": 4210 }, { "epoch": 2.0002369949046095, "grad_norm": 0.5673849628756762, "learning_rate": 2.4957010927138136e-05, "loss": 1.8956, "step": 4220 }, { "epoch": 2.0049768929968006, "grad_norm": 0.5274159605663582, "learning_rate": 2.4742436870003326e-05, "loss": 1.8572, "step": 4230 }, { "epoch": 2.0097167910889917, "grad_norm": 0.5388999304024686, "learning_rate": 2.452848553547396e-05, "loss": 1.8441, "step": 4240 }, { "epoch": 2.014456689181183, "grad_norm": 0.5715679686982497, "learning_rate": 2.431516219849809e-05, "loss": 1.838, "step": 4250 }, { "epoch": 2.0191965872733735, "grad_norm": 0.5795119843431206, "learning_rate": 2.4102472118540487e-05, "loss": 1.8329, "step": 4260 }, { "epoch": 2.0239364853655646, "grad_norm": 0.5503184533431318, "learning_rate": 2.3890420539453057e-05, "loss": 1.8733, "step": 4270 }, { "epoch": 2.0286763834577557, "grad_norm": 0.54871121092008, "learning_rate": 2.3679012689345558e-05, "loss": 1.8601, "step": 4280 }, { "epoch": 2.033416281549947, "grad_norm": 0.5879797146794722, "learning_rate": 2.3468253780456678e-05, "loss": 1.7751, "step": 4290 }, { "epoch": 2.038156179642138, "grad_norm": 0.5510154682184406, "learning_rate": 2.3258149009025482e-05, "loss": 1.827, "step": 4300 }, { "epoch": 2.0428960777343286, "grad_norm": 0.513792181350148, "learning_rate": 2.3048703555163357e-05, "loss": 1.8474, "step": 4310 }, { "epoch": 2.0476359758265197, "grad_norm": 0.5489219942664323, "learning_rate": 2.2839922582726336e-05, "loss": 1.8862, "step": 4320 }, { "epoch": 2.052375873918711, "grad_norm": 0.6504687065880719, "learning_rate": 2.2631811239187646e-05, "loss": 1.7984, "step": 4330 }, { "epoch": 2.057115772010902, "grad_norm": 0.6130904570523673, "learning_rate": 2.2424374655510965e-05, "loss": 1.7921, "step": 4340 }, { "epoch": 2.0618556701030926, "grad_norm": 0.6408124203446663, "learning_rate": 2.2217617946023765e-05, "loss": 1.8592, "step": 4350 }, { "epoch": 2.0665955681952837, "grad_norm": 0.6181447797115482, "learning_rate": 2.201154620829137e-05, "loss": 1.8067, "step": 4360 }, { "epoch": 2.071335466287475, "grad_norm": 0.5627617017019729, "learning_rate": 2.1806164522991118e-05, "loss": 1.7701, "step": 4370 }, { "epoch": 2.076075364379666, "grad_norm": 0.5510540438192786, "learning_rate": 2.1601477953787214e-05, "loss": 1.857, "step": 4380 }, { "epoch": 2.080815262471857, "grad_norm": 0.6083237779423979, "learning_rate": 2.1397491547205807e-05, "loss": 1.7601, "step": 4390 }, { "epoch": 2.0855551605640477, "grad_norm": 0.6047311337345246, "learning_rate": 2.119421033251071e-05, "loss": 1.8347, "step": 4400 }, { "epoch": 2.0902950586562388, "grad_norm": 0.5662369508712475, "learning_rate": 2.0991639321579214e-05, "loss": 1.8545, "step": 4410 }, { "epoch": 2.09503495674843, "grad_norm": 0.5935079368512177, "learning_rate": 2.078978350877862e-05, "loss": 1.879, "step": 4420 }, { "epoch": 2.099774854840621, "grad_norm": 0.571586984028468, "learning_rate": 2.058864787084309e-05, "loss": 1.7671, "step": 4430 }, { "epoch": 2.104514752932812, "grad_norm": 0.5682037137995106, "learning_rate": 2.0388237366751006e-05, "loss": 1.865, "step": 4440 }, { "epoch": 2.1092546510250028, "grad_norm": 0.5490908649638305, "learning_rate": 2.018855693760257e-05, "loss": 1.78, "step": 4450 }, { "epoch": 2.113994549117194, "grad_norm": 0.6176356249016943, "learning_rate": 1.998961150649814e-05, "loss": 1.8435, "step": 4460 }, { "epoch": 2.118734447209385, "grad_norm": 0.5319868348925916, "learning_rate": 1.9791405978416694e-05, "loss": 1.8981, "step": 4470 }, { "epoch": 2.123474345301576, "grad_norm": 0.5752723871436735, "learning_rate": 1.9593945240095052e-05, "loss": 1.7755, "step": 4480 }, { "epoch": 2.128214243393767, "grad_norm": 0.6366681694521167, "learning_rate": 1.9397234159907275e-05, "loss": 1.8707, "step": 4490 }, { "epoch": 2.132954141485958, "grad_norm": 0.5901487974014347, "learning_rate": 1.920127758774466e-05, "loss": 1.8256, "step": 4500 }, { "epoch": 2.137694039578149, "grad_norm": 0.5888105104943471, "learning_rate": 1.9006080354896267e-05, "loss": 1.8357, "step": 4510 }, { "epoch": 2.14243393767034, "grad_norm": 0.5878169661429707, "learning_rate": 1.8811647273929628e-05, "loss": 1.8241, "step": 4520 }, { "epoch": 2.147173835762531, "grad_norm": 0.5581948418607748, "learning_rate": 1.8617983138572277e-05, "loss": 1.848, "step": 4530 }, { "epoch": 2.1519137338547223, "grad_norm": 0.6137321662868356, "learning_rate": 1.8425092723593395e-05, "loss": 1.78, "step": 4540 }, { "epoch": 2.156653631946913, "grad_norm": 0.558081495592443, "learning_rate": 1.823298078468624e-05, "loss": 1.8153, "step": 4550 }, { "epoch": 2.161393530039104, "grad_norm": 0.6039625325723422, "learning_rate": 1.8041652058350767e-05, "loss": 1.8416, "step": 4560 }, { "epoch": 2.166133428131295, "grad_norm": 0.6295821331128388, "learning_rate": 1.785111126177691e-05, "loss": 1.7953, "step": 4570 }, { "epoch": 2.1708733262234863, "grad_norm": 0.5911527371211652, "learning_rate": 1.7661363092728307e-05, "loss": 1.7851, "step": 4580 }, { "epoch": 2.1756132243156774, "grad_norm": 0.565852777352692, "learning_rate": 1.7472412229426455e-05, "loss": 1.8101, "step": 4590 }, { "epoch": 2.180353122407868, "grad_norm": 0.5656454600563583, "learning_rate": 1.7284263330435317e-05, "loss": 1.917, "step": 4600 }, { "epoch": 2.185093020500059, "grad_norm": 0.6035646498858932, "learning_rate": 1.709692103454651e-05, "loss": 1.8168, "step": 4610 }, { "epoch": 2.1898329185922503, "grad_norm": 0.5477939270708279, "learning_rate": 1.6910389960664992e-05, "loss": 1.777, "step": 4620 }, { "epoch": 2.1945728166844414, "grad_norm": 0.5898939001383526, "learning_rate": 1.672467470769507e-05, "loss": 1.7575, "step": 4630 }, { "epoch": 2.1993127147766325, "grad_norm": 0.544798273283213, "learning_rate": 1.6539779854427074e-05, "loss": 1.8834, "step": 4640 }, { "epoch": 2.204052612868823, "grad_norm": 0.610618761949142, "learning_rate": 1.6355709959424487e-05, "loss": 1.8785, "step": 4650 }, { "epoch": 2.2087925109610143, "grad_norm": 0.6064522176814057, "learning_rate": 1.6172469560911553e-05, "loss": 1.7854, "step": 4660 }, { "epoch": 2.2135324090532054, "grad_norm": 0.6022849345976745, "learning_rate": 1.599006317666131e-05, "loss": 1.8497, "step": 4670 }, { "epoch": 2.2182723071453965, "grad_norm": 0.5926151325695663, "learning_rate": 1.5808495303884297e-05, "loss": 1.8184, "step": 4680 }, { "epoch": 2.2230122052375876, "grad_norm": 0.5740462281531319, "learning_rate": 1.562777041911761e-05, "loss": 1.8073, "step": 4690 }, { "epoch": 2.2277521033297782, "grad_norm": 0.595274030679382, "learning_rate": 1.5447892978114592e-05, "loss": 1.8095, "step": 4700 }, { "epoch": 2.2324920014219694, "grad_norm": 0.5805561493774153, "learning_rate": 1.526886741573496e-05, "loss": 1.7907, "step": 4710 }, { "epoch": 2.2372318995141605, "grad_norm": 0.6585750772533296, "learning_rate": 1.5090698145835413e-05, "loss": 1.8081, "step": 4720 }, { "epoch": 2.2419717976063516, "grad_norm": 0.7616121844460758, "learning_rate": 1.491338956116085e-05, "loss": 1.8571, "step": 4730 }, { "epoch": 2.2467116956985427, "grad_norm": 0.6037559488690589, "learning_rate": 1.473694603323611e-05, "loss": 1.8194, "step": 4740 }, { "epoch": 2.2514515937907333, "grad_norm": 0.6412117105060221, "learning_rate": 1.4561371912258098e-05, "loss": 1.7447, "step": 4750 }, { "epoch": 2.2561914918829244, "grad_norm": 0.6178165307415238, "learning_rate": 1.4386671526988593e-05, "loss": 1.8047, "step": 4760 }, { "epoch": 2.2609313899751156, "grad_norm": 0.5887211775830831, "learning_rate": 1.421284918464752e-05, "loss": 1.8309, "step": 4770 }, { "epoch": 2.2656712880673067, "grad_norm": 0.6715832023904247, "learning_rate": 1.4039909170806764e-05, "loss": 1.7598, "step": 4780 }, { "epoch": 2.2704111861594978, "grad_norm": 0.5565711226911474, "learning_rate": 1.386785574928446e-05, "loss": 1.8042, "step": 4790 }, { "epoch": 2.2751510842516884, "grad_norm": 1.0370061435438975, "learning_rate": 1.3696693162039893e-05, "loss": 1.8418, "step": 4800 }, { "epoch": 2.2798909823438795, "grad_norm": 0.619379427966442, "learning_rate": 1.3526425629068967e-05, "loss": 1.8709, "step": 4810 }, { "epoch": 2.2846308804360707, "grad_norm": 0.6181820044240368, "learning_rate": 1.3357057348300067e-05, "loss": 1.8222, "step": 4820 }, { "epoch": 2.2893707785282618, "grad_norm": 0.6447967865409838, "learning_rate": 1.318859249549066e-05, "loss": 1.8183, "step": 4830 }, { "epoch": 2.294110676620453, "grad_norm": 0.6058171204419526, "learning_rate": 1.3021035224124224e-05, "loss": 1.805, "step": 4840 }, { "epoch": 2.2988505747126435, "grad_norm": 0.5434323398332925, "learning_rate": 1.2854389665307975e-05, "loss": 1.7541, "step": 4850 }, { "epoch": 2.3035904728048346, "grad_norm": 0.6113667985824829, "learning_rate": 1.2688659927670915e-05, "loss": 1.758, "step": 4860 }, { "epoch": 2.3083303708970258, "grad_norm": 0.5720767875706882, "learning_rate": 1.2523850097262563e-05, "loss": 1.8322, "step": 4870 }, { "epoch": 2.313070268989217, "grad_norm": 0.5628951626795141, "learning_rate": 1.2359964237452238e-05, "loss": 1.7798, "step": 4880 }, { "epoch": 2.317810167081408, "grad_norm": 0.6094150987430762, "learning_rate": 1.219700638882888e-05, "loss": 1.7842, "step": 4890 }, { "epoch": 2.3225500651735986, "grad_norm": 0.6036779282592939, "learning_rate": 1.2034980569101367e-05, "loss": 1.8383, "step": 4900 }, { "epoch": 2.3272899632657897, "grad_norm": 0.6175747345768624, "learning_rate": 1.1873890772999502e-05, "loss": 1.9046, "step": 4910 }, { "epoch": 2.332029861357981, "grad_norm": 0.5564649373869762, "learning_rate": 1.1713740972175574e-05, "loss": 1.8104, "step": 4920 }, { "epoch": 2.336769759450172, "grad_norm": 0.6441404862225901, "learning_rate": 1.155453511510633e-05, "loss": 1.7864, "step": 4930 }, { "epoch": 2.341509657542363, "grad_norm": 0.6927623121031959, "learning_rate": 1.1396277126995707e-05, "loss": 1.829, "step": 4940 }, { "epoch": 2.3462495556345537, "grad_norm": 0.6537904475611329, "learning_rate": 1.1238970909677993e-05, "loss": 1.8655, "step": 4950 }, { "epoch": 2.350989453726745, "grad_norm": 0.5779494171909159, "learning_rate": 1.1082620341521766e-05, "loss": 1.7482, "step": 4960 }, { "epoch": 2.355729351818936, "grad_norm": 0.6161830958900923, "learning_rate": 1.0927229277334061e-05, "loss": 1.7789, "step": 4970 }, { "epoch": 2.360469249911127, "grad_norm": 0.5946038603032194, "learning_rate": 1.0772801548265498e-05, "loss": 1.8189, "step": 4980 }, { "epoch": 2.365209148003318, "grad_norm": 0.6072288944056834, "learning_rate": 1.0619340961715746e-05, "loss": 1.8588, "step": 4990 }, { "epoch": 2.369949046095509, "grad_norm": 0.5882805952028816, "learning_rate": 1.0466851301239711e-05, "loss": 1.8238, "step": 5000 }, { "epoch": 2.3746889441877, "grad_norm": 0.6288910196539964, "learning_rate": 1.0315336326454161e-05, "loss": 1.7055, "step": 5010 }, { "epoch": 2.379428842279891, "grad_norm": 0.6043835236662759, "learning_rate": 1.0164799772945149e-05, "loss": 1.8134, "step": 5020 }, { "epoch": 2.384168740372082, "grad_norm": 0.5821262142704368, "learning_rate": 1.0015245352175811e-05, "loss": 1.797, "step": 5030 }, { "epoch": 2.3889086384642733, "grad_norm": 0.6369667143877562, "learning_rate": 9.866676751394927e-06, "loss": 1.8199, "step": 5040 }, { "epoch": 2.393648536556464, "grad_norm": 0.5924507902566707, "learning_rate": 9.719097633545975e-06, "loss": 1.8524, "step": 5050 }, { "epoch": 2.398388434648655, "grad_norm": 0.5762513665027686, "learning_rate": 9.572511637176811e-06, "loss": 1.8428, "step": 5060 }, { "epoch": 2.403128332740846, "grad_norm": 0.5799149040724592, "learning_rate": 9.426922376350028e-06, "loss": 1.8463, "step": 5070 }, { "epoch": 2.4078682308330372, "grad_norm": 0.5898000658332848, "learning_rate": 9.282333440553804e-06, "loss": 1.7772, "step": 5080 }, { "epoch": 2.4126081289252284, "grad_norm": 0.5967206158269678, "learning_rate": 9.13874839461336e-06, "loss": 1.8234, "step": 5090 }, { "epoch": 2.417348027017419, "grad_norm": 0.6245591569289297, "learning_rate": 8.996170778603153e-06, "loss": 1.8047, "step": 5100 }, { "epoch": 2.42208792510961, "grad_norm": 0.5981945344970201, "learning_rate": 8.854604107759568e-06, "loss": 1.8429, "step": 5110 }, { "epoch": 2.4268278232018012, "grad_norm": 0.6112665064763977, "learning_rate": 8.714051872394213e-06, "loss": 1.7746, "step": 5120 }, { "epoch": 2.4315677212939923, "grad_norm": 0.5847743009358597, "learning_rate": 8.574517537807897e-06, "loss": 1.7703, "step": 5130 }, { "epoch": 2.436307619386183, "grad_norm": 0.5617053604855574, "learning_rate": 8.436004544205217e-06, "loss": 1.8498, "step": 5140 }, { "epoch": 2.441047517478374, "grad_norm": 0.5947168640425712, "learning_rate": 8.2985163066097e-06, "loss": 1.8439, "step": 5150 }, { "epoch": 2.4457874155705652, "grad_norm": 0.6456439652584188, "learning_rate": 8.162056214779618e-06, "loss": 1.8125, "step": 5160 }, { "epoch": 2.4505273136627563, "grad_norm": 0.6053385247801931, "learning_rate": 8.02662763312439e-06, "loss": 1.8193, "step": 5170 }, { "epoch": 2.4552672117549474, "grad_norm": 0.6364991896683941, "learning_rate": 7.89223390062172e-06, "loss": 1.8081, "step": 5180 }, { "epoch": 2.460007109847138, "grad_norm": 0.630663938586301, "learning_rate": 7.758878330735142e-06, "loss": 1.8317, "step": 5190 }, { "epoch": 2.464747007939329, "grad_norm": 0.6625585293729884, "learning_rate": 7.626564211332465e-06, "loss": 1.7914, "step": 5200 }, { "epoch": 2.4694869060315203, "grad_norm": 0.6132933711832741, "learning_rate": 7.49529480460458e-06, "loss": 1.8072, "step": 5210 }, { "epoch": 2.4742268041237114, "grad_norm": 0.6723366054843423, "learning_rate": 7.3650733469851574e-06, "loss": 1.8693, "step": 5220 }, { "epoch": 2.4789667022159025, "grad_norm": 0.5948715205500895, "learning_rate": 7.235903049070742e-06, "loss": 1.7441, "step": 5230 }, { "epoch": 2.483706600308093, "grad_norm": 0.602660875671921, "learning_rate": 7.1077870955416685e-06, "loss": 1.8301, "step": 5240 }, { "epoch": 2.4884464984002843, "grad_norm": 0.6657860629895173, "learning_rate": 6.98072864508349e-06, "loss": 1.7357, "step": 5250 }, { "epoch": 2.4931863964924754, "grad_norm": 0.6400301583474429, "learning_rate": 6.854730830309203e-06, "loss": 1.8309, "step": 5260 }, { "epoch": 2.4979262945846665, "grad_norm": 0.6519457597490862, "learning_rate": 6.729796757681861e-06, "loss": 1.8622, "step": 5270 }, { "epoch": 2.502666192676857, "grad_norm": 0.6018425213466797, "learning_rate": 6.605929507438108e-06, "loss": 1.8124, "step": 5280 }, { "epoch": 2.5074060907690483, "grad_norm": 0.6356535657958864, "learning_rate": 6.4831321335121706e-06, "loss": 1.8493, "step": 5290 }, { "epoch": 2.5121459888612394, "grad_norm": 0.5933711757944313, "learning_rate": 6.361407663460612e-06, "loss": 1.8152, "step": 5300 }, { "epoch": 2.5168858869534305, "grad_norm": 0.6176252282132866, "learning_rate": 6.240759098387628e-06, "loss": 1.7796, "step": 5310 }, { "epoch": 2.5216257850456216, "grad_norm": 0.6035543936375999, "learning_rate": 6.12118941287112e-06, "loss": 1.8072, "step": 5320 }, { "epoch": 2.5263656831378123, "grad_norm": 0.6423602506797493, "learning_rate": 6.002701554889306e-06, "loss": 1.8894, "step": 5330 }, { "epoch": 2.5311055812300034, "grad_norm": 0.6166718860982423, "learning_rate": 5.885298445748072e-06, "loss": 1.8476, "step": 5340 }, { "epoch": 2.5358454793221945, "grad_norm": 0.6250486214392823, "learning_rate": 5.768982980008924e-06, "loss": 1.8044, "step": 5350 }, { "epoch": 2.5405853774143856, "grad_norm": 0.6409013217160432, "learning_rate": 5.653758025417616e-06, "loss": 1.7732, "step": 5360 }, { "epoch": 2.5453252755065767, "grad_norm": 0.5853729101352203, "learning_rate": 5.5396264228335e-06, "loss": 1.816, "step": 5370 }, { "epoch": 2.5500651735987674, "grad_norm": 0.6674717253505213, "learning_rate": 5.42659098615943e-06, "loss": 1.828, "step": 5380 }, { "epoch": 2.5548050716909585, "grad_norm": 0.6079460431124653, "learning_rate": 5.314654502272393e-06, "loss": 1.8305, "step": 5390 }, { "epoch": 2.5595449697831496, "grad_norm": 0.6132271739956523, "learning_rate": 5.203819730954806e-06, "loss": 1.9389, "step": 5400 }, { "epoch": 2.5642848678753407, "grad_norm": 0.6412964569520792, "learning_rate": 5.094089404826513e-06, "loss": 1.8878, "step": 5410 }, { "epoch": 2.569024765967532, "grad_norm": 0.6314773808659059, "learning_rate": 4.985466229277331e-06, "loss": 1.7996, "step": 5420 }, { "epoch": 2.5737646640597225, "grad_norm": 0.6019377364178156, "learning_rate": 4.877952882400411e-06, "loss": 1.8326, "step": 5430 }, { "epoch": 2.5785045621519136, "grad_norm": 0.6375177888153616, "learning_rate": 4.771552014926206e-06, "loss": 1.8313, "step": 5440 }, { "epoch": 2.5832444602441047, "grad_norm": 0.6184290636855982, "learning_rate": 4.666266250157097e-06, "loss": 1.8408, "step": 5450 }, { "epoch": 2.587984358336296, "grad_norm": 0.6145812896553856, "learning_rate": 4.562098183902713e-06, "loss": 1.7928, "step": 5460 }, { "epoch": 2.592724256428487, "grad_norm": 0.5863286484938057, "learning_rate": 4.459050384415941e-06, "loss": 1.7671, "step": 5470 }, { "epoch": 2.5974641545206776, "grad_norm": 0.5908385265300592, "learning_rate": 4.357125392329636e-06, "loss": 1.8528, "step": 5480 }, { "epoch": 2.6022040526128687, "grad_norm": 0.6315835702501038, "learning_rate": 4.256325720593912e-06, "loss": 1.8952, "step": 5490 }, { "epoch": 2.60694395070506, "grad_norm": 0.5905062832031487, "learning_rate": 4.15665385441425e-06, "loss": 1.8604, "step": 5500 }, { "epoch": 2.611683848797251, "grad_norm": 0.568727331363524, "learning_rate": 4.0581122511901934e-06, "loss": 1.8351, "step": 5510 }, { "epoch": 2.616423746889442, "grad_norm": 0.6400621125560388, "learning_rate": 3.960703340454791e-06, "loss": 1.857, "step": 5520 }, { "epoch": 2.6211636449816327, "grad_norm": 0.6844853412168999, "learning_rate": 3.864429523814644e-06, "loss": 1.8371, "step": 5530 }, { "epoch": 2.625903543073824, "grad_norm": 0.6040727492768455, "learning_rate": 3.7692931748907425e-06, "loss": 1.8582, "step": 5540 }, { "epoch": 2.630643441166015, "grad_norm": 0.6488970700922259, "learning_rate": 3.675296639259912e-06, "loss": 1.8466, "step": 5550 }, { "epoch": 2.635383339258206, "grad_norm": 0.606860701135619, "learning_rate": 3.5824422343970267e-06, "loss": 1.8823, "step": 5560 }, { "epoch": 2.640123237350397, "grad_norm": 0.6107041616886252, "learning_rate": 3.4907322496178397e-06, "loss": 1.7635, "step": 5570 }, { "epoch": 2.6448631354425878, "grad_norm": 0.6205661299793865, "learning_rate": 3.4001689460225195e-06, "loss": 1.7604, "step": 5580 }, { "epoch": 2.649603033534779, "grad_norm": 0.6114908815089501, "learning_rate": 3.3107545564399434e-06, "loss": 1.8452, "step": 5590 }, { "epoch": 2.65434293162697, "grad_norm": 0.621202845423754, "learning_rate": 3.2224912853726476e-06, "loss": 1.8557, "step": 5600 }, { "epoch": 2.659082829719161, "grad_norm": 0.6376438148340446, "learning_rate": 3.1353813089424424e-06, "loss": 1.8295, "step": 5610 }, { "epoch": 2.663822727811352, "grad_norm": 0.6085163299666503, "learning_rate": 3.0494267748367723e-06, "loss": 1.7302, "step": 5620 }, { "epoch": 2.668562625903543, "grad_norm": 0.6330680248898437, "learning_rate": 2.9646298022557915e-06, "loss": 1.7756, "step": 5630 }, { "epoch": 2.673302523995734, "grad_norm": 0.6575109357986112, "learning_rate": 2.8809924818600952e-06, "loss": 1.7728, "step": 5640 }, { "epoch": 2.678042422087925, "grad_norm": 0.5972530598708538, "learning_rate": 2.7985168757191482e-06, "loss": 1.7927, "step": 5650 }, { "epoch": 2.682782320180116, "grad_norm": 0.6505229836146454, "learning_rate": 2.7172050172604824e-06, "loss": 1.768, "step": 5660 }, { "epoch": 2.6875222182723073, "grad_norm": 0.6339702452986381, "learning_rate": 2.63705891121957e-06, "loss": 1.7756, "step": 5670 }, { "epoch": 2.692262116364498, "grad_norm": 0.6729168831182509, "learning_rate": 2.5580805335903457e-06, "loss": 1.8363, "step": 5680 }, { "epoch": 2.697002014456689, "grad_norm": 0.6421591660117998, "learning_rate": 2.4802718315765527e-06, "loss": 1.7585, "step": 5690 }, { "epoch": 2.70174191254888, "grad_norm": 0.5993295713871896, "learning_rate": 2.403634723543674e-06, "loss": 1.8379, "step": 5700 }, { "epoch": 2.7064818106410713, "grad_norm": 0.5931932390101198, "learning_rate": 2.3281710989716933e-06, "loss": 1.8127, "step": 5710 }, { "epoch": 2.7112217087332624, "grad_norm": 0.6007499215207198, "learning_rate": 2.2538828184084595e-06, "loss": 1.7643, "step": 5720 }, { "epoch": 2.715961606825453, "grad_norm": 0.6294360874753062, "learning_rate": 2.1807717134238347e-06, "loss": 1.8007, "step": 5730 }, { "epoch": 2.720701504917644, "grad_norm": 0.6305932589800126, "learning_rate": 2.1088395865645537e-06, "loss": 1.802, "step": 5740 }, { "epoch": 2.7254414030098353, "grad_norm": 0.6091954631732173, "learning_rate": 2.038088211309769e-06, "loss": 1.7978, "step": 5750 }, { "epoch": 2.7301813011020264, "grad_norm": 0.6353525285344948, "learning_rate": 1.968519332027302e-06, "loss": 1.8641, "step": 5760 }, { "epoch": 2.7349211991942175, "grad_norm": 0.5869911293052614, "learning_rate": 1.9001346639306805e-06, "loss": 1.876, "step": 5770 }, { "epoch": 2.739661097286408, "grad_norm": 0.6462140073621514, "learning_rate": 1.8329358930368245e-06, "loss": 1.7947, "step": 5780 }, { "epoch": 2.7444009953785993, "grad_norm": 0.6298906028352366, "learning_rate": 1.7669246761244763e-06, "loss": 1.7983, "step": 5790 }, { "epoch": 2.7491408934707904, "grad_norm": 0.6351921002703318, "learning_rate": 1.7021026406933427e-06, "loss": 1.7563, "step": 5800 }, { "epoch": 2.7538807915629815, "grad_norm": 0.6081707137727146, "learning_rate": 1.638471384924012e-06, "loss": 1.8005, "step": 5810 }, { "epoch": 2.7586206896551726, "grad_norm": 0.6527854672102444, "learning_rate": 1.5760324776385171e-06, "loss": 1.8228, "step": 5820 }, { "epoch": 2.7633605877473633, "grad_norm": 0.6207692422398574, "learning_rate": 1.5147874582616518e-06, "loss": 1.8751, "step": 5830 }, { "epoch": 2.7681004858395544, "grad_norm": 0.6078351786970941, "learning_rate": 1.4547378367830267e-06, "loss": 1.854, "step": 5840 }, { "epoch": 2.7728403839317455, "grad_norm": 0.5914179875660134, "learning_rate": 1.3958850937198453e-06, "loss": 1.8771, "step": 5850 }, { "epoch": 2.7775802820239366, "grad_norm": 0.6150352638939602, "learning_rate": 1.3382306800804045e-06, "loss": 1.7422, "step": 5860 }, { "epoch": 2.7823201801161277, "grad_norm": 0.6205091178728268, "learning_rate": 1.2817760173282954e-06, "loss": 1.8005, "step": 5870 }, { "epoch": 2.7870600782083184, "grad_norm": 0.6352299718478237, "learning_rate": 1.2265224973474042e-06, "loss": 1.7703, "step": 5880 }, { "epoch": 2.7917999763005095, "grad_norm": 0.6466624089179797, "learning_rate": 1.1724714824075333e-06, "loss": 1.8315, "step": 5890 }, { "epoch": 2.7965398743927006, "grad_norm": 0.5968151491811187, "learning_rate": 1.1196243051308787e-06, "loss": 1.9011, "step": 5900 }, { "epoch": 2.8012797724848917, "grad_norm": 0.6310690230989541, "learning_rate": 1.0679822684591112e-06, "loss": 1.8434, "step": 5910 }, { "epoch": 2.806019670577083, "grad_norm": 0.6459331883257132, "learning_rate": 1.0175466456213034e-06, "loss": 1.7773, "step": 5920 }, { "epoch": 2.8107595686692735, "grad_norm": 0.6898338914840095, "learning_rate": 9.683186801025256e-07, "loss": 1.8417, "step": 5930 }, { "epoch": 2.8154994667614646, "grad_norm": 0.6097250867359322, "learning_rate": 9.202995856131769e-07, "loss": 1.8076, "step": 5940 }, { "epoch": 2.8202393648536557, "grad_norm": 0.6610392263190566, "learning_rate": 8.734905460590581e-07, "loss": 1.7511, "step": 5950 }, { "epoch": 2.824979262945847, "grad_norm": 0.6070988311686517, "learning_rate": 8.278927155121851e-07, "loss": 1.8309, "step": 5960 }, { "epoch": 2.829719161038038, "grad_norm": 0.6261583831010433, "learning_rate": 7.835072181823666e-07, "loss": 1.8377, "step": 5970 }, { "epoch": 2.8344590591302286, "grad_norm": 0.6243423055956993, "learning_rate": 7.403351483894427e-07, "loss": 1.7941, "step": 5980 }, { "epoch": 2.8391989572224197, "grad_norm": 0.702784469663522, "learning_rate": 6.983775705363238e-07, "loss": 1.8042, "step": 5990 }, { "epoch": 2.8439388553146108, "grad_norm": 0.5996597981711203, "learning_rate": 6.576355190827499e-07, "loss": 1.8512, "step": 6000 }, { "epoch": 2.848678753406802, "grad_norm": 0.5539803926109534, "learning_rate": 6.181099985197947e-07, "loss": 1.8558, "step": 6010 }, { "epoch": 2.853418651498993, "grad_norm": 0.5462268948543724, "learning_rate": 5.798019833450629e-07, "loss": 1.7838, "step": 6020 }, { "epoch": 2.8581585495911837, "grad_norm": 0.6522918616165346, "learning_rate": 5.4271241803871e-07, "loss": 1.8523, "step": 6030 }, { "epoch": 2.8628984476833748, "grad_norm": 0.6013569849197028, "learning_rate": 5.068422170401377e-07, "loss": 1.8239, "step": 6040 }, { "epoch": 2.867638345775566, "grad_norm": 0.6217056805780841, "learning_rate": 4.72192264725424e-07, "loss": 1.8316, "step": 6050 }, { "epoch": 2.872378243867757, "grad_norm": 0.6047869013985818, "learning_rate": 4.387634153855791e-07, "loss": 1.8189, "step": 6060 }, { "epoch": 2.877118141959948, "grad_norm": 0.6730414277089524, "learning_rate": 4.065564932054067e-07, "loss": 1.7824, "step": 6070 }, { "epoch": 2.8818580400521387, "grad_norm": 0.612791047561647, "learning_rate": 3.755722922432481e-07, "loss": 1.7867, "step": 6080 }, { "epoch": 2.88659793814433, "grad_norm": 0.6615842561782111, "learning_rate": 3.4581157641137563e-07, "loss": 1.8359, "step": 6090 }, { "epoch": 2.891337836236521, "grad_norm": 0.6358101876016702, "learning_rate": 3.1727507945714663e-07, "loss": 1.8628, "step": 6100 }, { "epoch": 2.896077734328712, "grad_norm": 0.5951921137175086, "learning_rate": 2.8996350494495116e-07, "loss": 1.8516, "step": 6110 }, { "epoch": 2.900817632420903, "grad_norm": 0.6310271682459363, "learning_rate": 2.6387752623883156e-07, "loss": 1.8437, "step": 6120 }, { "epoch": 2.905557530513094, "grad_norm": 0.6305755436522482, "learning_rate": 2.390177864858956e-07, "loss": 1.8514, "step": 6130 }, { "epoch": 2.910297428605285, "grad_norm": 0.6404150710185624, "learning_rate": 2.1538489860044587e-07, "loss": 1.8186, "step": 6140 }, { "epoch": 2.915037326697476, "grad_norm": 0.6158013141692098, "learning_rate": 1.92979445248892e-07, "loss": 1.8083, "step": 6150 }, { "epoch": 2.919777224789667, "grad_norm": 0.6416671093424775, "learning_rate": 1.7180197883537308e-07, "loss": 1.7786, "step": 6160 }, { "epoch": 2.9245171228818583, "grad_norm": 0.5582605199061633, "learning_rate": 1.518530214881242e-07, "loss": 1.7976, "step": 6170 }, { "epoch": 2.929257020974049, "grad_norm": 0.6106802327952866, "learning_rate": 1.3313306504663115e-07, "loss": 1.7604, "step": 6180 }, { "epoch": 2.93399691906624, "grad_norm": 0.649320638486437, "learning_rate": 1.1564257104947352e-07, "loss": 1.8441, "step": 6190 }, { "epoch": 2.938736817158431, "grad_norm": 0.5884577603080124, "learning_rate": 9.938197072298372e-08, "loss": 1.8196, "step": 6200 }, { "epoch": 2.9434767152506223, "grad_norm": 0.6392485935256708, "learning_rate": 8.435166497057222e-08, "loss": 1.857, "step": 6210 }, { "epoch": 2.9482166133428134, "grad_norm": 0.6506401892518179, "learning_rate": 7.055202436287433e-08, "loss": 1.7725, "step": 6220 }, { "epoch": 2.952956511435004, "grad_norm": 0.6149298488489828, "learning_rate": 5.7983389128596355e-08, "loss": 1.8946, "step": 6230 }, { "epoch": 2.957696409527195, "grad_norm": 0.5722181216171393, "learning_rate": 4.664606914615011e-08, "loss": 1.8542, "step": 6240 }, { "epoch": 2.9624363076193863, "grad_norm": 0.6428450313630513, "learning_rate": 3.654034393598127e-08, "loss": 1.824, "step": 6250 }, { "epoch": 2.9671762057115774, "grad_norm": 0.6329021168786573, "learning_rate": 2.766646265369155e-08, "loss": 1.8012, "step": 6260 }, { "epoch": 2.9719161038037685, "grad_norm": 0.6406715656233972, "learning_rate": 2.0024644083921352e-08, "loss": 1.8472, "step": 6270 }, { "epoch": 2.976656001895959, "grad_norm": 0.5842266635593326, "learning_rate": 1.3615076634898582e-08, "loss": 1.8102, "step": 6280 }, { "epoch": 2.9813958999881502, "grad_norm": 0.6430039656205391, "learning_rate": 8.437918333864536e-09, "loss": 1.7935, "step": 6290 }, { "epoch": 2.9861357980803414, "grad_norm": 0.6055802510109696, "learning_rate": 4.493296823104842e-09, "loss": 1.8425, "step": 6300 }, { "epoch": 2.990875696172532, "grad_norm": 0.5757552404684133, "learning_rate": 1.781309356863048e-09, "loss": 1.8636, "step": 6310 }, { "epoch": 2.9956155942647236, "grad_norm": 0.602338679600079, "learning_rate": 3.0202279890922947e-10, "loss": 1.7555, "step": 6320 }, { "epoch": 2.998933522929257, "step": 6327, "total_flos": 3180599149854720.0, "train_loss": 1.9022130669246677, "train_runtime": 57110.8809, "train_samples_per_second": 0.887, "train_steps_per_second": 0.111 } ], "logging_steps": 10, "max_steps": 6327, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3180599149854720.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }