diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,4466 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.998933522929257, + "eval_steps": 500, + "global_step": 6327, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.004739898092191018, + "grad_norm": 0.36863938554596193, + "learning_rate": 9.999938362758687e-05, + "loss": 2.2013, + "step": 10 + }, + { + "epoch": 0.009479796184382036, + "grad_norm": 0.500081400365978, + "learning_rate": 9.999753452554404e-05, + "loss": 2.0963, + "step": 20 + }, + { + "epoch": 0.014219694276573054, + "grad_norm": 0.4147328978072978, + "learning_rate": 9.999445273946093e-05, + "loss": 2.1148, + "step": 30 + }, + { + "epoch": 0.018959592368764072, + "grad_norm": 0.35930434339916095, + "learning_rate": 9.999013834531869e-05, + "loss": 2.1787, + "step": 40 + }, + { + "epoch": 0.02369949046095509, + "grad_norm": 0.5423960911916873, + "learning_rate": 9.998459144948825e-05, + "loss": 2.1055, + "step": 50 + }, + { + "epoch": 0.028439388553146108, + "grad_norm": 0.40880961489218204, + "learning_rate": 9.997781218872771e-05, + "loss": 2.1723, + "step": 60 + }, + { + "epoch": 0.033179286645337126, + "grad_norm": 0.43317175193770346, + "learning_rate": 9.99698007301791e-05, + "loss": 2.0316, + "step": 70 + }, + { + "epoch": 0.037919184737528144, + "grad_norm": 0.37892996565691084, + "learning_rate": 9.996055727136406e-05, + "loss": 2.0171, + "step": 80 + }, + { + "epoch": 0.04265908282971916, + "grad_norm": 0.43747110352943336, + "learning_rate": 9.995008204017915e-05, + "loss": 2.0887, + "step": 90 + }, + { + "epoch": 0.04739898092191018, + "grad_norm": 0.39459537932523525, + "learning_rate": 9.993837529489007e-05, + "loss": 2.03, + "step": 100 + }, + { + "epoch": 0.0521388790141012, + "grad_norm": 0.3839963642801344, + "learning_rate": 9.992543732412544e-05, + "loss": 1.9306, + "step": 110 + }, + { + "epoch": 0.056878777106292217, + "grad_norm": 0.40520894461995377, + "learning_rate": 9.99112684468696e-05, + "loss": 2.0425, + "step": 120 + }, + { + "epoch": 0.061618675198483235, + "grad_norm": 0.390483069303289, + "learning_rate": 9.989586901245472e-05, + "loss": 2.087, + "step": 130 + }, + { + "epoch": 0.06635857329067425, + "grad_norm": 0.4180445176279502, + "learning_rate": 9.987923940055228e-05, + "loss": 2.0679, + "step": 140 + }, + { + "epoch": 0.07109847138286526, + "grad_norm": 0.49880430744694115, + "learning_rate": 9.986138002116364e-05, + "loss": 2.0628, + "step": 150 + }, + { + "epoch": 0.07583836947505629, + "grad_norm": 0.4427166571522091, + "learning_rate": 9.984229131460996e-05, + "loss": 2.0681, + "step": 160 + }, + { + "epoch": 0.0805782675672473, + "grad_norm": 0.38471776319499607, + "learning_rate": 9.982197375152129e-05, + "loss": 2.0019, + "step": 170 + }, + { + "epoch": 0.08531816565943832, + "grad_norm": 0.4628072900469101, + "learning_rate": 9.980042783282509e-05, + "loss": 1.9909, + "step": 180 + }, + { + "epoch": 0.09005806375162934, + "grad_norm": 0.3939907069527393, + "learning_rate": 9.977765408973374e-05, + "loss": 2.0713, + "step": 190 + }, + { + "epoch": 0.09479796184382036, + "grad_norm": 0.4184843314019155, + "learning_rate": 9.97536530837315e-05, + "loss": 1.9729, + "step": 200 + }, + { + "epoch": 0.09953785993601137, + "grad_norm": 0.6336861212047761, + "learning_rate": 9.97284254065607e-05, + "loss": 2.0278, + "step": 210 + }, + { + "epoch": 0.1042777580282024, + "grad_norm": 0.39737439720337403, + "learning_rate": 9.970197168020713e-05, + "loss": 2.0603, + "step": 220 + }, + { + "epoch": 0.10901765612039341, + "grad_norm": 0.4161581082817388, + "learning_rate": 9.967429255688468e-05, + "loss": 2.0308, + "step": 230 + }, + { + "epoch": 0.11375755421258443, + "grad_norm": 0.4122381540422074, + "learning_rate": 9.964538871901923e-05, + "loss": 2.1011, + "step": 240 + }, + { + "epoch": 0.11849745230477544, + "grad_norm": 0.40792411841005016, + "learning_rate": 9.961526087923193e-05, + "loss": 2.0535, + "step": 250 + }, + { + "epoch": 0.12323735039696647, + "grad_norm": 0.4298298302428991, + "learning_rate": 9.958390978032157e-05, + "loss": 1.9882, + "step": 260 + }, + { + "epoch": 0.12797724848915748, + "grad_norm": 0.3932772338211237, + "learning_rate": 9.955133619524623e-05, + "loss": 2.0703, + "step": 270 + }, + { + "epoch": 0.1327171465813485, + "grad_norm": 0.4304879913642714, + "learning_rate": 9.951754092710429e-05, + "loss": 2.0661, + "step": 280 + }, + { + "epoch": 0.13745704467353953, + "grad_norm": 0.3933942358750948, + "learning_rate": 9.948252480911458e-05, + "loss": 1.9941, + "step": 290 + }, + { + "epoch": 0.14219694276573053, + "grad_norm": 0.3876898041012675, + "learning_rate": 9.944628870459587e-05, + "loss": 2.001, + "step": 300 + }, + { + "epoch": 0.14693684085792155, + "grad_norm": 0.39971149840828696, + "learning_rate": 9.940883350694556e-05, + "loss": 1.9889, + "step": 310 + }, + { + "epoch": 0.15167673895011258, + "grad_norm": 0.4322868983437022, + "learning_rate": 9.93701601396177e-05, + "loss": 2.019, + "step": 320 + }, + { + "epoch": 0.1564166370423036, + "grad_norm": 0.40679391432223605, + "learning_rate": 9.933026955610014e-05, + "loss": 2.0402, + "step": 330 + }, + { + "epoch": 0.1611565351344946, + "grad_norm": 0.40265148647862, + "learning_rate": 9.928916273989108e-05, + "loss": 1.9488, + "step": 340 + }, + { + "epoch": 0.16589643322668562, + "grad_norm": 0.4119893126018702, + "learning_rate": 9.924684070447483e-05, + "loss": 2.0143, + "step": 350 + }, + { + "epoch": 0.17063633131887665, + "grad_norm": 0.41446853317804655, + "learning_rate": 9.92033044932968e-05, + "loss": 1.9393, + "step": 360 + }, + { + "epoch": 0.17537622941106767, + "grad_norm": 0.4775440242382454, + "learning_rate": 9.915855517973776e-05, + "loss": 1.9899, + "step": 370 + }, + { + "epoch": 0.18011612750325867, + "grad_norm": 0.41303403265485017, + "learning_rate": 9.91125938670874e-05, + "loss": 2.0431, + "step": 380 + }, + { + "epoch": 0.1848560255954497, + "grad_norm": 0.381415505593885, + "learning_rate": 9.906542168851715e-05, + "loss": 1.9778, + "step": 390 + }, + { + "epoch": 0.18959592368764072, + "grad_norm": 0.45202098843075295, + "learning_rate": 9.901703980705219e-05, + "loss": 2.0098, + "step": 400 + }, + { + "epoch": 0.19433582177983175, + "grad_norm": 0.38808197740496003, + "learning_rate": 9.896744941554279e-05, + "loss": 1.9467, + "step": 410 + }, + { + "epoch": 0.19907571987202274, + "grad_norm": 0.40860216072850924, + "learning_rate": 9.891665173663492e-05, + "loss": 2.0267, + "step": 420 + }, + { + "epoch": 0.20381561796421377, + "grad_norm": 0.4068044305771888, + "learning_rate": 9.886464802274009e-05, + "loss": 2.0872, + "step": 430 + }, + { + "epoch": 0.2085555160564048, + "grad_norm": 0.43039544158069454, + "learning_rate": 9.88114395560045e-05, + "loss": 2.0094, + "step": 440 + }, + { + "epoch": 0.21329541414859582, + "grad_norm": 0.37668435282131046, + "learning_rate": 9.875702764827737e-05, + "loss": 2.0032, + "step": 450 + }, + { + "epoch": 0.21803531224078682, + "grad_norm": 0.4289799607032317, + "learning_rate": 9.87014136410787e-05, + "loss": 1.9535, + "step": 460 + }, + { + "epoch": 0.22277521033297784, + "grad_norm": 0.416501457655663, + "learning_rate": 9.864459890556604e-05, + "loss": 2.0246, + "step": 470 + }, + { + "epoch": 0.22751510842516887, + "grad_norm": 0.42709577377722036, + "learning_rate": 9.858658484250082e-05, + "loss": 1.9675, + "step": 480 + }, + { + "epoch": 0.23225500651735986, + "grad_norm": 0.38491345570315816, + "learning_rate": 9.852737288221378e-05, + "loss": 1.9768, + "step": 490 + }, + { + "epoch": 0.2369949046095509, + "grad_norm": 0.4331220698731146, + "learning_rate": 9.846696448456967e-05, + "loss": 1.96, + "step": 500 + }, + { + "epoch": 0.2417348027017419, + "grad_norm": 0.5157356350680703, + "learning_rate": 9.840536113893129e-05, + "loss": 2.0168, + "step": 510 + }, + { + "epoch": 0.24647470079393294, + "grad_norm": 0.42673885807943607, + "learning_rate": 9.834256436412272e-05, + "loss": 1.9192, + "step": 520 + }, + { + "epoch": 0.25121459888612396, + "grad_norm": 0.399056341637914, + "learning_rate": 9.827857570839198e-05, + "loss": 2.009, + "step": 530 + }, + { + "epoch": 0.25595449697831496, + "grad_norm": 0.38514488410609315, + "learning_rate": 9.821339674937274e-05, + "loss": 2.0237, + "step": 540 + }, + { + "epoch": 0.26069439507050596, + "grad_norm": 0.43535566879213633, + "learning_rate": 9.814702909404547e-05, + "loss": 1.9746, + "step": 550 + }, + { + "epoch": 0.265434293162697, + "grad_norm": 0.4277848981360601, + "learning_rate": 9.807947437869788e-05, + "loss": 2.0008, + "step": 560 + }, + { + "epoch": 0.270174191254888, + "grad_norm": 0.42806115487352164, + "learning_rate": 9.801073426888447e-05, + "loss": 2.0819, + "step": 570 + }, + { + "epoch": 0.27491408934707906, + "grad_norm": 0.36287005859609833, + "learning_rate": 9.794081045938554e-05, + "loss": 2.0256, + "step": 580 + }, + { + "epoch": 0.27965398743927006, + "grad_norm": 0.467970576527151, + "learning_rate": 9.786970467416538e-05, + "loss": 2.0221, + "step": 590 + }, + { + "epoch": 0.28439388553146105, + "grad_norm": 0.37993477630266503, + "learning_rate": 9.779741866632977e-05, + "loss": 1.9589, + "step": 600 + }, + { + "epoch": 0.2891337836236521, + "grad_norm": 0.44198107142469956, + "learning_rate": 9.772395421808274e-05, + "loss": 2.0035, + "step": 610 + }, + { + "epoch": 0.2938736817158431, + "grad_norm": 0.44573447679188816, + "learning_rate": 9.764931314068267e-05, + "loss": 1.9909, + "step": 620 + }, + { + "epoch": 0.2986135798080341, + "grad_norm": 0.4731340699659092, + "learning_rate": 9.757349727439759e-05, + "loss": 2.0103, + "step": 630 + }, + { + "epoch": 0.30335347790022515, + "grad_norm": 0.3963283837850387, + "learning_rate": 9.749650848845984e-05, + "loss": 2.0639, + "step": 640 + }, + { + "epoch": 0.30809337599241615, + "grad_norm": 0.3884422717238912, + "learning_rate": 9.741834868101998e-05, + "loss": 2.0342, + "step": 650 + }, + { + "epoch": 0.3128332740846072, + "grad_norm": 0.42096628799860736, + "learning_rate": 9.733901977909997e-05, + "loss": 2.0037, + "step": 660 + }, + { + "epoch": 0.3175731721767982, + "grad_norm": 0.3922372868315195, + "learning_rate": 9.725852373854568e-05, + "loss": 2.0327, + "step": 670 + }, + { + "epoch": 0.3223130702689892, + "grad_norm": 0.37724258160489493, + "learning_rate": 9.717686254397866e-05, + "loss": 1.9996, + "step": 680 + }, + { + "epoch": 0.32705296836118025, + "grad_norm": 0.36849429342184464, + "learning_rate": 9.70940382087472e-05, + "loss": 1.9789, + "step": 690 + }, + { + "epoch": 0.33179286645337125, + "grad_norm": 0.38001698944458373, + "learning_rate": 9.701005277487673e-05, + "loss": 1.8886, + "step": 700 + }, + { + "epoch": 0.33653276454556225, + "grad_norm": 0.4434394537121414, + "learning_rate": 9.692490831301944e-05, + "loss": 2.0773, + "step": 710 + }, + { + "epoch": 0.3412726626377533, + "grad_norm": 0.44409242659624243, + "learning_rate": 9.683860692240321e-05, + "loss": 1.9944, + "step": 720 + }, + { + "epoch": 0.3460125607299443, + "grad_norm": 0.3706038723114169, + "learning_rate": 9.675115073077989e-05, + "loss": 1.9399, + "step": 730 + }, + { + "epoch": 0.35075245882213535, + "grad_norm": 0.3775340444246396, + "learning_rate": 9.666254189437286e-05, + "loss": 2.0434, + "step": 740 + }, + { + "epoch": 0.35549235691432635, + "grad_norm": 0.39740898678838216, + "learning_rate": 9.657278259782378e-05, + "loss": 2.0483, + "step": 750 + }, + { + "epoch": 0.36023225500651734, + "grad_norm": 0.3856650140837026, + "learning_rate": 9.648187505413886e-05, + "loss": 1.9621, + "step": 760 + }, + { + "epoch": 0.3649721530987084, + "grad_norm": 0.49084336306431187, + "learning_rate": 9.638982150463415e-05, + "loss": 1.9878, + "step": 770 + }, + { + "epoch": 0.3697120511908994, + "grad_norm": 0.41318948101107866, + "learning_rate": 9.629662421888039e-05, + "loss": 2.0805, + "step": 780 + }, + { + "epoch": 0.3744519492830904, + "grad_norm": 0.402590356367594, + "learning_rate": 9.620228549464703e-05, + "loss": 2.0258, + "step": 790 + }, + { + "epoch": 0.37919184737528144, + "grad_norm": 0.4461694641117838, + "learning_rate": 9.610680765784556e-05, + "loss": 1.9692, + "step": 800 + }, + { + "epoch": 0.38393174546747244, + "grad_norm": 0.41581795351534184, + "learning_rate": 9.601019306247215e-05, + "loss": 2.022, + "step": 810 + }, + { + "epoch": 0.3886716435596635, + "grad_norm": 0.4182347418587252, + "learning_rate": 9.591244409054965e-05, + "loss": 1.9989, + "step": 820 + }, + { + "epoch": 0.3934115416518545, + "grad_norm": 0.36463111311757684, + "learning_rate": 9.581356315206885e-05, + "loss": 2.0483, + "step": 830 + }, + { + "epoch": 0.3981514397440455, + "grad_norm": 0.4636476781338481, + "learning_rate": 9.571355268492907e-05, + "loss": 1.9491, + "step": 840 + }, + { + "epoch": 0.40289133783623654, + "grad_norm": 0.43027600259738763, + "learning_rate": 9.561241515487802e-05, + "loss": 1.9423, + "step": 850 + }, + { + "epoch": 0.40763123592842754, + "grad_norm": 0.43322329785996827, + "learning_rate": 9.551015305545104e-05, + "loss": 1.9349, + "step": 860 + }, + { + "epoch": 0.41237113402061853, + "grad_norm": 0.3900423005352424, + "learning_rate": 9.540676890790962e-05, + "loss": 1.9571, + "step": 870 + }, + { + "epoch": 0.4171110321128096, + "grad_norm": 0.3736027589992883, + "learning_rate": 9.53022652611792e-05, + "loss": 2.033, + "step": 880 + }, + { + "epoch": 0.4218509302050006, + "grad_norm": 0.4412678924097936, + "learning_rate": 9.519664469178638e-05, + "loss": 1.9928, + "step": 890 + }, + { + "epoch": 0.42659082829719164, + "grad_norm": 0.36064586995797043, + "learning_rate": 9.508990980379537e-05, + "loss": 2.0181, + "step": 900 + }, + { + "epoch": 0.43133072638938263, + "grad_norm": 0.36982453028008294, + "learning_rate": 9.498206322874381e-05, + "loss": 2.0118, + "step": 910 + }, + { + "epoch": 0.43607062448157363, + "grad_norm": 0.4936789348648113, + "learning_rate": 9.487310762557784e-05, + "loss": 2.0388, + "step": 920 + }, + { + "epoch": 0.4408105225737647, + "grad_norm": 0.4192120475618224, + "learning_rate": 9.476304568058657e-05, + "loss": 2.0001, + "step": 930 + }, + { + "epoch": 0.4455504206659557, + "grad_norm": 0.4212248975591549, + "learning_rate": 9.465188010733586e-05, + "loss": 2.0464, + "step": 940 + }, + { + "epoch": 0.4502903187581467, + "grad_norm": 0.4111853146435081, + "learning_rate": 9.453961364660143e-05, + "loss": 2.0118, + "step": 950 + }, + { + "epoch": 0.45503021685033773, + "grad_norm": 0.3911083150496816, + "learning_rate": 9.442624906630124e-05, + "loss": 1.9256, + "step": 960 + }, + { + "epoch": 0.45977011494252873, + "grad_norm": 0.4275198886604283, + "learning_rate": 9.431178916142731e-05, + "loss": 2.0142, + "step": 970 + }, + { + "epoch": 0.4645100130347197, + "grad_norm": 0.41213645663674664, + "learning_rate": 9.419623675397672e-05, + "loss": 1.9863, + "step": 980 + }, + { + "epoch": 0.4692499111269108, + "grad_norm": 0.39744532831875506, + "learning_rate": 9.407959469288214e-05, + "loss": 1.963, + "step": 990 + }, + { + "epoch": 0.4739898092191018, + "grad_norm": 0.40358506493166846, + "learning_rate": 9.396186585394153e-05, + "loss": 1.9724, + "step": 1000 + }, + { + "epoch": 0.47872970731129283, + "grad_norm": 0.3715075397009002, + "learning_rate": 9.384305313974719e-05, + "loss": 1.9564, + "step": 1010 + }, + { + "epoch": 0.4834696054034838, + "grad_norm": 0.41249417731334614, + "learning_rate": 9.372315947961434e-05, + "loss": 2.0089, + "step": 1020 + }, + { + "epoch": 0.4882095034956748, + "grad_norm": 0.4477075629260475, + "learning_rate": 9.360218782950873e-05, + "loss": 2.0249, + "step": 1030 + }, + { + "epoch": 0.4929494015878659, + "grad_norm": 0.41335031918044873, + "learning_rate": 9.34801411719739e-05, + "loss": 2.0439, + "step": 1040 + }, + { + "epoch": 0.4976892996800569, + "grad_norm": 0.4023689824634566, + "learning_rate": 9.335702251605756e-05, + "loss": 2.0278, + "step": 1050 + }, + { + "epoch": 0.5024291977722479, + "grad_norm": 0.37476123227339486, + "learning_rate": 9.32328348972374e-05, + "loss": 2.0854, + "step": 1060 + }, + { + "epoch": 0.5071690958644389, + "grad_norm": 0.3680109272331818, + "learning_rate": 9.310758137734634e-05, + "loss": 2.0505, + "step": 1070 + }, + { + "epoch": 0.5119089939566299, + "grad_norm": 0.47590335433852127, + "learning_rate": 9.298126504449697e-05, + "loss": 1.9342, + "step": 1080 + }, + { + "epoch": 0.5166488920488209, + "grad_norm": 0.443747158773761, + "learning_rate": 9.285388901300537e-05, + "loss": 2.0338, + "step": 1090 + }, + { + "epoch": 0.5213887901410119, + "grad_norm": 0.4300619230217585, + "learning_rate": 9.272545642331443e-05, + "loss": 1.9431, + "step": 1100 + }, + { + "epoch": 0.526128688233203, + "grad_norm": 0.4068927208227842, + "learning_rate": 9.259597044191636e-05, + "loss": 1.9639, + "step": 1110 + }, + { + "epoch": 0.530868586325394, + "grad_norm": 0.3904780080331756, + "learning_rate": 9.246543426127463e-05, + "loss": 2.044, + "step": 1120 + }, + { + "epoch": 0.535608484417585, + "grad_norm": 0.4074988084895911, + "learning_rate": 9.233385109974528e-05, + "loss": 1.9209, + "step": 1130 + }, + { + "epoch": 0.540348382509776, + "grad_norm": 0.48971289458578504, + "learning_rate": 9.220122420149753e-05, + "loss": 1.9405, + "step": 1140 + }, + { + "epoch": 0.545088280601967, + "grad_norm": 0.4560990819156225, + "learning_rate": 9.206755683643383e-05, + "loss": 1.9754, + "step": 1150 + }, + { + "epoch": 0.5498281786941581, + "grad_norm": 0.4953771996336736, + "learning_rate": 9.193285230010923e-05, + "loss": 1.9832, + "step": 1160 + }, + { + "epoch": 0.5545680767863491, + "grad_norm": 0.452270837264993, + "learning_rate": 9.179711391365016e-05, + "loss": 2.0267, + "step": 1170 + }, + { + "epoch": 0.5593079748785401, + "grad_norm": 0.38839940667413064, + "learning_rate": 9.166034502367246e-05, + "loss": 2.0303, + "step": 1180 + }, + { + "epoch": 0.5640478729707311, + "grad_norm": 0.4434400621892702, + "learning_rate": 9.152254900219899e-05, + "loss": 2.019, + "step": 1190 + }, + { + "epoch": 0.5687877710629221, + "grad_norm": 0.4265655972195879, + "learning_rate": 9.138372924657638e-05, + "loss": 1.9578, + "step": 1200 + }, + { + "epoch": 0.5735276691551132, + "grad_norm": 0.37712073893593084, + "learning_rate": 9.124388917939135e-05, + "loss": 1.9002, + "step": 1210 + }, + { + "epoch": 0.5782675672473042, + "grad_norm": 0.3967821230664083, + "learning_rate": 9.110303224838628e-05, + "loss": 1.9982, + "step": 1220 + }, + { + "epoch": 0.5830074653394952, + "grad_norm": 0.4225910574667248, + "learning_rate": 9.096116192637424e-05, + "loss": 1.9999, + "step": 1230 + }, + { + "epoch": 0.5877473634316862, + "grad_norm": 0.46005143244561764, + "learning_rate": 9.081828171115334e-05, + "loss": 1.9269, + "step": 1240 + }, + { + "epoch": 0.5924872615238772, + "grad_norm": 0.41650738683050376, + "learning_rate": 9.067439512542048e-05, + "loss": 2.0138, + "step": 1250 + }, + { + "epoch": 0.5972271596160682, + "grad_norm": 0.4595664788322495, + "learning_rate": 9.052950571668457e-05, + "loss": 1.8902, + "step": 1260 + }, + { + "epoch": 0.6019670577082593, + "grad_norm": 0.47181766838174233, + "learning_rate": 9.038361705717897e-05, + "loss": 2.0354, + "step": 1270 + }, + { + "epoch": 0.6067069558004503, + "grad_norm": 0.4016620461236779, + "learning_rate": 9.023673274377349e-05, + "loss": 2.0428, + "step": 1280 + }, + { + "epoch": 0.6114468538926413, + "grad_norm": 0.44582424551905314, + "learning_rate": 9.00888563978857e-05, + "loss": 1.9205, + "step": 1290 + }, + { + "epoch": 0.6161867519848323, + "grad_norm": 0.4731092970060822, + "learning_rate": 8.993999166539155e-05, + "loss": 1.9468, + "step": 1300 + }, + { + "epoch": 0.6209266500770233, + "grad_norm": 0.41403788063445784, + "learning_rate": 8.979014221653569e-05, + "loss": 1.967, + "step": 1310 + }, + { + "epoch": 0.6256665481692144, + "grad_norm": 0.3824681634104647, + "learning_rate": 8.963931174584072e-05, + "loss": 1.9764, + "step": 1320 + }, + { + "epoch": 0.6304064462614054, + "grad_norm": 0.3979138111413701, + "learning_rate": 8.94875039720163e-05, + "loss": 2.0262, + "step": 1330 + }, + { + "epoch": 0.6351463443535964, + "grad_norm": 0.41027150705022153, + "learning_rate": 8.93347226378674e-05, + "loss": 1.9379, + "step": 1340 + }, + { + "epoch": 0.6398862424457874, + "grad_norm": 0.46333301444068553, + "learning_rate": 8.9180971510202e-05, + "loss": 1.9551, + "step": 1350 + }, + { + "epoch": 0.6446261405379784, + "grad_norm": 0.39959859369206574, + "learning_rate": 8.902625437973823e-05, + "loss": 1.9199, + "step": 1360 + }, + { + "epoch": 0.6493660386301695, + "grad_norm": 0.42731835258341894, + "learning_rate": 8.887057506101096e-05, + "loss": 2.0178, + "step": 1370 + }, + { + "epoch": 0.6541059367223605, + "grad_norm": 0.43891265274307517, + "learning_rate": 8.871393739227764e-05, + "loss": 1.9369, + "step": 1380 + }, + { + "epoch": 0.6588458348145515, + "grad_norm": 0.4314210574368562, + "learning_rate": 8.855634523542384e-05, + "loss": 2.0049, + "step": 1390 + }, + { + "epoch": 0.6635857329067425, + "grad_norm": 0.44613138847149775, + "learning_rate": 8.839780247586785e-05, + "loss": 1.9509, + "step": 1400 + }, + { + "epoch": 0.6683256309989335, + "grad_norm": 0.4379460820834945, + "learning_rate": 8.823831302246498e-05, + "loss": 1.9541, + "step": 1410 + }, + { + "epoch": 0.6730655290911245, + "grad_norm": 0.3682639471382051, + "learning_rate": 8.807788080741124e-05, + "loss": 2.0064, + "step": 1420 + }, + { + "epoch": 0.6778054271833156, + "grad_norm": 0.3981445155765943, + "learning_rate": 8.791650978614627e-05, + "loss": 1.9151, + "step": 1430 + }, + { + "epoch": 0.6825453252755066, + "grad_norm": 0.3868845773205047, + "learning_rate": 8.77542039372559e-05, + "loss": 2.0033, + "step": 1440 + }, + { + "epoch": 0.6872852233676976, + "grad_norm": 0.4065050795968265, + "learning_rate": 8.759096726237406e-05, + "loss": 1.9333, + "step": 1450 + }, + { + "epoch": 0.6920251214598886, + "grad_norm": 0.4019451177579478, + "learning_rate": 8.742680378608405e-05, + "loss": 1.9738, + "step": 1460 + }, + { + "epoch": 0.6967650195520796, + "grad_norm": 0.40929290402886576, + "learning_rate": 8.726171755581943e-05, + "loss": 1.9054, + "step": 1470 + }, + { + "epoch": 0.7015049176442707, + "grad_norm": 0.4521322208310143, + "learning_rate": 8.709571264176409e-05, + "loss": 2.038, + "step": 1480 + }, + { + "epoch": 0.7062448157364617, + "grad_norm": 0.4152045328204035, + "learning_rate": 8.692879313675201e-05, + "loss": 2.0632, + "step": 1490 + }, + { + "epoch": 0.7109847138286527, + "grad_norm": 0.4153887781497306, + "learning_rate": 8.676096315616633e-05, + "loss": 1.9658, + "step": 1500 + }, + { + "epoch": 0.7157246119208437, + "grad_norm": 0.4421939758182222, + "learning_rate": 8.659222683783785e-05, + "loss": 1.9318, + "step": 1510 + }, + { + "epoch": 0.7204645100130347, + "grad_norm": 0.40964882006156955, + "learning_rate": 8.642258834194306e-05, + "loss": 1.9843, + "step": 1520 + }, + { + "epoch": 0.7252044081052257, + "grad_norm": 0.4083908197791484, + "learning_rate": 8.625205185090148e-05, + "loss": 1.9828, + "step": 1530 + }, + { + "epoch": 0.7299443061974168, + "grad_norm": 0.39713303306109243, + "learning_rate": 8.608062156927267e-05, + "loss": 1.9957, + "step": 1540 + }, + { + "epoch": 0.7346842042896078, + "grad_norm": 0.3984748196137378, + "learning_rate": 8.59083017236525e-05, + "loss": 1.9756, + "step": 1550 + }, + { + "epoch": 0.7394241023817988, + "grad_norm": 0.3801131175331665, + "learning_rate": 8.57350965625689e-05, + "loss": 2.0876, + "step": 1560 + }, + { + "epoch": 0.7441640004739898, + "grad_norm": 0.40526485533564677, + "learning_rate": 8.556101035637723e-05, + "loss": 1.9273, + "step": 1570 + }, + { + "epoch": 0.7489038985661808, + "grad_norm": 0.43256807999674307, + "learning_rate": 8.538604739715487e-05, + "loss": 1.9965, + "step": 1580 + }, + { + "epoch": 0.7536437966583719, + "grad_norm": 0.4089571388848955, + "learning_rate": 8.521021199859547e-05, + "loss": 1.9838, + "step": 1590 + }, + { + "epoch": 0.7583836947505629, + "grad_norm": 0.43989226476544846, + "learning_rate": 8.503350849590261e-05, + "loss": 2.0101, + "step": 1600 + }, + { + "epoch": 0.7631235928427539, + "grad_norm": 0.4312349465343795, + "learning_rate": 8.485594124568286e-05, + "loss": 2.0024, + "step": 1610 + }, + { + "epoch": 0.7678634909349449, + "grad_norm": 0.42870468778423404, + "learning_rate": 8.467751462583837e-05, + "loss": 1.9171, + "step": 1620 + }, + { + "epoch": 0.7726033890271359, + "grad_norm": 0.37297491856173187, + "learning_rate": 8.449823303545902e-05, + "loss": 1.9234, + "step": 1630 + }, + { + "epoch": 0.777343287119327, + "grad_norm": 0.43903627896277525, + "learning_rate": 8.431810089471386e-05, + "loss": 2.0138, + "step": 1640 + }, + { + "epoch": 0.782083185211518, + "grad_norm": 0.4356441070614573, + "learning_rate": 8.413712264474218e-05, + "loss": 1.9822, + "step": 1650 + }, + { + "epoch": 0.786823083303709, + "grad_norm": 0.42844869008890196, + "learning_rate": 8.395530274754401e-05, + "loss": 1.9615, + "step": 1660 + }, + { + "epoch": 0.7915629813959, + "grad_norm": 0.442280918540681, + "learning_rate": 8.377264568587012e-05, + "loss": 1.9835, + "step": 1670 + }, + { + "epoch": 0.796302879488091, + "grad_norm": 0.42858220049882395, + "learning_rate": 8.358915596311143e-05, + "loss": 1.9043, + "step": 1680 + }, + { + "epoch": 0.801042777580282, + "grad_norm": 0.388683268775689, + "learning_rate": 8.340483810318809e-05, + "loss": 2.0451, + "step": 1690 + }, + { + "epoch": 0.8057826756724731, + "grad_norm": 0.4116698984896444, + "learning_rate": 8.321969665043785e-05, + "loss": 1.9792, + "step": 1700 + }, + { + "epoch": 0.8105225737646641, + "grad_norm": 0.40384036708963345, + "learning_rate": 8.303373616950408e-05, + "loss": 1.8407, + "step": 1710 + }, + { + "epoch": 0.8152624718568551, + "grad_norm": 0.4680015183031998, + "learning_rate": 8.28469612452232e-05, + "loss": 1.9616, + "step": 1720 + }, + { + "epoch": 0.8200023699490461, + "grad_norm": 0.43443236620799985, + "learning_rate": 8.265937648251162e-05, + "loss": 1.9879, + "step": 1730 + }, + { + "epoch": 0.8247422680412371, + "grad_norm": 0.4892981794701289, + "learning_rate": 8.247098650625229e-05, + "loss": 1.9988, + "step": 1740 + }, + { + "epoch": 0.8294821661334282, + "grad_norm": 0.41120558715230104, + "learning_rate": 8.228179596118055e-05, + "loss": 2.0057, + "step": 1750 + }, + { + "epoch": 0.8342220642256192, + "grad_norm": 0.3856884225256909, + "learning_rate": 8.209180951176972e-05, + "loss": 2.0345, + "step": 1760 + }, + { + "epoch": 0.8389619623178102, + "grad_norm": 0.43262267182183567, + "learning_rate": 8.190103184211606e-05, + "loss": 2.0506, + "step": 1770 + }, + { + "epoch": 0.8437018604100012, + "grad_norm": 0.46227543956491046, + "learning_rate": 8.170946765582327e-05, + "loss": 1.9537, + "step": 1780 + }, + { + "epoch": 0.8484417585021922, + "grad_norm": 0.41122944892391, + "learning_rate": 8.151712167588654e-05, + "loss": 1.9481, + "step": 1790 + }, + { + "epoch": 0.8531816565943833, + "grad_norm": 0.4762971181475547, + "learning_rate": 8.13239986445761e-05, + "loss": 1.969, + "step": 1800 + }, + { + "epoch": 0.8579215546865743, + "grad_norm": 0.41348450657088276, + "learning_rate": 8.113010332332032e-05, + "loss": 2.0127, + "step": 1810 + }, + { + "epoch": 0.8626614527787653, + "grad_norm": 0.41355376759860496, + "learning_rate": 8.093544049258826e-05, + "loss": 1.9378, + "step": 1820 + }, + { + "epoch": 0.8674013508709563, + "grad_norm": 0.4739386141603482, + "learning_rate": 8.074001495177187e-05, + "loss": 1.9548, + "step": 1830 + }, + { + "epoch": 0.8721412489631473, + "grad_norm": 0.4067937473126016, + "learning_rate": 8.054383151906766e-05, + "loss": 1.9588, + "step": 1840 + }, + { + "epoch": 0.8768811470553383, + "grad_norm": 0.4603727127637402, + "learning_rate": 8.034689503135783e-05, + "loss": 1.9616, + "step": 1850 + }, + { + "epoch": 0.8816210451475294, + "grad_norm": 0.404919540874673, + "learning_rate": 8.014921034409115e-05, + "loss": 1.9476, + "step": 1860 + }, + { + "epoch": 0.8863609432397204, + "grad_norm": 0.39850400899429533, + "learning_rate": 7.99507823311631e-05, + "loss": 1.9603, + "step": 1870 + }, + { + "epoch": 0.8911008413319114, + "grad_norm": 0.48693274229874695, + "learning_rate": 7.97516158847958e-05, + "loss": 2.0121, + "step": 1880 + }, + { + "epoch": 0.8958407394241024, + "grad_norm": 0.45401122715232545, + "learning_rate": 7.955171591541739e-05, + "loss": 1.8593, + "step": 1890 + }, + { + "epoch": 0.9005806375162934, + "grad_norm": 0.38605278944495364, + "learning_rate": 7.935108735154094e-05, + "loss": 1.9199, + "step": 1900 + }, + { + "epoch": 0.9053205356084845, + "grad_norm": 0.4453838492498413, + "learning_rate": 7.914973513964291e-05, + "loss": 1.9354, + "step": 1910 + }, + { + "epoch": 0.9100604337006755, + "grad_norm": 0.4123431078009058, + "learning_rate": 7.894766424404126e-05, + "loss": 1.9807, + "step": 1920 + }, + { + "epoch": 0.9148003317928665, + "grad_norm": 0.43369573713775106, + "learning_rate": 7.874487964677301e-05, + "loss": 1.9707, + "step": 1930 + }, + { + "epoch": 0.9195402298850575, + "grad_norm": 0.3949770503185179, + "learning_rate": 7.854138634747145e-05, + "loss": 1.9742, + "step": 1940 + }, + { + "epoch": 0.9242801279772485, + "grad_norm": 0.4224215984268503, + "learning_rate": 7.833718936324277e-05, + "loss": 1.9465, + "step": 1950 + }, + { + "epoch": 0.9290200260694395, + "grad_norm": 0.5228997588486322, + "learning_rate": 7.813229372854251e-05, + "loss": 1.9454, + "step": 1960 + }, + { + "epoch": 0.9337599241616306, + "grad_norm": 0.42165180512522465, + "learning_rate": 7.792670449505135e-05, + "loss": 1.9175, + "step": 1970 + }, + { + "epoch": 0.9384998222538216, + "grad_norm": 0.40378336800384856, + "learning_rate": 7.772042673155055e-05, + "loss": 1.9237, + "step": 1980 + }, + { + "epoch": 0.9432397203460126, + "grad_norm": 0.45740238886085255, + "learning_rate": 7.751346552379706e-05, + "loss": 1.9752, + "step": 1990 + }, + { + "epoch": 0.9479796184382036, + "grad_norm": 0.39149703066060726, + "learning_rate": 7.730582597439799e-05, + "loss": 1.98, + "step": 2000 + }, + { + "epoch": 0.9527195165303946, + "grad_norm": 0.4198989958604622, + "learning_rate": 7.709751320268499e-05, + "loss": 1.9937, + "step": 2010 + }, + { + "epoch": 0.9574594146225857, + "grad_norm": 0.45036655944797305, + "learning_rate": 7.688853234458786e-05, + "loss": 1.9439, + "step": 2020 + }, + { + "epoch": 0.9621993127147767, + "grad_norm": 0.47886989965002774, + "learning_rate": 7.667888855250806e-05, + "loss": 1.8984, + "step": 2030 + }, + { + "epoch": 0.9669392108069677, + "grad_norm": 0.4485436591345206, + "learning_rate": 7.646858699519158e-05, + "loss": 1.9997, + "step": 2040 + }, + { + "epoch": 0.9716791088991587, + "grad_norm": 0.4089350286618743, + "learning_rate": 7.625763285760154e-05, + "loss": 2.0561, + "step": 2050 + }, + { + "epoch": 0.9764190069913496, + "grad_norm": 0.5012148973934161, + "learning_rate": 7.604603134079039e-05, + "loss": 1.9108, + "step": 2060 + }, + { + "epoch": 0.9811589050835408, + "grad_norm": 0.4193397192808331, + "learning_rate": 7.583378766177163e-05, + "loss": 2.0375, + "step": 2070 + }, + { + "epoch": 0.9858988031757318, + "grad_norm": 0.3996742152514563, + "learning_rate": 7.56209070533912e-05, + "loss": 1.8992, + "step": 2080 + }, + { + "epoch": 0.9906387012679227, + "grad_norm": 0.43312783729617976, + "learning_rate": 7.540739476419847e-05, + "loss": 2.0202, + "step": 2090 + }, + { + "epoch": 0.9953785993601137, + "grad_norm": 0.47876561721756805, + "learning_rate": 7.519325605831684e-05, + "loss": 1.9258, + "step": 2100 + }, + { + "epoch": 1.0001184974523047, + "grad_norm": 0.40845159679128945, + "learning_rate": 7.497849621531396e-05, + "loss": 1.8963, + "step": 2110 + }, + { + "epoch": 1.0048583955444959, + "grad_norm": 0.4911320886031023, + "learning_rate": 7.476312053007151e-05, + "loss": 1.8763, + "step": 2120 + }, + { + "epoch": 1.0095982936366867, + "grad_norm": 0.4341191300612264, + "learning_rate": 7.454713431265475e-05, + "loss": 1.9345, + "step": 2130 + }, + { + "epoch": 1.0143381917288778, + "grad_norm": 0.44526984352662835, + "learning_rate": 7.43305428881815e-05, + "loss": 1.9666, + "step": 2140 + }, + { + "epoch": 1.019078089821069, + "grad_norm": 0.45021419491727926, + "learning_rate": 7.411335159669093e-05, + "loss": 1.9683, + "step": 2150 + }, + { + "epoch": 1.0238179879132598, + "grad_norm": 0.46367987121746707, + "learning_rate": 7.389556579301186e-05, + "loss": 1.884, + "step": 2160 + }, + { + "epoch": 1.028557886005451, + "grad_norm": 0.518631039907863, + "learning_rate": 7.367719084663074e-05, + "loss": 1.8473, + "step": 2170 + }, + { + "epoch": 1.0332977840976418, + "grad_norm": 0.4686244164357671, + "learning_rate": 7.345823214155927e-05, + "loss": 1.8894, + "step": 2180 + }, + { + "epoch": 1.038037682189833, + "grad_norm": 0.5124536145999882, + "learning_rate": 7.323869507620169e-05, + "loss": 1.886, + "step": 2190 + }, + { + "epoch": 1.0427775802820238, + "grad_norm": 0.428865165913033, + "learning_rate": 7.30185850632216e-05, + "loss": 1.8934, + "step": 2200 + }, + { + "epoch": 1.047517478374215, + "grad_norm": 0.4575909980653946, + "learning_rate": 7.27979075294086e-05, + "loss": 1.8793, + "step": 2210 + }, + { + "epoch": 1.052257376466406, + "grad_norm": 0.46819042427920937, + "learning_rate": 7.257666791554448e-05, + "loss": 1.9177, + "step": 2220 + }, + { + "epoch": 1.056997274558597, + "grad_norm": 0.5869490097444697, + "learning_rate": 7.2354871676269e-05, + "loss": 1.8888, + "step": 2230 + }, + { + "epoch": 1.061737172650788, + "grad_norm": 0.4407701363338049, + "learning_rate": 7.213252427994547e-05, + "loss": 1.9145, + "step": 2240 + }, + { + "epoch": 1.066477070742979, + "grad_norm": 0.5471189926425418, + "learning_rate": 7.1909631208526e-05, + "loss": 1.8647, + "step": 2250 + }, + { + "epoch": 1.07121696883517, + "grad_norm": 0.45247580903783674, + "learning_rate": 7.168619795741616e-05, + "loss": 1.8793, + "step": 2260 + }, + { + "epoch": 1.0759568669273611, + "grad_norm": 0.5394937103937341, + "learning_rate": 7.146223003533964e-05, + "loss": 1.9394, + "step": 2270 + }, + { + "epoch": 1.080696765019552, + "grad_norm": 0.5010981958648577, + "learning_rate": 7.12377329642024e-05, + "loss": 1.8009, + "step": 2280 + }, + { + "epoch": 1.0854366631117431, + "grad_norm": 0.49455090224086273, + "learning_rate": 7.101271227895646e-05, + "loss": 1.9877, + "step": 2290 + }, + { + "epoch": 1.090176561203934, + "grad_norm": 0.4487359249312413, + "learning_rate": 7.07871735274636e-05, + "loss": 1.8578, + "step": 2300 + }, + { + "epoch": 1.0949164592961251, + "grad_norm": 0.5006725728639967, + "learning_rate": 7.056112227035831e-05, + "loss": 1.9142, + "step": 2310 + }, + { + "epoch": 1.0996563573883162, + "grad_norm": 0.46840477309344347, + "learning_rate": 7.033456408091103e-05, + "loss": 1.9178, + "step": 2320 + }, + { + "epoch": 1.1043962554805071, + "grad_norm": 0.44881264282080685, + "learning_rate": 7.010750454489042e-05, + "loss": 1.9011, + "step": 2330 + }, + { + "epoch": 1.1091361535726982, + "grad_norm": 0.4914874135601711, + "learning_rate": 6.987994926042588e-05, + "loss": 1.8817, + "step": 2340 + }, + { + "epoch": 1.1138760516648891, + "grad_norm": 0.4875786937414022, + "learning_rate": 6.965190383786938e-05, + "loss": 1.9151, + "step": 2350 + }, + { + "epoch": 1.1186159497570802, + "grad_norm": 0.47374621253430516, + "learning_rate": 6.942337389965722e-05, + "loss": 1.8652, + "step": 2360 + }, + { + "epoch": 1.1233558478492713, + "grad_norm": 0.45812614575538185, + "learning_rate": 6.919436508017139e-05, + "loss": 1.9191, + "step": 2370 + }, + { + "epoch": 1.1280957459414622, + "grad_norm": 0.5233924389852819, + "learning_rate": 6.896488302560062e-05, + "loss": 1.8944, + "step": 2380 + }, + { + "epoch": 1.1328356440336533, + "grad_norm": 0.4760349705385804, + "learning_rate": 6.873493339380125e-05, + "loss": 1.8896, + "step": 2390 + }, + { + "epoch": 1.1375755421258442, + "grad_norm": 0.47170548205722757, + "learning_rate": 6.850452185415763e-05, + "loss": 1.8436, + "step": 2400 + }, + { + "epoch": 1.1423154402180353, + "grad_norm": 0.4742928761569321, + "learning_rate": 6.827365408744244e-05, + "loss": 1.938, + "step": 2410 + }, + { + "epoch": 1.1470553383102264, + "grad_norm": 0.5423850691494456, + "learning_rate": 6.804233578567658e-05, + "loss": 1.8889, + "step": 2420 + }, + { + "epoch": 1.1517952364024173, + "grad_norm": 0.48227588856524584, + "learning_rate": 6.781057265198885e-05, + "loss": 1.9094, + "step": 2430 + }, + { + "epoch": 1.1565351344946084, + "grad_norm": 0.45425361404028264, + "learning_rate": 6.75783704004753e-05, + "loss": 1.859, + "step": 2440 + }, + { + "epoch": 1.1612750325867993, + "grad_norm": 0.4433613473826934, + "learning_rate": 6.734573475605846e-05, + "loss": 1.9084, + "step": 2450 + }, + { + "epoch": 1.1660149306789904, + "grad_norm": 0.4943942467439202, + "learning_rate": 6.711267145434603e-05, + "loss": 1.9647, + "step": 2460 + }, + { + "epoch": 1.1707548287711815, + "grad_norm": 0.4577985217898985, + "learning_rate": 6.687918624148963e-05, + "loss": 1.8903, + "step": 2470 + }, + { + "epoch": 1.1754947268633724, + "grad_norm": 0.5864019689805202, + "learning_rate": 6.664528487404298e-05, + "loss": 1.8431, + "step": 2480 + }, + { + "epoch": 1.1802346249555635, + "grad_norm": 0.4979542549244347, + "learning_rate": 6.641097311882015e-05, + "loss": 1.9381, + "step": 2490 + }, + { + "epoch": 1.1849745230477544, + "grad_norm": 0.5142117151718176, + "learning_rate": 6.617625675275317e-05, + "loss": 1.8608, + "step": 2500 + }, + { + "epoch": 1.1897144211399455, + "grad_norm": 0.5179927851112526, + "learning_rate": 6.59411415627498e-05, + "loss": 1.9493, + "step": 2510 + }, + { + "epoch": 1.1944543192321366, + "grad_norm": 0.5221841655224025, + "learning_rate": 6.570563334555068e-05, + "loss": 1.8724, + "step": 2520 + }, + { + "epoch": 1.1991942173243275, + "grad_norm": 0.4985837837212232, + "learning_rate": 6.546973790758655e-05, + "loss": 1.952, + "step": 2530 + }, + { + "epoch": 1.2039341154165186, + "grad_norm": 0.5552319456240327, + "learning_rate": 6.523346106483504e-05, + "loss": 1.9397, + "step": 2540 + }, + { + "epoch": 1.2086740135087095, + "grad_norm": 0.4769628041892156, + "learning_rate": 6.499680864267725e-05, + "loss": 2.0053, + "step": 2550 + }, + { + "epoch": 1.2134139116009006, + "grad_norm": 0.4516518959319936, + "learning_rate": 6.475978647575416e-05, + "loss": 1.9402, + "step": 2560 + }, + { + "epoch": 1.2181538096930915, + "grad_norm": 0.4913816447981876, + "learning_rate": 6.452240040782276e-05, + "loss": 1.8451, + "step": 2570 + }, + { + "epoch": 1.2228937077852826, + "grad_norm": 0.4748765999127487, + "learning_rate": 6.4284656291612e-05, + "loss": 1.9117, + "step": 2580 + }, + { + "epoch": 1.2276336058774737, + "grad_norm": 0.5114110285568767, + "learning_rate": 6.404655998867848e-05, + "loss": 1.8831, + "step": 2590 + }, + { + "epoch": 1.2323735039696646, + "grad_norm": 0.47839985560769943, + "learning_rate": 6.380811736926188e-05, + "loss": 1.8627, + "step": 2600 + }, + { + "epoch": 1.2371134020618557, + "grad_norm": 0.5355232832118345, + "learning_rate": 6.356933431214034e-05, + "loss": 1.9189, + "step": 2610 + }, + { + "epoch": 1.2418533001540466, + "grad_norm": 0.4895001261750141, + "learning_rate": 6.33302167044854e-05, + "loss": 1.9699, + "step": 2620 + }, + { + "epoch": 1.2465931982462377, + "grad_norm": 0.4635882938471385, + "learning_rate": 6.309077044171694e-05, + "loss": 1.8779, + "step": 2630 + }, + { + "epoch": 1.2513330963384286, + "grad_norm": 0.45916609044978873, + "learning_rate": 6.285100142735782e-05, + "loss": 1.8527, + "step": 2640 + }, + { + "epoch": 1.2560729944306197, + "grad_norm": 0.46784246908879684, + "learning_rate": 6.261091557288826e-05, + "loss": 1.8844, + "step": 2650 + }, + { + "epoch": 1.2608128925228108, + "grad_norm": 0.5131345820024794, + "learning_rate": 6.237051879760014e-05, + "loss": 1.8402, + "step": 2660 + }, + { + "epoch": 1.2655527906150017, + "grad_norm": 0.5766279369511716, + "learning_rate": 6.21298170284511e-05, + "loss": 1.8558, + "step": 2670 + }, + { + "epoch": 1.2702926887071928, + "grad_norm": 0.48863073587665085, + "learning_rate": 6.188881619991834e-05, + "loss": 1.9337, + "step": 2680 + }, + { + "epoch": 1.2750325867993837, + "grad_norm": 0.5958235159214345, + "learning_rate": 6.164752225385235e-05, + "loss": 1.9018, + "step": 2690 + }, + { + "epoch": 1.2797724848915748, + "grad_norm": 0.5127854587716114, + "learning_rate": 6.140594113933042e-05, + "loss": 1.928, + "step": 2700 + }, + { + "epoch": 1.284512382983766, + "grad_norm": 0.4918233056408275, + "learning_rate": 6.116407881250994e-05, + "loss": 1.9623, + "step": 2710 + }, + { + "epoch": 1.2892522810759568, + "grad_norm": 0.4759408966884228, + "learning_rate": 6.0921941236481505e-05, + "loss": 1.876, + "step": 2720 + }, + { + "epoch": 1.293992179168148, + "grad_norm": 0.49692255085585224, + "learning_rate": 6.067953438112205e-05, + "loss": 1.871, + "step": 2730 + }, + { + "epoch": 1.2987320772603388, + "grad_norm": 0.51069268079758, + "learning_rate": 6.043686422294747e-05, + "loss": 1.9503, + "step": 2740 + }, + { + "epoch": 1.30347197535253, + "grad_norm": 0.4848235028179103, + "learning_rate": 6.019393674496543e-05, + "loss": 1.9636, + "step": 2750 + }, + { + "epoch": 1.308211873444721, + "grad_norm": 0.7269161906292443, + "learning_rate": 5.995075793652775e-05, + "loss": 1.8818, + "step": 2760 + }, + { + "epoch": 1.312951771536912, + "grad_norm": 0.46011103384366614, + "learning_rate": 5.9707333793182794e-05, + "loss": 1.9123, + "step": 2770 + }, + { + "epoch": 1.317691669629103, + "grad_norm": 0.5009880993886451, + "learning_rate": 5.946367031652761e-05, + "loss": 1.9407, + "step": 2780 + }, + { + "epoch": 1.3224315677212939, + "grad_norm": 0.5049332736921734, + "learning_rate": 5.921977351406004e-05, + "loss": 1.8624, + "step": 2790 + }, + { + "epoch": 1.327171465813485, + "grad_norm": 0.4984446750273935, + "learning_rate": 5.8975649399030485e-05, + "loss": 1.8407, + "step": 2800 + }, + { + "epoch": 1.331911363905676, + "grad_norm": 0.5202629992326526, + "learning_rate": 5.873130399029374e-05, + "loss": 1.8723, + "step": 2810 + }, + { + "epoch": 1.336651261997867, + "grad_norm": 0.57260787674711, + "learning_rate": 5.8486743312160584e-05, + "loss": 1.9077, + "step": 2820 + }, + { + "epoch": 1.341391160090058, + "grad_norm": 0.47793956835922086, + "learning_rate": 5.824197339424923e-05, + "loss": 1.9855, + "step": 2830 + }, + { + "epoch": 1.346131058182249, + "grad_norm": 0.4699288477951403, + "learning_rate": 5.799700027133666e-05, + "loss": 1.9131, + "step": 2840 + }, + { + "epoch": 1.35087095627444, + "grad_norm": 0.504238497502292, + "learning_rate": 5.7751829983209896e-05, + "loss": 1.9438, + "step": 2850 + }, + { + "epoch": 1.3556108543666312, + "grad_norm": 0.4814570049600418, + "learning_rate": 5.750646857451701e-05, + "loss": 1.9549, + "step": 2860 + }, + { + "epoch": 1.360350752458822, + "grad_norm": 0.5038793494327912, + "learning_rate": 5.726092209461814e-05, + "loss": 1.9016, + "step": 2870 + }, + { + "epoch": 1.3650906505510132, + "grad_norm": 0.5240318677978467, + "learning_rate": 5.701519659743636e-05, + "loss": 1.9323, + "step": 2880 + }, + { + "epoch": 1.369830548643204, + "grad_norm": 0.5135642745972475, + "learning_rate": 5.6769298141308345e-05, + "loss": 1.8633, + "step": 2890 + }, + { + "epoch": 1.3745704467353952, + "grad_norm": 0.5115968529507217, + "learning_rate": 5.652323278883511e-05, + "loss": 1.8486, + "step": 2900 + }, + { + "epoch": 1.3793103448275863, + "grad_norm": 0.4973184073827783, + "learning_rate": 5.6277006606732465e-05, + "loss": 1.9067, + "step": 2910 + }, + { + "epoch": 1.3840502429197772, + "grad_norm": 0.48576803898302945, + "learning_rate": 5.603062566568144e-05, + "loss": 1.9167, + "step": 2920 + }, + { + "epoch": 1.3887901410119683, + "grad_norm": 0.532613823404453, + "learning_rate": 5.5784096040178624e-05, + "loss": 1.916, + "step": 2930 + }, + { + "epoch": 1.3935300391041592, + "grad_norm": 0.5402345956070669, + "learning_rate": 5.5537423808386457e-05, + "loss": 1.9193, + "step": 2940 + }, + { + "epoch": 1.3982699371963503, + "grad_norm": 0.4920153790997806, + "learning_rate": 5.5290615051983276e-05, + "loss": 1.8214, + "step": 2950 + }, + { + "epoch": 1.4030098352885414, + "grad_norm": 0.5305053717830343, + "learning_rate": 5.504367585601342e-05, + "loss": 1.8724, + "step": 2960 + }, + { + "epoch": 1.4077497333807323, + "grad_norm": 0.5348665608450567, + "learning_rate": 5.479661230873723e-05, + "loss": 1.9576, + "step": 2970 + }, + { + "epoch": 1.4124896314729234, + "grad_norm": 0.5212184732874925, + "learning_rate": 5.4549430501480895e-05, + "loss": 1.9409, + "step": 2980 + }, + { + "epoch": 1.4172295295651143, + "grad_norm": 0.513803010422433, + "learning_rate": 5.43021365284863e-05, + "loss": 1.8691, + "step": 2990 + }, + { + "epoch": 1.4219694276573054, + "grad_norm": 0.5405731422319697, + "learning_rate": 5.405473648676074e-05, + "loss": 1.9071, + "step": 3000 + }, + { + "epoch": 1.4267093257494965, + "grad_norm": 0.5828580104321831, + "learning_rate": 5.380723647592668e-05, + "loss": 1.8781, + "step": 3010 + }, + { + "epoch": 1.4314492238416874, + "grad_norm": 0.4730373307838654, + "learning_rate": 5.3559642598071244e-05, + "loss": 1.9514, + "step": 3020 + }, + { + "epoch": 1.4361891219338785, + "grad_norm": 0.5098706245647135, + "learning_rate": 5.3311960957595885e-05, + "loss": 1.9019, + "step": 3030 + }, + { + "epoch": 1.4409290200260694, + "grad_norm": 0.4902558604014986, + "learning_rate": 5.306419766106582e-05, + "loss": 1.8003, + "step": 3040 + }, + { + "epoch": 1.4456689181182605, + "grad_norm": 0.5662981198334492, + "learning_rate": 5.2816358817059483e-05, + "loss": 1.9584, + "step": 3050 + }, + { + "epoch": 1.4504088162104516, + "grad_norm": 0.5080795735549143, + "learning_rate": 5.2568450536017946e-05, + "loss": 1.8299, + "step": 3060 + }, + { + "epoch": 1.4551487143026425, + "grad_norm": 0.4883320170692768, + "learning_rate": 5.23204789300942e-05, + "loss": 1.8948, + "step": 3070 + }, + { + "epoch": 1.4598886123948336, + "grad_norm": 0.5018665885085004, + "learning_rate": 5.207245011300256e-05, + "loss": 1.9096, + "step": 3080 + }, + { + "epoch": 1.4646285104870245, + "grad_norm": 0.49985987707909735, + "learning_rate": 5.182437019986781e-05, + "loss": 1.8725, + "step": 3090 + }, + { + "epoch": 1.4693684085792156, + "grad_norm": 0.5501802725606001, + "learning_rate": 5.157624530707457e-05, + "loss": 1.852, + "step": 3100 + }, + { + "epoch": 1.4741083066714067, + "grad_norm": 0.5050415458131547, + "learning_rate": 5.132808155211637e-05, + "loss": 1.9234, + "step": 3110 + }, + { + "epoch": 1.4788482047635976, + "grad_norm": 0.5388328369977669, + "learning_rate": 5.107988505344493e-05, + "loss": 1.8503, + "step": 3120 + }, + { + "epoch": 1.4835881028557887, + "grad_norm": 0.5294932998067775, + "learning_rate": 5.083166193031924e-05, + "loss": 1.8602, + "step": 3130 + }, + { + "epoch": 1.4883280009479796, + "grad_norm": 0.5081432892581731, + "learning_rate": 5.058341830265473e-05, + "loss": 1.8916, + "step": 3140 + }, + { + "epoch": 1.4930678990401707, + "grad_norm": 0.48231454449779565, + "learning_rate": 5.033516029087231e-05, + "loss": 1.9268, + "step": 3150 + }, + { + "epoch": 1.4978077971323618, + "grad_norm": 0.5031248301603529, + "learning_rate": 5.008689401574762e-05, + "loss": 1.8619, + "step": 3160 + }, + { + "epoch": 1.5025476952245527, + "grad_norm": 0.48955254310210605, + "learning_rate": 4.983862559825994e-05, + "loss": 1.9342, + "step": 3170 + }, + { + "epoch": 1.5072875933167436, + "grad_norm": 0.5786990144175583, + "learning_rate": 4.959036115944146e-05, + "loss": 1.9487, + "step": 3180 + }, + { + "epoch": 1.5120274914089347, + "grad_norm": 0.5204059056090741, + "learning_rate": 4.93421068202262e-05, + "loss": 1.9237, + "step": 3190 + }, + { + "epoch": 1.5167673895011258, + "grad_norm": 0.5063131987653341, + "learning_rate": 4.909386870129921e-05, + "loss": 1.9752, + "step": 3200 + }, + { + "epoch": 1.5215072875933169, + "grad_norm": 0.48289993909064316, + "learning_rate": 4.884565292294563e-05, + "loss": 1.8891, + "step": 3210 + }, + { + "epoch": 1.5262471856855078, + "grad_norm": 0.5172395191973475, + "learning_rate": 4.859746560489979e-05, + "loss": 1.8907, + "step": 3220 + }, + { + "epoch": 1.5309870837776987, + "grad_norm": 0.4807916914066212, + "learning_rate": 4.834931286619432e-05, + "loss": 1.9074, + "step": 3230 + }, + { + "epoch": 1.5357269818698898, + "grad_norm": 0.5144939695987174, + "learning_rate": 4.810120082500934e-05, + "loss": 1.8338, + "step": 3240 + }, + { + "epoch": 1.5404668799620809, + "grad_norm": 0.5199756044880577, + "learning_rate": 4.785313559852156e-05, + "loss": 1.965, + "step": 3250 + }, + { + "epoch": 1.545206778054272, + "grad_norm": 0.5415928562917922, + "learning_rate": 4.7605123302753433e-05, + "loss": 1.8472, + "step": 3260 + }, + { + "epoch": 1.5499466761464629, + "grad_norm": 0.5335132590972799, + "learning_rate": 4.735717005242248e-05, + "loss": 1.8558, + "step": 3270 + }, + { + "epoch": 1.5546865742386538, + "grad_norm": 0.5581108907205053, + "learning_rate": 4.710928196079042e-05, + "loss": 1.8794, + "step": 3280 + }, + { + "epoch": 1.5594264723308449, + "grad_norm": 0.5335645184315633, + "learning_rate": 4.6861465139512475e-05, + "loss": 1.8271, + "step": 3290 + }, + { + "epoch": 1.564166370423036, + "grad_norm": 0.5470177997128685, + "learning_rate": 4.661372569848678e-05, + "loss": 1.8935, + "step": 3300 + }, + { + "epoch": 1.568906268515227, + "grad_norm": 0.5362519757955545, + "learning_rate": 4.636606974570361e-05, + "loss": 1.8072, + "step": 3310 + }, + { + "epoch": 1.573646166607418, + "grad_norm": 0.6040810957613818, + "learning_rate": 4.611850338709482e-05, + "loss": 1.7864, + "step": 3320 + }, + { + "epoch": 1.5783860646996088, + "grad_norm": 0.5318403452991018, + "learning_rate": 4.5871032726383386e-05, + "loss": 1.8524, + "step": 3330 + }, + { + "epoch": 1.5831259627918, + "grad_norm": 0.5512446332300014, + "learning_rate": 4.562366386493286e-05, + "loss": 1.8972, + "step": 3340 + }, + { + "epoch": 1.587865860883991, + "grad_norm": 0.5083043080271707, + "learning_rate": 4.537640290159688e-05, + "loss": 1.7909, + "step": 3350 + }, + { + "epoch": 1.5926057589761822, + "grad_norm": 0.516558139348224, + "learning_rate": 4.512925593256895e-05, + "loss": 1.9006, + "step": 3360 + }, + { + "epoch": 1.597345657068373, + "grad_norm": 0.5406712324925647, + "learning_rate": 4.4882229051232e-05, + "loss": 1.9456, + "step": 3370 + }, + { + "epoch": 1.602085555160564, + "grad_norm": 0.5537236012465999, + "learning_rate": 4.463532834800825e-05, + "loss": 1.8696, + "step": 3380 + }, + { + "epoch": 1.606825453252755, + "grad_norm": 0.5501268633544832, + "learning_rate": 4.438855991020896e-05, + "loss": 1.9089, + "step": 3390 + }, + { + "epoch": 1.6115653513449462, + "grad_norm": 0.5642376324584947, + "learning_rate": 4.414192982188446e-05, + "loss": 1.868, + "step": 3400 + }, + { + "epoch": 1.616305249437137, + "grad_norm": 0.49603254737837815, + "learning_rate": 4.3895444163674006e-05, + "loss": 1.9261, + "step": 3410 + }, + { + "epoch": 1.6210451475293282, + "grad_norm": 0.5264212888797052, + "learning_rate": 4.364910901265606e-05, + "loss": 1.9271, + "step": 3420 + }, + { + "epoch": 1.625785045621519, + "grad_norm": 0.5165427594444576, + "learning_rate": 4.340293044219825e-05, + "loss": 1.8798, + "step": 3430 + }, + { + "epoch": 1.6305249437137102, + "grad_norm": 0.5111756681074762, + "learning_rate": 4.315691452180777e-05, + "loss": 1.8821, + "step": 3440 + }, + { + "epoch": 1.6352648418059013, + "grad_norm": 0.5353729238490614, + "learning_rate": 4.2911067316981656e-05, + "loss": 1.9193, + "step": 3450 + }, + { + "epoch": 1.6400047398980921, + "grad_norm": 0.5427362289483532, + "learning_rate": 4.2665394889057325e-05, + "loss": 1.8648, + "step": 3460 + }, + { + "epoch": 1.6447446379902833, + "grad_norm": 0.5316532712452083, + "learning_rate": 4.2419903295063045e-05, + "loss": 1.8696, + "step": 3470 + }, + { + "epoch": 1.6494845360824741, + "grad_norm": 0.5445515739019248, + "learning_rate": 4.2174598587568706e-05, + "loss": 1.7773, + "step": 3480 + }, + { + "epoch": 1.6542244341746652, + "grad_norm": 0.515985891781636, + "learning_rate": 4.192948681453645e-05, + "loss": 1.9528, + "step": 3490 + }, + { + "epoch": 1.6589643322668564, + "grad_norm": 0.533497568011406, + "learning_rate": 4.168457401917169e-05, + "loss": 1.9089, + "step": 3500 + }, + { + "epoch": 1.6637042303590472, + "grad_norm": 0.5034380410666982, + "learning_rate": 4.1439866239774065e-05, + "loss": 1.902, + "step": 3510 + }, + { + "epoch": 1.6684441284512384, + "grad_norm": 0.5008886693586585, + "learning_rate": 4.119536950958853e-05, + "loss": 1.8597, + "step": 3520 + }, + { + "epoch": 1.6731840265434292, + "grad_norm": 0.5042866133180605, + "learning_rate": 4.095108985665668e-05, + "loss": 1.941, + "step": 3530 + }, + { + "epoch": 1.6779239246356203, + "grad_norm": 0.4894456961892347, + "learning_rate": 4.070703330366809e-05, + "loss": 1.8749, + "step": 3540 + }, + { + "epoch": 1.6826638227278115, + "grad_norm": 0.5304927617260963, + "learning_rate": 4.0463205867811834e-05, + "loss": 1.9169, + "step": 3550 + }, + { + "epoch": 1.6874037208200023, + "grad_norm": 0.5192399220515885, + "learning_rate": 4.0219613560628074e-05, + "loss": 1.8853, + "step": 3560 + }, + { + "epoch": 1.6921436189121932, + "grad_norm": 0.5436581114459818, + "learning_rate": 3.997626238785997e-05, + "loss": 1.9093, + "step": 3570 + }, + { + "epoch": 1.6968835170043843, + "grad_norm": 0.5671093634463978, + "learning_rate": 3.973315834930549e-05, + "loss": 1.8667, + "step": 3580 + }, + { + "epoch": 1.7016234150965754, + "grad_norm": 0.5505401718757482, + "learning_rate": 3.949030743866955e-05, + "loss": 1.8701, + "step": 3590 + }, + { + "epoch": 1.7063633131887666, + "grad_norm": 0.5107784655812311, + "learning_rate": 3.924771564341621e-05, + "loss": 1.8796, + "step": 3600 + }, + { + "epoch": 1.7111032112809574, + "grad_norm": 0.5123424894974382, + "learning_rate": 3.900538894462112e-05, + "loss": 1.9345, + "step": 3610 + }, + { + "epoch": 1.7158431093731483, + "grad_norm": 0.5975803333556319, + "learning_rate": 3.876333331682394e-05, + "loss": 1.9071, + "step": 3620 + }, + { + "epoch": 1.7205830074653394, + "grad_norm": 0.5607215795184285, + "learning_rate": 3.8521554727881115e-05, + "loss": 1.8444, + "step": 3630 + }, + { + "epoch": 1.7253229055575305, + "grad_norm": 0.5812681320546813, + "learning_rate": 3.828005913881876e-05, + "loss": 1.8783, + "step": 3640 + }, + { + "epoch": 1.7300628036497216, + "grad_norm": 0.5809996822930421, + "learning_rate": 3.803885250368562e-05, + "loss": 1.8667, + "step": 3650 + }, + { + "epoch": 1.7348027017419125, + "grad_norm": 0.5264379258394054, + "learning_rate": 3.7797940769406324e-05, + "loss": 1.8832, + "step": 3660 + }, + { + "epoch": 1.7395425998341034, + "grad_norm": 0.5452547674401557, + "learning_rate": 3.755732987563476e-05, + "loss": 1.9126, + "step": 3670 + }, + { + "epoch": 1.7442824979262945, + "grad_norm": 0.5573756045226962, + "learning_rate": 3.731702575460763e-05, + "loss": 1.9267, + "step": 3680 + }, + { + "epoch": 1.7490223960184856, + "grad_norm": 0.5891329270301621, + "learning_rate": 3.707703433099815e-05, + "loss": 1.8927, + "step": 3690 + }, + { + "epoch": 1.7537622941106767, + "grad_norm": 0.5379354015536967, + "learning_rate": 3.683736152177005e-05, + "loss": 1.8829, + "step": 3700 + }, + { + "epoch": 1.7585021922028676, + "grad_norm": 0.584902744080287, + "learning_rate": 3.659801323603163e-05, + "loss": 1.9032, + "step": 3710 + }, + { + "epoch": 1.7632420902950585, + "grad_norm": 0.47271945766863005, + "learning_rate": 3.63589953748901e-05, + "loss": 1.8634, + "step": 3720 + }, + { + "epoch": 1.7679819883872496, + "grad_norm": 0.5602358756096469, + "learning_rate": 3.612031383130612e-05, + "loss": 1.8436, + "step": 3730 + }, + { + "epoch": 1.7727218864794407, + "grad_norm": 0.5171084893952771, + "learning_rate": 3.5881974489948456e-05, + "loss": 1.8279, + "step": 3740 + }, + { + "epoch": 1.7774617845716318, + "grad_norm": 0.5085114117110985, + "learning_rate": 3.564398322704887e-05, + "loss": 1.8842, + "step": 3750 + }, + { + "epoch": 1.7822016826638227, + "grad_norm": 0.5395255555244833, + "learning_rate": 3.5406345910257346e-05, + "loss": 1.8974, + "step": 3760 + }, + { + "epoch": 1.7869415807560136, + "grad_norm": 0.5256917642696852, + "learning_rate": 3.5169068398497344e-05, + "loss": 1.9247, + "step": 3770 + }, + { + "epoch": 1.7916814788482047, + "grad_norm": 0.5297510632715654, + "learning_rate": 3.493215654182134e-05, + "loss": 1.8941, + "step": 3780 + }, + { + "epoch": 1.7964213769403958, + "grad_norm": 0.4887292770108947, + "learning_rate": 3.4695616181266674e-05, + "loss": 1.8662, + "step": 3790 + }, + { + "epoch": 1.801161275032587, + "grad_norm": 0.605286928037954, + "learning_rate": 3.445945314871144e-05, + "loss": 1.7946, + "step": 3800 + }, + { + "epoch": 1.8059011731247778, + "grad_norm": 0.5534598174424521, + "learning_rate": 3.422367326673079e-05, + "loss": 1.9319, + "step": 3810 + }, + { + "epoch": 1.8106410712169687, + "grad_norm": 0.516541325820194, + "learning_rate": 3.398828234845331e-05, + "loss": 1.9102, + "step": 3820 + }, + { + "epoch": 1.8153809693091598, + "grad_norm": 0.5316375380294128, + "learning_rate": 3.3753286197417714e-05, + "loss": 1.9137, + "step": 3830 + }, + { + "epoch": 1.820120867401351, + "grad_norm": 0.5048711282201915, + "learning_rate": 3.3518690607429784e-05, + "loss": 1.8643, + "step": 3840 + }, + { + "epoch": 1.824860765493542, + "grad_norm": 0.5407400572506997, + "learning_rate": 3.3284501362419566e-05, + "loss": 1.8524, + "step": 3850 + }, + { + "epoch": 1.829600663585733, + "grad_norm": 0.5444240928370307, + "learning_rate": 3.305072423629862e-05, + "loss": 1.9604, + "step": 3860 + }, + { + "epoch": 1.8343405616779238, + "grad_norm": 0.5259735881080222, + "learning_rate": 3.281736499281783e-05, + "loss": 1.8699, + "step": 3870 + }, + { + "epoch": 1.839080459770115, + "grad_norm": 0.5412391021904834, + "learning_rate": 3.2584429385425163e-05, + "loss": 1.9233, + "step": 3880 + }, + { + "epoch": 1.843820357862306, + "grad_norm": 0.581528749881215, + "learning_rate": 3.235192315712394e-05, + "loss": 1.9037, + "step": 3890 + }, + { + "epoch": 1.8485602559544971, + "grad_norm": 0.486599214527775, + "learning_rate": 3.211985204033114e-05, + "loss": 1.881, + "step": 3900 + }, + { + "epoch": 1.853300154046688, + "grad_norm": 0.5732281840924196, + "learning_rate": 3.188822175673618e-05, + "loss": 1.9289, + "step": 3910 + }, + { + "epoch": 1.858040052138879, + "grad_norm": 0.5393218742500727, + "learning_rate": 3.165703801715969e-05, + "loss": 1.8178, + "step": 3920 + }, + { + "epoch": 1.86277995023107, + "grad_norm": 0.5317421200650526, + "learning_rate": 3.142630652141286e-05, + "loss": 1.7813, + "step": 3930 + }, + { + "epoch": 1.8675198483232611, + "grad_norm": 0.4707578563318653, + "learning_rate": 3.119603295815685e-05, + "loss": 1.8928, + "step": 3940 + }, + { + "epoch": 1.8722597464154522, + "grad_norm": 0.503217338566424, + "learning_rate": 3.096622300476253e-05, + "loss": 1.9702, + "step": 3950 + }, + { + "epoch": 1.8769996445076431, + "grad_norm": 0.5191335631232252, + "learning_rate": 3.07368823271705e-05, + "loss": 1.8832, + "step": 3960 + }, + { + "epoch": 1.881739542599834, + "grad_norm": 0.5929718795388419, + "learning_rate": 3.050801657975147e-05, + "loss": 1.9705, + "step": 3970 + }, + { + "epoch": 1.8864794406920251, + "grad_norm": 0.5203449537199084, + "learning_rate": 3.0279631405166754e-05, + "loss": 1.8005, + "step": 3980 + }, + { + "epoch": 1.8912193387842162, + "grad_norm": 0.6060740003713215, + "learning_rate": 3.0051732434229184e-05, + "loss": 1.8802, + "step": 3990 + }, + { + "epoch": 1.895959236876407, + "grad_norm": 0.5254251326665124, + "learning_rate": 2.9824325285764332e-05, + "loss": 1.9063, + "step": 4000 + }, + { + "epoch": 1.9006991349685982, + "grad_norm": 0.5412654814841995, + "learning_rate": 2.9597415566471874e-05, + "loss": 1.7974, + "step": 4010 + }, + { + "epoch": 1.905439033060789, + "grad_norm": 0.6096977687423671, + "learning_rate": 2.9371008870787474e-05, + "loss": 1.8789, + "step": 4020 + }, + { + "epoch": 1.9101789311529802, + "grad_norm": 0.5751076752952912, + "learning_rate": 2.914511078074481e-05, + "loss": 1.9147, + "step": 4030 + }, + { + "epoch": 1.9149188292451713, + "grad_norm": 0.5596872085857021, + "learning_rate": 2.891972686583791e-05, + "loss": 1.8939, + "step": 4040 + }, + { + "epoch": 1.9196587273373622, + "grad_norm": 0.5205001238706851, + "learning_rate": 2.8694862682883866e-05, + "loss": 1.8675, + "step": 4050 + }, + { + "epoch": 1.9243986254295533, + "grad_norm": 0.6060966652232279, + "learning_rate": 2.8470523775885816e-05, + "loss": 1.8542, + "step": 4060 + }, + { + "epoch": 1.9291385235217442, + "grad_norm": 0.5060927602134601, + "learning_rate": 2.824671567589635e-05, + "loss": 1.9095, + "step": 4070 + }, + { + "epoch": 1.9338784216139353, + "grad_norm": 0.527071756794979, + "learning_rate": 2.8023443900880984e-05, + "loss": 1.8144, + "step": 4080 + }, + { + "epoch": 1.9386183197061264, + "grad_norm": 0.6186591144971271, + "learning_rate": 2.780071395558222e-05, + "loss": 1.9328, + "step": 4090 + }, + { + "epoch": 1.9433582177983173, + "grad_norm": 0.5084958011646354, + "learning_rate": 2.757853133138382e-05, + "loss": 1.8292, + "step": 4100 + }, + { + "epoch": 1.9480981158905084, + "grad_norm": 0.5671058444452819, + "learning_rate": 2.7356901506175426e-05, + "loss": 1.8621, + "step": 4110 + }, + { + "epoch": 1.9528380139826993, + "grad_norm": 0.6077250993929268, + "learning_rate": 2.7135829944217406e-05, + "loss": 1.8969, + "step": 4120 + }, + { + "epoch": 1.9575779120748904, + "grad_norm": 0.5478709269890887, + "learning_rate": 2.6915322096006244e-05, + "loss": 1.9648, + "step": 4130 + }, + { + "epoch": 1.9623178101670815, + "grad_norm": 0.5304846907499281, + "learning_rate": 2.6695383398140155e-05, + "loss": 1.8867, + "step": 4140 + }, + { + "epoch": 1.9670577082592724, + "grad_norm": 0.5084950385451593, + "learning_rate": 2.6476019273184938e-05, + "loss": 1.8987, + "step": 4150 + }, + { + "epoch": 1.9717976063514633, + "grad_norm": 0.5881914443826771, + "learning_rate": 2.6257235129540424e-05, + "loss": 1.8718, + "step": 4160 + }, + { + "epoch": 1.9765375044436544, + "grad_norm": 0.5557425542971698, + "learning_rate": 2.603903636130701e-05, + "loss": 1.8204, + "step": 4170 + }, + { + "epoch": 1.9812774025358455, + "grad_norm": 0.5235298330164154, + "learning_rate": 2.5821428348152788e-05, + "loss": 1.915, + "step": 4180 + }, + { + "epoch": 1.9860173006280366, + "grad_norm": 0.6107709148392828, + "learning_rate": 2.560441645518078e-05, + "loss": 1.8223, + "step": 4190 + }, + { + "epoch": 1.9907571987202275, + "grad_norm": 0.5614697856069703, + "learning_rate": 2.538800603279673e-05, + "loss": 1.8439, + "step": 4200 + }, + { + "epoch": 1.9954970968124184, + "grad_norm": 0.5563269995130558, + "learning_rate": 2.5172202416577236e-05, + "loss": 1.8982, + "step": 4210 + }, + { + "epoch": 2.0002369949046095, + "grad_norm": 0.5673849628756762, + "learning_rate": 2.4957010927138136e-05, + "loss": 1.8956, + "step": 4220 + }, + { + "epoch": 2.0049768929968006, + "grad_norm": 0.5274159605663582, + "learning_rate": 2.4742436870003326e-05, + "loss": 1.8572, + "step": 4230 + }, + { + "epoch": 2.0097167910889917, + "grad_norm": 0.5388999304024686, + "learning_rate": 2.452848553547396e-05, + "loss": 1.8441, + "step": 4240 + }, + { + "epoch": 2.014456689181183, + "grad_norm": 0.5715679686982497, + "learning_rate": 2.431516219849809e-05, + "loss": 1.838, + "step": 4250 + }, + { + "epoch": 2.0191965872733735, + "grad_norm": 0.5795119843431206, + "learning_rate": 2.4102472118540487e-05, + "loss": 1.8329, + "step": 4260 + }, + { + "epoch": 2.0239364853655646, + "grad_norm": 0.5503184533431318, + "learning_rate": 2.3890420539453057e-05, + "loss": 1.8733, + "step": 4270 + }, + { + "epoch": 2.0286763834577557, + "grad_norm": 0.54871121092008, + "learning_rate": 2.3679012689345558e-05, + "loss": 1.8601, + "step": 4280 + }, + { + "epoch": 2.033416281549947, + "grad_norm": 0.5879797146794722, + "learning_rate": 2.3468253780456678e-05, + "loss": 1.7751, + "step": 4290 + }, + { + "epoch": 2.038156179642138, + "grad_norm": 0.5510154682184406, + "learning_rate": 2.3258149009025482e-05, + "loss": 1.827, + "step": 4300 + }, + { + "epoch": 2.0428960777343286, + "grad_norm": 0.513792181350148, + "learning_rate": 2.3048703555163357e-05, + "loss": 1.8474, + "step": 4310 + }, + { + "epoch": 2.0476359758265197, + "grad_norm": 0.5489219942664323, + "learning_rate": 2.2839922582726336e-05, + "loss": 1.8862, + "step": 4320 + }, + { + "epoch": 2.052375873918711, + "grad_norm": 0.6504687065880719, + "learning_rate": 2.2631811239187646e-05, + "loss": 1.7984, + "step": 4330 + }, + { + "epoch": 2.057115772010902, + "grad_norm": 0.6130904570523673, + "learning_rate": 2.2424374655510965e-05, + "loss": 1.7921, + "step": 4340 + }, + { + "epoch": 2.0618556701030926, + "grad_norm": 0.6408124203446663, + "learning_rate": 2.2217617946023765e-05, + "loss": 1.8592, + "step": 4350 + }, + { + "epoch": 2.0665955681952837, + "grad_norm": 0.6181447797115482, + "learning_rate": 2.201154620829137e-05, + "loss": 1.8067, + "step": 4360 + }, + { + "epoch": 2.071335466287475, + "grad_norm": 0.5627617017019729, + "learning_rate": 2.1806164522991118e-05, + "loss": 1.7701, + "step": 4370 + }, + { + "epoch": 2.076075364379666, + "grad_norm": 0.5510540438192786, + "learning_rate": 2.1601477953787214e-05, + "loss": 1.857, + "step": 4380 + }, + { + "epoch": 2.080815262471857, + "grad_norm": 0.6083237779423979, + "learning_rate": 2.1397491547205807e-05, + "loss": 1.7601, + "step": 4390 + }, + { + "epoch": 2.0855551605640477, + "grad_norm": 0.6047311337345246, + "learning_rate": 2.119421033251071e-05, + "loss": 1.8347, + "step": 4400 + }, + { + "epoch": 2.0902950586562388, + "grad_norm": 0.5662369508712475, + "learning_rate": 2.0991639321579214e-05, + "loss": 1.8545, + "step": 4410 + }, + { + "epoch": 2.09503495674843, + "grad_norm": 0.5935079368512177, + "learning_rate": 2.078978350877862e-05, + "loss": 1.879, + "step": 4420 + }, + { + "epoch": 2.099774854840621, + "grad_norm": 0.571586984028468, + "learning_rate": 2.058864787084309e-05, + "loss": 1.7671, + "step": 4430 + }, + { + "epoch": 2.104514752932812, + "grad_norm": 0.5682037137995106, + "learning_rate": 2.0388237366751006e-05, + "loss": 1.865, + "step": 4440 + }, + { + "epoch": 2.1092546510250028, + "grad_norm": 0.5490908649638305, + "learning_rate": 2.018855693760257e-05, + "loss": 1.78, + "step": 4450 + }, + { + "epoch": 2.113994549117194, + "grad_norm": 0.6176356249016943, + "learning_rate": 1.998961150649814e-05, + "loss": 1.8435, + "step": 4460 + }, + { + "epoch": 2.118734447209385, + "grad_norm": 0.5319868348925916, + "learning_rate": 1.9791405978416694e-05, + "loss": 1.8981, + "step": 4470 + }, + { + "epoch": 2.123474345301576, + "grad_norm": 0.5752723871436735, + "learning_rate": 1.9593945240095052e-05, + "loss": 1.7755, + "step": 4480 + }, + { + "epoch": 2.128214243393767, + "grad_norm": 0.6366681694521167, + "learning_rate": 1.9397234159907275e-05, + "loss": 1.8707, + "step": 4490 + }, + { + "epoch": 2.132954141485958, + "grad_norm": 0.5901487974014347, + "learning_rate": 1.920127758774466e-05, + "loss": 1.8256, + "step": 4500 + }, + { + "epoch": 2.137694039578149, + "grad_norm": 0.5888105104943471, + "learning_rate": 1.9006080354896267e-05, + "loss": 1.8357, + "step": 4510 + }, + { + "epoch": 2.14243393767034, + "grad_norm": 0.5878169661429707, + "learning_rate": 1.8811647273929628e-05, + "loss": 1.8241, + "step": 4520 + }, + { + "epoch": 2.147173835762531, + "grad_norm": 0.5581948418607748, + "learning_rate": 1.8617983138572277e-05, + "loss": 1.848, + "step": 4530 + }, + { + "epoch": 2.1519137338547223, + "grad_norm": 0.6137321662868356, + "learning_rate": 1.8425092723593395e-05, + "loss": 1.78, + "step": 4540 + }, + { + "epoch": 2.156653631946913, + "grad_norm": 0.558081495592443, + "learning_rate": 1.823298078468624e-05, + "loss": 1.8153, + "step": 4550 + }, + { + "epoch": 2.161393530039104, + "grad_norm": 0.6039625325723422, + "learning_rate": 1.8041652058350767e-05, + "loss": 1.8416, + "step": 4560 + }, + { + "epoch": 2.166133428131295, + "grad_norm": 0.6295821331128388, + "learning_rate": 1.785111126177691e-05, + "loss": 1.7953, + "step": 4570 + }, + { + "epoch": 2.1708733262234863, + "grad_norm": 0.5911527371211652, + "learning_rate": 1.7661363092728307e-05, + "loss": 1.7851, + "step": 4580 + }, + { + "epoch": 2.1756132243156774, + "grad_norm": 0.565852777352692, + "learning_rate": 1.7472412229426455e-05, + "loss": 1.8101, + "step": 4590 + }, + { + "epoch": 2.180353122407868, + "grad_norm": 0.5656454600563583, + "learning_rate": 1.7284263330435317e-05, + "loss": 1.917, + "step": 4600 + }, + { + "epoch": 2.185093020500059, + "grad_norm": 0.6035646498858932, + "learning_rate": 1.709692103454651e-05, + "loss": 1.8168, + "step": 4610 + }, + { + "epoch": 2.1898329185922503, + "grad_norm": 0.5477939270708279, + "learning_rate": 1.6910389960664992e-05, + "loss": 1.777, + "step": 4620 + }, + { + "epoch": 2.1945728166844414, + "grad_norm": 0.5898939001383526, + "learning_rate": 1.672467470769507e-05, + "loss": 1.7575, + "step": 4630 + }, + { + "epoch": 2.1993127147766325, + "grad_norm": 0.544798273283213, + "learning_rate": 1.6539779854427074e-05, + "loss": 1.8834, + "step": 4640 + }, + { + "epoch": 2.204052612868823, + "grad_norm": 0.610618761949142, + "learning_rate": 1.6355709959424487e-05, + "loss": 1.8785, + "step": 4650 + }, + { + "epoch": 2.2087925109610143, + "grad_norm": 0.6064522176814057, + "learning_rate": 1.6172469560911553e-05, + "loss": 1.7854, + "step": 4660 + }, + { + "epoch": 2.2135324090532054, + "grad_norm": 0.6022849345976745, + "learning_rate": 1.599006317666131e-05, + "loss": 1.8497, + "step": 4670 + }, + { + "epoch": 2.2182723071453965, + "grad_norm": 0.5926151325695663, + "learning_rate": 1.5808495303884297e-05, + "loss": 1.8184, + "step": 4680 + }, + { + "epoch": 2.2230122052375876, + "grad_norm": 0.5740462281531319, + "learning_rate": 1.562777041911761e-05, + "loss": 1.8073, + "step": 4690 + }, + { + "epoch": 2.2277521033297782, + "grad_norm": 0.595274030679382, + "learning_rate": 1.5447892978114592e-05, + "loss": 1.8095, + "step": 4700 + }, + { + "epoch": 2.2324920014219694, + "grad_norm": 0.5805561493774153, + "learning_rate": 1.526886741573496e-05, + "loss": 1.7907, + "step": 4710 + }, + { + "epoch": 2.2372318995141605, + "grad_norm": 0.6585750772533296, + "learning_rate": 1.5090698145835413e-05, + "loss": 1.8081, + "step": 4720 + }, + { + "epoch": 2.2419717976063516, + "grad_norm": 0.7616121844460758, + "learning_rate": 1.491338956116085e-05, + "loss": 1.8571, + "step": 4730 + }, + { + "epoch": 2.2467116956985427, + "grad_norm": 0.6037559488690589, + "learning_rate": 1.473694603323611e-05, + "loss": 1.8194, + "step": 4740 + }, + { + "epoch": 2.2514515937907333, + "grad_norm": 0.6412117105060221, + "learning_rate": 1.4561371912258098e-05, + "loss": 1.7447, + "step": 4750 + }, + { + "epoch": 2.2561914918829244, + "grad_norm": 0.6178165307415238, + "learning_rate": 1.4386671526988593e-05, + "loss": 1.8047, + "step": 4760 + }, + { + "epoch": 2.2609313899751156, + "grad_norm": 0.5887211775830831, + "learning_rate": 1.421284918464752e-05, + "loss": 1.8309, + "step": 4770 + }, + { + "epoch": 2.2656712880673067, + "grad_norm": 0.6715832023904247, + "learning_rate": 1.4039909170806764e-05, + "loss": 1.7598, + "step": 4780 + }, + { + "epoch": 2.2704111861594978, + "grad_norm": 0.5565711226911474, + "learning_rate": 1.386785574928446e-05, + "loss": 1.8042, + "step": 4790 + }, + { + "epoch": 2.2751510842516884, + "grad_norm": 1.0370061435438975, + "learning_rate": 1.3696693162039893e-05, + "loss": 1.8418, + "step": 4800 + }, + { + "epoch": 2.2798909823438795, + "grad_norm": 0.619379427966442, + "learning_rate": 1.3526425629068967e-05, + "loss": 1.8709, + "step": 4810 + }, + { + "epoch": 2.2846308804360707, + "grad_norm": 0.6181820044240368, + "learning_rate": 1.3357057348300067e-05, + "loss": 1.8222, + "step": 4820 + }, + { + "epoch": 2.2893707785282618, + "grad_norm": 0.6447967865409838, + "learning_rate": 1.318859249549066e-05, + "loss": 1.8183, + "step": 4830 + }, + { + "epoch": 2.294110676620453, + "grad_norm": 0.6058171204419526, + "learning_rate": 1.3021035224124224e-05, + "loss": 1.805, + "step": 4840 + }, + { + "epoch": 2.2988505747126435, + "grad_norm": 0.5434323398332925, + "learning_rate": 1.2854389665307975e-05, + "loss": 1.7541, + "step": 4850 + }, + { + "epoch": 2.3035904728048346, + "grad_norm": 0.6113667985824829, + "learning_rate": 1.2688659927670915e-05, + "loss": 1.758, + "step": 4860 + }, + { + "epoch": 2.3083303708970258, + "grad_norm": 0.5720767875706882, + "learning_rate": 1.2523850097262563e-05, + "loss": 1.8322, + "step": 4870 + }, + { + "epoch": 2.313070268989217, + "grad_norm": 0.5628951626795141, + "learning_rate": 1.2359964237452238e-05, + "loss": 1.7798, + "step": 4880 + }, + { + "epoch": 2.317810167081408, + "grad_norm": 0.6094150987430762, + "learning_rate": 1.219700638882888e-05, + "loss": 1.7842, + "step": 4890 + }, + { + "epoch": 2.3225500651735986, + "grad_norm": 0.6036779282592939, + "learning_rate": 1.2034980569101367e-05, + "loss": 1.8383, + "step": 4900 + }, + { + "epoch": 2.3272899632657897, + "grad_norm": 0.6175747345768624, + "learning_rate": 1.1873890772999502e-05, + "loss": 1.9046, + "step": 4910 + }, + { + "epoch": 2.332029861357981, + "grad_norm": 0.5564649373869762, + "learning_rate": 1.1713740972175574e-05, + "loss": 1.8104, + "step": 4920 + }, + { + "epoch": 2.336769759450172, + "grad_norm": 0.6441404862225901, + "learning_rate": 1.155453511510633e-05, + "loss": 1.7864, + "step": 4930 + }, + { + "epoch": 2.341509657542363, + "grad_norm": 0.6927623121031959, + "learning_rate": 1.1396277126995707e-05, + "loss": 1.829, + "step": 4940 + }, + { + "epoch": 2.3462495556345537, + "grad_norm": 0.6537904475611329, + "learning_rate": 1.1238970909677993e-05, + "loss": 1.8655, + "step": 4950 + }, + { + "epoch": 2.350989453726745, + "grad_norm": 0.5779494171909159, + "learning_rate": 1.1082620341521766e-05, + "loss": 1.7482, + "step": 4960 + }, + { + "epoch": 2.355729351818936, + "grad_norm": 0.6161830958900923, + "learning_rate": 1.0927229277334061e-05, + "loss": 1.7789, + "step": 4970 + }, + { + "epoch": 2.360469249911127, + "grad_norm": 0.5946038603032194, + "learning_rate": 1.0772801548265498e-05, + "loss": 1.8189, + "step": 4980 + }, + { + "epoch": 2.365209148003318, + "grad_norm": 0.6072288944056834, + "learning_rate": 1.0619340961715746e-05, + "loss": 1.8588, + "step": 4990 + }, + { + "epoch": 2.369949046095509, + "grad_norm": 0.5882805952028816, + "learning_rate": 1.0466851301239711e-05, + "loss": 1.8238, + "step": 5000 + }, + { + "epoch": 2.3746889441877, + "grad_norm": 0.6288910196539964, + "learning_rate": 1.0315336326454161e-05, + "loss": 1.7055, + "step": 5010 + }, + { + "epoch": 2.379428842279891, + "grad_norm": 0.6043835236662759, + "learning_rate": 1.0164799772945149e-05, + "loss": 1.8134, + "step": 5020 + }, + { + "epoch": 2.384168740372082, + "grad_norm": 0.5821262142704368, + "learning_rate": 1.0015245352175811e-05, + "loss": 1.797, + "step": 5030 + }, + { + "epoch": 2.3889086384642733, + "grad_norm": 0.6369667143877562, + "learning_rate": 9.866676751394927e-06, + "loss": 1.8199, + "step": 5040 + }, + { + "epoch": 2.393648536556464, + "grad_norm": 0.5924507902566707, + "learning_rate": 9.719097633545975e-06, + "loss": 1.8524, + "step": 5050 + }, + { + "epoch": 2.398388434648655, + "grad_norm": 0.5762513665027686, + "learning_rate": 9.572511637176811e-06, + "loss": 1.8428, + "step": 5060 + }, + { + "epoch": 2.403128332740846, + "grad_norm": 0.5799149040724592, + "learning_rate": 9.426922376350028e-06, + "loss": 1.8463, + "step": 5070 + }, + { + "epoch": 2.4078682308330372, + "grad_norm": 0.5898000658332848, + "learning_rate": 9.282333440553804e-06, + "loss": 1.7772, + "step": 5080 + }, + { + "epoch": 2.4126081289252284, + "grad_norm": 0.5967206158269678, + "learning_rate": 9.13874839461336e-06, + "loss": 1.8234, + "step": 5090 + }, + { + "epoch": 2.417348027017419, + "grad_norm": 0.6245591569289297, + "learning_rate": 8.996170778603153e-06, + "loss": 1.8047, + "step": 5100 + }, + { + "epoch": 2.42208792510961, + "grad_norm": 0.5981945344970201, + "learning_rate": 8.854604107759568e-06, + "loss": 1.8429, + "step": 5110 + }, + { + "epoch": 2.4268278232018012, + "grad_norm": 0.6112665064763977, + "learning_rate": 8.714051872394213e-06, + "loss": 1.7746, + "step": 5120 + }, + { + "epoch": 2.4315677212939923, + "grad_norm": 0.5847743009358597, + "learning_rate": 8.574517537807897e-06, + "loss": 1.7703, + "step": 5130 + }, + { + "epoch": 2.436307619386183, + "grad_norm": 0.5617053604855574, + "learning_rate": 8.436004544205217e-06, + "loss": 1.8498, + "step": 5140 + }, + { + "epoch": 2.441047517478374, + "grad_norm": 0.5947168640425712, + "learning_rate": 8.2985163066097e-06, + "loss": 1.8439, + "step": 5150 + }, + { + "epoch": 2.4457874155705652, + "grad_norm": 0.6456439652584188, + "learning_rate": 8.162056214779618e-06, + "loss": 1.8125, + "step": 5160 + }, + { + "epoch": 2.4505273136627563, + "grad_norm": 0.6053385247801931, + "learning_rate": 8.02662763312439e-06, + "loss": 1.8193, + "step": 5170 + }, + { + "epoch": 2.4552672117549474, + "grad_norm": 0.6364991896683941, + "learning_rate": 7.89223390062172e-06, + "loss": 1.8081, + "step": 5180 + }, + { + "epoch": 2.460007109847138, + "grad_norm": 0.630663938586301, + "learning_rate": 7.758878330735142e-06, + "loss": 1.8317, + "step": 5190 + }, + { + "epoch": 2.464747007939329, + "grad_norm": 0.6625585293729884, + "learning_rate": 7.626564211332465e-06, + "loss": 1.7914, + "step": 5200 + }, + { + "epoch": 2.4694869060315203, + "grad_norm": 0.6132933711832741, + "learning_rate": 7.49529480460458e-06, + "loss": 1.8072, + "step": 5210 + }, + { + "epoch": 2.4742268041237114, + "grad_norm": 0.6723366054843423, + "learning_rate": 7.3650733469851574e-06, + "loss": 1.8693, + "step": 5220 + }, + { + "epoch": 2.4789667022159025, + "grad_norm": 0.5948715205500895, + "learning_rate": 7.235903049070742e-06, + "loss": 1.7441, + "step": 5230 + }, + { + "epoch": 2.483706600308093, + "grad_norm": 0.602660875671921, + "learning_rate": 7.1077870955416685e-06, + "loss": 1.8301, + "step": 5240 + }, + { + "epoch": 2.4884464984002843, + "grad_norm": 0.6657860629895173, + "learning_rate": 6.98072864508349e-06, + "loss": 1.7357, + "step": 5250 + }, + { + "epoch": 2.4931863964924754, + "grad_norm": 0.6400301583474429, + "learning_rate": 6.854730830309203e-06, + "loss": 1.8309, + "step": 5260 + }, + { + "epoch": 2.4979262945846665, + "grad_norm": 0.6519457597490862, + "learning_rate": 6.729796757681861e-06, + "loss": 1.8622, + "step": 5270 + }, + { + "epoch": 2.502666192676857, + "grad_norm": 0.6018425213466797, + "learning_rate": 6.605929507438108e-06, + "loss": 1.8124, + "step": 5280 + }, + { + "epoch": 2.5074060907690483, + "grad_norm": 0.6356535657958864, + "learning_rate": 6.4831321335121706e-06, + "loss": 1.8493, + "step": 5290 + }, + { + "epoch": 2.5121459888612394, + "grad_norm": 0.5933711757944313, + "learning_rate": 6.361407663460612e-06, + "loss": 1.8152, + "step": 5300 + }, + { + "epoch": 2.5168858869534305, + "grad_norm": 0.6176252282132866, + "learning_rate": 6.240759098387628e-06, + "loss": 1.7796, + "step": 5310 + }, + { + "epoch": 2.5216257850456216, + "grad_norm": 0.6035543936375999, + "learning_rate": 6.12118941287112e-06, + "loss": 1.8072, + "step": 5320 + }, + { + "epoch": 2.5263656831378123, + "grad_norm": 0.6423602506797493, + "learning_rate": 6.002701554889306e-06, + "loss": 1.8894, + "step": 5330 + }, + { + "epoch": 2.5311055812300034, + "grad_norm": 0.6166718860982423, + "learning_rate": 5.885298445748072e-06, + "loss": 1.8476, + "step": 5340 + }, + { + "epoch": 2.5358454793221945, + "grad_norm": 0.6250486214392823, + "learning_rate": 5.768982980008924e-06, + "loss": 1.8044, + "step": 5350 + }, + { + "epoch": 2.5405853774143856, + "grad_norm": 0.6409013217160432, + "learning_rate": 5.653758025417616e-06, + "loss": 1.7732, + "step": 5360 + }, + { + "epoch": 2.5453252755065767, + "grad_norm": 0.5853729101352203, + "learning_rate": 5.5396264228335e-06, + "loss": 1.816, + "step": 5370 + }, + { + "epoch": 2.5500651735987674, + "grad_norm": 0.6674717253505213, + "learning_rate": 5.42659098615943e-06, + "loss": 1.828, + "step": 5380 + }, + { + "epoch": 2.5548050716909585, + "grad_norm": 0.6079460431124653, + "learning_rate": 5.314654502272393e-06, + "loss": 1.8305, + "step": 5390 + }, + { + "epoch": 2.5595449697831496, + "grad_norm": 0.6132271739956523, + "learning_rate": 5.203819730954806e-06, + "loss": 1.9389, + "step": 5400 + }, + { + "epoch": 2.5642848678753407, + "grad_norm": 0.6412964569520792, + "learning_rate": 5.094089404826513e-06, + "loss": 1.8878, + "step": 5410 + }, + { + "epoch": 2.569024765967532, + "grad_norm": 0.6314773808659059, + "learning_rate": 4.985466229277331e-06, + "loss": 1.7996, + "step": 5420 + }, + { + "epoch": 2.5737646640597225, + "grad_norm": 0.6019377364178156, + "learning_rate": 4.877952882400411e-06, + "loss": 1.8326, + "step": 5430 + }, + { + "epoch": 2.5785045621519136, + "grad_norm": 0.6375177888153616, + "learning_rate": 4.771552014926206e-06, + "loss": 1.8313, + "step": 5440 + }, + { + "epoch": 2.5832444602441047, + "grad_norm": 0.6184290636855982, + "learning_rate": 4.666266250157097e-06, + "loss": 1.8408, + "step": 5450 + }, + { + "epoch": 2.587984358336296, + "grad_norm": 0.6145812896553856, + "learning_rate": 4.562098183902713e-06, + "loss": 1.7928, + "step": 5460 + }, + { + "epoch": 2.592724256428487, + "grad_norm": 0.5863286484938057, + "learning_rate": 4.459050384415941e-06, + "loss": 1.7671, + "step": 5470 + }, + { + "epoch": 2.5974641545206776, + "grad_norm": 0.5908385265300592, + "learning_rate": 4.357125392329636e-06, + "loss": 1.8528, + "step": 5480 + }, + { + "epoch": 2.6022040526128687, + "grad_norm": 0.6315835702501038, + "learning_rate": 4.256325720593912e-06, + "loss": 1.8952, + "step": 5490 + }, + { + "epoch": 2.60694395070506, + "grad_norm": 0.5905062832031487, + "learning_rate": 4.15665385441425e-06, + "loss": 1.8604, + "step": 5500 + }, + { + "epoch": 2.611683848797251, + "grad_norm": 0.568727331363524, + "learning_rate": 4.0581122511901934e-06, + "loss": 1.8351, + "step": 5510 + }, + { + "epoch": 2.616423746889442, + "grad_norm": 0.6400621125560388, + "learning_rate": 3.960703340454791e-06, + "loss": 1.857, + "step": 5520 + }, + { + "epoch": 2.6211636449816327, + "grad_norm": 0.6844853412168999, + "learning_rate": 3.864429523814644e-06, + "loss": 1.8371, + "step": 5530 + }, + { + "epoch": 2.625903543073824, + "grad_norm": 0.6040727492768455, + "learning_rate": 3.7692931748907425e-06, + "loss": 1.8582, + "step": 5540 + }, + { + "epoch": 2.630643441166015, + "grad_norm": 0.6488970700922259, + "learning_rate": 3.675296639259912e-06, + "loss": 1.8466, + "step": 5550 + }, + { + "epoch": 2.635383339258206, + "grad_norm": 0.606860701135619, + "learning_rate": 3.5824422343970267e-06, + "loss": 1.8823, + "step": 5560 + }, + { + "epoch": 2.640123237350397, + "grad_norm": 0.6107041616886252, + "learning_rate": 3.4907322496178397e-06, + "loss": 1.7635, + "step": 5570 + }, + { + "epoch": 2.6448631354425878, + "grad_norm": 0.6205661299793865, + "learning_rate": 3.4001689460225195e-06, + "loss": 1.7604, + "step": 5580 + }, + { + "epoch": 2.649603033534779, + "grad_norm": 0.6114908815089501, + "learning_rate": 3.3107545564399434e-06, + "loss": 1.8452, + "step": 5590 + }, + { + "epoch": 2.65434293162697, + "grad_norm": 0.621202845423754, + "learning_rate": 3.2224912853726476e-06, + "loss": 1.8557, + "step": 5600 + }, + { + "epoch": 2.659082829719161, + "grad_norm": 0.6376438148340446, + "learning_rate": 3.1353813089424424e-06, + "loss": 1.8295, + "step": 5610 + }, + { + "epoch": 2.663822727811352, + "grad_norm": 0.6085163299666503, + "learning_rate": 3.0494267748367723e-06, + "loss": 1.7302, + "step": 5620 + }, + { + "epoch": 2.668562625903543, + "grad_norm": 0.6330680248898437, + "learning_rate": 2.9646298022557915e-06, + "loss": 1.7756, + "step": 5630 + }, + { + "epoch": 2.673302523995734, + "grad_norm": 0.6575109357986112, + "learning_rate": 2.8809924818600952e-06, + "loss": 1.7728, + "step": 5640 + }, + { + "epoch": 2.678042422087925, + "grad_norm": 0.5972530598708538, + "learning_rate": 2.7985168757191482e-06, + "loss": 1.7927, + "step": 5650 + }, + { + "epoch": 2.682782320180116, + "grad_norm": 0.6505229836146454, + "learning_rate": 2.7172050172604824e-06, + "loss": 1.768, + "step": 5660 + }, + { + "epoch": 2.6875222182723073, + "grad_norm": 0.6339702452986381, + "learning_rate": 2.63705891121957e-06, + "loss": 1.7756, + "step": 5670 + }, + { + "epoch": 2.692262116364498, + "grad_norm": 0.6729168831182509, + "learning_rate": 2.5580805335903457e-06, + "loss": 1.8363, + "step": 5680 + }, + { + "epoch": 2.697002014456689, + "grad_norm": 0.6421591660117998, + "learning_rate": 2.4802718315765527e-06, + "loss": 1.7585, + "step": 5690 + }, + { + "epoch": 2.70174191254888, + "grad_norm": 0.5993295713871896, + "learning_rate": 2.403634723543674e-06, + "loss": 1.8379, + "step": 5700 + }, + { + "epoch": 2.7064818106410713, + "grad_norm": 0.5931932390101198, + "learning_rate": 2.3281710989716933e-06, + "loss": 1.8127, + "step": 5710 + }, + { + "epoch": 2.7112217087332624, + "grad_norm": 0.6007499215207198, + "learning_rate": 2.2538828184084595e-06, + "loss": 1.7643, + "step": 5720 + }, + { + "epoch": 2.715961606825453, + "grad_norm": 0.6294360874753062, + "learning_rate": 2.1807717134238347e-06, + "loss": 1.8007, + "step": 5730 + }, + { + "epoch": 2.720701504917644, + "grad_norm": 0.6305932589800126, + "learning_rate": 2.1088395865645537e-06, + "loss": 1.802, + "step": 5740 + }, + { + "epoch": 2.7254414030098353, + "grad_norm": 0.6091954631732173, + "learning_rate": 2.038088211309769e-06, + "loss": 1.7978, + "step": 5750 + }, + { + "epoch": 2.7301813011020264, + "grad_norm": 0.6353525285344948, + "learning_rate": 1.968519332027302e-06, + "loss": 1.8641, + "step": 5760 + }, + { + "epoch": 2.7349211991942175, + "grad_norm": 0.5869911293052614, + "learning_rate": 1.9001346639306805e-06, + "loss": 1.876, + "step": 5770 + }, + { + "epoch": 2.739661097286408, + "grad_norm": 0.6462140073621514, + "learning_rate": 1.8329358930368245e-06, + "loss": 1.7947, + "step": 5780 + }, + { + "epoch": 2.7444009953785993, + "grad_norm": 0.6298906028352366, + "learning_rate": 1.7669246761244763e-06, + "loss": 1.7983, + "step": 5790 + }, + { + "epoch": 2.7491408934707904, + "grad_norm": 0.6351921002703318, + "learning_rate": 1.7021026406933427e-06, + "loss": 1.7563, + "step": 5800 + }, + { + "epoch": 2.7538807915629815, + "grad_norm": 0.6081707137727146, + "learning_rate": 1.638471384924012e-06, + "loss": 1.8005, + "step": 5810 + }, + { + "epoch": 2.7586206896551726, + "grad_norm": 0.6527854672102444, + "learning_rate": 1.5760324776385171e-06, + "loss": 1.8228, + "step": 5820 + }, + { + "epoch": 2.7633605877473633, + "grad_norm": 0.6207692422398574, + "learning_rate": 1.5147874582616518e-06, + "loss": 1.8751, + "step": 5830 + }, + { + "epoch": 2.7681004858395544, + "grad_norm": 0.6078351786970941, + "learning_rate": 1.4547378367830267e-06, + "loss": 1.854, + "step": 5840 + }, + { + "epoch": 2.7728403839317455, + "grad_norm": 0.5914179875660134, + "learning_rate": 1.3958850937198453e-06, + "loss": 1.8771, + "step": 5850 + }, + { + "epoch": 2.7775802820239366, + "grad_norm": 0.6150352638939602, + "learning_rate": 1.3382306800804045e-06, + "loss": 1.7422, + "step": 5860 + }, + { + "epoch": 2.7823201801161277, + "grad_norm": 0.6205091178728268, + "learning_rate": 1.2817760173282954e-06, + "loss": 1.8005, + "step": 5870 + }, + { + "epoch": 2.7870600782083184, + "grad_norm": 0.6352299718478237, + "learning_rate": 1.2265224973474042e-06, + "loss": 1.7703, + "step": 5880 + }, + { + "epoch": 2.7917999763005095, + "grad_norm": 0.6466624089179797, + "learning_rate": 1.1724714824075333e-06, + "loss": 1.8315, + "step": 5890 + }, + { + "epoch": 2.7965398743927006, + "grad_norm": 0.5968151491811187, + "learning_rate": 1.1196243051308787e-06, + "loss": 1.9011, + "step": 5900 + }, + { + "epoch": 2.8012797724848917, + "grad_norm": 0.6310690230989541, + "learning_rate": 1.0679822684591112e-06, + "loss": 1.8434, + "step": 5910 + }, + { + "epoch": 2.806019670577083, + "grad_norm": 0.6459331883257132, + "learning_rate": 1.0175466456213034e-06, + "loss": 1.7773, + "step": 5920 + }, + { + "epoch": 2.8107595686692735, + "grad_norm": 0.6898338914840095, + "learning_rate": 9.683186801025256e-07, + "loss": 1.8417, + "step": 5930 + }, + { + "epoch": 2.8154994667614646, + "grad_norm": 0.6097250867359322, + "learning_rate": 9.202995856131769e-07, + "loss": 1.8076, + "step": 5940 + }, + { + "epoch": 2.8202393648536557, + "grad_norm": 0.6610392263190566, + "learning_rate": 8.734905460590581e-07, + "loss": 1.7511, + "step": 5950 + }, + { + "epoch": 2.824979262945847, + "grad_norm": 0.6070988311686517, + "learning_rate": 8.278927155121851e-07, + "loss": 1.8309, + "step": 5960 + }, + { + "epoch": 2.829719161038038, + "grad_norm": 0.6261583831010433, + "learning_rate": 7.835072181823666e-07, + "loss": 1.8377, + "step": 5970 + }, + { + "epoch": 2.8344590591302286, + "grad_norm": 0.6243423055956993, + "learning_rate": 7.403351483894427e-07, + "loss": 1.7941, + "step": 5980 + }, + { + "epoch": 2.8391989572224197, + "grad_norm": 0.702784469663522, + "learning_rate": 6.983775705363238e-07, + "loss": 1.8042, + "step": 5990 + }, + { + "epoch": 2.8439388553146108, + "grad_norm": 0.5996597981711203, + "learning_rate": 6.576355190827499e-07, + "loss": 1.8512, + "step": 6000 + }, + { + "epoch": 2.848678753406802, + "grad_norm": 0.5539803926109534, + "learning_rate": 6.181099985197947e-07, + "loss": 1.8558, + "step": 6010 + }, + { + "epoch": 2.853418651498993, + "grad_norm": 0.5462268948543724, + "learning_rate": 5.798019833450629e-07, + "loss": 1.7838, + "step": 6020 + }, + { + "epoch": 2.8581585495911837, + "grad_norm": 0.6522918616165346, + "learning_rate": 5.4271241803871e-07, + "loss": 1.8523, + "step": 6030 + }, + { + "epoch": 2.8628984476833748, + "grad_norm": 0.6013569849197028, + "learning_rate": 5.068422170401377e-07, + "loss": 1.8239, + "step": 6040 + }, + { + "epoch": 2.867638345775566, + "grad_norm": 0.6217056805780841, + "learning_rate": 4.72192264725424e-07, + "loss": 1.8316, + "step": 6050 + }, + { + "epoch": 2.872378243867757, + "grad_norm": 0.6047869013985818, + "learning_rate": 4.387634153855791e-07, + "loss": 1.8189, + "step": 6060 + }, + { + "epoch": 2.877118141959948, + "grad_norm": 0.6730414277089524, + "learning_rate": 4.065564932054067e-07, + "loss": 1.7824, + "step": 6070 + }, + { + "epoch": 2.8818580400521387, + "grad_norm": 0.612791047561647, + "learning_rate": 3.755722922432481e-07, + "loss": 1.7867, + "step": 6080 + }, + { + "epoch": 2.88659793814433, + "grad_norm": 0.6615842561782111, + "learning_rate": 3.4581157641137563e-07, + "loss": 1.8359, + "step": 6090 + }, + { + "epoch": 2.891337836236521, + "grad_norm": 0.6358101876016702, + "learning_rate": 3.1727507945714663e-07, + "loss": 1.8628, + "step": 6100 + }, + { + "epoch": 2.896077734328712, + "grad_norm": 0.5951921137175086, + "learning_rate": 2.8996350494495116e-07, + "loss": 1.8516, + "step": 6110 + }, + { + "epoch": 2.900817632420903, + "grad_norm": 0.6310271682459363, + "learning_rate": 2.6387752623883156e-07, + "loss": 1.8437, + "step": 6120 + }, + { + "epoch": 2.905557530513094, + "grad_norm": 0.6305755436522482, + "learning_rate": 2.390177864858956e-07, + "loss": 1.8514, + "step": 6130 + }, + { + "epoch": 2.910297428605285, + "grad_norm": 0.6404150710185624, + "learning_rate": 2.1538489860044587e-07, + "loss": 1.8186, + "step": 6140 + }, + { + "epoch": 2.915037326697476, + "grad_norm": 0.6158013141692098, + "learning_rate": 1.92979445248892e-07, + "loss": 1.8083, + "step": 6150 + }, + { + "epoch": 2.919777224789667, + "grad_norm": 0.6416671093424775, + "learning_rate": 1.7180197883537308e-07, + "loss": 1.7786, + "step": 6160 + }, + { + "epoch": 2.9245171228818583, + "grad_norm": 0.5582605199061633, + "learning_rate": 1.518530214881242e-07, + "loss": 1.7976, + "step": 6170 + }, + { + "epoch": 2.929257020974049, + "grad_norm": 0.6106802327952866, + "learning_rate": 1.3313306504663115e-07, + "loss": 1.7604, + "step": 6180 + }, + { + "epoch": 2.93399691906624, + "grad_norm": 0.649320638486437, + "learning_rate": 1.1564257104947352e-07, + "loss": 1.8441, + "step": 6190 + }, + { + "epoch": 2.938736817158431, + "grad_norm": 0.5884577603080124, + "learning_rate": 9.938197072298372e-08, + "loss": 1.8196, + "step": 6200 + }, + { + "epoch": 2.9434767152506223, + "grad_norm": 0.6392485935256708, + "learning_rate": 8.435166497057222e-08, + "loss": 1.857, + "step": 6210 + }, + { + "epoch": 2.9482166133428134, + "grad_norm": 0.6506401892518179, + "learning_rate": 7.055202436287433e-08, + "loss": 1.7725, + "step": 6220 + }, + { + "epoch": 2.952956511435004, + "grad_norm": 0.6149298488489828, + "learning_rate": 5.7983389128596355e-08, + "loss": 1.8946, + "step": 6230 + }, + { + "epoch": 2.957696409527195, + "grad_norm": 0.5722181216171393, + "learning_rate": 4.664606914615011e-08, + "loss": 1.8542, + "step": 6240 + }, + { + "epoch": 2.9624363076193863, + "grad_norm": 0.6428450313630513, + "learning_rate": 3.654034393598127e-08, + "loss": 1.824, + "step": 6250 + }, + { + "epoch": 2.9671762057115774, + "grad_norm": 0.6329021168786573, + "learning_rate": 2.766646265369155e-08, + "loss": 1.8012, + "step": 6260 + }, + { + "epoch": 2.9719161038037685, + "grad_norm": 0.6406715656233972, + "learning_rate": 2.0024644083921352e-08, + "loss": 1.8472, + "step": 6270 + }, + { + "epoch": 2.976656001895959, + "grad_norm": 0.5842266635593326, + "learning_rate": 1.3615076634898582e-08, + "loss": 1.8102, + "step": 6280 + }, + { + "epoch": 2.9813958999881502, + "grad_norm": 0.6430039656205391, + "learning_rate": 8.437918333864536e-09, + "loss": 1.7935, + "step": 6290 + }, + { + "epoch": 2.9861357980803414, + "grad_norm": 0.6055802510109696, + "learning_rate": 4.493296823104842e-09, + "loss": 1.8425, + "step": 6300 + }, + { + "epoch": 2.990875696172532, + "grad_norm": 0.5757552404684133, + "learning_rate": 1.781309356863048e-09, + "loss": 1.8636, + "step": 6310 + }, + { + "epoch": 2.9956155942647236, + "grad_norm": 0.602338679600079, + "learning_rate": 3.0202279890922947e-10, + "loss": 1.7555, + "step": 6320 + }, + { + "epoch": 2.998933522929257, + "step": 6327, + "total_flos": 3180599149854720.0, + "train_loss": 1.9022130669246677, + "train_runtime": 57110.8809, + "train_samples_per_second": 0.887, + "train_steps_per_second": 0.111 + } + ], + "logging_steps": 10, + "max_steps": 6327, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 3180599149854720.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}