material_cpt_parsing_3epochs / trainer_state.json
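For orientation, a minimal sketch (an assumption, not part of the checkpoint) of how the log_history entries in this file might be loaded and the training loss plotted against the optimizer step, assuming the file has been downloaded locally as trainer_state.json and matplotlib is installed:

import json

import matplotlib.pyplot as plt

# Load the trainer state written by the HF Trainer at the end of training.
with open("trainer_state.json") as f:
    state = json.load(f)

# Each logging entry records epoch, grad_norm, learning_rate, loss, and step.
steps = [e["step"] for e in state["log_history"] if "loss" in e]
losses = [e["loss"] for e in state["log_history"] if "loss" in e]

plt.plot(steps, losses)
plt.xlabel("step")
plt.ylabel("training loss")
plt.title("material_cpt_parsing_3epochs")
plt.show()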
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.998933522929257,
"eval_steps": 500,
"global_step": 6327,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.004739898092191018,
"grad_norm": 0.36863938554596193,
"learning_rate": 9.999938362758687e-05,
"loss": 2.2013,
"step": 10
},
{
"epoch": 0.009479796184382036,
"grad_norm": 0.500081400365978,
"learning_rate": 9.999753452554404e-05,
"loss": 2.0963,
"step": 20
},
{
"epoch": 0.014219694276573054,
"grad_norm": 0.4147328978072978,
"learning_rate": 9.999445273946093e-05,
"loss": 2.1148,
"step": 30
},
{
"epoch": 0.018959592368764072,
"grad_norm": 0.35930434339916095,
"learning_rate": 9.999013834531869e-05,
"loss": 2.1787,
"step": 40
},
{
"epoch": 0.02369949046095509,
"grad_norm": 0.5423960911916873,
"learning_rate": 9.998459144948825e-05,
"loss": 2.1055,
"step": 50
},
{
"epoch": 0.028439388553146108,
"grad_norm": 0.40880961489218204,
"learning_rate": 9.997781218872771e-05,
"loss": 2.1723,
"step": 60
},
{
"epoch": 0.033179286645337126,
"grad_norm": 0.43317175193770346,
"learning_rate": 9.99698007301791e-05,
"loss": 2.0316,
"step": 70
},
{
"epoch": 0.037919184737528144,
"grad_norm": 0.37892996565691084,
"learning_rate": 9.996055727136406e-05,
"loss": 2.0171,
"step": 80
},
{
"epoch": 0.04265908282971916,
"grad_norm": 0.43747110352943336,
"learning_rate": 9.995008204017915e-05,
"loss": 2.0887,
"step": 90
},
{
"epoch": 0.04739898092191018,
"grad_norm": 0.39459537932523525,
"learning_rate": 9.993837529489007e-05,
"loss": 2.03,
"step": 100
},
{
"epoch": 0.0521388790141012,
"grad_norm": 0.3839963642801344,
"learning_rate": 9.992543732412544e-05,
"loss": 1.9306,
"step": 110
},
{
"epoch": 0.056878777106292217,
"grad_norm": 0.40520894461995377,
"learning_rate": 9.99112684468696e-05,
"loss": 2.0425,
"step": 120
},
{
"epoch": 0.061618675198483235,
"grad_norm": 0.390483069303289,
"learning_rate": 9.989586901245472e-05,
"loss": 2.087,
"step": 130
},
{
"epoch": 0.06635857329067425,
"grad_norm": 0.4180445176279502,
"learning_rate": 9.987923940055228e-05,
"loss": 2.0679,
"step": 140
},
{
"epoch": 0.07109847138286526,
"grad_norm": 0.49880430744694115,
"learning_rate": 9.986138002116364e-05,
"loss": 2.0628,
"step": 150
},
{
"epoch": 0.07583836947505629,
"grad_norm": 0.4427166571522091,
"learning_rate": 9.984229131460996e-05,
"loss": 2.0681,
"step": 160
},
{
"epoch": 0.0805782675672473,
"grad_norm": 0.38471776319499607,
"learning_rate": 9.982197375152129e-05,
"loss": 2.0019,
"step": 170
},
{
"epoch": 0.08531816565943832,
"grad_norm": 0.4628072900469101,
"learning_rate": 9.980042783282509e-05,
"loss": 1.9909,
"step": 180
},
{
"epoch": 0.09005806375162934,
"grad_norm": 0.3939907069527393,
"learning_rate": 9.977765408973374e-05,
"loss": 2.0713,
"step": 190
},
{
"epoch": 0.09479796184382036,
"grad_norm": 0.4184843314019155,
"learning_rate": 9.97536530837315e-05,
"loss": 1.9729,
"step": 200
},
{
"epoch": 0.09953785993601137,
"grad_norm": 0.6336861212047761,
"learning_rate": 9.97284254065607e-05,
"loss": 2.0278,
"step": 210
},
{
"epoch": 0.1042777580282024,
"grad_norm": 0.39737439720337403,
"learning_rate": 9.970197168020713e-05,
"loss": 2.0603,
"step": 220
},
{
"epoch": 0.10901765612039341,
"grad_norm": 0.4161581082817388,
"learning_rate": 9.967429255688468e-05,
"loss": 2.0308,
"step": 230
},
{
"epoch": 0.11375755421258443,
"grad_norm": 0.4122381540422074,
"learning_rate": 9.964538871901923e-05,
"loss": 2.1011,
"step": 240
},
{
"epoch": 0.11849745230477544,
"grad_norm": 0.40792411841005016,
"learning_rate": 9.961526087923193e-05,
"loss": 2.0535,
"step": 250
},
{
"epoch": 0.12323735039696647,
"grad_norm": 0.4298298302428991,
"learning_rate": 9.958390978032157e-05,
"loss": 1.9882,
"step": 260
},
{
"epoch": 0.12797724848915748,
"grad_norm": 0.3932772338211237,
"learning_rate": 9.955133619524623e-05,
"loss": 2.0703,
"step": 270
},
{
"epoch": 0.1327171465813485,
"grad_norm": 0.4304879913642714,
"learning_rate": 9.951754092710429e-05,
"loss": 2.0661,
"step": 280
},
{
"epoch": 0.13745704467353953,
"grad_norm": 0.3933942358750948,
"learning_rate": 9.948252480911458e-05,
"loss": 1.9941,
"step": 290
},
{
"epoch": 0.14219694276573053,
"grad_norm": 0.3876898041012675,
"learning_rate": 9.944628870459587e-05,
"loss": 2.001,
"step": 300
},
{
"epoch": 0.14693684085792155,
"grad_norm": 0.39971149840828696,
"learning_rate": 9.940883350694556e-05,
"loss": 1.9889,
"step": 310
},
{
"epoch": 0.15167673895011258,
"grad_norm": 0.4322868983437022,
"learning_rate": 9.93701601396177e-05,
"loss": 2.019,
"step": 320
},
{
"epoch": 0.1564166370423036,
"grad_norm": 0.40679391432223605,
"learning_rate": 9.933026955610014e-05,
"loss": 2.0402,
"step": 330
},
{
"epoch": 0.1611565351344946,
"grad_norm": 0.40265148647862,
"learning_rate": 9.928916273989108e-05,
"loss": 1.9488,
"step": 340
},
{
"epoch": 0.16589643322668562,
"grad_norm": 0.4119893126018702,
"learning_rate": 9.924684070447483e-05,
"loss": 2.0143,
"step": 350
},
{
"epoch": 0.17063633131887665,
"grad_norm": 0.41446853317804655,
"learning_rate": 9.92033044932968e-05,
"loss": 1.9393,
"step": 360
},
{
"epoch": 0.17537622941106767,
"grad_norm": 0.4775440242382454,
"learning_rate": 9.915855517973776e-05,
"loss": 1.9899,
"step": 370
},
{
"epoch": 0.18011612750325867,
"grad_norm": 0.41303403265485017,
"learning_rate": 9.91125938670874e-05,
"loss": 2.0431,
"step": 380
},
{
"epoch": 0.1848560255954497,
"grad_norm": 0.381415505593885,
"learning_rate": 9.906542168851715e-05,
"loss": 1.9778,
"step": 390
},
{
"epoch": 0.18959592368764072,
"grad_norm": 0.45202098843075295,
"learning_rate": 9.901703980705219e-05,
"loss": 2.0098,
"step": 400
},
{
"epoch": 0.19433582177983175,
"grad_norm": 0.38808197740496003,
"learning_rate": 9.896744941554279e-05,
"loss": 1.9467,
"step": 410
},
{
"epoch": 0.19907571987202274,
"grad_norm": 0.40860216072850924,
"learning_rate": 9.891665173663492e-05,
"loss": 2.0267,
"step": 420
},
{
"epoch": 0.20381561796421377,
"grad_norm": 0.4068044305771888,
"learning_rate": 9.886464802274009e-05,
"loss": 2.0872,
"step": 430
},
{
"epoch": 0.2085555160564048,
"grad_norm": 0.43039544158069454,
"learning_rate": 9.88114395560045e-05,
"loss": 2.0094,
"step": 440
},
{
"epoch": 0.21329541414859582,
"grad_norm": 0.37668435282131046,
"learning_rate": 9.875702764827737e-05,
"loss": 2.0032,
"step": 450
},
{
"epoch": 0.21803531224078682,
"grad_norm": 0.4289799607032317,
"learning_rate": 9.87014136410787e-05,
"loss": 1.9535,
"step": 460
},
{
"epoch": 0.22277521033297784,
"grad_norm": 0.416501457655663,
"learning_rate": 9.864459890556604e-05,
"loss": 2.0246,
"step": 470
},
{
"epoch": 0.22751510842516887,
"grad_norm": 0.42709577377722036,
"learning_rate": 9.858658484250082e-05,
"loss": 1.9675,
"step": 480
},
{
"epoch": 0.23225500651735986,
"grad_norm": 0.38491345570315816,
"learning_rate": 9.852737288221378e-05,
"loss": 1.9768,
"step": 490
},
{
"epoch": 0.2369949046095509,
"grad_norm": 0.4331220698731146,
"learning_rate": 9.846696448456967e-05,
"loss": 1.96,
"step": 500
},
{
"epoch": 0.2417348027017419,
"grad_norm": 0.5157356350680703,
"learning_rate": 9.840536113893129e-05,
"loss": 2.0168,
"step": 510
},
{
"epoch": 0.24647470079393294,
"grad_norm": 0.42673885807943607,
"learning_rate": 9.834256436412272e-05,
"loss": 1.9192,
"step": 520
},
{
"epoch": 0.25121459888612396,
"grad_norm": 0.399056341637914,
"learning_rate": 9.827857570839198e-05,
"loss": 2.009,
"step": 530
},
{
"epoch": 0.25595449697831496,
"grad_norm": 0.38514488410609315,
"learning_rate": 9.821339674937274e-05,
"loss": 2.0237,
"step": 540
},
{
"epoch": 0.26069439507050596,
"grad_norm": 0.43535566879213633,
"learning_rate": 9.814702909404547e-05,
"loss": 1.9746,
"step": 550
},
{
"epoch": 0.265434293162697,
"grad_norm": 0.4277848981360601,
"learning_rate": 9.807947437869788e-05,
"loss": 2.0008,
"step": 560
},
{
"epoch": 0.270174191254888,
"grad_norm": 0.42806115487352164,
"learning_rate": 9.801073426888447e-05,
"loss": 2.0819,
"step": 570
},
{
"epoch": 0.27491408934707906,
"grad_norm": 0.36287005859609833,
"learning_rate": 9.794081045938554e-05,
"loss": 2.0256,
"step": 580
},
{
"epoch": 0.27965398743927006,
"grad_norm": 0.467970576527151,
"learning_rate": 9.786970467416538e-05,
"loss": 2.0221,
"step": 590
},
{
"epoch": 0.28439388553146105,
"grad_norm": 0.37993477630266503,
"learning_rate": 9.779741866632977e-05,
"loss": 1.9589,
"step": 600
},
{
"epoch": 0.2891337836236521,
"grad_norm": 0.44198107142469956,
"learning_rate": 9.772395421808274e-05,
"loss": 2.0035,
"step": 610
},
{
"epoch": 0.2938736817158431,
"grad_norm": 0.44573447679188816,
"learning_rate": 9.764931314068267e-05,
"loss": 1.9909,
"step": 620
},
{
"epoch": 0.2986135798080341,
"grad_norm": 0.4731340699659092,
"learning_rate": 9.757349727439759e-05,
"loss": 2.0103,
"step": 630
},
{
"epoch": 0.30335347790022515,
"grad_norm": 0.3963283837850387,
"learning_rate": 9.749650848845984e-05,
"loss": 2.0639,
"step": 640
},
{
"epoch": 0.30809337599241615,
"grad_norm": 0.3884422717238912,
"learning_rate": 9.741834868101998e-05,
"loss": 2.0342,
"step": 650
},
{
"epoch": 0.3128332740846072,
"grad_norm": 0.42096628799860736,
"learning_rate": 9.733901977909997e-05,
"loss": 2.0037,
"step": 660
},
{
"epoch": 0.3175731721767982,
"grad_norm": 0.3922372868315195,
"learning_rate": 9.725852373854568e-05,
"loss": 2.0327,
"step": 670
},
{
"epoch": 0.3223130702689892,
"grad_norm": 0.37724258160489493,
"learning_rate": 9.717686254397866e-05,
"loss": 1.9996,
"step": 680
},
{
"epoch": 0.32705296836118025,
"grad_norm": 0.36849429342184464,
"learning_rate": 9.70940382087472e-05,
"loss": 1.9789,
"step": 690
},
{
"epoch": 0.33179286645337125,
"grad_norm": 0.38001698944458373,
"learning_rate": 9.701005277487673e-05,
"loss": 1.8886,
"step": 700
},
{
"epoch": 0.33653276454556225,
"grad_norm": 0.4434394537121414,
"learning_rate": 9.692490831301944e-05,
"loss": 2.0773,
"step": 710
},
{
"epoch": 0.3412726626377533,
"grad_norm": 0.44409242659624243,
"learning_rate": 9.683860692240321e-05,
"loss": 1.9944,
"step": 720
},
{
"epoch": 0.3460125607299443,
"grad_norm": 0.3706038723114169,
"learning_rate": 9.675115073077989e-05,
"loss": 1.9399,
"step": 730
},
{
"epoch": 0.35075245882213535,
"grad_norm": 0.3775340444246396,
"learning_rate": 9.666254189437286e-05,
"loss": 2.0434,
"step": 740
},
{
"epoch": 0.35549235691432635,
"grad_norm": 0.39740898678838216,
"learning_rate": 9.657278259782378e-05,
"loss": 2.0483,
"step": 750
},
{
"epoch": 0.36023225500651734,
"grad_norm": 0.3856650140837026,
"learning_rate": 9.648187505413886e-05,
"loss": 1.9621,
"step": 760
},
{
"epoch": 0.3649721530987084,
"grad_norm": 0.49084336306431187,
"learning_rate": 9.638982150463415e-05,
"loss": 1.9878,
"step": 770
},
{
"epoch": 0.3697120511908994,
"grad_norm": 0.41318948101107866,
"learning_rate": 9.629662421888039e-05,
"loss": 2.0805,
"step": 780
},
{
"epoch": 0.3744519492830904,
"grad_norm": 0.402590356367594,
"learning_rate": 9.620228549464703e-05,
"loss": 2.0258,
"step": 790
},
{
"epoch": 0.37919184737528144,
"grad_norm": 0.4461694641117838,
"learning_rate": 9.610680765784556e-05,
"loss": 1.9692,
"step": 800
},
{
"epoch": 0.38393174546747244,
"grad_norm": 0.41581795351534184,
"learning_rate": 9.601019306247215e-05,
"loss": 2.022,
"step": 810
},
{
"epoch": 0.3886716435596635,
"grad_norm": 0.4182347418587252,
"learning_rate": 9.591244409054965e-05,
"loss": 1.9989,
"step": 820
},
{
"epoch": 0.3934115416518545,
"grad_norm": 0.36463111311757684,
"learning_rate": 9.581356315206885e-05,
"loss": 2.0483,
"step": 830
},
{
"epoch": 0.3981514397440455,
"grad_norm": 0.4636476781338481,
"learning_rate": 9.571355268492907e-05,
"loss": 1.9491,
"step": 840
},
{
"epoch": 0.40289133783623654,
"grad_norm": 0.43027600259738763,
"learning_rate": 9.561241515487802e-05,
"loss": 1.9423,
"step": 850
},
{
"epoch": 0.40763123592842754,
"grad_norm": 0.43322329785996827,
"learning_rate": 9.551015305545104e-05,
"loss": 1.9349,
"step": 860
},
{
"epoch": 0.41237113402061853,
"grad_norm": 0.3900423005352424,
"learning_rate": 9.540676890790962e-05,
"loss": 1.9571,
"step": 870
},
{
"epoch": 0.4171110321128096,
"grad_norm": 0.3736027589992883,
"learning_rate": 9.53022652611792e-05,
"loss": 2.033,
"step": 880
},
{
"epoch": 0.4218509302050006,
"grad_norm": 0.4412678924097936,
"learning_rate": 9.519664469178638e-05,
"loss": 1.9928,
"step": 890
},
{
"epoch": 0.42659082829719164,
"grad_norm": 0.36064586995797043,
"learning_rate": 9.508990980379537e-05,
"loss": 2.0181,
"step": 900
},
{
"epoch": 0.43133072638938263,
"grad_norm": 0.36982453028008294,
"learning_rate": 9.498206322874381e-05,
"loss": 2.0118,
"step": 910
},
{
"epoch": 0.43607062448157363,
"grad_norm": 0.4936789348648113,
"learning_rate": 9.487310762557784e-05,
"loss": 2.0388,
"step": 920
},
{
"epoch": 0.4408105225737647,
"grad_norm": 0.4192120475618224,
"learning_rate": 9.476304568058657e-05,
"loss": 2.0001,
"step": 930
},
{
"epoch": 0.4455504206659557,
"grad_norm": 0.4212248975591549,
"learning_rate": 9.465188010733586e-05,
"loss": 2.0464,
"step": 940
},
{
"epoch": 0.4502903187581467,
"grad_norm": 0.4111853146435081,
"learning_rate": 9.453961364660143e-05,
"loss": 2.0118,
"step": 950
},
{
"epoch": 0.45503021685033773,
"grad_norm": 0.3911083150496816,
"learning_rate": 9.442624906630124e-05,
"loss": 1.9256,
"step": 960
},
{
"epoch": 0.45977011494252873,
"grad_norm": 0.4275198886604283,
"learning_rate": 9.431178916142731e-05,
"loss": 2.0142,
"step": 970
},
{
"epoch": 0.4645100130347197,
"grad_norm": 0.41213645663674664,
"learning_rate": 9.419623675397672e-05,
"loss": 1.9863,
"step": 980
},
{
"epoch": 0.4692499111269108,
"grad_norm": 0.39744532831875506,
"learning_rate": 9.407959469288214e-05,
"loss": 1.963,
"step": 990
},
{
"epoch": 0.4739898092191018,
"grad_norm": 0.40358506493166846,
"learning_rate": 9.396186585394153e-05,
"loss": 1.9724,
"step": 1000
},
{
"epoch": 0.47872970731129283,
"grad_norm": 0.3715075397009002,
"learning_rate": 9.384305313974719e-05,
"loss": 1.9564,
"step": 1010
},
{
"epoch": 0.4834696054034838,
"grad_norm": 0.41249417731334614,
"learning_rate": 9.372315947961434e-05,
"loss": 2.0089,
"step": 1020
},
{
"epoch": 0.4882095034956748,
"grad_norm": 0.4477075629260475,
"learning_rate": 9.360218782950873e-05,
"loss": 2.0249,
"step": 1030
},
{
"epoch": 0.4929494015878659,
"grad_norm": 0.41335031918044873,
"learning_rate": 9.34801411719739e-05,
"loss": 2.0439,
"step": 1040
},
{
"epoch": 0.4976892996800569,
"grad_norm": 0.4023689824634566,
"learning_rate": 9.335702251605756e-05,
"loss": 2.0278,
"step": 1050
},
{
"epoch": 0.5024291977722479,
"grad_norm": 0.37476123227339486,
"learning_rate": 9.32328348972374e-05,
"loss": 2.0854,
"step": 1060
},
{
"epoch": 0.5071690958644389,
"grad_norm": 0.3680109272331818,
"learning_rate": 9.310758137734634e-05,
"loss": 2.0505,
"step": 1070
},
{
"epoch": 0.5119089939566299,
"grad_norm": 0.47590335433852127,
"learning_rate": 9.298126504449697e-05,
"loss": 1.9342,
"step": 1080
},
{
"epoch": 0.5166488920488209,
"grad_norm": 0.443747158773761,
"learning_rate": 9.285388901300537e-05,
"loss": 2.0338,
"step": 1090
},
{
"epoch": 0.5213887901410119,
"grad_norm": 0.4300619230217585,
"learning_rate": 9.272545642331443e-05,
"loss": 1.9431,
"step": 1100
},
{
"epoch": 0.526128688233203,
"grad_norm": 0.4068927208227842,
"learning_rate": 9.259597044191636e-05,
"loss": 1.9639,
"step": 1110
},
{
"epoch": 0.530868586325394,
"grad_norm": 0.3904780080331756,
"learning_rate": 9.246543426127463e-05,
"loss": 2.044,
"step": 1120
},
{
"epoch": 0.535608484417585,
"grad_norm": 0.4074988084895911,
"learning_rate": 9.233385109974528e-05,
"loss": 1.9209,
"step": 1130
},
{
"epoch": 0.540348382509776,
"grad_norm": 0.48971289458578504,
"learning_rate": 9.220122420149753e-05,
"loss": 1.9405,
"step": 1140
},
{
"epoch": 0.545088280601967,
"grad_norm": 0.4560990819156225,
"learning_rate": 9.206755683643383e-05,
"loss": 1.9754,
"step": 1150
},
{
"epoch": 0.5498281786941581,
"grad_norm": 0.4953771996336736,
"learning_rate": 9.193285230010923e-05,
"loss": 1.9832,
"step": 1160
},
{
"epoch": 0.5545680767863491,
"grad_norm": 0.452270837264993,
"learning_rate": 9.179711391365016e-05,
"loss": 2.0267,
"step": 1170
},
{
"epoch": 0.5593079748785401,
"grad_norm": 0.38839940667413064,
"learning_rate": 9.166034502367246e-05,
"loss": 2.0303,
"step": 1180
},
{
"epoch": 0.5640478729707311,
"grad_norm": 0.4434400621892702,
"learning_rate": 9.152254900219899e-05,
"loss": 2.019,
"step": 1190
},
{
"epoch": 0.5687877710629221,
"grad_norm": 0.4265655972195879,
"learning_rate": 9.138372924657638e-05,
"loss": 1.9578,
"step": 1200
},
{
"epoch": 0.5735276691551132,
"grad_norm": 0.37712073893593084,
"learning_rate": 9.124388917939135e-05,
"loss": 1.9002,
"step": 1210
},
{
"epoch": 0.5782675672473042,
"grad_norm": 0.3967821230664083,
"learning_rate": 9.110303224838628e-05,
"loss": 1.9982,
"step": 1220
},
{
"epoch": 0.5830074653394952,
"grad_norm": 0.4225910574667248,
"learning_rate": 9.096116192637424e-05,
"loss": 1.9999,
"step": 1230
},
{
"epoch": 0.5877473634316862,
"grad_norm": 0.46005143244561764,
"learning_rate": 9.081828171115334e-05,
"loss": 1.9269,
"step": 1240
},
{
"epoch": 0.5924872615238772,
"grad_norm": 0.41650738683050376,
"learning_rate": 9.067439512542048e-05,
"loss": 2.0138,
"step": 1250
},
{
"epoch": 0.5972271596160682,
"grad_norm": 0.4595664788322495,
"learning_rate": 9.052950571668457e-05,
"loss": 1.8902,
"step": 1260
},
{
"epoch": 0.6019670577082593,
"grad_norm": 0.47181766838174233,
"learning_rate": 9.038361705717897e-05,
"loss": 2.0354,
"step": 1270
},
{
"epoch": 0.6067069558004503,
"grad_norm": 0.4016620461236779,
"learning_rate": 9.023673274377349e-05,
"loss": 2.0428,
"step": 1280
},
{
"epoch": 0.6114468538926413,
"grad_norm": 0.44582424551905314,
"learning_rate": 9.00888563978857e-05,
"loss": 1.9205,
"step": 1290
},
{
"epoch": 0.6161867519848323,
"grad_norm": 0.4731092970060822,
"learning_rate": 8.993999166539155e-05,
"loss": 1.9468,
"step": 1300
},
{
"epoch": 0.6209266500770233,
"grad_norm": 0.41403788063445784,
"learning_rate": 8.979014221653569e-05,
"loss": 1.967,
"step": 1310
},
{
"epoch": 0.6256665481692144,
"grad_norm": 0.3824681634104647,
"learning_rate": 8.963931174584072e-05,
"loss": 1.9764,
"step": 1320
},
{
"epoch": 0.6304064462614054,
"grad_norm": 0.3979138111413701,
"learning_rate": 8.94875039720163e-05,
"loss": 2.0262,
"step": 1330
},
{
"epoch": 0.6351463443535964,
"grad_norm": 0.41027150705022153,
"learning_rate": 8.93347226378674e-05,
"loss": 1.9379,
"step": 1340
},
{
"epoch": 0.6398862424457874,
"grad_norm": 0.46333301444068553,
"learning_rate": 8.9180971510202e-05,
"loss": 1.9551,
"step": 1350
},
{
"epoch": 0.6446261405379784,
"grad_norm": 0.39959859369206574,
"learning_rate": 8.902625437973823e-05,
"loss": 1.9199,
"step": 1360
},
{
"epoch": 0.6493660386301695,
"grad_norm": 0.42731835258341894,
"learning_rate": 8.887057506101096e-05,
"loss": 2.0178,
"step": 1370
},
{
"epoch": 0.6541059367223605,
"grad_norm": 0.43891265274307517,
"learning_rate": 8.871393739227764e-05,
"loss": 1.9369,
"step": 1380
},
{
"epoch": 0.6588458348145515,
"grad_norm": 0.4314210574368562,
"learning_rate": 8.855634523542384e-05,
"loss": 2.0049,
"step": 1390
},
{
"epoch": 0.6635857329067425,
"grad_norm": 0.44613138847149775,
"learning_rate": 8.839780247586785e-05,
"loss": 1.9509,
"step": 1400
},
{
"epoch": 0.6683256309989335,
"grad_norm": 0.4379460820834945,
"learning_rate": 8.823831302246498e-05,
"loss": 1.9541,
"step": 1410
},
{
"epoch": 0.6730655290911245,
"grad_norm": 0.3682639471382051,
"learning_rate": 8.807788080741124e-05,
"loss": 2.0064,
"step": 1420
},
{
"epoch": 0.6778054271833156,
"grad_norm": 0.3981445155765943,
"learning_rate": 8.791650978614627e-05,
"loss": 1.9151,
"step": 1430
},
{
"epoch": 0.6825453252755066,
"grad_norm": 0.3868845773205047,
"learning_rate": 8.77542039372559e-05,
"loss": 2.0033,
"step": 1440
},
{
"epoch": 0.6872852233676976,
"grad_norm": 0.4065050795968265,
"learning_rate": 8.759096726237406e-05,
"loss": 1.9333,
"step": 1450
},
{
"epoch": 0.6920251214598886,
"grad_norm": 0.4019451177579478,
"learning_rate": 8.742680378608405e-05,
"loss": 1.9738,
"step": 1460
},
{
"epoch": 0.6967650195520796,
"grad_norm": 0.40929290402886576,
"learning_rate": 8.726171755581943e-05,
"loss": 1.9054,
"step": 1470
},
{
"epoch": 0.7015049176442707,
"grad_norm": 0.4521322208310143,
"learning_rate": 8.709571264176409e-05,
"loss": 2.038,
"step": 1480
},
{
"epoch": 0.7062448157364617,
"grad_norm": 0.4152045328204035,
"learning_rate": 8.692879313675201e-05,
"loss": 2.0632,
"step": 1490
},
{
"epoch": 0.7109847138286527,
"grad_norm": 0.4153887781497306,
"learning_rate": 8.676096315616633e-05,
"loss": 1.9658,
"step": 1500
},
{
"epoch": 0.7157246119208437,
"grad_norm": 0.4421939758182222,
"learning_rate": 8.659222683783785e-05,
"loss": 1.9318,
"step": 1510
},
{
"epoch": 0.7204645100130347,
"grad_norm": 0.40964882006156955,
"learning_rate": 8.642258834194306e-05,
"loss": 1.9843,
"step": 1520
},
{
"epoch": 0.7252044081052257,
"grad_norm": 0.4083908197791484,
"learning_rate": 8.625205185090148e-05,
"loss": 1.9828,
"step": 1530
},
{
"epoch": 0.7299443061974168,
"grad_norm": 0.39713303306109243,
"learning_rate": 8.608062156927267e-05,
"loss": 1.9957,
"step": 1540
},
{
"epoch": 0.7346842042896078,
"grad_norm": 0.3984748196137378,
"learning_rate": 8.59083017236525e-05,
"loss": 1.9756,
"step": 1550
},
{
"epoch": 0.7394241023817988,
"grad_norm": 0.3801131175331665,
"learning_rate": 8.57350965625689e-05,
"loss": 2.0876,
"step": 1560
},
{
"epoch": 0.7441640004739898,
"grad_norm": 0.40526485533564677,
"learning_rate": 8.556101035637723e-05,
"loss": 1.9273,
"step": 1570
},
{
"epoch": 0.7489038985661808,
"grad_norm": 0.43256807999674307,
"learning_rate": 8.538604739715487e-05,
"loss": 1.9965,
"step": 1580
},
{
"epoch": 0.7536437966583719,
"grad_norm": 0.4089571388848955,
"learning_rate": 8.521021199859547e-05,
"loss": 1.9838,
"step": 1590
},
{
"epoch": 0.7583836947505629,
"grad_norm": 0.43989226476544846,
"learning_rate": 8.503350849590261e-05,
"loss": 2.0101,
"step": 1600
},
{
"epoch": 0.7631235928427539,
"grad_norm": 0.4312349465343795,
"learning_rate": 8.485594124568286e-05,
"loss": 2.0024,
"step": 1610
},
{
"epoch": 0.7678634909349449,
"grad_norm": 0.42870468778423404,
"learning_rate": 8.467751462583837e-05,
"loss": 1.9171,
"step": 1620
},
{
"epoch": 0.7726033890271359,
"grad_norm": 0.37297491856173187,
"learning_rate": 8.449823303545902e-05,
"loss": 1.9234,
"step": 1630
},
{
"epoch": 0.777343287119327,
"grad_norm": 0.43903627896277525,
"learning_rate": 8.431810089471386e-05,
"loss": 2.0138,
"step": 1640
},
{
"epoch": 0.782083185211518,
"grad_norm": 0.4356441070614573,
"learning_rate": 8.413712264474218e-05,
"loss": 1.9822,
"step": 1650
},
{
"epoch": 0.786823083303709,
"grad_norm": 0.42844869008890196,
"learning_rate": 8.395530274754401e-05,
"loss": 1.9615,
"step": 1660
},
{
"epoch": 0.7915629813959,
"grad_norm": 0.442280918540681,
"learning_rate": 8.377264568587012e-05,
"loss": 1.9835,
"step": 1670
},
{
"epoch": 0.796302879488091,
"grad_norm": 0.42858220049882395,
"learning_rate": 8.358915596311143e-05,
"loss": 1.9043,
"step": 1680
},
{
"epoch": 0.801042777580282,
"grad_norm": 0.388683268775689,
"learning_rate": 8.340483810318809e-05,
"loss": 2.0451,
"step": 1690
},
{
"epoch": 0.8057826756724731,
"grad_norm": 0.4116698984896444,
"learning_rate": 8.321969665043785e-05,
"loss": 1.9792,
"step": 1700
},
{
"epoch": 0.8105225737646641,
"grad_norm": 0.40384036708963345,
"learning_rate": 8.303373616950408e-05,
"loss": 1.8407,
"step": 1710
},
{
"epoch": 0.8152624718568551,
"grad_norm": 0.4680015183031998,
"learning_rate": 8.28469612452232e-05,
"loss": 1.9616,
"step": 1720
},
{
"epoch": 0.8200023699490461,
"grad_norm": 0.43443236620799985,
"learning_rate": 8.265937648251162e-05,
"loss": 1.9879,
"step": 1730
},
{
"epoch": 0.8247422680412371,
"grad_norm": 0.4892981794701289,
"learning_rate": 8.247098650625229e-05,
"loss": 1.9988,
"step": 1740
},
{
"epoch": 0.8294821661334282,
"grad_norm": 0.41120558715230104,
"learning_rate": 8.228179596118055e-05,
"loss": 2.0057,
"step": 1750
},
{
"epoch": 0.8342220642256192,
"grad_norm": 0.3856884225256909,
"learning_rate": 8.209180951176972e-05,
"loss": 2.0345,
"step": 1760
},
{
"epoch": 0.8389619623178102,
"grad_norm": 0.43262267182183567,
"learning_rate": 8.190103184211606e-05,
"loss": 2.0506,
"step": 1770
},
{
"epoch": 0.8437018604100012,
"grad_norm": 0.46227543956491046,
"learning_rate": 8.170946765582327e-05,
"loss": 1.9537,
"step": 1780
},
{
"epoch": 0.8484417585021922,
"grad_norm": 0.41122944892391,
"learning_rate": 8.151712167588654e-05,
"loss": 1.9481,
"step": 1790
},
{
"epoch": 0.8531816565943833,
"grad_norm": 0.4762971181475547,
"learning_rate": 8.13239986445761e-05,
"loss": 1.969,
"step": 1800
},
{
"epoch": 0.8579215546865743,
"grad_norm": 0.41348450657088276,
"learning_rate": 8.113010332332032e-05,
"loss": 2.0127,
"step": 1810
},
{
"epoch": 0.8626614527787653,
"grad_norm": 0.41355376759860496,
"learning_rate": 8.093544049258826e-05,
"loss": 1.9378,
"step": 1820
},
{
"epoch": 0.8674013508709563,
"grad_norm": 0.4739386141603482,
"learning_rate": 8.074001495177187e-05,
"loss": 1.9548,
"step": 1830
},
{
"epoch": 0.8721412489631473,
"grad_norm": 0.4067937473126016,
"learning_rate": 8.054383151906766e-05,
"loss": 1.9588,
"step": 1840
},
{
"epoch": 0.8768811470553383,
"grad_norm": 0.4603727127637402,
"learning_rate": 8.034689503135783e-05,
"loss": 1.9616,
"step": 1850
},
{
"epoch": 0.8816210451475294,
"grad_norm": 0.404919540874673,
"learning_rate": 8.014921034409115e-05,
"loss": 1.9476,
"step": 1860
},
{
"epoch": 0.8863609432397204,
"grad_norm": 0.39850400899429533,
"learning_rate": 7.99507823311631e-05,
"loss": 1.9603,
"step": 1870
},
{
"epoch": 0.8911008413319114,
"grad_norm": 0.48693274229874695,
"learning_rate": 7.97516158847958e-05,
"loss": 2.0121,
"step": 1880
},
{
"epoch": 0.8958407394241024,
"grad_norm": 0.45401122715232545,
"learning_rate": 7.955171591541739e-05,
"loss": 1.8593,
"step": 1890
},
{
"epoch": 0.9005806375162934,
"grad_norm": 0.38605278944495364,
"learning_rate": 7.935108735154094e-05,
"loss": 1.9199,
"step": 1900
},
{
"epoch": 0.9053205356084845,
"grad_norm": 0.4453838492498413,
"learning_rate": 7.914973513964291e-05,
"loss": 1.9354,
"step": 1910
},
{
"epoch": 0.9100604337006755,
"grad_norm": 0.4123431078009058,
"learning_rate": 7.894766424404126e-05,
"loss": 1.9807,
"step": 1920
},
{
"epoch": 0.9148003317928665,
"grad_norm": 0.43369573713775106,
"learning_rate": 7.874487964677301e-05,
"loss": 1.9707,
"step": 1930
},
{
"epoch": 0.9195402298850575,
"grad_norm": 0.3949770503185179,
"learning_rate": 7.854138634747145e-05,
"loss": 1.9742,
"step": 1940
},
{
"epoch": 0.9242801279772485,
"grad_norm": 0.4224215984268503,
"learning_rate": 7.833718936324277e-05,
"loss": 1.9465,
"step": 1950
},
{
"epoch": 0.9290200260694395,
"grad_norm": 0.5228997588486322,
"learning_rate": 7.813229372854251e-05,
"loss": 1.9454,
"step": 1960
},
{
"epoch": 0.9337599241616306,
"grad_norm": 0.42165180512522465,
"learning_rate": 7.792670449505135e-05,
"loss": 1.9175,
"step": 1970
},
{
"epoch": 0.9384998222538216,
"grad_norm": 0.40378336800384856,
"learning_rate": 7.772042673155055e-05,
"loss": 1.9237,
"step": 1980
},
{
"epoch": 0.9432397203460126,
"grad_norm": 0.45740238886085255,
"learning_rate": 7.751346552379706e-05,
"loss": 1.9752,
"step": 1990
},
{
"epoch": 0.9479796184382036,
"grad_norm": 0.39149703066060726,
"learning_rate": 7.730582597439799e-05,
"loss": 1.98,
"step": 2000
},
{
"epoch": 0.9527195165303946,
"grad_norm": 0.4198989958604622,
"learning_rate": 7.709751320268499e-05,
"loss": 1.9937,
"step": 2010
},
{
"epoch": 0.9574594146225857,
"grad_norm": 0.45036655944797305,
"learning_rate": 7.688853234458786e-05,
"loss": 1.9439,
"step": 2020
},
{
"epoch": 0.9621993127147767,
"grad_norm": 0.47886989965002774,
"learning_rate": 7.667888855250806e-05,
"loss": 1.8984,
"step": 2030
},
{
"epoch": 0.9669392108069677,
"grad_norm": 0.4485436591345206,
"learning_rate": 7.646858699519158e-05,
"loss": 1.9997,
"step": 2040
},
{
"epoch": 0.9716791088991587,
"grad_norm": 0.4089350286618743,
"learning_rate": 7.625763285760154e-05,
"loss": 2.0561,
"step": 2050
},
{
"epoch": 0.9764190069913496,
"grad_norm": 0.5012148973934161,
"learning_rate": 7.604603134079039e-05,
"loss": 1.9108,
"step": 2060
},
{
"epoch": 0.9811589050835408,
"grad_norm": 0.4193397192808331,
"learning_rate": 7.583378766177163e-05,
"loss": 2.0375,
"step": 2070
},
{
"epoch": 0.9858988031757318,
"grad_norm": 0.3996742152514563,
"learning_rate": 7.56209070533912e-05,
"loss": 1.8992,
"step": 2080
},
{
"epoch": 0.9906387012679227,
"grad_norm": 0.43312783729617976,
"learning_rate": 7.540739476419847e-05,
"loss": 2.0202,
"step": 2090
},
{
"epoch": 0.9953785993601137,
"grad_norm": 0.47876561721756805,
"learning_rate": 7.519325605831684e-05,
"loss": 1.9258,
"step": 2100
},
{
"epoch": 1.0001184974523047,
"grad_norm": 0.40845159679128945,
"learning_rate": 7.497849621531396e-05,
"loss": 1.8963,
"step": 2110
},
{
"epoch": 1.0048583955444959,
"grad_norm": 0.4911320886031023,
"learning_rate": 7.476312053007151e-05,
"loss": 1.8763,
"step": 2120
},
{
"epoch": 1.0095982936366867,
"grad_norm": 0.4341191300612264,
"learning_rate": 7.454713431265475e-05,
"loss": 1.9345,
"step": 2130
},
{
"epoch": 1.0143381917288778,
"grad_norm": 0.44526984352662835,
"learning_rate": 7.43305428881815e-05,
"loss": 1.9666,
"step": 2140
},
{
"epoch": 1.019078089821069,
"grad_norm": 0.45021419491727926,
"learning_rate": 7.411335159669093e-05,
"loss": 1.9683,
"step": 2150
},
{
"epoch": 1.0238179879132598,
"grad_norm": 0.46367987121746707,
"learning_rate": 7.389556579301186e-05,
"loss": 1.884,
"step": 2160
},
{
"epoch": 1.028557886005451,
"grad_norm": 0.518631039907863,
"learning_rate": 7.367719084663074e-05,
"loss": 1.8473,
"step": 2170
},
{
"epoch": 1.0332977840976418,
"grad_norm": 0.4686244164357671,
"learning_rate": 7.345823214155927e-05,
"loss": 1.8894,
"step": 2180
},
{
"epoch": 1.038037682189833,
"grad_norm": 0.5124536145999882,
"learning_rate": 7.323869507620169e-05,
"loss": 1.886,
"step": 2190
},
{
"epoch": 1.0427775802820238,
"grad_norm": 0.428865165913033,
"learning_rate": 7.30185850632216e-05,
"loss": 1.8934,
"step": 2200
},
{
"epoch": 1.047517478374215,
"grad_norm": 0.4575909980653946,
"learning_rate": 7.27979075294086e-05,
"loss": 1.8793,
"step": 2210
},
{
"epoch": 1.052257376466406,
"grad_norm": 0.46819042427920937,
"learning_rate": 7.257666791554448e-05,
"loss": 1.9177,
"step": 2220
},
{
"epoch": 1.056997274558597,
"grad_norm": 0.5869490097444697,
"learning_rate": 7.2354871676269e-05,
"loss": 1.8888,
"step": 2230
},
{
"epoch": 1.061737172650788,
"grad_norm": 0.4407701363338049,
"learning_rate": 7.213252427994547e-05,
"loss": 1.9145,
"step": 2240
},
{
"epoch": 1.066477070742979,
"grad_norm": 0.5471189926425418,
"learning_rate": 7.1909631208526e-05,
"loss": 1.8647,
"step": 2250
},
{
"epoch": 1.07121696883517,
"grad_norm": 0.45247580903783674,
"learning_rate": 7.168619795741616e-05,
"loss": 1.8793,
"step": 2260
},
{
"epoch": 1.0759568669273611,
"grad_norm": 0.5394937103937341,
"learning_rate": 7.146223003533964e-05,
"loss": 1.9394,
"step": 2270
},
{
"epoch": 1.080696765019552,
"grad_norm": 0.5010981958648577,
"learning_rate": 7.12377329642024e-05,
"loss": 1.8009,
"step": 2280
},
{
"epoch": 1.0854366631117431,
"grad_norm": 0.49455090224086273,
"learning_rate": 7.101271227895646e-05,
"loss": 1.9877,
"step": 2290
},
{
"epoch": 1.090176561203934,
"grad_norm": 0.4487359249312413,
"learning_rate": 7.07871735274636e-05,
"loss": 1.8578,
"step": 2300
},
{
"epoch": 1.0949164592961251,
"grad_norm": 0.5006725728639967,
"learning_rate": 7.056112227035831e-05,
"loss": 1.9142,
"step": 2310
},
{
"epoch": 1.0996563573883162,
"grad_norm": 0.46840477309344347,
"learning_rate": 7.033456408091103e-05,
"loss": 1.9178,
"step": 2320
},
{
"epoch": 1.1043962554805071,
"grad_norm": 0.44881264282080685,
"learning_rate": 7.010750454489042e-05,
"loss": 1.9011,
"step": 2330
},
{
"epoch": 1.1091361535726982,
"grad_norm": 0.4914874135601711,
"learning_rate": 6.987994926042588e-05,
"loss": 1.8817,
"step": 2340
},
{
"epoch": 1.1138760516648891,
"grad_norm": 0.4875786937414022,
"learning_rate": 6.965190383786938e-05,
"loss": 1.9151,
"step": 2350
},
{
"epoch": 1.1186159497570802,
"grad_norm": 0.47374621253430516,
"learning_rate": 6.942337389965722e-05,
"loss": 1.8652,
"step": 2360
},
{
"epoch": 1.1233558478492713,
"grad_norm": 0.45812614575538185,
"learning_rate": 6.919436508017139e-05,
"loss": 1.9191,
"step": 2370
},
{
"epoch": 1.1280957459414622,
"grad_norm": 0.5233924389852819,
"learning_rate": 6.896488302560062e-05,
"loss": 1.8944,
"step": 2380
},
{
"epoch": 1.1328356440336533,
"grad_norm": 0.4760349705385804,
"learning_rate": 6.873493339380125e-05,
"loss": 1.8896,
"step": 2390
},
{
"epoch": 1.1375755421258442,
"grad_norm": 0.47170548205722757,
"learning_rate": 6.850452185415763e-05,
"loss": 1.8436,
"step": 2400
},
{
"epoch": 1.1423154402180353,
"grad_norm": 0.4742928761569321,
"learning_rate": 6.827365408744244e-05,
"loss": 1.938,
"step": 2410
},
{
"epoch": 1.1470553383102264,
"grad_norm": 0.5423850691494456,
"learning_rate": 6.804233578567658e-05,
"loss": 1.8889,
"step": 2420
},
{
"epoch": 1.1517952364024173,
"grad_norm": 0.48227588856524584,
"learning_rate": 6.781057265198885e-05,
"loss": 1.9094,
"step": 2430
},
{
"epoch": 1.1565351344946084,
"grad_norm": 0.45425361404028264,
"learning_rate": 6.75783704004753e-05,
"loss": 1.859,
"step": 2440
},
{
"epoch": 1.1612750325867993,
"grad_norm": 0.4433613473826934,
"learning_rate": 6.734573475605846e-05,
"loss": 1.9084,
"step": 2450
},
{
"epoch": 1.1660149306789904,
"grad_norm": 0.4943942467439202,
"learning_rate": 6.711267145434603e-05,
"loss": 1.9647,
"step": 2460
},
{
"epoch": 1.1707548287711815,
"grad_norm": 0.4577985217898985,
"learning_rate": 6.687918624148963e-05,
"loss": 1.8903,
"step": 2470
},
{
"epoch": 1.1754947268633724,
"grad_norm": 0.5864019689805202,
"learning_rate": 6.664528487404298e-05,
"loss": 1.8431,
"step": 2480
},
{
"epoch": 1.1802346249555635,
"grad_norm": 0.4979542549244347,
"learning_rate": 6.641097311882015e-05,
"loss": 1.9381,
"step": 2490
},
{
"epoch": 1.1849745230477544,
"grad_norm": 0.5142117151718176,
"learning_rate": 6.617625675275317e-05,
"loss": 1.8608,
"step": 2500
},
{
"epoch": 1.1897144211399455,
"grad_norm": 0.5179927851112526,
"learning_rate": 6.59411415627498e-05,
"loss": 1.9493,
"step": 2510
},
{
"epoch": 1.1944543192321366,
"grad_norm": 0.5221841655224025,
"learning_rate": 6.570563334555068e-05,
"loss": 1.8724,
"step": 2520
},
{
"epoch": 1.1991942173243275,
"grad_norm": 0.4985837837212232,
"learning_rate": 6.546973790758655e-05,
"loss": 1.952,
"step": 2530
},
{
"epoch": 1.2039341154165186,
"grad_norm": 0.5552319456240327,
"learning_rate": 6.523346106483504e-05,
"loss": 1.9397,
"step": 2540
},
{
"epoch": 1.2086740135087095,
"grad_norm": 0.4769628041892156,
"learning_rate": 6.499680864267725e-05,
"loss": 2.0053,
"step": 2550
},
{
"epoch": 1.2134139116009006,
"grad_norm": 0.4516518959319936,
"learning_rate": 6.475978647575416e-05,
"loss": 1.9402,
"step": 2560
},
{
"epoch": 1.2181538096930915,
"grad_norm": 0.4913816447981876,
"learning_rate": 6.452240040782276e-05,
"loss": 1.8451,
"step": 2570
},
{
"epoch": 1.2228937077852826,
"grad_norm": 0.4748765999127487,
"learning_rate": 6.4284656291612e-05,
"loss": 1.9117,
"step": 2580
},
{
"epoch": 1.2276336058774737,
"grad_norm": 0.5114110285568767,
"learning_rate": 6.404655998867848e-05,
"loss": 1.8831,
"step": 2590
},
{
"epoch": 1.2323735039696646,
"grad_norm": 0.47839985560769943,
"learning_rate": 6.380811736926188e-05,
"loss": 1.8627,
"step": 2600
},
{
"epoch": 1.2371134020618557,
"grad_norm": 0.5355232832118345,
"learning_rate": 6.356933431214034e-05,
"loss": 1.9189,
"step": 2610
},
{
"epoch": 1.2418533001540466,
"grad_norm": 0.4895001261750141,
"learning_rate": 6.33302167044854e-05,
"loss": 1.9699,
"step": 2620
},
{
"epoch": 1.2465931982462377,
"grad_norm": 0.4635882938471385,
"learning_rate": 6.309077044171694e-05,
"loss": 1.8779,
"step": 2630
},
{
"epoch": 1.2513330963384286,
"grad_norm": 0.45916609044978873,
"learning_rate": 6.285100142735782e-05,
"loss": 1.8527,
"step": 2640
},
{
"epoch": 1.2560729944306197,
"grad_norm": 0.46784246908879684,
"learning_rate": 6.261091557288826e-05,
"loss": 1.8844,
"step": 2650
},
{
"epoch": 1.2608128925228108,
"grad_norm": 0.5131345820024794,
"learning_rate": 6.237051879760014e-05,
"loss": 1.8402,
"step": 2660
},
{
"epoch": 1.2655527906150017,
"grad_norm": 0.5766279369511716,
"learning_rate": 6.21298170284511e-05,
"loss": 1.8558,
"step": 2670
},
{
"epoch": 1.2702926887071928,
"grad_norm": 0.48863073587665085,
"learning_rate": 6.188881619991834e-05,
"loss": 1.9337,
"step": 2680
},
{
"epoch": 1.2750325867993837,
"grad_norm": 0.5958235159214345,
"learning_rate": 6.164752225385235e-05,
"loss": 1.9018,
"step": 2690
},
{
"epoch": 1.2797724848915748,
"grad_norm": 0.5127854587716114,
"learning_rate": 6.140594113933042e-05,
"loss": 1.928,
"step": 2700
},
{
"epoch": 1.284512382983766,
"grad_norm": 0.4918233056408275,
"learning_rate": 6.116407881250994e-05,
"loss": 1.9623,
"step": 2710
},
{
"epoch": 1.2892522810759568,
"grad_norm": 0.4759408966884228,
"learning_rate": 6.0921941236481505e-05,
"loss": 1.876,
"step": 2720
},
{
"epoch": 1.293992179168148,
"grad_norm": 0.49692255085585224,
"learning_rate": 6.067953438112205e-05,
"loss": 1.871,
"step": 2730
},
{
"epoch": 1.2987320772603388,
"grad_norm": 0.51069268079758,
"learning_rate": 6.043686422294747e-05,
"loss": 1.9503,
"step": 2740
},
{
"epoch": 1.30347197535253,
"grad_norm": 0.4848235028179103,
"learning_rate": 6.019393674496543e-05,
"loss": 1.9636,
"step": 2750
},
{
"epoch": 1.308211873444721,
"grad_norm": 0.7269161906292443,
"learning_rate": 5.995075793652775e-05,
"loss": 1.8818,
"step": 2760
},
{
"epoch": 1.312951771536912,
"grad_norm": 0.46011103384366614,
"learning_rate": 5.9707333793182794e-05,
"loss": 1.9123,
"step": 2770
},
{
"epoch": 1.317691669629103,
"grad_norm": 0.5009880993886451,
"learning_rate": 5.946367031652761e-05,
"loss": 1.9407,
"step": 2780
},
{
"epoch": 1.3224315677212939,
"grad_norm": 0.5049332736921734,
"learning_rate": 5.921977351406004e-05,
"loss": 1.8624,
"step": 2790
},
{
"epoch": 1.327171465813485,
"grad_norm": 0.4984446750273935,
"learning_rate": 5.8975649399030485e-05,
"loss": 1.8407,
"step": 2800
},
{
"epoch": 1.331911363905676,
"grad_norm": 0.5202629992326526,
"learning_rate": 5.873130399029374e-05,
"loss": 1.8723,
"step": 2810
},
{
"epoch": 1.336651261997867,
"grad_norm": 0.57260787674711,
"learning_rate": 5.8486743312160584e-05,
"loss": 1.9077,
"step": 2820
},
{
"epoch": 1.341391160090058,
"grad_norm": 0.47793956835922086,
"learning_rate": 5.824197339424923e-05,
"loss": 1.9855,
"step": 2830
},
{
"epoch": 1.346131058182249,
"grad_norm": 0.4699288477951403,
"learning_rate": 5.799700027133666e-05,
"loss": 1.9131,
"step": 2840
},
{
"epoch": 1.35087095627444,
"grad_norm": 0.504238497502292,
"learning_rate": 5.7751829983209896e-05,
"loss": 1.9438,
"step": 2850
},
{
"epoch": 1.3556108543666312,
"grad_norm": 0.4814570049600418,
"learning_rate": 5.750646857451701e-05,
"loss": 1.9549,
"step": 2860
},
{
"epoch": 1.360350752458822,
"grad_norm": 0.5038793494327912,
"learning_rate": 5.726092209461814e-05,
"loss": 1.9016,
"step": 2870
},
{
"epoch": 1.3650906505510132,
"grad_norm": 0.5240318677978467,
"learning_rate": 5.701519659743636e-05,
"loss": 1.9323,
"step": 2880
},
{
"epoch": 1.369830548643204,
"grad_norm": 0.5135642745972475,
"learning_rate": 5.6769298141308345e-05,
"loss": 1.8633,
"step": 2890
},
{
"epoch": 1.3745704467353952,
"grad_norm": 0.5115968529507217,
"learning_rate": 5.652323278883511e-05,
"loss": 1.8486,
"step": 2900
},
{
"epoch": 1.3793103448275863,
"grad_norm": 0.4973184073827783,
"learning_rate": 5.6277006606732465e-05,
"loss": 1.9067,
"step": 2910
},
{
"epoch": 1.3840502429197772,
"grad_norm": 0.48576803898302945,
"learning_rate": 5.603062566568144e-05,
"loss": 1.9167,
"step": 2920
},
{
"epoch": 1.3887901410119683,
"grad_norm": 0.532613823404453,
"learning_rate": 5.5784096040178624e-05,
"loss": 1.916,
"step": 2930
},
{
"epoch": 1.3935300391041592,
"grad_norm": 0.5402345956070669,
"learning_rate": 5.5537423808386457e-05,
"loss": 1.9193,
"step": 2940
},
{
"epoch": 1.3982699371963503,
"grad_norm": 0.4920153790997806,
"learning_rate": 5.5290615051983276e-05,
"loss": 1.8214,
"step": 2950
},
{
"epoch": 1.4030098352885414,
"grad_norm": 0.5305053717830343,
"learning_rate": 5.504367585601342e-05,
"loss": 1.8724,
"step": 2960
},
{
"epoch": 1.4077497333807323,
"grad_norm": 0.5348665608450567,
"learning_rate": 5.479661230873723e-05,
"loss": 1.9576,
"step": 2970
},
{
"epoch": 1.4124896314729234,
"grad_norm": 0.5212184732874925,
"learning_rate": 5.4549430501480895e-05,
"loss": 1.9409,
"step": 2980
},
{
"epoch": 1.4172295295651143,
"grad_norm": 0.513803010422433,
"learning_rate": 5.43021365284863e-05,
"loss": 1.8691,
"step": 2990
},
{
"epoch": 1.4219694276573054,
"grad_norm": 0.5405731422319697,
"learning_rate": 5.405473648676074e-05,
"loss": 1.9071,
"step": 3000
},
{
"epoch": 1.4267093257494965,
"grad_norm": 0.5828580104321831,
"learning_rate": 5.380723647592668e-05,
"loss": 1.8781,
"step": 3010
},
{
"epoch": 1.4314492238416874,
"grad_norm": 0.4730373307838654,
"learning_rate": 5.3559642598071244e-05,
"loss": 1.9514,
"step": 3020
},
{
"epoch": 1.4361891219338785,
"grad_norm": 0.5098706245647135,
"learning_rate": 5.3311960957595885e-05,
"loss": 1.9019,
"step": 3030
},
{
"epoch": 1.4409290200260694,
"grad_norm": 0.4902558604014986,
"learning_rate": 5.306419766106582e-05,
"loss": 1.8003,
"step": 3040
},
{
"epoch": 1.4456689181182605,
"grad_norm": 0.5662981198334492,
"learning_rate": 5.2816358817059483e-05,
"loss": 1.9584,
"step": 3050
},
{
"epoch": 1.4504088162104516,
"grad_norm": 0.5080795735549143,
"learning_rate": 5.2568450536017946e-05,
"loss": 1.8299,
"step": 3060
},
{
"epoch": 1.4551487143026425,
"grad_norm": 0.4883320170692768,
"learning_rate": 5.23204789300942e-05,
"loss": 1.8948,
"step": 3070
},
{
"epoch": 1.4598886123948336,
"grad_norm": 0.5018665885085004,
"learning_rate": 5.207245011300256e-05,
"loss": 1.9096,
"step": 3080
},
{
"epoch": 1.4646285104870245,
"grad_norm": 0.49985987707909735,
"learning_rate": 5.182437019986781e-05,
"loss": 1.8725,
"step": 3090
},
{
"epoch": 1.4693684085792156,
"grad_norm": 0.5501802725606001,
"learning_rate": 5.157624530707457e-05,
"loss": 1.852,
"step": 3100
},
{
"epoch": 1.4741083066714067,
"grad_norm": 0.5050415458131547,
"learning_rate": 5.132808155211637e-05,
"loss": 1.9234,
"step": 3110
},
{
"epoch": 1.4788482047635976,
"grad_norm": 0.5388328369977669,
"learning_rate": 5.107988505344493e-05,
"loss": 1.8503,
"step": 3120
},
{
"epoch": 1.4835881028557887,
"grad_norm": 0.5294932998067775,
"learning_rate": 5.083166193031924e-05,
"loss": 1.8602,
"step": 3130
},
{
"epoch": 1.4883280009479796,
"grad_norm": 0.5081432892581731,
"learning_rate": 5.058341830265473e-05,
"loss": 1.8916,
"step": 3140
},
{
"epoch": 1.4930678990401707,
"grad_norm": 0.48231454449779565,
"learning_rate": 5.033516029087231e-05,
"loss": 1.9268,
"step": 3150
},
{
"epoch": 1.4978077971323618,
"grad_norm": 0.5031248301603529,
"learning_rate": 5.008689401574762e-05,
"loss": 1.8619,
"step": 3160
},
{
"epoch": 1.5025476952245527,
"grad_norm": 0.48955254310210605,
"learning_rate": 4.983862559825994e-05,
"loss": 1.9342,
"step": 3170
},
{
"epoch": 1.5072875933167436,
"grad_norm": 0.5786990144175583,
"learning_rate": 4.959036115944146e-05,
"loss": 1.9487,
"step": 3180
},
{
"epoch": 1.5120274914089347,
"grad_norm": 0.5204059056090741,
"learning_rate": 4.93421068202262e-05,
"loss": 1.9237,
"step": 3190
},
{
"epoch": 1.5167673895011258,
"grad_norm": 0.5063131987653341,
"learning_rate": 4.909386870129921e-05,
"loss": 1.9752,
"step": 3200
},
{
"epoch": 1.5215072875933169,
"grad_norm": 0.48289993909064316,
"learning_rate": 4.884565292294563e-05,
"loss": 1.8891,
"step": 3210
},
{
"epoch": 1.5262471856855078,
"grad_norm": 0.5172395191973475,
"learning_rate": 4.859746560489979e-05,
"loss": 1.8907,
"step": 3220
},
{
"epoch": 1.5309870837776987,
"grad_norm": 0.4807916914066212,
"learning_rate": 4.834931286619432e-05,
"loss": 1.9074,
"step": 3230
},
{
"epoch": 1.5357269818698898,
"grad_norm": 0.5144939695987174,
"learning_rate": 4.810120082500934e-05,
"loss": 1.8338,
"step": 3240
},
{
"epoch": 1.5404668799620809,
"grad_norm": 0.5199756044880577,
"learning_rate": 4.785313559852156e-05,
"loss": 1.965,
"step": 3250
},
{
"epoch": 1.545206778054272,
"grad_norm": 0.5415928562917922,
"learning_rate": 4.7605123302753433e-05,
"loss": 1.8472,
"step": 3260
},
{
"epoch": 1.5499466761464629,
"grad_norm": 0.5335132590972799,
"learning_rate": 4.735717005242248e-05,
"loss": 1.8558,
"step": 3270
},
{
"epoch": 1.5546865742386538,
"grad_norm": 0.5581108907205053,
"learning_rate": 4.710928196079042e-05,
"loss": 1.8794,
"step": 3280
},
{
"epoch": 1.5594264723308449,
"grad_norm": 0.5335645184315633,
"learning_rate": 4.6861465139512475e-05,
"loss": 1.8271,
"step": 3290
},
{
"epoch": 1.564166370423036,
"grad_norm": 0.5470177997128685,
"learning_rate": 4.661372569848678e-05,
"loss": 1.8935,
"step": 3300
},
{
"epoch": 1.568906268515227,
"grad_norm": 0.5362519757955545,
"learning_rate": 4.636606974570361e-05,
"loss": 1.8072,
"step": 3310
},
{
"epoch": 1.573646166607418,
"grad_norm": 0.6040810957613818,
"learning_rate": 4.611850338709482e-05,
"loss": 1.7864,
"step": 3320
},
{
"epoch": 1.5783860646996088,
"grad_norm": 0.5318403452991018,
"learning_rate": 4.5871032726383386e-05,
"loss": 1.8524,
"step": 3330
},
{
"epoch": 1.5831259627918,
"grad_norm": 0.5512446332300014,
"learning_rate": 4.562366386493286e-05,
"loss": 1.8972,
"step": 3340
},
{
"epoch": 1.587865860883991,
"grad_norm": 0.5083043080271707,
"learning_rate": 4.537640290159688e-05,
"loss": 1.7909,
"step": 3350
},
{
"epoch": 1.5926057589761822,
"grad_norm": 0.516558139348224,
"learning_rate": 4.512925593256895e-05,
"loss": 1.9006,
"step": 3360
},
{
"epoch": 1.597345657068373,
"grad_norm": 0.5406712324925647,
"learning_rate": 4.4882229051232e-05,
"loss": 1.9456,
"step": 3370
},
{
"epoch": 1.602085555160564,
"grad_norm": 0.5537236012465999,
"learning_rate": 4.463532834800825e-05,
"loss": 1.8696,
"step": 3380
},
{
"epoch": 1.606825453252755,
"grad_norm": 0.5501268633544832,
"learning_rate": 4.438855991020896e-05,
"loss": 1.9089,
"step": 3390
},
{
"epoch": 1.6115653513449462,
"grad_norm": 0.5642376324584947,
"learning_rate": 4.414192982188446e-05,
"loss": 1.868,
"step": 3400
},
{
"epoch": 1.616305249437137,
"grad_norm": 0.49603254737837815,
"learning_rate": 4.3895444163674006e-05,
"loss": 1.9261,
"step": 3410
},
{
"epoch": 1.6210451475293282,
"grad_norm": 0.5264212888797052,
"learning_rate": 4.364910901265606e-05,
"loss": 1.9271,
"step": 3420
},
{
"epoch": 1.625785045621519,
"grad_norm": 0.5165427594444576,
"learning_rate": 4.340293044219825e-05,
"loss": 1.8798,
"step": 3430
},
{
"epoch": 1.6305249437137102,
"grad_norm": 0.5111756681074762,
"learning_rate": 4.315691452180777e-05,
"loss": 1.8821,
"step": 3440
},
{
"epoch": 1.6352648418059013,
"grad_norm": 0.5353729238490614,
"learning_rate": 4.2911067316981656e-05,
"loss": 1.9193,
"step": 3450
},
{
"epoch": 1.6400047398980921,
"grad_norm": 0.5427362289483532,
"learning_rate": 4.2665394889057325e-05,
"loss": 1.8648,
"step": 3460
},
{
"epoch": 1.6447446379902833,
"grad_norm": 0.5316532712452083,
"learning_rate": 4.2419903295063045e-05,
"loss": 1.8696,
"step": 3470
},
{
"epoch": 1.6494845360824741,
"grad_norm": 0.5445515739019248,
"learning_rate": 4.2174598587568706e-05,
"loss": 1.7773,
"step": 3480
},
{
"epoch": 1.6542244341746652,
"grad_norm": 0.515985891781636,
"learning_rate": 4.192948681453645e-05,
"loss": 1.9528,
"step": 3490
},
{
"epoch": 1.6589643322668564,
"grad_norm": 0.533497568011406,
"learning_rate": 4.168457401917169e-05,
"loss": 1.9089,
"step": 3500
},
{
"epoch": 1.6637042303590472,
"grad_norm": 0.5034380410666982,
"learning_rate": 4.1439866239774065e-05,
"loss": 1.902,
"step": 3510
},
{
"epoch": 1.6684441284512384,
"grad_norm": 0.5008886693586585,
"learning_rate": 4.119536950958853e-05,
"loss": 1.8597,
"step": 3520
},
{
"epoch": 1.6731840265434292,
"grad_norm": 0.5042866133180605,
"learning_rate": 4.095108985665668e-05,
"loss": 1.941,
"step": 3530
},
{
"epoch": 1.6779239246356203,
"grad_norm": 0.4894456961892347,
"learning_rate": 4.070703330366809e-05,
"loss": 1.8749,
"step": 3540
},
{
"epoch": 1.6826638227278115,
"grad_norm": 0.5304927617260963,
"learning_rate": 4.0463205867811834e-05,
"loss": 1.9169,
"step": 3550
},
{
"epoch": 1.6874037208200023,
"grad_norm": 0.5192399220515885,
"learning_rate": 4.0219613560628074e-05,
"loss": 1.8853,
"step": 3560
},
{
"epoch": 1.6921436189121932,
"grad_norm": 0.5436581114459818,
"learning_rate": 3.997626238785997e-05,
"loss": 1.9093,
"step": 3570
},
{
"epoch": 1.6968835170043843,
"grad_norm": 0.5671093634463978,
"learning_rate": 3.973315834930549e-05,
"loss": 1.8667,
"step": 3580
},
{
"epoch": 1.7016234150965754,
"grad_norm": 0.5505401718757482,
"learning_rate": 3.949030743866955e-05,
"loss": 1.8701,
"step": 3590
},
{
"epoch": 1.7063633131887666,
"grad_norm": 0.5107784655812311,
"learning_rate": 3.924771564341621e-05,
"loss": 1.8796,
"step": 3600
},
{
"epoch": 1.7111032112809574,
"grad_norm": 0.5123424894974382,
"learning_rate": 3.900538894462112e-05,
"loss": 1.9345,
"step": 3610
},
{
"epoch": 1.7158431093731483,
"grad_norm": 0.5975803333556319,
"learning_rate": 3.876333331682394e-05,
"loss": 1.9071,
"step": 3620
},
{
"epoch": 1.7205830074653394,
"grad_norm": 0.5607215795184285,
"learning_rate": 3.8521554727881115e-05,
"loss": 1.8444,
"step": 3630
},
{
"epoch": 1.7253229055575305,
"grad_norm": 0.5812681320546813,
"learning_rate": 3.828005913881876e-05,
"loss": 1.8783,
"step": 3640
},
{
"epoch": 1.7300628036497216,
"grad_norm": 0.5809996822930421,
"learning_rate": 3.803885250368562e-05,
"loss": 1.8667,
"step": 3650
},
{
"epoch": 1.7348027017419125,
"grad_norm": 0.5264379258394054,
"learning_rate": 3.7797940769406324e-05,
"loss": 1.8832,
"step": 3660
},
{
"epoch": 1.7395425998341034,
"grad_norm": 0.5452547674401557,
"learning_rate": 3.755732987563476e-05,
"loss": 1.9126,
"step": 3670
},
{
"epoch": 1.7442824979262945,
"grad_norm": 0.5573756045226962,
"learning_rate": 3.731702575460763e-05,
"loss": 1.9267,
"step": 3680
},
{
"epoch": 1.7490223960184856,
"grad_norm": 0.5891329270301621,
"learning_rate": 3.707703433099815e-05,
"loss": 1.8927,
"step": 3690
},
{
"epoch": 1.7537622941106767,
"grad_norm": 0.5379354015536967,
"learning_rate": 3.683736152177005e-05,
"loss": 1.8829,
"step": 3700
},
{
"epoch": 1.7585021922028676,
"grad_norm": 0.584902744080287,
"learning_rate": 3.659801323603163e-05,
"loss": 1.9032,
"step": 3710
},
{
"epoch": 1.7632420902950585,
"grad_norm": 0.47271945766863005,
"learning_rate": 3.63589953748901e-05,
"loss": 1.8634,
"step": 3720
},
{
"epoch": 1.7679819883872496,
"grad_norm": 0.5602358756096469,
"learning_rate": 3.612031383130612e-05,
"loss": 1.8436,
"step": 3730
},
{
"epoch": 1.7727218864794407,
"grad_norm": 0.5171084893952771,
"learning_rate": 3.5881974489948456e-05,
"loss": 1.8279,
"step": 3740
},
{
"epoch": 1.7774617845716318,
"grad_norm": 0.5085114117110985,
"learning_rate": 3.564398322704887e-05,
"loss": 1.8842,
"step": 3750
},
{
"epoch": 1.7822016826638227,
"grad_norm": 0.5395255555244833,
"learning_rate": 3.5406345910257346e-05,
"loss": 1.8974,
"step": 3760
},
{
"epoch": 1.7869415807560136,
"grad_norm": 0.5256917642696852,
"learning_rate": 3.5169068398497344e-05,
"loss": 1.9247,
"step": 3770
},
{
"epoch": 1.7916814788482047,
"grad_norm": 0.5297510632715654,
"learning_rate": 3.493215654182134e-05,
"loss": 1.8941,
"step": 3780
},
{
"epoch": 1.7964213769403958,
"grad_norm": 0.4887292770108947,
"learning_rate": 3.4695616181266674e-05,
"loss": 1.8662,
"step": 3790
},
{
"epoch": 1.801161275032587,
"grad_norm": 0.605286928037954,
"learning_rate": 3.445945314871144e-05,
"loss": 1.7946,
"step": 3800
},
{
"epoch": 1.8059011731247778,
"grad_norm": 0.5534598174424521,
"learning_rate": 3.422367326673079e-05,
"loss": 1.9319,
"step": 3810
},
{
"epoch": 1.8106410712169687,
"grad_norm": 0.516541325820194,
"learning_rate": 3.398828234845331e-05,
"loss": 1.9102,
"step": 3820
},
{
"epoch": 1.8153809693091598,
"grad_norm": 0.5316375380294128,
"learning_rate": 3.3753286197417714e-05,
"loss": 1.9137,
"step": 3830
},
{
"epoch": 1.820120867401351,
"grad_norm": 0.5048711282201915,
"learning_rate": 3.3518690607429784e-05,
"loss": 1.8643,
"step": 3840
},
{
"epoch": 1.824860765493542,
"grad_norm": 0.5407400572506997,
"learning_rate": 3.3284501362419566e-05,
"loss": 1.8524,
"step": 3850
},
{
"epoch": 1.829600663585733,
"grad_norm": 0.5444240928370307,
"learning_rate": 3.305072423629862e-05,
"loss": 1.9604,
"step": 3860
},
{
"epoch": 1.8343405616779238,
"grad_norm": 0.5259735881080222,
"learning_rate": 3.281736499281783e-05,
"loss": 1.8699,
"step": 3870
},
{
"epoch": 1.839080459770115,
"grad_norm": 0.5412391021904834,
"learning_rate": 3.2584429385425163e-05,
"loss": 1.9233,
"step": 3880
},
{
"epoch": 1.843820357862306,
"grad_norm": 0.581528749881215,
"learning_rate": 3.235192315712394e-05,
"loss": 1.9037,
"step": 3890
},
{
"epoch": 1.8485602559544971,
"grad_norm": 0.486599214527775,
"learning_rate": 3.211985204033114e-05,
"loss": 1.881,
"step": 3900
},
{
"epoch": 1.853300154046688,
"grad_norm": 0.5732281840924196,
"learning_rate": 3.188822175673618e-05,
"loss": 1.9289,
"step": 3910
},
{
"epoch": 1.858040052138879,
"grad_norm": 0.5393218742500727,
"learning_rate": 3.165703801715969e-05,
"loss": 1.8178,
"step": 3920
},
{
"epoch": 1.86277995023107,
"grad_norm": 0.5317421200650526,
"learning_rate": 3.142630652141286e-05,
"loss": 1.7813,
"step": 3930
},
{
"epoch": 1.8675198483232611,
"grad_norm": 0.4707578563318653,
"learning_rate": 3.119603295815685e-05,
"loss": 1.8928,
"step": 3940
},
{
"epoch": 1.8722597464154522,
"grad_norm": 0.503217338566424,
"learning_rate": 3.096622300476253e-05,
"loss": 1.9702,
"step": 3950
},
{
"epoch": 1.8769996445076431,
"grad_norm": 0.5191335631232252,
"learning_rate": 3.07368823271705e-05,
"loss": 1.8832,
"step": 3960
},
{
"epoch": 1.881739542599834,
"grad_norm": 0.5929718795388419,
"learning_rate": 3.050801657975147e-05,
"loss": 1.9705,
"step": 3970
},
{
"epoch": 1.8864794406920251,
"grad_norm": 0.5203449537199084,
"learning_rate": 3.0279631405166754e-05,
"loss": 1.8005,
"step": 3980
},
{
"epoch": 1.8912193387842162,
"grad_norm": 0.6060740003713215,
"learning_rate": 3.0051732434229184e-05,
"loss": 1.8802,
"step": 3990
},
{
"epoch": 1.895959236876407,
"grad_norm": 0.5254251326665124,
"learning_rate": 2.9824325285764332e-05,
"loss": 1.9063,
"step": 4000
},
{
"epoch": 1.9006991349685982,
"grad_norm": 0.5412654814841995,
"learning_rate": 2.9597415566471874e-05,
"loss": 1.7974,
"step": 4010
},
{
"epoch": 1.905439033060789,
"grad_norm": 0.6096977687423671,
"learning_rate": 2.9371008870787474e-05,
"loss": 1.8789,
"step": 4020
},
{
"epoch": 1.9101789311529802,
"grad_norm": 0.5751076752952912,
"learning_rate": 2.914511078074481e-05,
"loss": 1.9147,
"step": 4030
},
{
"epoch": 1.9149188292451713,
"grad_norm": 0.5596872085857021,
"learning_rate": 2.891972686583791e-05,
"loss": 1.8939,
"step": 4040
},
{
"epoch": 1.9196587273373622,
"grad_norm": 0.5205001238706851,
"learning_rate": 2.8694862682883866e-05,
"loss": 1.8675,
"step": 4050
},
{
"epoch": 1.9243986254295533,
"grad_norm": 0.6060966652232279,
"learning_rate": 2.8470523775885816e-05,
"loss": 1.8542,
"step": 4060
},
{
"epoch": 1.9291385235217442,
"grad_norm": 0.5060927602134601,
"learning_rate": 2.824671567589635e-05,
"loss": 1.9095,
"step": 4070
},
{
"epoch": 1.9338784216139353,
"grad_norm": 0.527071756794979,
"learning_rate": 2.8023443900880984e-05,
"loss": 1.8144,
"step": 4080
},
{
"epoch": 1.9386183197061264,
"grad_norm": 0.6186591144971271,
"learning_rate": 2.780071395558222e-05,
"loss": 1.9328,
"step": 4090
},
{
"epoch": 1.9433582177983173,
"grad_norm": 0.5084958011646354,
"learning_rate": 2.757853133138382e-05,
"loss": 1.8292,
"step": 4100
},
{
"epoch": 1.9480981158905084,
"grad_norm": 0.5671058444452819,
"learning_rate": 2.7356901506175426e-05,
"loss": 1.8621,
"step": 4110
},
{
"epoch": 1.9528380139826993,
"grad_norm": 0.6077250993929268,
"learning_rate": 2.7135829944217406e-05,
"loss": 1.8969,
"step": 4120
},
{
"epoch": 1.9575779120748904,
"grad_norm": 0.5478709269890887,
"learning_rate": 2.6915322096006244e-05,
"loss": 1.9648,
"step": 4130
},
{
"epoch": 1.9623178101670815,
"grad_norm": 0.5304846907499281,
"learning_rate": 2.6695383398140155e-05,
"loss": 1.8867,
"step": 4140
},
{
"epoch": 1.9670577082592724,
"grad_norm": 0.5084950385451593,
"learning_rate": 2.6476019273184938e-05,
"loss": 1.8987,
"step": 4150
},
{
"epoch": 1.9717976063514633,
"grad_norm": 0.5881914443826771,
"learning_rate": 2.6257235129540424e-05,
"loss": 1.8718,
"step": 4160
},
{
"epoch": 1.9765375044436544,
"grad_norm": 0.5557425542971698,
"learning_rate": 2.603903636130701e-05,
"loss": 1.8204,
"step": 4170
},
{
"epoch": 1.9812774025358455,
"grad_norm": 0.5235298330164154,
"learning_rate": 2.5821428348152788e-05,
"loss": 1.915,
"step": 4180
},
{
"epoch": 1.9860173006280366,
"grad_norm": 0.6107709148392828,
"learning_rate": 2.560441645518078e-05,
"loss": 1.8223,
"step": 4190
},
{
"epoch": 1.9907571987202275,
"grad_norm": 0.5614697856069703,
"learning_rate": 2.538800603279673e-05,
"loss": 1.8439,
"step": 4200
},
{
"epoch": 1.9954970968124184,
"grad_norm": 0.5563269995130558,
"learning_rate": 2.5172202416577236e-05,
"loss": 1.8982,
"step": 4210
},
{
"epoch": 2.0002369949046095,
"grad_norm": 0.5673849628756762,
"learning_rate": 2.4957010927138136e-05,
"loss": 1.8956,
"step": 4220
},
{
"epoch": 2.0049768929968006,
"grad_norm": 0.5274159605663582,
"learning_rate": 2.4742436870003326e-05,
"loss": 1.8572,
"step": 4230
},
{
"epoch": 2.0097167910889917,
"grad_norm": 0.5388999304024686,
"learning_rate": 2.452848553547396e-05,
"loss": 1.8441,
"step": 4240
},
{
"epoch": 2.014456689181183,
"grad_norm": 0.5715679686982497,
"learning_rate": 2.431516219849809e-05,
"loss": 1.838,
"step": 4250
},
{
"epoch": 2.0191965872733735,
"grad_norm": 0.5795119843431206,
"learning_rate": 2.4102472118540487e-05,
"loss": 1.8329,
"step": 4260
},
{
"epoch": 2.0239364853655646,
"grad_norm": 0.5503184533431318,
"learning_rate": 2.3890420539453057e-05,
"loss": 1.8733,
"step": 4270
},
{
"epoch": 2.0286763834577557,
"grad_norm": 0.54871121092008,
"learning_rate": 2.3679012689345558e-05,
"loss": 1.8601,
"step": 4280
},
{
"epoch": 2.033416281549947,
"grad_norm": 0.5879797146794722,
"learning_rate": 2.3468253780456678e-05,
"loss": 1.7751,
"step": 4290
},
{
"epoch": 2.038156179642138,
"grad_norm": 0.5510154682184406,
"learning_rate": 2.3258149009025482e-05,
"loss": 1.827,
"step": 4300
},
{
"epoch": 2.0428960777343286,
"grad_norm": 0.513792181350148,
"learning_rate": 2.3048703555163357e-05,
"loss": 1.8474,
"step": 4310
},
{
"epoch": 2.0476359758265197,
"grad_norm": 0.5489219942664323,
"learning_rate": 2.2839922582726336e-05,
"loss": 1.8862,
"step": 4320
},
{
"epoch": 2.052375873918711,
"grad_norm": 0.6504687065880719,
"learning_rate": 2.2631811239187646e-05,
"loss": 1.7984,
"step": 4330
},
{
"epoch": 2.057115772010902,
"grad_norm": 0.6130904570523673,
"learning_rate": 2.2424374655510965e-05,
"loss": 1.7921,
"step": 4340
},
{
"epoch": 2.0618556701030926,
"grad_norm": 0.6408124203446663,
"learning_rate": 2.2217617946023765e-05,
"loss": 1.8592,
"step": 4350
},
{
"epoch": 2.0665955681952837,
"grad_norm": 0.6181447797115482,
"learning_rate": 2.201154620829137e-05,
"loss": 1.8067,
"step": 4360
},
{
"epoch": 2.071335466287475,
"grad_norm": 0.5627617017019729,
"learning_rate": 2.1806164522991118e-05,
"loss": 1.7701,
"step": 4370
},
{
"epoch": 2.076075364379666,
"grad_norm": 0.5510540438192786,
"learning_rate": 2.1601477953787214e-05,
"loss": 1.857,
"step": 4380
},
{
"epoch": 2.080815262471857,
"grad_norm": 0.6083237779423979,
"learning_rate": 2.1397491547205807e-05,
"loss": 1.7601,
"step": 4390
},
{
"epoch": 2.0855551605640477,
"grad_norm": 0.6047311337345246,
"learning_rate": 2.119421033251071e-05,
"loss": 1.8347,
"step": 4400
},
{
"epoch": 2.0902950586562388,
"grad_norm": 0.5662369508712475,
"learning_rate": 2.0991639321579214e-05,
"loss": 1.8545,
"step": 4410
},
{
"epoch": 2.09503495674843,
"grad_norm": 0.5935079368512177,
"learning_rate": 2.078978350877862e-05,
"loss": 1.879,
"step": 4420
},
{
"epoch": 2.099774854840621,
"grad_norm": 0.571586984028468,
"learning_rate": 2.058864787084309e-05,
"loss": 1.7671,
"step": 4430
},
{
"epoch": 2.104514752932812,
"grad_norm": 0.5682037137995106,
"learning_rate": 2.0388237366751006e-05,
"loss": 1.865,
"step": 4440
},
{
"epoch": 2.1092546510250028,
"grad_norm": 0.5490908649638305,
"learning_rate": 2.018855693760257e-05,
"loss": 1.78,
"step": 4450
},
{
"epoch": 2.113994549117194,
"grad_norm": 0.6176356249016943,
"learning_rate": 1.998961150649814e-05,
"loss": 1.8435,
"step": 4460
},
{
"epoch": 2.118734447209385,
"grad_norm": 0.5319868348925916,
"learning_rate": 1.9791405978416694e-05,
"loss": 1.8981,
"step": 4470
},
{
"epoch": 2.123474345301576,
"grad_norm": 0.5752723871436735,
"learning_rate": 1.9593945240095052e-05,
"loss": 1.7755,
"step": 4480
},
{
"epoch": 2.128214243393767,
"grad_norm": 0.6366681694521167,
"learning_rate": 1.9397234159907275e-05,
"loss": 1.8707,
"step": 4490
},
{
"epoch": 2.132954141485958,
"grad_norm": 0.5901487974014347,
"learning_rate": 1.920127758774466e-05,
"loss": 1.8256,
"step": 4500
},
{
"epoch": 2.137694039578149,
"grad_norm": 0.5888105104943471,
"learning_rate": 1.9006080354896267e-05,
"loss": 1.8357,
"step": 4510
},
{
"epoch": 2.14243393767034,
"grad_norm": 0.5878169661429707,
"learning_rate": 1.8811647273929628e-05,
"loss": 1.8241,
"step": 4520
},
{
"epoch": 2.147173835762531,
"grad_norm": 0.5581948418607748,
"learning_rate": 1.8617983138572277e-05,
"loss": 1.848,
"step": 4530
},
{
"epoch": 2.1519137338547223,
"grad_norm": 0.6137321662868356,
"learning_rate": 1.8425092723593395e-05,
"loss": 1.78,
"step": 4540
},
{
"epoch": 2.156653631946913,
"grad_norm": 0.558081495592443,
"learning_rate": 1.823298078468624e-05,
"loss": 1.8153,
"step": 4550
},
{
"epoch": 2.161393530039104,
"grad_norm": 0.6039625325723422,
"learning_rate": 1.8041652058350767e-05,
"loss": 1.8416,
"step": 4560
},
{
"epoch": 2.166133428131295,
"grad_norm": 0.6295821331128388,
"learning_rate": 1.785111126177691e-05,
"loss": 1.7953,
"step": 4570
},
{
"epoch": 2.1708733262234863,
"grad_norm": 0.5911527371211652,
"learning_rate": 1.7661363092728307e-05,
"loss": 1.7851,
"step": 4580
},
{
"epoch": 2.1756132243156774,
"grad_norm": 0.565852777352692,
"learning_rate": 1.7472412229426455e-05,
"loss": 1.8101,
"step": 4590
},
{
"epoch": 2.180353122407868,
"grad_norm": 0.5656454600563583,
"learning_rate": 1.7284263330435317e-05,
"loss": 1.917,
"step": 4600
},
{
"epoch": 2.185093020500059,
"grad_norm": 0.6035646498858932,
"learning_rate": 1.709692103454651e-05,
"loss": 1.8168,
"step": 4610
},
{
"epoch": 2.1898329185922503,
"grad_norm": 0.5477939270708279,
"learning_rate": 1.6910389960664992e-05,
"loss": 1.777,
"step": 4620
},
{
"epoch": 2.1945728166844414,
"grad_norm": 0.5898939001383526,
"learning_rate": 1.672467470769507e-05,
"loss": 1.7575,
"step": 4630
},
{
"epoch": 2.1993127147766325,
"grad_norm": 0.544798273283213,
"learning_rate": 1.6539779854427074e-05,
"loss": 1.8834,
"step": 4640
},
{
"epoch": 2.204052612868823,
"grad_norm": 0.610618761949142,
"learning_rate": 1.6355709959424487e-05,
"loss": 1.8785,
"step": 4650
},
{
"epoch": 2.2087925109610143,
"grad_norm": 0.6064522176814057,
"learning_rate": 1.6172469560911553e-05,
"loss": 1.7854,
"step": 4660
},
{
"epoch": 2.2135324090532054,
"grad_norm": 0.6022849345976745,
"learning_rate": 1.599006317666131e-05,
"loss": 1.8497,
"step": 4670
},
{
"epoch": 2.2182723071453965,
"grad_norm": 0.5926151325695663,
"learning_rate": 1.5808495303884297e-05,
"loss": 1.8184,
"step": 4680
},
{
"epoch": 2.2230122052375876,
"grad_norm": 0.5740462281531319,
"learning_rate": 1.562777041911761e-05,
"loss": 1.8073,
"step": 4690
},
{
"epoch": 2.2277521033297782,
"grad_norm": 0.595274030679382,
"learning_rate": 1.5447892978114592e-05,
"loss": 1.8095,
"step": 4700
},
{
"epoch": 2.2324920014219694,
"grad_norm": 0.5805561493774153,
"learning_rate": 1.526886741573496e-05,
"loss": 1.7907,
"step": 4710
},
{
"epoch": 2.2372318995141605,
"grad_norm": 0.6585750772533296,
"learning_rate": 1.5090698145835413e-05,
"loss": 1.8081,
"step": 4720
},
{
"epoch": 2.2419717976063516,
"grad_norm": 0.7616121844460758,
"learning_rate": 1.491338956116085e-05,
"loss": 1.8571,
"step": 4730
},
{
"epoch": 2.2467116956985427,
"grad_norm": 0.6037559488690589,
"learning_rate": 1.473694603323611e-05,
"loss": 1.8194,
"step": 4740
},
{
"epoch": 2.2514515937907333,
"grad_norm": 0.6412117105060221,
"learning_rate": 1.4561371912258098e-05,
"loss": 1.7447,
"step": 4750
},
{
"epoch": 2.2561914918829244,
"grad_norm": 0.6178165307415238,
"learning_rate": 1.4386671526988593e-05,
"loss": 1.8047,
"step": 4760
},
{
"epoch": 2.2609313899751156,
"grad_norm": 0.5887211775830831,
"learning_rate": 1.421284918464752e-05,
"loss": 1.8309,
"step": 4770
},
{
"epoch": 2.2656712880673067,
"grad_norm": 0.6715832023904247,
"learning_rate": 1.4039909170806764e-05,
"loss": 1.7598,
"step": 4780
},
{
"epoch": 2.2704111861594978,
"grad_norm": 0.5565711226911474,
"learning_rate": 1.386785574928446e-05,
"loss": 1.8042,
"step": 4790
},
{
"epoch": 2.2751510842516884,
"grad_norm": 1.0370061435438975,
"learning_rate": 1.3696693162039893e-05,
"loss": 1.8418,
"step": 4800
},
{
"epoch": 2.2798909823438795,
"grad_norm": 0.619379427966442,
"learning_rate": 1.3526425629068967e-05,
"loss": 1.8709,
"step": 4810
},
{
"epoch": 2.2846308804360707,
"grad_norm": 0.6181820044240368,
"learning_rate": 1.3357057348300067e-05,
"loss": 1.8222,
"step": 4820
},
{
"epoch": 2.2893707785282618,
"grad_norm": 0.6447967865409838,
"learning_rate": 1.318859249549066e-05,
"loss": 1.8183,
"step": 4830
},
{
"epoch": 2.294110676620453,
"grad_norm": 0.6058171204419526,
"learning_rate": 1.3021035224124224e-05,
"loss": 1.805,
"step": 4840
},
{
"epoch": 2.2988505747126435,
"grad_norm": 0.5434323398332925,
"learning_rate": 1.2854389665307975e-05,
"loss": 1.7541,
"step": 4850
},
{
"epoch": 2.3035904728048346,
"grad_norm": 0.6113667985824829,
"learning_rate": 1.2688659927670915e-05,
"loss": 1.758,
"step": 4860
},
{
"epoch": 2.3083303708970258,
"grad_norm": 0.5720767875706882,
"learning_rate": 1.2523850097262563e-05,
"loss": 1.8322,
"step": 4870
},
{
"epoch": 2.313070268989217,
"grad_norm": 0.5628951626795141,
"learning_rate": 1.2359964237452238e-05,
"loss": 1.7798,
"step": 4880
},
{
"epoch": 2.317810167081408,
"grad_norm": 0.6094150987430762,
"learning_rate": 1.219700638882888e-05,
"loss": 1.7842,
"step": 4890
},
{
"epoch": 2.3225500651735986,
"grad_norm": 0.6036779282592939,
"learning_rate": 1.2034980569101367e-05,
"loss": 1.8383,
"step": 4900
},
{
"epoch": 2.3272899632657897,
"grad_norm": 0.6175747345768624,
"learning_rate": 1.1873890772999502e-05,
"loss": 1.9046,
"step": 4910
},
{
"epoch": 2.332029861357981,
"grad_norm": 0.5564649373869762,
"learning_rate": 1.1713740972175574e-05,
"loss": 1.8104,
"step": 4920
},
{
"epoch": 2.336769759450172,
"grad_norm": 0.6441404862225901,
"learning_rate": 1.155453511510633e-05,
"loss": 1.7864,
"step": 4930
},
{
"epoch": 2.341509657542363,
"grad_norm": 0.6927623121031959,
"learning_rate": 1.1396277126995707e-05,
"loss": 1.829,
"step": 4940
},
{
"epoch": 2.3462495556345537,
"grad_norm": 0.6537904475611329,
"learning_rate": 1.1238970909677993e-05,
"loss": 1.8655,
"step": 4950
},
{
"epoch": 2.350989453726745,
"grad_norm": 0.5779494171909159,
"learning_rate": 1.1082620341521766e-05,
"loss": 1.7482,
"step": 4960
},
{
"epoch": 2.355729351818936,
"grad_norm": 0.6161830958900923,
"learning_rate": 1.0927229277334061e-05,
"loss": 1.7789,
"step": 4970
},
{
"epoch": 2.360469249911127,
"grad_norm": 0.5946038603032194,
"learning_rate": 1.0772801548265498e-05,
"loss": 1.8189,
"step": 4980
},
{
"epoch": 2.365209148003318,
"grad_norm": 0.6072288944056834,
"learning_rate": 1.0619340961715746e-05,
"loss": 1.8588,
"step": 4990
},
{
"epoch": 2.369949046095509,
"grad_norm": 0.5882805952028816,
"learning_rate": 1.0466851301239711e-05,
"loss": 1.8238,
"step": 5000
},
{
"epoch": 2.3746889441877,
"grad_norm": 0.6288910196539964,
"learning_rate": 1.0315336326454161e-05,
"loss": 1.7055,
"step": 5010
},
{
"epoch": 2.379428842279891,
"grad_norm": 0.6043835236662759,
"learning_rate": 1.0164799772945149e-05,
"loss": 1.8134,
"step": 5020
},
{
"epoch": 2.384168740372082,
"grad_norm": 0.5821262142704368,
"learning_rate": 1.0015245352175811e-05,
"loss": 1.797,
"step": 5030
},
{
"epoch": 2.3889086384642733,
"grad_norm": 0.6369667143877562,
"learning_rate": 9.866676751394927e-06,
"loss": 1.8199,
"step": 5040
},
{
"epoch": 2.393648536556464,
"grad_norm": 0.5924507902566707,
"learning_rate": 9.719097633545975e-06,
"loss": 1.8524,
"step": 5050
},
{
"epoch": 2.398388434648655,
"grad_norm": 0.5762513665027686,
"learning_rate": 9.572511637176811e-06,
"loss": 1.8428,
"step": 5060
},
{
"epoch": 2.403128332740846,
"grad_norm": 0.5799149040724592,
"learning_rate": 9.426922376350028e-06,
"loss": 1.8463,
"step": 5070
},
{
"epoch": 2.4078682308330372,
"grad_norm": 0.5898000658332848,
"learning_rate": 9.282333440553804e-06,
"loss": 1.7772,
"step": 5080
},
{
"epoch": 2.4126081289252284,
"grad_norm": 0.5967206158269678,
"learning_rate": 9.13874839461336e-06,
"loss": 1.8234,
"step": 5090
},
{
"epoch": 2.417348027017419,
"grad_norm": 0.6245591569289297,
"learning_rate": 8.996170778603153e-06,
"loss": 1.8047,
"step": 5100
},
{
"epoch": 2.42208792510961,
"grad_norm": 0.5981945344970201,
"learning_rate": 8.854604107759568e-06,
"loss": 1.8429,
"step": 5110
},
{
"epoch": 2.4268278232018012,
"grad_norm": 0.6112665064763977,
"learning_rate": 8.714051872394213e-06,
"loss": 1.7746,
"step": 5120
},
{
"epoch": 2.4315677212939923,
"grad_norm": 0.5847743009358597,
"learning_rate": 8.574517537807897e-06,
"loss": 1.7703,
"step": 5130
},
{
"epoch": 2.436307619386183,
"grad_norm": 0.5617053604855574,
"learning_rate": 8.436004544205217e-06,
"loss": 1.8498,
"step": 5140
},
{
"epoch": 2.441047517478374,
"grad_norm": 0.5947168640425712,
"learning_rate": 8.2985163066097e-06,
"loss": 1.8439,
"step": 5150
},
{
"epoch": 2.4457874155705652,
"grad_norm": 0.6456439652584188,
"learning_rate": 8.162056214779618e-06,
"loss": 1.8125,
"step": 5160
},
{
"epoch": 2.4505273136627563,
"grad_norm": 0.6053385247801931,
"learning_rate": 8.02662763312439e-06,
"loss": 1.8193,
"step": 5170
},
{
"epoch": 2.4552672117549474,
"grad_norm": 0.6364991896683941,
"learning_rate": 7.89223390062172e-06,
"loss": 1.8081,
"step": 5180
},
{
"epoch": 2.460007109847138,
"grad_norm": 0.630663938586301,
"learning_rate": 7.758878330735142e-06,
"loss": 1.8317,
"step": 5190
},
{
"epoch": 2.464747007939329,
"grad_norm": 0.6625585293729884,
"learning_rate": 7.626564211332465e-06,
"loss": 1.7914,
"step": 5200
},
{
"epoch": 2.4694869060315203,
"grad_norm": 0.6132933711832741,
"learning_rate": 7.49529480460458e-06,
"loss": 1.8072,
"step": 5210
},
{
"epoch": 2.4742268041237114,
"grad_norm": 0.6723366054843423,
"learning_rate": 7.3650733469851574e-06,
"loss": 1.8693,
"step": 5220
},
{
"epoch": 2.4789667022159025,
"grad_norm": 0.5948715205500895,
"learning_rate": 7.235903049070742e-06,
"loss": 1.7441,
"step": 5230
},
{
"epoch": 2.483706600308093,
"grad_norm": 0.602660875671921,
"learning_rate": 7.1077870955416685e-06,
"loss": 1.8301,
"step": 5240
},
{
"epoch": 2.4884464984002843,
"grad_norm": 0.6657860629895173,
"learning_rate": 6.98072864508349e-06,
"loss": 1.7357,
"step": 5250
},
{
"epoch": 2.4931863964924754,
"grad_norm": 0.6400301583474429,
"learning_rate": 6.854730830309203e-06,
"loss": 1.8309,
"step": 5260
},
{
"epoch": 2.4979262945846665,
"grad_norm": 0.6519457597490862,
"learning_rate": 6.729796757681861e-06,
"loss": 1.8622,
"step": 5270
},
{
"epoch": 2.502666192676857,
"grad_norm": 0.6018425213466797,
"learning_rate": 6.605929507438108e-06,
"loss": 1.8124,
"step": 5280
},
{
"epoch": 2.5074060907690483,
"grad_norm": 0.6356535657958864,
"learning_rate": 6.4831321335121706e-06,
"loss": 1.8493,
"step": 5290
},
{
"epoch": 2.5121459888612394,
"grad_norm": 0.5933711757944313,
"learning_rate": 6.361407663460612e-06,
"loss": 1.8152,
"step": 5300
},
{
"epoch": 2.5168858869534305,
"grad_norm": 0.6176252282132866,
"learning_rate": 6.240759098387628e-06,
"loss": 1.7796,
"step": 5310
},
{
"epoch": 2.5216257850456216,
"grad_norm": 0.6035543936375999,
"learning_rate": 6.12118941287112e-06,
"loss": 1.8072,
"step": 5320
},
{
"epoch": 2.5263656831378123,
"grad_norm": 0.6423602506797493,
"learning_rate": 6.002701554889306e-06,
"loss": 1.8894,
"step": 5330
},
{
"epoch": 2.5311055812300034,
"grad_norm": 0.6166718860982423,
"learning_rate": 5.885298445748072e-06,
"loss": 1.8476,
"step": 5340
},
{
"epoch": 2.5358454793221945,
"grad_norm": 0.6250486214392823,
"learning_rate": 5.768982980008924e-06,
"loss": 1.8044,
"step": 5350
},
{
"epoch": 2.5405853774143856,
"grad_norm": 0.6409013217160432,
"learning_rate": 5.653758025417616e-06,
"loss": 1.7732,
"step": 5360
},
{
"epoch": 2.5453252755065767,
"grad_norm": 0.5853729101352203,
"learning_rate": 5.5396264228335e-06,
"loss": 1.816,
"step": 5370
},
{
"epoch": 2.5500651735987674,
"grad_norm": 0.6674717253505213,
"learning_rate": 5.42659098615943e-06,
"loss": 1.828,
"step": 5380
},
{
"epoch": 2.5548050716909585,
"grad_norm": 0.6079460431124653,
"learning_rate": 5.314654502272393e-06,
"loss": 1.8305,
"step": 5390
},
{
"epoch": 2.5595449697831496,
"grad_norm": 0.6132271739956523,
"learning_rate": 5.203819730954806e-06,
"loss": 1.9389,
"step": 5400
},
{
"epoch": 2.5642848678753407,
"grad_norm": 0.6412964569520792,
"learning_rate": 5.094089404826513e-06,
"loss": 1.8878,
"step": 5410
},
{
"epoch": 2.569024765967532,
"grad_norm": 0.6314773808659059,
"learning_rate": 4.985466229277331e-06,
"loss": 1.7996,
"step": 5420
},
{
"epoch": 2.5737646640597225,
"grad_norm": 0.6019377364178156,
"learning_rate": 4.877952882400411e-06,
"loss": 1.8326,
"step": 5430
},
{
"epoch": 2.5785045621519136,
"grad_norm": 0.6375177888153616,
"learning_rate": 4.771552014926206e-06,
"loss": 1.8313,
"step": 5440
},
{
"epoch": 2.5832444602441047,
"grad_norm": 0.6184290636855982,
"learning_rate": 4.666266250157097e-06,
"loss": 1.8408,
"step": 5450
},
{
"epoch": 2.587984358336296,
"grad_norm": 0.6145812896553856,
"learning_rate": 4.562098183902713e-06,
"loss": 1.7928,
"step": 5460
},
{
"epoch": 2.592724256428487,
"grad_norm": 0.5863286484938057,
"learning_rate": 4.459050384415941e-06,
"loss": 1.7671,
"step": 5470
},
{
"epoch": 2.5974641545206776,
"grad_norm": 0.5908385265300592,
"learning_rate": 4.357125392329636e-06,
"loss": 1.8528,
"step": 5480
},
{
"epoch": 2.6022040526128687,
"grad_norm": 0.6315835702501038,
"learning_rate": 4.256325720593912e-06,
"loss": 1.8952,
"step": 5490
},
{
"epoch": 2.60694395070506,
"grad_norm": 0.5905062832031487,
"learning_rate": 4.15665385441425e-06,
"loss": 1.8604,
"step": 5500
},
{
"epoch": 2.611683848797251,
"grad_norm": 0.568727331363524,
"learning_rate": 4.0581122511901934e-06,
"loss": 1.8351,
"step": 5510
},
{
"epoch": 2.616423746889442,
"grad_norm": 0.6400621125560388,
"learning_rate": 3.960703340454791e-06,
"loss": 1.857,
"step": 5520
},
{
"epoch": 2.6211636449816327,
"grad_norm": 0.6844853412168999,
"learning_rate": 3.864429523814644e-06,
"loss": 1.8371,
"step": 5530
},
{
"epoch": 2.625903543073824,
"grad_norm": 0.6040727492768455,
"learning_rate": 3.7692931748907425e-06,
"loss": 1.8582,
"step": 5540
},
{
"epoch": 2.630643441166015,
"grad_norm": 0.6488970700922259,
"learning_rate": 3.675296639259912e-06,
"loss": 1.8466,
"step": 5550
},
{
"epoch": 2.635383339258206,
"grad_norm": 0.606860701135619,
"learning_rate": 3.5824422343970267e-06,
"loss": 1.8823,
"step": 5560
},
{
"epoch": 2.640123237350397,
"grad_norm": 0.6107041616886252,
"learning_rate": 3.4907322496178397e-06,
"loss": 1.7635,
"step": 5570
},
{
"epoch": 2.6448631354425878,
"grad_norm": 0.6205661299793865,
"learning_rate": 3.4001689460225195e-06,
"loss": 1.7604,
"step": 5580
},
{
"epoch": 2.649603033534779,
"grad_norm": 0.6114908815089501,
"learning_rate": 3.3107545564399434e-06,
"loss": 1.8452,
"step": 5590
},
{
"epoch": 2.65434293162697,
"grad_norm": 0.621202845423754,
"learning_rate": 3.2224912853726476e-06,
"loss": 1.8557,
"step": 5600
},
{
"epoch": 2.659082829719161,
"grad_norm": 0.6376438148340446,
"learning_rate": 3.1353813089424424e-06,
"loss": 1.8295,
"step": 5610
},
{
"epoch": 2.663822727811352,
"grad_norm": 0.6085163299666503,
"learning_rate": 3.0494267748367723e-06,
"loss": 1.7302,
"step": 5620
},
{
"epoch": 2.668562625903543,
"grad_norm": 0.6330680248898437,
"learning_rate": 2.9646298022557915e-06,
"loss": 1.7756,
"step": 5630
},
{
"epoch": 2.673302523995734,
"grad_norm": 0.6575109357986112,
"learning_rate": 2.8809924818600952e-06,
"loss": 1.7728,
"step": 5640
},
{
"epoch": 2.678042422087925,
"grad_norm": 0.5972530598708538,
"learning_rate": 2.7985168757191482e-06,
"loss": 1.7927,
"step": 5650
},
{
"epoch": 2.682782320180116,
"grad_norm": 0.6505229836146454,
"learning_rate": 2.7172050172604824e-06,
"loss": 1.768,
"step": 5660
},
{
"epoch": 2.6875222182723073,
"grad_norm": 0.6339702452986381,
"learning_rate": 2.63705891121957e-06,
"loss": 1.7756,
"step": 5670
},
{
"epoch": 2.692262116364498,
"grad_norm": 0.6729168831182509,
"learning_rate": 2.5580805335903457e-06,
"loss": 1.8363,
"step": 5680
},
{
"epoch": 2.697002014456689,
"grad_norm": 0.6421591660117998,
"learning_rate": 2.4802718315765527e-06,
"loss": 1.7585,
"step": 5690
},
{
"epoch": 2.70174191254888,
"grad_norm": 0.5993295713871896,
"learning_rate": 2.403634723543674e-06,
"loss": 1.8379,
"step": 5700
},
{
"epoch": 2.7064818106410713,
"grad_norm": 0.5931932390101198,
"learning_rate": 2.3281710989716933e-06,
"loss": 1.8127,
"step": 5710
},
{
"epoch": 2.7112217087332624,
"grad_norm": 0.6007499215207198,
"learning_rate": 2.2538828184084595e-06,
"loss": 1.7643,
"step": 5720
},
{
"epoch": 2.715961606825453,
"grad_norm": 0.6294360874753062,
"learning_rate": 2.1807717134238347e-06,
"loss": 1.8007,
"step": 5730
},
{
"epoch": 2.720701504917644,
"grad_norm": 0.6305932589800126,
"learning_rate": 2.1088395865645537e-06,
"loss": 1.802,
"step": 5740
},
{
"epoch": 2.7254414030098353,
"grad_norm": 0.6091954631732173,
"learning_rate": 2.038088211309769e-06,
"loss": 1.7978,
"step": 5750
},
{
"epoch": 2.7301813011020264,
"grad_norm": 0.6353525285344948,
"learning_rate": 1.968519332027302e-06,
"loss": 1.8641,
"step": 5760
},
{
"epoch": 2.7349211991942175,
"grad_norm": 0.5869911293052614,
"learning_rate": 1.9001346639306805e-06,
"loss": 1.876,
"step": 5770
},
{
"epoch": 2.739661097286408,
"grad_norm": 0.6462140073621514,
"learning_rate": 1.8329358930368245e-06,
"loss": 1.7947,
"step": 5780
},
{
"epoch": 2.7444009953785993,
"grad_norm": 0.6298906028352366,
"learning_rate": 1.7669246761244763e-06,
"loss": 1.7983,
"step": 5790
},
{
"epoch": 2.7491408934707904,
"grad_norm": 0.6351921002703318,
"learning_rate": 1.7021026406933427e-06,
"loss": 1.7563,
"step": 5800
},
{
"epoch": 2.7538807915629815,
"grad_norm": 0.6081707137727146,
"learning_rate": 1.638471384924012e-06,
"loss": 1.8005,
"step": 5810
},
{
"epoch": 2.7586206896551726,
"grad_norm": 0.6527854672102444,
"learning_rate": 1.5760324776385171e-06,
"loss": 1.8228,
"step": 5820
},
{
"epoch": 2.7633605877473633,
"grad_norm": 0.6207692422398574,
"learning_rate": 1.5147874582616518e-06,
"loss": 1.8751,
"step": 5830
},
{
"epoch": 2.7681004858395544,
"grad_norm": 0.6078351786970941,
"learning_rate": 1.4547378367830267e-06,
"loss": 1.854,
"step": 5840
},
{
"epoch": 2.7728403839317455,
"grad_norm": 0.5914179875660134,
"learning_rate": 1.3958850937198453e-06,
"loss": 1.8771,
"step": 5850
},
{
"epoch": 2.7775802820239366,
"grad_norm": 0.6150352638939602,
"learning_rate": 1.3382306800804045e-06,
"loss": 1.7422,
"step": 5860
},
{
"epoch": 2.7823201801161277,
"grad_norm": 0.6205091178728268,
"learning_rate": 1.2817760173282954e-06,
"loss": 1.8005,
"step": 5870
},
{
"epoch": 2.7870600782083184,
"grad_norm": 0.6352299718478237,
"learning_rate": 1.2265224973474042e-06,
"loss": 1.7703,
"step": 5880
},
{
"epoch": 2.7917999763005095,
"grad_norm": 0.6466624089179797,
"learning_rate": 1.1724714824075333e-06,
"loss": 1.8315,
"step": 5890
},
{
"epoch": 2.7965398743927006,
"grad_norm": 0.5968151491811187,
"learning_rate": 1.1196243051308787e-06,
"loss": 1.9011,
"step": 5900
},
{
"epoch": 2.8012797724848917,
"grad_norm": 0.6310690230989541,
"learning_rate": 1.0679822684591112e-06,
"loss": 1.8434,
"step": 5910
},
{
"epoch": 2.806019670577083,
"grad_norm": 0.6459331883257132,
"learning_rate": 1.0175466456213034e-06,
"loss": 1.7773,
"step": 5920
},
{
"epoch": 2.8107595686692735,
"grad_norm": 0.6898338914840095,
"learning_rate": 9.683186801025256e-07,
"loss": 1.8417,
"step": 5930
},
{
"epoch": 2.8154994667614646,
"grad_norm": 0.6097250867359322,
"learning_rate": 9.202995856131769e-07,
"loss": 1.8076,
"step": 5940
},
{
"epoch": 2.8202393648536557,
"grad_norm": 0.6610392263190566,
"learning_rate": 8.734905460590581e-07,
"loss": 1.7511,
"step": 5950
},
{
"epoch": 2.824979262945847,
"grad_norm": 0.6070988311686517,
"learning_rate": 8.278927155121851e-07,
"loss": 1.8309,
"step": 5960
},
{
"epoch": 2.829719161038038,
"grad_norm": 0.6261583831010433,
"learning_rate": 7.835072181823666e-07,
"loss": 1.8377,
"step": 5970
},
{
"epoch": 2.8344590591302286,
"grad_norm": 0.6243423055956993,
"learning_rate": 7.403351483894427e-07,
"loss": 1.7941,
"step": 5980
},
{
"epoch": 2.8391989572224197,
"grad_norm": 0.702784469663522,
"learning_rate": 6.983775705363238e-07,
"loss": 1.8042,
"step": 5990
},
{
"epoch": 2.8439388553146108,
"grad_norm": 0.5996597981711203,
"learning_rate": 6.576355190827499e-07,
"loss": 1.8512,
"step": 6000
},
{
"epoch": 2.848678753406802,
"grad_norm": 0.5539803926109534,
"learning_rate": 6.181099985197947e-07,
"loss": 1.8558,
"step": 6010
},
{
"epoch": 2.853418651498993,
"grad_norm": 0.5462268948543724,
"learning_rate": 5.798019833450629e-07,
"loss": 1.7838,
"step": 6020
},
{
"epoch": 2.8581585495911837,
"grad_norm": 0.6522918616165346,
"learning_rate": 5.4271241803871e-07,
"loss": 1.8523,
"step": 6030
},
{
"epoch": 2.8628984476833748,
"grad_norm": 0.6013569849197028,
"learning_rate": 5.068422170401377e-07,
"loss": 1.8239,
"step": 6040
},
{
"epoch": 2.867638345775566,
"grad_norm": 0.6217056805780841,
"learning_rate": 4.72192264725424e-07,
"loss": 1.8316,
"step": 6050
},
{
"epoch": 2.872378243867757,
"grad_norm": 0.6047869013985818,
"learning_rate": 4.387634153855791e-07,
"loss": 1.8189,
"step": 6060
},
{
"epoch": 2.877118141959948,
"grad_norm": 0.6730414277089524,
"learning_rate": 4.065564932054067e-07,
"loss": 1.7824,
"step": 6070
},
{
"epoch": 2.8818580400521387,
"grad_norm": 0.612791047561647,
"learning_rate": 3.755722922432481e-07,
"loss": 1.7867,
"step": 6080
},
{
"epoch": 2.88659793814433,
"grad_norm": 0.6615842561782111,
"learning_rate": 3.4581157641137563e-07,
"loss": 1.8359,
"step": 6090
},
{
"epoch": 2.891337836236521,
"grad_norm": 0.6358101876016702,
"learning_rate": 3.1727507945714663e-07,
"loss": 1.8628,
"step": 6100
},
{
"epoch": 2.896077734328712,
"grad_norm": 0.5951921137175086,
"learning_rate": 2.8996350494495116e-07,
"loss": 1.8516,
"step": 6110
},
{
"epoch": 2.900817632420903,
"grad_norm": 0.6310271682459363,
"learning_rate": 2.6387752623883156e-07,
"loss": 1.8437,
"step": 6120
},
{
"epoch": 2.905557530513094,
"grad_norm": 0.6305755436522482,
"learning_rate": 2.390177864858956e-07,
"loss": 1.8514,
"step": 6130
},
{
"epoch": 2.910297428605285,
"grad_norm": 0.6404150710185624,
"learning_rate": 2.1538489860044587e-07,
"loss": 1.8186,
"step": 6140
},
{
"epoch": 2.915037326697476,
"grad_norm": 0.6158013141692098,
"learning_rate": 1.92979445248892e-07,
"loss": 1.8083,
"step": 6150
},
{
"epoch": 2.919777224789667,
"grad_norm": 0.6416671093424775,
"learning_rate": 1.7180197883537308e-07,
"loss": 1.7786,
"step": 6160
},
{
"epoch": 2.9245171228818583,
"grad_norm": 0.5582605199061633,
"learning_rate": 1.518530214881242e-07,
"loss": 1.7976,
"step": 6170
},
{
"epoch": 2.929257020974049,
"grad_norm": 0.6106802327952866,
"learning_rate": 1.3313306504663115e-07,
"loss": 1.7604,
"step": 6180
},
{
"epoch": 2.93399691906624,
"grad_norm": 0.649320638486437,
"learning_rate": 1.1564257104947352e-07,
"loss": 1.8441,
"step": 6190
},
{
"epoch": 2.938736817158431,
"grad_norm": 0.5884577603080124,
"learning_rate": 9.938197072298372e-08,
"loss": 1.8196,
"step": 6200
},
{
"epoch": 2.9434767152506223,
"grad_norm": 0.6392485935256708,
"learning_rate": 8.435166497057222e-08,
"loss": 1.857,
"step": 6210
},
{
"epoch": 2.9482166133428134,
"grad_norm": 0.6506401892518179,
"learning_rate": 7.055202436287433e-08,
"loss": 1.7725,
"step": 6220
},
{
"epoch": 2.952956511435004,
"grad_norm": 0.6149298488489828,
"learning_rate": 5.7983389128596355e-08,
"loss": 1.8946,
"step": 6230
},
{
"epoch": 2.957696409527195,
"grad_norm": 0.5722181216171393,
"learning_rate": 4.664606914615011e-08,
"loss": 1.8542,
"step": 6240
},
{
"epoch": 2.9624363076193863,
"grad_norm": 0.6428450313630513,
"learning_rate": 3.654034393598127e-08,
"loss": 1.824,
"step": 6250
},
{
"epoch": 2.9671762057115774,
"grad_norm": 0.6329021168786573,
"learning_rate": 2.766646265369155e-08,
"loss": 1.8012,
"step": 6260
},
{
"epoch": 2.9719161038037685,
"grad_norm": 0.6406715656233972,
"learning_rate": 2.0024644083921352e-08,
"loss": 1.8472,
"step": 6270
},
{
"epoch": 2.976656001895959,
"grad_norm": 0.5842266635593326,
"learning_rate": 1.3615076634898582e-08,
"loss": 1.8102,
"step": 6280
},
{
"epoch": 2.9813958999881502,
"grad_norm": 0.6430039656205391,
"learning_rate": 8.437918333864536e-09,
"loss": 1.7935,
"step": 6290
},
{
"epoch": 2.9861357980803414,
"grad_norm": 0.6055802510109696,
"learning_rate": 4.493296823104842e-09,
"loss": 1.8425,
"step": 6300
},
{
"epoch": 2.990875696172532,
"grad_norm": 0.5757552404684133,
"learning_rate": 1.781309356863048e-09,
"loss": 1.8636,
"step": 6310
},
{
"epoch": 2.9956155942647236,
"grad_norm": 0.602338679600079,
"learning_rate": 3.0202279890922947e-10,
"loss": 1.7555,
"step": 6320
},
{
"epoch": 2.998933522929257,
"step": 6327,
"total_flos": 3180599149854720.0,
"train_loss": 1.9022130669246677,
"train_runtime": 57110.8809,
"train_samples_per_second": 0.887,
"train_steps_per_second": 0.111
}
],
"logging_steps": 10,
"max_steps": 6327,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3180599149854720.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}