Qwen2VL-72B-sft-v1 / trainer_state.json
Sicong's picture
Add files using upload-large-folder tool
0286882 verified
raw
history blame
54.5 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 3089,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0032372936225315637,
"grad_norm": 5.914970298041148,
"learning_rate": 3.2362459546925565e-07,
"loss": 0.8345,
"step": 10
},
{
"epoch": 0.006474587245063127,
"grad_norm": 3.7232256971173268,
"learning_rate": 6.472491909385113e-07,
"loss": 0.7733,
"step": 20
},
{
"epoch": 0.009711880867594691,
"grad_norm": 2.7946325934816634,
"learning_rate": 9.70873786407767e-07,
"loss": 0.6634,
"step": 30
},
{
"epoch": 0.012949174490126255,
"grad_norm": 3.6676755875340263,
"learning_rate": 1.2944983818770226e-06,
"loss": 0.6075,
"step": 40
},
{
"epoch": 0.01618646811265782,
"grad_norm": 2.4485205913142916,
"learning_rate": 1.6181229773462783e-06,
"loss": 0.5663,
"step": 50
},
{
"epoch": 0.019423761735189383,
"grad_norm": 2.4316459196101996,
"learning_rate": 1.941747572815534e-06,
"loss": 0.5943,
"step": 60
},
{
"epoch": 0.022661055357720946,
"grad_norm": 2.3044316553901125,
"learning_rate": 2.26537216828479e-06,
"loss": 0.5652,
"step": 70
},
{
"epoch": 0.02589834898025251,
"grad_norm": 2.9385172928902823,
"learning_rate": 2.588996763754045e-06,
"loss": 0.5634,
"step": 80
},
{
"epoch": 0.029135642602784072,
"grad_norm": 2.221999891073394,
"learning_rate": 2.912621359223301e-06,
"loss": 0.5387,
"step": 90
},
{
"epoch": 0.03237293622531564,
"grad_norm": 2.186070765481556,
"learning_rate": 3.2362459546925567e-06,
"loss": 0.5459,
"step": 100
},
{
"epoch": 0.0356102298478472,
"grad_norm": 1.9674728126053138,
"learning_rate": 3.5598705501618126e-06,
"loss": 0.5463,
"step": 110
},
{
"epoch": 0.038847523470378766,
"grad_norm": 2.1315512928027034,
"learning_rate": 3.883495145631068e-06,
"loss": 0.544,
"step": 120
},
{
"epoch": 0.04208481709291033,
"grad_norm": 2.135135712788282,
"learning_rate": 4.207119741100324e-06,
"loss": 0.5484,
"step": 130
},
{
"epoch": 0.04532211071544189,
"grad_norm": 2.184082631640455,
"learning_rate": 4.53074433656958e-06,
"loss": 0.5123,
"step": 140
},
{
"epoch": 0.048559404337973455,
"grad_norm": 2.157079343663751,
"learning_rate": 4.854368932038836e-06,
"loss": 0.5421,
"step": 150
},
{
"epoch": 0.05179669796050502,
"grad_norm": 2.171293112107742,
"learning_rate": 5.17799352750809e-06,
"loss": 0.5589,
"step": 160
},
{
"epoch": 0.05503399158303658,
"grad_norm": 1.9239122091594658,
"learning_rate": 5.501618122977347e-06,
"loss": 0.5452,
"step": 170
},
{
"epoch": 0.058271285205568145,
"grad_norm": 1.9375062757821118,
"learning_rate": 5.825242718446602e-06,
"loss": 0.5272,
"step": 180
},
{
"epoch": 0.06150857882809971,
"grad_norm": 2.3849782638039154,
"learning_rate": 6.148867313915859e-06,
"loss": 0.5132,
"step": 190
},
{
"epoch": 0.06474587245063128,
"grad_norm": 1.8674953957006846,
"learning_rate": 6.472491909385113e-06,
"loss": 0.5372,
"step": 200
},
{
"epoch": 0.06798316607316283,
"grad_norm": 1.9339311401708401,
"learning_rate": 6.79611650485437e-06,
"loss": 0.546,
"step": 210
},
{
"epoch": 0.0712204596956944,
"grad_norm": 2.1072958412701044,
"learning_rate": 7.119741100323625e-06,
"loss": 0.5242,
"step": 220
},
{
"epoch": 0.07445775331822596,
"grad_norm": 2.377518773852171,
"learning_rate": 7.443365695792882e-06,
"loss": 0.5293,
"step": 230
},
{
"epoch": 0.07769504694075753,
"grad_norm": 2.0442232616421134,
"learning_rate": 7.766990291262136e-06,
"loss": 0.5299,
"step": 240
},
{
"epoch": 0.08093234056328909,
"grad_norm": 2.0780538675854663,
"learning_rate": 8.090614886731393e-06,
"loss": 0.5356,
"step": 250
},
{
"epoch": 0.08416963418582066,
"grad_norm": 3.8664800439938944,
"learning_rate": 8.414239482200647e-06,
"loss": 0.5208,
"step": 260
},
{
"epoch": 0.08740692780835221,
"grad_norm": 1.8534756204932774,
"learning_rate": 8.737864077669904e-06,
"loss": 0.5576,
"step": 270
},
{
"epoch": 0.09064422143088378,
"grad_norm": 3.0411642696089425,
"learning_rate": 9.06148867313916e-06,
"loss": 0.5138,
"step": 280
},
{
"epoch": 0.09388151505341534,
"grad_norm": 2.4331984099340755,
"learning_rate": 9.385113268608415e-06,
"loss": 0.5218,
"step": 290
},
{
"epoch": 0.09711880867594691,
"grad_norm": 1.753588947234162,
"learning_rate": 9.708737864077671e-06,
"loss": 0.5575,
"step": 300
},
{
"epoch": 0.10035610229847847,
"grad_norm": 2.192062741982724,
"learning_rate": 9.999996807358784e-06,
"loss": 0.5312,
"step": 310
},
{
"epoch": 0.10359339592101004,
"grad_norm": 1.5161263334784383,
"learning_rate": 9.999613695346183e-06,
"loss": 0.5467,
"step": 320
},
{
"epoch": 0.1068306895435416,
"grad_norm": 1.5940034702172312,
"learning_rate": 9.998592111150392e-06,
"loss": 0.5373,
"step": 330
},
{
"epoch": 0.11006798316607316,
"grad_norm": 1.506000200507162,
"learning_rate": 9.996932185232106e-06,
"loss": 0.5314,
"step": 340
},
{
"epoch": 0.11330527678860472,
"grad_norm": 1.7139944002155716,
"learning_rate": 9.994634129571013e-06,
"loss": 0.5373,
"step": 350
},
{
"epoch": 0.11654257041113629,
"grad_norm": 1.944009244747707,
"learning_rate": 9.991698237638708e-06,
"loss": 0.5157,
"step": 360
},
{
"epoch": 0.11977986403366786,
"grad_norm": 1.3403254941353564,
"learning_rate": 9.988124884361222e-06,
"loss": 0.5472,
"step": 370
},
{
"epoch": 0.12301715765619942,
"grad_norm": 2.1963515894000794,
"learning_rate": 9.983914526071148e-06,
"loss": 0.542,
"step": 380
},
{
"epoch": 0.12625445127873097,
"grad_norm": 2.751448769848235,
"learning_rate": 9.979067700449358e-06,
"loss": 0.5386,
"step": 390
},
{
"epoch": 0.12949174490126256,
"grad_norm": 2.1722392044422545,
"learning_rate": 9.973585026456338e-06,
"loss": 0.5281,
"step": 400
},
{
"epoch": 0.1327290385237941,
"grad_norm": 1.6770114606853526,
"learning_rate": 9.967467204253153e-06,
"loss": 0.5589,
"step": 410
},
{
"epoch": 0.13596633214632567,
"grad_norm": 1.9183336831409301,
"learning_rate": 9.960715015112022e-06,
"loss": 0.5072,
"step": 420
},
{
"epoch": 0.13920362576885723,
"grad_norm": 2.384394686356415,
"learning_rate": 9.953329321316556e-06,
"loss": 0.5373,
"step": 430
},
{
"epoch": 0.1424409193913888,
"grad_norm": 2.537628989863915,
"learning_rate": 9.945311066051632e-06,
"loss": 0.5283,
"step": 440
},
{
"epoch": 0.14567821301392037,
"grad_norm": 1.4128529976835362,
"learning_rate": 9.936661273282957e-06,
"loss": 0.5444,
"step": 450
},
{
"epoch": 0.14891550663645192,
"grad_norm": 1.269855727764354,
"learning_rate": 9.927381047626283e-06,
"loss": 0.5115,
"step": 460
},
{
"epoch": 0.15215280025898348,
"grad_norm": 1.403995189704697,
"learning_rate": 9.917471574206366e-06,
"loss": 0.5058,
"step": 470
},
{
"epoch": 0.15539009388151506,
"grad_norm": 1.984703757619364,
"learning_rate": 9.9069341185056e-06,
"loss": 0.5325,
"step": 480
},
{
"epoch": 0.15862738750404662,
"grad_norm": 5.350351817604479,
"learning_rate": 9.895770026202424e-06,
"loss": 0.528,
"step": 490
},
{
"epoch": 0.16186468112657817,
"grad_norm": 1.3441213050436032,
"learning_rate": 9.883980722999467e-06,
"loss": 0.5492,
"step": 500
},
{
"epoch": 0.16510197474910973,
"grad_norm": 1.6583123951777,
"learning_rate": 9.871567714441481e-06,
"loss": 0.5166,
"step": 510
},
{
"epoch": 0.16833926837164132,
"grad_norm": 1.819057899971267,
"learning_rate": 9.858532585723071e-06,
"loss": 0.5382,
"step": 520
},
{
"epoch": 0.17157656199417287,
"grad_norm": 1.401898794325159,
"learning_rate": 9.84487700148627e-06,
"loss": 0.5266,
"step": 530
},
{
"epoch": 0.17481385561670443,
"grad_norm": 1.363660786872396,
"learning_rate": 9.830602705607946e-06,
"loss": 0.5482,
"step": 540
},
{
"epoch": 0.178051149239236,
"grad_norm": 1.347346394618236,
"learning_rate": 9.81571152097711e-06,
"loss": 0.509,
"step": 550
},
{
"epoch": 0.18128844286176757,
"grad_norm": 1.153935203111527,
"learning_rate": 9.800205349262115e-06,
"loss": 0.5229,
"step": 560
},
{
"epoch": 0.18452573648429912,
"grad_norm": 1.3687757149985038,
"learning_rate": 9.784086170667817e-06,
"loss": 0.5266,
"step": 570
},
{
"epoch": 0.18776303010683068,
"grad_norm": 1.4692106944797128,
"learning_rate": 9.767356043682687e-06,
"loss": 0.5131,
"step": 580
},
{
"epoch": 0.19100032372936226,
"grad_norm": 2.090443291629213,
"learning_rate": 9.750017104815932e-06,
"loss": 0.5237,
"step": 590
},
{
"epoch": 0.19423761735189382,
"grad_norm": 1.6775830237228662,
"learning_rate": 9.732071568324662e-06,
"loss": 0.5137,
"step": 600
},
{
"epoch": 0.19747491097442538,
"grad_norm": 1.3780856622111588,
"learning_rate": 9.713521725931107e-06,
"loss": 0.5474,
"step": 610
},
{
"epoch": 0.20071220459695693,
"grad_norm": 1.2939695420785708,
"learning_rate": 9.694369946529964e-06,
"loss": 0.5355,
"step": 620
},
{
"epoch": 0.20394949821948852,
"grad_norm": 1.4450768355958643,
"learning_rate": 9.674618675885878e-06,
"loss": 0.5514,
"step": 630
},
{
"epoch": 0.20718679184202007,
"grad_norm": 1.327992017483562,
"learning_rate": 9.654270436321103e-06,
"loss": 0.5329,
"step": 640
},
{
"epoch": 0.21042408546455163,
"grad_norm": 1.7989475405276711,
"learning_rate": 9.633327826393392e-06,
"loss": 0.5157,
"step": 650
},
{
"epoch": 0.2136613790870832,
"grad_norm": 2.05329586810244,
"learning_rate": 9.611793520564155e-06,
"loss": 0.5133,
"step": 660
},
{
"epoch": 0.21689867270961477,
"grad_norm": 1.5423450609784548,
"learning_rate": 9.589670268856913e-06,
"loss": 0.542,
"step": 670
},
{
"epoch": 0.22013596633214633,
"grad_norm": 1.4428683673038962,
"learning_rate": 9.566960896506108e-06,
"loss": 0.4971,
"step": 680
},
{
"epoch": 0.22337325995467788,
"grad_norm": 1.364527266808998,
"learning_rate": 9.543668303596313e-06,
"loss": 0.5173,
"step": 690
},
{
"epoch": 0.22661055357720944,
"grad_norm": 1.5271705470696573,
"learning_rate": 9.519795464691873e-06,
"loss": 0.517,
"step": 700
},
{
"epoch": 0.22984784719974102,
"grad_norm": 1.5089398058797954,
"learning_rate": 9.495345428457048e-06,
"loss": 0.5236,
"step": 710
},
{
"epoch": 0.23308514082227258,
"grad_norm": 2.1076475628907057,
"learning_rate": 9.470321317266679e-06,
"loss": 0.5376,
"step": 720
},
{
"epoch": 0.23632243444480414,
"grad_norm": 2.2100737808822943,
"learning_rate": 9.444726326807445e-06,
"loss": 0.5191,
"step": 730
},
{
"epoch": 0.23955972806733572,
"grad_norm": 1.5902060955759352,
"learning_rate": 9.418563725669772e-06,
"loss": 0.505,
"step": 740
},
{
"epoch": 0.24279702168986728,
"grad_norm": 1.395823374280939,
"learning_rate": 9.391836854930404e-06,
"loss": 0.5249,
"step": 750
},
{
"epoch": 0.24603431531239883,
"grad_norm": 1.409554350409635,
"learning_rate": 9.364549127725749e-06,
"loss": 0.4998,
"step": 760
},
{
"epoch": 0.2492716089349304,
"grad_norm": 1.2811036840491727,
"learning_rate": 9.33670402881599e-06,
"loss": 0.5132,
"step": 770
},
{
"epoch": 0.25250890255746195,
"grad_norm": 2.683941547894333,
"learning_rate": 9.308305114140086e-06,
"loss": 0.513,
"step": 780
},
{
"epoch": 0.2557461961799935,
"grad_norm": 1.4991894757393012,
"learning_rate": 9.279356010361647e-06,
"loss": 0.5149,
"step": 790
},
{
"epoch": 0.2589834898025251,
"grad_norm": 1.2507685206168797,
"learning_rate": 9.249860414405794e-06,
"loss": 0.4969,
"step": 800
},
{
"epoch": 0.26222078342505667,
"grad_norm": 1.35022647512136,
"learning_rate": 9.219822092987061e-06,
"loss": 0.4938,
"step": 810
},
{
"epoch": 0.2654580770475882,
"grad_norm": 1.3005194873163757,
"learning_rate": 9.18924488212835e-06,
"loss": 0.4941,
"step": 820
},
{
"epoch": 0.2686953706701198,
"grad_norm": 1.2581885113440374,
"learning_rate": 9.158132686671071e-06,
"loss": 0.4959,
"step": 830
},
{
"epoch": 0.27193266429265134,
"grad_norm": 1.9420918227035224,
"learning_rate": 9.126489479776461e-06,
"loss": 0.5171,
"step": 840
},
{
"epoch": 0.2751699579151829,
"grad_norm": 1.1789473175722394,
"learning_rate": 9.09431930241821e-06,
"loss": 0.5132,
"step": 850
},
{
"epoch": 0.27840725153771445,
"grad_norm": 1.6644531651075887,
"learning_rate": 9.061626262866403e-06,
"loss": 0.521,
"step": 860
},
{
"epoch": 0.281644545160246,
"grad_norm": 1.3772893833242836,
"learning_rate": 9.028414536162873e-06,
"loss": 0.501,
"step": 870
},
{
"epoch": 0.2848818387827776,
"grad_norm": 1.4605098649279153,
"learning_rate": 8.994688363588035e-06,
"loss": 0.5014,
"step": 880
},
{
"epoch": 0.2881191324053092,
"grad_norm": 1.2543294792487787,
"learning_rate": 8.960452052119259e-06,
"loss": 0.4897,
"step": 890
},
{
"epoch": 0.29135642602784073,
"grad_norm": 1.4701554087797506,
"learning_rate": 8.925709973880844e-06,
"loss": 0.4974,
"step": 900
},
{
"epoch": 0.2945937196503723,
"grad_norm": 1.2834425518492831,
"learning_rate": 8.890466565585684e-06,
"loss": 0.5074,
"step": 910
},
{
"epoch": 0.29783101327290384,
"grad_norm": 1.1087811452110212,
"learning_rate": 8.854726327968675e-06,
"loss": 0.5086,
"step": 920
},
{
"epoch": 0.3010683068954354,
"grad_norm": 1.1192002738248805,
"learning_rate": 8.818493825211962e-06,
"loss": 0.5145,
"step": 930
},
{
"epoch": 0.30430560051796696,
"grad_norm": 1.2164084119048657,
"learning_rate": 8.781773684362058e-06,
"loss": 0.5154,
"step": 940
},
{
"epoch": 0.30754289414049857,
"grad_norm": 1.7852611552660387,
"learning_rate": 8.744570594738965e-06,
"loss": 0.5147,
"step": 950
},
{
"epoch": 0.3107801877630301,
"grad_norm": 2.0895432300719112,
"learning_rate": 8.706889307337322e-06,
"loss": 0.5101,
"step": 960
},
{
"epoch": 0.3140174813855617,
"grad_norm": 1.305634982599103,
"learning_rate": 8.668734634219677e-06,
"loss": 0.5204,
"step": 970
},
{
"epoch": 0.31725477500809324,
"grad_norm": 1.5110057953446319,
"learning_rate": 8.630111447901974e-06,
"loss": 0.4925,
"step": 980
},
{
"epoch": 0.3204920686306248,
"grad_norm": 1.2113963088571518,
"learning_rate": 8.59102468073131e-06,
"loss": 0.5045,
"step": 990
},
{
"epoch": 0.32372936225315635,
"grad_norm": 1.1406356602000782,
"learning_rate": 8.551479324256052e-06,
"loss": 0.4903,
"step": 1000
},
{
"epoch": 0.3269666558756879,
"grad_norm": 1.2091711143518766,
"learning_rate": 8.51148042858839e-06,
"loss": 0.5018,
"step": 1010
},
{
"epoch": 0.33020394949821946,
"grad_norm": 1.2863060350140245,
"learning_rate": 8.471033101759426e-06,
"loss": 0.4991,
"step": 1020
},
{
"epoch": 0.3334412431207511,
"grad_norm": 1.6631825587574212,
"learning_rate": 8.430142509066848e-06,
"loss": 0.5096,
"step": 1030
},
{
"epoch": 0.33667853674328263,
"grad_norm": 1.2839071122317827,
"learning_rate": 8.388813872415301e-06,
"loss": 0.5036,
"step": 1040
},
{
"epoch": 0.3399158303658142,
"grad_norm": 1.4515170974118698,
"learning_rate": 8.347052469649534e-06,
"loss": 0.4917,
"step": 1050
},
{
"epoch": 0.34315312398834574,
"grad_norm": 1.1940858318622096,
"learning_rate": 8.304863633880385e-06,
"loss": 0.4927,
"step": 1060
},
{
"epoch": 0.3463904176108773,
"grad_norm": 2.041119119310852,
"learning_rate": 8.262252752803728e-06,
"loss": 0.5021,
"step": 1070
},
{
"epoch": 0.34962771123340886,
"grad_norm": 1.5598325467101413,
"learning_rate": 8.21922526801244e-06,
"loss": 0.4949,
"step": 1080
},
{
"epoch": 0.3528650048559404,
"grad_norm": 1.3210055095635682,
"learning_rate": 8.175786674301486e-06,
"loss": 0.4916,
"step": 1090
},
{
"epoch": 0.356102298478472,
"grad_norm": 1.4444985236942383,
"learning_rate": 8.131942518966205e-06,
"loss": 0.515,
"step": 1100
},
{
"epoch": 0.3593395921010036,
"grad_norm": 2.096926087895547,
"learning_rate": 8.087698401093904e-06,
"loss": 0.5046,
"step": 1110
},
{
"epoch": 0.36257688572353514,
"grad_norm": 1.8779799622755553,
"learning_rate": 8.043059970848827e-06,
"loss": 0.4925,
"step": 1120
},
{
"epoch": 0.3658141793460667,
"grad_norm": 1.1855292279659888,
"learning_rate": 7.998032928750603e-06,
"loss": 0.4915,
"step": 1130
},
{
"epoch": 0.36905147296859825,
"grad_norm": 1.2561037766110636,
"learning_rate": 7.95262302494627e-06,
"loss": 0.4921,
"step": 1140
},
{
"epoch": 0.3722887665911298,
"grad_norm": 1.2308136471771238,
"learning_rate": 7.906836058475947e-06,
"loss": 0.4824,
"step": 1150
},
{
"epoch": 0.37552606021366136,
"grad_norm": 1.4251854056491962,
"learning_rate": 7.860677876532284e-06,
"loss": 0.4998,
"step": 1160
},
{
"epoch": 0.3787633538361929,
"grad_norm": 2.348386527172616,
"learning_rate": 7.814154373713746e-06,
"loss": 0.4908,
"step": 1170
},
{
"epoch": 0.38200064745872453,
"grad_norm": 1.865144066346986,
"learning_rate": 7.76727149127184e-06,
"loss": 0.4846,
"step": 1180
},
{
"epoch": 0.3852379410812561,
"grad_norm": 1.4007403629971613,
"learning_rate": 7.720035216352398e-06,
"loss": 0.4874,
"step": 1190
},
{
"epoch": 0.38847523470378764,
"grad_norm": 1.1445479147677562,
"learning_rate": 7.672451581230997e-06,
"loss": 0.4957,
"step": 1200
},
{
"epoch": 0.3917125283263192,
"grad_norm": 1.4508487552186167,
"learning_rate": 7.624526662542602e-06,
"loss": 0.4733,
"step": 1210
},
{
"epoch": 0.39494982194885075,
"grad_norm": 1.3478086965831404,
"learning_rate": 7.57626658050556e-06,
"loss": 0.47,
"step": 1220
},
{
"epoch": 0.3981871155713823,
"grad_norm": 1.9590592831165203,
"learning_rate": 7.527677498140019e-06,
"loss": 0.4722,
"step": 1230
},
{
"epoch": 0.40142440919391387,
"grad_norm": 1.2643798071505419,
"learning_rate": 7.4787656204808865e-06,
"loss": 0.4917,
"step": 1240
},
{
"epoch": 0.4046617028164455,
"grad_norm": 1.1231557245992803,
"learning_rate": 7.429537193785417e-06,
"loss": 0.4672,
"step": 1250
},
{
"epoch": 0.40789899643897704,
"grad_norm": 1.5059792239616556,
"learning_rate": 7.379998504735534e-06,
"loss": 0.485,
"step": 1260
},
{
"epoch": 0.4111362900615086,
"grad_norm": 1.3294673643320833,
"learning_rate": 7.33015587963501e-06,
"loss": 0.4683,
"step": 1270
},
{
"epoch": 0.41437358368404015,
"grad_norm": 1.2458719277331098,
"learning_rate": 7.280015683601549e-06,
"loss": 0.4947,
"step": 1280
},
{
"epoch": 0.4176108773065717,
"grad_norm": 1.5392092717085768,
"learning_rate": 7.2295843197539506e-06,
"loss": 0.4872,
"step": 1290
},
{
"epoch": 0.42084817092910326,
"grad_norm": 1.312225721855963,
"learning_rate": 7.178868228394389e-06,
"loss": 0.465,
"step": 1300
},
{
"epoch": 0.4240854645516348,
"grad_norm": 1.2479239867456877,
"learning_rate": 7.127873886185976e-06,
"loss": 0.4854,
"step": 1310
},
{
"epoch": 0.4273227581741664,
"grad_norm": 2.0594970342805072,
"learning_rate": 7.076607805325648e-06,
"loss": 0.4957,
"step": 1320
},
{
"epoch": 0.430560051796698,
"grad_norm": 1.3369762999688763,
"learning_rate": 7.025076532712538e-06,
"loss": 0.4926,
"step": 1330
},
{
"epoch": 0.43379734541922954,
"grad_norm": 1.253628281059844,
"learning_rate": 6.97328664911191e-06,
"loss": 0.4827,
"step": 1340
},
{
"epoch": 0.4370346390417611,
"grad_norm": 1.5718283098388837,
"learning_rate": 6.921244768314762e-06,
"loss": 0.4688,
"step": 1350
},
{
"epoch": 0.44027193266429265,
"grad_norm": 1.221169832657551,
"learning_rate": 6.868957536293214e-06,
"loss": 0.4756,
"step": 1360
},
{
"epoch": 0.4435092262868242,
"grad_norm": 1.7633030570206691,
"learning_rate": 6.816431630351801e-06,
"loss": 0.4782,
"step": 1370
},
{
"epoch": 0.44674651990935577,
"grad_norm": 1.1388033599578846,
"learning_rate": 6.763673758274738e-06,
"loss": 0.4547,
"step": 1380
},
{
"epoch": 0.4499838135318873,
"grad_norm": 1.8583763069122066,
"learning_rate": 6.7106906574693175e-06,
"loss": 0.4677,
"step": 1390
},
{
"epoch": 0.4532211071544189,
"grad_norm": 2.6222452513802423,
"learning_rate": 6.657489094105511e-06,
"loss": 0.4663,
"step": 1400
},
{
"epoch": 0.4564584007769505,
"grad_norm": 1.2420631202470083,
"learning_rate": 6.604075862251892e-06,
"loss": 0.4756,
"step": 1410
},
{
"epoch": 0.45969569439948205,
"grad_norm": 1.3711399168409348,
"learning_rate": 6.55045778300802e-06,
"loss": 0.4792,
"step": 1420
},
{
"epoch": 0.4629329880220136,
"grad_norm": 1.2747272442800757,
"learning_rate": 6.496641703633339e-06,
"loss": 0.4945,
"step": 1430
},
{
"epoch": 0.46617028164454516,
"grad_norm": 1.9542903017844309,
"learning_rate": 6.442634496672771e-06,
"loss": 0.4782,
"step": 1440
},
{
"epoch": 0.4694075752670767,
"grad_norm": 1.2575700419582634,
"learning_rate": 6.388443059079046e-06,
"loss": 0.4946,
"step": 1450
},
{
"epoch": 0.47264486888960827,
"grad_norm": 1.7374379373347664,
"learning_rate": 6.334074311331938e-06,
"loss": 0.4727,
"step": 1460
},
{
"epoch": 0.47588216251213983,
"grad_norm": 1.4848465822905959,
"learning_rate": 6.279535196554497e-06,
"loss": 0.4771,
"step": 1470
},
{
"epoch": 0.47911945613467144,
"grad_norm": 1.3014806293507155,
"learning_rate": 6.224832679626371e-06,
"loss": 0.4558,
"step": 1480
},
{
"epoch": 0.482356749757203,
"grad_norm": 1.1234018842757745,
"learning_rate": 6.169973746294367e-06,
"loss": 0.4807,
"step": 1490
},
{
"epoch": 0.48559404337973455,
"grad_norm": 1.2339190353189107,
"learning_rate": 6.114965402280342e-06,
"loss": 0.4764,
"step": 1500
},
{
"epoch": 0.4888313370022661,
"grad_norm": 1.1985465287370143,
"learning_rate": 6.059814672386535e-06,
"loss": 0.4653,
"step": 1510
},
{
"epoch": 0.49206863062479766,
"grad_norm": 1.0962666033332409,
"learning_rate": 6.0045285995984795e-06,
"loss": 0.4758,
"step": 1520
},
{
"epoch": 0.4953059242473292,
"grad_norm": 1.4061424214222114,
"learning_rate": 5.9491142441855755e-06,
"loss": 0.4445,
"step": 1530
},
{
"epoch": 0.4985432178698608,
"grad_norm": 1.698961611178768,
"learning_rate": 5.8935786827994705e-06,
"loss": 0.4844,
"step": 1540
},
{
"epoch": 0.5017805114923923,
"grad_norm": 1.2773530358959824,
"learning_rate": 5.83792900757033e-06,
"loss": 0.4711,
"step": 1550
},
{
"epoch": 0.5050178051149239,
"grad_norm": 1.097531946277775,
"learning_rate": 5.782172325201155e-06,
"loss": 0.4587,
"step": 1560
},
{
"epoch": 0.5082550987374554,
"grad_norm": 1.3771950799998722,
"learning_rate": 5.726315756060214e-06,
"loss": 0.4833,
"step": 1570
},
{
"epoch": 0.511492392359987,
"grad_norm": 1.2640070993851824,
"learning_rate": 5.67036643327175e-06,
"loss": 0.4698,
"step": 1580
},
{
"epoch": 0.5147296859825187,
"grad_norm": 1.4765324665224344,
"learning_rate": 5.61433150180504e-06,
"loss": 0.4792,
"step": 1590
},
{
"epoch": 0.5179669796050502,
"grad_norm": 1.270051397725334,
"learning_rate": 5.558218117561966e-06,
"loss": 0.4691,
"step": 1600
},
{
"epoch": 0.5212042732275818,
"grad_norm": 1.487262847973142,
"learning_rate": 5.502033446463157e-06,
"loss": 0.4549,
"step": 1610
},
{
"epoch": 0.5244415668501133,
"grad_norm": 1.0818684589754615,
"learning_rate": 5.445784663532892e-06,
"loss": 0.4444,
"step": 1620
},
{
"epoch": 0.5276788604726449,
"grad_norm": 1.096657186329996,
"learning_rate": 5.389478951982795e-06,
"loss": 0.4403,
"step": 1630
},
{
"epoch": 0.5309161540951765,
"grad_norm": 1.2316375993715067,
"learning_rate": 5.3331235022945275e-06,
"loss": 0.4633,
"step": 1640
},
{
"epoch": 0.534153447717708,
"grad_norm": 1.336530587237014,
"learning_rate": 5.276725511301522e-06,
"loss": 0.4612,
"step": 1650
},
{
"epoch": 0.5373907413402396,
"grad_norm": 1.2767954358198574,
"learning_rate": 5.2202921812699224e-06,
"loss": 0.4797,
"step": 1660
},
{
"epoch": 0.5406280349627711,
"grad_norm": 1.2515640963616828,
"learning_rate": 5.163830718978814e-06,
"loss": 0.4426,
"step": 1670
},
{
"epoch": 0.5438653285853027,
"grad_norm": 1.4888354293436683,
"learning_rate": 5.107348334799897e-06,
"loss": 0.4429,
"step": 1680
},
{
"epoch": 0.5471026222078342,
"grad_norm": 1.1397480663693997,
"learning_rate": 5.050852241776687e-06,
"loss": 0.4806,
"step": 1690
},
{
"epoch": 0.5503399158303658,
"grad_norm": 1.383755335534758,
"learning_rate": 4.994349654703374e-06,
"loss": 0.4677,
"step": 1700
},
{
"epoch": 0.5535772094528973,
"grad_norm": 1.0399853616381194,
"learning_rate": 4.93784778920347e-06,
"loss": 0.4544,
"step": 1710
},
{
"epoch": 0.5568145030754289,
"grad_norm": 1.2261677975570098,
"learning_rate": 4.881353860808337e-06,
"loss": 0.4737,
"step": 1720
},
{
"epoch": 0.5600517966979605,
"grad_norm": 1.3450655153335658,
"learning_rate": 4.824875084035736e-06,
"loss": 0.45,
"step": 1730
},
{
"epoch": 0.563289090320492,
"grad_norm": 1.7303162242759071,
"learning_rate": 4.768418671468502e-06,
"loss": 0.4487,
"step": 1740
},
{
"epoch": 0.5665263839430237,
"grad_norm": 1.2092323849987936,
"learning_rate": 4.711991832833458e-06,
"loss": 0.4496,
"step": 1750
},
{
"epoch": 0.5697636775655552,
"grad_norm": 1.1729698153300774,
"learning_rate": 4.655601774080716e-06,
"loss": 0.4412,
"step": 1760
},
{
"epoch": 0.5730009711880868,
"grad_norm": 1.8430576640984782,
"learning_rate": 4.599255696463434e-06,
"loss": 0.4794,
"step": 1770
},
{
"epoch": 0.5762382648106184,
"grad_norm": 1.2327240635393302,
"learning_rate": 4.542960795618194e-06,
"loss": 0.447,
"step": 1780
},
{
"epoch": 0.5794755584331499,
"grad_norm": 1.5385068868409182,
"learning_rate": 4.486724260646083e-06,
"loss": 0.4335,
"step": 1790
},
{
"epoch": 0.5827128520556815,
"grad_norm": 1.5225619416563714,
"learning_rate": 4.430553273194609e-06,
"loss": 0.4441,
"step": 1800
},
{
"epoch": 0.585950145678213,
"grad_norm": 1.4895774823409547,
"learning_rate": 4.3744550065405864e-06,
"loss": 0.4841,
"step": 1810
},
{
"epoch": 0.5891874393007446,
"grad_norm": 1.5703613121834772,
"learning_rate": 4.318436624674067e-06,
"loss": 0.4463,
"step": 1820
},
{
"epoch": 0.5924247329232761,
"grad_norm": 1.1534441333167642,
"learning_rate": 4.262505281383476e-06,
"loss": 0.4506,
"step": 1830
},
{
"epoch": 0.5956620265458077,
"grad_norm": 1.2410143224549055,
"learning_rate": 4.206668119342033e-06,
"loss": 0.4382,
"step": 1840
},
{
"epoch": 0.5988993201683392,
"grad_norm": 1.944311167675598,
"learning_rate": 4.15093226919561e-06,
"loss": 0.4344,
"step": 1850
},
{
"epoch": 0.6021366137908708,
"grad_norm": 1.0718668211688862,
"learning_rate": 4.0953048486521105e-06,
"loss": 0.4537,
"step": 1860
},
{
"epoch": 0.6053739074134024,
"grad_norm": 1.3196234174557926,
"learning_rate": 4.039792961572513e-06,
"loss": 0.4349,
"step": 1870
},
{
"epoch": 0.6086112010359339,
"grad_norm": 1.4850011207756155,
"learning_rate": 3.984403697063679e-06,
"loss": 0.4465,
"step": 1880
},
{
"epoch": 0.6118484946584655,
"grad_norm": 1.1052614698421046,
"learning_rate": 3.929144128573035e-06,
"loss": 0.4404,
"step": 1890
},
{
"epoch": 0.6150857882809971,
"grad_norm": 1.2374254994171028,
"learning_rate": 3.87402131298527e-06,
"loss": 0.4342,
"step": 1900
},
{
"epoch": 0.6183230819035287,
"grad_norm": 1.1059894750691,
"learning_rate": 3.819042289721139e-06,
"loss": 0.4389,
"step": 1910
},
{
"epoch": 0.6215603755260602,
"grad_norm": 1.1534915703249282,
"learning_rate": 3.764214079838496e-06,
"loss": 0.4475,
"step": 1920
},
{
"epoch": 0.6247976691485918,
"grad_norm": 1.2567624750134727,
"learning_rate": 3.7095436851356813e-06,
"loss": 0.4251,
"step": 1930
},
{
"epoch": 0.6280349627711234,
"grad_norm": 1.3966399419605446,
"learning_rate": 3.655038087257356e-06,
"loss": 0.4152,
"step": 1940
},
{
"epoch": 0.6312722563936549,
"grad_norm": 2.325304921982081,
"learning_rate": 3.6007042468029174e-06,
"loss": 0.4353,
"step": 1950
},
{
"epoch": 0.6345095500161865,
"grad_norm": 1.8152216504321872,
"learning_rate": 3.5465491024375983e-06,
"loss": 0.4304,
"step": 1960
},
{
"epoch": 0.637746843638718,
"grad_norm": 1.4667613587733686,
"learning_rate": 3.4925795700063735e-06,
"loss": 0.4482,
"step": 1970
},
{
"epoch": 0.6409841372612496,
"grad_norm": 1.5490376161227752,
"learning_rate": 3.438802541650779e-06,
"loss": 0.4544,
"step": 1980
},
{
"epoch": 0.6442214308837811,
"grad_norm": 1.073502513354859,
"learning_rate": 3.3852248849287473e-06,
"loss": 0.4502,
"step": 1990
},
{
"epoch": 0.6474587245063127,
"grad_norm": 1.4150494245382768,
"learning_rate": 3.3318534419375962e-06,
"loss": 0.4553,
"step": 2000
},
{
"epoch": 0.6506960181288443,
"grad_norm": 1.1873711967885243,
"learning_rate": 3.278695028440265e-06,
"loss": 0.4719,
"step": 2010
},
{
"epoch": 0.6539333117513758,
"grad_norm": 1.0277166813799339,
"learning_rate": 3.2257564329949033e-06,
"loss": 0.446,
"step": 2020
},
{
"epoch": 0.6571706053739074,
"grad_norm": 1.6303778205817352,
"learning_rate": 3.1730444160879603e-06,
"loss": 0.4514,
"step": 2030
},
{
"epoch": 0.6604078989964389,
"grad_norm": 1.957827164457282,
"learning_rate": 3.120565709270822e-06,
"loss": 0.4364,
"step": 2040
},
{
"epoch": 0.6636451926189706,
"grad_norm": 1.1300424316339148,
"learning_rate": 3.0683270143001744e-06,
"loss": 0.4573,
"step": 2050
},
{
"epoch": 0.6668824862415021,
"grad_norm": 1.3350517057914033,
"learning_rate": 3.01633500228216e-06,
"loss": 0.4395,
"step": 2060
},
{
"epoch": 0.6701197798640337,
"grad_norm": 2.4966191319455073,
"learning_rate": 2.9645963128204426e-06,
"loss": 0.4562,
"step": 2070
},
{
"epoch": 0.6733570734865653,
"grad_norm": 1.162281286522959,
"learning_rate": 2.91311755316831e-06,
"loss": 0.4415,
"step": 2080
},
{
"epoch": 0.6765943671090968,
"grad_norm": 1.2771987132753222,
"learning_rate": 2.8619052973848936e-06,
"loss": 0.4513,
"step": 2090
},
{
"epoch": 0.6798316607316284,
"grad_norm": 1.558113731713831,
"learning_rate": 2.8109660854956324e-06,
"loss": 0.4409,
"step": 2100
},
{
"epoch": 0.6830689543541599,
"grad_norm": 2.1857614420408398,
"learning_rate": 2.760306422657083e-06,
"loss": 0.428,
"step": 2110
},
{
"epoch": 0.6863062479766915,
"grad_norm": 1.4005789104812565,
"learning_rate": 2.7099327783261905e-06,
"loss": 0.4171,
"step": 2120
},
{
"epoch": 0.689543541599223,
"grad_norm": 1.3198113477885867,
"learning_rate": 2.6598515854341046e-06,
"loss": 0.4222,
"step": 2130
},
{
"epoch": 0.6927808352217546,
"grad_norm": 1.552097021806336,
"learning_rate": 2.610069239564663e-06,
"loss": 0.4374,
"step": 2140
},
{
"epoch": 0.6960181288442862,
"grad_norm": 1.5875723544475318,
"learning_rate": 2.5605920981376607e-06,
"loss": 0.4329,
"step": 2150
},
{
"epoch": 0.6992554224668177,
"grad_norm": 1.7408409217582645,
"learning_rate": 2.5114264795969658e-06,
"loss": 0.4255,
"step": 2160
},
{
"epoch": 0.7024927160893493,
"grad_norm": 1.4137585560938575,
"learning_rate": 2.4625786626036315e-06,
"loss": 0.4553,
"step": 2170
},
{
"epoch": 0.7057300097118808,
"grad_norm": 1.2013119499594673,
"learning_rate": 2.4140548852340924e-06,
"loss": 0.4272,
"step": 2180
},
{
"epoch": 0.7089673033344124,
"grad_norm": 1.554021107843274,
"learning_rate": 2.3658613441835275e-06,
"loss": 0.4148,
"step": 2190
},
{
"epoch": 0.712204596956944,
"grad_norm": 1.0745275989784961,
"learning_rate": 2.318004193974513e-06,
"loss": 0.4489,
"step": 2200
},
{
"epoch": 0.7154418905794756,
"grad_norm": 1.224435407580586,
"learning_rate": 2.2704895461710673e-06,
"loss": 0.4335,
"step": 2210
},
{
"epoch": 0.7186791842020072,
"grad_norm": 1.4400592081825279,
"learning_rate": 2.2233234685981814e-06,
"loss": 0.438,
"step": 2220
},
{
"epoch": 0.7219164778245387,
"grad_norm": 1.1095915569574144,
"learning_rate": 2.1765119845669273e-06,
"loss": 0.4193,
"step": 2230
},
{
"epoch": 0.7251537714470703,
"grad_norm": 1.1009788319616933,
"learning_rate": 2.130061072105252e-06,
"loss": 0.4167,
"step": 2240
},
{
"epoch": 0.7283910650696018,
"grad_norm": 1.0172586653465259,
"learning_rate": 2.083976663194567e-06,
"loss": 0.4592,
"step": 2250
},
{
"epoch": 0.7316283586921334,
"grad_norm": 1.1509992611813928,
"learning_rate": 2.0382646430121962e-06,
"loss": 0.4288,
"step": 2260
},
{
"epoch": 0.7348656523146649,
"grad_norm": 1.253268879076886,
"learning_rate": 1.992930849179827e-06,
"loss": 0.4255,
"step": 2270
},
{
"epoch": 0.7381029459371965,
"grad_norm": 1.4917991891879472,
"learning_rate": 1.9479810710180124e-06,
"loss": 0.4259,
"step": 2280
},
{
"epoch": 0.741340239559728,
"grad_norm": 1.3293852268645232,
"learning_rate": 1.9034210488068505e-06,
"loss": 0.4258,
"step": 2290
},
{
"epoch": 0.7445775331822596,
"grad_norm": 1.955770547143747,
"learning_rate": 1.8592564730529268e-06,
"loss": 0.4331,
"step": 2300
},
{
"epoch": 0.7478148268047912,
"grad_norm": 4.285906632966228,
"learning_rate": 1.815492983762614e-06,
"loss": 0.41,
"step": 2310
},
{
"epoch": 0.7510521204273227,
"grad_norm": 1.3399086538893303,
"learning_rate": 1.7721361697218192e-06,
"loss": 0.431,
"step": 2320
},
{
"epoch": 0.7542894140498543,
"grad_norm": 1.7551709481473476,
"learning_rate": 1.7291915677822668e-06,
"loss": 0.4345,
"step": 2330
},
{
"epoch": 0.7575267076723858,
"grad_norm": 1.2402942552904213,
"learning_rate": 1.6866646621544213e-06,
"loss": 0.4412,
"step": 2340
},
{
"epoch": 0.7607640012949175,
"grad_norm": 1.454337649781767,
"learning_rate": 1.6445608837071363e-06,
"loss": 0.4372,
"step": 2350
},
{
"epoch": 0.7640012949174491,
"grad_norm": 1.3605659792377778,
"learning_rate": 1.6028856092740975e-06,
"loss": 0.4275,
"step": 2360
},
{
"epoch": 0.7672385885399806,
"grad_norm": 1.6891447458348832,
"learning_rate": 1.5616441609671868e-06,
"loss": 0.4345,
"step": 2370
},
{
"epoch": 0.7704758821625122,
"grad_norm": 1.36054087661354,
"learning_rate": 1.5208418054968255e-06,
"loss": 0.4303,
"step": 2380
},
{
"epoch": 0.7737131757850437,
"grad_norm": 2.1532676084820004,
"learning_rate": 1.4804837534993855e-06,
"loss": 0.416,
"step": 2390
},
{
"epoch": 0.7769504694075753,
"grad_norm": 1.0925060282525623,
"learning_rate": 1.4405751588717743e-06,
"loss": 0.4132,
"step": 2400
},
{
"epoch": 0.7801877630301068,
"grad_norm": 1.8326582821797257,
"learning_rate": 1.4011211181132612e-06,
"loss": 0.4367,
"step": 2410
},
{
"epoch": 0.7834250566526384,
"grad_norm": 1.3611403654592065,
"learning_rate": 1.3621266696746305e-06,
"loss": 0.4268,
"step": 2420
},
{
"epoch": 0.78666235027517,
"grad_norm": 1.5691241422304574,
"learning_rate": 1.3235967933147482e-06,
"loss": 0.4155,
"step": 2430
},
{
"epoch": 0.7898996438977015,
"grad_norm": 1.1610560668530532,
"learning_rate": 1.2855364094646239e-06,
"loss": 0.4081,
"step": 2440
},
{
"epoch": 0.7931369375202331,
"grad_norm": 1.3174765562689756,
"learning_rate": 1.24795037859906e-06,
"loss": 0.4276,
"step": 2450
},
{
"epoch": 0.7963742311427646,
"grad_norm": 1.5724114202346193,
"learning_rate": 1.2108435006159352e-06,
"loss": 0.4266,
"step": 2460
},
{
"epoch": 0.7996115247652962,
"grad_norm": 2.5186489719168477,
"learning_rate": 1.1742205142232472e-06,
"loss": 0.4272,
"step": 2470
},
{
"epoch": 0.8028488183878277,
"grad_norm": 1.2269224379818156,
"learning_rate": 1.1380860963339551e-06,
"loss": 0.4125,
"step": 2480
},
{
"epoch": 0.8060861120103593,
"grad_norm": 1.6275541333455594,
"learning_rate": 1.1024448614687154e-06,
"loss": 0.432,
"step": 2490
},
{
"epoch": 0.809323405632891,
"grad_norm": 1.6966584659345751,
"learning_rate": 1.0673013611665912e-06,
"loss": 0.4408,
"step": 2500
},
{
"epoch": 0.8125606992554225,
"grad_norm": 2.3214553433226452,
"learning_rate": 1.0326600834038003e-06,
"loss": 0.4341,
"step": 2510
},
{
"epoch": 0.8157979928779541,
"grad_norm": 1.7966203592073668,
"learning_rate": 9.985254520205827e-07,
"loss": 0.4381,
"step": 2520
},
{
"epoch": 0.8190352865004856,
"grad_norm": 1.558277834279681,
"learning_rate": 9.649018261562515e-07,
"loss": 0.4273,
"step": 2530
},
{
"epoch": 0.8222725801230172,
"grad_norm": 1.4720652884485796,
"learning_rate": 9.31793499692521e-07,
"loss": 0.403,
"step": 2540
},
{
"epoch": 0.8255098737455487,
"grad_norm": 1.2990539813318709,
"learning_rate": 8.992047007051502e-07,
"loss": 0.411,
"step": 2550
},
{
"epoch": 0.8287471673680803,
"grad_norm": 1.588973123293107,
"learning_rate": 8.671395909240054e-07,
"loss": 0.4426,
"step": 2560
},
{
"epoch": 0.8319844609906119,
"grad_norm": 1.5375275679186977,
"learning_rate": 8.356022652015888e-07,
"loss": 0.4356,
"step": 2570
},
{
"epoch": 0.8352217546131434,
"grad_norm": 2.0458866674407856,
"learning_rate": 8.04596750990107e-07,
"loss": 0.4332,
"step": 2580
},
{
"epoch": 0.838459048235675,
"grad_norm": 1.3170208953439513,
"learning_rate": 7.741270078271473e-07,
"loss": 0.4298,
"step": 2590
},
{
"epoch": 0.8416963418582065,
"grad_norm": 1.6800797824988918,
"learning_rate": 7.441969268300264e-07,
"loss": 0.4308,
"step": 2600
},
{
"epoch": 0.8449336354807381,
"grad_norm": 1.4007822337788043,
"learning_rate": 7.148103301988846e-07,
"loss": 0.4369,
"step": 2610
},
{
"epoch": 0.8481709291032696,
"grad_norm": 1.6939881245355604,
"learning_rate": 6.859709707285683e-07,
"loss": 0.393,
"step": 2620
},
{
"epoch": 0.8514082227258012,
"grad_norm": 1.2641245106504042,
"learning_rate": 6.576825313293811e-07,
"loss": 0.4156,
"step": 2630
},
{
"epoch": 0.8546455163483327,
"grad_norm": 1.0467779769347487,
"learning_rate": 6.299486245567677e-07,
"loss": 0.4187,
"step": 2640
},
{
"epoch": 0.8578828099708644,
"grad_norm": 1.3379069751937747,
"learning_rate": 6.027727921499654e-07,
"loss": 0.4391,
"step": 2650
},
{
"epoch": 0.861120103593396,
"grad_norm": 1.2248200595811436,
"learning_rate": 5.76158504579713e-07,
"loss": 0.4062,
"step": 2660
},
{
"epoch": 0.8643573972159275,
"grad_norm": 1.855147034712074,
"learning_rate": 5.501091606050646e-07,
"loss": 0.3993,
"step": 2670
},
{
"epoch": 0.8675946908384591,
"grad_norm": 1.0581073840185136,
"learning_rate": 5.246280868393389e-07,
"loss": 0.4208,
"step": 2680
},
{
"epoch": 0.8708319844609906,
"grad_norm": 1.8680186317258003,
"learning_rate": 4.997185373253038e-07,
"loss": 0.4205,
"step": 2690
},
{
"epoch": 0.8740692780835222,
"grad_norm": 1.1556112226095094,
"learning_rate": 4.7538369311962595e-07,
"loss": 0.436,
"step": 2700
},
{
"epoch": 0.8773065717060538,
"grad_norm": 1.2396365189028185,
"learning_rate": 4.5162666188662553e-07,
"loss": 0.4054,
"step": 2710
},
{
"epoch": 0.8805438653285853,
"grad_norm": 1.8606848711011432,
"learning_rate": 4.2845047750142364e-07,
"loss": 0.4381,
"step": 2720
},
{
"epoch": 0.8837811589511169,
"grad_norm": 1.2713949286294475,
"learning_rate": 4.058580996624961e-07,
"loss": 0.4261,
"step": 2730
},
{
"epoch": 0.8870184525736484,
"grad_norm": 1.169673887026325,
"learning_rate": 3.838524135137145e-07,
"loss": 0.4314,
"step": 2740
},
{
"epoch": 0.89025574619618,
"grad_norm": 1.4698696332980015,
"learning_rate": 3.624362292758932e-07,
"loss": 0.4261,
"step": 2750
},
{
"epoch": 0.8934930398187115,
"grad_norm": 1.117701112285988,
"learning_rate": 3.416122818879164e-07,
"loss": 0.414,
"step": 2760
},
{
"epoch": 0.8967303334412431,
"grad_norm": 1.8690202090261214,
"learning_rate": 3.2138323065747767e-07,
"loss": 0.4233,
"step": 2770
},
{
"epoch": 0.8999676270637746,
"grad_norm": 1.9444565939145249,
"learning_rate": 3.0175165892146696e-07,
"loss": 0.4314,
"step": 2780
},
{
"epoch": 0.9032049206863062,
"grad_norm": 1.7268620572069362,
"learning_rate": 2.8272007371607237e-07,
"loss": 0.4092,
"step": 2790
},
{
"epoch": 0.9064422143088378,
"grad_norm": 1.3013418095252707,
"learning_rate": 2.642909054566234e-07,
"loss": 0.4151,
"step": 2800
},
{
"epoch": 0.9096795079313694,
"grad_norm": 1.0436149624026436,
"learning_rate": 2.4646650762720936e-07,
"loss": 0.406,
"step": 2810
},
{
"epoch": 0.912916801553901,
"grad_norm": 1.1874840383962737,
"learning_rate": 2.2924915648013578e-07,
"loss": 0.4279,
"step": 2820
},
{
"epoch": 0.9161540951764325,
"grad_norm": 1.9059543067602664,
"learning_rate": 2.1264105074523367e-07,
"loss": 0.4337,
"step": 2830
},
{
"epoch": 0.9193913887989641,
"grad_norm": 1.4884966997206766,
"learning_rate": 1.966443113490729e-07,
"loss": 0.412,
"step": 2840
},
{
"epoch": 0.9226286824214956,
"grad_norm": 1.3718372730811397,
"learning_rate": 1.8126098114411073e-07,
"loss": 0.422,
"step": 2850
},
{
"epoch": 0.9258659760440272,
"grad_norm": 1.963050281617195,
"learning_rate": 1.6649302464781304e-07,
"loss": 0.4247,
"step": 2860
},
{
"epoch": 0.9291032696665588,
"grad_norm": 1.3124207972289312,
"learning_rate": 1.523423277917735e-07,
"loss": 0.4284,
"step": 2870
},
{
"epoch": 0.9323405632890903,
"grad_norm": 1.033110356628065,
"learning_rate": 1.3881069768087285e-07,
"loss": 0.4147,
"step": 2880
},
{
"epoch": 0.9355778569116219,
"grad_norm": 1.298106011447868,
"learning_rate": 1.258998623625063e-07,
"loss": 0.4221,
"step": 2890
},
{
"epoch": 0.9388151505341534,
"grad_norm": 1.1556899802309393,
"learning_rate": 1.1361147060590161e-07,
"loss": 0.4084,
"step": 2900
},
{
"epoch": 0.942052444156685,
"grad_norm": 1.4391398820149846,
"learning_rate": 1.0194709169156491e-07,
"loss": 0.4158,
"step": 2910
},
{
"epoch": 0.9452897377792165,
"grad_norm": 1.2127363073000252,
"learning_rate": 9.090821521087811e-08,
"loss": 0.4157,
"step": 2920
},
{
"epoch": 0.9485270314017481,
"grad_norm": 1.38644410758264,
"learning_rate": 8.049625087587054e-08,
"loss": 0.4288,
"step": 2930
},
{
"epoch": 0.9517643250242797,
"grad_norm": 1.904590572303641,
"learning_rate": 7.071252833919184e-08,
"loss": 0.4213,
"step": 2940
},
{
"epoch": 0.9550016186468112,
"grad_norm": 1.7465451060405173,
"learning_rate": 6.15582970243117e-08,
"loss": 0.4337,
"step": 2950
},
{
"epoch": 0.9582389122693429,
"grad_norm": 1.933526856777722,
"learning_rate": 5.3034725965960264e-08,
"loss": 0.4172,
"step": 2960
},
{
"epoch": 0.9614762058918744,
"grad_norm": 1.7005469732529432,
"learning_rate": 4.514290366084195e-08,
"loss": 0.4198,
"step": 2970
},
{
"epoch": 0.964713499514406,
"grad_norm": 1.0515454431201907,
"learning_rate": 3.788383792862393e-08,
"loss": 0.4466,
"step": 2980
},
{
"epoch": 0.9679507931369375,
"grad_norm": 1.454755463123962,
"learning_rate": 3.125845578323739e-08,
"loss": 0.4146,
"step": 2990
},
{
"epoch": 0.9711880867594691,
"grad_norm": 1.3458009194327718,
"learning_rate": 2.5267603314493848e-08,
"loss": 0.3876,
"step": 3000
},
{
"epoch": 0.9744253803820007,
"grad_norm": 1.140273366577025,
"learning_rate": 1.991204558003168e-08,
"loss": 0.4236,
"step": 3010
},
{
"epoch": 0.9776626740045322,
"grad_norm": 2.5414368780936645,
"learning_rate": 1.5192466507619742e-08,
"loss": 0.4246,
"step": 3020
},
{
"epoch": 0.9808999676270638,
"grad_norm": 1.7691943119432856,
"learning_rate": 1.110946880781616e-08,
"loss": 0.4249,
"step": 3030
},
{
"epoch": 0.9841372612495953,
"grad_norm": 1.8831840560064748,
"learning_rate": 7.663573896996568e-09,
"loss": 0.4331,
"step": 3040
},
{
"epoch": 0.9873745548721269,
"grad_norm": 1.2855991887240172,
"learning_rate": 4.855221830768475e-09,
"loss": 0.4004,
"step": 3050
},
{
"epoch": 0.9906118484946584,
"grad_norm": 1.0300289207398134,
"learning_rate": 2.684771247776774e-09,
"loss": 0.4518,
"step": 3060
},
{
"epoch": 0.99384914211719,
"grad_norm": 1.7128696766597806,
"learning_rate": 1.1524993239003801e-09,
"loss": 0.4115,
"step": 3070
},
{
"epoch": 0.9970864357397216,
"grad_norm": 1.9683937477037325,
"learning_rate": 2.5860173685721134e-10,
"loss": 0.4285,
"step": 3080
},
{
"epoch": 1.0,
"step": 3089,
"total_flos": 3417304969773056.0,
"train_loss": 0.47494275417865073,
"train_runtime": 147771.5827,
"train_samples_per_second": 2.675,
"train_steps_per_second": 0.021
}
],
"logging_steps": 10,
"max_steps": 3089,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3417304969773056.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}