{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.9963285024154587,
"eval_steps": 500,
"global_step": 1938,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.015458937198067632,
"grad_norm": 16.53294779967864,
"learning_rate": 5e-06,
"loss": 1.0792,
"step": 10
},
{
"epoch": 0.030917874396135265,
"grad_norm": 2.649325227711683,
"learning_rate": 5e-06,
"loss": 0.9837,
"step": 20
},
{
"epoch": 0.0463768115942029,
"grad_norm": 8.284136228811613,
"learning_rate": 5e-06,
"loss": 0.9474,
"step": 30
},
{
"epoch": 0.06183574879227053,
"grad_norm": 2.524509268818302,
"learning_rate": 5e-06,
"loss": 0.9301,
"step": 40
},
{
"epoch": 0.07729468599033816,
"grad_norm": 1.5326850741391629,
"learning_rate": 5e-06,
"loss": 0.9155,
"step": 50
},
{
"epoch": 0.0927536231884058,
"grad_norm": 1.0781869595697948,
"learning_rate": 5e-06,
"loss": 0.9037,
"step": 60
},
{
"epoch": 0.10821256038647344,
"grad_norm": 0.8943372564864077,
"learning_rate": 5e-06,
"loss": 0.8967,
"step": 70
},
{
"epoch": 0.12367149758454106,
"grad_norm": 0.8868808075595438,
"learning_rate": 5e-06,
"loss": 0.8867,
"step": 80
},
{
"epoch": 0.1391304347826087,
"grad_norm": 0.6755190824568067,
"learning_rate": 5e-06,
"loss": 0.8792,
"step": 90
},
{
"epoch": 0.15458937198067632,
"grad_norm": 0.6359486667796315,
"learning_rate": 5e-06,
"loss": 0.8769,
"step": 100
},
{
"epoch": 0.17004830917874397,
"grad_norm": 0.6299873717513695,
"learning_rate": 5e-06,
"loss": 0.8717,
"step": 110
},
{
"epoch": 0.1855072463768116,
"grad_norm": 0.5832655638295026,
"learning_rate": 5e-06,
"loss": 0.8671,
"step": 120
},
{
"epoch": 0.20096618357487922,
"grad_norm": 0.5586732401198556,
"learning_rate": 5e-06,
"loss": 0.8642,
"step": 130
},
{
"epoch": 0.21642512077294687,
"grad_norm": 0.6759403203697444,
"learning_rate": 5e-06,
"loss": 0.863,
"step": 140
},
{
"epoch": 0.2318840579710145,
"grad_norm": 0.5234393167054423,
"learning_rate": 5e-06,
"loss": 0.861,
"step": 150
},
{
"epoch": 0.24734299516908212,
"grad_norm": 0.5405697501688478,
"learning_rate": 5e-06,
"loss": 0.864,
"step": 160
},
{
"epoch": 0.26280193236714977,
"grad_norm": 0.5609477995758988,
"learning_rate": 5e-06,
"loss": 0.8528,
"step": 170
},
{
"epoch": 0.2782608695652174,
"grad_norm": 0.648827500892738,
"learning_rate": 5e-06,
"loss": 0.857,
"step": 180
},
{
"epoch": 0.293719806763285,
"grad_norm": 0.6627079853918527,
"learning_rate": 5e-06,
"loss": 0.8511,
"step": 190
},
{
"epoch": 0.30917874396135264,
"grad_norm": 0.6915034637639949,
"learning_rate": 5e-06,
"loss": 0.8485,
"step": 200
},
{
"epoch": 0.32463768115942027,
"grad_norm": 0.6366893171242987,
"learning_rate": 5e-06,
"loss": 0.8462,
"step": 210
},
{
"epoch": 0.34009661835748795,
"grad_norm": 0.5070708715215638,
"learning_rate": 5e-06,
"loss": 0.8446,
"step": 220
},
{
"epoch": 0.35555555555555557,
"grad_norm": 0.5871716320705027,
"learning_rate": 5e-06,
"loss": 0.8445,
"step": 230
},
{
"epoch": 0.3710144927536232,
"grad_norm": 0.7730510671832644,
"learning_rate": 5e-06,
"loss": 0.8497,
"step": 240
},
{
"epoch": 0.3864734299516908,
"grad_norm": 0.7142598054734569,
"learning_rate": 5e-06,
"loss": 0.8438,
"step": 250
},
{
"epoch": 0.40193236714975844,
"grad_norm": 0.5417070693738956,
"learning_rate": 5e-06,
"loss": 0.8476,
"step": 260
},
{
"epoch": 0.41739130434782606,
"grad_norm": 0.6163533652451042,
"learning_rate": 5e-06,
"loss": 0.8442,
"step": 270
},
{
"epoch": 0.43285024154589374,
"grad_norm": 0.5863440757370957,
"learning_rate": 5e-06,
"loss": 0.8384,
"step": 280
},
{
"epoch": 0.44830917874396137,
"grad_norm": 0.6841438022567938,
"learning_rate": 5e-06,
"loss": 0.8416,
"step": 290
},
{
"epoch": 0.463768115942029,
"grad_norm": 0.5830887688966261,
"learning_rate": 5e-06,
"loss": 0.8363,
"step": 300
},
{
"epoch": 0.4792270531400966,
"grad_norm": 0.7565147804535631,
"learning_rate": 5e-06,
"loss": 0.8391,
"step": 310
},
{
"epoch": 0.49468599033816424,
"grad_norm": 0.5377296055296723,
"learning_rate": 5e-06,
"loss": 0.8384,
"step": 320
},
{
"epoch": 0.5101449275362319,
"grad_norm": 0.6313225350990711,
"learning_rate": 5e-06,
"loss": 0.8344,
"step": 330
},
{
"epoch": 0.5256038647342995,
"grad_norm": 0.6011085474374273,
"learning_rate": 5e-06,
"loss": 0.8352,
"step": 340
},
{
"epoch": 0.5410628019323671,
"grad_norm": 0.6313718464958992,
"learning_rate": 5e-06,
"loss": 0.8319,
"step": 350
},
{
"epoch": 0.5565217391304348,
"grad_norm": 0.5602186929251594,
"learning_rate": 5e-06,
"loss": 0.8305,
"step": 360
},
{
"epoch": 0.5719806763285025,
"grad_norm": 0.5682159814727703,
"learning_rate": 5e-06,
"loss": 0.8283,
"step": 370
},
{
"epoch": 0.58743961352657,
"grad_norm": 0.5718556832144389,
"learning_rate": 5e-06,
"loss": 0.8341,
"step": 380
},
{
"epoch": 0.6028985507246377,
"grad_norm": 0.546768373760242,
"learning_rate": 5e-06,
"loss": 0.8291,
"step": 390
},
{
"epoch": 0.6183574879227053,
"grad_norm": 0.5993423126379529,
"learning_rate": 5e-06,
"loss": 0.8361,
"step": 400
},
{
"epoch": 0.633816425120773,
"grad_norm": 0.5491459229199431,
"learning_rate": 5e-06,
"loss": 0.8291,
"step": 410
},
{
"epoch": 0.6492753623188405,
"grad_norm": 0.5168339143544802,
"learning_rate": 5e-06,
"loss": 0.83,
"step": 420
},
{
"epoch": 0.6647342995169082,
"grad_norm": 0.5210184948415354,
"learning_rate": 5e-06,
"loss": 0.8237,
"step": 430
},
{
"epoch": 0.6801932367149759,
"grad_norm": 0.5424122263787127,
"learning_rate": 5e-06,
"loss": 0.8228,
"step": 440
},
{
"epoch": 0.6956521739130435,
"grad_norm": 0.5637417843194678,
"learning_rate": 5e-06,
"loss": 0.829,
"step": 450
},
{
"epoch": 0.7111111111111111,
"grad_norm": 0.4743888435043112,
"learning_rate": 5e-06,
"loss": 0.8221,
"step": 460
},
{
"epoch": 0.7265700483091787,
"grad_norm": 0.5224626427829437,
"learning_rate": 5e-06,
"loss": 0.8254,
"step": 470
},
{
"epoch": 0.7420289855072464,
"grad_norm": 0.6508696496425336,
"learning_rate": 5e-06,
"loss": 0.8264,
"step": 480
},
{
"epoch": 0.7574879227053141,
"grad_norm": 0.5904629481154552,
"learning_rate": 5e-06,
"loss": 0.8295,
"step": 490
},
{
"epoch": 0.7729468599033816,
"grad_norm": 0.574388340778744,
"learning_rate": 5e-06,
"loss": 0.8223,
"step": 500
},
{
"epoch": 0.7884057971014493,
"grad_norm": 0.5353158279457252,
"learning_rate": 5e-06,
"loss": 0.8257,
"step": 510
},
{
"epoch": 0.8038647342995169,
"grad_norm": 0.5821234180105461,
"learning_rate": 5e-06,
"loss": 0.8265,
"step": 520
},
{
"epoch": 0.8193236714975846,
"grad_norm": 0.5068894400573091,
"learning_rate": 5e-06,
"loss": 0.8224,
"step": 530
},
{
"epoch": 0.8347826086956521,
"grad_norm": 0.5375828449207095,
"learning_rate": 5e-06,
"loss": 0.821,
"step": 540
},
{
"epoch": 0.8502415458937198,
"grad_norm": 0.5410980089439529,
"learning_rate": 5e-06,
"loss": 0.8271,
"step": 550
},
{
"epoch": 0.8657004830917875,
"grad_norm": 0.5953566167479901,
"learning_rate": 5e-06,
"loss": 0.8168,
"step": 560
},
{
"epoch": 0.881159420289855,
"grad_norm": 0.6599677690788177,
"learning_rate": 5e-06,
"loss": 0.8234,
"step": 570
},
{
"epoch": 0.8966183574879227,
"grad_norm": 0.6548750016255851,
"learning_rate": 5e-06,
"loss": 0.8198,
"step": 580
},
{
"epoch": 0.9120772946859903,
"grad_norm": 0.6314610392042965,
"learning_rate": 5e-06,
"loss": 0.8169,
"step": 590
},
{
"epoch": 0.927536231884058,
"grad_norm": 0.4920135279723095,
"learning_rate": 5e-06,
"loss": 0.8225,
"step": 600
},
{
"epoch": 0.9429951690821256,
"grad_norm": 0.5279136309564921,
"learning_rate": 5e-06,
"loss": 0.8188,
"step": 610
},
{
"epoch": 0.9584541062801932,
"grad_norm": 0.5531816471285108,
"learning_rate": 5e-06,
"loss": 0.8217,
"step": 620
},
{
"epoch": 0.9739130434782609,
"grad_norm": 0.5924801838934433,
"learning_rate": 5e-06,
"loss": 0.8221,
"step": 630
},
{
"epoch": 0.9893719806763285,
"grad_norm": 0.5117042192273262,
"learning_rate": 5e-06,
"loss": 0.8188,
"step": 640
},
{
"epoch": 0.9986473429951691,
"eval_loss": 0.8183467984199524,
"eval_runtime": 686.568,
"eval_samples_per_second": 25.389,
"eval_steps_per_second": 0.398,
"step": 646
},
{
"epoch": 1.0050241545893719,
"grad_norm": 0.6925909025947767,
"learning_rate": 5e-06,
"loss": 0.8235,
"step": 650
},
{
"epoch": 1.0204830917874397,
"grad_norm": 0.6132973767615294,
"learning_rate": 5e-06,
"loss": 0.7771,
"step": 660
},
{
"epoch": 1.0359420289855072,
"grad_norm": 0.6797848846411009,
"learning_rate": 5e-06,
"loss": 0.7802,
"step": 670
},
{
"epoch": 1.0514009661835748,
"grad_norm": 0.5295735808202817,
"learning_rate": 5e-06,
"loss": 0.7777,
"step": 680
},
{
"epoch": 1.0668599033816426,
"grad_norm": 0.5271721002677758,
"learning_rate": 5e-06,
"loss": 0.7751,
"step": 690
},
{
"epoch": 1.0823188405797102,
"grad_norm": 0.47521281338293253,
"learning_rate": 5e-06,
"loss": 0.7808,
"step": 700
},
{
"epoch": 1.0977777777777777,
"grad_norm": 0.5201403409762577,
"learning_rate": 5e-06,
"loss": 0.7769,
"step": 710
},
{
"epoch": 1.1132367149758453,
"grad_norm": 0.5374055398678584,
"learning_rate": 5e-06,
"loss": 0.7775,
"step": 720
},
{
"epoch": 1.128695652173913,
"grad_norm": 0.520683864449963,
"learning_rate": 5e-06,
"loss": 0.7787,
"step": 730
},
{
"epoch": 1.1441545893719807,
"grad_norm": 0.5406489528118505,
"learning_rate": 5e-06,
"loss": 0.7816,
"step": 740
},
{
"epoch": 1.1596135265700482,
"grad_norm": 0.585881797178412,
"learning_rate": 5e-06,
"loss": 0.7811,
"step": 750
},
{
"epoch": 1.175072463768116,
"grad_norm": 0.5490222258224376,
"learning_rate": 5e-06,
"loss": 0.7763,
"step": 760
},
{
"epoch": 1.1905314009661836,
"grad_norm": 0.6049557272461074,
"learning_rate": 5e-06,
"loss": 0.7821,
"step": 770
},
{
"epoch": 1.2059903381642512,
"grad_norm": 0.6287813068938076,
"learning_rate": 5e-06,
"loss": 0.7771,
"step": 780
},
{
"epoch": 1.221449275362319,
"grad_norm": 0.5791771698431348,
"learning_rate": 5e-06,
"loss": 0.7832,
"step": 790
},
{
"epoch": 1.2369082125603865,
"grad_norm": 0.552647068072239,
"learning_rate": 5e-06,
"loss": 0.7795,
"step": 800
},
{
"epoch": 1.252367149758454,
"grad_norm": 0.48953182542010515,
"learning_rate": 5e-06,
"loss": 0.7767,
"step": 810
},
{
"epoch": 1.2678260869565217,
"grad_norm": 0.5809037976182655,
"learning_rate": 5e-06,
"loss": 0.7784,
"step": 820
},
{
"epoch": 1.2832850241545894,
"grad_norm": 0.49664609280994976,
"learning_rate": 5e-06,
"loss": 0.7765,
"step": 830
},
{
"epoch": 1.298743961352657,
"grad_norm": 0.5514267021897065,
"learning_rate": 5e-06,
"loss": 0.7791,
"step": 840
},
{
"epoch": 1.3142028985507246,
"grad_norm": 0.6174163379347436,
"learning_rate": 5e-06,
"loss": 0.7775,
"step": 850
},
{
"epoch": 1.3296618357487922,
"grad_norm": 0.5893029009867757,
"learning_rate": 5e-06,
"loss": 0.7743,
"step": 860
},
{
"epoch": 1.34512077294686,
"grad_norm": 0.5884521899466931,
"learning_rate": 5e-06,
"loss": 0.7768,
"step": 870
},
{
"epoch": 1.3605797101449275,
"grad_norm": 0.526781782612563,
"learning_rate": 5e-06,
"loss": 0.773,
"step": 880
},
{
"epoch": 1.376038647342995,
"grad_norm": 0.5133541303046,
"learning_rate": 5e-06,
"loss": 0.774,
"step": 890
},
{
"epoch": 1.3914975845410629,
"grad_norm": 0.5514217537787884,
"learning_rate": 5e-06,
"loss": 0.7802,
"step": 900
},
{
"epoch": 1.4069565217391304,
"grad_norm": 0.5829849669974898,
"learning_rate": 5e-06,
"loss": 0.7787,
"step": 910
},
{
"epoch": 1.422415458937198,
"grad_norm": 0.6099035981973764,
"learning_rate": 5e-06,
"loss": 0.7738,
"step": 920
},
{
"epoch": 1.4378743961352658,
"grad_norm": 0.4767884324426242,
"learning_rate": 5e-06,
"loss": 0.7773,
"step": 930
},
{
"epoch": 1.4533333333333334,
"grad_norm": 0.5611337081061908,
"learning_rate": 5e-06,
"loss": 0.7767,
"step": 940
},
{
"epoch": 1.468792270531401,
"grad_norm": 0.47930773858272085,
"learning_rate": 5e-06,
"loss": 0.7765,
"step": 950
},
{
"epoch": 1.4842512077294687,
"grad_norm": 0.498168257215718,
"learning_rate": 5e-06,
"loss": 0.7728,
"step": 960
},
{
"epoch": 1.4997101449275363,
"grad_norm": 0.5576989172567428,
"learning_rate": 5e-06,
"loss": 0.7777,
"step": 970
},
{
"epoch": 1.5151690821256039,
"grad_norm": 0.5873903650866855,
"learning_rate": 5e-06,
"loss": 0.7747,
"step": 980
},
{
"epoch": 1.5306280193236717,
"grad_norm": 0.5564865473674926,
"learning_rate": 5e-06,
"loss": 0.7786,
"step": 990
},
{
"epoch": 1.546086956521739,
"grad_norm": 0.6746662280932265,
"learning_rate": 5e-06,
"loss": 0.7823,
"step": 1000
},
{
"epoch": 1.5615458937198068,
"grad_norm": 0.550553366091711,
"learning_rate": 5e-06,
"loss": 0.7704,
"step": 1010
},
{
"epoch": 1.5770048309178744,
"grad_norm": 0.555996816403915,
"learning_rate": 5e-06,
"loss": 0.7758,
"step": 1020
},
{
"epoch": 1.592463768115942,
"grad_norm": 0.5621088990135378,
"learning_rate": 5e-06,
"loss": 0.7751,
"step": 1030
},
{
"epoch": 1.6079227053140097,
"grad_norm": 0.4672348676970037,
"learning_rate": 5e-06,
"loss": 0.7742,
"step": 1040
},
{
"epoch": 1.6233816425120773,
"grad_norm": 0.49112359521062937,
"learning_rate": 5e-06,
"loss": 0.777,
"step": 1050
},
{
"epoch": 1.6388405797101449,
"grad_norm": 0.5517626252028611,
"learning_rate": 5e-06,
"loss": 0.7757,
"step": 1060
},
{
"epoch": 1.6542995169082126,
"grad_norm": 0.5518129870744243,
"learning_rate": 5e-06,
"loss": 0.7744,
"step": 1070
},
{
"epoch": 1.6697584541062802,
"grad_norm": 0.685405898117341,
"learning_rate": 5e-06,
"loss": 0.7753,
"step": 1080
},
{
"epoch": 1.6852173913043478,
"grad_norm": 0.5720673042328214,
"learning_rate": 5e-06,
"loss": 0.7753,
"step": 1090
},
{
"epoch": 1.7006763285024156,
"grad_norm": 0.4690028175072265,
"learning_rate": 5e-06,
"loss": 0.774,
"step": 1100
},
{
"epoch": 1.7161352657004831,
"grad_norm": 0.55568178811657,
"learning_rate": 5e-06,
"loss": 0.7772,
"step": 1110
},
{
"epoch": 1.7315942028985507,
"grad_norm": 0.5185570493500619,
"learning_rate": 5e-06,
"loss": 0.781,
"step": 1120
},
{
"epoch": 1.7470531400966185,
"grad_norm": 0.5292299708932318,
"learning_rate": 5e-06,
"loss": 0.7749,
"step": 1130
},
{
"epoch": 1.7625120772946858,
"grad_norm": 0.5884058161213621,
"learning_rate": 5e-06,
"loss": 0.7719,
"step": 1140
},
{
"epoch": 1.7779710144927536,
"grad_norm": 0.5072506431239099,
"learning_rate": 5e-06,
"loss": 0.7753,
"step": 1150
},
{
"epoch": 1.7934299516908214,
"grad_norm": 0.5551938392960334,
"learning_rate": 5e-06,
"loss": 0.7777,
"step": 1160
},
{
"epoch": 1.8088888888888888,
"grad_norm": 0.5566487078925049,
"learning_rate": 5e-06,
"loss": 0.7774,
"step": 1170
},
{
"epoch": 1.8243478260869566,
"grad_norm": 0.4749917546235466,
"learning_rate": 5e-06,
"loss": 0.7734,
"step": 1180
},
{
"epoch": 1.8398067632850241,
"grad_norm": 0.5022635709311233,
"learning_rate": 5e-06,
"loss": 0.7743,
"step": 1190
},
{
"epoch": 1.8552657004830917,
"grad_norm": 0.5442982810099344,
"learning_rate": 5e-06,
"loss": 0.7728,
"step": 1200
},
{
"epoch": 1.8707246376811595,
"grad_norm": 0.5155014433123901,
"learning_rate": 5e-06,
"loss": 0.774,
"step": 1210
},
{
"epoch": 1.886183574879227,
"grad_norm": 0.5934285413681538,
"learning_rate": 5e-06,
"loss": 0.7746,
"step": 1220
},
{
"epoch": 1.9016425120772946,
"grad_norm": 0.5260175972638601,
"learning_rate": 5e-06,
"loss": 0.7693,
"step": 1230
},
{
"epoch": 1.9171014492753624,
"grad_norm": 0.515080447493818,
"learning_rate": 5e-06,
"loss": 0.7717,
"step": 1240
},
{
"epoch": 1.93256038647343,
"grad_norm": 0.6011160845737209,
"learning_rate": 5e-06,
"loss": 0.7754,
"step": 1250
},
{
"epoch": 1.9480193236714975,
"grad_norm": 0.46061302659355685,
"learning_rate": 5e-06,
"loss": 0.7755,
"step": 1260
},
{
"epoch": 1.9634782608695653,
"grad_norm": 0.46039484020056154,
"learning_rate": 5e-06,
"loss": 0.7722,
"step": 1270
},
{
"epoch": 1.9789371980676327,
"grad_norm": 0.5658493454639554,
"learning_rate": 5e-06,
"loss": 0.7755,
"step": 1280
},
{
"epoch": 1.9943961352657005,
"grad_norm": 0.5908199178180503,
"learning_rate": 5e-06,
"loss": 0.7709,
"step": 1290
},
{
"epoch": 1.9990338164251207,
"eval_loss": 0.8051349520683289,
"eval_runtime": 690.0891,
"eval_samples_per_second": 25.259,
"eval_steps_per_second": 0.396,
"step": 1293
},
{
"epoch": 2.0100483091787438,
"grad_norm": 0.6943407863572525,
"learning_rate": 5e-06,
"loss": 0.7604,
"step": 1300
},
{
"epoch": 2.0255072463768116,
"grad_norm": 0.5840764739328596,
"learning_rate": 5e-06,
"loss": 0.7299,
"step": 1310
},
{
"epoch": 2.0409661835748794,
"grad_norm": 0.645835170219903,
"learning_rate": 5e-06,
"loss": 0.7254,
"step": 1320
},
{
"epoch": 2.0564251207729467,
"grad_norm": 0.6967100498978297,
"learning_rate": 5e-06,
"loss": 0.7312,
"step": 1330
},
{
"epoch": 2.0718840579710145,
"grad_norm": 0.5424795093750152,
"learning_rate": 5e-06,
"loss": 0.7283,
"step": 1340
},
{
"epoch": 2.0873429951690823,
"grad_norm": 0.5651081335517218,
"learning_rate": 5e-06,
"loss": 0.7322,
"step": 1350
},
{
"epoch": 2.1028019323671496,
"grad_norm": 0.5793019251125064,
"learning_rate": 5e-06,
"loss": 0.7317,
"step": 1360
},
{
"epoch": 2.1182608695652174,
"grad_norm": 0.5653295937261641,
"learning_rate": 5e-06,
"loss": 0.7331,
"step": 1370
},
{
"epoch": 2.133719806763285,
"grad_norm": 0.6945092784765529,
"learning_rate": 5e-06,
"loss": 0.7346,
"step": 1380
},
{
"epoch": 2.1491787439613526,
"grad_norm": 0.5795163218543443,
"learning_rate": 5e-06,
"loss": 0.7336,
"step": 1390
},
{
"epoch": 2.1646376811594203,
"grad_norm": 0.5922357321216497,
"learning_rate": 5e-06,
"loss": 0.7299,
"step": 1400
},
{
"epoch": 2.1800966183574877,
"grad_norm": 0.5570557796263025,
"learning_rate": 5e-06,
"loss": 0.7333,
"step": 1410
},
{
"epoch": 2.1955555555555555,
"grad_norm": 0.5392312450784695,
"learning_rate": 5e-06,
"loss": 0.7371,
"step": 1420
},
{
"epoch": 2.2110144927536233,
"grad_norm": 0.569063560563541,
"learning_rate": 5e-06,
"loss": 0.7314,
"step": 1430
},
{
"epoch": 2.2264734299516906,
"grad_norm": 0.6107660118171969,
"learning_rate": 5e-06,
"loss": 0.7322,
"step": 1440
},
{
"epoch": 2.2419323671497584,
"grad_norm": 0.6566517138097786,
"learning_rate": 5e-06,
"loss": 0.7356,
"step": 1450
},
{
"epoch": 2.257391304347826,
"grad_norm": 0.5806353609910259,
"learning_rate": 5e-06,
"loss": 0.7418,
"step": 1460
},
{
"epoch": 2.2728502415458935,
"grad_norm": 0.544246667709765,
"learning_rate": 5e-06,
"loss": 0.7319,
"step": 1470
},
{
"epoch": 2.2883091787439613,
"grad_norm": 0.5424208252581,
"learning_rate": 5e-06,
"loss": 0.7332,
"step": 1480
},
{
"epoch": 2.303768115942029,
"grad_norm": 0.5380434193955503,
"learning_rate": 5e-06,
"loss": 0.7342,
"step": 1490
},
{
"epoch": 2.3192270531400965,
"grad_norm": 0.5919093406358342,
"learning_rate": 5e-06,
"loss": 0.7345,
"step": 1500
},
{
"epoch": 2.3346859903381643,
"grad_norm": 0.5815232359700448,
"learning_rate": 5e-06,
"loss": 0.7357,
"step": 1510
},
{
"epoch": 2.350144927536232,
"grad_norm": 0.6561512544812266,
"learning_rate": 5e-06,
"loss": 0.7339,
"step": 1520
},
{
"epoch": 2.3656038647342994,
"grad_norm": 0.5328952220385875,
"learning_rate": 5e-06,
"loss": 0.7297,
"step": 1530
},
{
"epoch": 2.381062801932367,
"grad_norm": 0.5216733576185124,
"learning_rate": 5e-06,
"loss": 0.7298,
"step": 1540
},
{
"epoch": 2.396521739130435,
"grad_norm": 0.6063067814678768,
"learning_rate": 5e-06,
"loss": 0.7368,
"step": 1550
},
{
"epoch": 2.4119806763285023,
"grad_norm": 0.5818602690123681,
"learning_rate": 5e-06,
"loss": 0.7353,
"step": 1560
},
{
"epoch": 2.42743961352657,
"grad_norm": 0.5913577701518534,
"learning_rate": 5e-06,
"loss": 0.7338,
"step": 1570
},
{
"epoch": 2.442898550724638,
"grad_norm": 0.5527497540961946,
"learning_rate": 5e-06,
"loss": 0.7329,
"step": 1580
},
{
"epoch": 2.4583574879227053,
"grad_norm": 0.6737570445790982,
"learning_rate": 5e-06,
"loss": 0.7367,
"step": 1590
},
{
"epoch": 2.473816425120773,
"grad_norm": 0.6619470586787684,
"learning_rate": 5e-06,
"loss": 0.733,
"step": 1600
},
{
"epoch": 2.4892753623188404,
"grad_norm": 0.4750068577638755,
"learning_rate": 5e-06,
"loss": 0.7375,
"step": 1610
},
{
"epoch": 2.504734299516908,
"grad_norm": 0.6847743909506772,
"learning_rate": 5e-06,
"loss": 0.7374,
"step": 1620
},
{
"epoch": 2.520193236714976,
"grad_norm": 0.5239840846624293,
"learning_rate": 5e-06,
"loss": 0.7311,
"step": 1630
},
{
"epoch": 2.5356521739130433,
"grad_norm": 0.4721718835375596,
"learning_rate": 5e-06,
"loss": 0.7308,
"step": 1640
},
{
"epoch": 2.551111111111111,
"grad_norm": 0.51093602092176,
"learning_rate": 5e-06,
"loss": 0.7337,
"step": 1650
},
{
"epoch": 2.566570048309179,
"grad_norm": 0.5517386015611798,
"learning_rate": 5e-06,
"loss": 0.7318,
"step": 1660
},
{
"epoch": 2.5820289855072462,
"grad_norm": 0.6326674619813268,
"learning_rate": 5e-06,
"loss": 0.736,
"step": 1670
},
{
"epoch": 2.597487922705314,
"grad_norm": 0.5232840712675151,
"learning_rate": 5e-06,
"loss": 0.7325,
"step": 1680
},
{
"epoch": 2.6129468599033814,
"grad_norm": 0.4969751533645812,
"learning_rate": 5e-06,
"loss": 0.7375,
"step": 1690
},
{
"epoch": 2.628405797101449,
"grad_norm": 0.49538430512331766,
"learning_rate": 5e-06,
"loss": 0.7366,
"step": 1700
},
{
"epoch": 2.643864734299517,
"grad_norm": 0.6208865012276192,
"learning_rate": 5e-06,
"loss": 0.7372,
"step": 1710
},
{
"epoch": 2.6593236714975843,
"grad_norm": 0.5276942120485377,
"learning_rate": 5e-06,
"loss": 0.7351,
"step": 1720
},
{
"epoch": 2.674782608695652,
"grad_norm": 0.570808842039396,
"learning_rate": 5e-06,
"loss": 0.7384,
"step": 1730
},
{
"epoch": 2.69024154589372,
"grad_norm": 0.5214638213365278,
"learning_rate": 5e-06,
"loss": 0.7361,
"step": 1740
},
{
"epoch": 2.7057004830917872,
"grad_norm": 0.5190586781651014,
"learning_rate": 5e-06,
"loss": 0.7309,
"step": 1750
},
{
"epoch": 2.721159420289855,
"grad_norm": 0.5317230869170978,
"learning_rate": 5e-06,
"loss": 0.7296,
"step": 1760
},
{
"epoch": 2.736618357487923,
"grad_norm": 0.5917255596181432,
"learning_rate": 5e-06,
"loss": 0.7406,
"step": 1770
},
{
"epoch": 2.75207729468599,
"grad_norm": 0.49202576322983876,
"learning_rate": 5e-06,
"loss": 0.7324,
"step": 1780
},
{
"epoch": 2.767536231884058,
"grad_norm": 0.5594574654106287,
"learning_rate": 5e-06,
"loss": 0.7331,
"step": 1790
},
{
"epoch": 2.7829951690821257,
"grad_norm": 0.6198580773466541,
"learning_rate": 5e-06,
"loss": 0.7372,
"step": 1800
},
{
"epoch": 2.798454106280193,
"grad_norm": 0.5740394274550438,
"learning_rate": 5e-06,
"loss": 0.7359,
"step": 1810
},
{
"epoch": 2.813913043478261,
"grad_norm": 0.5501912428656768,
"learning_rate": 5e-06,
"loss": 0.7384,
"step": 1820
},
{
"epoch": 2.8293719806763287,
"grad_norm": 0.5104778986757859,
"learning_rate": 5e-06,
"loss": 0.7324,
"step": 1830
},
{
"epoch": 2.844830917874396,
"grad_norm": 0.5395220598812313,
"learning_rate": 5e-06,
"loss": 0.736,
"step": 1840
},
{
"epoch": 2.860289855072464,
"grad_norm": 0.6030104859258091,
"learning_rate": 5e-06,
"loss": 0.7327,
"step": 1850
},
{
"epoch": 2.8757487922705316,
"grad_norm": 0.556906171705928,
"learning_rate": 5e-06,
"loss": 0.7374,
"step": 1860
},
{
"epoch": 2.891207729468599,
"grad_norm": 0.6174821846225631,
"learning_rate": 5e-06,
"loss": 0.7351,
"step": 1870
},
{
"epoch": 2.9066666666666667,
"grad_norm": 0.5078906232420815,
"learning_rate": 5e-06,
"loss": 0.7326,
"step": 1880
},
{
"epoch": 2.9221256038647345,
"grad_norm": 0.6177111487230912,
"learning_rate": 5e-06,
"loss": 0.7321,
"step": 1890
},
{
"epoch": 2.937584541062802,
"grad_norm": 0.5520929737500946,
"learning_rate": 5e-06,
"loss": 0.7395,
"step": 1900
},
{
"epoch": 2.9530434782608697,
"grad_norm": 0.5185834378400617,
"learning_rate": 5e-06,
"loss": 0.7368,
"step": 1910
},
{
"epoch": 2.9685024154589374,
"grad_norm": 0.5204851978024219,
"learning_rate": 5e-06,
"loss": 0.7339,
"step": 1920
},
{
"epoch": 2.983961352657005,
"grad_norm": 0.5807949438616106,
"learning_rate": 5e-06,
"loss": 0.7386,
"step": 1930
},
{
"epoch": 2.9963285024154587,
"eval_loss": 0.8025317192077637,
"eval_runtime": 693.9451,
"eval_samples_per_second": 25.119,
"eval_steps_per_second": 0.393,
"step": 1938
},
{
"epoch": 2.9963285024154587,
"step": 1938,
"total_flos": 3246012802007040.0,
"train_loss": 0.7871413270263357,
"train_runtime": 114590.6996,
"train_samples_per_second": 8.67,
"train_steps_per_second": 0.017
}
],
"logging_steps": 10,
"max_steps": 1938,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3246012802007040.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}