pretrain_utg4java_220m_seq1024 / trainer_state.json
{
"best_metric": 0.23903648555278778,
"best_model_checkpoint": "/root/pretrain_executions/pretrain_utg4java_220m_seq1024/checkpoint-38422",
"epoch": 49.992029332058024,
"eval_steps": 500,
"global_step": 39200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.3953451299218875,
"grad_norm": 0.6701709628105164,
"learning_rate": 7.908163265306123e-05,
"loss": 0.5421,
"step": 310
},
{
"epoch": 0.790690259843775,
"grad_norm": 0.7041985988616943,
"learning_rate": 0.00015816326530612246,
"loss": 0.517,
"step": 620
},
{
"epoch": 0.9998405866411605,
"eval_loss": 0.4792475402355194,
"eval_runtime": 146.6334,
"eval_samples_per_second": 85.567,
"eval_steps_per_second": 2.68,
"step": 784
},
{
"epoch": 1.1860353897656624,
"grad_norm": 0.7499191164970398,
"learning_rate": 0.00019924510620574761,
"loss": 0.5394,
"step": 930
},
{
"epoch": 1.5813805196875497,
"grad_norm": 0.6494282484054565,
"learning_rate": 0.00019763119533527698,
"loss": 0.5327,
"step": 1240
},
{
"epoch": 1.9767256496094374,
"grad_norm": 0.5610156059265137,
"learning_rate": 0.00019601728446480633,
"loss": 0.518,
"step": 1550
},
{
"epoch": 1.9996811732823212,
"eval_loss": 0.45565617084503174,
"eval_runtime": 146.6648,
"eval_samples_per_second": 85.549,
"eval_steps_per_second": 2.68,
"step": 1568
},
{
"epoch": 2.3720707795313247,
"grad_norm": 0.6261674761772156,
"learning_rate": 0.00019440337359433573,
"loss": 0.5076,
"step": 1860
},
{
"epoch": 2.767415909453212,
"grad_norm": 0.5601200461387634,
"learning_rate": 0.00019278946272386507,
"loss": 0.4972,
"step": 2170
},
{
"epoch": 2.9995217599234816,
"eval_loss": 0.4429556429386139,
"eval_runtime": 146.6472,
"eval_samples_per_second": 85.559,
"eval_steps_per_second": 2.68,
"step": 2352
},
{
"epoch": 3.1627610393751,
"grad_norm": 0.6102643609046936,
"learning_rate": 0.00019117555185339441,
"loss": 0.4957,
"step": 2480
},
{
"epoch": 3.558106169296987,
"grad_norm": 0.6246281266212463,
"learning_rate": 0.00018956164098292379,
"loss": 0.4827,
"step": 2790
},
{
"epoch": 3.9534512992188744,
"grad_norm": 0.6781056523323059,
"learning_rate": 0.00018794773011245316,
"loss": 0.4736,
"step": 3100
},
{
"epoch": 3.999362346564642,
"eval_loss": 0.42357587814331055,
"eval_runtime": 146.7171,
"eval_samples_per_second": 85.518,
"eval_steps_per_second": 2.679,
"step": 3136
},
{
"epoch": 4.348796429140762,
"grad_norm": 0.5225201845169067,
"learning_rate": 0.00018633381924198253,
"loss": 0.4686,
"step": 3410
},
{
"epoch": 4.7441415590626494,
"grad_norm": 0.5367516279220581,
"learning_rate": 0.00018471990837151187,
"loss": 0.4575,
"step": 3720
},
{
"epoch": 4.999202933205803,
"eval_loss": 0.4168592095375061,
"eval_runtime": 146.7474,
"eval_samples_per_second": 85.501,
"eval_steps_per_second": 2.678,
"step": 3920
},
{
"epoch": 5.139486688984537,
"grad_norm": 0.5979415774345398,
"learning_rate": 0.00018310599750104124,
"loss": 0.4591,
"step": 4030
},
{
"epoch": 5.534831818906424,
"grad_norm": 0.6041168570518494,
"learning_rate": 0.0001814920866305706,
"loss": 0.4506,
"step": 4340
},
{
"epoch": 5.930176948828311,
"grad_norm": 0.5398473739624023,
"learning_rate": 0.00017987817576009998,
"loss": 0.4499,
"step": 4650
},
{
"epoch": 5.999043519846963,
"eval_loss": 0.3998393714427948,
"eval_runtime": 146.7067,
"eval_samples_per_second": 85.524,
"eval_steps_per_second": 2.679,
"step": 4704
},
{
"epoch": 6.3255220787502,
"grad_norm": 0.5446251630783081,
"learning_rate": 0.00017826426488962933,
"loss": 0.4444,
"step": 4960
},
{
"epoch": 6.720867208672087,
"grad_norm": 0.564083993434906,
"learning_rate": 0.00017665556018325697,
"loss": 0.4401,
"step": 5270
},
{
"epoch": 6.9988841064881235,
"eval_loss": 0.3943786323070526,
"eval_runtime": 146.7223,
"eval_samples_per_second": 85.515,
"eval_steps_per_second": 2.679,
"step": 5488
},
{
"epoch": 7.116212338593974,
"grad_norm": 0.5197238326072693,
"learning_rate": 0.00017504164931278634,
"loss": 0.4349,
"step": 5580
},
{
"epoch": 7.5115574685158615,
"grad_norm": 0.5063862204551697,
"learning_rate": 0.0001734277384423157,
"loss": 0.4274,
"step": 5890
},
{
"epoch": 7.906902598437749,
"grad_norm": 0.8238950371742249,
"learning_rate": 0.00017181382757184508,
"loss": 0.4275,
"step": 6200
},
{
"epoch": 8.0,
"eval_loss": 0.38801178336143494,
"eval_runtime": 146.7154,
"eval_samples_per_second": 85.519,
"eval_steps_per_second": 2.679,
"step": 6273
},
{
"epoch": 8.302247728359637,
"grad_norm": 0.4785802662372589,
"learning_rate": 0.00017019991670137442,
"loss": 0.4218,
"step": 6510
},
{
"epoch": 8.697592858281524,
"grad_norm": 0.5460196137428284,
"learning_rate": 0.0001685860058309038,
"loss": 0.4165,
"step": 6820
},
{
"epoch": 8.999840586641161,
"eval_loss": 0.3786679804325104,
"eval_runtime": 146.6889,
"eval_samples_per_second": 85.535,
"eval_steps_per_second": 2.679,
"step": 7057
},
{
"epoch": 9.092937988203412,
"grad_norm": 0.5532106161117554,
"learning_rate": 0.00016697209496043317,
"loss": 0.4147,
"step": 7130
},
{
"epoch": 9.488283118125299,
"grad_norm": 0.5270036458969116,
"learning_rate": 0.00016535818408996254,
"loss": 0.4054,
"step": 7440
},
{
"epoch": 9.883628248047186,
"grad_norm": 0.5107512474060059,
"learning_rate": 0.00016374427321949188,
"loss": 0.407,
"step": 7750
},
{
"epoch": 9.99968117328232,
"eval_loss": 0.3678111732006073,
"eval_runtime": 146.7168,
"eval_samples_per_second": 85.519,
"eval_steps_per_second": 2.679,
"step": 7841
},
{
"epoch": 10.278973377969074,
"grad_norm": 0.4663056433200836,
"learning_rate": 0.00016213036234902125,
"loss": 0.4001,
"step": 8060
},
{
"epoch": 10.67431850789096,
"grad_norm": 0.5166866183280945,
"learning_rate": 0.00016051645147855062,
"loss": 0.4012,
"step": 8370
},
{
"epoch": 10.999521759923482,
"eval_loss": 0.36857831478118896,
"eval_runtime": 146.752,
"eval_samples_per_second": 85.498,
"eval_steps_per_second": 2.678,
"step": 8625
},
{
"epoch": 11.069663637812848,
"grad_norm": 0.5623896718025208,
"learning_rate": 0.00015890254060807997,
"loss": 0.3967,
"step": 8680
},
{
"epoch": 11.465008767734735,
"grad_norm": 0.4826233983039856,
"learning_rate": 0.00015728862973760934,
"loss": 0.3902,
"step": 8990
},
{
"epoch": 11.860353897656623,
"grad_norm": 0.5024587512016296,
"learning_rate": 0.00015567471886713868,
"loss": 0.3889,
"step": 9300
},
{
"epoch": 11.999362346564642,
"eval_loss": 0.35674843192100525,
"eval_runtime": 146.7093,
"eval_samples_per_second": 85.523,
"eval_steps_per_second": 2.679,
"step": 9409
},
{
"epoch": 12.255699027578512,
"grad_norm": 0.4992258846759796,
"learning_rate": 0.00015406080799666805,
"loss": 0.3838,
"step": 9610
},
{
"epoch": 12.6510441575004,
"grad_norm": 0.4781612455844879,
"learning_rate": 0.00015244689712619742,
"loss": 0.3789,
"step": 9920
},
{
"epoch": 12.999202933205803,
"eval_loss": 0.35254529118537903,
"eval_runtime": 146.7424,
"eval_samples_per_second": 85.504,
"eval_steps_per_second": 2.678,
"step": 10193
},
{
"epoch": 13.046389287422286,
"grad_norm": 0.49535173177719116,
"learning_rate": 0.0001508329862557268,
"loss": 0.3775,
"step": 10230
},
{
"epoch": 13.441734417344174,
"grad_norm": 0.5237115621566772,
"learning_rate": 0.00014922428154935443,
"loss": 0.3734,
"step": 10540
},
{
"epoch": 13.837079547266061,
"grad_norm": 0.4549529552459717,
"learning_rate": 0.0001476155768429821,
"loss": 0.37,
"step": 10850
},
{
"epoch": 13.999043519846964,
"eval_loss": 0.3443816006183624,
"eval_runtime": 146.6954,
"eval_samples_per_second": 85.531,
"eval_steps_per_second": 2.679,
"step": 10977
},
{
"epoch": 14.232424677187948,
"grad_norm": 0.520125150680542,
"learning_rate": 0.00014600166597251147,
"loss": 0.3647,
"step": 11160
},
{
"epoch": 14.627769807109836,
"grad_norm": 0.5332316160202026,
"learning_rate": 0.00014438775510204084,
"loss": 0.3678,
"step": 11470
},
{
"epoch": 14.998884106488124,
"eval_loss": 0.3436979055404663,
"eval_runtime": 146.7179,
"eval_samples_per_second": 85.518,
"eval_steps_per_second": 2.679,
"step": 11761
},
{
"epoch": 15.023114937031723,
"grad_norm": 0.47955256700515747,
"learning_rate": 0.0001427738442315702,
"loss": 0.3664,
"step": 11780
},
{
"epoch": 15.41846006695361,
"grad_norm": 0.48371678590774536,
"learning_rate": 0.00014115993336109953,
"loss": 0.36,
"step": 12090
},
{
"epoch": 15.813805196875498,
"grad_norm": 0.4756961166858673,
"learning_rate": 0.0001395460224906289,
"loss": 0.3577,
"step": 12400
},
{
"epoch": 16.0,
"eval_loss": 0.3342459499835968,
"eval_runtime": 146.7158,
"eval_samples_per_second": 85.519,
"eval_steps_per_second": 2.679,
"step": 12546
},
{
"epoch": 16.209150326797385,
"grad_norm": 0.5593659281730652,
"learning_rate": 0.00013793211162015827,
"loss": 0.3536,
"step": 12710
},
{
"epoch": 16.604495456719274,
"grad_norm": 0.6009001135826111,
"learning_rate": 0.00013631820074968764,
"loss": 0.3539,
"step": 13020
},
{
"epoch": 16.99984058664116,
"grad_norm": 0.48707565665245056,
"learning_rate": 0.000134704289879217,
"loss": 0.3522,
"step": 13330
},
{
"epoch": 16.99984058664116,
"eval_loss": 0.33101820945739746,
"eval_runtime": 146.73,
"eval_samples_per_second": 85.511,
"eval_steps_per_second": 2.678,
"step": 13330
},
{
"epoch": 17.39518571656305,
"grad_norm": 0.4787095785140991,
"learning_rate": 0.00013309037900874636,
"loss": 0.343,
"step": 13640
},
{
"epoch": 17.790530846484934,
"grad_norm": 0.4643840789794922,
"learning_rate": 0.00013147646813827573,
"loss": 0.3466,
"step": 13950
},
{
"epoch": 17.999681173282323,
"eval_loss": 0.3281005620956421,
"eval_runtime": 146.7108,
"eval_samples_per_second": 85.522,
"eval_steps_per_second": 2.679,
"step": 14114
},
{
"epoch": 18.185875976406823,
"grad_norm": 0.4819445312023163,
"learning_rate": 0.0001298625572678051,
"loss": 0.3415,
"step": 14260
},
{
"epoch": 18.58122110632871,
"grad_norm": 0.46530964970588684,
"learning_rate": 0.00012824864639733444,
"loss": 0.3393,
"step": 14570
},
{
"epoch": 18.976566236250598,
"grad_norm": 0.5159475207328796,
"learning_rate": 0.00012663473552686382,
"loss": 0.3377,
"step": 14880
},
{
"epoch": 18.999521759923482,
"eval_loss": 0.32132235169410706,
"eval_runtime": 146.7396,
"eval_samples_per_second": 85.505,
"eval_steps_per_second": 2.678,
"step": 14898
},
{
"epoch": 19.371911366172487,
"grad_norm": 0.45964986085891724,
"learning_rate": 0.00012502082465639319,
"loss": 0.3348,
"step": 15190
},
{
"epoch": 19.767256496094372,
"grad_norm": 0.49627387523651123,
"learning_rate": 0.00012340691378592253,
"loss": 0.3316,
"step": 15500
},
{
"epoch": 19.99936234656464,
"eval_loss": 0.31626757979393005,
"eval_runtime": 146.7396,
"eval_samples_per_second": 85.505,
"eval_steps_per_second": 2.678,
"step": 15682
},
{
"epoch": 20.16260162601626,
"grad_norm": 0.48719242215156555,
"learning_rate": 0.0001217930029154519,
"loss": 0.3294,
"step": 15810
},
{
"epoch": 20.557946755938147,
"grad_norm": 0.5443927049636841,
"learning_rate": 0.00012017909204498126,
"loss": 0.3261,
"step": 16120
},
{
"epoch": 20.953291885860036,
"grad_norm": 0.4637634754180908,
"learning_rate": 0.00011856518117451063,
"loss": 0.3255,
"step": 16430
},
{
"epoch": 20.9992029332058,
"eval_loss": 0.31501948833465576,
"eval_runtime": 146.7591,
"eval_samples_per_second": 85.494,
"eval_steps_per_second": 2.678,
"step": 16466
},
{
"epoch": 21.34863701578192,
"grad_norm": 0.46018585562705994,
"learning_rate": 0.00011695127030403999,
"loss": 0.3198,
"step": 16740
},
{
"epoch": 21.74398214570381,
"grad_norm": 0.5096014738082886,
"learning_rate": 0.00011533735943356936,
"loss": 0.3226,
"step": 17050
},
{
"epoch": 21.999043519846964,
"eval_loss": 0.30657365918159485,
"eval_runtime": 146.7538,
"eval_samples_per_second": 85.497,
"eval_steps_per_second": 2.678,
"step": 17250
},
{
"epoch": 22.139327275625696,
"grad_norm": 0.44816407561302185,
"learning_rate": 0.00011372344856309872,
"loss": 0.3178,
"step": 17360
},
{
"epoch": 22.534672405547585,
"grad_norm": 0.437168151140213,
"learning_rate": 0.00011211474385672638,
"loss": 0.3172,
"step": 17670
},
{
"epoch": 22.93001753546947,
"grad_norm": 0.5836613774299622,
"learning_rate": 0.00011050083298625573,
"loss": 0.3121,
"step": 17980
},
{
"epoch": 22.998884106488124,
"eval_loss": 0.30263882875442505,
"eval_runtime": 146.7108,
"eval_samples_per_second": 85.522,
"eval_steps_per_second": 2.679,
"step": 18034
},
{
"epoch": 23.32536266539136,
"grad_norm": 0.4829230308532715,
"learning_rate": 0.00010888692211578508,
"loss": 0.3079,
"step": 18290
},
{
"epoch": 23.720707795313245,
"grad_norm": 0.4485584497451782,
"learning_rate": 0.00010727821740941275,
"loss": 0.3105,
"step": 18600
},
{
"epoch": 24.0,
"eval_loss": 0.3048921227455139,
"eval_runtime": 146.8103,
"eval_samples_per_second": 85.464,
"eval_steps_per_second": 2.677,
"step": 18819
},
{
"epoch": 24.116052925235135,
"grad_norm": 0.5251662135124207,
"learning_rate": 0.00010566430653894211,
"loss": 0.3052,
"step": 18910
},
{
"epoch": 24.511398055157024,
"grad_norm": 0.4876725971698761,
"learning_rate": 0.00010405039566847148,
"loss": 0.3045,
"step": 19220
},
{
"epoch": 24.90674318507891,
"grad_norm": 0.5600521564483643,
"learning_rate": 0.00010243648479800084,
"loss": 0.3048,
"step": 19530
},
{
"epoch": 24.99984058664116,
"eval_loss": 0.2986990809440613,
"eval_runtime": 146.734,
"eval_samples_per_second": 85.508,
"eval_steps_per_second": 2.678,
"step": 19603
},
{
"epoch": 25.3020883150008,
"grad_norm": 0.5170055627822876,
"learning_rate": 0.00010082257392753021,
"loss": 0.3003,
"step": 19840
},
{
"epoch": 25.697433444922684,
"grad_norm": 0.48347124457359314,
"learning_rate": 9.920866305705956e-05,
"loss": 0.2983,
"step": 20150
},
{
"epoch": 25.999681173282323,
"eval_loss": 0.2916134297847748,
"eval_runtime": 146.7271,
"eval_samples_per_second": 85.512,
"eval_steps_per_second": 2.678,
"step": 20387
},
{
"epoch": 26.092778574844573,
"grad_norm": 0.48907041549682617,
"learning_rate": 9.759475218658892e-05,
"loss": 0.2959,
"step": 20460
},
{
"epoch": 26.48812370476646,
"grad_norm": 0.5060804486274719,
"learning_rate": 9.598084131611829e-05,
"loss": 0.2923,
"step": 20770
},
{
"epoch": 26.883468834688347,
"grad_norm": 0.4843296706676483,
"learning_rate": 9.436693044564765e-05,
"loss": 0.2918,
"step": 21080
},
{
"epoch": 26.999521759923482,
"eval_loss": 0.29019656777381897,
"eval_runtime": 146.6934,
"eval_samples_per_second": 85.532,
"eval_steps_per_second": 2.679,
"step": 21171
},
{
"epoch": 27.278813964610233,
"grad_norm": 0.42266514897346497,
"learning_rate": 9.275301957517701e-05,
"loss": 0.2901,
"step": 21390
},
{
"epoch": 27.674159094532122,
"grad_norm": 0.5161967873573303,
"learning_rate": 9.113910870470638e-05,
"loss": 0.2889,
"step": 21700
},
{
"epoch": 27.99936234656464,
"eval_loss": 0.2833983302116394,
"eval_runtime": 146.7193,
"eval_samples_per_second": 85.517,
"eval_steps_per_second": 2.679,
"step": 21955
},
{
"epoch": 28.069504224454008,
"grad_norm": 0.4523755609989166,
"learning_rate": 8.952519783423574e-05,
"loss": 0.2871,
"step": 22010
},
{
"epoch": 28.464849354375897,
"grad_norm": 0.44348961114883423,
"learning_rate": 8.791128696376511e-05,
"loss": 0.2847,
"step": 22320
},
{
"epoch": 28.860194484297786,
"grad_norm": 0.6467667818069458,
"learning_rate": 8.630258225739276e-05,
"loss": 0.2844,
"step": 22630
},
{
"epoch": 28.9992029332058,
"eval_loss": 0.28629302978515625,
"eval_runtime": 146.7547,
"eval_samples_per_second": 85.496,
"eval_steps_per_second": 2.678,
"step": 22739
},
{
"epoch": 29.25553961421967,
"grad_norm": 0.4734992980957031,
"learning_rate": 8.468867138692213e-05,
"loss": 0.2787,
"step": 22940
},
{
"epoch": 29.65088474414156,
"grad_norm": 0.4827498495578766,
"learning_rate": 8.307476051645148e-05,
"loss": 0.2787,
"step": 23250
},
{
"epoch": 29.999043519846964,
"eval_loss": 0.2794826626777649,
"eval_runtime": 146.9198,
"eval_samples_per_second": 85.4,
"eval_steps_per_second": 2.675,
"step": 23523
},
{
"epoch": 30.046229874063446,
"grad_norm": 0.5005486607551575,
"learning_rate": 8.146084964598085e-05,
"loss": 0.2758,
"step": 23560
},
{
"epoch": 30.441575003985335,
"grad_norm": 0.5253671407699585,
"learning_rate": 7.98469387755102e-05,
"loss": 0.2761,
"step": 23870
},
{
"epoch": 30.83692013390722,
"grad_norm": 0.472740113735199,
"learning_rate": 7.823302790503957e-05,
"loss": 0.2726,
"step": 24180
},
{
"epoch": 30.998884106488124,
"eval_loss": 0.2779182493686676,
"eval_runtime": 146.7777,
"eval_samples_per_second": 85.483,
"eval_steps_per_second": 2.678,
"step": 24307
},
{
"epoch": 31.23226526382911,
"grad_norm": 0.5228144526481628,
"learning_rate": 7.661911703456893e-05,
"loss": 0.2717,
"step": 24490
},
{
"epoch": 31.627610393750995,
"grad_norm": 0.47681719064712524,
"learning_rate": 7.501041232819659e-05,
"loss": 0.2664,
"step": 24800
},
{
"epoch": 32.0,
"eval_loss": 0.27039337158203125,
"eval_runtime": 146.805,
"eval_samples_per_second": 85.467,
"eval_steps_per_second": 2.677,
"step": 25092
},
{
"epoch": 32.022955523672884,
"grad_norm": 0.4973162114620209,
"learning_rate": 7.339650145772596e-05,
"loss": 0.268,
"step": 25110
},
{
"epoch": 32.41830065359477,
"grad_norm": 0.5740240216255188,
"learning_rate": 7.178259058725531e-05,
"loss": 0.2668,
"step": 25420
},
{
"epoch": 32.813645783516655,
"grad_norm": 0.4842962622642517,
"learning_rate": 7.016867971678468e-05,
"loss": 0.2631,
"step": 25730
},
{
"epoch": 32.99984058664116,
"eval_loss": 0.2733234763145447,
"eval_runtime": 146.7109,
"eval_samples_per_second": 85.522,
"eval_steps_per_second": 2.679,
"step": 25876
},
{
"epoch": 33.20899091343855,
"grad_norm": 0.499452143907547,
"learning_rate": 6.855476884631404e-05,
"loss": 0.263,
"step": 26040
},
{
"epoch": 33.60433604336043,
"grad_norm": 0.4541178345680237,
"learning_rate": 6.69408579758434e-05,
"loss": 0.2603,
"step": 26350
},
{
"epoch": 33.99968117328232,
"grad_norm": 0.5029833912849426,
"learning_rate": 6.532694710537276e-05,
"loss": 0.258,
"step": 26660
},
{
"epoch": 33.99968117328232,
"eval_loss": 0.26625362038612366,
"eval_runtime": 146.7319,
"eval_samples_per_second": 85.51,
"eval_steps_per_second": 2.678,
"step": 26660
},
{
"epoch": 34.39502630320421,
"grad_norm": 0.5090352892875671,
"learning_rate": 6.371303623490213e-05,
"loss": 0.2544,
"step": 26970
},
{
"epoch": 34.7903714331261,
"grad_norm": 0.4605717360973358,
"learning_rate": 6.209912536443149e-05,
"loss": 0.254,
"step": 27280
},
{
"epoch": 34.99952175992348,
"eval_loss": 0.26669949293136597,
"eval_runtime": 146.7117,
"eval_samples_per_second": 85.521,
"eval_steps_per_second": 2.679,
"step": 27444
},
{
"epoch": 35.18571656304798,
"grad_norm": 0.46216222643852234,
"learning_rate": 6.048521449396085e-05,
"loss": 0.254,
"step": 27590
},
{
"epoch": 35.58106169296987,
"grad_norm": 0.49629315733909607,
"learning_rate": 5.8871303623490214e-05,
"loss": 0.2521,
"step": 27900
},
{
"epoch": 35.97640682289176,
"grad_norm": 0.48311081528663635,
"learning_rate": 5.725739275301958e-05,
"loss": 0.2493,
"step": 28210
},
{
"epoch": 35.999362346564645,
"eval_loss": 0.26483407616615295,
"eval_runtime": 146.7384,
"eval_samples_per_second": 85.506,
"eval_steps_per_second": 2.678,
"step": 28228
},
{
"epoch": 36.371751952813646,
"grad_norm": 0.43428850173950195,
"learning_rate": 5.564348188254894e-05,
"loss": 0.2455,
"step": 28520
},
{
"epoch": 36.76709708273553,
"grad_norm": 0.4786287844181061,
"learning_rate": 5.4029571012078306e-05,
"loss": 0.2454,
"step": 28830
},
{
"epoch": 36.9992029332058,
"eval_loss": 0.26446378231048584,
"eval_runtime": 146.73,
"eval_samples_per_second": 85.511,
"eval_steps_per_second": 2.678,
"step": 29012
},
{
"epoch": 37.16244221265742,
"grad_norm": 0.5931326746940613,
"learning_rate": 5.241566014160767e-05,
"loss": 0.247,
"step": 29140
},
{
"epoch": 37.55778734257931,
"grad_norm": 0.5031745433807373,
"learning_rate": 5.0801749271137035e-05,
"loss": 0.2425,
"step": 29450
},
{
"epoch": 37.953132472501196,
"grad_norm": 0.5432093739509583,
"learning_rate": 4.918783840066639e-05,
"loss": 0.2416,
"step": 29760
},
{
"epoch": 37.999043519846964,
"eval_loss": 0.2601180672645569,
"eval_runtime": 146.6811,
"eval_samples_per_second": 85.539,
"eval_steps_per_second": 2.679,
"step": 29796
},
{
"epoch": 38.34847760242308,
"grad_norm": 0.5319362878799438,
"learning_rate": 4.7573927530195756e-05,
"loss": 0.2392,
"step": 30070
},
{
"epoch": 38.743822732344974,
"grad_norm": 0.5319586396217346,
"learning_rate": 4.596001665972512e-05,
"loss": 0.2368,
"step": 30380
},
{
"epoch": 38.99888410648813,
"eval_loss": 0.25446435809135437,
"eval_runtime": 146.6972,
"eval_samples_per_second": 85.53,
"eval_steps_per_second": 2.679,
"step": 30580
},
{
"epoch": 39.13916786226686,
"grad_norm": 0.4489250183105469,
"learning_rate": 4.434610578925448e-05,
"loss": 0.2368,
"step": 30690
},
{
"epoch": 39.534512992188745,
"grad_norm": 0.48287880420684814,
"learning_rate": 4.273740108288213e-05,
"loss": 0.2353,
"step": 31000
},
{
"epoch": 39.92985812211063,
"grad_norm": 0.49850553274154663,
"learning_rate": 4.1123490212411495e-05,
"loss": 0.2321,
"step": 31310
},
{
"epoch": 40.0,
"eval_loss": 0.24883659183979034,
"eval_runtime": 146.7363,
"eval_samples_per_second": 85.507,
"eval_steps_per_second": 2.678,
"step": 31365
},
{
"epoch": 40.32520325203252,
"grad_norm": 0.4667394161224365,
"learning_rate": 3.9514785506039155e-05,
"loss": 0.2337,
"step": 31620
},
{
"epoch": 40.72054838195441,
"grad_norm": 0.5053902864456177,
"learning_rate": 3.790087463556852e-05,
"loss": 0.2284,
"step": 31930
},
{
"epoch": 40.99984058664116,
"eval_loss": 0.2544113099575043,
"eval_runtime": 146.7257,
"eval_samples_per_second": 85.513,
"eval_steps_per_second": 2.678,
"step": 32149
},
{
"epoch": 41.115893511876294,
"grad_norm": 0.47476327419281006,
"learning_rate": 3.628696376509788e-05,
"loss": 0.2286,
"step": 32240
},
{
"epoch": 41.51123864179818,
"grad_norm": 0.5025794506072998,
"learning_rate": 3.467305289462724e-05,
"loss": 0.2292,
"step": 32550
},
{
"epoch": 41.90658377172007,
"grad_norm": 0.4553293287754059,
"learning_rate": 3.3059142024156605e-05,
"loss": 0.225,
"step": 32860
},
{
"epoch": 41.99968117328232,
"eval_loss": 0.24568869173526764,
"eval_runtime": 146.7316,
"eval_samples_per_second": 85.51,
"eval_steps_per_second": 2.678,
"step": 32933
},
{
"epoch": 42.30192890164196,
"grad_norm": 0.4845215678215027,
"learning_rate": 3.144523115368597e-05,
"loss": 0.2236,
"step": 33170
},
{
"epoch": 42.69727403156384,
"grad_norm": 0.5739601850509644,
"learning_rate": 2.983132028321533e-05,
"loss": 0.2234,
"step": 33480
},
{
"epoch": 42.99952175992348,
"eval_loss": 0.24620206654071808,
"eval_runtime": 146.7264,
"eval_samples_per_second": 85.513,
"eval_steps_per_second": 2.678,
"step": 33717
},
{
"epoch": 43.092619161485736,
"grad_norm": 0.4569677412509918,
"learning_rate": 2.8217409412744688e-05,
"loss": 0.2213,
"step": 33790
},
{
"epoch": 43.48796429140762,
"grad_norm": 0.5146024227142334,
"learning_rate": 2.6603498542274052e-05,
"loss": 0.2188,
"step": 34100
},
{
"epoch": 43.88330942132951,
"grad_norm": 0.47475871443748474,
"learning_rate": 2.4989587671803416e-05,
"loss": 0.2206,
"step": 34410
},
{
"epoch": 43.999362346564645,
"eval_loss": 0.2445935159921646,
"eval_runtime": 146.7897,
"eval_samples_per_second": 85.476,
"eval_steps_per_second": 2.677,
"step": 34501
},
{
"epoch": 44.27865455125139,
"grad_norm": 0.45915085077285767,
"learning_rate": 2.337567680133278e-05,
"loss": 0.217,
"step": 34720
},
{
"epoch": 44.673999681173285,
"grad_norm": 0.4429190456867218,
"learning_rate": 2.176176593086214e-05,
"loss": 0.2165,
"step": 35030
},
{
"epoch": 44.9992029332058,
"eval_loss": 0.24302400648593903,
"eval_runtime": 146.7631,
"eval_samples_per_second": 85.492,
"eval_steps_per_second": 2.678,
"step": 35285
},
{
"epoch": 45.06934481109517,
"grad_norm": 0.5038246512413025,
"learning_rate": 2.0147855060391505e-05,
"loss": 0.217,
"step": 35340
},
{
"epoch": 45.464689941017056,
"grad_norm": 0.4302615523338318,
"learning_rate": 1.8539150354019162e-05,
"loss": 0.2137,
"step": 35650
},
{
"epoch": 45.86003507093894,
"grad_norm": 0.5075607299804688,
"learning_rate": 1.6925239483548523e-05,
"loss": 0.2145,
"step": 35960
},
{
"epoch": 45.999043519846964,
"eval_loss": 0.24222899973392487,
"eval_runtime": 146.735,
"eval_samples_per_second": 85.508,
"eval_steps_per_second": 2.678,
"step": 36069
},
{
"epoch": 46.255380200860834,
"grad_norm": 0.4777955114841461,
"learning_rate": 1.531653477717618e-05,
"loss": 0.2126,
"step": 36270
},
{
"epoch": 46.65072533078272,
"grad_norm": 0.48974084854125977,
"learning_rate": 1.3702623906705539e-05,
"loss": 0.2112,
"step": 36580
},
{
"epoch": 46.99888410648813,
"eval_loss": 0.2432757019996643,
"eval_runtime": 146.7494,
"eval_samples_per_second": 85.499,
"eval_steps_per_second": 2.678,
"step": 36853
},
{
"epoch": 47.046070460704605,
"grad_norm": 0.46624037623405457,
"learning_rate": 1.2088713036234903e-05,
"loss": 0.2089,
"step": 36890
},
{
"epoch": 47.44141559062649,
"grad_norm": 0.4808659553527832,
"learning_rate": 1.0474802165764265e-05,
"loss": 0.2085,
"step": 37200
},
{
"epoch": 47.836760720548384,
"grad_norm": 0.4421006143093109,
"learning_rate": 8.86089129529363e-06,
"loss": 0.2087,
"step": 37510
},
{
"epoch": 48.0,
"eval_loss": 0.24061799049377441,
"eval_runtime": 146.7785,
"eval_samples_per_second": 85.483,
"eval_steps_per_second": 2.678,
"step": 37638
},
{
"epoch": 48.23210585047027,
"grad_norm": 0.4642196297645569,
"learning_rate": 7.246980424822991e-06,
"loss": 0.208,
"step": 37820
},
{
"epoch": 48.627450980392155,
"grad_norm": 0.47141027450561523,
"learning_rate": 5.633069554352354e-06,
"loss": 0.2067,
"step": 38130
},
{
"epoch": 48.99984058664116,
"eval_loss": 0.23903648555278778,
"eval_runtime": 146.7908,
"eval_samples_per_second": 85.475,
"eval_steps_per_second": 2.677,
"step": 38422
},
{
"epoch": 49.02279611031405,
"grad_norm": 0.45030030608177185,
"learning_rate": 4.019158683881716e-06,
"loss": 0.2062,
"step": 38440
},
{
"epoch": 49.41814124023593,
"grad_norm": 0.48792940378189087,
"learning_rate": 2.4052478134110786e-06,
"loss": 0.2062,
"step": 38750
},
{
"epoch": 49.81348637015782,
"grad_norm": 0.40084025263786316,
"learning_rate": 7.913369429404415e-07,
"loss": 0.2055,
"step": 39060
},
{
"epoch": 49.992029332058024,
"eval_loss": 0.23992499709129333,
"eval_runtime": 146.7718,
"eval_samples_per_second": 85.486,
"eval_steps_per_second": 2.678,
"step": 39200
},
{
"epoch": 49.992029332058024,
"step": 39200,
"total_flos": 6.111014223347712e+18,
"train_loss": 0.31974333125717785,
"train_runtime": 145458.5548,
"train_samples_per_second": 34.503,
"train_steps_per_second": 0.269
}
],
"logging_steps": 310,
"max_steps": 39200,
"num_input_tokens_seen": 0,
"num_train_epochs": 50,
"save_steps": 500,
"total_flos": 6.111014223347712e+18,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}
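
This is the standard metadata that the Hugging Face `Trainer` writes alongside its checkpoints: `best_metric` / `best_model_checkpoint` point at the lowest validation loss observed (0.2390 at step 38422), and `log_history` interleaves training records (loss, grad_norm, learning_rate every `logging_steps` = 310 steps) with one evaluation record per epoch. A minimal sketch of reading the file with Python's standard library follows; the local file path is an assumption, not part of the original upload.

```python
# Minimal sketch: parse this trainer_state.json to recover the best checkpoint
# and the per-epoch eval-loss curve. Adjust the path to wherever the file lives.
import json

with open("trainer_state.json") as f:
    state = json.load(f)

print("best eval_loss:", state["best_metric"])              # 0.23903648555278778
print("best checkpoint:", state["best_model_checkpoint"])   # .../checkpoint-38422

# Evaluation entries are the log_history records that carry an "eval_loss" key;
# the remaining records are periodic training-loss logs.
eval_curve = [
    (entry["epoch"], entry["eval_loss"])
    for entry in state["log_history"]
    if "eval_loss" in entry
]
for epoch, loss in eval_curve:
    print(f"epoch {epoch:6.2f}  eval_loss {loss:.4f}")
```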