{ "best_metric": 0.23903648555278778, "best_model_checkpoint": "/root/pretrain_executions/pretrain_utg4java_220m_seq1024/checkpoint-38422", "epoch": 49.992029332058024, "eval_steps": 500, "global_step": 39200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.3953451299218875, "grad_norm": 0.6701709628105164, "learning_rate": 7.908163265306123e-05, "loss": 0.5421, "step": 310 }, { "epoch": 0.790690259843775, "grad_norm": 0.7041985988616943, "learning_rate": 0.00015816326530612246, "loss": 0.517, "step": 620 }, { "epoch": 0.9998405866411605, "eval_loss": 0.4792475402355194, "eval_runtime": 146.6334, "eval_samples_per_second": 85.567, "eval_steps_per_second": 2.68, "step": 784 }, { "epoch": 1.1860353897656624, "grad_norm": 0.7499191164970398, "learning_rate": 0.00019924510620574761, "loss": 0.5394, "step": 930 }, { "epoch": 1.5813805196875497, "grad_norm": 0.6494282484054565, "learning_rate": 0.00019763119533527698, "loss": 0.5327, "step": 1240 }, { "epoch": 1.9767256496094374, "grad_norm": 0.5610156059265137, "learning_rate": 0.00019601728446480633, "loss": 0.518, "step": 1550 }, { "epoch": 1.9996811732823212, "eval_loss": 0.45565617084503174, "eval_runtime": 146.6648, "eval_samples_per_second": 85.549, "eval_steps_per_second": 2.68, "step": 1568 }, { "epoch": 2.3720707795313247, "grad_norm": 0.6261674761772156, "learning_rate": 0.00019440337359433573, "loss": 0.5076, "step": 1860 }, { "epoch": 2.767415909453212, "grad_norm": 0.5601200461387634, "learning_rate": 0.00019278946272386507, "loss": 0.4972, "step": 2170 }, { "epoch": 2.9995217599234816, "eval_loss": 0.4429556429386139, "eval_runtime": 146.6472, "eval_samples_per_second": 85.559, "eval_steps_per_second": 2.68, "step": 2352 }, { "epoch": 3.1627610393751, "grad_norm": 0.6102643609046936, "learning_rate": 0.00019117555185339441, "loss": 0.4957, "step": 2480 }, { "epoch": 3.558106169296987, "grad_norm": 0.6246281266212463, "learning_rate": 0.00018956164098292379, "loss": 0.4827, "step": 2790 }, { "epoch": 3.9534512992188744, "grad_norm": 0.6781056523323059, "learning_rate": 0.00018794773011245316, "loss": 0.4736, "step": 3100 }, { "epoch": 3.999362346564642, "eval_loss": 0.42357587814331055, "eval_runtime": 146.7171, "eval_samples_per_second": 85.518, "eval_steps_per_second": 2.679, "step": 3136 }, { "epoch": 4.348796429140762, "grad_norm": 0.5225201845169067, "learning_rate": 0.00018633381924198253, "loss": 0.4686, "step": 3410 }, { "epoch": 4.7441415590626494, "grad_norm": 0.5367516279220581, "learning_rate": 0.00018471990837151187, "loss": 0.4575, "step": 3720 }, { "epoch": 4.999202933205803, "eval_loss": 0.4168592095375061, "eval_runtime": 146.7474, "eval_samples_per_second": 85.501, "eval_steps_per_second": 2.678, "step": 3920 }, { "epoch": 5.139486688984537, "grad_norm": 0.5979415774345398, "learning_rate": 0.00018310599750104124, "loss": 0.4591, "step": 4030 }, { "epoch": 5.534831818906424, "grad_norm": 0.6041168570518494, "learning_rate": 0.0001814920866305706, "loss": 0.4506, "step": 4340 }, { "epoch": 5.930176948828311, "grad_norm": 0.5398473739624023, "learning_rate": 0.00017987817576009998, "loss": 0.4499, "step": 4650 }, { "epoch": 5.999043519846963, "eval_loss": 0.3998393714427948, "eval_runtime": 146.7067, "eval_samples_per_second": 85.524, "eval_steps_per_second": 2.679, "step": 4704 }, { "epoch": 6.3255220787502, "grad_norm": 0.5446251630783081, "learning_rate": 0.00017826426488962933, "loss": 0.4444, "step": 4960 }, { "epoch": 
6.720867208672087, "grad_norm": 0.564083993434906, "learning_rate": 0.00017665556018325697, "loss": 0.4401, "step": 5270 }, { "epoch": 6.9988841064881235, "eval_loss": 0.3943786323070526, "eval_runtime": 146.7223, "eval_samples_per_second": 85.515, "eval_steps_per_second": 2.679, "step": 5488 }, { "epoch": 7.116212338593974, "grad_norm": 0.5197238326072693, "learning_rate": 0.00017504164931278634, "loss": 0.4349, "step": 5580 }, { "epoch": 7.5115574685158615, "grad_norm": 0.5063862204551697, "learning_rate": 0.0001734277384423157, "loss": 0.4274, "step": 5890 }, { "epoch": 7.906902598437749, "grad_norm": 0.8238950371742249, "learning_rate": 0.00017181382757184508, "loss": 0.4275, "step": 6200 }, { "epoch": 8.0, "eval_loss": 0.38801178336143494, "eval_runtime": 146.7154, "eval_samples_per_second": 85.519, "eval_steps_per_second": 2.679, "step": 6273 }, { "epoch": 8.302247728359637, "grad_norm": 0.4785802662372589, "learning_rate": 0.00017019991670137442, "loss": 0.4218, "step": 6510 }, { "epoch": 8.697592858281524, "grad_norm": 0.5460196137428284, "learning_rate": 0.0001685860058309038, "loss": 0.4165, "step": 6820 }, { "epoch": 8.999840586641161, "eval_loss": 0.3786679804325104, "eval_runtime": 146.6889, "eval_samples_per_second": 85.535, "eval_steps_per_second": 2.679, "step": 7057 }, { "epoch": 9.092937988203412, "grad_norm": 0.5532106161117554, "learning_rate": 0.00016697209496043317, "loss": 0.4147, "step": 7130 }, { "epoch": 9.488283118125299, "grad_norm": 0.5270036458969116, "learning_rate": 0.00016535818408996254, "loss": 0.4054, "step": 7440 }, { "epoch": 9.883628248047186, "grad_norm": 0.5107512474060059, "learning_rate": 0.00016374427321949188, "loss": 0.407, "step": 7750 }, { "epoch": 9.99968117328232, "eval_loss": 0.3678111732006073, "eval_runtime": 146.7168, "eval_samples_per_second": 85.519, "eval_steps_per_second": 2.679, "step": 7841 }, { "epoch": 10.278973377969074, "grad_norm": 0.4663056433200836, "learning_rate": 0.00016213036234902125, "loss": 0.4001, "step": 8060 }, { "epoch": 10.67431850789096, "grad_norm": 0.5166866183280945, "learning_rate": 0.00016051645147855062, "loss": 0.4012, "step": 8370 }, { "epoch": 10.999521759923482, "eval_loss": 0.36857831478118896, "eval_runtime": 146.752, "eval_samples_per_second": 85.498, "eval_steps_per_second": 2.678, "step": 8625 }, { "epoch": 11.069663637812848, "grad_norm": 0.5623896718025208, "learning_rate": 0.00015890254060807997, "loss": 0.3967, "step": 8680 }, { "epoch": 11.465008767734735, "grad_norm": 0.4826233983039856, "learning_rate": 0.00015728862973760934, "loss": 0.3902, "step": 8990 }, { "epoch": 11.860353897656623, "grad_norm": 0.5024587512016296, "learning_rate": 0.00015567471886713868, "loss": 0.3889, "step": 9300 }, { "epoch": 11.999362346564642, "eval_loss": 0.35674843192100525, "eval_runtime": 146.7093, "eval_samples_per_second": 85.523, "eval_steps_per_second": 2.679, "step": 9409 }, { "epoch": 12.255699027578512, "grad_norm": 0.4992258846759796, "learning_rate": 0.00015406080799666805, "loss": 0.3838, "step": 9610 }, { "epoch": 12.6510441575004, "grad_norm": 0.4781612455844879, "learning_rate": 0.00015244689712619742, "loss": 0.3789, "step": 9920 }, { "epoch": 12.999202933205803, "eval_loss": 0.35254529118537903, "eval_runtime": 146.7424, "eval_samples_per_second": 85.504, "eval_steps_per_second": 2.678, "step": 10193 }, { "epoch": 13.046389287422286, "grad_norm": 0.49535173177719116, "learning_rate": 0.0001508329862557268, "loss": 0.3775, "step": 10230 }, { "epoch": 13.441734417344174, "grad_norm": 
0.5237115621566772, "learning_rate": 0.00014922428154935443, "loss": 0.3734, "step": 10540 }, { "epoch": 13.837079547266061, "grad_norm": 0.4549529552459717, "learning_rate": 0.0001476155768429821, "loss": 0.37, "step": 10850 }, { "epoch": 13.999043519846964, "eval_loss": 0.3443816006183624, "eval_runtime": 146.6954, "eval_samples_per_second": 85.531, "eval_steps_per_second": 2.679, "step": 10977 }, { "epoch": 14.232424677187948, "grad_norm": 0.520125150680542, "learning_rate": 0.00014600166597251147, "loss": 0.3647, "step": 11160 }, { "epoch": 14.627769807109836, "grad_norm": 0.5332316160202026, "learning_rate": 0.00014438775510204084, "loss": 0.3678, "step": 11470 }, { "epoch": 14.998884106488124, "eval_loss": 0.3436979055404663, "eval_runtime": 146.7179, "eval_samples_per_second": 85.518, "eval_steps_per_second": 2.679, "step": 11761 }, { "epoch": 15.023114937031723, "grad_norm": 0.47955256700515747, "learning_rate": 0.0001427738442315702, "loss": 0.3664, "step": 11780 }, { "epoch": 15.41846006695361, "grad_norm": 0.48371678590774536, "learning_rate": 0.00014115993336109953, "loss": 0.36, "step": 12090 }, { "epoch": 15.813805196875498, "grad_norm": 0.4756961166858673, "learning_rate": 0.0001395460224906289, "loss": 0.3577, "step": 12400 }, { "epoch": 16.0, "eval_loss": 0.3342459499835968, "eval_runtime": 146.7158, "eval_samples_per_second": 85.519, "eval_steps_per_second": 2.679, "step": 12546 }, { "epoch": 16.209150326797385, "grad_norm": 0.5593659281730652, "learning_rate": 0.00013793211162015827, "loss": 0.3536, "step": 12710 }, { "epoch": 16.604495456719274, "grad_norm": 0.6009001135826111, "learning_rate": 0.00013631820074968764, "loss": 0.3539, "step": 13020 }, { "epoch": 16.99984058664116, "grad_norm": 0.48707565665245056, "learning_rate": 0.000134704289879217, "loss": 0.3522, "step": 13330 }, { "epoch": 16.99984058664116, "eval_loss": 0.33101820945739746, "eval_runtime": 146.73, "eval_samples_per_second": 85.511, "eval_steps_per_second": 2.678, "step": 13330 }, { "epoch": 17.39518571656305, "grad_norm": 0.4787095785140991, "learning_rate": 0.00013309037900874636, "loss": 0.343, "step": 13640 }, { "epoch": 17.790530846484934, "grad_norm": 0.4643840789794922, "learning_rate": 0.00013147646813827573, "loss": 0.3466, "step": 13950 }, { "epoch": 17.999681173282323, "eval_loss": 0.3281005620956421, "eval_runtime": 146.7108, "eval_samples_per_second": 85.522, "eval_steps_per_second": 2.679, "step": 14114 }, { "epoch": 18.185875976406823, "grad_norm": 0.4819445312023163, "learning_rate": 0.0001298625572678051, "loss": 0.3415, "step": 14260 }, { "epoch": 18.58122110632871, "grad_norm": 0.46530964970588684, "learning_rate": 0.00012824864639733444, "loss": 0.3393, "step": 14570 }, { "epoch": 18.976566236250598, "grad_norm": 0.5159475207328796, "learning_rate": 0.00012663473552686382, "loss": 0.3377, "step": 14880 }, { "epoch": 18.999521759923482, "eval_loss": 0.32132235169410706, "eval_runtime": 146.7396, "eval_samples_per_second": 85.505, "eval_steps_per_second": 2.678, "step": 14898 }, { "epoch": 19.371911366172487, "grad_norm": 0.45964986085891724, "learning_rate": 0.00012502082465639319, "loss": 0.3348, "step": 15190 }, { "epoch": 19.767256496094372, "grad_norm": 0.49627387523651123, "learning_rate": 0.00012340691378592253, "loss": 0.3316, "step": 15500 }, { "epoch": 19.99936234656464, "eval_loss": 0.31626757979393005, "eval_runtime": 146.7396, "eval_samples_per_second": 85.505, "eval_steps_per_second": 2.678, "step": 15682 }, { "epoch": 20.16260162601626, "grad_norm": 
0.48719242215156555, "learning_rate": 0.0001217930029154519, "loss": 0.3294, "step": 15810 }, { "epoch": 20.557946755938147, "grad_norm": 0.5443927049636841, "learning_rate": 0.00012017909204498126, "loss": 0.3261, "step": 16120 }, { "epoch": 20.953291885860036, "grad_norm": 0.4637634754180908, "learning_rate": 0.00011856518117451063, "loss": 0.3255, "step": 16430 }, { "epoch": 20.9992029332058, "eval_loss": 0.31501948833465576, "eval_runtime": 146.7591, "eval_samples_per_second": 85.494, "eval_steps_per_second": 2.678, "step": 16466 }, { "epoch": 21.34863701578192, "grad_norm": 0.46018585562705994, "learning_rate": 0.00011695127030403999, "loss": 0.3198, "step": 16740 }, { "epoch": 21.74398214570381, "grad_norm": 0.5096014738082886, "learning_rate": 0.00011533735943356936, "loss": 0.3226, "step": 17050 }, { "epoch": 21.999043519846964, "eval_loss": 0.30657365918159485, "eval_runtime": 146.7538, "eval_samples_per_second": 85.497, "eval_steps_per_second": 2.678, "step": 17250 }, { "epoch": 22.139327275625696, "grad_norm": 0.44816407561302185, "learning_rate": 0.00011372344856309872, "loss": 0.3178, "step": 17360 }, { "epoch": 22.534672405547585, "grad_norm": 0.437168151140213, "learning_rate": 0.00011211474385672638, "loss": 0.3172, "step": 17670 }, { "epoch": 22.93001753546947, "grad_norm": 0.5836613774299622, "learning_rate": 0.00011050083298625573, "loss": 0.3121, "step": 17980 }, { "epoch": 22.998884106488124, "eval_loss": 0.30263882875442505, "eval_runtime": 146.7108, "eval_samples_per_second": 85.522, "eval_steps_per_second": 2.679, "step": 18034 }, { "epoch": 23.32536266539136, "grad_norm": 0.4829230308532715, "learning_rate": 0.00010888692211578508, "loss": 0.3079, "step": 18290 }, { "epoch": 23.720707795313245, "grad_norm": 0.4485584497451782, "learning_rate": 0.00010727821740941275, "loss": 0.3105, "step": 18600 }, { "epoch": 24.0, "eval_loss": 0.3048921227455139, "eval_runtime": 146.8103, "eval_samples_per_second": 85.464, "eval_steps_per_second": 2.677, "step": 18819 }, { "epoch": 24.116052925235135, "grad_norm": 0.5251662135124207, "learning_rate": 0.00010566430653894211, "loss": 0.3052, "step": 18910 }, { "epoch": 24.511398055157024, "grad_norm": 0.4876725971698761, "learning_rate": 0.00010405039566847148, "loss": 0.3045, "step": 19220 }, { "epoch": 24.90674318507891, "grad_norm": 0.5600521564483643, "learning_rate": 0.00010243648479800084, "loss": 0.3048, "step": 19530 }, { "epoch": 24.99984058664116, "eval_loss": 0.2986990809440613, "eval_runtime": 146.734, "eval_samples_per_second": 85.508, "eval_steps_per_second": 2.678, "step": 19603 }, { "epoch": 25.3020883150008, "grad_norm": 0.5170055627822876, "learning_rate": 0.00010082257392753021, "loss": 0.3003, "step": 19840 }, { "epoch": 25.697433444922684, "grad_norm": 0.48347124457359314, "learning_rate": 9.920866305705956e-05, "loss": 0.2983, "step": 20150 }, { "epoch": 25.999681173282323, "eval_loss": 0.2916134297847748, "eval_runtime": 146.7271, "eval_samples_per_second": 85.512, "eval_steps_per_second": 2.678, "step": 20387 }, { "epoch": 26.092778574844573, "grad_norm": 0.48907041549682617, "learning_rate": 9.759475218658892e-05, "loss": 0.2959, "step": 20460 }, { "epoch": 26.48812370476646, "grad_norm": 0.5060804486274719, "learning_rate": 9.598084131611829e-05, "loss": 0.2923, "step": 20770 }, { "epoch": 26.883468834688347, "grad_norm": 0.4843296706676483, "learning_rate": 9.436693044564765e-05, "loss": 0.2918, "step": 21080 }, { "epoch": 26.999521759923482, "eval_loss": 0.29019656777381897, "eval_runtime": 146.6934, 
"eval_samples_per_second": 85.532, "eval_steps_per_second": 2.679, "step": 21171 }, { "epoch": 27.278813964610233, "grad_norm": 0.42266514897346497, "learning_rate": 9.275301957517701e-05, "loss": 0.2901, "step": 21390 }, { "epoch": 27.674159094532122, "grad_norm": 0.5161967873573303, "learning_rate": 9.113910870470638e-05, "loss": 0.2889, "step": 21700 }, { "epoch": 27.99936234656464, "eval_loss": 0.2833983302116394, "eval_runtime": 146.7193, "eval_samples_per_second": 85.517, "eval_steps_per_second": 2.679, "step": 21955 }, { "epoch": 28.069504224454008, "grad_norm": 0.4523755609989166, "learning_rate": 8.952519783423574e-05, "loss": 0.2871, "step": 22010 }, { "epoch": 28.464849354375897, "grad_norm": 0.44348961114883423, "learning_rate": 8.791128696376511e-05, "loss": 0.2847, "step": 22320 }, { "epoch": 28.860194484297786, "grad_norm": 0.6467667818069458, "learning_rate": 8.630258225739276e-05, "loss": 0.2844, "step": 22630 }, { "epoch": 28.9992029332058, "eval_loss": 0.28629302978515625, "eval_runtime": 146.7547, "eval_samples_per_second": 85.496, "eval_steps_per_second": 2.678, "step": 22739 }, { "epoch": 29.25553961421967, "grad_norm": 0.4734992980957031, "learning_rate": 8.468867138692213e-05, "loss": 0.2787, "step": 22940 }, { "epoch": 29.65088474414156, "grad_norm": 0.4827498495578766, "learning_rate": 8.307476051645148e-05, "loss": 0.2787, "step": 23250 }, { "epoch": 29.999043519846964, "eval_loss": 0.2794826626777649, "eval_runtime": 146.9198, "eval_samples_per_second": 85.4, "eval_steps_per_second": 2.675, "step": 23523 }, { "epoch": 30.046229874063446, "grad_norm": 0.5005486607551575, "learning_rate": 8.146084964598085e-05, "loss": 0.2758, "step": 23560 }, { "epoch": 30.441575003985335, "grad_norm": 0.5253671407699585, "learning_rate": 7.98469387755102e-05, "loss": 0.2761, "step": 23870 }, { "epoch": 30.83692013390722, "grad_norm": 0.472740113735199, "learning_rate": 7.823302790503957e-05, "loss": 0.2726, "step": 24180 }, { "epoch": 30.998884106488124, "eval_loss": 0.2779182493686676, "eval_runtime": 146.7777, "eval_samples_per_second": 85.483, "eval_steps_per_second": 2.678, "step": 24307 }, { "epoch": 31.23226526382911, "grad_norm": 0.5228144526481628, "learning_rate": 7.661911703456893e-05, "loss": 0.2717, "step": 24490 }, { "epoch": 31.627610393750995, "grad_norm": 0.47681719064712524, "learning_rate": 7.501041232819659e-05, "loss": 0.2664, "step": 24800 }, { "epoch": 32.0, "eval_loss": 0.27039337158203125, "eval_runtime": 146.805, "eval_samples_per_second": 85.467, "eval_steps_per_second": 2.677, "step": 25092 }, { "epoch": 32.022955523672884, "grad_norm": 0.4973162114620209, "learning_rate": 7.339650145772596e-05, "loss": 0.268, "step": 25110 }, { "epoch": 32.41830065359477, "grad_norm": 0.5740240216255188, "learning_rate": 7.178259058725531e-05, "loss": 0.2668, "step": 25420 }, { "epoch": 32.813645783516655, "grad_norm": 0.4842962622642517, "learning_rate": 7.016867971678468e-05, "loss": 0.2631, "step": 25730 }, { "epoch": 32.99984058664116, "eval_loss": 0.2733234763145447, "eval_runtime": 146.7109, "eval_samples_per_second": 85.522, "eval_steps_per_second": 2.679, "step": 25876 }, { "epoch": 33.20899091343855, "grad_norm": 0.499452143907547, "learning_rate": 6.855476884631404e-05, "loss": 0.263, "step": 26040 }, { "epoch": 33.60433604336043, "grad_norm": 0.4541178345680237, "learning_rate": 6.69408579758434e-05, "loss": 0.2603, "step": 26350 }, { "epoch": 33.99968117328232, "grad_norm": 0.5029833912849426, "learning_rate": 6.532694710537276e-05, "loss": 0.258, "step": 
26660 }, { "epoch": 33.99968117328232, "eval_loss": 0.26625362038612366, "eval_runtime": 146.7319, "eval_samples_per_second": 85.51, "eval_steps_per_second": 2.678, "step": 26660 }, { "epoch": 34.39502630320421, "grad_norm": 0.5090352892875671, "learning_rate": 6.371303623490213e-05, "loss": 0.2544, "step": 26970 }, { "epoch": 34.7903714331261, "grad_norm": 0.4605717360973358, "learning_rate": 6.209912536443149e-05, "loss": 0.254, "step": 27280 }, { "epoch": 34.99952175992348, "eval_loss": 0.26669949293136597, "eval_runtime": 146.7117, "eval_samples_per_second": 85.521, "eval_steps_per_second": 2.679, "step": 27444 }, { "epoch": 35.18571656304798, "grad_norm": 0.46216222643852234, "learning_rate": 6.048521449396085e-05, "loss": 0.254, "step": 27590 }, { "epoch": 35.58106169296987, "grad_norm": 0.49629315733909607, "learning_rate": 5.8871303623490214e-05, "loss": 0.2521, "step": 27900 }, { "epoch": 35.97640682289176, "grad_norm": 0.48311081528663635, "learning_rate": 5.725739275301958e-05, "loss": 0.2493, "step": 28210 }, { "epoch": 35.999362346564645, "eval_loss": 0.26483407616615295, "eval_runtime": 146.7384, "eval_samples_per_second": 85.506, "eval_steps_per_second": 2.678, "step": 28228 }, { "epoch": 36.371751952813646, "grad_norm": 0.43428850173950195, "learning_rate": 5.564348188254894e-05, "loss": 0.2455, "step": 28520 }, { "epoch": 36.76709708273553, "grad_norm": 0.4786287844181061, "learning_rate": 5.4029571012078306e-05, "loss": 0.2454, "step": 28830 }, { "epoch": 36.9992029332058, "eval_loss": 0.26446378231048584, "eval_runtime": 146.73, "eval_samples_per_second": 85.511, "eval_steps_per_second": 2.678, "step": 29012 }, { "epoch": 37.16244221265742, "grad_norm": 0.5931326746940613, "learning_rate": 5.241566014160767e-05, "loss": 0.247, "step": 29140 }, { "epoch": 37.55778734257931, "grad_norm": 0.5031745433807373, "learning_rate": 5.0801749271137035e-05, "loss": 0.2425, "step": 29450 }, { "epoch": 37.953132472501196, "grad_norm": 0.5432093739509583, "learning_rate": 4.918783840066639e-05, "loss": 0.2416, "step": 29760 }, { "epoch": 37.999043519846964, "eval_loss": 0.2601180672645569, "eval_runtime": 146.6811, "eval_samples_per_second": 85.539, "eval_steps_per_second": 2.679, "step": 29796 }, { "epoch": 38.34847760242308, "grad_norm": 0.5319362878799438, "learning_rate": 4.7573927530195756e-05, "loss": 0.2392, "step": 30070 }, { "epoch": 38.743822732344974, "grad_norm": 0.5319586396217346, "learning_rate": 4.596001665972512e-05, "loss": 0.2368, "step": 30380 }, { "epoch": 38.99888410648813, "eval_loss": 0.25446435809135437, "eval_runtime": 146.6972, "eval_samples_per_second": 85.53, "eval_steps_per_second": 2.679, "step": 30580 }, { "epoch": 39.13916786226686, "grad_norm": 0.4489250183105469, "learning_rate": 4.434610578925448e-05, "loss": 0.2368, "step": 30690 }, { "epoch": 39.534512992188745, "grad_norm": 0.48287880420684814, "learning_rate": 4.273740108288213e-05, "loss": 0.2353, "step": 31000 }, { "epoch": 39.92985812211063, "grad_norm": 0.49850553274154663, "learning_rate": 4.1123490212411495e-05, "loss": 0.2321, "step": 31310 }, { "epoch": 40.0, "eval_loss": 0.24883659183979034, "eval_runtime": 146.7363, "eval_samples_per_second": 85.507, "eval_steps_per_second": 2.678, "step": 31365 }, { "epoch": 40.32520325203252, "grad_norm": 0.4667394161224365, "learning_rate": 3.9514785506039155e-05, "loss": 0.2337, "step": 31620 }, { "epoch": 40.72054838195441, "grad_norm": 0.5053902864456177, "learning_rate": 3.790087463556852e-05, "loss": 0.2284, "step": 31930 }, { "epoch": 
40.99984058664116, "eval_loss": 0.2544113099575043, "eval_runtime": 146.7257, "eval_samples_per_second": 85.513, "eval_steps_per_second": 2.678, "step": 32149 }, { "epoch": 41.115893511876294, "grad_norm": 0.47476327419281006, "learning_rate": 3.628696376509788e-05, "loss": 0.2286, "step": 32240 }, { "epoch": 41.51123864179818, "grad_norm": 0.5025794506072998, "learning_rate": 3.467305289462724e-05, "loss": 0.2292, "step": 32550 }, { "epoch": 41.90658377172007, "grad_norm": 0.4553293287754059, "learning_rate": 3.3059142024156605e-05, "loss": 0.225, "step": 32860 }, { "epoch": 41.99968117328232, "eval_loss": 0.24568869173526764, "eval_runtime": 146.7316, "eval_samples_per_second": 85.51, "eval_steps_per_second": 2.678, "step": 32933 }, { "epoch": 42.30192890164196, "grad_norm": 0.4845215678215027, "learning_rate": 3.144523115368597e-05, "loss": 0.2236, "step": 33170 }, { "epoch": 42.69727403156384, "grad_norm": 0.5739601850509644, "learning_rate": 2.983132028321533e-05, "loss": 0.2234, "step": 33480 }, { "epoch": 42.99952175992348, "eval_loss": 0.24620206654071808, "eval_runtime": 146.7264, "eval_samples_per_second": 85.513, "eval_steps_per_second": 2.678, "step": 33717 }, { "epoch": 43.092619161485736, "grad_norm": 0.4569677412509918, "learning_rate": 2.8217409412744688e-05, "loss": 0.2213, "step": 33790 }, { "epoch": 43.48796429140762, "grad_norm": 0.5146024227142334, "learning_rate": 2.6603498542274052e-05, "loss": 0.2188, "step": 34100 }, { "epoch": 43.88330942132951, "grad_norm": 0.47475871443748474, "learning_rate": 2.4989587671803416e-05, "loss": 0.2206, "step": 34410 }, { "epoch": 43.999362346564645, "eval_loss": 0.2445935159921646, "eval_runtime": 146.7897, "eval_samples_per_second": 85.476, "eval_steps_per_second": 2.677, "step": 34501 }, { "epoch": 44.27865455125139, "grad_norm": 0.45915085077285767, "learning_rate": 2.337567680133278e-05, "loss": 0.217, "step": 34720 }, { "epoch": 44.673999681173285, "grad_norm": 0.4429190456867218, "learning_rate": 2.176176593086214e-05, "loss": 0.2165, "step": 35030 }, { "epoch": 44.9992029332058, "eval_loss": 0.24302400648593903, "eval_runtime": 146.7631, "eval_samples_per_second": 85.492, "eval_steps_per_second": 2.678, "step": 35285 }, { "epoch": 45.06934481109517, "grad_norm": 0.5038246512413025, "learning_rate": 2.0147855060391505e-05, "loss": 0.217, "step": 35340 }, { "epoch": 45.464689941017056, "grad_norm": 0.4302615523338318, "learning_rate": 1.8539150354019162e-05, "loss": 0.2137, "step": 35650 }, { "epoch": 45.86003507093894, "grad_norm": 0.5075607299804688, "learning_rate": 1.6925239483548523e-05, "loss": 0.2145, "step": 35960 }, { "epoch": 45.999043519846964, "eval_loss": 0.24222899973392487, "eval_runtime": 146.735, "eval_samples_per_second": 85.508, "eval_steps_per_second": 2.678, "step": 36069 }, { "epoch": 46.255380200860834, "grad_norm": 0.4777955114841461, "learning_rate": 1.531653477717618e-05, "loss": 0.2126, "step": 36270 }, { "epoch": 46.65072533078272, "grad_norm": 0.48974084854125977, "learning_rate": 1.3702623906705539e-05, "loss": 0.2112, "step": 36580 }, { "epoch": 46.99888410648813, "eval_loss": 0.2432757019996643, "eval_runtime": 146.7494, "eval_samples_per_second": 85.499, "eval_steps_per_second": 2.678, "step": 36853 }, { "epoch": 47.046070460704605, "grad_norm": 0.46624037623405457, "learning_rate": 1.2088713036234903e-05, "loss": 0.2089, "step": 36890 }, { "epoch": 47.44141559062649, "grad_norm": 0.4808659553527832, "learning_rate": 1.0474802165764265e-05, "loss": 0.2085, "step": 37200 }, { "epoch": 
47.836760720548384, "grad_norm": 0.4421006143093109, "learning_rate": 8.86089129529363e-06, "loss": 0.2087, "step": 37510 }, { "epoch": 48.0, "eval_loss": 0.24061799049377441, "eval_runtime": 146.7785, "eval_samples_per_second": 85.483, "eval_steps_per_second": 2.678, "step": 37638 }, { "epoch": 48.23210585047027, "grad_norm": 0.4642196297645569, "learning_rate": 7.246980424822991e-06, "loss": 0.208, "step": 37820 }, { "epoch": 48.627450980392155, "grad_norm": 0.47141027450561523, "learning_rate": 5.633069554352354e-06, "loss": 0.2067, "step": 38130 }, { "epoch": 48.99984058664116, "eval_loss": 0.23903648555278778, "eval_runtime": 146.7908, "eval_samples_per_second": 85.475, "eval_steps_per_second": 2.677, "step": 38422 }, { "epoch": 49.02279611031405, "grad_norm": 0.45030030608177185, "learning_rate": 4.019158683881716e-06, "loss": 0.2062, "step": 38440 }, { "epoch": 49.41814124023593, "grad_norm": 0.48792940378189087, "learning_rate": 2.4052478134110786e-06, "loss": 0.2062, "step": 38750 }, { "epoch": 49.81348637015782, "grad_norm": 0.40084025263786316, "learning_rate": 7.913369429404415e-07, "loss": 0.2055, "step": 39060 }, { "epoch": 49.992029332058024, "eval_loss": 0.23992499709129333, "eval_runtime": 146.7718, "eval_samples_per_second": 85.486, "eval_steps_per_second": 2.678, "step": 39200 }, { "epoch": 49.992029332058024, "step": 39200, "total_flos": 6.111014223347712e+18, "train_loss": 0.31974333125717785, "train_runtime": 145458.5548, "train_samples_per_second": 34.503, "train_steps_per_second": 0.269 } ], "logging_steps": 310, "max_steps": 39200, "num_input_tokens_seen": 0, "num_train_epochs": 50, "save_steps": 500, "total_flos": 6.111014223347712e+18, "train_batch_size": 16, "trial_name": null, "trial_params": null }
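
A minimal sketch, assuming the object above is saved as trainer_state.json (the file name and the printed report format are assumptions, not part of the log), showing how the eval-loss curve and best checkpoint could be read back with Python's standard json module:

# sketch: summarize the trainer state above; assumes it is stored as "trainer_state.json"
import json

with open("trainer_state.json") as f:
    state = json.load(f)

# Keep only the evaluation entries, i.e. those carrying an "eval_loss" field.
evals = [e for e in state["log_history"] if "eval_loss" in e]

for e in evals:
    print(f"epoch {e['epoch']:6.2f}  step {e['step']:6d}  eval_loss {e['eval_loss']:.4f}")

# best_metric / best_model_checkpoint are taken directly from the state object.
print("best eval_loss:", state["best_metric"], "at", state["best_model_checkpoint"])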