{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 8975, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.011142061281337047, "grad_norm": 0.04801425710320473, "learning_rate": 2.7859033291544785e-07, "loss": 2.4677, "step": 100 }, { "epoch": 0.022284122562674095, "grad_norm": 0.04515479877591133, "learning_rate": 5.571806658308957e-07, "loss": 2.465, "step": 200 }, { "epoch": 0.033426183844011144, "grad_norm": 0.04905751347541809, "learning_rate": 8.357709987463436e-07, "loss": 2.479, "step": 300 }, { "epoch": 0.04456824512534819, "grad_norm": 0.0585104376077652, "learning_rate": 1.1143613316617914e-06, "loss": 2.4715, "step": 400 }, { "epoch": 0.055710306406685235, "grad_norm": 0.06588415056467056, "learning_rate": 1.3929516645772392e-06, "loss": 2.4912, "step": 500 }, { "epoch": 0.06685236768802229, "grad_norm": 0.07128334790468216, "learning_rate": 1.6715419974926873e-06, "loss": 2.449, "step": 600 }, { "epoch": 0.07799442896935933, "grad_norm": 0.08448737859725952, "learning_rate": 1.9501323304081347e-06, "loss": 2.4613, "step": 700 }, { "epoch": 0.08913649025069638, "grad_norm": 0.09213274717330933, "learning_rate": 2.2287226633235828e-06, "loss": 2.4366, "step": 800 }, { "epoch": 0.10027855153203342, "grad_norm": 0.10825659334659576, "learning_rate": 2.507312996239031e-06, "loss": 2.4342, "step": 900 }, { "epoch": 0.11142061281337047, "grad_norm": 0.11989778280258179, "learning_rate": 2.7859033291544785e-06, "loss": 2.4227, "step": 1000 }, { "epoch": 0.12256267409470752, "grad_norm": 0.11828930675983429, "learning_rate": 3.064493662069926e-06, "loss": 2.4346, "step": 1100 }, { "epoch": 0.13370473537604458, "grad_norm": 0.13713237643241882, "learning_rate": 3.3430839949853746e-06, "loss": 2.3954, "step": 1200 }, { "epoch": 0.14484679665738162, "grad_norm": 0.13783149421215057, "learning_rate": 3.6216743279008222e-06, "loss": 2.4027, "step": 1300 }, { "epoch": 0.15598885793871867, "grad_norm": 0.14380280673503876, "learning_rate": 3.9002646608162694e-06, "loss": 2.3894, "step": 1400 }, { "epoch": 0.1671309192200557, "grad_norm": 0.1586257964372635, "learning_rate": 4.178854993731718e-06, "loss": 2.3923, "step": 1500 }, { "epoch": 0.17827298050139276, "grad_norm": 0.15052290260791779, "learning_rate": 4.4574453266471655e-06, "loss": 2.3792, "step": 1600 }, { "epoch": 0.1894150417827298, "grad_norm": 0.16659782826900482, "learning_rate": 4.736035659562614e-06, "loss": 2.3657, "step": 1700 }, { "epoch": 0.20055710306406685, "grad_norm": 0.16844528913497925, "learning_rate": 5.014625992478062e-06, "loss": 2.3748, "step": 1800 }, { "epoch": 0.2116991643454039, "grad_norm": 0.17568887770175934, "learning_rate": 5.29321632539351e-06, "loss": 2.3609, "step": 1900 }, { "epoch": 0.22284122562674094, "grad_norm": 0.19198767840862274, "learning_rate": 5.571806658308957e-06, "loss": 2.3544, "step": 2000 }, { "epoch": 0.233983286908078, "grad_norm": 0.19439826905727386, "learning_rate": 5.850396991224405e-06, "loss": 2.3589, "step": 2100 }, { "epoch": 0.24512534818941503, "grad_norm": 0.2022334635257721, "learning_rate": 6.128987324139852e-06, "loss": 2.3612, "step": 2200 }, { "epoch": 0.2562674094707521, "grad_norm": 0.20067375898361206, "learning_rate": 6.4075776570553e-06, "loss": 2.3417, "step": 2300 }, { "epoch": 0.26740947075208915, "grad_norm": 0.2165047824382782, "learning_rate": 6.686167989970749e-06, "loss": 2.3481, "step": 2400 }, { "epoch": 0.2785515320334262, "grad_norm": 0.21726970374584198, "learning_rate": 6.9647583228861955e-06, "loss": 2.3331, "step": 2500 }, { "epoch": 0.28969359331476324, "grad_norm": 0.21808430552482605, "learning_rate": 7.2433486558016444e-06, "loss": 2.336, "step": 2600 }, { "epoch": 0.3008356545961003, "grad_norm": 0.23211392760276794, "learning_rate": 7.5219389887170925e-06, "loss": 2.3332, "step": 2700 }, { "epoch": 0.31197771587743733, "grad_norm": 0.24134476482868195, "learning_rate": 7.800529321632539e-06, "loss": 2.3377, "step": 2800 }, { "epoch": 0.3231197771587744, "grad_norm": 0.2478109449148178, "learning_rate": 8.079119654547987e-06, "loss": 2.3209, "step": 2900 }, { "epoch": 0.3342618384401114, "grad_norm": 0.25040534138679504, "learning_rate": 8.357709987463437e-06, "loss": 2.3218, "step": 3000 }, { "epoch": 0.34540389972144847, "grad_norm": 0.25530460476875305, "learning_rate": 8.636300320378883e-06, "loss": 2.3063, "step": 3100 }, { "epoch": 0.3565459610027855, "grad_norm": 0.258489727973938, "learning_rate": 8.914890653294331e-06, "loss": 2.3218, "step": 3200 }, { "epoch": 0.36768802228412256, "grad_norm": 0.28399255871772766, "learning_rate": 9.193480986209779e-06, "loss": 2.3063, "step": 3300 }, { "epoch": 0.3788300835654596, "grad_norm": 0.2863540053367615, "learning_rate": 9.472071319125227e-06, "loss": 2.2997, "step": 3400 }, { "epoch": 0.38997214484679665, "grad_norm": 0.2868782877922058, "learning_rate": 9.750661652040675e-06, "loss": 2.2947, "step": 3500 }, { "epoch": 0.4011142061281337, "grad_norm": 0.2859346270561218, "learning_rate": 1.0029251984956123e-05, "loss": 2.3054, "step": 3600 }, { "epoch": 0.41225626740947074, "grad_norm": 0.29307645559310913, "learning_rate": 1.030784231787157e-05, "loss": 2.2882, "step": 3700 }, { "epoch": 0.4233983286908078, "grad_norm": 0.3006058633327484, "learning_rate": 1.058643265078702e-05, "loss": 2.2893, "step": 3800 }, { "epoch": 0.43454038997214484, "grad_norm": 0.31586816906929016, "learning_rate": 1.0865022983702467e-05, "loss": 2.2967, "step": 3900 }, { "epoch": 0.4456824512534819, "grad_norm": 0.3375140428543091, "learning_rate": 1.1143613316617914e-05, "loss": 2.2694, "step": 4000 }, { "epoch": 0.4568245125348189, "grad_norm": 0.30628982186317444, "learning_rate": 1.142220364953336e-05, "loss": 2.2709, "step": 4100 }, { "epoch": 0.467966573816156, "grad_norm": 0.34296101331710815, "learning_rate": 1.170079398244881e-05, "loss": 2.2783, "step": 4200 }, { "epoch": 0.479108635097493, "grad_norm": 0.33544307947158813, "learning_rate": 1.1979384315364258e-05, "loss": 2.2825, "step": 4300 }, { "epoch": 0.49025069637883006, "grad_norm": 0.3508812487125397, "learning_rate": 1.2257974648279704e-05, "loss": 2.2793, "step": 4400 }, { "epoch": 0.5013927576601671, "grad_norm": 0.34851330518722534, "learning_rate": 1.2536564981195154e-05, "loss": 2.2703, "step": 4500 }, { "epoch": 0.5125348189415042, "grad_norm": 0.3589787185192108, "learning_rate": 1.28151553141106e-05, "loss": 2.2649, "step": 4600 }, { "epoch": 0.5236768802228412, "grad_norm": 0.3842375576496124, "learning_rate": 1.3093745647026049e-05, "loss": 2.264, "step": 4700 }, { "epoch": 0.5348189415041783, "grad_norm": 0.3963667154312134, "learning_rate": 1.3372335979941498e-05, "loss": 2.2567, "step": 4800 }, { "epoch": 0.5459610027855153, "grad_norm": 0.40061014890670776, "learning_rate": 1.3650926312856945e-05, "loss": 2.2567, "step": 4900 }, { "epoch": 0.5571030640668524, "grad_norm": 0.3819388747215271, "learning_rate": 1.3929516645772391e-05, "loss": 2.2545, "step": 5000 }, { "epoch": 0.5682451253481894, "grad_norm": 0.40177789330482483, "learning_rate": 1.420810697868784e-05, "loss": 2.2509, "step": 5100 }, { "epoch": 0.5793871866295265, "grad_norm": 0.4258297085762024, "learning_rate": 1.4486697311603289e-05, "loss": 2.2466, "step": 5200 }, { "epoch": 0.5905292479108635, "grad_norm": 0.44000041484832764, "learning_rate": 1.4765287644518735e-05, "loss": 2.2453, "step": 5300 }, { "epoch": 0.6016713091922006, "grad_norm": 0.4116784632205963, "learning_rate": 1.5043877977434185e-05, "loss": 2.2708, "step": 5400 }, { "epoch": 0.6128133704735376, "grad_norm": 0.39489638805389404, "learning_rate": 1.5322468310349633e-05, "loss": 2.2656, "step": 5500 }, { "epoch": 0.6239554317548747, "grad_norm": 0.43062567710876465, "learning_rate": 1.5601058643265078e-05, "loss": 2.2305, "step": 5600 }, { "epoch": 0.6350974930362117, "grad_norm": 0.4567050635814667, "learning_rate": 1.587964897618053e-05, "loss": 2.2392, "step": 5700 }, { "epoch": 0.6462395543175488, "grad_norm": 0.40272602438926697, "learning_rate": 1.6158239309095974e-05, "loss": 2.2327, "step": 5800 }, { "epoch": 0.6573816155988857, "grad_norm": 0.4182547628879547, "learning_rate": 1.6436829642011422e-05, "loss": 2.227, "step": 5900 }, { "epoch": 0.6685236768802229, "grad_norm": 0.4616607129573822, "learning_rate": 1.6715419974926873e-05, "loss": 2.223, "step": 6000 }, { "epoch": 0.6796657381615598, "grad_norm": 0.4605095386505127, "learning_rate": 1.6994010307842318e-05, "loss": 2.229, "step": 6100 }, { "epoch": 0.6908077994428969, "grad_norm": 0.48064395785331726, "learning_rate": 1.7272600640757766e-05, "loss": 2.2285, "step": 6200 }, { "epoch": 0.7019498607242339, "grad_norm": 0.45375484228134155, "learning_rate": 1.7551190973673214e-05, "loss": 2.231, "step": 6300 }, { "epoch": 0.713091922005571, "grad_norm": 0.4386723041534424, "learning_rate": 1.7829781306588662e-05, "loss": 2.2295, "step": 6400 }, { "epoch": 0.724233983286908, "grad_norm": 0.4519370198249817, "learning_rate": 1.810837163950411e-05, "loss": 2.2185, "step": 6500 }, { "epoch": 0.7353760445682451, "grad_norm": 0.45770174264907837, "learning_rate": 1.8386961972419558e-05, "loss": 2.2146, "step": 6600 }, { "epoch": 0.7465181058495822, "grad_norm": 0.46970096230506897, "learning_rate": 1.8665552305335006e-05, "loss": 2.2268, "step": 6700 }, { "epoch": 0.7576601671309192, "grad_norm": 0.4301040768623352, "learning_rate": 1.8944142638250454e-05, "loss": 2.2186, "step": 6800 }, { "epoch": 0.7688022284122563, "grad_norm": 0.49275335669517517, "learning_rate": 1.9222732971165902e-05, "loss": 2.2143, "step": 6900 }, { "epoch": 0.7799442896935933, "grad_norm": 0.4531406760215759, "learning_rate": 1.950132330408135e-05, "loss": 2.2087, "step": 7000 }, { "epoch": 0.7910863509749304, "grad_norm": 0.44058841466903687, "learning_rate": 1.97799136369968e-05, "loss": 2.2081, "step": 7100 }, { "epoch": 0.8022284122562674, "grad_norm": 0.4620118737220764, "learning_rate": 1.9993253992196043e-05, "loss": 2.2029, "step": 7200 }, { "epoch": 0.8133704735376045, "grad_norm": 0.4749448299407959, "learning_rate": 1.9776845355951603e-05, "loss": 2.1974, "step": 7300 }, { "epoch": 0.8245125348189415, "grad_norm": 0.4482609033584595, "learning_rate": 1.9262050899616325e-05, "loss": 2.2229, "step": 7400 }, { "epoch": 0.8356545961002786, "grad_norm": 0.4689159095287323, "learning_rate": 1.8464581965697866e-05, "loss": 2.2081, "step": 7500 }, { "epoch": 0.8467966573816156, "grad_norm": 0.4448583126068115, "learning_rate": 1.7408777020560473e-05, "loss": 2.2181, "step": 7600 }, { "epoch": 0.8579387186629527, "grad_norm": 0.513080894947052, "learning_rate": 1.6126858853144854e-05, "loss": 2.2045, "step": 7700 }, { "epoch": 0.8690807799442897, "grad_norm": 0.4850214421749115, "learning_rate": 1.465795114698568e-05, "loss": 2.1966, "step": 7800 }, { "epoch": 0.8802228412256268, "grad_norm": 0.5757237672805786, "learning_rate": 1.3046884439396632e-05, "loss": 2.2019, "step": 7900 }, { "epoch": 0.8913649025069638, "grad_norm": 0.48911547660827637, "learning_rate": 1.1342827909521198e-05, "loss": 2.1994, "step": 8000 }, { "epoch": 0.9025069637883009, "grad_norm": 0.5143416523933411, "learning_rate": 9.597788752588108e-06, "loss": 2.207, "step": 8100 }, { "epoch": 0.9136490250696379, "grad_norm": 0.483064204454422, "learning_rate": 7.882117054290375e-06, "loss": 2.1971, "step": 8200 }, { "epoch": 0.924791086350975, "grad_norm": 0.4610692262649536, "learning_rate": 6.213603769467132e-06, "loss": 2.1977, "step": 8300 }, { "epoch": 0.935933147632312, "grad_norm": 0.50405353307724, "learning_rate": 4.660649942045826e-06, "loss": 2.1955, "step": 8400 }, { "epoch": 0.947075208913649, "grad_norm": 0.47849225997924805, "learning_rate": 3.27065116687201e-06, "loss": 2.1906, "step": 8500 }, { "epoch": 0.958217270194986, "grad_norm": 0.49040549993515015, "learning_rate": 2.0860297089355943e-06, "loss": 2.1875, "step": 8600 }, { "epoch": 0.9693593314763231, "grad_norm": 0.49369877576828003, "learning_rate": 1.1429397910307794e-06, "loss": 2.2054, "step": 8700 }, { "epoch": 0.9805013927576601, "grad_norm": 0.5104972124099731, "learning_rate": 4.701641798198353e-07, "loss": 2.2121, "step": 8800 }, { "epoch": 0.9916434540389972, "grad_norm": 0.48128950595855713, "learning_rate": 8.823574609897134e-08, "loss": 2.1928, "step": 8900 }, { "epoch": 1.0, "step": 8975, "total_flos": 1.3078189256852111e+18, "train_loss": 2.2911263094506222, "train_runtime": 2843.733, "train_samples_per_second": 50.495, "train_steps_per_second": 3.156 } ], "logging_steps": 100, "max_steps": 8975, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.3078189256852111e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }