{ "best_metric": null, "best_model_checkpoint": null, "epoch": 20.0, "eval_steps": 100, "global_step": 1160, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.1724137931034483, "grad_norm": 59.60115051269531, "learning_rate": 5.172413793103448e-06, "loss": 5.4773, "step": 10 }, { "epoch": 0.3448275862068966, "grad_norm": 0.6899381875991821, "learning_rate": 1.3793103448275863e-05, "loss": 0.5795, "step": 20 }, { "epoch": 0.5172413793103449, "grad_norm": 0.37347301840782166, "learning_rate": 2.2413793103448276e-05, "loss": 0.1975, "step": 30 }, { "epoch": 0.6896551724137931, "grad_norm": 0.5068468451499939, "learning_rate": 3.103448275862069e-05, "loss": 0.1843, "step": 40 }, { "epoch": 0.8620689655172413, "grad_norm": 0.4019465148448944, "learning_rate": 3.965517241379311e-05, "loss": 0.1866, "step": 50 }, { "epoch": 1.0344827586206897, "grad_norm": 0.3685491681098938, "learning_rate": 4.827586206896552e-05, "loss": 0.1578, "step": 60 }, { "epoch": 1.206896551724138, "grad_norm": 0.3507291376590729, "learning_rate": 5.689655172413794e-05, "loss": 0.1198, "step": 70 }, { "epoch": 1.3793103448275863, "grad_norm": 0.437311053276062, "learning_rate": 6.551724137931034e-05, "loss": 0.1032, "step": 80 }, { "epoch": 1.5517241379310345, "grad_norm": 0.36062315106391907, "learning_rate": 7.413793103448277e-05, "loss": 0.1172, "step": 90 }, { "epoch": 1.7241379310344827, "grad_norm": 0.4424302279949188, "learning_rate": 8.275862068965517e-05, "loss": 0.1212, "step": 100 }, { "epoch": 1.7241379310344827, "eval_loss": 0.07583244889974594, "eval_runtime": 8.495, "eval_samples_per_second": 55.327, "eval_steps_per_second": 6.945, "step": 100 }, { "epoch": 1.896551724137931, "grad_norm": 0.32817938923835754, "learning_rate": 9.137931034482759e-05, "loss": 0.1186, "step": 110 }, { "epoch": 2.0689655172413794, "grad_norm": 0.27987948060035706, "learning_rate": 0.0001, "loss": 0.1074, "step": 120 }, { "epoch": 2.2413793103448274, "grad_norm": 0.3074031472206116, "learning_rate": 9.997736367166968e-05, "loss": 0.0599, "step": 130 }, { "epoch": 2.413793103448276, "grad_norm": 0.29311925172805786, "learning_rate": 9.990947518281311e-05, "loss": 0.0648, "step": 140 }, { "epoch": 2.586206896551724, "grad_norm": 0.29754340648651123, "learning_rate": 9.979639600327522e-05, "loss": 0.0708, "step": 150 }, { "epoch": 2.7586206896551726, "grad_norm": 0.3316575586795807, "learning_rate": 9.963822852095345e-05, "loss": 0.0684, "step": 160 }, { "epoch": 2.9310344827586206, "grad_norm": 0.25907158851623535, "learning_rate": 9.943511594909023e-05, "loss": 0.0697, "step": 170 }, { "epoch": 3.103448275862069, "grad_norm": 0.22101767361164093, "learning_rate": 9.918724219660013e-05, "loss": 0.0518, "step": 180 }, { "epoch": 3.2758620689655173, "grad_norm": 0.26941612362861633, "learning_rate": 9.889483170154903e-05, "loss": 0.0395, "step": 190 }, { "epoch": 3.4482758620689653, "grad_norm": 0.18463601171970367, "learning_rate": 9.855814922793582e-05, "loss": 0.0439, "step": 200 }, { "epoch": 3.4482758620689653, "eval_loss": 0.03639198839664459, "eval_runtime": 8.4882, "eval_samples_per_second": 55.371, "eval_steps_per_second": 6.951, "step": 200 }, { "epoch": 3.6206896551724137, "grad_norm": 0.22298499941825867, "learning_rate": 9.817749962596115e-05, "loss": 0.0453, "step": 210 }, { "epoch": 3.793103448275862, "grad_norm": 0.18436843156814575, "learning_rate": 9.775322755599978e-05, "loss": 0.0426, "step": 220 }, { "epoch": 3.9655172413793105, 
"grad_norm": 0.19848646223545074, "learning_rate": 9.728571717652677e-05, "loss": 0.0463, "step": 230 }, { "epoch": 4.137931034482759, "grad_norm": 0.13991042971611023, "learning_rate": 9.677539179628005e-05, "loss": 0.034, "step": 240 }, { "epoch": 4.310344827586207, "grad_norm": 0.15892928838729858, "learning_rate": 9.622271349097411e-05, "loss": 0.0301, "step": 250 }, { "epoch": 4.482758620689655, "grad_norm": 0.27909988164901733, "learning_rate": 9.562818268491216e-05, "loss": 0.0333, "step": 260 }, { "epoch": 4.655172413793103, "grad_norm": 0.19205643236637115, "learning_rate": 9.499233769787535e-05, "loss": 0.0319, "step": 270 }, { "epoch": 4.827586206896552, "grad_norm": 0.19656455516815186, "learning_rate": 9.431575425769938e-05, "loss": 0.032, "step": 280 }, { "epoch": 5.0, "grad_norm": 0.15377789735794067, "learning_rate": 9.359904497898009e-05, "loss": 0.0325, "step": 290 }, { "epoch": 5.172413793103448, "grad_norm": 0.11539698392152786, "learning_rate": 9.284285880837946e-05, "loss": 0.0208, "step": 300 }, { "epoch": 5.172413793103448, "eval_loss": 0.021401584148406982, "eval_runtime": 8.4928, "eval_samples_per_second": 55.341, "eval_steps_per_second": 6.947, "step": 300 }, { "epoch": 5.344827586206897, "grad_norm": 0.09723104536533356, "learning_rate": 9.2047880437035e-05, "loss": 0.024, "step": 310 }, { "epoch": 5.517241379310345, "grad_norm": 0.10288870334625244, "learning_rate": 9.121482968060384e-05, "loss": 0.025, "step": 320 }, { "epoch": 5.689655172413794, "grad_norm": 0.13955926895141602, "learning_rate": 9.034446082750352e-05, "loss": 0.0267, "step": 330 }, { "epoch": 5.862068965517241, "grad_norm": 0.09979069977998734, "learning_rate": 8.943756195593916e-05, "loss": 0.0249, "step": 340 }, { "epoch": 6.0344827586206895, "grad_norm": 0.100404754281044, "learning_rate": 8.849495422033549e-05, "loss": 0.0223, "step": 350 }, { "epoch": 6.206896551724138, "grad_norm": 0.06972885876893997, "learning_rate": 8.751749110782012e-05, "loss": 0.0175, "step": 360 }, { "epoch": 6.379310344827586, "grad_norm": 0.10955193638801575, "learning_rate": 8.650605766543089e-05, "loss": 0.0191, "step": 370 }, { "epoch": 6.551724137931035, "grad_norm": 0.08194795995950699, "learning_rate": 8.546156969874723e-05, "loss": 0.0194, "step": 380 }, { "epoch": 6.724137931034483, "grad_norm": 0.10594207793474197, "learning_rate": 8.438497294267117e-05, "loss": 0.0215, "step": 390 }, { "epoch": 6.896551724137931, "grad_norm": 0.09979037195444107, "learning_rate": 8.327724220510873e-05, "loss": 0.0209, "step": 400 }, { "epoch": 6.896551724137931, "eval_loss": 0.016114579513669014, "eval_runtime": 8.4861, "eval_samples_per_second": 55.385, "eval_steps_per_second": 6.953, "step": 400 }, { "epoch": 7.068965517241379, "grad_norm": 0.06945820152759552, "learning_rate": 8.213938048432697e-05, "loss": 0.0196, "step": 410 }, { "epoch": 7.241379310344827, "grad_norm": 0.07130986452102661, "learning_rate": 8.097241806078615e-05, "loss": 0.0166, "step": 420 }, { "epoch": 7.413793103448276, "grad_norm": 0.07004086673259735, "learning_rate": 7.977741156426901e-05, "loss": 0.0168, "step": 430 }, { "epoch": 7.586206896551724, "grad_norm": 0.08699115365743637, "learning_rate": 7.855544301715203e-05, "loss": 0.0179, "step": 440 }, { "epoch": 7.758620689655173, "grad_norm": 0.11357751488685608, "learning_rate": 7.730761885468485e-05, "loss": 0.0182, "step": 450 }, { "epoch": 7.931034482758621, "grad_norm": 0.08816002309322357, "learning_rate": 7.603506892316512e-05, "loss": 0.0182, "step": 460 }, { "epoch": 
8.10344827586207, "grad_norm": 0.07963044941425323, "learning_rate": 7.47389454569155e-05, "loss": 0.0174, "step": 470 }, { "epoch": 8.275862068965518, "grad_norm": 0.07185494154691696, "learning_rate": 7.342042203498951e-05, "loss": 0.0145, "step": 480 }, { "epoch": 8.448275862068966, "grad_norm": 0.0649387463927269, "learning_rate": 7.208069251855078e-05, "loss": 0.016, "step": 490 }, { "epoch": 8.620689655172415, "grad_norm": 0.06952589750289917, "learning_rate": 7.07209699698876e-05, "loss": 0.0161, "step": 500 }, { "epoch": 8.620689655172415, "eval_loss": 0.014497065916657448, "eval_runtime": 8.487, "eval_samples_per_second": 55.379, "eval_steps_per_second": 6.952, "step": 500 }, { "epoch": 8.793103448275861, "grad_norm": 0.07031919807195663, "learning_rate": 6.934248555404198e-05, "loss": 0.0175, "step": 510 }, { "epoch": 8.96551724137931, "grad_norm": 0.08514226227998734, "learning_rate": 6.79464874240473e-05, "loss": 0.0182, "step": 520 }, { "epoch": 9.137931034482758, "grad_norm": 0.07341048121452332, "learning_rate": 6.653423959078436e-05, "loss": 0.0147, "step": 530 }, { "epoch": 9.310344827586206, "grad_norm": 0.06410729140043259, "learning_rate": 6.510702077847863e-05, "loss": 0.0144, "step": 540 }, { "epoch": 9.482758620689655, "grad_norm": 0.06970565766096115, "learning_rate": 6.366612326687554e-05, "loss": 0.0153, "step": 550 }, { "epoch": 9.655172413793103, "grad_norm": 0.0829567089676857, "learning_rate": 6.221285172114157e-05, "loss": 0.0152, "step": 560 }, { "epoch": 9.827586206896552, "grad_norm": 0.06180880591273308, "learning_rate": 6.0748522010551215e-05, "loss": 0.0154, "step": 570 }, { "epoch": 10.0, "grad_norm": 0.07084178179502487, "learning_rate": 5.927446001702899e-05, "loss": 0.0164, "step": 580 }, { "epoch": 10.172413793103448, "grad_norm": 0.05729280412197113, "learning_rate": 5.779200043462549e-05, "loss": 0.0134, "step": 590 }, { "epoch": 10.344827586206897, "grad_norm": 0.05883341655135155, "learning_rate": 5.6302485561014475e-05, "loss": 0.0142, "step": 600 }, { "epoch": 10.344827586206897, "eval_loss": 0.013431085273623466, "eval_runtime": 8.4852, "eval_samples_per_second": 55.391, "eval_steps_per_second": 6.953, "step": 600 }, { "epoch": 10.517241379310345, "grad_norm": 0.05796743184328079, "learning_rate": 5.4807264082105195e-05, "loss": 0.0144, "step": 610 }, { "epoch": 10.689655172413794, "grad_norm": 0.07652316242456436, "learning_rate": 5.330768985087059e-05, "loss": 0.0146, "step": 620 }, { "epoch": 10.862068965517242, "grad_norm": 0.06839638203382492, "learning_rate": 5.180512066149682e-05, "loss": 0.0154, "step": 630 }, { "epoch": 11.03448275862069, "grad_norm": 0.05099385976791382, "learning_rate": 5.030091701996428e-05, "loss": 0.0147, "step": 640 }, { "epoch": 11.206896551724139, "grad_norm": 0.05085451900959015, "learning_rate": 4.879644091217317e-05, "loss": 0.0133, "step": 650 }, { "epoch": 11.379310344827585, "grad_norm": 0.07139307260513306, "learning_rate": 4.729305457072913e-05, "loss": 0.0136, "step": 660 }, { "epoch": 11.551724137931034, "grad_norm": 0.06991631537675858, "learning_rate": 4.579211924150547e-05, "loss": 0.0142, "step": 670 }, { "epoch": 11.724137931034482, "grad_norm": 0.06048361957073212, "learning_rate": 4.429499395109877e-05, "loss": 0.0146, "step": 680 }, { "epoch": 11.89655172413793, "grad_norm": 0.06556624919176102, "learning_rate": 4.280303427629404e-05, "loss": 0.0148, "step": 690 }, { "epoch": 12.068965517241379, "grad_norm": 0.05133816972374916, "learning_rate": 4.131759111665349e-05, "loss": 0.0141, 
"step": 700 }, { "epoch": 12.068965517241379, "eval_loss": 0.012837924063205719, "eval_runtime": 8.4874, "eval_samples_per_second": 55.376, "eval_steps_per_second": 6.951, "step": 700 }, { "epoch": 12.241379310344827, "grad_norm": 0.05528413876891136, "learning_rate": 3.9840009471340194e-05, "loss": 0.0129, "step": 710 }, { "epoch": 12.413793103448276, "grad_norm": 0.05302777513861656, "learning_rate": 3.8371627221284495e-05, "loss": 0.0136, "step": 720 }, { "epoch": 12.586206896551724, "grad_norm": 0.06363388895988464, "learning_rate": 3.691377391779543e-05, "loss": 0.0139, "step": 730 }, { "epoch": 12.758620689655173, "grad_norm": 0.056679144501686096, "learning_rate": 3.546776957871445e-05, "loss": 0.0141, "step": 740 }, { "epoch": 12.931034482758621, "grad_norm": 0.07062114775180817, "learning_rate": 3.403492349320101e-05, "loss": 0.0143, "step": 750 }, { "epoch": 13.10344827586207, "grad_norm": 0.05683109909296036, "learning_rate": 3.261653303623263e-05, "loss": 0.0134, "step": 760 }, { "epoch": 13.275862068965518, "grad_norm": 0.05229064077138901, "learning_rate": 3.121388249389269e-05, "loss": 0.0128, "step": 770 }, { "epoch": 13.448275862068966, "grad_norm": 0.07219678908586502, "learning_rate": 2.982824190050958e-05, "loss": 0.0133, "step": 780 }, { "epoch": 13.620689655172415, "grad_norm": 0.06031765043735504, "learning_rate": 2.846086588870006e-05, "loss": 0.0136, "step": 790 }, { "epoch": 13.793103448275861, "grad_norm": 0.06178103759884834, "learning_rate": 2.711299255335833e-05, "loss": 0.0138, "step": 800 }, { "epoch": 13.793103448275861, "eval_loss": 0.012654323130846024, "eval_runtime": 8.4929, "eval_samples_per_second": 55.34, "eval_steps_per_second": 6.947, "step": 800 }, { "epoch": 13.96551724137931, "grad_norm": 0.051913850009441376, "learning_rate": 2.5785842330619038e-05, "loss": 0.014, "step": 810 }, { "epoch": 14.137931034482758, "grad_norm": 0.05191033333539963, "learning_rate": 2.4480616892809594e-05, "loss": 0.0129, "step": 820 }, { "epoch": 14.310344827586206, "grad_norm": 0.055176835507154465, "learning_rate": 2.3198498060392232e-05, "loss": 0.0127, "step": 830 }, { "epoch": 14.482758620689655, "grad_norm": 0.0499984435737133, "learning_rate": 2.194064673188089e-05, "loss": 0.013, "step": 840 }, { "epoch": 14.655172413793103, "grad_norm": 0.05339398235082626, "learning_rate": 2.070820183270211e-05, "loss": 0.0134, "step": 850 }, { "epoch": 14.827586206896552, "grad_norm": 0.06339651346206665, "learning_rate": 1.9502279283951364e-05, "loss": 0.0135, "step": 860 }, { "epoch": 15.0, "grad_norm": 0.061148300766944885, "learning_rate": 1.832397099197882e-05, "loss": 0.0135, "step": 870 }, { "epoch": 15.172413793103448, "grad_norm": 0.05747521296143532, "learning_rate": 1.7174343859719333e-05, "loss": 0.0125, "step": 880 }, { "epoch": 15.344827586206897, "grad_norm": 0.05627927929162979, "learning_rate": 1.6054438820661854e-05, "loss": 0.0126, "step": 890 }, { "epoch": 15.517241379310345, "grad_norm": 0.0764516219496727, "learning_rate": 1.4965269896332885e-05, "loss": 0.0129, "step": 900 }, { "epoch": 15.517241379310345, "eval_loss": 0.01242972631007433, "eval_runtime": 8.4929, "eval_samples_per_second": 55.34, "eval_steps_per_second": 6.947, "step": 900 }, { "epoch": 15.689655172413794, "grad_norm": 0.0588865764439106, "learning_rate": 1.3907823278147563e-05, "loss": 0.013, "step": 910 }, { "epoch": 15.862068965517242, "grad_norm": 0.06028977409005165, "learning_rate": 1.2883056434459506e-05, "loss": 0.0132, "step": 920 }, { "epoch": 16.03448275862069, 
"grad_norm": 0.0516447052359581, "learning_rate": 1.1891897243618182e-05, "loss": 0.0132, "step": 930 }, { "epoch": 16.20689655172414, "grad_norm": 0.053294651210308075, "learning_rate": 1.0935243153818436e-05, "loss": 0.0123, "step": 940 }, { "epoch": 16.379310344827587, "grad_norm": 0.059708479791879654, "learning_rate": 1.0013960370503261e-05, "loss": 0.0125, "step": 950 }, { "epoch": 16.551724137931036, "grad_norm": 0.051865532994270325, "learning_rate": 9.12888307205541e-06, "loss": 0.0126, "step": 960 }, { "epoch": 16.724137931034484, "grad_norm": 0.06396190822124481, "learning_rate": 8.280812654487891e-06, "loss": 0.0128, "step": 970 }, { "epoch": 16.896551724137932, "grad_norm": 0.060719434171915054, "learning_rate": 7.470517005817474e-06, "loss": 0.013, "step": 980 }, { "epoch": 17.06896551724138, "grad_norm": 0.05675433203577995, "learning_rate": 6.698729810778065e-06, "loss": 0.0126, "step": 990 }, { "epoch": 17.24137931034483, "grad_norm": 0.06285712867975235, "learning_rate": 5.966149886503614e-06, "loss": 0.0123, "step": 1000 }, { "epoch": 17.24137931034483, "eval_loss": 0.01223958469927311, "eval_runtime": 8.4904, "eval_samples_per_second": 55.357, "eval_steps_per_second": 6.949, "step": 1000 }, { "epoch": 17.413793103448278, "grad_norm": 0.06330039352178574, "learning_rate": 5.27344054978186e-06, "loss": 0.0123, "step": 1010 }, { "epoch": 17.586206896551722, "grad_norm": 0.05098165571689606, "learning_rate": 4.621229016452156e-06, "loss": 0.0124, "step": 1020 }, { "epoch": 17.75862068965517, "grad_norm": 0.06866684556007385, "learning_rate": 4.010105833490857e-06, "loss": 0.0125, "step": 1030 }, { "epoch": 17.93103448275862, "grad_norm": 0.06366662681102753, "learning_rate": 3.4406243442987764e-06, "loss": 0.0126, "step": 1040 }, { "epoch": 18.103448275862068, "grad_norm": 0.06267203390598297, "learning_rate": 2.9133001876746004e-06, "loss": 0.0124, "step": 1050 }, { "epoch": 18.275862068965516, "grad_norm": 0.05903858318924904, "learning_rate": 2.428610830928152e-06, "loss": 0.0122, "step": 1060 }, { "epoch": 18.448275862068964, "grad_norm": 0.05347118899226189, "learning_rate": 1.9869951375561523e-06, "loss": 0.0123, "step": 1070 }, { "epoch": 18.620689655172413, "grad_norm": 0.0637890100479126, "learning_rate": 1.5888529698718346e-06, "loss": 0.0122, "step": 1080 }, { "epoch": 18.79310344827586, "grad_norm": 0.051785457879304886, "learning_rate": 1.2345448269483916e-06, "loss": 0.0124, "step": 1090 }, { "epoch": 18.96551724137931, "grad_norm": 0.06472212821245193, "learning_rate": 9.243915182039431e-07, "loss": 0.0123, "step": 1100 }, { "epoch": 18.96551724137931, "eval_loss": 0.012206222862005234, "eval_runtime": 8.4831, "eval_samples_per_second": 55.404, "eval_steps_per_second": 6.955, "step": 1100 }, { "epoch": 19.137931034482758, "grad_norm": 0.06167494133114815, "learning_rate": 6.58673872923693e-07, "loss": 0.0122, "step": 1110 }, { "epoch": 19.310344827586206, "grad_norm": 0.06309988349676132, "learning_rate": 4.376324859820924e-07, "loss": 0.0122, "step": 1120 }, { "epoch": 19.482758620689655, "grad_norm": 0.05701352283358574, "learning_rate": 2.614674999955269e-07, "loss": 0.0122, "step": 1130 }, { "epoch": 19.655172413793103, "grad_norm": 0.06496866792440414, "learning_rate": 1.3033842410251075e-07, "loss": 0.0122, "step": 1140 }, { "epoch": 19.82758620689655, "grad_norm": 0.05343254283070564, "learning_rate": 4.436398953567289e-08, "loss": 0.0122, "step": 1150 }, { "epoch": 20.0, "grad_norm": 0.06212453171610832, "learning_rate": 3.622042116169233e-09, "loss": 
0.0122, "step": 1160 } ], "logging_steps": 10, "max_steps": 1160, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.74429913710592e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }