{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 1000, "global_step": 31479, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.047650814828933574, "grad_norm": 2.0100979804992676, "learning_rate": 0.00019682327901140442, "loss": 2.3566, "step": 500 }, { "epoch": 0.09530162965786715, "grad_norm": 1.877261996269226, "learning_rate": 0.00019364655802280888, "loss": 2.2178, "step": 1000 }, { "epoch": 0.09530162965786715, "eval_loss": 2.3788223266601562, "eval_runtime": 80.302, "eval_samples_per_second": 173.271, "eval_steps_per_second": 7.223, "step": 1000 }, { "epoch": 0.14295244448680072, "grad_norm": 1.7388309240341187, "learning_rate": 0.00019046983703421329, "loss": 2.1744, "step": 1500 }, { "epoch": 0.1906032593157343, "grad_norm": 1.8366143703460693, "learning_rate": 0.00018729311604561772, "loss": 2.163, "step": 2000 }, { "epoch": 0.1906032593157343, "eval_loss": 2.3654611110687256, "eval_runtime": 80.3504, "eval_samples_per_second": 173.166, "eval_steps_per_second": 7.218, "step": 2000 }, { "epoch": 0.23825407414466787, "grad_norm": 1.6628751754760742, "learning_rate": 0.00018411639505702213, "loss": 2.1515, "step": 2500 }, { "epoch": 0.28590488897360145, "grad_norm": 1.6291817426681519, "learning_rate": 0.0001809396740684266, "loss": 2.1196, "step": 3000 }, { "epoch": 0.28590488897360145, "eval_loss": 2.3521649837493896, "eval_runtime": 80.224, "eval_samples_per_second": 173.439, "eval_steps_per_second": 7.23, "step": 3000 }, { "epoch": 0.333555703802535, "grad_norm": 1.7604336738586426, "learning_rate": 0.000177762953079831, "loss": 2.1074, "step": 3500 }, { "epoch": 0.3812065186314686, "grad_norm": 1.34886634349823, "learning_rate": 0.00017458623209123543, "loss": 2.0926, "step": 4000 }, { "epoch": 0.3812065186314686, "eval_loss": 2.3420486450195312, "eval_runtime": 80.1942, "eval_samples_per_second": 173.504, "eval_steps_per_second": 7.232, "step": 4000 }, { "epoch": 0.42885733346040217, "grad_norm": 1.3510360717773438, "learning_rate": 0.00017140951110263986, "loss": 2.074, "step": 4500 }, { "epoch": 0.47650814828933574, "grad_norm": 1.272275447845459, "learning_rate": 0.0001682327901140443, "loss": 2.0752, "step": 5000 }, { "epoch": 0.47650814828933574, "eval_loss": 2.3270885944366455, "eval_runtime": 80.2619, "eval_samples_per_second": 173.358, "eval_steps_per_second": 7.226, "step": 5000 }, { "epoch": 0.5241589631182694, "grad_norm": 1.289753794670105, "learning_rate": 0.0001650560691254487, "loss": 2.0487, "step": 5500 }, { "epoch": 0.5718097779472029, "grad_norm": 1.1615971326828003, "learning_rate": 0.00016187934813685314, "loss": 2.0437, "step": 6000 }, { "epoch": 0.5718097779472029, "eval_loss": 2.3274528980255127, "eval_runtime": 80.2214, "eval_samples_per_second": 173.445, "eval_steps_per_second": 7.23, "step": 6000 }, { "epoch": 0.6194605927761365, "grad_norm": 1.3484673500061035, "learning_rate": 0.00015870262714825757, "loss": 2.0134, "step": 6500 }, { "epoch": 0.66711140760507, "grad_norm": 1.4737777709960938, "learning_rate": 0.000155525906159662, "loss": 2.0379, "step": 7000 }, { "epoch": 0.66711140760507, "eval_loss": 2.3164169788360596, "eval_runtime": 80.2177, "eval_samples_per_second": 173.453, "eval_steps_per_second": 7.23, "step": 7000 }, { "epoch": 0.7147622224340037, "grad_norm": 1.1502068042755127, "learning_rate": 0.00015234918517106642, "loss": 1.9916, "step": 7500 }, { "epoch": 0.7624130372629372, "grad_norm": 1.2299320697784424, "learning_rate": 0.00014917246418247085, "loss": 2.0068, "step": 8000 }, { "epoch": 0.7624130372629372, "eval_loss": 2.311408042907715, "eval_runtime": 80.2576, "eval_samples_per_second": 173.367, "eval_steps_per_second": 7.227, "step": 8000 }, { "epoch": 0.8100638520918708, "grad_norm": 1.2537345886230469, "learning_rate": 0.00014599574319387528, "loss": 1.9886, "step": 8500 }, { "epoch": 0.8577146669208043, "grad_norm": 1.0486429929733276, "learning_rate": 0.00014281902220527972, "loss": 1.9882, "step": 9000 }, { "epoch": 0.8577146669208043, "eval_loss": 2.304290294647217, "eval_runtime": 80.1372, "eval_samples_per_second": 173.627, "eval_steps_per_second": 7.238, "step": 9000 }, { "epoch": 0.905365481749738, "grad_norm": 1.1815516948699951, "learning_rate": 0.00013964230121668413, "loss": 1.9732, "step": 9500 }, { "epoch": 0.9530162965786715, "grad_norm": 1.2301689386367798, "learning_rate": 0.0001364655802280886, "loss": 1.9787, "step": 10000 }, { "epoch": 0.9530162965786715, "eval_loss": 2.2939772605895996, "eval_runtime": 80.1592, "eval_samples_per_second": 173.579, "eval_steps_per_second": 7.236, "step": 10000 }, { "epoch": 1.0006671114076051, "grad_norm": 1.497831106185913, "learning_rate": 0.000133288859239493, "loss": 1.9557, "step": 10500 }, { "epoch": 1.0483179262365387, "grad_norm": 1.3323341608047485, "learning_rate": 0.00013011213825089743, "loss": 1.7231, "step": 11000 }, { "epoch": 1.0483179262365387, "eval_loss": 2.313231945037842, "eval_runtime": 80.1199, "eval_samples_per_second": 173.665, "eval_steps_per_second": 7.239, "step": 11000 }, { "epoch": 1.0959687410654722, "grad_norm": 1.8000659942626953, "learning_rate": 0.00012693541726230184, "loss": 1.714, "step": 11500 }, { "epoch": 1.1436195558944058, "grad_norm": 1.2369180917739868, "learning_rate": 0.0001237586962737063, "loss": 1.7114, "step": 12000 }, { "epoch": 1.1436195558944058, "eval_loss": 2.313917875289917, "eval_runtime": 80.1492, "eval_samples_per_second": 173.601, "eval_steps_per_second": 7.237, "step": 12000 }, { "epoch": 1.1912703707233394, "grad_norm": 1.431038498878479, "learning_rate": 0.0001205819752851107, "loss": 1.7283, "step": 12500 }, { "epoch": 1.238921185552273, "grad_norm": 1.4570106267929077, "learning_rate": 0.00011740525429651514, "loss": 1.7033, "step": 13000 }, { "epoch": 1.238921185552273, "eval_loss": 2.310853958129883, "eval_runtime": 80.0945, "eval_samples_per_second": 173.72, "eval_steps_per_second": 7.241, "step": 13000 }, { "epoch": 1.2865720003812064, "grad_norm": 1.557187795639038, "learning_rate": 0.00011422853330791956, "loss": 1.7289, "step": 13500 }, { "epoch": 1.33422281521014, "grad_norm": 1.5775034427642822, "learning_rate": 0.000111051812319324, "loss": 1.7151, "step": 14000 }, { "epoch": 1.33422281521014, "eval_loss": 2.300920009613037, "eval_runtime": 80.1537, "eval_samples_per_second": 173.591, "eval_steps_per_second": 7.236, "step": 14000 }, { "epoch": 1.3818736300390737, "grad_norm": 1.2451566457748413, "learning_rate": 0.00010787509133072841, "loss": 1.7218, "step": 14500 }, { "epoch": 1.4295244448680071, "grad_norm": 1.650688886642456, "learning_rate": 0.00010469837034213286, "loss": 1.7202, "step": 15000 }, { "epoch": 1.4295244448680071, "eval_loss": 2.290478467941284, "eval_runtime": 80.1852, "eval_samples_per_second": 173.523, "eval_steps_per_second": 7.233, "step": 15000 }, { "epoch": 1.4771752596969407, "grad_norm": 1.4705020189285278, "learning_rate": 0.00010152164935353727, "loss": 1.721, "step": 15500 }, { "epoch": 1.5248260745258744, "grad_norm": 1.530394434928894, "learning_rate": 9.834492836494172e-05, "loss": 1.7261, "step": 16000 }, { "epoch": 1.5248260745258744, "eval_loss": 2.2944624423980713, "eval_runtime": 80.1122, "eval_samples_per_second": 173.682, "eval_steps_per_second": 7.24, "step": 16000 }, { "epoch": 1.572476889354808, "grad_norm": 1.667024850845337, "learning_rate": 9.516820737634614e-05, "loss": 1.7072, "step": 16500 }, { "epoch": 1.6201277041837416, "grad_norm": 1.4624521732330322, "learning_rate": 9.199148638775057e-05, "loss": 1.7091, "step": 17000 }, { "epoch": 1.6201277041837416, "eval_loss": 2.2861549854278564, "eval_runtime": 80.0947, "eval_samples_per_second": 173.719, "eval_steps_per_second": 7.241, "step": 17000 }, { "epoch": 1.6677785190126753, "grad_norm": 1.7141919136047363, "learning_rate": 8.881476539915499e-05, "loss": 1.7281, "step": 17500 }, { "epoch": 1.7154293338416087, "grad_norm": 1.367767333984375, "learning_rate": 8.563804441055943e-05, "loss": 1.7098, "step": 18000 }, { "epoch": 1.7154293338416087, "eval_loss": 2.2811758518218994, "eval_runtime": 80.1424, "eval_samples_per_second": 173.616, "eval_steps_per_second": 7.237, "step": 18000 }, { "epoch": 1.7630801486705423, "grad_norm": 1.530991792678833, "learning_rate": 8.246132342196385e-05, "loss": 1.6994, "step": 18500 }, { "epoch": 1.8107309634994757, "grad_norm": 1.4421322345733643, "learning_rate": 7.928460243336828e-05, "loss": 1.6943, "step": 19000 }, { "epoch": 1.8107309634994757, "eval_loss": 2.273425579071045, "eval_runtime": 80.1385, "eval_samples_per_second": 173.624, "eval_steps_per_second": 7.237, "step": 19000 }, { "epoch": 1.8583817783284093, "grad_norm": 1.5695687532424927, "learning_rate": 7.610788144477272e-05, "loss": 1.7, "step": 19500 }, { "epoch": 1.906032593157343, "grad_norm": 1.6507039070129395, "learning_rate": 7.293116045617714e-05, "loss": 1.7035, "step": 20000 }, { "epoch": 1.906032593157343, "eval_loss": 2.266268730163574, "eval_runtime": 80.1631, "eval_samples_per_second": 173.571, "eval_steps_per_second": 7.235, "step": 20000 }, { "epoch": 1.9536834079862766, "grad_norm": 1.41545832157135, "learning_rate": 6.975443946758157e-05, "loss": 1.6948, "step": 20500 }, { "epoch": 2.0013342228152102, "grad_norm": 1.3855451345443726, "learning_rate": 6.657771847898599e-05, "loss": 1.6776, "step": 21000 }, { "epoch": 2.0013342228152102, "eval_loss": 2.302978515625, "eval_runtime": 80.1675, "eval_samples_per_second": 173.562, "eval_steps_per_second": 7.235, "step": 21000 }, { "epoch": 2.048985037644144, "grad_norm": 1.3997050523757935, "learning_rate": 6.340099749039043e-05, "loss": 1.438, "step": 21500 }, { "epoch": 2.0966358524730775, "grad_norm": 1.4828859567642212, "learning_rate": 6.0224276501794854e-05, "loss": 1.4406, "step": 22000 }, { "epoch": 2.0966358524730775, "eval_loss": 2.3172175884246826, "eval_runtime": 80.1748, "eval_samples_per_second": 173.546, "eval_steps_per_second": 7.234, "step": 22000 }, { "epoch": 2.1442866673020107, "grad_norm": 1.8176885843276978, "learning_rate": 5.704755551319928e-05, "loss": 1.4555, "step": 22500 }, { "epoch": 2.1919374821309443, "grad_norm": 1.48106050491333, "learning_rate": 5.387083452460371e-05, "loss": 1.4659, "step": 23000 }, { "epoch": 2.1919374821309443, "eval_loss": 2.3182783126831055, "eval_runtime": 80.2101, "eval_samples_per_second": 173.47, "eval_steps_per_second": 7.231, "step": 23000 }, { "epoch": 2.239588296959878, "grad_norm": 1.6957001686096191, "learning_rate": 5.0694113536008136e-05, "loss": 1.448, "step": 23500 }, { "epoch": 2.2872391117888116, "grad_norm": 1.3845641613006592, "learning_rate": 4.7517392547412564e-05, "loss": 1.4608, "step": 24000 }, { "epoch": 2.2872391117888116, "eval_loss": 2.318488836288452, "eval_runtime": 80.1689, "eval_samples_per_second": 173.559, "eval_steps_per_second": 7.235, "step": 24000 }, { "epoch": 2.334889926617745, "grad_norm": 1.9913188219070435, "learning_rate": 4.434067155881699e-05, "loss": 1.439, "step": 24500 }, { "epoch": 2.382540741446679, "grad_norm": 1.8244202136993408, "learning_rate": 4.116395057022142e-05, "loss": 1.4423, "step": 25000 }, { "epoch": 2.382540741446679, "eval_loss": 2.3121349811553955, "eval_runtime": 80.1537, "eval_samples_per_second": 173.591, "eval_steps_per_second": 7.236, "step": 25000 }, { "epoch": 2.4301915562756125, "grad_norm": 1.347023606300354, "learning_rate": 3.7987229581625846e-05, "loss": 1.4506, "step": 25500 }, { "epoch": 2.477842371104546, "grad_norm": 1.49163019657135, "learning_rate": 3.481050859303028e-05, "loss": 1.4378, "step": 26000 }, { "epoch": 2.477842371104546, "eval_loss": 2.3090391159057617, "eval_runtime": 80.1708, "eval_samples_per_second": 173.554, "eval_steps_per_second": 7.235, "step": 26000 }, { "epoch": 2.5254931859334793, "grad_norm": 1.7945301532745361, "learning_rate": 3.163378760443471e-05, "loss": 1.4436, "step": 26500 }, { "epoch": 2.573144000762413, "grad_norm": 1.5082517862319946, "learning_rate": 2.8457066615839136e-05, "loss": 1.4277, "step": 27000 }, { "epoch": 2.573144000762413, "eval_loss": 2.3082542419433594, "eval_runtime": 80.1802, "eval_samples_per_second": 173.534, "eval_steps_per_second": 7.234, "step": 27000 }, { "epoch": 2.6207948155913465, "grad_norm": 1.4329321384429932, "learning_rate": 2.5280345627243563e-05, "loss": 1.4301, "step": 27500 }, { "epoch": 2.66844563042028, "grad_norm": 1.2606436014175415, "learning_rate": 2.2103624638647987e-05, "loss": 1.4251, "step": 28000 }, { "epoch": 2.66844563042028, "eval_loss": 2.2960703372955322, "eval_runtime": 80.1531, "eval_samples_per_second": 173.593, "eval_steps_per_second": 7.236, "step": 28000 }, { "epoch": 2.716096445249214, "grad_norm": 1.4542068243026733, "learning_rate": 1.8926903650052415e-05, "loss": 1.4248, "step": 28500 }, { "epoch": 2.7637472600781474, "grad_norm": 1.6642916202545166, "learning_rate": 1.5750182661456846e-05, "loss": 1.4219, "step": 29000 }, { "epoch": 2.7637472600781474, "eval_loss": 2.296442985534668, "eval_runtime": 80.1753, "eval_samples_per_second": 173.545, "eval_steps_per_second": 7.234, "step": 29000 }, { "epoch": 2.811398074907081, "grad_norm": 2.0301756858825684, "learning_rate": 1.2573461672861273e-05, "loss": 1.4281, "step": 29500 }, { "epoch": 2.8590488897360142, "grad_norm": 1.6031594276428223, "learning_rate": 9.3967406842657e-06, "loss": 1.434, "step": 30000 }, { "epoch": 2.8590488897360142, "eval_loss": 2.2933690547943115, "eval_runtime": 80.1482, "eval_samples_per_second": 173.603, "eval_steps_per_second": 7.237, "step": 30000 }, { "epoch": 2.9066997045649483, "grad_norm": 1.6658378839492798, "learning_rate": 6.22001969567013e-06, "loss": 1.4291, "step": 30500 }, { "epoch": 2.9543505193938815, "grad_norm": 1.589982032775879, "learning_rate": 3.0432987070745578e-06, "loss": 1.4279, "step": 31000 }, { "epoch": 2.9543505193938815, "eval_loss": 2.2906086444854736, "eval_runtime": 80.2746, "eval_samples_per_second": 173.33, "eval_steps_per_second": 7.225, "step": 31000 } ], "logging_steps": 500, "max_steps": 31479, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 50000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.934891962368e+16, "train_batch_size": 24, "trial_name": null, "trial_params": null }