{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 510, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00392156862745098, "grad_norm": 16.544786089270207, "learning_rate": 3.921568627450981e-07, "loss": 1.6547, "step": 1 }, { "epoch": 0.0196078431372549, "grad_norm": 14.762176989308287, "learning_rate": 1.96078431372549e-06, "loss": 1.617, "step": 5 }, { "epoch": 0.0392156862745098, "grad_norm": 5.1165863939415654, "learning_rate": 3.92156862745098e-06, "loss": 1.4078, "step": 10 }, { "epoch": 0.058823529411764705, "grad_norm": 2.2787880179843194, "learning_rate": 5.882352941176471e-06, "loss": 1.2895, "step": 15 }, { "epoch": 0.0784313725490196, "grad_norm": 1.832234050305862, "learning_rate": 7.84313725490196e-06, "loss": 1.2313, "step": 20 }, { "epoch": 0.09803921568627451, "grad_norm": 1.2251164606770868, "learning_rate": 9.803921568627451e-06, "loss": 1.1832, "step": 25 }, { "epoch": 0.11764705882352941, "grad_norm": 1.51446787757622, "learning_rate": 1.1764705882352942e-05, "loss": 1.1558, "step": 30 }, { "epoch": 0.13725490196078433, "grad_norm": 1.1054530221391963, "learning_rate": 1.3725490196078432e-05, "loss": 1.1678, "step": 35 }, { "epoch": 0.1568627450980392, "grad_norm": 1.1955722975383631, "learning_rate": 1.568627450980392e-05, "loss": 1.162, "step": 40 }, { "epoch": 0.17647058823529413, "grad_norm": 1.1601166405223855, "learning_rate": 1.7647058823529414e-05, "loss": 1.1303, "step": 45 }, { "epoch": 0.19607843137254902, "grad_norm": 1.2824519717268024, "learning_rate": 1.9607843137254903e-05, "loss": 1.1272, "step": 50 }, { "epoch": 0.21568627450980393, "grad_norm": 1.1682993723393238, "learning_rate": 1.999625253802851e-05, "loss": 1.1209, "step": 55 }, { "epoch": 0.23529411764705882, "grad_norm": 1.302075491367261, "learning_rate": 1.9981033287370443e-05, "loss": 1.0887, "step": 60 }, { "epoch": 0.2549019607843137, "grad_norm": 1.4556930650649318, "learning_rate": 1.9954125840299165e-05, "loss": 1.1226, "step": 65 }, { "epoch": 0.27450980392156865, "grad_norm": 1.289748942480489, "learning_rate": 1.9915561706530882e-05, "loss": 1.1327, "step": 70 }, { "epoch": 0.29411764705882354, "grad_norm": 1.1271075422611054, "learning_rate": 1.9865386046236597e-05, "loss": 1.1238, "step": 75 }, { "epoch": 0.3137254901960784, "grad_norm": 1.2436214858595933, "learning_rate": 1.9803657617157693e-05, "loss": 1.08, "step": 80 }, { "epoch": 0.3333333333333333, "grad_norm": 1.1690258085683884, "learning_rate": 1.973044870579824e-05, "loss": 1.111, "step": 85 }, { "epoch": 0.35294117647058826, "grad_norm": 1.0669536039386904, "learning_rate": 1.9645845042774555e-05, "loss": 1.1074, "step": 90 }, { "epoch": 0.37254901960784315, "grad_norm": 1.2617056965745426, "learning_rate": 1.9549945702421144e-05, "loss": 1.1073, "step": 95 }, { "epoch": 0.39215686274509803, "grad_norm": 1.0667234116768218, "learning_rate": 1.9442862986770645e-05, "loss": 1.1018, "step": 100 }, { "epoch": 0.4117647058823529, "grad_norm": 1.1310556155759037, "learning_rate": 1.932472229404356e-05, "loss": 1.1108, "step": 105 }, { "epoch": 0.43137254901960786, "grad_norm": 1.0707842591727832, "learning_rate": 1.9195661971801825e-05, "loss": 1.0978, "step": 110 }, { "epoch": 0.45098039215686275, "grad_norm": 1.1964147323589545, "learning_rate": 1.9055833154938208e-05, "loss": 1.0952, "step": 115 }, { "epoch": 0.47058823529411764, "grad_norm": 1.0532587111658636, "learning_rate": 1.8905399588691165e-05, "loss": 1.0901, "step": 120 }, { "epoch": 0.49019607843137253, "grad_norm": 1.0534305550044942, "learning_rate": 1.8744537436892517e-05, "loss": 1.0739, "step": 125 }, { "epoch": 0.5098039215686274, "grad_norm": 1.032737951798708, "learning_rate": 1.8573435075672422e-05, "loss": 1.0871, "step": 130 }, { "epoch": 0.5294117647058824, "grad_norm": 1.0758071447152808, "learning_rate": 1.839229287286327e-05, "loss": 1.0945, "step": 135 }, { "epoch": 0.5490196078431373, "grad_norm": 1.109162566034754, "learning_rate": 1.8201322953360758e-05, "loss": 1.0708, "step": 140 }, { "epoch": 0.5686274509803921, "grad_norm": 1.317192816494283, "learning_rate": 1.800074895071704e-05, "loss": 1.0764, "step": 145 }, { "epoch": 0.5882352941176471, "grad_norm": 1.0795334451744183, "learning_rate": 1.7790805745256703e-05, "loss": 1.0791, "step": 150 }, { "epoch": 0.6078431372549019, "grad_norm": 1.0411588891000574, "learning_rate": 1.7571739189022365e-05, "loss": 1.091, "step": 155 }, { "epoch": 0.6274509803921569, "grad_norm": 1.0108445010492009, "learning_rate": 1.7343805817871885e-05, "loss": 1.0809, "step": 160 }, { "epoch": 0.6470588235294118, "grad_norm": 1.0901231094990715, "learning_rate": 1.710727255106447e-05, "loss": 1.0602, "step": 165 }, { "epoch": 0.6666666666666666, "grad_norm": 1.0945695101274928, "learning_rate": 1.686241637868734e-05, "loss": 1.0777, "step": 170 }, { "epoch": 0.6862745098039216, "grad_norm": 1.193395829329031, "learning_rate": 1.660952403728902e-05, "loss": 1.0444, "step": 175 }, { "epoch": 0.7058823529411765, "grad_norm": 1.1047822048211582, "learning_rate": 1.634889167409923e-05, "loss": 1.0534, "step": 180 }, { "epoch": 0.7254901960784313, "grad_norm": 1.114058959698183, "learning_rate": 1.6080824500228367e-05, "loss": 1.0529, "step": 185 }, { "epoch": 0.7450980392156863, "grad_norm": 0.9904799394022883, "learning_rate": 1.5805636433252892e-05, "loss": 1.0793, "step": 190 }, { "epoch": 0.7647058823529411, "grad_norm": 1.2525818789945302, "learning_rate": 1.552364972960506e-05, "loss": 1.0708, "step": 195 }, { "epoch": 0.7843137254901961, "grad_norm": 1.1154866308925613, "learning_rate": 1.5235194607197508e-05, "loss": 1.0241, "step": 200 }, { "epoch": 0.803921568627451, "grad_norm": 1.0785210608261893, "learning_rate": 1.494060885872464e-05, "loss": 1.0733, "step": 205 }, { "epoch": 0.8235294117647058, "grad_norm": 1.1856668561642758, "learning_rate": 1.4640237456093636e-05, "loss": 1.0704, "step": 210 }, { "epoch": 0.8431372549019608, "grad_norm": 1.1422476630742018, "learning_rate": 1.4334432146448272e-05, "loss": 1.0655, "step": 215 }, { "epoch": 0.8627450980392157, "grad_norm": 0.9839617489985488, "learning_rate": 1.4023551040258726e-05, "loss": 1.0551, "step": 220 }, { "epoch": 0.8823529411764706, "grad_norm": 1.0486919008457822, "learning_rate": 1.3707958191959609e-05, "loss": 1.0479, "step": 225 }, { "epoch": 0.9019607843137255, "grad_norm": 1.0279452846404875, "learning_rate": 1.3388023173627413e-05, "loss": 1.0466, "step": 230 }, { "epoch": 0.9215686274509803, "grad_norm": 1.0670615822796066, "learning_rate": 1.3064120642196549e-05, "loss": 1.0554, "step": 235 }, { "epoch": 0.9411764705882353, "grad_norm": 1.0549543392774092, "learning_rate": 1.2736629900720832e-05, "loss": 1.0648, "step": 240 }, { "epoch": 0.9607843137254902, "grad_norm": 0.9674738336064063, "learning_rate": 1.2405934454194146e-05, "loss": 1.0197, "step": 245 }, { "epoch": 0.9803921568627451, "grad_norm": 0.9981821541554068, "learning_rate": 1.2072421560450497e-05, "loss": 1.0355, "step": 250 }, { "epoch": 1.0, "grad_norm": 0.9908436174645702, "learning_rate": 1.1736481776669307e-05, "loss": 1.0409, "step": 255 }, { "epoch": 1.0, "eval_loss": 1.0343828201293945, "eval_runtime": 4.2299, "eval_samples_per_second": 37.826, "eval_steps_per_second": 0.709, "step": 255 }, { "epoch": 1.0196078431372548, "grad_norm": 2.78732105665186, "learning_rate": 1.1398508502017047e-05, "loss": 0.7588, "step": 260 }, { "epoch": 1.0392156862745099, "grad_norm": 1.5405892203726659, "learning_rate": 1.1058897516960817e-05, "loss": 0.7294, "step": 265 }, { "epoch": 1.0588235294117647, "grad_norm": 1.1503472595942978, "learning_rate": 1.0718046519793276e-05, "loss": 0.7224, "step": 270 }, { "epoch": 1.0784313725490196, "grad_norm": 1.1159446663960584, "learning_rate": 1.0376354660911772e-05, "loss": 0.7354, "step": 275 }, { "epoch": 1.0980392156862746, "grad_norm": 1.0502332790639606, "learning_rate": 1.0034222075396954e-05, "loss": 0.7255, "step": 280 }, { "epoch": 1.1176470588235294, "grad_norm": 0.9509421475398706, "learning_rate": 9.692049414438298e-06, "loss": 0.7154, "step": 285 }, { "epoch": 1.1372549019607843, "grad_norm": 0.988479903928271, "learning_rate": 9.350237376155269e-06, "loss": 0.7218, "step": 290 }, { "epoch": 1.156862745098039, "grad_norm": 1.0085376248525226, "learning_rate": 9.00918623636349e-06, "loss": 0.7366, "step": 295 }, { "epoch": 1.1764705882352942, "grad_norm": 0.9961601641040383, "learning_rate": 8.669295379835467e-06, "loss": 0.7184, "step": 300 }, { "epoch": 1.196078431372549, "grad_norm": 0.9921390197234601, "learning_rate": 8.330962832604747e-06, "loss": 0.7313, "step": 305 }, { "epoch": 1.215686274509804, "grad_norm": 1.0706453846667212, "learning_rate": 7.994584795861248e-06, "loss": 0.7257, "step": 310 }, { "epoch": 1.2352941176470589, "grad_norm": 1.0297863168482826, "learning_rate": 7.660555181983517e-06, "loss": 0.7468, "step": 315 }, { "epoch": 1.2549019607843137, "grad_norm": 0.9995332244709463, "learning_rate": 7.329265153251285e-06, "loss": 0.7439, "step": 320 }, { "epoch": 1.2745098039215685, "grad_norm": 1.0186580227806559, "learning_rate": 7.001102663778533e-06, "loss": 0.7172, "step": 325 }, { "epoch": 1.2941176470588236, "grad_norm": 1.014034224454933, "learning_rate": 6.6764520052034054e-06, "loss": 0.7309, "step": 330 }, { "epoch": 1.3137254901960784, "grad_norm": 1.0151160383199134, "learning_rate": 6.3556933566670656e-06, "loss": 0.719, "step": 335 }, { "epoch": 1.3333333333333333, "grad_norm": 0.9563996745028378, "learning_rate": 6.039202339608432e-06, "loss": 0.7448, "step": 340 }, { "epoch": 1.3529411764705883, "grad_norm": 0.9430025085055084, "learning_rate": 5.727349577896194e-06, "loss": 0.715, "step": 345 }, { "epoch": 1.3725490196078431, "grad_norm": 0.9520104411677209, "learning_rate": 5.420500263813141e-06, "loss": 0.7423, "step": 350 }, { "epoch": 1.392156862745098, "grad_norm": 0.9533462340832225, "learning_rate": 5.119013730401152e-06, "loss": 0.7206, "step": 355 }, { "epoch": 1.4117647058823528, "grad_norm": 0.944013138090968, "learning_rate": 4.823243030667576e-06, "loss": 0.7112, "step": 360 }, { "epoch": 1.4313725490196079, "grad_norm": 0.9654739136152124, "learning_rate": 4.533534524145756e-06, "loss": 0.7092, "step": 365 }, { "epoch": 1.4509803921568627, "grad_norm": 1.0325541079170395, "learning_rate": 4.2502274712939355e-06, "loss": 0.7218, "step": 370 }, { "epoch": 1.4705882352941178, "grad_norm": 0.945807759954701, "learning_rate": 3.973653636207437e-06, "loss": 0.7304, "step": 375 }, { "epoch": 1.4901960784313726, "grad_norm": 0.9339565355678595, "learning_rate": 3.704136898109403e-06, "loss": 0.723, "step": 380 }, { "epoch": 1.5098039215686274, "grad_norm": 0.9680921345423967, "learning_rate": 3.4419928720750274e-06, "loss": 0.7382, "step": 385 }, { "epoch": 1.5294117647058822, "grad_norm": 0.9738097860413395, "learning_rate": 3.1875285394334575e-06, "loss": 0.7129, "step": 390 }, { "epoch": 1.5490196078431373, "grad_norm": 0.9541739057939187, "learning_rate": 2.9410418882801682e-06, "loss": 0.7392, "step": 395 }, { "epoch": 1.5686274509803921, "grad_norm": 0.9315613334411681, "learning_rate": 2.702821564520732e-06, "loss": 0.7323, "step": 400 }, { "epoch": 1.5882352941176472, "grad_norm": 0.9959745037027158, "learning_rate": 2.4731465338547556e-06, "loss": 0.7501, "step": 405 }, { "epoch": 1.607843137254902, "grad_norm": 0.9359344044277552, "learning_rate": 2.252285755095652e-06, "loss": 0.733, "step": 410 }, { "epoch": 1.6274509803921569, "grad_norm": 0.9222172843750491, "learning_rate": 2.0404978652089325e-06, "loss": 0.6955, "step": 415 }, { "epoch": 1.6470588235294117, "grad_norm": 0.9295279337365954, "learning_rate": 1.8380308764377841e-06, "loss": 0.7209, "step": 420 }, { "epoch": 1.6666666666666665, "grad_norm": 0.9978286311378533, "learning_rate": 1.6451218858706374e-06, "loss": 0.7229, "step": 425 }, { "epoch": 1.6862745098039216, "grad_norm": 0.9673841209469143, "learning_rate": 1.4619967977908157e-06, "loss": 0.7197, "step": 430 }, { "epoch": 1.7058823529411766, "grad_norm": 0.9537359526264119, "learning_rate": 1.2888700591334225e-06, "loss": 0.7144, "step": 435 }, { "epoch": 1.7254901960784315, "grad_norm": 0.9009373079713732, "learning_rate": 1.1259444083592585e-06, "loss": 0.7078, "step": 440 }, { "epoch": 1.7450980392156863, "grad_norm": 0.9618775347330766, "learning_rate": 9.734106380398022e-07, "loss": 0.743, "step": 445 }, { "epoch": 1.7647058823529411, "grad_norm": 0.951269838967216, "learning_rate": 8.31447371431372e-07, "loss": 0.7278, "step": 450 }, { "epoch": 1.784313725490196, "grad_norm": 0.912258612261103, "learning_rate": 7.002208532999933e-07, "loss": 0.7241, "step": 455 }, { "epoch": 1.803921568627451, "grad_norm": 0.9276161963363886, "learning_rate": 5.798847552420184e-07, "loss": 0.7143, "step": 460 }, { "epoch": 1.8235294117647058, "grad_norm": 0.9666247110758565, "learning_rate": 4.7057999572843516e-07, "loss": 0.7165, "step": 465 }, { "epoch": 1.843137254901961, "grad_norm": 0.9040376476718873, "learning_rate": 3.7243457508358784e-07, "loss": 0.7192, "step": 470 }, { "epoch": 1.8627450980392157, "grad_norm": 0.9529298475220304, "learning_rate": 2.8556342559159513e-07, "loss": 0.7213, "step": 475 }, { "epoch": 1.8823529411764706, "grad_norm": 0.9373675816598103, "learning_rate": 2.1006827690595478e-07, "loss": 0.7247, "step": 480 }, { "epoch": 1.9019607843137254, "grad_norm": 0.9049110221925867, "learning_rate": 1.4603753691998735e-07, "loss": 0.7217, "step": 485 }, { "epoch": 1.9215686274509802, "grad_norm": 1.0200136766624364, "learning_rate": 9.354618823758654e-08, "loss": 0.712, "step": 490 }, { "epoch": 1.9411764705882353, "grad_norm": 1.0048113834797883, "learning_rate": 5.265570036553813e-08, "loss": 0.7337, "step": 495 }, { "epoch": 1.9607843137254903, "grad_norm": 0.9199685223066211, "learning_rate": 2.3413957730226144e-08, "loss": 0.7121, "step": 500 }, { "epoch": 1.9803921568627452, "grad_norm": 0.8894954442490418, "learning_rate": 5.855203603017945e-09, "loss": 0.7124, "step": 505 }, { "epoch": 2.0, "grad_norm": 0.9330570772451235, "learning_rate": 0.0, "loss": 0.7276, "step": 510 }, { "epoch": 2.0, "eval_loss": 1.0487182140350342, "eval_runtime": 5.9345, "eval_samples_per_second": 26.961, "eval_steps_per_second": 0.506, "step": 510 }, { "epoch": 2.0, "step": 510, "total_flos": 106783624396800.0, "train_loss": 0.9181881465163886, "train_runtime": 3659.086, "train_samples_per_second": 8.919, "train_steps_per_second": 0.139 } ], "logging_steps": 5, "max_steps": 510, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 106783624396800.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }