{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.926829268292683, "eval_steps": 500, "global_step": 90, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03, "grad_norm": 8.280744552612305, "learning_rate": 6.666666666666667e-06, "loss": 1.565, "step": 1 }, { "epoch": 0.07, "grad_norm": 8.70754337310791, "learning_rate": 1.3333333333333333e-05, "loss": 1.6161, "step": 2 }, { "epoch": 0.1, "grad_norm": 8.472146987915039, "learning_rate": 2e-05, "loss": 1.5893, "step": 3 }, { "epoch": 0.13, "grad_norm": 5.23038911819458, "learning_rate": 1.999348095389677e-05, "loss": 1.3983, "step": 4 }, { "epoch": 0.16, "grad_norm": 2.8397700786590576, "learning_rate": 1.9973932315179502e-05, "loss": 1.2206, "step": 5 }, { "epoch": 0.2, "grad_norm": 2.181596040725708, "learning_rate": 1.9941379571543597e-05, "loss": 1.1032, "step": 6 }, { "epoch": 0.23, "grad_norm": 1.1165341138839722, "learning_rate": 1.9895865165556375e-05, "loss": 1.065, "step": 7 }, { "epoch": 0.26, "grad_norm": 0.8952399492263794, "learning_rate": 1.9837448439320027e-05, "loss": 1.035, "step": 8 }, { "epoch": 0.29, "grad_norm": 0.7685825824737549, "learning_rate": 1.976620555710087e-05, "loss": 0.9635, "step": 9 }, { "epoch": 0.33, "grad_norm": 0.7761624455451965, "learning_rate": 1.9682229406025635e-05, "loss": 0.9618, "step": 10 }, { "epoch": 0.36, "grad_norm": 0.7371810078620911, "learning_rate": 1.9585629474974413e-05, "loss": 0.9613, "step": 11 }, { "epoch": 0.39, "grad_norm": 0.6214499473571777, "learning_rate": 1.9476531711828027e-05, "loss": 0.9501, "step": 12 }, { "epoch": 0.42, "grad_norm": 0.5390974879264832, "learning_rate": 1.935507835925601e-05, "loss": 0.964, "step": 13 }, { "epoch": 0.46, "grad_norm": 0.5018035769462585, "learning_rate": 1.9221427769259333e-05, "loss": 0.9304, "step": 14 }, { "epoch": 0.49, "grad_norm": 0.526158332824707, "learning_rate": 1.9075754196709574e-05, "loss": 0.9554, "step": 15 }, { "epoch": 0.52, "grad_norm": 0.5072513222694397, "learning_rate": 1.8918247572153822e-05, "loss": 0.9078, "step": 16 }, { "epoch": 0.55, "grad_norm": 0.4988534450531006, "learning_rate": 1.8749113254181498e-05, "loss": 0.9291, "step": 17 }, { "epoch": 0.59, "grad_norm": 0.4912777841091156, "learning_rate": 1.8568571761675893e-05, "loss": 0.9626, "step": 18 }, { "epoch": 0.62, "grad_norm": 0.5130652785301208, "learning_rate": 1.837685848629965e-05, "loss": 0.9074, "step": 19 }, { "epoch": 0.65, "grad_norm": 0.4877808690071106, "learning_rate": 1.817422338558892e-05, "loss": 0.8972, "step": 20 }, { "epoch": 0.68, "grad_norm": 0.48035475611686707, "learning_rate": 1.796093065705644e-05, "loss": 0.9162, "step": 21 }, { "epoch": 0.72, "grad_norm": 0.45466920733451843, "learning_rate": 1.7737258393728363e-05, "loss": 0.9067, "step": 22 }, { "epoch": 0.75, "grad_norm": 0.49428102374076843, "learning_rate": 1.7503498221564026e-05, "loss": 0.9073, "step": 23 }, { "epoch": 0.78, "grad_norm": 0.49855947494506836, "learning_rate": 1.725995491923131e-05, "loss": 0.941, "step": 24 }, { "epoch": 0.81, "grad_norm": 0.4245299696922302, "learning_rate": 1.7006946020733426e-05, "loss": 0.8765, "step": 25 }, { "epoch": 0.85, "grad_norm": 0.45259377360343933, "learning_rate": 1.6744801401405138e-05, "loss": 0.8845, "step": 26 }, { "epoch": 0.88, "grad_norm": 0.45279204845428467, "learning_rate": 1.647386284781828e-05, "loss": 0.9069, "step": 27 }, { "epoch": 0.91, "grad_norm": 0.4377584159374237, "learning_rate": 1.6194483612157232e-05, "loss": 
0.8889, "step": 28 }, { "epoch": 0.94, "grad_norm": 0.44903799891471863, "learning_rate": 1.590702795164551e-05, "loss": 0.8981, "step": 29 }, { "epoch": 0.98, "grad_norm": 0.44891077280044556, "learning_rate": 1.5611870653623826e-05, "loss": 0.8773, "step": 30 }, { "epoch": 1.01, "grad_norm": 0.49123069643974304, "learning_rate": 1.530939654689887e-05, "loss": 0.8863, "step": 31 }, { "epoch": 1.04, "grad_norm": 0.49654659628868103, "learning_rate": 1.5000000000000002e-05, "loss": 0.7918, "step": 32 }, { "epoch": 1.07, "grad_norm": 0.4908214807510376, "learning_rate": 1.4684084406997903e-05, "loss": 0.761, "step": 33 }, { "epoch": 1.11, "grad_norm": 0.45682042837142944, "learning_rate": 1.4362061661555675e-05, "loss": 0.781, "step": 34 }, { "epoch": 1.14, "grad_norm": 0.466132253408432, "learning_rate": 1.4034351619898088e-05, "loss": 0.7392, "step": 35 }, { "epoch": 1.17, "grad_norm": 0.4905536472797394, "learning_rate": 1.3701381553399147e-05, "loss": 0.763, "step": 36 }, { "epoch": 1.2, "grad_norm": 0.48623326420783997, "learning_rate": 1.3363585591501751e-05, "loss": 0.7628, "step": 37 }, { "epoch": 1.24, "grad_norm": 0.48992201685905457, "learning_rate": 1.3021404155695728e-05, "loss": 0.7798, "step": 38 }, { "epoch": 1.27, "grad_norm": 0.45171797275543213, "learning_rate": 1.2675283385292212e-05, "loss": 0.7196, "step": 39 }, { "epoch": 1.3, "grad_norm": 0.48216456174850464, "learning_rate": 1.2325674555743106e-05, "loss": 0.7346, "step": 40 }, { "epoch": 1.33, "grad_norm": 0.44965651631355286, "learning_rate": 1.1973033490264e-05, "loss": 0.7058, "step": 41 }, { "epoch": 1.37, "grad_norm": 0.5034527778625488, "learning_rate": 1.161781996552765e-05, "loss": 0.7719, "step": 42 }, { "epoch": 1.4, "grad_norm": 0.5173734426498413, "learning_rate": 1.1260497112202895e-05, "loss": 0.745, "step": 43 }, { "epoch": 1.43, "grad_norm": 0.49784746766090393, "learning_rate": 1.0901530811120655e-05, "loss": 0.7376, "step": 44 }, { "epoch": 1.46, "grad_norm": 0.4722931385040283, "learning_rate": 1.0541389085854177e-05, "loss": 0.7419, "step": 45 }, { "epoch": 1.5, "grad_norm": 0.4892752170562744, "learning_rate": 1.0180541492505605e-05, "loss": 0.6948, "step": 46 }, { "epoch": 1.53, "grad_norm": 0.4865082800388336, "learning_rate": 9.819458507494395e-06, "loss": 0.634, "step": 47 }, { "epoch": 1.56, "grad_norm": 0.5185601711273193, "learning_rate": 9.458610914145826e-06, "loss": 0.6824, "step": 48 }, { "epoch": 1.59, "grad_norm": 0.47616997361183167, "learning_rate": 9.098469188879348e-06, "loss": 0.7001, "step": 49 }, { "epoch": 1.63, "grad_norm": 0.49744558334350586, "learning_rate": 8.739502887797108e-06, "loss": 0.7358, "step": 50 }, { "epoch": 1.66, "grad_norm": 0.5162651538848877, "learning_rate": 8.382180034472353e-06, "loss": 0.7079, "step": 51 }, { "epoch": 1.69, "grad_norm": 0.5150533318519592, "learning_rate": 8.026966509736001e-06, "loss": 0.6997, "step": 52 }, { "epoch": 1.72, "grad_norm": 1.0735119581222534, "learning_rate": 7.674325444256899e-06, "loss": 0.7211, "step": 53 }, { "epoch": 1.76, "grad_norm": 0.4927270710468292, "learning_rate": 7.324716614707794e-06, "loss": 0.7257, "step": 54 }, { "epoch": 1.79, "grad_norm": 0.49033042788505554, "learning_rate": 6.978595844304272e-06, "loss": 0.728, "step": 55 }, { "epoch": 1.82, "grad_norm": 0.5927804112434387, "learning_rate": 6.636414408498249e-06, "loss": 0.6914, "step": 56 }, { "epoch": 1.85, "grad_norm": 0.5300837159156799, "learning_rate": 6.298618446600856e-06, "loss": 0.7206, "step": 57 }, { "epoch": 1.89, "grad_norm": 
0.4967361092567444, "learning_rate": 5.965648380101916e-06, "loss": 0.6897, "step": 58 }, { "epoch": 1.92, "grad_norm": 0.5123832821846008, "learning_rate": 5.637938338444325e-06, "loss": 0.7064, "step": 59 }, { "epoch": 1.95, "grad_norm": 0.5195019245147705, "learning_rate": 5.3159155930021e-06, "loss": 0.6979, "step": 60 }, { "epoch": 1.98, "grad_norm": 0.5248487591743469, "learning_rate": 5.000000000000003e-06, "loss": 0.6823, "step": 61 }, { "epoch": 2.02, "grad_norm": 0.5308733582496643, "learning_rate": 4.690603453101134e-06, "loss": 0.6827, "step": 62 }, { "epoch": 2.05, "grad_norm": 0.5279427766799927, "learning_rate": 4.388129346376177e-06, "loss": 0.6421, "step": 63 }, { "epoch": 2.08, "grad_norm": 0.4864795207977295, "learning_rate": 4.092972048354491e-06, "loss": 0.5883, "step": 64 }, { "epoch": 2.11, "grad_norm": 0.4978441894054413, "learning_rate": 3.8055163878427703e-06, "loss": 0.5958, "step": 65 }, { "epoch": 2.15, "grad_norm": 0.5329935550689697, "learning_rate": 3.5261371521817247e-06, "loss": 0.6346, "step": 66 }, { "epoch": 2.18, "grad_norm": 0.4853770434856415, "learning_rate": 3.255198598594862e-06, "loss": 0.5986, "step": 67 }, { "epoch": 2.21, "grad_norm": 0.5128601789474487, "learning_rate": 2.9930539792665767e-06, "loss": 0.6008, "step": 68 }, { "epoch": 2.24, "grad_norm": 0.5175537467002869, "learning_rate": 2.740045080768694e-06, "loss": 0.5825, "step": 69 }, { "epoch": 2.28, "grad_norm": 0.5446229577064514, "learning_rate": 2.496501778435977e-06, "loss": 0.6201, "step": 70 }, { "epoch": 2.31, "grad_norm": 0.5520351529121399, "learning_rate": 2.2627416062716366e-06, "loss": 0.6502, "step": 71 }, { "epoch": 2.34, "grad_norm": 0.5046796202659607, "learning_rate": 2.0390693429435626e-06, "loss": 0.5877, "step": 72 }, { "epoch": 2.37, "grad_norm": 0.5116910338401794, "learning_rate": 1.8257766144110823e-06, "loss": 0.5992, "step": 73 }, { "epoch": 2.41, "grad_norm": 0.5164862871170044, "learning_rate": 1.6231415137003536e-06, "loss": 0.5826, "step": 74 }, { "epoch": 2.44, "grad_norm": 0.5107815265655518, "learning_rate": 1.4314282383241097e-06, "loss": 0.5679, "step": 75 }, { "epoch": 2.47, "grad_norm": 0.5102241039276123, "learning_rate": 1.2508867458185037e-06, "loss": 0.5972, "step": 76 }, { "epoch": 2.5, "grad_norm": 0.5415460467338562, "learning_rate": 1.0817524278461777e-06, "loss": 0.5738, "step": 77 }, { "epoch": 2.54, "grad_norm": 0.525007426738739, "learning_rate": 9.242458032904311e-07, "loss": 0.592, "step": 78 }, { "epoch": 2.57, "grad_norm": 0.48747336864471436, "learning_rate": 7.785722307406685e-07, "loss": 0.5858, "step": 79 }, { "epoch": 2.6, "grad_norm": 0.5466867089271545, "learning_rate": 6.449216407439906e-07, "loss": 0.6307, "step": 80 }, { "epoch": 2.63, "grad_norm": 0.529702365398407, "learning_rate": 5.234682881719766e-07, "loss": 0.6177, "step": 81 }, { "epoch": 2.67, "grad_norm": 0.550619900226593, "learning_rate": 4.1437052502558693e-07, "loss": 0.6064, "step": 82 }, { "epoch": 2.7, "grad_norm": 0.5231672525405884, "learning_rate": 3.1777059397436693e-07, "loss": 0.6194, "step": 83 }, { "epoch": 2.73, "grad_norm": 0.506498396396637, "learning_rate": 2.3379444289913344e-07, "loss": 0.6191, "step": 84 }, { "epoch": 2.76, "grad_norm": 0.5445461273193359, "learning_rate": 1.6255156067997325e-07, "loss": 0.5635, "step": 85 }, { "epoch": 2.8, "grad_norm": 0.5693822503089905, "learning_rate": 1.041348344436277e-07, "loss": 0.6426, "step": 86 }, { "epoch": 2.83, "grad_norm": 0.5474693775177002, "learning_rate": 5.862042845640403e-08, "loss": 
0.6243, "step": 87 }, { "epoch": 2.86, "grad_norm": 0.5175514221191406, "learning_rate": 2.606768482050215e-08, "loss": 0.5747, "step": 88 }, { "epoch": 2.89, "grad_norm": 0.5419387221336365, "learning_rate": 6.5190461032305085e-09, "loss": 0.6008, "step": 89 }, { "epoch": 2.93, "grad_norm": 0.4993443787097931, "learning_rate": 0.0, "loss": 0.6343, "step": 90 }, { "epoch": 2.93, "step": 90, "total_flos": 2.582894619708621e+16, "train_loss": 0.7904547598626879, "train_runtime": 253.8743, "train_samples_per_second": 46.381, "train_steps_per_second": 0.355 } ], "logging_steps": 1.0, "max_steps": 90, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 1000, "total_flos": 2.582894619708621e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }