{ "best_metric": null, "best_model_checkpoint": null, "epoch": 51.54639175257732, "eval_steps": 500, "global_step": 25000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 1.0309278350515463, "grad_norm": 4.000246524810791, "learning_rate": 3.4364261168384886e-06, "loss": 8.264, "step": 500 }, { "epoch": 2.0618556701030926, "grad_norm": 2.342319965362549, "learning_rate": 6.872852233676977e-06, "loss": 6.6664, "step": 1000 }, { "epoch": 3.0927835051546393, "grad_norm": 2.5340118408203125, "learning_rate": 1.0309278350515464e-05, "loss": 5.3366, "step": 1500 }, { "epoch": 4.123711340206185, "grad_norm": 2.5916709899902344, "learning_rate": 1.3745704467353954e-05, "loss": 4.4002, "step": 2000 }, { "epoch": 5.154639175257732, "grad_norm": 3.448418378829956, "learning_rate": 1.7182130584192442e-05, "loss": 3.8367, "step": 2500 }, { "epoch": 6.185567010309279, "grad_norm": 3.4657623767852783, "learning_rate": 2.0618556701030927e-05, "loss": 3.4293, "step": 3000 }, { "epoch": 7.216494845360825, "grad_norm": 4.080520153045654, "learning_rate": 2.4054982817869417e-05, "loss": 3.1112, "step": 3500 }, { "epoch": 8.24742268041237, "grad_norm": 3.7761707305908203, "learning_rate": 2.749140893470791e-05, "loss": 2.8723, "step": 4000 }, { "epoch": 9.278350515463918, "grad_norm": 3.7703592777252197, "learning_rate": 3.0927835051546395e-05, "loss": 2.6786, "step": 4500 }, { "epoch": 10.309278350515465, "grad_norm": 2.716610908508301, "learning_rate": 3.4364261168384884e-05, "loss": 2.5182, "step": 5000 }, { "epoch": 11.34020618556701, "grad_norm": 3.0980777740478516, "learning_rate": 3.7800687285223366e-05, "loss": 2.388, "step": 5500 }, { "epoch": 12.371134020618557, "grad_norm": 2.8109469413757324, "learning_rate": 4.1237113402061855e-05, "loss": 2.271, "step": 6000 }, { "epoch": 13.402061855670103, "grad_norm": 2.512942314147949, "learning_rate": 4.466666666666667e-05, "loss": 2.1741, "step": 6500 }, { "epoch": 14.43298969072165, "grad_norm": 2.585681200027466, "learning_rate": 4.810309278350515e-05, "loss": 2.0924, "step": 7000 }, { "epoch": 15.463917525773196, "grad_norm": 2.4047884941101074, "learning_rate": 5.153951890034364e-05, "loss": 2.011, "step": 7500 }, { "epoch": 16.49484536082474, "grad_norm": 2.4080662727355957, "learning_rate": 5.497594501718213e-05, "loss": 1.946, "step": 8000 }, { "epoch": 17.52577319587629, "grad_norm": 2.2564289569854736, "learning_rate": 5.840549828178694e-05, "loss": 1.8899, "step": 8500 }, { "epoch": 18.556701030927837, "grad_norm": 2.0903544425964355, "learning_rate": 6.184192439862543e-05, "loss": 1.8404, "step": 9000 }, { "epoch": 19.587628865979383, "grad_norm": 2.0401394367218018, "learning_rate": 6.527835051546391e-05, "loss": 1.7894, "step": 9500 }, { "epoch": 20.61855670103093, "grad_norm": 2.026660919189453, "learning_rate": 6.87147766323024e-05, "loss": 1.7505, "step": 10000 }, { "epoch": 21.649484536082475, "grad_norm": 1.9435638189315796, "learning_rate": 7.21512027491409e-05, "loss": 1.7097, "step": 10500 }, { "epoch": 22.68041237113402, "grad_norm": 1.9047859907150269, "learning_rate": 7.558075601374571e-05, "loss": 1.6748, "step": 11000 }, { "epoch": 23.711340206185568, "grad_norm": 1.8212696313858032, "learning_rate": 7.90171821305842e-05, "loss": 1.64, "step": 11500 }, { "epoch": 24.742268041237114, "grad_norm": 1.7629321813583374, "learning_rate": 8.245360824742269e-05, "loss": 1.6115, "step": 12000 }, { "epoch": 25.77319587628866, "grad_norm": 1.7129898071289062, "learning_rate": 8.589003436426117e-05, "loss": 1.5824, "step": 12500 }, { "epoch": 26.804123711340207, "grad_norm": 1.6478886604309082, "learning_rate": 8.932646048109967e-05, "loss": 1.5537, "step": 13000 }, { "epoch": 27.835051546391753, "grad_norm": 1.6928553581237793, "learning_rate": 9.276288659793815e-05, "loss": 1.528, "step": 13500 }, { "epoch": 28.8659793814433, "grad_norm": 1.5595742464065552, "learning_rate": 9.619931271477663e-05, "loss": 1.5037, "step": 14000 }, { "epoch": 29.896907216494846, "grad_norm": 1.6010233163833618, "learning_rate": 9.963573883161513e-05, "loss": 1.4804, "step": 14500 }, { "epoch": 30.927835051546392, "grad_norm": 1.4490132331848145, "learning_rate": 9.99574236655172e-05, "loss": 1.4564, "step": 15000 }, { "epoch": 31.95876288659794, "grad_norm": 1.449069857597351, "learning_rate": 9.980854570375779e-05, "loss": 1.4322, "step": 15500 }, { "epoch": 32.98969072164948, "grad_norm": 1.4955676794052124, "learning_rate": 9.955306053101556e-05, "loss": 1.4091, "step": 16000 }, { "epoch": 34.02061855670103, "grad_norm": 1.4652965068817139, "learning_rate": 9.919234343505417e-05, "loss": 1.3867, "step": 16500 }, { "epoch": 35.05154639175258, "grad_norm": 1.4383347034454346, "learning_rate": 9.872572097155327e-05, "loss": 1.367, "step": 17000 }, { "epoch": 36.08247422680412, "grad_norm": 1.3232207298278809, "learning_rate": 9.815480890990188e-05, "loss": 1.3452, "step": 17500 }, { "epoch": 37.11340206185567, "grad_norm": 1.3220405578613281, "learning_rate": 9.748082919588761e-05, "loss": 1.3265, "step": 18000 }, { "epoch": 38.144329896907216, "grad_norm": 1.2943527698516846, "learning_rate": 9.670522437509286e-05, "loss": 1.3091, "step": 18500 }, { "epoch": 39.175257731958766, "grad_norm": 1.3372294902801514, "learning_rate": 9.582965450535715e-05, "loss": 1.2913, "step": 19000 }, { "epoch": 40.20618556701031, "grad_norm": 1.359191656112671, "learning_rate": 9.485599360368925e-05, "loss": 1.2745, "step": 19500 }, { "epoch": 41.23711340206186, "grad_norm": 1.3238861560821533, "learning_rate": 9.378632563523418e-05, "loss": 1.2581, "step": 20000 }, { "epoch": 42.2680412371134, "grad_norm": 1.311515212059021, "learning_rate": 9.262535873205258e-05, "loss": 1.2448, "step": 20500 }, { "epoch": 43.29896907216495, "grad_norm": 1.3535444736480713, "learning_rate": 9.137092541559738e-05, "loss": 1.2271, "step": 21000 }, { "epoch": 44.329896907216494, "grad_norm": 1.3142534494400024, "learning_rate": 9.002794426238008e-05, "loss": 1.2132, "step": 21500 }, { "epoch": 45.36082474226804, "grad_norm": 1.3405091762542725, "learning_rate": 8.859928970836587e-05, "loss": 1.1984, "step": 22000 }, { "epoch": 46.391752577319586, "grad_norm": 1.339012861251831, "learning_rate": 8.709112242917366e-05, "loss": 1.1846, "step": 22500 }, { "epoch": 47.422680412371136, "grad_norm": 1.336722493171692, "learning_rate": 8.550062673710893e-05, "loss": 1.1725, "step": 23000 }, { "epoch": 48.45360824742268, "grad_norm": 1.2952606678009033, "learning_rate": 8.383414764197326e-05, "loss": 1.1577, "step": 23500 }, { "epoch": 49.48453608247423, "grad_norm": 1.2937185764312744, "learning_rate": 8.209525197524074e-05, "loss": 1.1451, "step": 24000 }, { "epoch": 50.51546391752577, "grad_norm": 1.3922706842422485, "learning_rate": 8.029134275478738e-05, "loss": 1.1334, "step": 24500 }, { "epoch": 51.54639175257732, "grad_norm": 1.2733055353164673, "learning_rate": 7.841905215460069e-05, "loss": 1.1203, "step": 25000 } ], "logging_steps": 500, "max_steps": 48500, "num_input_tokens_seen": 0, "num_train_epochs": 100, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.046659236626432e+17, "train_batch_size": 16, "trial_name": null, "trial_params": null }