{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9842696629213483, "eval_steps": 500, "global_step": 498, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0599250936329588, "grad_norm": 0.43755268500481287, "learning_rate": 5e-06, "loss": 0.7166, "step": 10 }, { "epoch": 0.1198501872659176, "grad_norm": 0.27887068997618736, "learning_rate": 5e-06, "loss": 0.6497, "step": 20 }, { "epoch": 0.1797752808988764, "grad_norm": 0.23086038315964058, "learning_rate": 5e-06, "loss": 0.6345, "step": 30 }, { "epoch": 0.2397003745318352, "grad_norm": 0.2341891068390615, "learning_rate": 5e-06, "loss": 0.6191, "step": 40 }, { "epoch": 0.299625468164794, "grad_norm": 0.20601260069861693, "learning_rate": 5e-06, "loss": 0.6116, "step": 50 }, { "epoch": 0.3595505617977528, "grad_norm": 0.20563409032056135, "learning_rate": 5e-06, "loss": 0.5996, "step": 60 }, { "epoch": 0.41947565543071164, "grad_norm": 0.22531739495172456, "learning_rate": 5e-06, "loss": 0.6009, "step": 70 }, { "epoch": 0.4794007490636704, "grad_norm": 0.20987173643198698, "learning_rate": 5e-06, "loss": 0.5868, "step": 80 }, { "epoch": 0.5393258426966292, "grad_norm": 0.218100189832543, "learning_rate": 5e-06, "loss": 0.5867, "step": 90 }, { "epoch": 0.599250936329588, "grad_norm": 0.20699834408218468, "learning_rate": 5e-06, "loss": 0.5742, "step": 100 }, { "epoch": 0.6591760299625468, "grad_norm": 0.21651508965442265, "learning_rate": 5e-06, "loss": 0.5793, "step": 110 }, { "epoch": 0.7191011235955056, "grad_norm": 0.21505752361824734, "learning_rate": 5e-06, "loss": 0.5758, "step": 120 }, { "epoch": 0.7790262172284644, "grad_norm": 0.20535093449616573, "learning_rate": 5e-06, "loss": 0.5722, "step": 130 }, { "epoch": 0.8389513108614233, "grad_norm": 0.21007383376279185, "learning_rate": 5e-06, "loss": 0.566, "step": 140 }, { "epoch": 0.898876404494382, "grad_norm": 0.20414838879898575, "learning_rate": 5e-06, "loss": 0.5683, "step": 150 }, { "epoch": 0.9588014981273408, "grad_norm": 0.2070485319078679, "learning_rate": 5e-06, "loss": 0.5616, "step": 160 }, { "epoch": 0.9947565543071161, "eval_loss": 0.564882755279541, "eval_runtime": 171.7377, "eval_samples_per_second": 26.179, "eval_steps_per_second": 0.413, "step": 166 }, { "epoch": 1.0187265917602997, "grad_norm": 0.2501598895319095, "learning_rate": 5e-06, "loss": 0.5638, "step": 170 }, { "epoch": 1.0786516853932584, "grad_norm": 0.21286574219603713, "learning_rate": 5e-06, "loss": 0.5375, "step": 180 }, { "epoch": 1.1385767790262173, "grad_norm": 0.21450560334810348, "learning_rate": 5e-06, "loss": 0.5492, "step": 190 }, { "epoch": 1.198501872659176, "grad_norm": 0.24697885639857056, "learning_rate": 5e-06, "loss": 0.5372, "step": 200 }, { "epoch": 1.2584269662921348, "grad_norm": 0.22507578960487684, "learning_rate": 5e-06, "loss": 0.5421, "step": 210 }, { "epoch": 1.3183520599250937, "grad_norm": 0.215757150388599, "learning_rate": 5e-06, "loss": 0.5343, "step": 220 }, { "epoch": 1.3782771535580525, "grad_norm": 0.20852383609091527, "learning_rate": 5e-06, "loss": 0.5342, "step": 230 }, { "epoch": 1.4382022471910112, "grad_norm": 0.2303655417835911, "learning_rate": 5e-06, "loss": 0.5323, "step": 240 }, { "epoch": 1.4981273408239701, "grad_norm": 0.2234138263551834, "learning_rate": 5e-06, "loss": 0.531, "step": 250 }, { "epoch": 1.5580524344569289, "grad_norm": 0.2169924894523469, "learning_rate": 5e-06, "loss": 0.5223, "step": 260 }, { "epoch": 1.6179775280898876, "grad_norm": 0.23687218077177816, "learning_rate": 5e-06, "loss": 0.5353, "step": 270 }, { "epoch": 1.6779026217228465, "grad_norm": 0.21583840528024137, "learning_rate": 5e-06, "loss": 0.5294, "step": 280 }, { "epoch": 1.7378277153558053, "grad_norm": 0.23534186766123893, "learning_rate": 5e-06, "loss": 0.5295, "step": 290 }, { "epoch": 1.797752808988764, "grad_norm": 0.22832718342353497, "learning_rate": 5e-06, "loss": 0.5234, "step": 300 }, { "epoch": 1.857677902621723, "grad_norm": 0.2182245052548769, "learning_rate": 5e-06, "loss": 0.5254, "step": 310 }, { "epoch": 1.9176029962546817, "grad_norm": 0.2339332310716746, "learning_rate": 5e-06, "loss": 0.527, "step": 320 }, { "epoch": 1.9775280898876404, "grad_norm": 0.22377767747656874, "learning_rate": 5e-06, "loss": 0.5224, "step": 330 }, { "epoch": 1.9955056179775281, "eval_loss": 0.5431503653526306, "eval_runtime": 172.1699, "eval_samples_per_second": 26.114, "eval_steps_per_second": 0.412, "step": 333 }, { "epoch": 2.0374531835205993, "grad_norm": 0.22333193029912074, "learning_rate": 5e-06, "loss": 0.5163, "step": 340 }, { "epoch": 2.097378277153558, "grad_norm": 0.21868652020103954, "learning_rate": 5e-06, "loss": 0.4937, "step": 350 }, { "epoch": 2.157303370786517, "grad_norm": 0.28765166235887135, "learning_rate": 5e-06, "loss": 0.4951, "step": 360 }, { "epoch": 2.2172284644194757, "grad_norm": 0.22543561231331818, "learning_rate": 5e-06, "loss": 0.4991, "step": 370 }, { "epoch": 2.2771535580524347, "grad_norm": 0.2326538158591729, "learning_rate": 5e-06, "loss": 0.4958, "step": 380 }, { "epoch": 2.337078651685393, "grad_norm": 0.22438885993310223, "learning_rate": 5e-06, "loss": 0.4951, "step": 390 }, { "epoch": 2.397003745318352, "grad_norm": 0.21898245588065546, "learning_rate": 5e-06, "loss": 0.5016, "step": 400 }, { "epoch": 2.4569288389513106, "grad_norm": 0.23740177057741296, "learning_rate": 5e-06, "loss": 0.4962, "step": 410 }, { "epoch": 2.5168539325842696, "grad_norm": 0.2247697624740929, "learning_rate": 5e-06, "loss": 0.5034, "step": 420 }, { "epoch": 2.5767790262172285, "grad_norm": 0.21874023100759993, "learning_rate": 5e-06, "loss": 0.4974, "step": 430 }, { "epoch": 2.6367041198501875, "grad_norm": 0.21718625410004955, "learning_rate": 5e-06, "loss": 0.4986, "step": 440 }, { "epoch": 2.696629213483146, "grad_norm": 0.224073653725935, "learning_rate": 5e-06, "loss": 0.4939, "step": 450 }, { "epoch": 2.756554307116105, "grad_norm": 0.2254630995617039, "learning_rate": 5e-06, "loss": 0.4925, "step": 460 }, { "epoch": 2.8164794007490634, "grad_norm": 0.22113881489914033, "learning_rate": 5e-06, "loss": 0.4924, "step": 470 }, { "epoch": 2.8764044943820224, "grad_norm": 0.2288326125817775, "learning_rate": 5e-06, "loss": 0.4964, "step": 480 }, { "epoch": 2.9363295880149813, "grad_norm": 0.23496214308685273, "learning_rate": 5e-06, "loss": 0.4975, "step": 490 }, { "epoch": 2.9842696629213483, "eval_loss": 0.532910943031311, "eval_runtime": 172.5554, "eval_samples_per_second": 26.055, "eval_steps_per_second": 0.411, "step": 498 }, { "epoch": 2.9842696629213483, "step": 498, "total_flos": 833959168573440.0, "train_loss": 0.5430538529851829, "train_runtime": 28026.0355, "train_samples_per_second": 9.142, "train_steps_per_second": 0.018 } ], "logging_steps": 10, "max_steps": 498, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 833959168573440.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }