{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.18726591760299627, "eval_steps": 13, "global_step": 50, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.003745318352059925, "grad_norm": 1.4232933521270752, "learning_rate": 1e-05, "loss": 2.6144, "step": 1 }, { "epoch": 0.003745318352059925, "eval_loss": 2.7996814250946045, "eval_runtime": 168.8282, "eval_samples_per_second": 2.665, "eval_steps_per_second": 1.333, "step": 1 }, { "epoch": 0.00749063670411985, "grad_norm": 1.3964163064956665, "learning_rate": 2e-05, "loss": 2.6416, "step": 2 }, { "epoch": 0.011235955056179775, "grad_norm": 1.4900565147399902, "learning_rate": 3e-05, "loss": 2.6927, "step": 3 }, { "epoch": 0.0149812734082397, "grad_norm": 1.2893002033233643, "learning_rate": 4e-05, "loss": 2.2962, "step": 4 }, { "epoch": 0.018726591760299626, "grad_norm": 1.5299968719482422, "learning_rate": 5e-05, "loss": 2.997, "step": 5 }, { "epoch": 0.02247191011235955, "grad_norm": 1.5312691926956177, "learning_rate": 6e-05, "loss": 2.8766, "step": 6 }, { "epoch": 0.026217228464419477, "grad_norm": 1.6678813695907593, "learning_rate": 7e-05, "loss": 2.7168, "step": 7 }, { "epoch": 0.0299625468164794, "grad_norm": 1.3376212120056152, "learning_rate": 8e-05, "loss": 2.2845, "step": 8 }, { "epoch": 0.033707865168539325, "grad_norm": 1.4087949991226196, "learning_rate": 9e-05, "loss": 2.2124, "step": 9 }, { "epoch": 0.03745318352059925, "grad_norm": 1.4170143604278564, "learning_rate": 0.0001, "loss": 2.3934, "step": 10 }, { "epoch": 0.04119850187265917, "grad_norm": 1.392784833908081, "learning_rate": 9.98458666866564e-05, "loss": 2.1857, "step": 11 }, { "epoch": 0.0449438202247191, "grad_norm": 1.4366754293441772, "learning_rate": 9.938441702975689e-05, "loss": 1.9748, "step": 12 }, { "epoch": 0.04868913857677903, "grad_norm": 1.5409985780715942, "learning_rate": 9.861849601988383e-05, "loss": 1.9675, "step": 13 }, { "epoch": 0.04868913857677903, "eval_loss": 1.975732684135437, "eval_runtime": 14.12, "eval_samples_per_second": 31.87, "eval_steps_per_second": 15.935, "step": 13 }, { "epoch": 0.052434456928838954, "grad_norm": 1.596232533454895, "learning_rate": 9.755282581475769e-05, "loss": 1.8742, "step": 14 }, { "epoch": 0.056179775280898875, "grad_norm": 1.6847773790359497, "learning_rate": 9.619397662556435e-05, "loss": 1.9345, "step": 15 }, { "epoch": 0.0599250936329588, "grad_norm": 1.5066741704940796, "learning_rate": 9.45503262094184e-05, "loss": 1.7222, "step": 16 }, { "epoch": 0.06367041198501873, "grad_norm": 1.4937152862548828, "learning_rate": 9.263200821770461e-05, "loss": 1.8033, "step": 17 }, { "epoch": 0.06741573033707865, "grad_norm": 1.9379730224609375, "learning_rate": 9.045084971874738e-05, "loss": 1.8392, "step": 18 }, { "epoch": 0.07116104868913857, "grad_norm": 1.7714548110961914, "learning_rate": 8.802029828000156e-05, "loss": 1.7254, "step": 19 }, { "epoch": 0.0749063670411985, "grad_norm": 1.6995341777801514, "learning_rate": 8.535533905932738e-05, "loss": 1.5896, "step": 20 }, { "epoch": 0.07865168539325842, "grad_norm": 1.6350839138031006, "learning_rate": 8.247240241650918e-05, "loss": 1.8164, "step": 21 }, { "epoch": 0.08239700374531835, "grad_norm": 1.4452717304229736, "learning_rate": 7.938926261462366e-05, "loss": 1.7086, "step": 22 }, { "epoch": 0.08614232209737828, "grad_norm": 1.4310646057128906, "learning_rate": 7.612492823579745e-05, "loss": 1.7038, "step": 23 }, { "epoch": 0.0898876404494382, "grad_norm": 
1.5116603374481201, "learning_rate": 7.269952498697734e-05, "loss": 1.3802, "step": 24 }, { "epoch": 0.09363295880149813, "grad_norm": 1.7284044027328491, "learning_rate": 6.91341716182545e-05, "loss": 1.5872, "step": 25 }, { "epoch": 0.09737827715355805, "grad_norm": 1.4467360973358154, "learning_rate": 6.545084971874738e-05, "loss": 1.464, "step": 26 }, { "epoch": 0.09737827715355805, "eval_loss": 1.513379454612732, "eval_runtime": 14.1358, "eval_samples_per_second": 31.834, "eval_steps_per_second": 15.917, "step": 26 }, { "epoch": 0.10112359550561797, "grad_norm": 1.6498875617980957, "learning_rate": 6.167226819279528e-05, "loss": 1.6355, "step": 27 }, { "epoch": 0.10486891385767791, "grad_norm": 1.5848513841629028, "learning_rate": 5.782172325201155e-05, "loss": 1.4611, "step": 28 }, { "epoch": 0.10861423220973783, "grad_norm": 1.6103304624557495, "learning_rate": 5.392295478639225e-05, "loss": 1.463, "step": 29 }, { "epoch": 0.11235955056179775, "grad_norm": 1.4713010787963867, "learning_rate": 5e-05, "loss": 1.4531, "step": 30 }, { "epoch": 0.11610486891385768, "grad_norm": 1.4418309926986694, "learning_rate": 4.607704521360776e-05, "loss": 1.4607, "step": 31 }, { "epoch": 0.1198501872659176, "grad_norm": 1.3354123830795288, "learning_rate": 4.2178276747988446e-05, "loss": 1.4086, "step": 32 }, { "epoch": 0.12359550561797752, "grad_norm": 1.5024315118789673, "learning_rate": 3.832773180720475e-05, "loss": 1.4571, "step": 33 }, { "epoch": 0.12734082397003746, "grad_norm": 1.4268651008605957, "learning_rate": 3.4549150281252636e-05, "loss": 1.3551, "step": 34 }, { "epoch": 0.13108614232209737, "grad_norm": 1.4591658115386963, "learning_rate": 3.086582838174551e-05, "loss": 1.3186, "step": 35 }, { "epoch": 0.1348314606741573, "grad_norm": 1.5342679023742676, "learning_rate": 2.7300475013022663e-05, "loss": 1.1651, "step": 36 }, { "epoch": 0.13857677902621723, "grad_norm": 1.763529896736145, "learning_rate": 2.3875071764202563e-05, "loss": 1.1908, "step": 37 }, { "epoch": 0.14232209737827714, "grad_norm": 1.6287809610366821, "learning_rate": 2.061073738537635e-05, "loss": 1.4347, "step": 38 }, { "epoch": 0.14606741573033707, "grad_norm": 1.562390923500061, "learning_rate": 1.7527597583490822e-05, "loss": 1.4056, "step": 39 }, { "epoch": 0.14606741573033707, "eval_loss": 1.3499964475631714, "eval_runtime": 14.1251, "eval_samples_per_second": 31.858, "eval_steps_per_second": 15.929, "step": 39 }, { "epoch": 0.149812734082397, "grad_norm": 1.6496071815490723, "learning_rate": 1.4644660940672627e-05, "loss": 1.1467, "step": 40 }, { "epoch": 0.15355805243445692, "grad_norm": 1.5362358093261719, "learning_rate": 1.1979701719998453e-05, "loss": 1.3055, "step": 41 }, { "epoch": 0.15730337078651685, "grad_norm": 1.656622052192688, "learning_rate": 9.549150281252633e-06, "loss": 1.2599, "step": 42 }, { "epoch": 0.16104868913857678, "grad_norm": 1.7673784494400024, "learning_rate": 7.367991782295391e-06, "loss": 1.6338, "step": 43 }, { "epoch": 0.1647940074906367, "grad_norm": 1.6738536357879639, "learning_rate": 5.449673790581611e-06, "loss": 1.3426, "step": 44 }, { "epoch": 0.16853932584269662, "grad_norm": 1.5348960161209106, "learning_rate": 3.8060233744356633e-06, "loss": 1.3081, "step": 45 }, { "epoch": 0.17228464419475656, "grad_norm": 1.4670664072036743, "learning_rate": 2.4471741852423237e-06, "loss": 1.2674, "step": 46 }, { "epoch": 0.1760299625468165, "grad_norm": 1.5279347896575928, "learning_rate": 1.3815039801161721e-06, "loss": 1.3096, "step": 47 }, { "epoch": 0.1797752808988764, 
"grad_norm": 1.749185562133789, "learning_rate": 6.15582970243117e-07, "loss": 1.3092, "step": 48 }, { "epoch": 0.18352059925093633, "grad_norm": 1.499311089515686, "learning_rate": 1.5413331334360182e-07, "loss": 1.1741, "step": 49 }, { "epoch": 0.18726591760299627, "grad_norm": 1.6856210231781006, "learning_rate": 0.0, "loss": 1.6819, "step": 50 } ], "logging_steps": 1, "max_steps": 50, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 13, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.284371676168192e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }