{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.881844380403458, "eval_steps": 500, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.05763688760806916, "grad_norm": 4.8593597412109375, "learning_rate": 0.000125, "loss": 1.0043, "step": 20 }, { "epoch": 0.11527377521613832, "grad_norm": 0.14300437271595, "learning_rate": 0.00019841427155599603, "loss": 0.5844, "step": 40 }, { "epoch": 0.1729106628242075, "grad_norm": 0.11132702976465225, "learning_rate": 0.00019444995044598612, "loss": 0.4951, "step": 60 }, { "epoch": 0.23054755043227665, "grad_norm": 0.15473473072052002, "learning_rate": 0.0001904856293359762, "loss": 0.4809, "step": 80 }, { "epoch": 0.2881844380403458, "grad_norm": 0.12199735641479492, "learning_rate": 0.00018652130822596633, "loss": 0.4703, "step": 100 }, { "epoch": 0.345821325648415, "grad_norm": 0.11592856794595718, "learning_rate": 0.0001825569871159564, "loss": 0.4622, "step": 120 }, { "epoch": 0.4034582132564842, "grad_norm": 0.11163052916526794, "learning_rate": 0.00017859266600594648, "loss": 0.4527, "step": 140 }, { "epoch": 0.4610951008645533, "grad_norm": 0.11111810058355331, "learning_rate": 0.00017462834489593657, "loss": 0.451, "step": 160 }, { "epoch": 0.5187319884726225, "grad_norm": 0.11076587438583374, "learning_rate": 0.00017066402378592668, "loss": 0.4447, "step": 180 }, { "epoch": 0.5763688760806917, "grad_norm": 0.09930027276277542, "learning_rate": 0.00016669970267591675, "loss": 0.4486, "step": 200 }, { "epoch": 0.6340057636887608, "grad_norm": 0.09756297618150711, "learning_rate": 0.00016273538156590683, "loss": 0.4369, "step": 220 }, { "epoch": 0.69164265129683, "grad_norm": 0.11096686124801636, "learning_rate": 0.00015877106045589692, "loss": 0.4412, "step": 240 }, { "epoch": 0.7492795389048992, "grad_norm": 0.09731573611497879, "learning_rate": 0.00015480673934588704, "loss": 0.4407, "step": 260 }, { "epoch": 0.8069164265129684, "grad_norm": 0.10571198165416718, "learning_rate": 0.0001508424182358771, "loss": 0.4376, "step": 280 }, { "epoch": 0.8645533141210374, "grad_norm": 0.09965088218450546, "learning_rate": 0.0001468780971258672, "loss": 0.4368, "step": 300 }, { "epoch": 0.9221902017291066, "grad_norm": 0.09912007302045822, "learning_rate": 0.0001429137760158573, "loss": 0.4294, "step": 320 }, { "epoch": 0.9798270893371758, "grad_norm": 0.09754997491836548, "learning_rate": 0.0001389494549058474, "loss": 0.4309, "step": 340 }, { "epoch": 1.037463976945245, "grad_norm": 0.10138026624917984, "learning_rate": 0.00013498513379583746, "loss": 0.4188, "step": 360 }, { "epoch": 1.0951008645533142, "grad_norm": 0.09764310717582703, "learning_rate": 0.00013102081268582754, "loss": 0.4167, "step": 380 }, { "epoch": 1.1527377521613833, "grad_norm": 0.10624668747186661, "learning_rate": 0.00012705649157581766, "loss": 0.4167, "step": 400 }, { "epoch": 1.2103746397694524, "grad_norm": 0.1042829230427742, "learning_rate": 0.00012309217046580775, "loss": 0.42, "step": 420 }, { "epoch": 1.2680115273775217, "grad_norm": 0.09762328863143921, "learning_rate": 0.00011912784935579781, "loss": 0.4147, "step": 440 }, { "epoch": 1.3256484149855907, "grad_norm": 0.1053580567240715, "learning_rate": 0.00011516352824578791, "loss": 0.4173, "step": 460 }, { "epoch": 1.38328530259366, "grad_norm": 0.09830950945615768, "learning_rate": 0.000111199207135778, "loss": 0.4107, "step": 480 }, { "epoch": 1.440922190201729, "grad_norm": 0.1017039492726326, "learning_rate": 0.0001072348860257681, "loss": 0.4129, "step": 500 }, { "epoch": 1.4985590778097984, "grad_norm": 0.10052554309368134, "learning_rate": 0.00010327056491575817, "loss": 0.4115, "step": 520 }, { "epoch": 1.5561959654178674, "grad_norm": 0.09901268035173416, "learning_rate": 9.930624380574827e-05, "loss": 0.4113, "step": 540 }, { "epoch": 1.6138328530259365, "grad_norm": 0.0998842865228653, "learning_rate": 9.534192269573836e-05, "loss": 0.4075, "step": 560 }, { "epoch": 1.6714697406340058, "grad_norm": 0.10228322446346283, "learning_rate": 9.137760158572845e-05, "loss": 0.4103, "step": 580 }, { "epoch": 1.729106628242075, "grad_norm": 0.10224767029285431, "learning_rate": 8.741328047571854e-05, "loss": 0.4097, "step": 600 }, { "epoch": 1.7867435158501441, "grad_norm": 0.10533642023801804, "learning_rate": 8.344895936570862e-05, "loss": 0.4069, "step": 620 }, { "epoch": 1.8443804034582132, "grad_norm": 0.11008591949939728, "learning_rate": 7.948463825569871e-05, "loss": 0.4043, "step": 640 }, { "epoch": 1.9020172910662825, "grad_norm": 0.1080278530716896, "learning_rate": 7.55203171456888e-05, "loss": 0.4058, "step": 660 }, { "epoch": 1.9596541786743515, "grad_norm": 0.10025861114263535, "learning_rate": 7.155599603567889e-05, "loss": 0.4035, "step": 680 }, { "epoch": 2.0172910662824206, "grad_norm": 0.1050715371966362, "learning_rate": 6.759167492566898e-05, "loss": 0.3953, "step": 700 }, { "epoch": 2.07492795389049, "grad_norm": 0.10905614495277405, "learning_rate": 6.362735381565907e-05, "loss": 0.3898, "step": 720 }, { "epoch": 2.132564841498559, "grad_norm": 0.11501201242208481, "learning_rate": 5.966303270564916e-05, "loss": 0.3916, "step": 740 }, { "epoch": 2.1902017291066285, "grad_norm": 0.10602011531591415, "learning_rate": 5.569871159563925e-05, "loss": 0.3902, "step": 760 }, { "epoch": 2.2478386167146973, "grad_norm": 0.11458944529294968, "learning_rate": 5.1734390485629335e-05, "loss": 0.391, "step": 780 }, { "epoch": 2.3054755043227666, "grad_norm": 0.10967529565095901, "learning_rate": 4.7770069375619424e-05, "loss": 0.3882, "step": 800 }, { "epoch": 2.363112391930836, "grad_norm": 0.11452831327915192, "learning_rate": 4.380574826560951e-05, "loss": 0.3924, "step": 820 }, { "epoch": 2.4207492795389047, "grad_norm": 0.11521276086568832, "learning_rate": 3.98414271555996e-05, "loss": 0.3884, "step": 840 }, { "epoch": 2.478386167146974, "grad_norm": 0.11083399504423141, "learning_rate": 3.587710604558969e-05, "loss": 0.391, "step": 860 }, { "epoch": 2.5360230547550433, "grad_norm": 0.1139487475156784, "learning_rate": 3.191278493557978e-05, "loss": 0.3903, "step": 880 }, { "epoch": 2.5936599423631126, "grad_norm": 0.11477449536323547, "learning_rate": 2.794846382556987e-05, "loss": 0.3897, "step": 900 }, { "epoch": 2.6512968299711814, "grad_norm": 0.11389576643705368, "learning_rate": 2.3984142715559964e-05, "loss": 0.3882, "step": 920 }, { "epoch": 2.7089337175792507, "grad_norm": 0.11239298433065414, "learning_rate": 2.0019821605550053e-05, "loss": 0.388, "step": 940 }, { "epoch": 2.76657060518732, "grad_norm": 0.10633145272731781, "learning_rate": 1.605550049554014e-05, "loss": 0.3823, "step": 960 }, { "epoch": 2.824207492795389, "grad_norm": 0.12195642292499542, "learning_rate": 1.2091179385530229e-05, "loss": 0.3878, "step": 980 }, { "epoch": 2.881844380403458, "grad_norm": 0.1192099004983902, "learning_rate": 8.126858275520318e-06, "loss": 0.3877, "step": 1000 } ], "logging_steps": 20, "max_steps": 1041, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 7.450003270942065e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }