{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.10614772224679346, "eval_steps": 500, "global_step": 60, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0017691287041132243, "grad_norm": 1.9742480441813004, "learning_rate": 2e-08, "loss": 3.2598, "step": 1 }, { "epoch": 0.0035382574082264487, "grad_norm": 2.038016027037428, "learning_rate": 4e-08, "loss": 3.4531, "step": 2 }, { "epoch": 0.005307386112339673, "grad_norm": 2.225136431627388, "learning_rate": 6e-08, "loss": 3.4746, "step": 3 }, { "epoch": 0.007076514816452897, "grad_norm": 2.0178713550531033, "learning_rate": 8e-08, "loss": 3.293, "step": 4 }, { "epoch": 0.008845643520566122, "grad_norm": 2.042908378033956, "learning_rate": 1e-07, "loss": 3.2383, "step": 5 }, { "epoch": 0.010614772224679346, "grad_norm": 2.1329682870055464, "learning_rate": 1.2e-07, "loss": 3.2441, "step": 6 }, { "epoch": 0.01238390092879257, "grad_norm": 2.1377356269998873, "learning_rate": 1.4e-07, "loss": 3.293, "step": 7 }, { "epoch": 0.014153029632905795, "grad_norm": 2.0514676704041146, "learning_rate": 1.6e-07, "loss": 3.4004, "step": 8 }, { "epoch": 0.01592215833701902, "grad_norm": 1.94775665931649, "learning_rate": 1.8e-07, "loss": 3.4355, "step": 9 }, { "epoch": 0.017691287041132243, "grad_norm": 2.0977166835164653, "learning_rate": 2e-07, "loss": 3.3711, "step": 10 }, { "epoch": 0.019460415745245468, "grad_norm": 2.192564032711453, "learning_rate": 2.1999999999999998e-07, "loss": 3.3398, "step": 11 }, { "epoch": 0.021229544449358692, "grad_norm": 2.1498795965856914, "learning_rate": 2.4e-07, "loss": 3.1562, "step": 12 }, { "epoch": 0.022998673153471916, "grad_norm": 2.109291509736264, "learning_rate": 2.6e-07, "loss": 3.3652, "step": 13 }, { "epoch": 0.02476780185758514, "grad_norm": 2.044792800372603, "learning_rate": 2.8e-07, "loss": 3.2461, "step": 14 }, { "epoch": 0.026536930561698365, "grad_norm": 2.167283994785129, "learning_rate": 3e-07, "loss": 3.3301, "step": 15 }, { "epoch": 0.02830605926581159, "grad_norm": 1.995664320722997, "learning_rate": 3.2e-07, "loss": 3.3867, "step": 16 }, { "epoch": 0.03007518796992481, "grad_norm": 2.00843521276344, "learning_rate": 3.4000000000000003e-07, "loss": 3.2363, "step": 17 }, { "epoch": 0.03184431667403804, "grad_norm": 2.107294400286055, "learning_rate": 3.6e-07, "loss": 3.3809, "step": 18 }, { "epoch": 0.03361344537815126, "grad_norm": 1.9965688324131208, "learning_rate": 3.7999999999999996e-07, "loss": 3.2637, "step": 19 }, { "epoch": 0.03538257408226449, "grad_norm": 2.1690567393421936, "learning_rate": 4e-07, "loss": 3.2461, "step": 20 }, { "epoch": 0.03715170278637771, "grad_norm": 1.9509465820725813, "learning_rate": 4.1999999999999995e-07, "loss": 3.3359, "step": 21 }, { "epoch": 0.038920831490490936, "grad_norm": 2.180359699431997, "learning_rate": 4.3999999999999997e-07, "loss": 3.498, "step": 22 }, { "epoch": 0.040689960194604156, "grad_norm": 1.9303585281557267, "learning_rate": 4.6e-07, "loss": 3.4453, "step": 23 }, { "epoch": 0.042459088898717384, "grad_norm": 2.105287899781242, "learning_rate": 4.8e-07, "loss": 3.4531, "step": 24 }, { "epoch": 0.044228217602830605, "grad_norm": 1.996519659869237, "learning_rate": 5e-07, "loss": 3.3633, "step": 25 }, { "epoch": 0.04599734630694383, "grad_norm": 2.0672497293218903, "learning_rate": 5.2e-07, "loss": 3.4746, "step": 26 }, { "epoch": 0.047766475011057054, "grad_norm": 2.0187116926490165, "learning_rate": 5.4e-07, "loss": 3.3105, "step": 27 }, { "epoch": 0.04953560371517028, "grad_norm": 2.185446736666104, "learning_rate": 5.6e-07, "loss": 3.1953, "step": 28 }, { "epoch": 0.0513047324192835, "grad_norm": 1.9785091042817515, "learning_rate": 5.8e-07, "loss": 3.2207, "step": 29 }, { "epoch": 0.05307386112339673, "grad_norm": 1.983411961208081, "learning_rate": 6e-07, "loss": 3.1953, "step": 30 }, { "epoch": 0.05484298982750995, "grad_norm": 1.8887794910668352, "learning_rate": 6.2e-07, "loss": 3.127, "step": 31 }, { "epoch": 0.05661211853162318, "grad_norm": 2.024592500623624, "learning_rate": 6.4e-07, "loss": 3.3652, "step": 32 }, { "epoch": 0.0583812472357364, "grad_norm": 2.033056092327317, "learning_rate": 6.6e-07, "loss": 3.4629, "step": 33 }, { "epoch": 0.06015037593984962, "grad_norm": 2.1137985890313646, "learning_rate": 6.800000000000001e-07, "loss": 3.4277, "step": 34 }, { "epoch": 0.06191950464396285, "grad_norm": 2.135970317417631, "learning_rate": 7e-07, "loss": 3.5664, "step": 35 }, { "epoch": 0.06368863334807608, "grad_norm": 1.9525141602052385, "learning_rate": 7.2e-07, "loss": 3.3164, "step": 36 }, { "epoch": 0.0654577620521893, "grad_norm": 1.9679140574444143, "learning_rate": 7.4e-07, "loss": 3.1348, "step": 37 }, { "epoch": 0.06722689075630252, "grad_norm": 2.0697308820659295, "learning_rate": 7.599999999999999e-07, "loss": 3.3574, "step": 38 }, { "epoch": 0.06899601946041574, "grad_norm": 2.0879787228782463, "learning_rate": 7.799999999999999e-07, "loss": 3.3477, "step": 39 }, { "epoch": 0.07076514816452897, "grad_norm": 2.0051097367804234, "learning_rate": 8e-07, "loss": 3.3477, "step": 40 }, { "epoch": 0.0725342768686422, "grad_norm": 2.039846964044792, "learning_rate": 8.199999999999999e-07, "loss": 3.2109, "step": 41 }, { "epoch": 0.07430340557275542, "grad_norm": 2.13344976323939, "learning_rate": 8.399999999999999e-07, "loss": 3.3848, "step": 42 }, { "epoch": 0.07607253427686864, "grad_norm": 2.2830511277961585, "learning_rate": 8.599999999999999e-07, "loss": 3.1602, "step": 43 }, { "epoch": 0.07784166298098187, "grad_norm": 1.9427108734819927, "learning_rate": 8.799999999999999e-07, "loss": 3.1914, "step": 44 }, { "epoch": 0.07961079168509509, "grad_norm": 1.9926391710215448, "learning_rate": 9e-07, "loss": 3.3711, "step": 45 }, { "epoch": 0.08137992038920831, "grad_norm": 2.2237278323731107, "learning_rate": 9.2e-07, "loss": 3.4746, "step": 46 }, { "epoch": 0.08314904909332153, "grad_norm": 2.123759872019136, "learning_rate": 9.399999999999999e-07, "loss": 3.3926, "step": 47 }, { "epoch": 0.08491817779743477, "grad_norm": 2.138037893897646, "learning_rate": 9.6e-07, "loss": 3.377, "step": 48 }, { "epoch": 0.08668730650154799, "grad_norm": 2.074234748374453, "learning_rate": 9.8e-07, "loss": 3.3457, "step": 49 }, { "epoch": 0.08845643520566121, "grad_norm": 2.162562414477262, "learning_rate": 1e-06, "loss": 3.2148, "step": 50 }, { "epoch": 0.09022556390977443, "grad_norm": 2.091661753228539, "learning_rate": 1.02e-06, "loss": 3.3613, "step": 51 }, { "epoch": 0.09199469261388767, "grad_norm": 2.1815638012188963, "learning_rate": 1.04e-06, "loss": 3.2949, "step": 52 }, { "epoch": 0.09376382131800089, "grad_norm": 2.127146363547092, "learning_rate": 1.06e-06, "loss": 3.4297, "step": 53 }, { "epoch": 0.09553295002211411, "grad_norm": 2.201243987413546, "learning_rate": 1.08e-06, "loss": 3.1445, "step": 54 }, { "epoch": 0.09730207872622733, "grad_norm": 2.1006629919292075, "learning_rate": 1.1e-06, "loss": 3.4238, "step": 55 }, { "epoch": 0.09907120743034056, "grad_norm": 2.2056247586234115, "learning_rate": 1.12e-06, "loss": 3.3516, "step": 56 }, { "epoch": 0.10084033613445378, "grad_norm": 2.300409874538962, "learning_rate": 1.1399999999999999e-06, "loss": 3.4766, "step": 57 }, { "epoch": 0.102609464838567, "grad_norm": 2.10637739408632, "learning_rate": 1.16e-06, "loss": 3.3477, "step": 58 }, { "epoch": 0.10437859354268023, "grad_norm": 2.047319466977983, "learning_rate": 1.18e-06, "loss": 3.3965, "step": 59 }, { "epoch": 0.10614772224679346, "grad_norm": 2.0670306109309102, "learning_rate": 1.2e-06, "loss": 3.2266, "step": 60 } ], "logging_steps": 1, "max_steps": 565, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 5, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 15477887729664.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }