{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.205761316872428, "eval_steps": 500, "global_step": 75, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0027434842249657062, "grad_norm": 0.6255580186843872, "learning_rate": 2.0000000000000003e-06, "loss": 2.1501, "step": 1 }, { "epoch": 0.0054869684499314125, "grad_norm": 0.5966613292694092, "learning_rate": 4.000000000000001e-06, "loss": 2.1396, "step": 2 }, { "epoch": 0.00823045267489712, "grad_norm": 0.6047211289405823, "learning_rate": 6e-06, "loss": 2.1381, "step": 3 }, { "epoch": 0.010973936899862825, "grad_norm": 0.5786992311477661, "learning_rate": 8.000000000000001e-06, "loss": 2.0521, "step": 4 }, { "epoch": 0.013717421124828532, "grad_norm": 0.5930487513542175, "learning_rate": 1e-05, "loss": 2.1488, "step": 5 }, { "epoch": 0.01646090534979424, "grad_norm": 0.6468656063079834, "learning_rate": 1.2e-05, "loss": 2.1839, "step": 6 }, { "epoch": 0.019204389574759947, "grad_norm": 0.6152164340019226, "learning_rate": 1.4000000000000001e-05, "loss": 2.1315, "step": 7 }, { "epoch": 0.02194787379972565, "grad_norm": 0.5863370299339294, "learning_rate": 1.6000000000000003e-05, "loss": 2.0977, "step": 8 }, { "epoch": 0.024691358024691357, "grad_norm": 0.5273774862289429, "learning_rate": 1.8e-05, "loss": 2.0164, "step": 9 }, { "epoch": 0.027434842249657063, "grad_norm": 0.539458155632019, "learning_rate": 2e-05, "loss": 2.0963, "step": 10 }, { "epoch": 0.03017832647462277, "grad_norm": 0.5333603620529175, "learning_rate": 2.2000000000000003e-05, "loss": 2.0824, "step": 11 }, { "epoch": 0.03292181069958848, "grad_norm": 0.505551815032959, "learning_rate": 2.4e-05, "loss": 1.995, "step": 12 }, { "epoch": 0.03566529492455418, "grad_norm": 0.523439347743988, "learning_rate": 2.6000000000000002e-05, "loss": 2.0304, "step": 13 }, { "epoch": 0.038408779149519894, "grad_norm": 0.471587210893631, "learning_rate": 2.8000000000000003e-05, "loss": 2.021, "step": 14 }, { "epoch": 0.0411522633744856, "grad_norm": 0.4423275887966156, "learning_rate": 3e-05, "loss": 1.9335, "step": 15 }, { "epoch": 0.0438957475994513, "grad_norm": 0.42515936493873596, "learning_rate": 3.2000000000000005e-05, "loss": 2.022, "step": 16 }, { "epoch": 0.04663923182441701, "grad_norm": 0.41880151629447937, "learning_rate": 3.4000000000000007e-05, "loss": 1.9723, "step": 17 }, { "epoch": 0.04938271604938271, "grad_norm": 0.4344858229160309, "learning_rate": 3.6e-05, "loss": 2.0162, "step": 18 }, { "epoch": 0.05212620027434842, "grad_norm": 0.3777664601802826, "learning_rate": 3.8e-05, "loss": 1.9406, "step": 19 }, { "epoch": 0.05486968449931413, "grad_norm": 0.3823831081390381, "learning_rate": 4e-05, "loss": 1.9227, "step": 20 }, { "epoch": 0.05761316872427984, "grad_norm": 0.39637988805770874, "learning_rate": 4.2e-05, "loss": 1.9796, "step": 21 }, { "epoch": 0.06035665294924554, "grad_norm": 0.3595951795578003, "learning_rate": 4.4000000000000006e-05, "loss": 1.8776, "step": 22 }, { "epoch": 0.06310013717421124, "grad_norm": 0.3291175365447998, "learning_rate": 4.600000000000001e-05, "loss": 1.8456, "step": 23 }, { "epoch": 0.06584362139917696, "grad_norm": 0.3699107766151428, "learning_rate": 4.8e-05, "loss": 1.8606, "step": 24 }, { "epoch": 0.06858710562414266, "grad_norm": 0.36269885301589966, "learning_rate": 5e-05, "loss": 1.8443, "step": 25 }, { "epoch": 0.07133058984910837, "grad_norm": 0.3507707118988037, "learning_rate": 5.2000000000000004e-05, "loss": 1.7648, "step": 26 }, { "epoch": 0.07407407407407407, "grad_norm": 0.3639073371887207, "learning_rate": 5.4000000000000005e-05, "loss": 1.8047, "step": 27 }, { "epoch": 0.07681755829903979, "grad_norm": 0.3715658485889435, "learning_rate": 5.6000000000000006e-05, "loss": 1.7764, "step": 28 }, { "epoch": 0.07956104252400549, "grad_norm": 0.3636416792869568, "learning_rate": 5.8e-05, "loss": 1.8089, "step": 29 }, { "epoch": 0.0823045267489712, "grad_norm": 0.4226425290107727, "learning_rate": 6e-05, "loss": 1.7236, "step": 30 }, { "epoch": 0.0850480109739369, "grad_norm": 0.37769123911857605, "learning_rate": 6.2e-05, "loss": 1.6935, "step": 31 }, { "epoch": 0.0877914951989026, "grad_norm": 0.37444913387298584, "learning_rate": 6.400000000000001e-05, "loss": 1.6441, "step": 32 }, { "epoch": 0.09053497942386832, "grad_norm": 0.37308382987976074, "learning_rate": 6.6e-05, "loss": 1.6189, "step": 33 }, { "epoch": 0.09327846364883402, "grad_norm": 0.4107051193714142, "learning_rate": 6.800000000000001e-05, "loss": 1.662, "step": 34 }, { "epoch": 0.09602194787379972, "grad_norm": 0.4351920783519745, "learning_rate": 7e-05, "loss": 1.6044, "step": 35 }, { "epoch": 0.09876543209876543, "grad_norm": 0.4590749740600586, "learning_rate": 7.2e-05, "loss": 1.5397, "step": 36 }, { "epoch": 0.10150891632373114, "grad_norm": 0.43422356247901917, "learning_rate": 7.4e-05, "loss": 1.5517, "step": 37 }, { "epoch": 0.10425240054869685, "grad_norm": 0.44985321164131165, "learning_rate": 7.6e-05, "loss": 1.4565, "step": 38 }, { "epoch": 0.10699588477366255, "grad_norm": 0.4231052100658417, "learning_rate": 7.800000000000001e-05, "loss": 1.4592, "step": 39 }, { "epoch": 0.10973936899862825, "grad_norm": 0.3938014507293701, "learning_rate": 8e-05, "loss": 1.5119, "step": 40 }, { "epoch": 0.11248285322359397, "grad_norm": 0.4007266163825989, "learning_rate": 8.2e-05, "loss": 1.4205, "step": 41 }, { "epoch": 0.11522633744855967, "grad_norm": 0.40789809823036194, "learning_rate": 8.4e-05, "loss": 1.44, "step": 42 }, { "epoch": 0.11796982167352538, "grad_norm": 0.3244912326335907, "learning_rate": 8.6e-05, "loss": 1.3924, "step": 43 }, { "epoch": 0.12071330589849108, "grad_norm": 0.34083986282348633, "learning_rate": 8.800000000000001e-05, "loss": 1.4029, "step": 44 }, { "epoch": 0.12345679012345678, "grad_norm": 0.36940997838974, "learning_rate": 9e-05, "loss": 1.333, "step": 45 }, { "epoch": 0.1262002743484225, "grad_norm": 0.35098233819007874, "learning_rate": 9.200000000000001e-05, "loss": 1.4171, "step": 46 }, { "epoch": 0.1289437585733882, "grad_norm": 0.3776918053627014, "learning_rate": 9.4e-05, "loss": 1.3849, "step": 47 }, { "epoch": 0.13168724279835392, "grad_norm": 0.48279502987861633, "learning_rate": 9.6e-05, "loss": 1.329, "step": 48 }, { "epoch": 0.13443072702331962, "grad_norm": 0.4614526033401489, "learning_rate": 9.8e-05, "loss": 1.3562, "step": 49 }, { "epoch": 0.13717421124828533, "grad_norm": 0.3900231122970581, "learning_rate": 0.0001, "loss": 1.3945, "step": 50 }, { "epoch": 0.13991769547325103, "grad_norm": 0.20536381006240845, "learning_rate": 0.00010200000000000001, "loss": 1.3483, "step": 51 }, { "epoch": 0.14266117969821673, "grad_norm": 0.15070989727973938, "learning_rate": 0.00010400000000000001, "loss": 1.3259, "step": 52 }, { "epoch": 0.14540466392318244, "grad_norm": 0.14189720153808594, "learning_rate": 0.00010600000000000002, "loss": 1.2869, "step": 53 }, { "epoch": 0.14814814814814814, "grad_norm": 0.14223229885101318, "learning_rate": 0.00010800000000000001, "loss": 1.3437, "step": 54 }, { "epoch": 0.15089163237311384, "grad_norm": 0.13781367242336273, "learning_rate": 0.00011000000000000002, "loss": 1.3451, "step": 55 }, { "epoch": 0.15363511659807957, "grad_norm": 0.11993853747844696, "learning_rate": 0.00011200000000000001, "loss": 1.3052, "step": 56 }, { "epoch": 0.15637860082304528, "grad_norm": 0.11463834345340729, "learning_rate": 0.00011399999999999999, "loss": 1.3202, "step": 57 }, { "epoch": 0.15912208504801098, "grad_norm": 0.12319689244031906, "learning_rate": 0.000116, "loss": 1.2739, "step": 58 }, { "epoch": 0.16186556927297668, "grad_norm": 0.11980075389146805, "learning_rate": 0.000118, "loss": 1.3162, "step": 59 }, { "epoch": 0.1646090534979424, "grad_norm": 0.11416748911142349, "learning_rate": 0.00012, "loss": 1.3558, "step": 60 }, { "epoch": 0.1673525377229081, "grad_norm": 0.12114512920379639, "learning_rate": 0.000122, "loss": 1.3582, "step": 61 }, { "epoch": 0.1700960219478738, "grad_norm": 0.11863748729228973, "learning_rate": 0.000124, "loss": 1.2973, "step": 62 }, { "epoch": 0.1728395061728395, "grad_norm": 0.10819629579782486, "learning_rate": 0.000126, "loss": 1.291, "step": 63 }, { "epoch": 0.1755829903978052, "grad_norm": 0.11032682657241821, "learning_rate": 0.00012800000000000002, "loss": 1.2807, "step": 64 }, { "epoch": 0.17832647462277093, "grad_norm": 0.10899264365434647, "learning_rate": 0.00013000000000000002, "loss": 1.2973, "step": 65 }, { "epoch": 0.18106995884773663, "grad_norm": 0.1106266900897026, "learning_rate": 0.000132, "loss": 1.2992, "step": 66 }, { "epoch": 0.18381344307270234, "grad_norm": 0.11511031538248062, "learning_rate": 0.000134, "loss": 1.301, "step": 67 }, { "epoch": 0.18655692729766804, "grad_norm": 0.10725060850381851, "learning_rate": 0.00013600000000000003, "loss": 1.3199, "step": 68 }, { "epoch": 0.18930041152263374, "grad_norm": 0.1098162978887558, "learning_rate": 0.000138, "loss": 1.2724, "step": 69 }, { "epoch": 0.19204389574759945, "grad_norm": 0.11557920277118683, "learning_rate": 0.00014, "loss": 1.2343, "step": 70 }, { "epoch": 0.19478737997256515, "grad_norm": 0.1220446228981018, "learning_rate": 0.000142, "loss": 1.2399, "step": 71 }, { "epoch": 0.19753086419753085, "grad_norm": 0.11786019057035446, "learning_rate": 0.000144, "loss": 1.2704, "step": 72 }, { "epoch": 0.20027434842249658, "grad_norm": 0.11220725625753403, "learning_rate": 0.000146, "loss": 1.2782, "step": 73 }, { "epoch": 0.2030178326474623, "grad_norm": 0.10791537165641785, "learning_rate": 0.000148, "loss": 1.304, "step": 74 }, { "epoch": 0.205761316872428, "grad_norm": 0.12175633758306503, "learning_rate": 0.00015000000000000001, "loss": 1.3144, "step": 75 } ], "logging_steps": 1, "max_steps": 364, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 9.139644296272282e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }