|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.205761316872428, |
|
"eval_steps": 500, |
|
"global_step": 75, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0027434842249657062, |
|
"grad_norm": 0.6255580186843872, |
|
"learning_rate": 2.0000000000000003e-06, |
|
"loss": 2.1501, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0054869684499314125, |
|
"grad_norm": 0.5966613292694092, |
|
"learning_rate": 4.000000000000001e-06, |
|
"loss": 2.1396, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.00823045267489712, |
|
"grad_norm": 0.6047211289405823, |
|
"learning_rate": 6e-06, |
|
"loss": 2.1381, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.010973936899862825, |
|
"grad_norm": 0.5786992311477661, |
|
"learning_rate": 8.000000000000001e-06, |
|
"loss": 2.0521, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.013717421124828532, |
|
"grad_norm": 0.5930487513542175, |
|
"learning_rate": 1e-05, |
|
"loss": 2.1488, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.01646090534979424, |
|
"grad_norm": 0.6468656063079834, |
|
"learning_rate": 1.2e-05, |
|
"loss": 2.1839, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.019204389574759947, |
|
"grad_norm": 0.6152164340019226, |
|
"learning_rate": 1.4000000000000001e-05, |
|
"loss": 2.1315, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.02194787379972565, |
|
"grad_norm": 0.5863370299339294, |
|
"learning_rate": 1.6000000000000003e-05, |
|
"loss": 2.0977, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.024691358024691357, |
|
"grad_norm": 0.5273774862289429, |
|
"learning_rate": 1.8e-05, |
|
"loss": 2.0164, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.027434842249657063, |
|
"grad_norm": 0.539458155632019, |
|
"learning_rate": 2e-05, |
|
"loss": 2.0963, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.03017832647462277, |
|
"grad_norm": 0.5333603620529175, |
|
"learning_rate": 2.2000000000000003e-05, |
|
"loss": 2.0824, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.03292181069958848, |
|
"grad_norm": 0.505551815032959, |
|
"learning_rate": 2.4e-05, |
|
"loss": 1.995, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.03566529492455418, |
|
"grad_norm": 0.523439347743988, |
|
"learning_rate": 2.6000000000000002e-05, |
|
"loss": 2.0304, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.038408779149519894, |
|
"grad_norm": 0.471587210893631, |
|
"learning_rate": 2.8000000000000003e-05, |
|
"loss": 2.021, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.0411522633744856, |
|
"grad_norm": 0.4423275887966156, |
|
"learning_rate": 3e-05, |
|
"loss": 1.9335, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.0438957475994513, |
|
"grad_norm": 0.42515936493873596, |
|
"learning_rate": 3.2000000000000005e-05, |
|
"loss": 2.022, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.04663923182441701, |
|
"grad_norm": 0.41880151629447937, |
|
"learning_rate": 3.4000000000000007e-05, |
|
"loss": 1.9723, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.04938271604938271, |
|
"grad_norm": 0.4344858229160309, |
|
"learning_rate": 3.6e-05, |
|
"loss": 2.0162, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.05212620027434842, |
|
"grad_norm": 0.3777664601802826, |
|
"learning_rate": 3.8e-05, |
|
"loss": 1.9406, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.05486968449931413, |
|
"grad_norm": 0.3823831081390381, |
|
"learning_rate": 4e-05, |
|
"loss": 1.9227, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.05761316872427984, |
|
"grad_norm": 0.39637988805770874, |
|
"learning_rate": 4.2e-05, |
|
"loss": 1.9796, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.06035665294924554, |
|
"grad_norm": 0.3595951795578003, |
|
"learning_rate": 4.4000000000000006e-05, |
|
"loss": 1.8776, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.06310013717421124, |
|
"grad_norm": 0.3291175365447998, |
|
"learning_rate": 4.600000000000001e-05, |
|
"loss": 1.8456, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.06584362139917696, |
|
"grad_norm": 0.3699107766151428, |
|
"learning_rate": 4.8e-05, |
|
"loss": 1.8606, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.06858710562414266, |
|
"grad_norm": 0.36269885301589966, |
|
"learning_rate": 5e-05, |
|
"loss": 1.8443, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.07133058984910837, |
|
"grad_norm": 0.3507707118988037, |
|
"learning_rate": 5.2000000000000004e-05, |
|
"loss": 1.7648, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.07407407407407407, |
|
"grad_norm": 0.3639073371887207, |
|
"learning_rate": 5.4000000000000005e-05, |
|
"loss": 1.8047, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.07681755829903979, |
|
"grad_norm": 0.3715658485889435, |
|
"learning_rate": 5.6000000000000006e-05, |
|
"loss": 1.7764, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.07956104252400549, |
|
"grad_norm": 0.3636416792869568, |
|
"learning_rate": 5.8e-05, |
|
"loss": 1.8089, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.0823045267489712, |
|
"grad_norm": 0.4226425290107727, |
|
"learning_rate": 6e-05, |
|
"loss": 1.7236, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.0850480109739369, |
|
"grad_norm": 0.37769123911857605, |
|
"learning_rate": 6.2e-05, |
|
"loss": 1.6935, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.0877914951989026, |
|
"grad_norm": 0.37444913387298584, |
|
"learning_rate": 6.400000000000001e-05, |
|
"loss": 1.6441, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.09053497942386832, |
|
"grad_norm": 0.37308382987976074, |
|
"learning_rate": 6.6e-05, |
|
"loss": 1.6189, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.09327846364883402, |
|
"grad_norm": 0.4107051193714142, |
|
"learning_rate": 6.800000000000001e-05, |
|
"loss": 1.662, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.09602194787379972, |
|
"grad_norm": 0.4351920783519745, |
|
"learning_rate": 7e-05, |
|
"loss": 1.6044, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.09876543209876543, |
|
"grad_norm": 0.4590749740600586, |
|
"learning_rate": 7.2e-05, |
|
"loss": 1.5397, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.10150891632373114, |
|
"grad_norm": 0.43422356247901917, |
|
"learning_rate": 7.4e-05, |
|
"loss": 1.5517, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.10425240054869685, |
|
"grad_norm": 0.44985321164131165, |
|
"learning_rate": 7.6e-05, |
|
"loss": 1.4565, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.10699588477366255, |
|
"grad_norm": 0.4231052100658417, |
|
"learning_rate": 7.800000000000001e-05, |
|
"loss": 1.4592, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.10973936899862825, |
|
"grad_norm": 0.3938014507293701, |
|
"learning_rate": 8e-05, |
|
"loss": 1.5119, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.11248285322359397, |
|
"grad_norm": 0.4007266163825989, |
|
"learning_rate": 8.2e-05, |
|
"loss": 1.4205, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.11522633744855967, |
|
"grad_norm": 0.40789809823036194, |
|
"learning_rate": 8.4e-05, |
|
"loss": 1.44, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.11796982167352538, |
|
"grad_norm": 0.3244912326335907, |
|
"learning_rate": 8.6e-05, |
|
"loss": 1.3924, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.12071330589849108, |
|
"grad_norm": 0.34083986282348633, |
|
"learning_rate": 8.800000000000001e-05, |
|
"loss": 1.4029, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.12345679012345678, |
|
"grad_norm": 0.36940997838974, |
|
"learning_rate": 9e-05, |
|
"loss": 1.333, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.1262002743484225, |
|
"grad_norm": 0.35098233819007874, |
|
"learning_rate": 9.200000000000001e-05, |
|
"loss": 1.4171, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.1289437585733882, |
|
"grad_norm": 0.3776918053627014, |
|
"learning_rate": 9.4e-05, |
|
"loss": 1.3849, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.13168724279835392, |
|
"grad_norm": 0.48279502987861633, |
|
"learning_rate": 9.6e-05, |
|
"loss": 1.329, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.13443072702331962, |
|
"grad_norm": 0.4614526033401489, |
|
"learning_rate": 9.8e-05, |
|
"loss": 1.3562, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.13717421124828533, |
|
"grad_norm": 0.3900231122970581, |
|
"learning_rate": 0.0001, |
|
"loss": 1.3945, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.13991769547325103, |
|
"grad_norm": 0.20536381006240845, |
|
"learning_rate": 0.00010200000000000001, |
|
"loss": 1.3483, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.14266117969821673, |
|
"grad_norm": 0.15070989727973938, |
|
"learning_rate": 0.00010400000000000001, |
|
"loss": 1.3259, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.14540466392318244, |
|
"grad_norm": 0.14189720153808594, |
|
"learning_rate": 0.00010600000000000002, |
|
"loss": 1.2869, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.14814814814814814, |
|
"grad_norm": 0.14223229885101318, |
|
"learning_rate": 0.00010800000000000001, |
|
"loss": 1.3437, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.15089163237311384, |
|
"grad_norm": 0.13781367242336273, |
|
"learning_rate": 0.00011000000000000002, |
|
"loss": 1.3451, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.15363511659807957, |
|
"grad_norm": 0.11993853747844696, |
|
"learning_rate": 0.00011200000000000001, |
|
"loss": 1.3052, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.15637860082304528, |
|
"grad_norm": 0.11463834345340729, |
|
"learning_rate": 0.00011399999999999999, |
|
"loss": 1.3202, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.15912208504801098, |
|
"grad_norm": 0.12319689244031906, |
|
"learning_rate": 0.000116, |
|
"loss": 1.2739, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.16186556927297668, |
|
"grad_norm": 0.11980075389146805, |
|
"learning_rate": 0.000118, |
|
"loss": 1.3162, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.1646090534979424, |
|
"grad_norm": 0.11416748911142349, |
|
"learning_rate": 0.00012, |
|
"loss": 1.3558, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.1673525377229081, |
|
"grad_norm": 0.12114512920379639, |
|
"learning_rate": 0.000122, |
|
"loss": 1.3582, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.1700960219478738, |
|
"grad_norm": 0.11863748729228973, |
|
"learning_rate": 0.000124, |
|
"loss": 1.2973, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.1728395061728395, |
|
"grad_norm": 0.10819629579782486, |
|
"learning_rate": 0.000126, |
|
"loss": 1.291, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.1755829903978052, |
|
"grad_norm": 0.11032682657241821, |
|
"learning_rate": 0.00012800000000000002, |
|
"loss": 1.2807, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.17832647462277093, |
|
"grad_norm": 0.10899264365434647, |
|
"learning_rate": 0.00013000000000000002, |
|
"loss": 1.2973, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.18106995884773663, |
|
"grad_norm": 0.1106266900897026, |
|
"learning_rate": 0.000132, |
|
"loss": 1.2992, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.18381344307270234, |
|
"grad_norm": 0.11511031538248062, |
|
"learning_rate": 0.000134, |
|
"loss": 1.301, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.18655692729766804, |
|
"grad_norm": 0.10725060850381851, |
|
"learning_rate": 0.00013600000000000003, |
|
"loss": 1.3199, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.18930041152263374, |
|
"grad_norm": 0.1098162978887558, |
|
"learning_rate": 0.000138, |
|
"loss": 1.2724, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.19204389574759945, |
|
"grad_norm": 0.11557920277118683, |
|
"learning_rate": 0.00014, |
|
"loss": 1.2343, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.19478737997256515, |
|
"grad_norm": 0.1220446228981018, |
|
"learning_rate": 0.000142, |
|
"loss": 1.2399, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.19753086419753085, |
|
"grad_norm": 0.11786019057035446, |
|
"learning_rate": 0.000144, |
|
"loss": 1.2704, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.20027434842249658, |
|
"grad_norm": 0.11220725625753403, |
|
"learning_rate": 0.000146, |
|
"loss": 1.2782, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.2030178326474623, |
|
"grad_norm": 0.10791537165641785, |
|
"learning_rate": 0.000148, |
|
"loss": 1.304, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.205761316872428, |
|
"grad_norm": 0.12175633758306503, |
|
"learning_rate": 0.00015000000000000001, |
|
"loss": 1.3144, |
|
"step": 75 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 364, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 1, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 9.139644296272282e+16, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|