{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 15.238095238095237,
  "eval_steps": 500,
  "global_step": 160,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.19047619047619047,
      "grad_norm": 0.9942206144332886,
      "learning_rate": 0.00019750000000000003,
      "loss": 9.5613,
      "step": 2
    },
    {
      "epoch": 0.38095238095238093,
      "grad_norm": 1.2790788412094116,
      "learning_rate": 0.000195,
      "loss": 9.2339,
      "step": 4
    },
    {
      "epoch": 0.5714285714285714,
      "grad_norm": 1.9939367771148682,
      "learning_rate": 0.00019250000000000002,
      "loss": 8.7953,
      "step": 6
    },
    {
      "epoch": 0.7619047619047619,
      "grad_norm": 2.010485887527466,
      "learning_rate": 0.00019,
      "loss": 8.2168,
      "step": 8
    },
    {
      "epoch": 0.9523809523809523,
      "grad_norm": 1.352328896522522,
      "learning_rate": 0.0001875,
      "loss": 7.8941,
      "step": 10
    },
    {
      "epoch": 1.1428571428571428,
      "grad_norm": 0.9626594185829163,
      "learning_rate": 0.00018500000000000002,
      "loss": 7.6817,
      "step": 12
    },
    {
      "epoch": 1.3333333333333333,
      "grad_norm": 1.1568268537521362,
      "learning_rate": 0.0001825,
      "loss": 7.5131,
      "step": 14
    },
    {
      "epoch": 1.5238095238095237,
      "grad_norm": 1.0264520645141602,
      "learning_rate": 0.00018,
      "loss": 7.4247,
      "step": 16
    },
    {
      "epoch": 1.7142857142857144,
      "grad_norm": 0.9865540862083435,
      "learning_rate": 0.0001775,
      "loss": 7.4369,
      "step": 18
    },
    {
      "epoch": 1.9047619047619047,
      "grad_norm": 1.0182702541351318,
      "learning_rate": 0.000175,
      "loss": 7.3787,
      "step": 20
    },
    {
      "epoch": 2.0952380952380953,
      "grad_norm": 0.7922359108924866,
      "learning_rate": 0.00017250000000000002,
      "loss": 7.373,
      "step": 22
    },
    {
      "epoch": 2.2857142857142856,
      "grad_norm": 0.7033187747001648,
      "learning_rate": 0.00017,
      "loss": 7.3096,
      "step": 24
    },
    {
      "epoch": 2.4761904761904763,
      "grad_norm": 2.9758119583129883,
      "learning_rate": 0.0001675,
      "loss": 7.1991,
      "step": 26
    },
    {
      "epoch": 2.6666666666666665,
      "grad_norm": 0.7531760931015015,
      "learning_rate": 0.000165,
      "loss": 7.2661,
      "step": 28
    },
    {
      "epoch": 2.857142857142857,
      "grad_norm": 1.3790533542633057,
      "learning_rate": 0.00016250000000000002,
      "loss": 7.2782,
      "step": 30
    },
    {
      "epoch": 3.0476190476190474,
      "grad_norm": 0.6538093686103821,
      "learning_rate": 0.00016,
      "loss": 7.2109,
      "step": 32
    },
    {
      "epoch": 3.238095238095238,
      "grad_norm": 0.6145215630531311,
      "learning_rate": 0.0001575,
      "loss": 7.2192,
      "step": 34
    },
    {
      "epoch": 3.4285714285714284,
      "grad_norm": 0.4128475785255432,
      "learning_rate": 0.000155,
      "loss": 7.2892,
      "step": 36
    },
    {
      "epoch": 3.619047619047619,
      "grad_norm": 1.0160013437271118,
      "learning_rate": 0.0001525,
      "loss": 7.2049,
      "step": 38
    },
    {
      "epoch": 3.8095238095238093,
      "grad_norm": 0.5834835171699524,
      "learning_rate": 0.00015000000000000001,
      "loss": 7.1672,
      "step": 40
    },
    {
      "epoch": 4.0,
      "grad_norm": 0.4894554615020752,
      "learning_rate": 0.0001475,
      "loss": 7.1269,
      "step": 42
    },
    {
      "epoch": 4.190476190476191,
      "grad_norm": 0.593618631362915,
      "learning_rate": 0.000145,
      "loss": 7.0175,
      "step": 44
    },
    {
      "epoch": 4.380952380952381,
      "grad_norm": 1.6190487146377563,
      "learning_rate": 0.00014250000000000002,
      "loss": 7.2919,
      "step": 46
    },
    {
      "epoch": 4.571428571428571,
      "grad_norm": 0.755859911441803,
      "learning_rate": 0.00014,
      "loss": 7.1624,
      "step": 48
    },
    {
      "epoch": 4.761904761904762,
      "grad_norm": 0.46613645553588867,
      "learning_rate": 0.0001375,
      "loss": 7.2233,
      "step": 50
    },
    {
      "epoch": 4.9523809523809526,
      "grad_norm": 0.5973020792007446,
      "learning_rate": 0.00013500000000000003,
      "loss": 7.1642,
      "step": 52
    },
    {
      "epoch": 5.142857142857143,
      "grad_norm": 0.97837233543396,
      "learning_rate": 0.0001325,
      "loss": 7.1172,
      "step": 54
    },
    {
      "epoch": 5.333333333333333,
      "grad_norm": 0.9348046183586121,
      "learning_rate": 0.00013000000000000002,
      "loss": 7.1564,
      "step": 56
    },
    {
      "epoch": 5.523809523809524,
      "grad_norm": 0.6632198691368103,
      "learning_rate": 0.0001275,
      "loss": 7.0821,
      "step": 58
    },
    {
      "epoch": 5.714285714285714,
      "grad_norm": 0.7776179909706116,
      "learning_rate": 0.000125,
      "loss": 7.2272,
      "step": 60
    },
    {
      "epoch": 5.904761904761905,
      "grad_norm": 0.6282438039779663,
      "learning_rate": 0.00012250000000000002,
      "loss": 7.0926,
      "step": 62
    },
    {
      "epoch": 6.095238095238095,
      "grad_norm": 0.6008353233337402,
      "learning_rate": 0.00012,
      "loss": 7.1073,
      "step": 64
    },
    {
      "epoch": 6.285714285714286,
      "grad_norm": 0.8796420097351074,
      "learning_rate": 0.00011750000000000001,
      "loss": 7.1737,
      "step": 66
    },
    {
      "epoch": 6.476190476190476,
      "grad_norm": 0.6400454640388489,
      "learning_rate": 0.00011499999999999999,
      "loss": 7.0924,
      "step": 68
    },
    {
      "epoch": 6.666666666666667,
      "grad_norm": 0.5479526519775391,
      "learning_rate": 0.00011250000000000001,
      "loss": 7.1275,
      "step": 70
    },
    {
      "epoch": 6.857142857142857,
      "grad_norm": 0.5992618203163147,
      "learning_rate": 0.00011000000000000002,
      "loss": 7.0599,
      "step": 72
    },
    {
      "epoch": 7.0476190476190474,
      "grad_norm": 0.5336684584617615,
      "learning_rate": 0.0001075,
      "loss": 7.0206,
      "step": 74
    },
    {
      "epoch": 7.238095238095238,
      "grad_norm": 0.3991040289402008,
      "learning_rate": 0.000105,
      "loss": 7.0123,
      "step": 76
    },
    {
      "epoch": 7.428571428571429,
      "grad_norm": 1.032917857170105,
      "learning_rate": 0.0001025,
      "loss": 7.0267,
      "step": 78
    },
    {
      "epoch": 7.619047619047619,
      "grad_norm": 0.5554404854774475,
      "learning_rate": 0.0001,
      "loss": 7.0203,
      "step": 80
    },
    {
      "epoch": 7.809523809523809,
      "grad_norm": 0.7755109667778015,
      "learning_rate": 9.75e-05,
      "loss": 7.1445,
      "step": 82
    },
    {
      "epoch": 8.0,
      "grad_norm": 1.8295842409133911,
      "learning_rate": 9.5e-05,
      "loss": 7.0002,
      "step": 84
    },
    {
      "epoch": 8.19047619047619,
      "grad_norm": 1.4985620975494385,
      "learning_rate": 9.250000000000001e-05,
      "loss": 7.0613,
      "step": 86
    },
    {
      "epoch": 8.380952380952381,
      "grad_norm": 1.0733778476715088,
      "learning_rate": 9e-05,
      "loss": 7.0594,
      "step": 88
    },
    {
      "epoch": 8.571428571428571,
      "grad_norm": 0.7009026408195496,
      "learning_rate": 8.75e-05,
      "loss": 6.9432,
      "step": 90
    },
    {
      "epoch": 8.761904761904763,
      "grad_norm": 1.195196509361267,
      "learning_rate": 8.5e-05,
      "loss": 6.9266,
      "step": 92
    },
    {
      "epoch": 8.952380952380953,
      "grad_norm": 2.6835684776306152,
      "learning_rate": 8.25e-05,
      "loss": 6.9855,
      "step": 94
    },
    {
      "epoch": 9.142857142857142,
      "grad_norm": 0.7434377670288086,
      "learning_rate": 8e-05,
      "loss": 6.7975,
      "step": 96
    },
    {
      "epoch": 9.333333333333334,
      "grad_norm": 0.5993837118148804,
      "learning_rate": 7.75e-05,
      "loss": 7.0476,
      "step": 98
    },
    {
      "epoch": 9.523809523809524,
      "grad_norm": 0.4656153619289398,
      "learning_rate": 7.500000000000001e-05,
      "loss": 6.9894,
      "step": 100
    },
    {
      "epoch": 9.714285714285714,
      "grad_norm": 0.7926774621009827,
      "learning_rate": 7.25e-05,
      "loss": 6.9854,
      "step": 102
    },
    {
      "epoch": 9.904761904761905,
      "grad_norm": 1.0828678607940674,
      "learning_rate": 7e-05,
      "loss": 6.9185,
      "step": 104
    },
    {
      "epoch": 10.095238095238095,
      "grad_norm": 0.6923830509185791,
      "learning_rate": 6.750000000000001e-05,
      "loss": 6.9804,
      "step": 106
    },
    {
      "epoch": 10.285714285714286,
      "grad_norm": 0.5546735525131226,
      "learning_rate": 6.500000000000001e-05,
      "loss": 6.9273,
      "step": 108
    },
    {
      "epoch": 10.476190476190476,
      "grad_norm": 0.8265076875686646,
      "learning_rate": 6.25e-05,
      "loss": 6.9087,
      "step": 110
    },
    {
      "epoch": 10.666666666666666,
      "grad_norm": 0.3945198655128479,
      "learning_rate": 6e-05,
      "loss": 6.9375,
      "step": 112
    },
    {
      "epoch": 10.857142857142858,
      "grad_norm": 0.5948878526687622,
      "learning_rate": 5.7499999999999995e-05,
      "loss": 6.8764,
      "step": 114
    },
    {
      "epoch": 11.047619047619047,
      "grad_norm": 0.7741471529006958,
      "learning_rate": 5.500000000000001e-05,
      "loss": 6.7551,
      "step": 116
    },
    {
      "epoch": 11.238095238095237,
      "grad_norm": 0.32554784417152405,
      "learning_rate": 5.25e-05,
      "loss": 6.8862,
      "step": 118
    },
    {
      "epoch": 11.428571428571429,
      "grad_norm": 0.5033702850341797,
      "learning_rate": 5e-05,
      "loss": 6.7297,
      "step": 120
    },
    {
      "epoch": 11.619047619047619,
      "grad_norm": 0.5291158556938171,
      "learning_rate": 4.75e-05,
      "loss": 6.9826,
      "step": 122
    },
    {
      "epoch": 11.80952380952381,
      "grad_norm": 0.39498385787010193,
      "learning_rate": 4.5e-05,
      "loss": 6.837,
      "step": 124
    },
    {
      "epoch": 12.0,
      "grad_norm": 0.4402136206626892,
      "learning_rate": 4.25e-05,
      "loss": 7.0434,
      "step": 126
    },
    {
      "epoch": 12.19047619047619,
      "grad_norm": 0.6476764678955078,
      "learning_rate": 4e-05,
      "loss": 6.8524,
      "step": 128
    },
    {
      "epoch": 12.380952380952381,
      "grad_norm": 0.330609530210495,
      "learning_rate": 3.7500000000000003e-05,
      "loss": 6.8742,
      "step": 130
    },
    {
      "epoch": 12.571428571428571,
      "grad_norm": 0.5420040488243103,
      "learning_rate": 3.5e-05,
      "loss": 6.7931,
      "step": 132
    },
    {
      "epoch": 12.761904761904763,
      "grad_norm": 0.3482373356819153,
      "learning_rate": 3.2500000000000004e-05,
      "loss": 6.883,
      "step": 134
    },
    {
      "epoch": 12.952380952380953,
      "grad_norm": 0.3476051092147827,
      "learning_rate": 3e-05,
      "loss": 6.9857,
      "step": 136
    },
    {
      "epoch": 13.142857142857142,
      "grad_norm": 0.43590274453163147,
      "learning_rate": 2.8749999999999997e-05,
      "loss": 8.116,
      "step": 138
    },
    {
      "epoch": 13.333333333333334,
      "grad_norm": 0.2993098497390747,
      "learning_rate": 2.625e-05,
      "loss": 6.657,
      "step": 140
    },
    {
      "epoch": 13.523809523809524,
      "grad_norm": 0.3477262556552887,
      "learning_rate": 2.375e-05,
      "loss": 6.9781,
      "step": 142
    },
    {
      "epoch": 13.714285714285714,
      "grad_norm": 0.47370073199272156,
      "learning_rate": 2.125e-05,
      "loss": 6.9277,
      "step": 144
    },
    {
      "epoch": 13.904761904761905,
      "grad_norm": 0.3924289345741272,
      "learning_rate": 1.8750000000000002e-05,
      "loss": 6.8967,
      "step": 146
    },
    {
      "epoch": 14.095238095238095,
      "grad_norm": 0.5621922612190247,
      "learning_rate": 1.6250000000000002e-05,
      "loss": 6.7197,
      "step": 148
    },
    {
      "epoch": 14.285714285714286,
      "grad_norm": 0.3454875349998474,
      "learning_rate": 1.3750000000000002e-05,
      "loss": 6.9314,
      "step": 150
    },
    {
      "epoch": 14.476190476190476,
      "grad_norm": 0.3146642744541168,
      "learning_rate": 1.125e-05,
      "loss": 6.9142,
      "step": 152
    },
    {
      "epoch": 14.666666666666666,
      "grad_norm": 0.3762160837650299,
      "learning_rate": 8.75e-06,
      "loss": 6.8759,
      "step": 154
    },
    {
      "epoch": 14.857142857142858,
      "grad_norm": 0.33906954526901245,
      "learning_rate": 6.25e-06,
      "loss": 6.8712,
      "step": 156
    },
    {
      "epoch": 15.047619047619047,
      "grad_norm": 0.3414846360683441,
      "learning_rate": 3.75e-06,
      "loss": 6.737,
      "step": 158
    },
    {
      "epoch": 15.238095238095237,
      "grad_norm": 0.4463809132575989,
      "learning_rate": 1.25e-06,
      "loss": 6.9144,
      "step": 160
    },
    {
      "epoch": 15.238095238095237,
      "step": 160,
      "total_flos": 800861569170024.0,
      "train_loss": 7.170098584890366,
      "train_runtime": 677.1666,
      "train_samples_per_second": 3.969,
      "train_steps_per_second": 0.236
    }
  ],
  "logging_steps": 2,
  "max_steps": 160,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 16,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": false,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 800861569170024.0,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}